Merge branch 'stable/drivers' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 22 Jul 2011 20:45:15 +0000 (13:45 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 22 Jul 2011 20:45:15 +0000 (13:45 -0700)
* 'stable/drivers' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen/pciback: Have 'passthrough' option instead of XEN_PCIDEV_BACKEND_PASS and XEN_PCIDEV_BACKEND_VPCI
  xen/pciback: Remove the DEBUG option.
  xen/pciback: Drop two backends, squash and cleanup some code.
  xen/pciback: Print out the MSI/MSI-X (PIRQ) values
  xen/pciback: Don't setup an fake IRQ handler for SR-IOV devices.
  xen: rename pciback module to xen-pciback.
  xen/pciback: Fine-grain the spinlocks and fix BUG: scheduling while atomic cases.
  xen/pciback: Allocate IRQ handler for device that is shared with guest.
  xen/pciback: Disable MSI/MSI-X when reseting a device
  xen/pciback: guest SR-IOV support for PV guest
  xen/pciback: Register the owner (domain) of the PCI device.
  xen/pciback: Cleanup the driver based on checkpatch warnings and errors.
  xen/pciback: xen pci backend driver.
  xen: tmem: self-ballooning and frontswap-selfshrinking
  xen: Add module alias to autoload backend drivers
  xen: Populate xenbus device attributes
  xen: Add __attribute__((format(printf... where appropriate
  xen: prepare tmem shim to handle frontswap
  xen: allow enable use of VGA console on dom0

32 files changed:
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/vga.c [new file with mode: 0644]
arch/x86/xen/xen-ops.h
drivers/block/xen-blkback/xenbus.c
drivers/xen/Kconfig
drivers/xen/Makefile
drivers/xen/tmem.c
drivers/xen/xen-balloon.c
drivers/xen/xen-pciback/Makefile [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space.c [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space.h [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space_capability.c [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space_header.c [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space_quirks.c [new file with mode: 0644]
drivers/xen/xen-pciback/conf_space_quirks.h [new file with mode: 0644]
drivers/xen/xen-pciback/passthrough.c [new file with mode: 0644]
drivers/xen/xen-pciback/pci_stub.c [new file with mode: 0644]
drivers/xen/xen-pciback/pciback.h [new file with mode: 0644]
drivers/xen/xen-pciback/pciback_ops.c [new file with mode: 0644]
drivers/xen/xen-pciback/vpci.c [new file with mode: 0644]
drivers/xen/xen-pciback/xenbus.c [new file with mode: 0644]
drivers/xen/xen-selfballoon.c [new file with mode: 0644]
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xenbus/xenbus_probe.h
drivers/xen/xenbus/xenbus_probe_backend.c
drivers/xen/xenbus/xenbus_probe_frontend.c
include/xen/balloon.h
include/xen/hvc-console.h
include/xen/interface/xen.h
include/xen/tmem.h [new file with mode: 0644]
include/xen/xenbus.h

index 17c565d..a6575b9 100644 (file)
@@ -18,5 +18,5 @@ obj-y         := enlighten.o setup.o multicalls.o mmu.o irq.o \
 obj-$(CONFIG_SMP)              += smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)     += debugfs.o
-
+obj-$(CONFIG_XEN_DOM0)         += vga.o
 obj-$(CONFIG_SWIOTLB_XEN)      += pci-swiotlb-xen.o
index 5525163..5325742 100644 (file)
@@ -1248,6 +1248,14 @@ asmlinkage void __init xen_start_kernel(void)
                if (pci_xen)
                        x86_init.pci.arch_init = pci_xen_init;
        } else {
+               const struct dom0_vga_console_info *info =
+                       (void *)((char *)xen_start_info +
+                                xen_start_info->console.dom0.info_off);
+
+               xen_init_vga(info, xen_start_info->console.dom0.info_size);
+               xen_start_info->console.domU.mfn = 0;
+               xen_start_info->console.domU.evtchn = 0;
+
                /* Make sure ACS will be enabled */
                pci_request_acs();
        }
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644 (file)
index 0000000..1cd7f4d
--- /dev/null
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+       struct screen_info *screen_info = &boot_params.screen_info;
+
+       /* This is drawn from a dump from vgacon:startup in
+        * standard Linux. */
+       screen_info->orig_video_mode = 3;
+       screen_info->orig_video_isVGA = 1;
+       screen_info->orig_video_lines = 25;
+       screen_info->orig_video_cols = 80;
+       screen_info->orig_video_ega_bx = 3;
+       screen_info->orig_video_points = 16;
+       screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+       switch (info->video_type) {
+       case XEN_VGATYPE_TEXT_MODE_3:
+               if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+                   + sizeof(info->u.text_mode_3))
+                       break;
+               screen_info->orig_video_lines = info->u.text_mode_3.rows;
+               screen_info->orig_video_cols = info->u.text_mode_3.columns;
+               screen_info->orig_x = info->u.text_mode_3.cursor_x;
+               screen_info->orig_y = info->u.text_mode_3.cursor_y;
+               screen_info->orig_video_points =
+                       info->u.text_mode_3.font_height;
+               break;
+
+       case XEN_VGATYPE_VESA_LFB:
+               if (size < offsetof(struct dom0_vga_console_info,
+                                   u.vesa_lfb.gbl_caps))
+                       break;
+               screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+               screen_info->lfb_width = info->u.vesa_lfb.width;
+               screen_info->lfb_height = info->u.vesa_lfb.height;
+               screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+               screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+               screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+               screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+               screen_info->red_size = info->u.vesa_lfb.red_size;
+               screen_info->red_pos = info->u.vesa_lfb.red_pos;
+               screen_info->green_size = info->u.vesa_lfb.green_size;
+               screen_info->green_pos = info->u.vesa_lfb.green_pos;
+               screen_info->blue_size = info->u.vesa_lfb.blue_size;
+               screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+               screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+               screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+               if (size >= offsetof(struct dom0_vga_console_info,
+                                    u.vesa_lfb.gbl_caps)
+                   + sizeof(info->u.vesa_lfb.gbl_caps))
+                       screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+               if (size >= offsetof(struct dom0_vga_console_info,
+                                    u.vesa_lfb.mode_attrs)
+                   + sizeof(info->u.vesa_lfb.mode_attrs))
+                       screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+               break;
+       }
+}
index 97dfdc8..b095739 100644 (file)
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
 }
 #endif
 
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+                                      size_t size)
+{
+}
+#endif
+
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
 #define DECL_ASM(ret, name, ...)               \
index 6cc0db1..3f129b4 100644 (file)
@@ -684,7 +684,7 @@ again:
 
        err = xenbus_switch_state(dev, XenbusStateConnected);
        if (err)
-               xenbus_dev_fatal(dev, err, "switching to Connected state",
+               xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
                                 dev->nodename);
 
        return;
index a59638b..03bc471 100644 (file)
@@ -9,6 +9,23 @@ config XEN_BALLOON
          the system to expand the domain's memory allocation, or alternatively
          return unneeded memory to the system.
 
+config XEN_SELFBALLOONING
+       bool "Dynamically self-balloon kernel memory to target"
+       depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP
+       default n
+       help
+         Self-ballooning dynamically balloons available kernel memory driven
+         by the current usage of anonymous memory ("committed AS") and
+         controlled by various sysfs-settable parameters.  Configuring
+         FRONTSWAP is highly recommended; if it is not configured, self-
+         ballooning is disabled by default but can be enabled with the
+         'selfballooning' kernel boot parameter.  If FRONTSWAP is configured,
+         frontswap-selfshrinking is enabled by default but can be disabled
+         with the 'noselfshrink' kernel boot parameter; and self-ballooning
+         is enabled by default but can be disabled with the 'noselfballooning'
+         kernel boot parameter.  Note that systems without a sufficiently
+         large swap device should not enable self-ballooning.
+
 config XEN_SCRUB_PAGES
        bool "Scrub pages before returning them to system"
        depends on XEN_BALLOON
@@ -105,4 +122,33 @@ config SWIOTLB_XEN
        depends on PCI
        select SWIOTLB
 
+config XEN_TMEM
+       bool
+       default y if (CLEANCACHE || FRONTSWAP)
+       help
+         Shim to interface in-kernel Transcendent Memory hooks
+         (e.g. cleancache and frontswap) to Xen tmem hypercalls.
+
+config XEN_PCIDEV_BACKEND
+       tristate "Xen PCI-device backend driver"
+       depends on PCI && X86 && XEN
+       depends on XEN_BACKEND
+       default m
+       help
+         The PCI device backend driver allows the kernel to export arbitrary
+         PCI devices to other guests. If you select this to be a module, you
+         will need to make sure no other driver has bound to the device(s)
+         you want to make visible to other guests.
+
+         The parameter "passthrough" allows you specify how you want the PCI
+         devices to appear in the guest. You can choose the default (0) where
+         PCI topology starts at 00.00.0, or (1) for passthrough if you want
+         the PCI devices topology appear the same as in the host.
+
+         The "hide" parameter (only applicable if backend driver is compiled
+         into the kernel) allows you to bind the PCI devices to this module
+         from the default device drivers. The argument is the list of PCI BDFs:
+         xen-pciback.hide=(03:00.0)(04:00.0)
+
+         If in doubt, say m.
 endmenu
index bbc1825..72bbb27 100644 (file)
@@ -1,6 +1,5 @@
 obj-y  += grant-table.o features.o events.o manage.o balloon.o
 obj-y  += xenbus/
-obj-y  += tmem.o
 
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o                      := $(nostackp)
@@ -9,14 +8,17 @@ obj-$(CONFIG_BLOCK)                   += biomerge.o
 obj-$(CONFIG_HOTPLUG_CPU)              += cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)              += xencomm.o
 obj-$(CONFIG_XEN_BALLOON)              += xen-balloon.o
+obj-$(CONFIG_XEN_SELFBALLOONING)       += xen-selfballoon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)           += xen-evtchn.o
 obj-$(CONFIG_XEN_GNTDEV)               += xen-gntdev.o
 obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)      += xen-gntalloc.o
 obj-$(CONFIG_XENFS)                    += xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
 obj-$(CONFIG_XEN_PLATFORM_PCI)         += xen-platform-pci.o
+obj-$(CONFIG_XEN_TMEM)                 += tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)                 += pci.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += xen-pciback/
 
 xen-evtchn-y                           := evtchn.o
 xen-gntdev-y                           := gntdev.o
index 816a449..d369965 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Xen implementation for transcendent memory (tmem)
  *
- * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Copyright (C) 2009-2011 Oracle Corp.  All rights reserved.
  * Author: Dan Magenheimer
  */
 
@@ -9,8 +9,14 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/module.h>
 #include <linux/cleancache.h>
 
+/* temporary ifdef until include/linux/frontswap.h is upstream */
+#ifdef CONFIG_FRONTSWAP
+#include <linux/frontswap.h>
+#endif
+
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <asm/xen/hypercall.h>
@@ -122,14 +128,8 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
        return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
 }
 
-static int xen_tmem_destroy_pool(u32 pool_id)
-{
-       struct tmem_oid oid = { { 0 } };
-
-       return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
-}
-
-int tmem_enabled;
+int tmem_enabled __read_mostly;
+EXPORT_SYMBOL(tmem_enabled);
 
 static int __init enable_tmem(char *s)
 {
@@ -139,6 +139,14 @@ static int __init enable_tmem(char *s)
 
 __setup("tmem", enable_tmem);
 
+#ifdef CONFIG_CLEANCACHE
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+       struct tmem_oid oid = { { 0 } };
+
+       return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
 /* cleancache ops */
 
 static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
@@ -240,18 +248,156 @@ static struct cleancache_ops tmem_cleancache_ops = {
        .init_shared_fs = tmem_cleancache_init_shared_fs,
        .init_fs = tmem_cleancache_init_fs
 };
+#endif
 
-static int __init xen_tmem_init(void)
+#ifdef CONFIG_FRONTSWAP
+/* frontswap tmem operations */
+
+/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+static int tmem_frontswap_poolid;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS              4
+#define SWIZ_MASK              ((1 << SWIZ_BITS) - 1)
+#define _oswiz(_type, _ind)    ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)            (_ind >> SWIZ_BITS)
+
+static inline struct tmem_oid oswiz(unsigned type, u32 ind)
 {
-       struct cleancache_ops old_ops;
+       struct tmem_oid oid = { .oid = { 0 } };
+       oid.oid[0] = _oswiz(type, ind);
+       return oid;
+}
 
+/* returns 0 if the page was successfully put into frontswap, -1 if not */
+static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
+                                  struct page *page)
+{
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       unsigned long pfn = page_to_pfn(page);
+       int pool = tmem_frontswap_poolid;
+       int ret;
+
+       if (pool < 0)
+               return -1;
+       if (ind64 != ind)
+               return -1;
+       mb(); /* ensure page is quiescent; tmem may address it with an alias */
+       ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       /* translate Xen tmem return values to linux semantics */
+       if (ret == 1)
+               return 0;
+       else
+               return -1;
+}
+
+/*
+ * returns 0 if the page was successfully gotten from frontswap, -1 if
+ * was not present (should never happen!)
+ */
+static int tmem_frontswap_get_page(unsigned type, pgoff_t offset,
+                                  struct page *page)
+{
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       unsigned long pfn = page_to_pfn(page);
+       int pool = tmem_frontswap_poolid;
+       int ret;
+
+       if (pool < 0)
+               return -1;
+       if (ind64 != ind)
+               return -1;
+       ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       /* translate Xen tmem return values to linux semantics */
+       if (ret == 1)
+               return 0;
+       else
+               return -1;
+}
+
+/* flush a single page from frontswap */
+static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       int pool = tmem_frontswap_poolid;
+
+       if (pool < 0)
+               return;
+       if (ind64 != ind)
+               return;
+       (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind));
+}
+
+/* flush all pages from the passed swaptype */
+static void tmem_frontswap_flush_area(unsigned type)
+{
+       int pool = tmem_frontswap_poolid;
+       int ind;
+
+       if (pool < 0)
+               return;
+       for (ind = SWIZ_MASK; ind >= 0; ind--)
+               (void)xen_tmem_flush_object(pool, oswiz(type, ind));
+}
+
+static void tmem_frontswap_init(unsigned ignored)
+{
+       struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID;
+
+       /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+       if (tmem_frontswap_poolid < 0)
+               tmem_frontswap_poolid =
+                   xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE);
+}
+
+static int __initdata use_frontswap = 1;
+
+static int __init no_frontswap(char *s)
+{
+       use_frontswap = 0;
+       return 1;
+}
+
+__setup("nofrontswap", no_frontswap);
+
+static struct frontswap_ops tmem_frontswap_ops = {
+       .put_page = tmem_frontswap_put_page,
+       .get_page = tmem_frontswap_get_page,
+       .flush_page = tmem_frontswap_flush_page,
+       .flush_area = tmem_frontswap_flush_area,
+       .init = tmem_frontswap_init
+};
+#endif
+
+static int __init xen_tmem_init(void)
+{
        if (!xen_domain())
                return 0;
+#ifdef CONFIG_FRONTSWAP
+       if (tmem_enabled && use_frontswap) {
+               char *s = "";
+               struct frontswap_ops old_ops =
+                       frontswap_register_ops(&tmem_frontswap_ops);
+
+               tmem_frontswap_poolid = -1;
+               if (old_ops.init != NULL)
+                       s = " (WARNING: frontswap_ops overridden)";
+               printk(KERN_INFO "frontswap enabled, RAM provided by "
+                                "Xen Transcendent Memory\n");
+       }
+#endif
 #ifdef CONFIG_CLEANCACHE
        BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
        if (tmem_enabled && use_cleancache) {
                char *s = "";
-               old_ops = cleancache_register_ops(&tmem_cleancache_ops);
+               struct cleancache_ops old_ops =
+                       cleancache_register_ops(&tmem_cleancache_ops);
                if (old_ops.init_fs != NULL)
                        s = " (WARNING: cleancache_ops overridden)";
                printk(KERN_INFO "cleancache enabled, RAM provided by "
index a4ff225..5c9dc43 100644 (file)
@@ -98,6 +98,8 @@ static int __init balloon_init(void)
 
        register_balloon(&balloon_sysdev);
 
+       register_xen_selfballooning(&balloon_sysdev);
+
        target_watch.callback = watch_target;
        xenstore_notifier.notifier_call = balloon_init_watcher;
 
diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile
new file mode 100644 (file)
index 0000000..ffe0ad3
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+                conf_space_capability.o \
+                conf_space_quirks.o vpci.o \
+                passthrough.o
diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c
new file mode 100644 (file)
index 0000000..a803144
--- /dev/null
@@ -0,0 +1,438 @@
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ *               exported PCI Devices.
+ *               It's dangerous to allow PCI Driver Domains to change their
+ *               device's resources (memory, i/o ports, interrupts). We need to
+ *               restrict changes to certain PCI Configuration registers:
+ *               BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DRV_NAME       "xen-pciback"
+static int permissive;
+module_param(permissive, bool, 0644);
+
+/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word,
+ * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. */
+#define DEFINE_PCI_CONFIG(op, size, type)                      \
+int xen_pcibk_##op##_config_##size                             \
+(struct pci_dev *dev, int offset, type value, void *data)      \
+{                                                              \
+       return pci_##op##_config_##size(dev, offset, value);    \
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+                          const struct config_field_entry *entry,
+                          int offset, u32 *value)
+{
+       int ret = 0;
+       const struct config_field *field = entry->field;
+
+       *value = 0;
+
+       switch (field->size) {
+       case 1:
+               if (field->u.b.read)
+                       ret = field->u.b.read(dev, offset, (u8 *) value,
+                                             entry->data);
+               break;
+       case 2:
+               if (field->u.w.read)
+                       ret = field->u.w.read(dev, offset, (u16 *) value,
+                                             entry->data);
+               break;
+       case 4:
+               if (field->u.dw.read)
+                       ret = field->u.dw.read(dev, offset, value, entry->data);
+               break;
+       }
+       return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+                           const struct config_field_entry *entry,
+                           int offset, u32 value)
+{
+       int ret = 0;
+       const struct config_field *field = entry->field;
+
+       switch (field->size) {
+       case 1:
+               if (field->u.b.write)
+                       ret = field->u.b.write(dev, offset, (u8) value,
+                                              entry->data);
+               break;
+       case 2:
+               if (field->u.w.write)
+                       ret = field->u.w.write(dev, offset, (u16) value,
+                                              entry->data);
+               break;
+       case 4:
+               if (field->u.dw.write)
+                       ret = field->u.dw.write(dev, offset, value,
+                                               entry->data);
+               break;
+       }
+       return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+       if (size == 1)
+               return 0xff;
+       else if (size == 2)
+               return 0xffff;
+       else
+               return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+       /* Validate request (no un-aligned requests) */
+       if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+               return 1;
+       return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+                             int offset)
+{
+       if (offset >= 0) {
+               new_val_mask <<= (offset * 8);
+               new_val <<= (offset * 8);
+       } else {
+               new_val_mask >>= (offset * -8);
+               new_val >>= (offset * -8);
+       }
+       val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+       return val;
+}
+
+static int pcibios_err_to_errno(int err)
+{
+       switch (err) {
+       case PCIBIOS_SUCCESSFUL:
+               return XEN_PCI_ERR_success;
+       case PCIBIOS_DEVICE_NOT_FOUND:
+               return XEN_PCI_ERR_dev_not_found;
+       case PCIBIOS_BAD_REGISTER_NUMBER:
+               return XEN_PCI_ERR_invalid_offset;
+       case PCIBIOS_FUNC_NOT_SUPPORTED:
+               return XEN_PCI_ERR_not_implemented;
+       case PCIBIOS_SET_FAILED:
+               return XEN_PCI_ERR_access_denied;
+       }
+       return err;
+}
+
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+                         u32 *ret_val)
+{
+       int err = 0;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       const struct config_field_entry *cfg_entry;
+       const struct config_field *field;
+       int req_start, req_end, field_start, field_end;
+       /* if read fails for any reason, return 0
+        * (as if device didn't respond) */
+       u32 value = 0, tmp_val;
+
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n",
+                      pci_name(dev), size, offset);
+
+       if (!valid_request(offset, size)) {
+               err = XEN_PCI_ERR_invalid_offset;
+               goto out;
+       }
+
+       /* Get the real value first, then modify as appropriate */
+       switch (size) {
+       case 1:
+               err = pci_read_config_byte(dev, offset, (u8 *) &value);
+               break;
+       case 2:
+               err = pci_read_config_word(dev, offset, (u16 *) &value);
+               break;
+       case 4:
+               err = pci_read_config_dword(dev, offset, &value);
+               break;
+       }
+
+       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+               field = cfg_entry->field;
+
+               req_start = offset;
+               req_end = offset + size;
+               field_start = OFFSET(cfg_entry);
+               field_end = OFFSET(cfg_entry) + field->size;
+
+               if ((req_start >= field_start && req_start < field_end)
+                   || (req_end > field_start && req_end <= field_end)) {
+                       err = conf_space_read(dev, cfg_entry, field_start,
+                                             &tmp_val);
+                       if (err)
+                               goto out;
+
+                       value = merge_value(value, tmp_val,
+                                           get_mask(field->size),
+                                           field_start - req_start);
+               }
+       }
+
+out:
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n",
+                      pci_name(dev), size, offset, value);
+
+       *ret_val = value;
+       return pcibios_err_to_errno(err);
+}
+
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+       int err = 0, handled = 0;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       const struct config_field_entry *cfg_entry;
+       const struct config_field *field;
+       u32 tmp_val;
+       int req_start, req_end, field_start, field_end;
+
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG
+                      DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n",
+                      pci_name(dev), size, offset, value);
+
+       if (!valid_request(offset, size))
+               return XEN_PCI_ERR_invalid_offset;
+
+       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+               field = cfg_entry->field;
+
+               req_start = offset;
+               req_end = offset + size;
+               field_start = OFFSET(cfg_entry);
+               field_end = OFFSET(cfg_entry) + field->size;
+
+               if ((req_start >= field_start && req_start < field_end)
+                   || (req_end > field_start && req_end <= field_end)) {
+                       tmp_val = 0;
+
+                       err = xen_pcibk_config_read(dev, field_start,
+                                                 field->size, &tmp_val);
+                       if (err)
+                               break;
+
+                       tmp_val = merge_value(tmp_val, value, get_mask(size),
+                                             req_start - field_start);
+
+                       err = conf_space_write(dev, cfg_entry, field_start,
+                                              tmp_val);
+
+                       /* handled is set true here, but not every byte
+                        * may have been written! Properly detecting if
+                        * every byte is handled is unnecessary as the
+                        * flag is used to detect devices that need
+                        * special helpers to work correctly.
+                        */
+                       handled = 1;
+               }
+       }
+
+       if (!handled && !err) {
+               /* By default, anything not specificially handled above is
+                * read-only. The permissive flag changes this behavior so
+                * that anything not specifically handled above is writable.
+                * This means that some fields may still be read-only because
+                * they have entries in the config_field list that intercept
+                * the write and do nothing. */
+               if (dev_data->permissive || permissive) {
+                       switch (size) {
+                       case 1:
+                               err = pci_write_config_byte(dev, offset,
+                                                           (u8) value);
+                               break;
+                       case 2:
+                               err = pci_write_config_word(dev, offset,
+                                                           (u16) value);
+                               break;
+                       case 4:
+                               err = pci_write_config_dword(dev, offset,
+                                                            (u32) value);
+                               break;
+                       }
+               } else if (!dev_data->warned_on_write) {
+                       dev_data->warned_on_write = 1;
+                       dev_warn(&dev->dev, "Driver tried to write to a "
+                                "read-only configuration space field at offset"
+                                " 0x%x, size %d. This may be harmless, but if "
+                                "you have problems with your device:\n"
+                                "1) see permissive attribute in sysfs\n"
+                                "2) report problems to the xen-devel "
+                                "mailing list along with details of your "
+                                "device obtained from lspci.\n", offset, size);
+               }
+       }
+
+       return pcibios_err_to_errno(err);
+}
+
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev)
+{
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       struct config_field_entry *cfg_entry, *t;
+       const struct config_field *field;
+
+       dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
+                          "configuration space fields\n");
+       if (!dev_data)
+               return;
+
+       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+               field = cfg_entry->field;
+
+               if (field->clean) {
+                       field->clean((struct config_field *)field);
+
+                       kfree(cfg_entry->data);
+
+                       list_del(&cfg_entry->list);
+                       kfree(cfg_entry);
+               }
+
+       }
+}
+
+void xen_pcibk_config_reset_dev(struct pci_dev *dev)
+{
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       const struct config_field_entry *cfg_entry;
+       const struct config_field *field;
+
+       dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+       if (!dev_data)
+               return;
+
+       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+               field = cfg_entry->field;
+
+               if (field->reset)
+                       field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+       }
+}
+
+void xen_pcibk_config_free_dev(struct pci_dev *dev)
+{
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       struct config_field_entry *cfg_entry, *t;
+       const struct config_field *field;
+
+       dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+       if (!dev_data)
+               return;
+
+       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+               list_del(&cfg_entry->list);
+
+               field = cfg_entry->field;
+
+               if (field->release)
+                       field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+               kfree(cfg_entry);
+       }
+}
+
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+                                   const struct config_field *field,
+                                   unsigned int base_offset)
+{
+       int err = 0;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       struct config_field_entry *cfg_entry;
+       void *tmp;
+
+       cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+       if (!cfg_entry) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       cfg_entry->data = NULL;
+       cfg_entry->field = field;
+       cfg_entry->base_offset = base_offset;
+
+       /* silently ignore duplicate fields */
+       err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry));
+       if (err)
+               goto out;
+
+       if (field->init) {
+               tmp = field->init(dev, OFFSET(cfg_entry));
+
+               if (IS_ERR(tmp)) {
+                       err = PTR_ERR(tmp);
+                       goto out;
+               }
+
+               cfg_entry->data = tmp;
+       }
+
+       dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+               OFFSET(cfg_entry));
+       list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+       if (err)
+               kfree(cfg_entry);
+
+       return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs) so that we can
+ * keep the client from manipulating them directly.
+ */
+int xen_pcibk_config_init_dev(struct pci_dev *dev)
+{
+       int err = 0;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+
+       dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+       INIT_LIST_HEAD(&dev_data->config_fields);
+
+       err = xen_pcibk_config_header_add_fields(dev);
+       if (err)
+               goto out;
+
+       err = xen_pcibk_config_capability_add_fields(dev);
+       if (err)
+               goto out;
+
+       err = xen_pcibk_config_quirks_init(dev);
+
+out:
+       return err;
+}
+
+int xen_pcibk_config_init(void)
+{
+       return xen_pcibk_config_capability_init();
+}
diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h
new file mode 100644 (file)
index 0000000..e56c934
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+                                void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+                               void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+                               void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+                               void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+                              void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+                              void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+       unsigned int offset;
+       unsigned int size;
+       unsigned int mask;
+       conf_field_init init;
+       conf_field_reset reset;
+       conf_field_free release;
+       void (*clean) (struct config_field *field);
+       union {
+               struct {
+                       conf_dword_write write;
+                       conf_dword_read read;
+               } dw;
+               struct {
+                       conf_word_write write;
+                       conf_word_read read;
+               } w;
+               struct {
+                       conf_byte_write write;
+                       conf_byte_read read;
+               } b;
+       } u;
+       struct list_head list;
+};
+
+struct config_field_entry {
+       struct list_head list;
+       const struct config_field *field;
+       unsigned int base_offset;
+       void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+                                   const struct config_field *field,
+                                   unsigned int offset);
+
+static inline int xen_pcibk_config_add_field(struct pci_dev *dev,
+                                          const struct config_field *field)
+{
+       return xen_pcibk_config_add_field_offset(dev, field, 0);
+}
+
+static inline int xen_pcibk_config_add_fields(struct pci_dev *dev,
+                                           const struct config_field *field)
+{
+       int i, err = 0;
+       for (i = 0; field[i].size != 0; i++) {
+               err = xen_pcibk_config_add_field(dev, &field[i]);
+               if (err)
+                       break;
+       }
+       return err;
+}
+
+static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev,
+                                       const struct config_field *field,
+                                       unsigned int offset)
+{
+       int i, err = 0;
+       for (i = 0; field[i].size != 0; i++) {
+               err = xen_pcibk_config_add_field_offset(dev, &field[i], offset);
+               if (err)
+                       break;
+       }
+       return err;
+}
+
+/* Read/Write the real configuration space */
+int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+                              void *data);
+int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+                              void *data);
+int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+                               void *data);
+int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+                                void *data);
+int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value,
+                               void *data);
+int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+                                void *data);
+
+int xen_pcibk_config_capability_init(void);
+
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev);
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev);
+
+#endif                         /* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
new file mode 100644 (file)
index 0000000..7f83e90
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ *               in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+static LIST_HEAD(capabilities);
+struct xen_pcibk_config_capability {
+       struct list_head cap_list;
+
+       int capability;
+
+       /* If the device has the capability found above, add these fields */
+       const struct config_field *fields;
+};
+
+static const struct config_field caplist_header[] = {
+       {
+        .offset    = PCI_CAP_LIST_ID,
+        .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+        .u.w.read  = xen_pcibk_read_config_word,
+        .u.w.write = NULL,
+       },
+       {}
+};
+
+static inline void register_capability(struct xen_pcibk_config_capability *cap)
+{
+       list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev)
+{
+       int err = 0;
+       struct xen_pcibk_config_capability *cap;
+       int cap_offset;
+
+       list_for_each_entry(cap, &capabilities, cap_list) {
+               cap_offset = pci_find_capability(dev, cap->capability);
+               if (cap_offset) {
+                       dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+                               cap->capability, cap_offset);
+
+                       err = xen_pcibk_config_add_fields_offset(dev,
+                                                              caplist_header,
+                                                              cap_offset);
+                       if (err)
+                               goto out;
+                       err = xen_pcibk_config_add_fields_offset(dev,
+                                                              cap->fields,
+                                                              cap_offset);
+                       if (err)
+                               goto out;
+               }
+       }
+
+out:
+       return err;
+}
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+                            void *data)
+{
+       /* Disallow writes to the vital product data */
+       if (value & PCI_VPD_ADDR_F)
+               return PCIBIOS_SET_FAILED;
+       else
+               return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = {
+       {
+        .offset    = PCI_VPD_ADDR,
+        .size      = 2,
+        .u.w.read  = xen_pcibk_read_config_word,
+        .u.w.write = vpd_address_write,
+        },
+       {
+        .offset     = PCI_VPD_DATA,
+        .size       = 4,
+        .u.dw.read  = xen_pcibk_read_config_dword,
+        .u.dw.write = NULL,
+        },
+       {}
+};
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+                       void *data)
+{
+       int err;
+       u16 real_value;
+
+       err = pci_read_config_word(dev, offset, &real_value);
+       if (err)
+               goto out;
+
+       *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+       return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+                        void *data)
+{
+       int err;
+       u16 old_value;
+       pci_power_t new_state, old_state;
+
+       err = pci_read_config_word(dev, offset, &old_value);
+       if (err)
+               goto out;
+
+       old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+       new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+       new_value &= PM_OK_BITS;
+       if ((old_value & PM_OK_BITS) != new_value) {
+               new_value = (old_value & ~PM_OK_BITS) | new_value;
+               err = pci_write_config_word(dev, offset, new_value);
+               if (err)
+                       goto out;
+       }
+
+       /* Let pci core handle the power management change */
+       dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+       err = pci_set_power_state(dev, new_state);
+       if (err) {
+               err = PCIBIOS_SET_FAILED;
+               goto out;
+       }
+
+ out:
+       return err;
+}
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+       int err;
+       u16 value;
+
+       err = pci_read_config_word(dev, offset, &value);
+       if (err)
+               goto out;
+
+       if (value & PCI_PM_CTRL_PME_ENABLE) {
+               value &= ~PCI_PM_CTRL_PME_ENABLE;
+               err = pci_write_config_word(dev, offset, value);
+       }
+
+out:
+       return ERR_PTR(err);
+}
+
+static const struct config_field caplist_pm[] = {
+       {
+               .offset     = PCI_PM_PMC,
+               .size       = 2,
+               .u.w.read   = pm_caps_read,
+       },
+       {
+               .offset     = PCI_PM_CTRL,
+               .size       = 2,
+               .init       = pm_ctrl_init,
+               .u.w.read   = xen_pcibk_read_config_word,
+               .u.w.write  = pm_ctrl_write,
+       },
+       {
+               .offset     = PCI_PM_PPB_EXTENSIONS,
+               .size       = 1,
+               .u.b.read   = xen_pcibk_read_config_byte,
+       },
+       {
+               .offset     = PCI_PM_DATA_REGISTER,
+               .size       = 1,
+               .u.b.read   = xen_pcibk_read_config_byte,
+       },
+       {}
+};
+
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
+       .capability = PCI_CAP_ID_PM,
+       .fields = caplist_pm,
+};
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
+       .capability = PCI_CAP_ID_VPD,
+       .fields = caplist_vpd,
+};
+
+int xen_pcibk_config_capability_init(void)
+{
+       register_capability(&xen_pcibk_config_capability_vpd);
+       register_capability(&xen_pcibk_config_capability_pm);
+
+       return 0;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
new file mode 100644 (file)
index 0000000..da3cbdf
--- /dev/null
@@ -0,0 +1,386 @@
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+       u32 val;
+       u32 len_val;
+       int which;
+};
+
+#define DRV_NAME       "xen-pciback"
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+       int i;
+       int ret;
+
+       ret = xen_pcibk_read_config_word(dev, offset, value, data);
+       if (!atomic_read(&dev->enable_cnt))
+               return ret;
+
+       for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+               if (dev->resource[i].flags & IORESOURCE_IO)
+                       *value |= PCI_COMMAND_IO;
+               if (dev->resource[i].flags & IORESOURCE_MEM)
+                       *value |= PCI_COMMAND_MEMORY;
+       }
+
+       return ret;
+}
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       int err;
+
+       dev_data = pci_get_drvdata(dev);
+       if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+               if (unlikely(verbose_request))
+                       printk(KERN_DEBUG DRV_NAME ": %s: enable\n",
+                              pci_name(dev));
+               err = pci_enable_device(dev);
+               if (err)
+                       return err;
+               if (dev_data)
+                       dev_data->enable_intx = 1;
+       } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+               if (unlikely(verbose_request))
+                       printk(KERN_DEBUG DRV_NAME ": %s: disable\n",
+                              pci_name(dev));
+               pci_disable_device(dev);
+               if (dev_data)
+                       dev_data->enable_intx = 0;
+       }
+
+       if (!dev->is_busmaster && is_master_cmd(value)) {
+               if (unlikely(verbose_request))
+                       printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n",
+                              pci_name(dev));
+               pci_set_master(dev);
+       }
+
+       if (value & PCI_COMMAND_INVALIDATE) {
+               if (unlikely(verbose_request))
+                       printk(KERN_DEBUG
+                              DRV_NAME ": %s: enable memory-write-invalidate\n",
+                              pci_name(dev));
+               err = pci_set_mwi(dev);
+               if (err) {
+                       printk(KERN_WARNING
+                              DRV_NAME ": %s: cannot enable "
+                              "memory-write-invalidate (%d)\n",
+                              pci_name(dev), err);
+                       value &= ~PCI_COMMAND_INVALIDATE;
+               }
+       }
+
+       return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+       struct pci_bar_info *bar = data;
+
+       if (unlikely(!bar)) {
+               printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n",
+                      pci_name(dev));
+               return XEN_PCI_ERR_op_failed;
+       }
+
+       /* A write to obtain the length must happen as a 32-bit write.
+        * This does not (yet) support writing individual bytes
+        */
+       if (value == ~PCI_ROM_ADDRESS_ENABLE)
+               bar->which = 1;
+       else {
+               u32 tmpval;
+               pci_read_config_dword(dev, offset, &tmpval);
+               if (tmpval != bar->val && value == bar->val) {
+                       /* Allow restoration of bar value. */
+                       pci_write_config_dword(dev, offset, bar->val);
+               }
+               bar->which = 0;
+       }
+
+       /* Do we need to support enabling/disabling the rom address here? */
+
+       return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+       struct pci_bar_info *bar = data;
+
+       if (unlikely(!bar)) {
+               printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n",
+                      pci_name(dev));
+               return XEN_PCI_ERR_op_failed;
+       }
+
+       /* A write to obtain the length must happen as a 32-bit write.
+        * This does not (yet) support writing individual bytes
+        */
+       if (value == ~0)
+               bar->which = 1;
+       else {
+               u32 tmpval;
+               pci_read_config_dword(dev, offset, &tmpval);
+               if (tmpval != bar->val && value == bar->val) {
+                       /* Allow restoration of bar value. */
+                       pci_write_config_dword(dev, offset, bar->val);
+               }
+               bar->which = 0;
+       }
+
+       return 0;
+}
+
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+       struct pci_bar_info *bar = data;
+
+       if (unlikely(!bar)) {
+               printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n",
+                      pci_name(dev));
+               return XEN_PCI_ERR_op_failed;
+       }
+
+       *value = bar->which ? bar->len_val : bar->val;
+
+       return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+                               struct pci_bar_info *bar_info, int offset,
+                               u32 len_mask)
+{
+       int     pos;
+       struct resource *res = dev->resource;
+
+       if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
+               pos = PCI_ROM_RESOURCE;
+       else {
+               pos = (offset - PCI_BASE_ADDRESS_0) / 4;
+               if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
+                               PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+                          (PCI_BASE_ADDRESS_SPACE_MEMORY |
+                               PCI_BASE_ADDRESS_MEM_TYPE_64))) {
+                       bar_info->val = res[pos - 1].start >> 32;
+                       bar_info->len_val = res[pos - 1].end >> 32;
+                       return;
+               }
+       }
+
+       bar_info->val = res[pos].start |
+                       (res[pos].flags & PCI_REGION_FLAG_MASK);
+       bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+       if (!bar)
+               return ERR_PTR(-ENOMEM);
+
+       read_dev_bar(dev, bar, offset, ~0);
+       bar->which = 0;
+
+       return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+       if (!bar)
+               return ERR_PTR(-ENOMEM);
+
+       read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+       bar->which = 0;
+
+       return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+       struct pci_bar_info *bar = data;
+
+       bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+       kfree(data);
+}
+
+static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset,
+                              u16 *value, void *data)
+{
+       *value = dev->vendor;
+
+       return 0;
+}
+
+static int xen_pcibk_read_device(struct pci_dev *dev, int offset,
+                              u16 *value, void *data)
+{
+       *value = dev->device;
+
+       return 0;
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+                         void *data)
+{
+       *value = (u8) dev->irq;
+
+       return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+       u8 cur_value;
+       int err;
+
+       err = pci_read_config_byte(dev, offset, &cur_value);
+       if (err)
+               goto out;
+
+       if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+           || value == PCI_BIST_START)
+               err = pci_write_config_byte(dev, offset, value);
+
+out:
+       return err;
+}
+
+static const struct config_field header_common[] = {
+       {
+        .offset    = PCI_VENDOR_ID,
+        .size      = 2,
+        .u.w.read  = xen_pcibk_read_vendor,
+       },
+       {
+        .offset    = PCI_DEVICE_ID,
+        .size      = 2,
+        .u.w.read  = xen_pcibk_read_device,
+       },
+       {
+        .offset    = PCI_COMMAND,
+        .size      = 2,
+        .u.w.read  = command_read,
+        .u.w.write = command_write,
+       },
+       {
+        .offset    = PCI_INTERRUPT_LINE,
+        .size      = 1,
+        .u.b.read  = interrupt_read,
+       },
+       {
+        .offset    = PCI_INTERRUPT_PIN,
+        .size      = 1,
+        .u.b.read  = xen_pcibk_read_config_byte,
+       },
+       {
+        /* Any side effects of letting driver domain control cache line? */
+        .offset    = PCI_CACHE_LINE_SIZE,
+        .size      = 1,
+        .u.b.read  = xen_pcibk_read_config_byte,
+        .u.b.write = xen_pcibk_write_config_byte,
+       },
+       {
+        .offset    = PCI_LATENCY_TIMER,
+        .size      = 1,
+        .u.b.read  = xen_pcibk_read_config_byte,
+       },
+       {
+        .offset    = PCI_BIST,
+        .size      = 1,
+        .u.b.read  = xen_pcibk_read_config_byte,
+        .u.b.write = bist_write,
+       },
+       {}
+};
+
+#define CFG_FIELD_BAR(reg_offset)                      \
+       {                                               \
+       .offset     = reg_offset,                       \
+       .size       = 4,                                \
+       .init       = bar_init,                         \
+       .reset      = bar_reset,                        \
+       .release    = bar_release,                      \
+       .u.dw.read  = bar_read,                         \
+       .u.dw.write = bar_write,                        \
+       }
+
+#define CFG_FIELD_ROM(reg_offset)                      \
+       {                                               \
+       .offset     = reg_offset,                       \
+       .size       = 4,                                \
+       .init       = rom_init,                         \
+       .reset      = bar_reset,                        \
+       .release    = bar_release,                      \
+       .u.dw.read  = bar_read,                         \
+       .u.dw.write = rom_write,                        \
+       }
+
+static const struct config_field header_0[] = {
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+       CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+       {}
+};
+
+static const struct config_field header_1[] = {
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+       CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+       {}
+};
+
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev)
+{
+       int err;
+
+       err = xen_pcibk_config_add_fields(dev, header_common);
+       if (err)
+               goto out;
+
+       switch (dev->hdr_type) {
+       case PCI_HEADER_TYPE_NORMAL:
+               err = xen_pcibk_config_add_fields(dev, header_0);
+               break;
+
+       case PCI_HEADER_TYPE_BRIDGE:
+               err = xen_pcibk_config_add_fields(dev, header_1);
+               break;
+
+       default:
+               err = -EINVAL;
+               printk(KERN_ERR DRV_NAME ": %s: Unsupported header type %d!\n",
+                      pci_name(dev), dev->hdr_type);
+               break;
+       }
+
+out:
+       return err;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c
new file mode 100644 (file)
index 0000000..921a889
--- /dev/null
@@ -0,0 +1,140 @@
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(xen_pcibk_quirks);
+#define        DRV_NAME        "xen-pciback"
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+       if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+           (id->device == PCI_ANY_ID || id->device == dev->device) &&
+           (id->subvendor == PCI_ANY_ID ||
+                               id->subvendor == dev->subsystem_vendor) &&
+           (id->subdevice == PCI_ANY_ID ||
+                               id->subdevice == dev->subsystem_device) &&
+           !((id->class ^ dev->class) & id->class_mask))
+               return id;
+       return NULL;
+}
+
+static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev)
+{
+       struct xen_pcibk_config_quirk *tmp_quirk;
+
+       list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list)
+               if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+                       goto out;
+       tmp_quirk = NULL;
+       printk(KERN_DEBUG DRV_NAME
+              ":quirk didn't match any device xen_pciback knows about\n");
+out:
+       return tmp_quirk;
+}
+
+static inline void register_quirk(struct xen_pcibk_config_quirk *quirk)
+{
+       list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks);
+}
+
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+       int ret = 0;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+       struct config_field_entry *cfg_entry;
+
+       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+               if (OFFSET(cfg_entry) == reg) {
+                       ret = 1;
+                       break;
+               }
+       }
+       return ret;
+}
+
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+                                   *field)
+{
+       int err = 0;
+
+       switch (field->size) {
+       case 1:
+               field->u.b.read = xen_pcibk_read_config_byte;
+               field->u.b.write = xen_pcibk_write_config_byte;
+               break;
+       case 2:
+               field->u.w.read = xen_pcibk_read_config_word;
+               field->u.w.write = xen_pcibk_write_config_word;
+               break;
+       case 4:
+               field->u.dw.read = xen_pcibk_read_config_dword;
+               field->u.dw.write = xen_pcibk_write_config_dword;
+               break;
+       default:
+               err = -EINVAL;
+               goto out;
+       }
+
+       xen_pcibk_config_add_field(dev, field);
+
+out:
+       return err;
+}
+
+int xen_pcibk_config_quirks_init(struct pci_dev *dev)
+{
+       struct xen_pcibk_config_quirk *quirk;
+       int ret = 0;
+
+       quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+       if (!quirk) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       quirk->devid.vendor = dev->vendor;
+       quirk->devid.device = dev->device;
+       quirk->devid.subvendor = dev->subsystem_vendor;
+       quirk->devid.subdevice = dev->subsystem_device;
+       quirk->devid.class = 0;
+       quirk->devid.class_mask = 0;
+       quirk->devid.driver_data = 0UL;
+
+       quirk->pdev = dev;
+
+       register_quirk(quirk);
+out:
+       return ret;
+}
+
+void xen_pcibk_config_field_free(struct config_field *field)
+{
+       kfree(field);
+}
+
+int xen_pcibk_config_quirk_release(struct pci_dev *dev)
+{
+       struct xen_pcibk_config_quirk *quirk;
+       int ret = 0;
+
+       quirk = xen_pcibk_find_quirk(dev);
+       if (!quirk) {
+               ret = -ENXIO;
+               goto out;
+       }
+
+       list_del(&quirk->quirks_list);
+       kfree(quirk);
+
+out:
+       return ret;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h
new file mode 100644 (file)
index 0000000..cfcc517
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct xen_pcibk_config_quirk {
+       struct list_head quirks_list;
+       struct pci_device_id devid;
+       struct pci_dev *pdev;
+};
+
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+                                   *field);
+
+int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int xen_pcibk_config_quirks_init(struct pci_dev *dev);
+
+void xen_pcibk_config_field_free(struct config_field *field);
+
+int xen_pcibk_config_quirk_release(struct pci_dev *dev);
+
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c
new file mode 100644 (file)
index 0000000..1d32a9a
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+       /* Access to dev_list must be protected by lock */
+       struct list_head dev_list;
+       spinlock_t lock;
+};
+
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+                                              unsigned int domain,
+                                              unsigned int bus,
+                                              unsigned int devfn)
+{
+       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+       struct pci_dev_entry *dev_entry;
+       struct pci_dev *dev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev_data->lock, flags);
+
+       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+               if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+                   && bus == (unsigned int)dev_entry->dev->bus->number
+                   && devfn == dev_entry->dev->devfn) {
+                       dev = dev_entry->dev;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&dev_data->lock, flags);
+
+       return dev;
+}
+
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+                                  struct pci_dev *dev,
+                                  int devid, publish_pci_dev_cb publish_cb)
+{
+       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+       struct pci_dev_entry *dev_entry;
+       unsigned long flags;
+       unsigned int domain, bus, devfn;
+       int err;
+
+       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+       if (!dev_entry)
+               return -ENOMEM;
+       dev_entry->dev = dev;
+
+       spin_lock_irqsave(&dev_data->lock, flags);
+       list_add_tail(&dev_entry->list, &dev_data->dev_list);
+       spin_unlock_irqrestore(&dev_data->lock, flags);
+
+       /* Publish this device. */
+       domain = (unsigned int)pci_domain_nr(dev->bus);
+       bus = (unsigned int)dev->bus->number;
+       devfn = dev->devfn;
+       err = publish_cb(pdev, domain, bus, devfn, devid);
+
+       return err;
+}
+
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+                                       struct pci_dev *dev)
+{
+       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+       struct pci_dev_entry *dev_entry, *t;
+       struct pci_dev *found_dev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev_data->lock, flags);
+
+       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+               if (dev_entry->dev == dev) {
+                       list_del(&dev_entry->list);
+                       found_dev = dev_entry->dev;
+                       kfree(dev_entry);
+               }
+       }
+
+       spin_unlock_irqrestore(&dev_data->lock, flags);
+
+       if (found_dev)
+               pcistub_put_pci_dev(found_dev);
+}
+
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+       struct passthrough_dev_data *dev_data;
+
+       dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+       if (!dev_data)
+               return -ENOMEM;
+
+       spin_lock_init(&dev_data->lock);
+
+       INIT_LIST_HEAD(&dev_data->dev_list);
+
+       pdev->pci_dev_data = dev_data;
+
+       return 0;
+}
+
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+                                        publish_pci_root_cb publish_root_cb)
+{
+       int err = 0;
+       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+       struct pci_dev_entry *dev_entry, *e, *tmp;
+       struct pci_dev *dev;
+       int found;
+       unsigned int domain, bus;
+
+       spin_lock(&dev_data->lock);
+
+       list_for_each_entry_safe(dev_entry, tmp, &dev_data->dev_list, list) {
+               /* Only publish this device as a root if none of its
+                * parent bridges are exported
+                */
+               found = 0;
+               dev = dev_entry->dev->bus->self;
+               for (; !found && dev != NULL; dev = dev->bus->self) {
+                       list_for_each_entry(e, &dev_data->dev_list, list) {
+                               if (dev == e->dev) {
+                                       found = 1;
+                                       break;
+                               }
+                       }
+               }
+
+               domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+               bus = (unsigned int)dev_entry->dev->bus->number;
+
+               if (!found) {
+                       spin_unlock(&dev_data->lock);
+                       err = publish_root_cb(pdev, domain, bus);
+                       if (err)
+                               break;
+                       spin_lock(&dev_data->lock);
+               }
+       }
+
+       if (!err)
+               spin_unlock(&dev_data->lock);
+
+       return err;
+}
+
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+       struct pci_dev_entry *dev_entry, *t;
+
+       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+               list_del(&dev_entry->list);
+               pcistub_put_pci_dev(dev_entry->dev);
+               kfree(dev_entry);
+       }
+
+       kfree(dev_data);
+       pdev->pci_dev_data = NULL;
+}
+
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+                                       struct xen_pcibk_device *pdev,
+                                       unsigned int *domain, unsigned int *bus,
+                                       unsigned int *devfn)
+{
+       *domain = pci_domain_nr(pcidev->bus);
+       *bus = pcidev->bus->number;
+       *devfn = pcidev->devfn;
+       return 1;
+}
+
+struct xen_pcibk_backend xen_pcibk_passthrough_backend = {
+       .name           = "passthrough",
+       .init           = __xen_pcibk_init_devices,
+       .free           = __xen_pcibk_release_devices,
+       .find           = __xen_pcibk_get_pcifront_dev,
+       .publish        = __xen_pcibk_publish_pci_roots,
+       .release        = __xen_pcibk_release_pci_dev,
+       .add            = __xen_pcibk_add_pci_dev,
+       .get            = __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
new file mode 100644 (file)
index 0000000..aec214a
--- /dev/null
@@ -0,0 +1,1376 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DRV_NAME       "xen-pciback"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t xen_pcibk_aer_wait_queue;
+/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops,
+* We want to avoid in middle of AER ops, xen_pcibk devices is being removed
+*/
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+       struct list_head slot_list;
+       int domain;
+       unsigned char bus;
+       unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+       struct kref kref;
+       struct list_head dev_list;
+       spinlock_t lock;
+
+       struct pci_dev *dev;
+       struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+
+       dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+       psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+       if (!psdev)
+               return NULL;
+
+       psdev->dev = pci_dev_get(dev);
+       if (!psdev->dev) {
+               kfree(psdev);
+               return NULL;
+       }
+
+       kref_init(&psdev->kref);
+       spin_lock_init(&psdev->lock);
+
+       return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+       struct pcistub_device *psdev;
+
+       psdev = container_of(kref, struct pcistub_device, kref);
+
+       dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+       xen_unregister_device_domain_owner(psdev->dev);
+
+       /* Clean-up the device */
+       xen_pcibk_reset_device(psdev->dev);
+       xen_pcibk_config_free_dyn_fields(psdev->dev);
+       xen_pcibk_config_free_dev(psdev->dev);
+       kfree(pci_get_drvdata(psdev->dev));
+       pci_set_drvdata(psdev->dev, NULL);
+
+       pci_dev_put(psdev->dev);
+
+       kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+       kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+       kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+                                                 int slot, int func)
+{
+       struct pcistub_device *psdev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (psdev->dev != NULL
+                   && domain == pci_domain_nr(psdev->dev->bus)
+                   && bus == psdev->dev->bus->number
+                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+                       pcistub_device_get(psdev);
+                       goto out;
+               }
+       }
+
+       /* didn't find it */
+       psdev = NULL;
+
+out:
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev,
+                                                 struct pcistub_device *psdev)
+{
+       struct pci_dev *pci_dev = NULL;
+       unsigned long flags;
+
+       pcistub_device_get(psdev);
+
+       spin_lock_irqsave(&psdev->lock, flags);
+       if (!psdev->pdev) {
+               psdev->pdev = pdev;
+               pci_dev = psdev->dev;
+       }
+       spin_unlock_irqrestore(&psdev->lock, flags);
+
+       if (!pci_dev)
+               pcistub_device_put(psdev);
+
+       return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+                                           int domain, int bus,
+                                           int slot, int func)
+{
+       struct pcistub_device *psdev;
+       struct pci_dev *found_dev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (psdev->dev != NULL
+                   && domain == pci_domain_nr(psdev->dev->bus)
+                   && bus == psdev->dev->bus->number
+                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+                                   struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+       struct pci_dev *found_dev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (psdev->dev == dev) {
+                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev, *found_psdev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (psdev->dev == dev) {
+                       found_psdev = psdev;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+       /*hold this lock for avoiding breaking link between
+       * pcistub and xen_pcibk when AER is in processing
+       */
+       down_write(&pcistub_sem);
+       /* Cleanup our device
+        * (so it's ready for the next domain)
+        */
+       xen_pcibk_reset_device(found_psdev->dev);
+       xen_pcibk_config_free_dyn_fields(found_psdev->dev);
+       xen_pcibk_config_reset_dev(found_psdev->dev);
+
+       spin_lock_irqsave(&found_psdev->lock, flags);
+       found_psdev->pdev = NULL;
+       spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+       pcistub_device_put(found_psdev);
+       up_write(&pcistub_sem);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+                                      struct pcistub_device_id *pdev_id)
+{
+       /* Match the specified device by domain, bus, slot, func and also if
+        * any of the device's parent bridges match.
+        */
+       for (; dev != NULL; dev = dev->bus->self) {
+               if (pci_domain_nr(dev->bus) == pdev_id->domain
+                   && dev->bus->number == pdev_id->bus
+                   && dev->devfn == pdev_id->devfn)
+                       return 1;
+
+               /* Sometimes topmost bridge links to itself. */
+               if (dev == dev->bus->self)
+                       break;
+       }
+
+       return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+       struct pcistub_device_id *pdev_id;
+       unsigned long flags;
+       int found = 0;
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+       list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+               if (pcistub_match_one(dev, pdev_id)) {
+                       found = 1;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       int err = 0;
+
+       dev_dbg(&dev->dev, "initializing...\n");
+
+       /* The PCI backend is not intended to be a module (or to work with
+        * removable PCI devices (yet). If it were, xen_pcibk_config_free()
+        * would need to be called somewhere to free the memory allocated
+        * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+        */
+       dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]")
+                               + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+       if (!dev_data) {
+               err = -ENOMEM;
+               goto out;
+       }
+       pci_set_drvdata(dev, dev_data);
+
+       /*
+        * Setup name for fake IRQ handler. It will only be enabled
+        * once the device is turned on by the guest.
+        */
+       sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+       dev_dbg(&dev->dev, "initializing config\n");
+
+       init_waitqueue_head(&xen_pcibk_aer_wait_queue);
+       err = xen_pcibk_config_init_dev(dev);
+       if (err)
+               goto out;
+
+       /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+        * must do this here because pcibios_enable_device may specify
+        * the pci device's true irq (and possibly its other resources)
+        * if they differ from what's in the configuration space.
+        * This makes the assumption that the device's resources won't
+        * change after this point (otherwise this code may break!)
+        */
+       dev_dbg(&dev->dev, "enabling device\n");
+       err = pci_enable_device(dev);
+       if (err)
+               goto config_release;
+
+       /* Now disable the device (this also ensures some private device
+        * data is setup before we export)
+        */
+       dev_dbg(&dev->dev, "reset device\n");
+       xen_pcibk_reset_device(dev);
+
+       return 0;
+
+config_release:
+       xen_pcibk_config_free_dev(dev);
+
+out:
+       pci_set_drvdata(dev, NULL);
+       kfree(dev_data);
+       return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+       struct pcistub_device *psdev;
+       unsigned long flags;
+       int err = 0;
+
+       pr_debug(DRV_NAME ": pcistub_init_devices_late\n");
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       while (!list_empty(&seized_devices)) {
+               psdev = container_of(seized_devices.next,
+                                    struct pcistub_device, dev_list);
+               list_del(&psdev->dev_list);
+
+               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+               err = pcistub_init_device(psdev->dev);
+               if (err) {
+                       dev_err(&psdev->dev->dev,
+                               "error %d initializing device\n", err);
+                       kfree(psdev);
+                       psdev = NULL;
+               }
+
+               spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+               if (psdev)
+                       list_add_tail(&psdev->dev_list, &pcistub_devices);
+       }
+
+       initialize_devices = 1;
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+       return 0;
+}
+
+static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+       unsigned long flags;
+       int err = 0;
+
+       psdev = pcistub_device_alloc(dev);
+       if (!psdev)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       if (initialize_devices) {
+               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+               /* don't want irqs disabled when calling pcistub_init_device */
+               err = pcistub_init_device(psdev->dev);
+
+               spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+               if (!err)
+                       list_add(&psdev->dev_list, &pcistub_devices);
+       } else {
+               dev_dbg(&dev->dev, "deferring initialization\n");
+               list_add(&psdev->dev_list, &seized_devices);
+       }
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+       if (err)
+               pcistub_device_put(psdev);
+
+       return err;
+}
+
+static int __devinit pcistub_probe(struct pci_dev *dev,
+                                  const struct pci_device_id *id)
+{
+       int err = 0;
+
+       dev_dbg(&dev->dev, "probing...\n");
+
+       if (pcistub_match(dev)) {
+
+               if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+                   && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+                       dev_err(&dev->dev, "can't export pci devices that "
+                               "don't have a normal (0) or bridge (1) "
+                               "header type!\n");
+                       err = -ENODEV;
+                       goto out;
+               }
+
+               dev_info(&dev->dev, "seizing device\n");
+               err = pcistub_seize(dev);
+       } else
+               /* Didn't find the device */
+               err = -ENODEV;
+
+out:
+       return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev, *found_psdev = NULL;
+       unsigned long flags;
+
+       dev_dbg(&dev->dev, "removing\n");
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       xen_pcibk_config_quirk_release(dev);
+
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (psdev->dev == dev) {
+                       found_psdev = psdev;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+       if (found_psdev) {
+               dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+                       found_psdev->pdev);
+
+               if (found_psdev->pdev) {
+                       printk(KERN_WARNING DRV_NAME ": ****** removing device "
+                              "%s while still in-use! ******\n",
+                              pci_name(found_psdev->dev));
+                       printk(KERN_WARNING DRV_NAME ": ****** driver domain may"
+                              " still access this device's i/o resources!\n");
+                       printk(KERN_WARNING DRV_NAME ": ****** shutdown driver "
+                              "domain before binding device\n");
+                       printk(KERN_WARNING DRV_NAME ": ****** to other drivers "
+                              "or domains\n");
+
+                       xen_pcibk_release_pci_dev(found_psdev->pdev,
+                                               found_psdev->dev);
+               }
+
+               spin_lock_irqsave(&pcistub_devices_lock, flags);
+               list_del(&found_psdev->dev_list);
+               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+               /* the final put for releasing from the list */
+               pcistub_device_put(found_psdev);
+       }
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = {
+       {
+        .vendor = PCI_ANY_ID,
+        .device = PCI_ANY_ID,
+        .subvendor = PCI_ANY_ID,
+        .subdevice = PCI_ANY_ID,
+        },
+       {0,},
+};
+
+#define PCI_NODENAME_MAX 40
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+       struct xenbus_transaction xbt;
+       int err;
+       char nodename[PCI_NODENAME_MAX];
+
+       if (!psdev)
+               dev_err(&psdev->dev->dev,
+                       "device is NULL when do AER recovery/kill_domain\n");
+       snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+               psdev->pdev->xdev->otherend_id);
+       nodename[strlen(nodename)] = '\0';
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               dev_err(&psdev->dev->dev,
+                       "error %d when start xenbus transaction\n", err);
+               return;
+       }
+       /*PV AER handlers will set this flag*/
+       xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+       err = xenbus_transaction_end(xbt, 0);
+       if (err) {
+               if (err == -EAGAIN)
+                       goto again;
+               dev_err(&psdev->dev->dev,
+                       "error %d when end xenbus transaction\n", err);
+               return;
+       }
+}
+
+/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and
+ * backend need to have cooperation. In xen_pcibk, those steps will do similar
+ * jobs: send service request and waiting for front_end response.
+*/
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+                                      pci_channel_state_t state, int aer_cmd,
+                                      pci_ers_result_t result)
+{
+       pci_ers_result_t res = result;
+       struct xen_pcie_aer_op *aer_op;
+       int ret;
+
+       /*with PV AER drivers*/
+       aer_op = &(psdev->pdev->sh_info->aer_op);
+       aer_op->cmd = aer_cmd ;
+       /*useful for error_detected callback*/
+       aer_op->err = state;
+       /*pcifront_end BDF*/
+       ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev,
+               &aer_op->domain, &aer_op->bus, &aer_op->devfn);
+       if (!ret) {
+               dev_err(&psdev->dev->dev,
+                       DRV_NAME ": failed to get pcifront device\n");
+               return PCI_ERS_RESULT_NONE;
+       }
+       wmb();
+
+       dev_dbg(&psdev->dev->dev,
+                       DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n",
+                       aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+       /*local flag to mark there's aer request, xen_pcibk callback will use
+       * this flag to judge whether we need to check pci-front give aer
+       * service ack signal
+       */
+       set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+       /*It is possible that a pcifront conf_read_write ops request invokes
+       * the callback which cause the spurious execution of wake_up.
+       * Yet it is harmless and better than a spinlock here
+       */
+       set_bit(_XEN_PCIB_active,
+               (unsigned long *)&psdev->pdev->sh_info->flags);
+       wmb();
+       notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+       ret = wait_event_timeout(xen_pcibk_aer_wait_queue,
+                                !(test_bit(_XEN_PCIB_active, (unsigned long *)
+                                &psdev->pdev->sh_info->flags)), 300*HZ);
+
+       if (!ret) {
+               if (test_bit(_XEN_PCIB_active,
+                       (unsigned long *)&psdev->pdev->sh_info->flags)) {
+                       dev_err(&psdev->dev->dev,
+                               "pcifront aer process not responding!\n");
+                       clear_bit(_XEN_PCIB_active,
+                         (unsigned long *)&psdev->pdev->sh_info->flags);
+                       aer_op->err = PCI_ERS_RESULT_NONE;
+                       return res;
+               }
+       }
+       clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+       if (test_bit(_XEN_PCIF_active,
+               (unsigned long *)&psdev->pdev->sh_info->flags)) {
+               dev_dbg(&psdev->dev->dev,
+                       "schedule pci_conf service in xen_pcibk\n");
+               xen_pcibk_test_and_schedule_op(psdev->pdev);
+       }
+
+       res = (pci_ers_result_t)aer_op->err;
+       return res;
+}
+
+/*
+* xen_pcibk_slot_reset: it will send the slot_reset request to  pcifront in case
+* of the device driver could provide this service, and then wait for pcifront
+* ack.
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+       pci_ers_result_t result;
+
+       result = PCI_ERS_RESULT_RECOVERED;
+       dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n",
+               dev->bus->number, dev->devfn);
+
+       down_write(&pcistub_sem);
+       psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+                               dev->bus->number,
+                               PCI_SLOT(dev->devfn),
+                               PCI_FUNC(dev->devfn));
+
+       if (!psdev || !psdev->pdev) {
+               dev_err(&dev->dev,
+                       DRV_NAME " device is not found/assigned\n");
+               goto end;
+       }
+
+       if (!psdev->pdev->sh_info) {
+               dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+                       " by HVM, kill it\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+
+       if (!test_bit(_XEN_PCIB_AERHANDLER,
+               (unsigned long *)&psdev->pdev->sh_info->flags)) {
+               dev_err(&dev->dev,
+                       "guest with no AER driver should have been killed\n");
+               goto release;
+       }
+       result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+       if (result == PCI_ERS_RESULT_NONE ||
+               result == PCI_ERS_RESULT_DISCONNECT) {
+               dev_dbg(&dev->dev,
+                       "No AER slot_reset service or disconnected!\n");
+               kill_domain_by_device(psdev);
+       }
+release:
+       pcistub_device_put(psdev);
+end:
+       up_write(&pcistub_sem);
+       return result;
+
+}
+
+
+/*xen_pcibk_mmio_enabled: it will send the mmio_enabled request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+       pci_ers_result_t result;
+
+       result = PCI_ERS_RESULT_RECOVERED;
+       dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n",
+               dev->bus->number, dev->devfn);
+
+       down_write(&pcistub_sem);
+       psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+                               dev->bus->number,
+                               PCI_SLOT(dev->devfn),
+                               PCI_FUNC(dev->devfn));
+
+       if (!psdev || !psdev->pdev) {
+               dev_err(&dev->dev,
+                       DRV_NAME " device is not found/assigned\n");
+               goto end;
+       }
+
+       if (!psdev->pdev->sh_info) {
+               dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+                       " by HVM, kill it\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+
+       if (!test_bit(_XEN_PCIB_AERHANDLER,
+               (unsigned long *)&psdev->pdev->sh_info->flags)) {
+               dev_err(&dev->dev,
+                       "guest with no AER driver should have been killed\n");
+               goto release;
+       }
+       result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+       if (result == PCI_ERS_RESULT_NONE ||
+               result == PCI_ERS_RESULT_DISCONNECT) {
+               dev_dbg(&dev->dev,
+                       "No AER mmio_enabled service or disconnected!\n");
+               kill_domain_by_device(psdev);
+       }
+release:
+       pcistub_device_put(psdev);
+end:
+       up_write(&pcistub_sem);
+       return result;
+}
+
+/*xen_pcibk_error_detected: it will send the error_detected request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+* @error: the current PCI connection state
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev,
+       pci_channel_state_t error)
+{
+       struct pcistub_device *psdev;
+       pci_ers_result_t result;
+
+       result = PCI_ERS_RESULT_CAN_RECOVER;
+       dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n",
+               dev->bus->number, dev->devfn);
+
+       down_write(&pcistub_sem);
+       psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+                               dev->bus->number,
+                               PCI_SLOT(dev->devfn),
+                               PCI_FUNC(dev->devfn));
+
+       if (!psdev || !psdev->pdev) {
+               dev_err(&dev->dev,
+                       DRV_NAME " device is not found/assigned\n");
+               goto end;
+       }
+
+       if (!psdev->pdev->sh_info) {
+               dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+                       " by HVM, kill it\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+
+       /*Guest owns the device yet no aer handler regiested, kill guest*/
+       if (!test_bit(_XEN_PCIB_AERHANDLER,
+               (unsigned long *)&psdev->pdev->sh_info->flags)) {
+               dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+       result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+       if (result == PCI_ERS_RESULT_NONE ||
+               result == PCI_ERS_RESULT_DISCONNECT) {
+               dev_dbg(&dev->dev,
+                       "No AER error_detected service or disconnected!\n");
+               kill_domain_by_device(psdev);
+       }
+release:
+       pcistub_device_put(psdev);
+end:
+       up_write(&pcistub_sem);
+       return result;
+}
+
+/*xen_pcibk_error_resume: it will send the error_resume request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+*/
+
+static void xen_pcibk_error_resume(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+
+       dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n",
+               dev->bus->number, dev->devfn);
+
+       down_write(&pcistub_sem);
+       psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+                               dev->bus->number,
+                               PCI_SLOT(dev->devfn),
+                               PCI_FUNC(dev->devfn));
+
+       if (!psdev || !psdev->pdev) {
+               dev_err(&dev->dev,
+                       DRV_NAME " device is not found/assigned\n");
+               goto end;
+       }
+
+       if (!psdev->pdev->sh_info) {
+               dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+                       " by HVM, kill it\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+
+       if (!test_bit(_XEN_PCIB_AERHANDLER,
+               (unsigned long *)&psdev->pdev->sh_info->flags)) {
+               dev_err(&dev->dev,
+                       "guest with no AER driver should have been killed\n");
+               kill_domain_by_device(psdev);
+               goto release;
+       }
+       common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+                      PCI_ERS_RESULT_RECOVERED);
+release:
+       pcistub_device_put(psdev);
+end:
+       up_write(&pcistub_sem);
+       return;
+}
+
+/*add xen_pcibk AER handling*/
+static struct pci_error_handlers xen_pcibk_error_handler = {
+       .error_detected = xen_pcibk_error_detected,
+       .mmio_enabled = xen_pcibk_mmio_enabled,
+       .slot_reset = xen_pcibk_slot_reset,
+       .resume = xen_pcibk_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver xen_pcibk_pci_driver = {
+       /* The name should be xen_pciback, but until the tools are updated
+        * we will keep it as pciback. */
+       .name = "pciback",
+       .id_table = pcistub_ids,
+       .probe = pcistub_probe,
+       .remove = pcistub_remove,
+       .err_handler = &xen_pcibk_error_handler,
+};
+
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+                             int *slot, int *func)
+{
+       int err;
+
+       err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
+       if (err == 4)
+               return 0;
+       else if (err < 0)
+               return -EINVAL;
+
+       /* try again without domain */
+       *domain = 0;
+       err = sscanf(buf, " %x:%x.%x", bus, slot, func);
+       if (err == 3)
+               return 0;
+
+       return -EINVAL;
+}
+
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+                              *slot, int *func, int *reg, int *size, int *mask)
+{
+       int err;
+
+       err =
+           sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
+                  func, reg, size, mask);
+       if (err == 7)
+               return 0;
+       return -EINVAL;
+}
+
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+       struct pcistub_device_id *pci_dev_id;
+       unsigned long flags;
+
+       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+       if (!pci_dev_id)
+               return -ENOMEM;
+
+       pci_dev_id->domain = domain;
+       pci_dev_id->bus = bus;
+       pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
+       pr_debug(DRV_NAME ": wants to seize %04x:%02x:%02x.%01x\n",
+                domain, bus, slot, func);
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+       struct pcistub_device_id *pci_dev_id, *t;
+       int devfn = PCI_DEVFN(slot, func);
+       int err = -ENOENT;
+       unsigned long flags;
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+       list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+                                slot_list) {
+               if (pci_dev_id->domain == domain
+                   && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
+                       /* Don't break; here because it's possible the same
+                        * slot could be in the list more than once
+                        */
+                       list_del(&pci_dev_id->slot_list);
+                       kfree(pci_dev_id);
+
+                       err = 0;
+
+                       pr_debug(DRV_NAME ": removed %04x:%02x:%02x.%01x from "
+                                "seize list\n", domain, bus, slot, func);
+               }
+       }
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
+                          int size, int mask)
+{
+       int err = 0;
+       struct pcistub_device *psdev;
+       struct pci_dev *dev;
+       struct config_field *field;
+
+       psdev = pcistub_device_find(domain, bus, slot, func);
+       if (!psdev || !psdev->dev) {
+               err = -ENODEV;
+               goto out;
+       }
+       dev = psdev->dev;
+
+       field = kzalloc(sizeof(*field), GFP_ATOMIC);
+       if (!field) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       field->offset = reg;
+       field->size = size;
+       field->mask = mask;
+       field->init = NULL;
+       field->reset = NULL;
+       field->release = NULL;
+       field->clean = xen_pcibk_config_field_free;
+
+       err = xen_pcibk_config_quirks_add_field(dev, field);
+       if (err)
+               kfree(field);
+out:
+       return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+                               size_t count)
+{
+       int domain, bus, slot, func;
+       int err;
+
+       err = str_to_slot(buf, &domain, &bus, &slot, &func);
+       if (err)
+               goto out;
+
+       err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+       if (!err)
+               err = count;
+       return err;
+}
+
+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+                                  size_t count)
+{
+       int domain, bus, slot, func;
+       int err;
+
+       err = str_to_slot(buf, &domain, &bus, &slot, &func);
+       if (err)
+               goto out;
+
+       err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+       if (!err)
+               err = count;
+       return err;
+}
+
+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+       struct pcistub_device_id *pci_dev_id;
+       size_t count = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+               if (count >= PAGE_SIZE)
+                       break;
+
+               count += scnprintf(buf + count, PAGE_SIZE - count,
+                                  "%04x:%02x:%02x.%01x\n",
+                                  pci_dev_id->domain, pci_dev_id->bus,
+                                  PCI_SLOT(pci_dev_id->devfn),
+                                  PCI_FUNC(pci_dev_id->devfn));
+       }
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       return count;
+}
+
+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
+{
+       struct pcistub_device *psdev;
+       struct xen_pcibk_dev_data *dev_data;
+       size_t count = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (count >= PAGE_SIZE)
+                       break;
+               if (!psdev->dev)
+                       continue;
+               dev_data = pci_get_drvdata(psdev->dev);
+               if (!dev_data)
+                       continue;
+               count +=
+                   scnprintf(buf + count, PAGE_SIZE - count,
+                             "%s:%s:%sing:%ld\n",
+                             pci_name(psdev->dev),
+                             dev_data->isr_on ? "on" : "off",
+                             dev_data->ack_intr ? "ack" : "not ack",
+                             dev_data->handled);
+       }
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       return count;
+}
+
+DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+
+static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
+                                         const char *buf,
+                                         size_t count)
+{
+       struct pcistub_device *psdev;
+       struct xen_pcibk_dev_data *dev_data;
+       int domain, bus, slot, func;
+       int err = -ENOENT;
+
+       err = str_to_slot(buf, &domain, &bus, &slot, &func);
+       if (err)
+               goto out;
+
+       psdev = pcistub_device_find(domain, bus, slot, func);
+
+       if (!psdev)
+               goto out;
+
+       dev_data = pci_get_drvdata(psdev->dev);
+       if (!dev_data)
+               goto out;
+
+       dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+               dev_data->irq_name, dev_data->isr_on,
+               !dev_data->isr_on);
+
+       dev_data->isr_on = !(dev_data->isr_on);
+       if (dev_data->isr_on)
+               dev_data->ack_intr = 1;
+out:
+       if (!err)
+               err = count;
+       return err;
+}
+DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+                                size_t count)
+{
+       int domain, bus, slot, func, reg, size, mask;
+       int err;
+
+       err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+                          &mask);
+       if (err)
+               goto out;
+
+       err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+       if (!err)
+               err = count;
+       return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+       int count = 0;
+       unsigned long flags;
+       struct xen_pcibk_config_quirk *quirk;
+       struct xen_pcibk_dev_data *dev_data;
+       const struct config_field *field;
+       const struct config_field_entry *cfg_entry;
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+       list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) {
+               if (count >= PAGE_SIZE)
+                       goto out;
+
+               count += scnprintf(buf + count, PAGE_SIZE - count,
+                                  "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+                                  quirk->pdev->bus->number,
+                                  PCI_SLOT(quirk->pdev->devfn),
+                                  PCI_FUNC(quirk->pdev->devfn),
+                                  quirk->devid.vendor, quirk->devid.device,
+                                  quirk->devid.subvendor,
+                                  quirk->devid.subdevice);
+
+               dev_data = pci_get_drvdata(quirk->pdev);
+
+               list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+                       field = cfg_entry->field;
+                       if (count >= PAGE_SIZE)
+                               goto out;
+
+                       count += scnprintf(buf + count, PAGE_SIZE - count,
+                                          "\t\t%08x:%01x:%08x\n",
+                                          cfg_entry->base_offset +
+                                          field->offset, field->size,
+                                          field->mask);
+               }
+       }
+
+out:
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       return count;
+}
+
+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+                             size_t count)
+{
+       int domain, bus, slot, func;
+       int err;
+       struct pcistub_device *psdev;
+       struct xen_pcibk_dev_data *dev_data;
+       err = str_to_slot(buf, &domain, &bus, &slot, &func);
+       if (err)
+               goto out;
+       psdev = pcistub_device_find(domain, bus, slot, func);
+       if (!psdev) {
+               err = -ENODEV;
+               goto out;
+       }
+       if (!psdev->dev) {
+               err = -ENODEV;
+               goto release;
+       }
+       dev_data = pci_get_drvdata(psdev->dev);
+       /* the driver data for a device should never be null at this point */
+       if (!dev_data) {
+               err = -ENXIO;
+               goto release;
+       }
+       if (!dev_data->permissive) {
+               dev_data->permissive = 1;
+               /* Let user know that what they're doing could be unsafe */
+               dev_warn(&psdev->dev->dev, "enabling permissive mode "
+                        "configuration space accesses!\n");
+               dev_warn(&psdev->dev->dev,
+                        "permissive mode is potentially unsafe!\n");
+       }
+release:
+       pcistub_device_put(psdev);
+out:
+       if (!err)
+               err = count;
+       return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+       struct pcistub_device *psdev;
+       struct xen_pcibk_dev_data *dev_data;
+       size_t count = 0;
+       unsigned long flags;
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+               if (count >= PAGE_SIZE)
+                       break;
+               if (!psdev->dev)
+                       continue;
+               dev_data = pci_get_drvdata(psdev->dev);
+               if (!dev_data || !dev_data->permissive)
+                       continue;
+               count +=
+                   scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+                             pci_name(psdev->dev));
+       }
+       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       return count;
+}
+
+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
+static void pcistub_exit(void)
+{
+       driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot);
+       driver_remove_file(&xen_pcibk_pci_driver.driver,
+                          &driver_attr_remove_slot);
+       driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots);
+       driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks);
+       driver_remove_file(&xen_pcibk_pci_driver.driver,
+                          &driver_attr_permissive);
+       driver_remove_file(&xen_pcibk_pci_driver.driver,
+                          &driver_attr_irq_handlers);
+       driver_remove_file(&xen_pcibk_pci_driver.driver,
+                          &driver_attr_irq_handler_state);
+       pci_unregister_driver(&xen_pcibk_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+       int pos = 0;
+       int err = 0;
+       int domain, bus, slot, func;
+       int parsed;
+
+       if (pci_devs_to_hide && *pci_devs_to_hide) {
+               do {
+                       parsed = 0;
+
+                       err = sscanf(pci_devs_to_hide + pos,
+                                    " (%x:%x:%x.%x) %n",
+                                    &domain, &bus, &slot, &func, &parsed);
+                       if (err != 4) {
+                               domain = 0;
+                               err = sscanf(pci_devs_to_hide + pos,
+                                            " (%x:%x.%x) %n",
+                                            &bus, &slot, &func, &parsed);
+                               if (err != 3)
+                                       goto parse_error;
+                       }
+
+                       err = pcistub_device_id_add(domain, bus, slot, func);
+                       if (err)
+                               goto out;
+
+                       /* if parsed<=0, we've reached the end of the string */
+                       pos += parsed;
+               } while (parsed > 0 && pci_devs_to_hide[pos]);
+       }
+
+       /* If we're the first PCI Device Driver to register, we're the
+        * first one to get offered PCI devices as they become
+        * available (and thus we can be the first to grab them)
+        */
+       err = pci_register_driver(&xen_pcibk_pci_driver);
+       if (err < 0)
+               goto out;
+
+       err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                &driver_attr_new_slot);
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                        &driver_attr_remove_slot);
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                        &driver_attr_slots);
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                        &driver_attr_quirks);
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                        &driver_attr_permissive);
+
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                        &driver_attr_irq_handlers);
+       if (!err)
+               err = driver_create_file(&xen_pcibk_pci_driver.driver,
+                                       &driver_attr_irq_handler_state);
+       if (err)
+               pcistub_exit();
+
+out:
+       return err;
+
+parse_error:
+       printk(KERN_ERR DRV_NAME ": Error parsing pci_devs_to_hide at \"%s\"\n",
+              pci_devs_to_hide + pos);
+       return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so xen_pcibk *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init xen_pcibk_init(void)
+{
+       int err;
+
+       if (!xen_initial_domain())
+               return -ENODEV;
+
+       err = xen_pcibk_config_init();
+       if (err)
+               return err;
+
+#ifdef MODULE
+       err = pcistub_init();
+       if (err < 0)
+               return err;
+#endif
+
+       pcistub_init_devices_late();
+       err = xen_pcibk_xenbus_register();
+       if (err)
+               pcistub_exit();
+
+       return err;
+}
+
+static void __exit xen_pcibk_cleanup(void)
+{
+       xen_pcibk_xenbus_unregister();
+       pcistub_exit();
+}
+
+module_init(xen_pcibk_init);
+module_exit(xen_pcibk_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
new file mode 100644 (file)
index 0000000..a0e131a
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+struct pci_dev_entry {
+       struct list_head list;
+       struct pci_dev *dev;
+};
+
+#define _PDEVF_op_active       (0)
+#define PDEVF_op_active                (1<<(_PDEVF_op_active))
+#define _PCIB_op_pending       (1)
+#define PCIB_op_pending                (1<<(_PCIB_op_pending))
+
+struct xen_pcibk_device {
+       void *pci_dev_data;
+       spinlock_t dev_lock;
+       struct xenbus_device *xdev;
+       struct xenbus_watch be_watch;
+       u8 be_watching;
+       int evtchn_irq;
+       struct xen_pci_sharedinfo *sh_info;
+       unsigned long flags;
+       struct work_struct op_work;
+};
+
+struct xen_pcibk_dev_data {
+       struct list_head config_fields;
+       unsigned int permissive:1;
+       unsigned int warned_on_write:1;
+       unsigned int enable_intx:1;
+       unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
+       unsigned int ack_intr:1; /* .. and ACK-ing */
+       unsigned long handled;
+       unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+       char irq_name[0]; /* xen-pcibk[000:04:00.0] */
+};
+
+/* Used by XenBus and xen_pcibk_ops.c */
+extern wait_queue_head_t xen_pcibk_aer_wait_queue;
+extern struct workqueue_struct *xen_pcibk_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head xen_pcibk_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+                                           int domain, int bus,
+                                           int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+                                   struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void xen_pcibk_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int xen_pcibk_config_init(void);
+int xen_pcibk_config_init_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev);
+void xen_pcibk_config_reset_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dev(struct pci_dev *dev);
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+                         u32 *ret_val);
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size,
+                          u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev,
+                                  unsigned int domain, unsigned int bus,
+                                  unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
+                                   unsigned int domain, unsigned int bus);
+
+/* Backend registration for the two types of BDF representation:
+ *  vpci - BDFs start at 00
+ *  passthrough - BDFs are exactly like in the host.
+ */
+struct xen_pcibk_backend {
+       char *name;
+       int (*init)(struct xen_pcibk_device *pdev);
+       void (*free)(struct xen_pcibk_device *pdev);
+       int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev,
+                   unsigned int *domain, unsigned int *bus,
+                   unsigned int *devfn);
+       int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb);
+       void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev);
+       int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev,
+                  int devid, publish_pci_dev_cb publish_cb);
+       struct pci_dev *(*get)(struct xen_pcibk_device *pdev,
+                              unsigned int domain, unsigned int bus,
+                              unsigned int devfn);
+};
+
+extern struct xen_pcibk_backend xen_pcibk_vpci_backend;
+extern struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern struct xen_pcibk_backend *xen_pcibk_backend;
+
+static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+                                       struct pci_dev *dev,
+                                       int devid,
+                                       publish_pci_dev_cb publish_cb)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->add)
+               return xen_pcibk_backend->add(pdev, dev, devid, publish_cb);
+       return -1;
+};
+static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+                                            struct pci_dev *dev)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->free)
+               return xen_pcibk_backend->release(pdev, dev);
+};
+
+static inline struct pci_dev *
+xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
+                     unsigned int bus, unsigned int devfn)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->get)
+               return xen_pcibk_backend->get(pdev, domain, bus, devfn);
+       return NULL;
+};
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk
+* before sending aer request to pcifront, so that guest could identify
+* device, coopearte with xen_pcibk to finish aer recovery job if device driver
+* has the capability
+*/
+static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+                                            struct xen_pcibk_device *pdev,
+                                            unsigned int *domain,
+                                            unsigned int *bus,
+                                            unsigned int *devfn)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->find)
+               return xen_pcibk_backend->find(pcidev, pdev, domain, bus,
+                                              devfn);
+       return -1;
+};
+static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->init)
+               return xen_pcibk_backend->init(pdev);
+       return -1;
+};
+static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+                                             publish_pci_root_cb cb)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->publish)
+               return xen_pcibk_backend->publish(pdev, cb);
+       return -1;
+};
+static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+       if (xen_pcibk_backend && xen_pcibk_backend->free)
+               return xen_pcibk_backend->free(pdev);
+};
+/* Handles events from front-end */
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
+void xen_pcibk_do_op(struct work_struct *data);
+
+int xen_pcibk_xenbus_register(void);
+void xen_pcibk_xenbus_unregister(void);
+
+extern int verbose_request;
+
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev);
+#endif
+
+/* Handles shared IRQs that can to device domain and control domain. */
+void xen_pcibk_irq_handler(struct pci_dev *dev, int reset);
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
new file mode 100644 (file)
index 0000000..8c95c34
--- /dev/null
@@ -0,0 +1,384 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+#define DRV_NAME       "xen-pciback"
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id);
+
+/* Ensure a device is has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after xen_pcibk_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+static void xen_pcibk_control_isr(struct pci_dev *dev, int reset)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       int rc;
+       int enable = 0;
+
+       dev_data = pci_get_drvdata(dev);
+       if (!dev_data)
+               return;
+
+       /* We don't deal with bridges */
+       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+               return;
+
+       if (reset) {
+               dev_data->enable_intx = 0;
+               dev_data->ack_intr = 0;
+       }
+       enable =  dev_data->enable_intx;
+
+       /* Asked to disable, but ISR isn't runnig */
+       if (!enable && !dev_data->isr_on)
+               return;
+
+       /* Squirrel away the IRQs in the dev_data. We need this
+        * b/c when device transitions to MSI, the dev->irq is
+        * overwritten with the MSI vector.
+        */
+       if (enable)
+               dev_data->irq = dev->irq;
+
+       /*
+        * SR-IOV devices in all use MSI-X and have no legacy
+        * interrupts, so inhibit creating a fake IRQ handler for them.
+        */
+       if (dev_data->irq == 0)
+               goto out;
+
+       dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+               dev_data->irq_name,
+               dev_data->irq,
+               pci_is_enabled(dev) ? "on" : "off",
+               dev->msi_enabled ? "MSI" : "",
+               dev->msix_enabled ? "MSI/X" : "",
+               dev_data->isr_on ? "enable" : "disable",
+               enable ? "enable" : "disable");
+
+       if (enable) {
+               rc = request_irq(dev_data->irq,
+                               xen_pcibk_guest_interrupt, IRQF_SHARED,
+                               dev_data->irq_name, dev);
+               if (rc) {
+                       dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+                               "handler for IRQ %d! (rc:%d)\n",
+                               dev_data->irq_name, dev_data->irq, rc);
+                       goto out;
+               }
+       } else {
+               free_irq(dev_data->irq, dev);
+               dev_data->irq = 0;
+       }
+       dev_data->isr_on = enable;
+       dev_data->ack_intr = enable;
+out:
+       dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+               dev_data->irq_name,
+               dev_data->irq,
+               pci_is_enabled(dev) ? "on" : "off",
+               dev->msi_enabled ? "MSI" : "",
+               dev->msix_enabled ? "MSI/X" : "",
+               enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+                       (dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see xen_pcibk_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void xen_pcibk_reset_device(struct pci_dev *dev)
+{
+       u16 cmd;
+
+       xen_pcibk_control_isr(dev, 1 /* reset device */);
+
+       /* Disable devices (but not bridges) */
+       if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+               /* The guest could have been abruptly killed without
+                * disabling MSI/MSI-X interrupts.*/
+               if (dev->msix_enabled)
+                       pci_disable_msix(dev);
+               if (dev->msi_enabled)
+                       pci_disable_msi(dev);
+#endif
+               pci_disable_device(dev);
+
+               pci_write_config_word(dev, PCI_COMMAND, 0);
+
+               dev->is_busmaster = 0;
+       } else {
+               pci_read_config_word(dev, PCI_COMMAND, &cmd);
+               if (cmd & (PCI_COMMAND_INVALIDATE)) {
+                       cmd &= ~(PCI_COMMAND_INVALIDATE);
+                       pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+                       dev->is_busmaster = 0;
+               }
+       }
+}
+
+#ifdef CONFIG_PCI_MSI
+static
+int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
+                        struct pci_dev *dev, struct xen_pci_op *op)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       int otherend = pdev->xdev->otherend_id;
+       int status;
+
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev));
+
+       status = pci_enable_msi(dev);
+
+       if (status) {
+               printk(KERN_ERR "error enable msi for guest %x status %x\n",
+                       otherend, status);
+               op->value = 0;
+               return XEN_PCI_ERR_op_failed;
+       }
+
+       /* The value the guest needs is actually the IDT vector, not the
+        * the local domain's IRQ number. */
+
+       op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+                       op->value);
+
+       dev_data = pci_get_drvdata(dev);
+       if (dev_data)
+               dev_data->ack_intr = 0;
+
+       return 0;
+}
+
+static
+int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
+                         struct pci_dev *dev, struct xen_pci_op *op)
+{
+       struct xen_pcibk_dev_data *dev_data;
+
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
+                      pci_name(dev));
+       pci_disable_msi(dev);
+
+       op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+                       op->value);
+       dev_data = pci_get_drvdata(dev);
+       if (dev_data)
+               dev_data->ack_intr = 1;
+       return 0;
+}
+
+static
+int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
+                         struct pci_dev *dev, struct xen_pci_op *op)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       int i, result;
+       struct msix_entry *entries;
+
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
+                      pci_name(dev));
+       if (op->value > SH_INFO_MAX_VEC)
+               return -EINVAL;
+
+       entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+       if (entries == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < op->value; i++) {
+               entries[i].entry = op->msix_entries[i].entry;
+               entries[i].vector = op->msix_entries[i].vector;
+       }
+
+       result = pci_enable_msix(dev, entries, op->value);
+
+       if (result == 0) {
+               for (i = 0; i < op->value; i++) {
+                       op->msix_entries[i].entry = entries[i].entry;
+                       if (entries[i].vector)
+                               op->msix_entries[i].vector =
+                                       xen_pirq_from_irq(entries[i].vector);
+                               if (unlikely(verbose_request))
+                                       printk(KERN_DEBUG DRV_NAME ": %s: " \
+                                               "MSI-X[%d]: %d\n",
+                                               pci_name(dev), i,
+                                               op->msix_entries[i].vector);
+               }
+       } else {
+               printk(KERN_WARNING DRV_NAME ": %s: failed to enable MSI-X: err %d!\n",
+                       pci_name(dev), result);
+       }
+       kfree(entries);
+
+       op->value = result;
+       dev_data = pci_get_drvdata(dev);
+       if (dev_data)
+               dev_data->ack_intr = 0;
+
+       return result;
+}
+
+static
+int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
+                          struct pci_dev *dev, struct xen_pci_op *op)
+{
+       struct xen_pcibk_dev_data *dev_data;
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
+                       pci_name(dev));
+       pci_disable_msix(dev);
+
+       /*
+        * SR-IOV devices (which don't have any legacy IRQ) have
+        * an undefined IRQ value of zero.
+        */
+       op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+       if (unlikely(verbose_request))
+               printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev),
+                       op->value);
+       dev_data = pci_get_drvdata(dev);
+       if (dev_data)
+               dev_data->ack_intr = 1;
+       return 0;
+}
+#endif
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* xen_pcibk conf_read_write service for avoiding confict with aer_core
+* do_recovery job which also use the system default work_queue
+*/
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
+{
+       /* Check that frontend is requesting an operation and that we are not
+        * already processing a request */
+       if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+           && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+               queue_work(xen_pcibk_wq, &pdev->op_work);
+       }
+       /*_XEN_PCIB_active should have been cleared by pcifront. And also make
+       sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/
+       if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+           && test_bit(_PCIB_op_pending, &pdev->flags)) {
+               wake_up(&xen_pcibk_aer_wait_queue);
+       }
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct xen_pcibk_device as a parameter */
+
+void xen_pcibk_do_op(struct work_struct *data)
+{
+       struct xen_pcibk_device *pdev =
+               container_of(data, struct xen_pcibk_device, op_work);
+       struct pci_dev *dev;
+       struct xen_pcibk_dev_data *dev_data = NULL;
+       struct xen_pci_op *op = &pdev->sh_info->op;
+       int test_intx = 0;
+
+       dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+       if (dev == NULL)
+               op->err = XEN_PCI_ERR_dev_not_found;
+       else {
+               dev_data = pci_get_drvdata(dev);
+               if (dev_data)
+                       test_intx = dev_data->enable_intx;
+               switch (op->cmd) {
+               case XEN_PCI_OP_conf_read:
+                       op->err = xen_pcibk_config_read(dev,
+                                 op->offset, op->size, &op->value);
+                       break;
+               case XEN_PCI_OP_conf_write:
+                       op->err = xen_pcibk_config_write(dev,
+                                 op->offset, op->size, op->value);
+                       break;
+#ifdef CONFIG_PCI_MSI
+               case XEN_PCI_OP_enable_msi:
+                       op->err = xen_pcibk_enable_msi(pdev, dev, op);
+                       break;
+               case XEN_PCI_OP_disable_msi:
+                       op->err = xen_pcibk_disable_msi(pdev, dev, op);
+                       break;
+               case XEN_PCI_OP_enable_msix:
+                       op->err = xen_pcibk_enable_msix(pdev, dev, op);
+                       break;
+               case XEN_PCI_OP_disable_msix:
+                       op->err = xen_pcibk_disable_msix(pdev, dev, op);
+                       break;
+#endif
+               default:
+                       op->err = XEN_PCI_ERR_not_implemented;
+                       break;
+               }
+       }
+       if (!op->err && dev && dev_data) {
+               /* Transition detected */
+               if ((dev_data->enable_intx != test_intx))
+                       xen_pcibk_control_isr(dev, 0 /* no reset */);
+       }
+       /* Tell the driver domain that we're done. */
+       wmb();
+       clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+       notify_remote_via_irq(pdev->evtchn_irq);
+
+       /* Mark that we're done. */
+       smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
+       clear_bit(_PDEVF_op_active, &pdev->flags);
+       smp_mb__after_clear_bit(); /* /before/ final check for work */
+
+       /* Check to see if the driver domain tried to start another request in
+        * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+       */
+       xen_pcibk_test_and_schedule_op(pdev);
+}
+
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id)
+{
+       struct xen_pcibk_device *pdev = dev_id;
+
+       xen_pcibk_test_and_schedule_op(pdev);
+
+       return IRQ_HANDLED;
+}
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id)
+{
+       struct pci_dev *dev = (struct pci_dev *)dev_id;
+       struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+
+       if (dev_data->isr_on && dev_data->ack_intr) {
+               dev_data->handled++;
+               if ((dev_data->handled % 1000) == 0) {
+                       if (xen_test_irq_shared(irq)) {
+                               printk(KERN_INFO "%s IRQ line is not shared "
+                                       "with other domains. Turning ISR off\n",
+                                        dev_data->irq_name);
+                               dev_data->ack_intr = 0;
+                       }
+               }
+               return IRQ_HANDLED;
+       }
+       return IRQ_NONE;
+}
diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c
new file mode 100644 (file)
index 0000000..4a42cfb
--- /dev/null
@@ -0,0 +1,259 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+#define DRV_NAME       "xen-pciback"
+
+struct vpci_dev_data {
+       /* Access to dev_list must be protected by lock */
+       struct list_head dev_list[PCI_SLOT_MAX];
+       spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+       return head->next;
+}
+
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+                                              unsigned int domain,
+                                              unsigned int bus,
+                                              unsigned int devfn)
+{
+       struct pci_dev_entry *entry;
+       struct pci_dev *dev = NULL;
+       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+       unsigned long flags;
+
+       if (domain != 0 || bus != 0)
+               return NULL;
+
+       if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+               spin_lock_irqsave(&vpci_dev->lock, flags);
+
+               list_for_each_entry(entry,
+                                   &vpci_dev->dev_list[PCI_SLOT(devfn)],
+                                   list) {
+                       if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+                               dev = entry->dev;
+                               break;
+                       }
+               }
+
+               spin_unlock_irqrestore(&vpci_dev->lock, flags);
+       }
+       return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+       if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+           && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+               return 1;
+
+       return 0;
+}
+
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+                                  struct pci_dev *dev, int devid,
+                                  publish_pci_dev_cb publish_cb)
+{
+       int err = 0, slot, func = -1;
+       struct pci_dev_entry *t, *dev_entry;
+       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+       unsigned long flags;
+
+       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+               err = -EFAULT;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Can't export bridges on the virtual PCI bus");
+               goto out;
+       }
+
+       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+       if (!dev_entry) {
+               err = -ENOMEM;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error adding entry to virtual PCI bus");
+               goto out;
+       }
+
+       dev_entry->dev = dev;
+
+       spin_lock_irqsave(&vpci_dev->lock, flags);
+
+       /* Keep multi-function devices together on the virtual PCI bus */
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+               if (!list_empty(&vpci_dev->dev_list[slot])) {
+                       t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+                                      struct pci_dev_entry, list);
+
+                       if (match_slot(dev, t->dev)) {
+                               pr_info(DRV_NAME ": vpci: %s: "
+                                       "assign to virtual slot %d func %d\n",
+                                       pci_name(dev), slot,
+                                       PCI_FUNC(dev->devfn));
+                               list_add_tail(&dev_entry->list,
+                                             &vpci_dev->dev_list[slot]);
+                               func = PCI_FUNC(dev->devfn);
+                               goto unlock;
+                       }
+               }
+       }
+
+       /* Assign to a new slot on the virtual PCI bus */
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+               if (list_empty(&vpci_dev->dev_list[slot])) {
+                       printk(KERN_INFO DRV_NAME
+                              ": vpci: %s: assign to virtual slot %d\n",
+                              pci_name(dev), slot);
+                       list_add_tail(&dev_entry->list,
+                                     &vpci_dev->dev_list[slot]);
+                       func = PCI_FUNC(dev->devfn);
+                       goto unlock;
+               }
+       }
+
+       err = -ENOMEM;
+       xenbus_dev_fatal(pdev->xdev, err,
+                        "No more space on root virtual PCI bus");
+
+unlock:
+       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+       /* Publish this device. */
+       if (!err)
+               err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
+out:
+       return err;
+}
+
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+                                       struct pci_dev *dev)
+{
+       int slot;
+       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+       struct pci_dev *found_dev = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&vpci_dev->lock, flags);
+
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+               struct pci_dev_entry *e, *tmp;
+               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+                                        list) {
+                       if (e->dev == dev) {
+                               list_del(&e->list);
+                               found_dev = e->dev;
+                               kfree(e);
+                               goto out;
+                       }
+               }
+       }
+
+out:
+       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+       if (found_dev)
+               pcistub_put_pci_dev(found_dev);
+}
+
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+       int slot;
+       struct vpci_dev_data *vpci_dev;
+
+       vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+       if (!vpci_dev)
+               return -ENOMEM;
+
+       spin_lock_init(&vpci_dev->lock);
+
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+               INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
+       pdev->pci_dev_data = vpci_dev;
+
+       return 0;
+}
+
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+                                        publish_pci_root_cb publish_cb)
+{
+       /* The Virtual PCI bus has only one root */
+       return publish_cb(pdev, 0, 0);
+}
+
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+       int slot;
+       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+               struct pci_dev_entry *e, *tmp;
+               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+                                        list) {
+                       list_del(&e->list);
+                       pcistub_put_pci_dev(e->dev);
+                       kfree(e);
+               }
+       }
+
+       kfree(vpci_dev);
+       pdev->pci_dev_data = NULL;
+}
+
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+                                       struct xen_pcibk_device *pdev,
+                                       unsigned int *domain, unsigned int *bus,
+                                       unsigned int *devfn)
+{
+       struct pci_dev_entry *entry;
+       struct pci_dev *dev = NULL;
+       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+       unsigned long flags;
+       int found = 0, slot;
+
+       spin_lock_irqsave(&vpci_dev->lock, flags);
+       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+               list_for_each_entry(entry,
+                           &vpci_dev->dev_list[slot],
+                           list) {
+                       dev = entry->dev;
+                       if (dev && dev->bus->number == pcidev->bus->number
+                               && pci_domain_nr(dev->bus) ==
+                                       pci_domain_nr(pcidev->bus)
+                               && dev->devfn == pcidev->devfn) {
+                               found = 1;
+                               *domain = 0;
+                               *bus = 0;
+                               *devfn = PCI_DEVFN(slot,
+                                        PCI_FUNC(pcidev->devfn));
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+       return found;
+}
+
+struct xen_pcibk_backend xen_pcibk_vpci_backend = {
+       .name           = "vpci",
+       .init           = __xen_pcibk_init_devices,
+       .free           = __xen_pcibk_release_devices,
+       .find           = __xen_pcibk_get_pcifront_dev,
+       .publish        = __xen_pcibk_publish_pci_roots,
+       .release        = __xen_pcibk_release_pci_dev,
+       .add            = __xen_pcibk_add_pci_dev,
+       .get            = __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
new file mode 100644 (file)
index 0000000..206c4ce
--- /dev/null
@@ -0,0 +1,749 @@
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <linux/workqueue.h>
+#include "pciback.h"
+
+#define        DRV_NAME        "xen-pciback"
+#define INVALID_EVTCHN_IRQ  (-1)
+struct workqueue_struct *xen_pcibk_wq;
+
+static int __read_mostly passthrough;
+module_param(passthrough, bool, S_IRUGO);
+MODULE_PARM_DESC(passthrough,
+       "Option to specify how to export PCI topology to guest:\n"\
+       " 0 - (default) Hide the true PCI topology and makes the frontend\n"\
+       "   there is a single PCI bus with only the exported devices on it.\n"\
+       "   For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\
+       "   while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\
+       " 1 - Passthrough provides a real view of the PCI topology to the\n"\
+       "   frontend (for example, a device at 06:01.b will still appear at\n"\
+       "   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
+       "   exposed PCI devices to its driver domains. This may be required\n"\
+       "   for drivers which depend on finding their hardward in certain\n"\
+       "   bus/slot locations.");
+
+static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
+{
+       struct xen_pcibk_device *pdev;
+
+       pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL);
+       if (pdev == NULL)
+               goto out;
+       dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+       pdev->xdev = xdev;
+       dev_set_drvdata(&xdev->dev, pdev);
+
+       spin_lock_init(&pdev->dev_lock);
+
+       pdev->sh_info = NULL;
+       pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+       pdev->be_watching = 0;
+
+       INIT_WORK(&pdev->op_work, xen_pcibk_do_op);
+
+       if (xen_pcibk_init_devices(pdev)) {
+               kfree(pdev);
+               pdev = NULL;
+       }
+out:
+       return pdev;
+}
+
+static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
+{
+       spin_lock(&pdev->dev_lock);
+
+       /* Ensure the guest can't trigger our handler before removing devices */
+       if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+               unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+               pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+       }
+       spin_unlock(&pdev->dev_lock);
+
+       /* If the driver domain started an op, make sure we complete it
+        * before releasing the shared memory */
+
+       /* Note, the workqueue does not use spinlocks at all.*/
+       flush_workqueue(xen_pcibk_wq);
+
+       spin_lock(&pdev->dev_lock);
+       if (pdev->sh_info != NULL) {
+               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+               pdev->sh_info = NULL;
+       }
+       spin_unlock(&pdev->dev_lock);
+
+}
+
+static void free_pdev(struct xen_pcibk_device *pdev)
+{
+       if (pdev->be_watching) {
+               unregister_xenbus_watch(&pdev->be_watch);
+               pdev->be_watching = 0;
+       }
+
+       xen_pcibk_disconnect(pdev);
+
+       xen_pcibk_release_devices(pdev);
+
+       dev_set_drvdata(&pdev->xdev->dev, NULL);
+       pdev->xdev = NULL;
+
+       kfree(pdev);
+}
+
+static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
+                            int remote_evtchn)
+{
+       int err = 0;
+       void *vaddr;
+
+       dev_dbg(&pdev->xdev->dev,
+               "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+               gnt_ref, remote_evtchn);
+
+       err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+       if (err < 0) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                               "Error mapping other domain page in ours.");
+               goto out;
+       }
+
+       spin_lock(&pdev->dev_lock);
+       pdev->sh_info = vaddr;
+       spin_unlock(&pdev->dev_lock);
+
+       err = bind_interdomain_evtchn_to_irqhandler(
+               pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
+               0, DRV_NAME, pdev);
+       if (err < 0) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error binding event channel to IRQ");
+               goto out;
+       }
+
+       spin_lock(&pdev->dev_lock);
+       pdev->evtchn_irq = err;
+       spin_unlock(&pdev->dev_lock);
+       err = 0;
+
+       dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+       return err;
+}
+
+static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
+{
+       int err = 0;
+       int gnt_ref, remote_evtchn;
+       char *magic = NULL;
+
+
+       /* Make sure we only do this setup once */
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateInitialised)
+               goto out;
+
+       /* Wait for frontend to state that it has published the configuration */
+       if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+           XenbusStateInitialised)
+               goto out;
+
+       dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+       err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+                           "pci-op-ref", "%u", &gnt_ref,
+                           "event-channel", "%u", &remote_evtchn,
+                           "magic", NULL, &magic, NULL);
+       if (err) {
+               /* If configuration didn't get read correctly, wait longer */
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading configuration from frontend");
+               goto out;
+       }
+
+       if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+               xenbus_dev_fatal(pdev->xdev, -EFAULT,
+                                "version mismatch (%s/%s) with pcifront - "
+                                "halting xen_pcibk",
+                                magic, XEN_PCI_MAGIC);
+               goto out;
+       }
+
+       err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn);
+       if (err)
+               goto out;
+
+       dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+       if (err)
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error switching to connected state!");
+
+       dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+
+       kfree(magic);
+
+       return err;
+}
+
+static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev,
+                                  unsigned int domain, unsigned int bus,
+                                  unsigned int devfn, unsigned int devid)
+{
+       int err;
+       int len;
+       char str[64];
+
+       len = snprintf(str, sizeof(str), "vdev-%d", devid);
+       if (unlikely(len >= (sizeof(str) - 1))) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+                           "%04x:%02x:%02x.%02x", domain, bus,
+                           PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+out:
+       return err;
+}
+
+static int xen_pcibk_export_device(struct xen_pcibk_device *pdev,
+                                int domain, int bus, int slot, int func,
+                                int devid)
+{
+       struct pci_dev *dev;
+       int err = 0;
+
+       dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+               domain, bus, slot, func);
+
+       dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+       if (!dev) {
+               err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Couldn't locate PCI device "
+                                "(%04x:%02x:%02x.%01x)! "
+                                "perhaps already in-use?",
+                                domain, bus, slot, func);
+               goto out;
+       }
+
+       err = xen_pcibk_add_pci_dev(pdev, dev, devid,
+                                   xen_pcibk_publish_pci_dev);
+       if (err)
+               goto out;
+
+       dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+       if (xen_register_device_domain_owner(dev,
+                                            pdev->xdev->otherend_id) != 0) {
+               dev_err(&dev->dev, "device has been assigned to another " \
+                       "domain! Over-writting the ownership, but beware.\n");
+               xen_unregister_device_domain_owner(dev);
+               xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+       }
+
+       /* TODO: It'd be nice to export a bridge and have all of its children
+        * get exported with it. This may be best done in xend (which will
+        * have to calculate resource usage anyway) but we probably want to
+        * put something in here to ensure that if a bridge gets given to a
+        * driver domain, that all devices under that bridge are not given
+        * to other driver domains (as he who controls the bridge can disable
+        * it and stop the other devices from working).
+        */
+out:
+       return err;
+}
+
+static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev,
+                                int domain, int bus, int slot, int func)
+{
+       int err = 0;
+       struct pci_dev *dev;
+
+       dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+               domain, bus, slot, func);
+
+       dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+       if (!dev) {
+               err = -EINVAL;
+               dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+                       "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
+                       domain, bus, slot, func);
+               goto out;
+       }
+
+       dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
+       xen_unregister_device_domain_owner(dev);
+
+       xen_pcibk_release_pci_dev(pdev, dev);
+
+out:
+       return err;
+}
+
+static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev,
+                                   unsigned int domain, unsigned int bus)
+{
+       unsigned int d, b;
+       int i, root_num, len, err;
+       char str[64];
+
+       dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                          "root_num", "%d", &root_num);
+       if (err == 0 || err == -ENOENT)
+               root_num = 0;
+       else if (err < 0)
+               goto out;
+
+       /* Verify that we haven't already published this pci root */
+       for (i = 0; i < root_num; i++) {
+               len = snprintf(str, sizeof(str), "root-%d", i);
+               if (unlikely(len >= (sizeof(str) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                                  str, "%x:%x", &d, &b);
+               if (err < 0)
+                       goto out;
+               if (err != 2) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               if (d == domain && b == bus) {
+                       err = 0;
+                       goto out;
+               }
+       }
+
+       len = snprintf(str, sizeof(str), "root-%d", root_num);
+       if (unlikely(len >= (sizeof(str) - 1))) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+               root_num, domain, bus);
+
+       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+                           "%04x:%02x", domain, bus);
+       if (err)
+               goto out;
+
+       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+                           "root_num", "%d", (root_num + 1));
+
+out:
+       return err;
+}
+
+static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev)
+{
+       int err = 0;
+       int num_devs;
+       int domain, bus, slot, func;
+       int substate;
+       int i, len;
+       char state_str[64];
+       char dev_str[64];
+
+
+       dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+       /* Make sure we only reconfigure once */
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateReconfiguring)
+               goto out;
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+                          &num_devs);
+       if (err != 1) {
+               if (err >= 0)
+                       err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading number of devices");
+               goto out;
+       }
+
+       for (i = 0; i < num_devs; i++) {
+               len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+               if (unlikely(len >= (sizeof(state_str) - 1))) {
+                       err = -ENOMEM;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "String overflow while reading "
+                                        "configuration");
+                       goto out;
+               }
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+                                  "%d", &substate);
+               if (err != 1)
+                       substate = XenbusStateUnknown;
+
+               switch (substate) {
+               case XenbusStateInitialising:
+                       dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+                       len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+                       if (unlikely(len >= (sizeof(dev_str) - 1))) {
+                               err = -ENOMEM;
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "String overflow while "
+                                                "reading configuration");
+                               goto out;
+                       }
+                       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                                          dev_str, "%x:%x:%x.%x",
+                                          &domain, &bus, &slot, &func);
+                       if (err < 0) {
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error reading device "
+                                                "configuration");
+                               goto out;
+                       }
+                       if (err != 4) {
+                               err = -EINVAL;
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error parsing pci device "
+                                                "configuration");
+                               goto out;
+                       }
+
+                       err = xen_pcibk_export_device(pdev, domain, bus, slot,
+                                                   func, i);
+                       if (err)
+                               goto out;
+
+                       /* Publish pci roots. */
+                       err = xen_pcibk_publish_pci_roots(pdev,
+                                               xen_pcibk_publish_pci_root);
+                       if (err) {
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error while publish PCI root"
+                                                "buses for frontend");
+                               goto out;
+                       }
+
+                       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+                                           state_str, "%d",
+                                           XenbusStateInitialised);
+                       if (err) {
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error switching substate of "
+                                                "dev-%d\n", i);
+                               goto out;
+                       }
+                       break;
+
+               case XenbusStateClosing:
+                       dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+                       len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+                       if (unlikely(len >= (sizeof(dev_str) - 1))) {
+                               err = -ENOMEM;
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "String overflow while "
+                                                "reading configuration");
+                               goto out;
+                       }
+                       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                                          dev_str, "%x:%x:%x.%x",
+                                          &domain, &bus, &slot, &func);
+                       if (err < 0) {
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error reading device "
+                                                "configuration");
+                               goto out;
+                       }
+                       if (err != 4) {
+                               err = -EINVAL;
+                               xenbus_dev_fatal(pdev->xdev, err,
+                                                "Error parsing pci device "
+                                                "configuration");
+                               goto out;
+                       }
+
+                       err = xen_pcibk_remove_device(pdev, domain, bus, slot,
+                                                   func);
+                       if (err)
+                               goto out;
+
+                       /* TODO: If at some point we implement support for pci
+                        * root hot-remove on pcifront side, we'll need to
+                        * remove unnecessary xenstore nodes of pci roots here.
+                        */
+
+                       break;
+
+               default:
+                       break;
+               }
+       }
+
+       err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+       if (err) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error switching to reconfigured state!");
+               goto out;
+       }
+
+out:
+       return 0;
+}
+
+static void xen_pcibk_frontend_changed(struct xenbus_device *xdev,
+                                    enum xenbus_state fe_state)
+{
+       struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev);
+
+       dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+       switch (fe_state) {
+       case XenbusStateInitialised:
+               xen_pcibk_attach(pdev);
+               break;
+
+       case XenbusStateReconfiguring:
+               xen_pcibk_reconfigure(pdev);
+               break;
+
+       case XenbusStateConnected:
+               /* pcifront switched its state from reconfiguring to connected.
+                * Then switch to connected state.
+                */
+               xenbus_switch_state(xdev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               xen_pcibk_disconnect(pdev);
+               xenbus_switch_state(xdev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xen_pcibk_disconnect(pdev);
+               xenbus_switch_state(xdev, XenbusStateClosed);
+               if (xenbus_dev_is_online(xdev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+               device_unregister(&xdev->dev);
+               break;
+
+       default:
+               break;
+       }
+}
+
+static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
+{
+       /* Get configuration from xend (if available now) */
+       int domain, bus, slot, func;
+       int err = 0;
+       int i, num_devs;
+       char dev_str[64];
+       char state_str[64];
+
+       /* It's possible we could get the call to setup twice, so make sure
+        * we're not already connected.
+        */
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateInitWait)
+               goto out;
+
+       dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+                          &num_devs);
+       if (err != 1) {
+               if (err >= 0)
+                       err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading number of devices");
+               goto out;
+       }
+
+       for (i = 0; i < num_devs; i++) {
+               int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+               if (unlikely(l >= (sizeof(dev_str) - 1))) {
+                       err = -ENOMEM;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "String overflow while reading "
+                                        "configuration");
+                       goto out;
+               }
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+               if (err < 0) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error reading device configuration");
+                       goto out;
+               }
+               if (err != 4) {
+                       err = -EINVAL;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error parsing pci device "
+                                        "configuration");
+                       goto out;
+               }
+
+               err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i);
+               if (err)
+                       goto out;
+
+               /* Switch substate of this device. */
+               l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+               if (unlikely(l >= (sizeof(state_str) - 1))) {
+                       err = -ENOMEM;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "String overflow while reading "
+                                        "configuration");
+                       goto out;
+               }
+               err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+                                   "%d", XenbusStateInitialised);
+               if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+                                        "substate of dev-%d\n", i);
+                       goto out;
+               }
+       }
+
+       err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root);
+       if (err) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error while publish PCI root buses "
+                                "for frontend");
+               goto out;
+       }
+
+       err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+       if (err)
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error switching to initialised state!");
+
+out:
+       if (!err)
+               /* see if pcifront is already configured (if not, we'll wait) */
+               xen_pcibk_attach(pdev);
+
+       return err;
+}
+
+static void xen_pcibk_be_watch(struct xenbus_watch *watch,
+                            const char **vec, unsigned int len)
+{
+       struct xen_pcibk_device *pdev =
+           container_of(watch, struct xen_pcibk_device, be_watch);
+
+       switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+       case XenbusStateInitWait:
+               xen_pcibk_setup_backend(pdev);
+               break;
+
+       default:
+               break;
+       }
+}
+
+static int xen_pcibk_xenbus_probe(struct xenbus_device *dev,
+                               const struct xenbus_device_id *id)
+{
+       int err = 0;
+       struct xen_pcibk_device *pdev = alloc_pdev(dev);
+
+       if (pdev == NULL) {
+               err = -ENOMEM;
+               xenbus_dev_fatal(dev, err,
+                                "Error allocating xen_pcibk_device struct");
+               goto out;
+       }
+
+       /* wait for xend to configure us */
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto out;
+
+       /* watch the backend node for backend configuration information */
+       err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+                               xen_pcibk_be_watch);
+       if (err)
+               goto out;
+
+       pdev->be_watching = 1;
+
+       /* We need to force a call to our callback here in case
+        * xend already configured us!
+        */
+       xen_pcibk_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+       return err;
+}
+
+static int xen_pcibk_xenbus_remove(struct xenbus_device *dev)
+{
+       struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev);
+
+       if (pdev != NULL)
+               free_pdev(pdev);
+
+       return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+       {"pci"},
+       {""},
+};
+
+static struct xenbus_driver xenbus_xen_pcibk_driver = {
+       .name                   = DRV_NAME,
+       .owner                  = THIS_MODULE,
+       .ids                    = xenpci_ids,
+       .probe                  = xen_pcibk_xenbus_probe,
+       .remove                 = xen_pcibk_xenbus_remove,
+       .otherend_changed       = xen_pcibk_frontend_changed,
+};
+
+struct xen_pcibk_backend *xen_pcibk_backend;
+
+int __init xen_pcibk_xenbus_register(void)
+{
+       xen_pcibk_wq = create_workqueue("xen_pciback_workqueue");
+       if (!xen_pcibk_wq) {
+               printk(KERN_ERR "%s: create"
+                       "xen_pciback_workqueue failed\n", __func__);
+               return -EFAULT;
+       }
+       xen_pcibk_backend = &xen_pcibk_vpci_backend;
+       if (passthrough)
+               xen_pcibk_backend = &xen_pcibk_passthrough_backend;
+       pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name);
+       return xenbus_register_backend(&xenbus_xen_pcibk_driver);
+}
+
+void __exit xen_pcibk_xenbus_unregister(void)
+{
+       destroy_workqueue(xen_pcibk_wq);
+       xenbus_unregister_driver(&xenbus_xen_pcibk_driver);
+}
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
new file mode 100644 (file)
index 0000000..010937b
--- /dev/null
@@ -0,0 +1,485 @@
+/******************************************************************************
+ * Xen selfballoon driver (and optional frontswap self-shrinking driver)
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ *
+ * This code complements the cleancache and frontswap patchsets to optimize
+ * support for Xen Transcendent Memory ("tmem").  The policy it implements
+ * is rudimentary and will likely improve over time, but it does work well
+ * enough today.
+ *
+ * Two functionalities are implemented here which both use "control theory"
+ * (feedback) to optimize memory utilization. In a virtualized environment
+ * such as Xen, RAM is often a scarce resource and we would like to ensure
+ * that each of a possibly large number of virtual machines is using RAM
+ * efficiently, i.e. using as little as possible when under light load
+ * and obtaining as much as possible when memory demands are high.
+ * Since RAM needs vary highly dynamically and sometimes dramatically,
+ * "hysteresis" is used, that is, memory target is determined not just
+ * on current data but also on past data stored in the system.
+ *
+ * "Selfballooning" creates memory pressure by managing the Xen balloon
+ * driver to decrease and increase available kernel memory, driven
+ * largely by the target value of "Committed_AS" (see /proc/meminfo).
+ * Since Committed_AS does not account for clean mapped pages (i.e. pages
+ * in RAM that are identical to pages on disk), selfballooning has the
+ * affect of pushing less frequently used clean pagecache pages out of
+ * kernel RAM and, presumably using cleancache, into Xen tmem where
+ * Xen can more efficiently optimize RAM utilization for such pages.
+ *
+ * When kernel memory demand unexpectedly increases faster than Xen, via
+ * the selfballoon driver, is able to (or chooses to) provide usable RAM,
+ * the kernel may invoke swapping.  In most cases, frontswap is able
+ * to absorb this swapping into Xen tmem.  However, due to the fact
+ * that the kernel swap subsystem assumes swapping occurs to a disk,
+ * swapped pages may sit on the disk for a very long time; even if
+ * the kernel knows the page will never be used again.  This is because
+ * the disk space costs very little and can be overwritten when
+ * necessary.  When such stale pages are in frontswap, however, they
+ * are taking up valuable real estate.  "Frontswap selfshrinking" works
+ * to resolve this:  When frontswap activity is otherwise stable
+ * and the guest kernel is not under memory pressure, the "frontswap
+ * selfshrinking" accounts for this by providing pressure to remove some
+ * pages from frontswap and return them to kernel memory.
+ *
+ * For both "selfballooning" and "frontswap-selfshrinking", a worker
+ * thread is used and sysfs tunables are provided to adjust the frequency
+ * and rate of adjustments to achieve the goal, as well as to disable one
+ * or both functions independently.
+ *
+ * While some argue that this functionality can and should be implemented
+ * in userspace, it has been observed that bad things happen (e.g. OOMs).
+ *
+ * System configuration note: Selfballooning should not be enabled on
+ * systems without a sufficiently large swap device configured; for best
+ * results, it is recommended that total swap be increased by the size
+ * of the guest memory.  Also, while technically not required to be
+ * configured, it is highly recommended that frontswap also be configured
+ * and enabled when selfballooning is running.  So, selfballooning
+ * is disabled by default if frontswap is not configured and can only
+ * be enabled with the "selfballooning" kernel boot option; similarly
+ * selfballooning is enabled by default if frontswap is configured and
+ * can be disabled with the "noselfballooning" kernel boot option.  Finally,
+ * when frontswap is configured, frontswap-selfshrinking can be disabled
+ * with the "noselfshrink" kernel boot option.
+ *
+ * Selfballooning is disallowed in domain0 and force-disabled.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+
+#include <xen/balloon.h>
+
+#include <xen/tmem.h>
+
+/* Enable/disable with sysfs. */
+static int xen_selfballooning_enabled __read_mostly;
+
+/*
+ * Controls rate at which memory target (this iteration) approaches
+ * ultimate goal when memory need is increasing (up-hysteresis) or
+ * decreasing (down-hysteresis). Higher values of hysteresis cause
+ * slower increases/decreases. The default values for the various
+ * parameters were deemed reasonable by experimentation, may be
+ * workload-dependent, and can all be adjusted via sysfs.
+ */
+static unsigned int selfballoon_downhysteresis __read_mostly = 8;
+static unsigned int selfballoon_uphysteresis __read_mostly = 1;
+
+/* In HZ, controls frequency of worker invocation. */
+static unsigned int selfballoon_interval __read_mostly = 5;
+
+static void selfballoon_process(struct work_struct *work);
+static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
+
+#ifdef CONFIG_FRONTSWAP
+#include <linux/frontswap.h>
+
+/* Enable/disable with sysfs. */
+static bool frontswap_selfshrinking __read_mostly;
+
+/* Enable/disable with kernel boot option. */
+static bool use_frontswap_selfshrink __initdata = true;
+
+/*
+ * The default values for the following parameters were deemed reasonable
+ * by experimentation, may be workload-dependent, and can all be
+ * adjusted via sysfs.
+ */
+
+/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
+static unsigned int frontswap_hysteresis __read_mostly = 20;
+
+/*
+ * Number of selfballoon worker invocations to wait before observing that
+ * frontswap selfshrinking should commence. Note that selfshrinking does
+ * not use a separate worker thread.
+ */
+static unsigned int frontswap_inertia __read_mostly = 3;
+
+/* Countdown to next invocation of frontswap_shrink() */
+static unsigned long frontswap_inertia_counter;
+
+/*
+ * Invoked by the selfballoon worker thread, uses current number of pages
+ * in frontswap (frontswap_curr_pages()), previous status, and control
+ * values (hysteresis and inertia) to determine if frontswap should be
+ * shrunk and what the new frontswap size should be.  Note that
+ * frontswap_shrink is essentially a partial swapoff that immediately
+ * transfers pages from the "swap device" (frontswap) back into kernel
+ * RAM; despite the name, frontswap "shrinking" is very different from
+ * the "shrinker" interface used by the kernel MM subsystem to reclaim
+ * memory.
+ */
+static void frontswap_selfshrink(void)
+{
+       static unsigned long cur_frontswap_pages;
+       static unsigned long last_frontswap_pages;
+       static unsigned long tgt_frontswap_pages;
+
+       last_frontswap_pages = cur_frontswap_pages;
+       cur_frontswap_pages = frontswap_curr_pages();
+       if (!cur_frontswap_pages ||
+                       (cur_frontswap_pages > last_frontswap_pages)) {
+               frontswap_inertia_counter = frontswap_inertia;
+               return;
+       }
+       if (frontswap_inertia_counter && --frontswap_inertia_counter)
+               return;
+       if (cur_frontswap_pages <= frontswap_hysteresis)
+               tgt_frontswap_pages = 0;
+       else
+               tgt_frontswap_pages = cur_frontswap_pages -
+                       (cur_frontswap_pages / frontswap_hysteresis);
+       frontswap_shrink(tgt_frontswap_pages);
+}
+
+static int __init xen_nofrontswap_selfshrink_setup(char *s)
+{
+       use_frontswap_selfshrink = false;
+       return 1;
+}
+
+__setup("noselfshrink", xen_nofrontswap_selfshrink_setup);
+
+/* Disable with kernel boot option. */
+static bool use_selfballooning __initdata = true;
+
+static int __init xen_noselfballooning_setup(char *s)
+{
+       use_selfballooning = false;
+       return 1;
+}
+
+__setup("noselfballooning", xen_noselfballooning_setup);
+#else /* !CONFIG_FRONTSWAP */
+/* Enable with kernel boot option. */
+static bool use_selfballooning __initdata = false;
+
+static int __init xen_selfballooning_setup(char *s)
+{
+       use_selfballooning = true;
+       return 1;
+}
+
+__setup("selfballooning", xen_selfballooning_setup);
+#endif /* CONFIG_FRONTSWAP */
+
+/*
+ * Use current balloon size, the goal (vm_committed_as), and hysteresis
+ * parameters to set a new target balloon size
+ */
+static void selfballoon_process(struct work_struct *work)
+{
+       unsigned long cur_pages, goal_pages, tgt_pages;
+       bool reset_timer = false;
+
+       if (xen_selfballooning_enabled) {
+               cur_pages = balloon_stats.current_pages;
+               tgt_pages = cur_pages; /* default is no change */
+               goal_pages = percpu_counter_read_positive(&vm_committed_as) +
+                       balloon_stats.current_pages - totalram_pages;
+#ifdef CONFIG_FRONTSWAP
+               /* allow space for frontswap pages to be repatriated */
+               if (frontswap_selfshrinking && frontswap_enabled)
+                       goal_pages += frontswap_curr_pages();
+#endif
+               if (cur_pages > goal_pages)
+                       tgt_pages = cur_pages -
+                               ((cur_pages - goal_pages) /
+                                 selfballoon_downhysteresis);
+               else if (cur_pages < goal_pages)
+                       tgt_pages = cur_pages +
+                               ((goal_pages - cur_pages) /
+                                 selfballoon_uphysteresis);
+               /* else if cur_pages == goal_pages, no change */
+               balloon_set_new_target(tgt_pages);
+               reset_timer = true;
+       }
+#ifdef CONFIG_FRONTSWAP
+       if (frontswap_selfshrinking && frontswap_enabled) {
+               frontswap_selfshrink();
+               reset_timer = true;
+       }
+#endif
+       if (reset_timer)
+               schedule_delayed_work(&selfballoon_worker,
+                       selfballoon_interval * HZ);
+}
+
+#ifdef CONFIG_SYSFS
+
+#include <linux/sysdev.h>
+#include <linux/capability.h>
+
+#define SELFBALLOON_SHOW(name, format, args...)                                \
+       static ssize_t show_##name(struct sys_device *dev,      \
+                                          struct sysdev_attribute *attr, \
+                                          char *buf) \
+       { \
+               return sprintf(buf, format, ##args); \
+       }
+
+SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled);
+
+static ssize_t store_selfballooning(struct sys_device *dev,
+                           struct sysdev_attribute *attr,
+                           const char *buf,
+                           size_t count)
+{
+       bool was_enabled = xen_selfballooning_enabled;
+       unsigned long tmp;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       err = strict_strtoul(buf, 10, &tmp);
+       if (err || ((tmp != 0) && (tmp != 1)))
+               return -EINVAL;
+
+       xen_selfballooning_enabled = !!tmp;
+       if (!was_enabled && xen_selfballooning_enabled)
+               schedule_delayed_work(&selfballoon_worker,
+                       selfballoon_interval * HZ);
+
+       return count;
+}
+
+static SYSDEV_ATTR(selfballooning, S_IRUGO | S_IWUSR,
+                  show_selfballooning, store_selfballooning);
+
+SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval);
+
+static ssize_t store_selfballoon_interval(struct sys_device *dev,
+                                         struct sysdev_attribute *attr,
+                                         const char *buf,
+                                         size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       selfballoon_interval = val;
+       return count;
+}
+
+static SYSDEV_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR,
+                  show_selfballoon_interval, store_selfballoon_interval);
+
+SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis);
+
+static ssize_t store_selfballoon_downhys(struct sys_device *dev,
+                                        struct sysdev_attribute *attr,
+                                        const char *buf,
+                                        size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       selfballoon_downhysteresis = val;
+       return count;
+}
+
+static SYSDEV_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR,
+                  show_selfballoon_downhys, store_selfballoon_downhys);
+
+
+SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis);
+
+static ssize_t store_selfballoon_uphys(struct sys_device *dev,
+                                      struct sysdev_attribute *attr,
+                                      const char *buf,
+                                      size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       selfballoon_uphysteresis = val;
+       return count;
+}
+
+static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
+                  show_selfballoon_uphys, store_selfballoon_uphys);
+
+#ifdef CONFIG_FRONTSWAP
+SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
+
+static ssize_t store_frontswap_selfshrinking(struct sys_device *dev,
+                                            struct sysdev_attribute *attr,
+                                            const char *buf,
+                                            size_t count)
+{
+       bool was_enabled = frontswap_selfshrinking;
+       unsigned long tmp;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &tmp);
+       if (err || ((tmp != 0) && (tmp != 1)))
+               return -EINVAL;
+       frontswap_selfshrinking = !!tmp;
+       if (!was_enabled && !xen_selfballooning_enabled &&
+            frontswap_selfshrinking)
+               schedule_delayed_work(&selfballoon_worker,
+                       selfballoon_interval * HZ);
+
+       return count;
+}
+
+static SYSDEV_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR,
+                  show_frontswap_selfshrinking, store_frontswap_selfshrinking);
+
+SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia);
+
+static ssize_t store_frontswap_inertia(struct sys_device *dev,
+                                      struct sysdev_attribute *attr,
+                                      const char *buf,
+                                      size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       frontswap_inertia = val;
+       frontswap_inertia_counter = val;
+       return count;
+}
+
+static SYSDEV_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR,
+                  show_frontswap_inertia, store_frontswap_inertia);
+
+SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis);
+
+static ssize_t store_frontswap_hysteresis(struct sys_device *dev,
+                                         struct sysdev_attribute *attr,
+                                         const char *buf,
+                                         size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       frontswap_hysteresis = val;
+       return count;
+}
+
+static SYSDEV_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR,
+                  show_frontswap_hysteresis, store_frontswap_hysteresis);
+
+#endif /* CONFIG_FRONTSWAP */
+
+static struct attribute *selfballoon_attrs[] = {
+       &attr_selfballooning.attr,
+       &attr_selfballoon_interval.attr,
+       &attr_selfballoon_downhysteresis.attr,
+       &attr_selfballoon_uphysteresis.attr,
+#ifdef CONFIG_FRONTSWAP
+       &attr_frontswap_selfshrinking.attr,
+       &attr_frontswap_hysteresis.attr,
+       &attr_frontswap_inertia.attr,
+#endif
+       NULL
+};
+
+static struct attribute_group selfballoon_group = {
+       .name = "selfballoon",
+       .attrs = selfballoon_attrs
+};
+#endif
+
+int register_xen_selfballooning(struct sys_device *sysdev)
+{
+       int error = -1;
+
+#ifdef CONFIG_SYSFS
+       error = sysfs_create_group(&sysdev->kobj, &selfballoon_group);
+#endif
+       return error;
+}
+EXPORT_SYMBOL(register_xen_selfballooning);
+
+static int __init xen_selfballoon_init(void)
+{
+       bool enable = false;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       if (xen_initial_domain()) {
+               pr_info("xen/balloon: Xen selfballooning driver "
+                               "disabled for domain0.\n");
+               return -ENODEV;
+       }
+
+       xen_selfballooning_enabled = tmem_enabled && use_selfballooning;
+       if (xen_selfballooning_enabled) {
+               pr_info("xen/balloon: Initializing Xen "
+                                       "selfballooning driver.\n");
+               enable = true;
+       }
+#ifdef CONFIG_FRONTSWAP
+       frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink;
+       if (frontswap_selfshrinking) {
+               pr_info("xen/balloon: Initializing frontswap "
+                                       "selfshrinking driver.\n");
+               enable = true;
+       }
+#endif
+       if (!enable)
+               return -ENODEV;
+
+       schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
+
+       return 0;
+}
+
+subsys_initcall(xen_selfballoon_init);
+
+MODULE_LICENSE("GPL");
index 7397695..bd2f90c 100644 (file)
@@ -378,26 +378,32 @@ static void xenbus_dev_release(struct device *dev)
                kfree(to_xenbus_device(dev));
 }
 
-static ssize_t xendev_show_nodename(struct device *dev,
-                                   struct device_attribute *attr, char *buf)
+static ssize_t nodename_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
 {
        return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
 }
-static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
 
-static ssize_t xendev_show_devtype(struct device *dev,
-                                  struct device_attribute *attr, char *buf)
+static ssize_t devtype_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
 {
        return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
 }
-static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
 
-static ssize_t xendev_show_modalias(struct device *dev,
-                                   struct device_attribute *attr, char *buf)
+static ssize_t modalias_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
 {
-       return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
+       return sprintf(buf, "%s:%s\n", dev->bus->name,
+                      to_xenbus_device(dev)->devicetype);
 }
-static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+
+struct device_attribute xenbus_dev_attrs[] = {
+       __ATTR_RO(nodename),
+       __ATTR_RO(devtype),
+       __ATTR_RO(modalias),
+       __ATTR_NULL
+};
+EXPORT_SYMBOL_GPL(xenbus_dev_attrs);
 
 int xenbus_probe_node(struct xen_bus_type *bus,
                      const char *type,
@@ -449,25 +455,7 @@ int xenbus_probe_node(struct xen_bus_type *bus,
        if (err)
                goto fail;
 
-       err = device_create_file(&xendev->dev, &dev_attr_nodename);
-       if (err)
-               goto fail_unregister;
-
-       err = device_create_file(&xendev->dev, &dev_attr_devtype);
-       if (err)
-               goto fail_remove_nodename;
-
-       err = device_create_file(&xendev->dev, &dev_attr_modalias);
-       if (err)
-               goto fail_remove_devtype;
-
        return 0;
-fail_remove_devtype:
-       device_remove_file(&xendev->dev, &dev_attr_devtype);
-fail_remove_nodename:
-       device_remove_file(&xendev->dev, &dev_attr_nodename);
-fail_unregister:
-       device_unregister(&xendev->dev);
 fail:
        kfree(xendev);
        return err;
index 888b990..b814935 100644 (file)
@@ -48,6 +48,8 @@ struct xen_bus_type
        struct bus_type bus;
 };
 
+extern struct device_attribute xenbus_dev_attrs[];
+
 extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
 extern int xenbus_dev_probe(struct device *_dev);
 extern int xenbus_dev_remove(struct device *_dev);
index 6cf467b..60adf91 100644 (file)
@@ -107,6 +107,9 @@ static int xenbus_uevent_backend(struct device *dev,
        if (xdev == NULL)
                return -ENODEV;
 
+       if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype))
+               return -ENOMEM;
+
        /* stuff we want to pass to /sbin/hotplug */
        if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
                return -ENOMEM;
@@ -183,10 +186,6 @@ static void frontend_changed(struct xenbus_watch *watch,
        xenbus_otherend_changed(watch, vec, len, 0);
 }
 
-static struct device_attribute xenbus_backend_dev_attrs[] = {
-       __ATTR_NULL
-};
-
 static struct xen_bus_type xenbus_backend = {
        .root = "backend",
        .levels = 3,            /* backend/type/<frontend>/<id> */
@@ -200,7 +199,7 @@ static struct xen_bus_type xenbus_backend = {
                .probe          = xenbus_dev_probe,
                .remove         = xenbus_dev_remove,
                .shutdown       = xenbus_dev_shutdown,
-               .dev_attrs      = xenbus_backend_dev_attrs,
+               .dev_attrs      = xenbus_dev_attrs,
        },
 };
 
index b6a2690..ed2ba47 100644 (file)
@@ -81,10 +81,6 @@ static void backend_changed(struct xenbus_watch *watch,
        xenbus_otherend_changed(watch, vec, len, 1);
 }
 
-static struct device_attribute xenbus_frontend_dev_attrs[] = {
-       __ATTR_NULL
-};
-
 static const struct dev_pm_ops xenbus_pm_ops = {
        .suspend        = xenbus_dev_suspend,
        .resume         = xenbus_dev_resume,
@@ -106,7 +102,7 @@ static struct xen_bus_type xenbus_frontend = {
                .probe          = xenbus_dev_probe,
                .remove         = xenbus_dev_remove,
                .shutdown       = xenbus_dev_shutdown,
-               .dev_attrs      = xenbus_frontend_dev_attrs,
+               .dev_attrs      = xenbus_dev_attrs,
 
                .pm             = &xenbus_pm_ops,
        },
index a2b22f0..4076ed7 100644 (file)
@@ -23,3 +23,13 @@ void balloon_set_new_target(unsigned long target);
 
 int alloc_xenballooned_pages(int nr_pages, struct page** pages);
 void free_xenballooned_pages(int nr_pages, struct page** pages);
+
+struct sys_device;
+#ifdef CONFIG_XEN_SELFBALLOONING
+extern int register_xen_selfballooning(struct sys_device *sysdev);
+#else
+static inline int register_xen_selfballooning(struct sys_device *sysdev)
+{
+       return -ENOSYS;
+}
+#endif
index c3adde3..901724d 100644 (file)
@@ -6,11 +6,13 @@ extern struct console xenboot_console;
 #ifdef CONFIG_HVC_XEN
 void xen_console_resume(void);
 void xen_raw_console_write(const char *str);
+__attribute__((format(printf, 1, 2)))
 void xen_raw_printk(const char *fmt, ...);
 #else
 static inline void xen_console_resume(void) { }
 static inline void xen_raw_console_write(const char *str) { }
-static inline void xen_raw_printk(const char *fmt, ...) { }
+static inline __attribute__((format(printf, 1, 2)))
+void xen_raw_printk(const char *fmt, ...) { }
 #endif
 
 #endif /* XEN_HVC_CONSOLE_H */
index 70213b4..6acd9ce 100644 (file)
@@ -450,6 +450,45 @@ struct start_info {
        int8_t cmd_line[MAX_GUEST_CMDLINE];
 };
 
+struct dom0_vga_console_info {
+       uint8_t video_type;
+#define XEN_VGATYPE_TEXT_MODE_3 0x03
+#define XEN_VGATYPE_VESA_LFB    0x23
+
+       union {
+               struct {
+                       /* Font height, in pixels. */
+                       uint16_t font_height;
+                       /* Cursor location (column, row). */
+                       uint16_t cursor_x, cursor_y;
+                       /* Number of rows and columns (dimensions in characters). */
+                       uint16_t rows, columns;
+               } text_mode_3;
+
+               struct {
+                       /* Width and height, in pixels. */
+                       uint16_t width, height;
+                       /* Bytes per scan line. */
+                       uint16_t bytes_per_line;
+                       /* Bits per pixel. */
+                       uint16_t bits_per_pixel;
+                       /* LFB physical address, and size (in units of 64kB). */
+                       uint32_t lfb_base;
+                       uint32_t lfb_size;
+                       /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
+                       uint8_t  red_pos, red_size;
+                       uint8_t  green_pos, green_size;
+                       uint8_t  blue_pos, blue_size;
+                       uint8_t  rsvd_pos, rsvd_size;
+
+                       /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
+                       uint32_t gbl_caps;
+                       /* Mode attributes (offset 0x0, VESA command 0x4f01). */
+                       uint16_t mode_attrs;
+               } vesa_lfb;
+       } u;
+};
+
 /* These flags are passed in the 'flags' field of start_info_t. */
 #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
 #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
diff --git a/include/xen/tmem.h b/include/xen/tmem.h
new file mode 100644 (file)
index 0000000..82e2c83
--- /dev/null
@@ -0,0 +1,5 @@
+#ifndef _XEN_TMEM_H
+#define _XEN_TMEM_H
+/* defined in drivers/xen/tmem.c */
+extern int tmem_enabled;
+#endif /* _XEN_TMEM_H */
index 5467369..aceeca7 100644 (file)
@@ -223,7 +223,9 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port);
 
 enum xenbus_state xenbus_read_driver_state(const char *path);
 
+__attribute__((format(printf, 3, 4)))
 void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
+__attribute__((format(printf, 3, 4)))
 void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
 
 const char *xenbus_strstate(enum xenbus_state state);