From 51c71a3bbaca868043cc45b3ad3786dd48a90235 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Tue, 26 Nov 2013 15:05:40 -0500
Subject: [PATCH 01/52] xen/pvhvm: If xen_platform_pci=0 is set don't blow up (v4).

The user has the option of disabling the platform driver:

  00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev 01)

which is used to unplug the emulated drivers (IDE, Realtek 8169, etc.)
and allow the PV drivers to take over. If the user wishes to disable
that, they can set:

  xen_platform_pci=0   (in the guest config file)

or

  xen_emul_unplug=never   (on the Linux command line)

except it does not work properly. The PV drivers still try to load,
and since the Xen platform driver has not run - and therefore has not
initialized the grant tables - most of the PV drivers stumble upon:

input: Xen Virtual Keyboard as /devices/virtual/input/input5
input: Xen Virtual Pointer as /devices/virtual/input/input6M
------------[ cut here ]------------
kernel BUG at /home/konrad/ssd/konrad/linux/drivers/xen/grant-table.c:1206!
invalid opcode: 0000 [#1] SMP
Modules linked in: xen_kbdfront(+) xenfs xen_privcmd
CPU: 6 PID: 1389 Comm: modprobe Not tainted 3.13.0-rc1upstream-00021-ga6c892b-dirty #1
Hardware name: Xen HVM domU, BIOS 4.4-unstable 11/26/2013
RIP: 0010:[] [] get_free_entries+0x2e0/0x300
Call Trace:
 [] ? evdev_connect+0x1e3/0x240
 [] gnttab_grant_foreign_access+0x2e/0x70
 [] xenkbd_connect_backend+0x41/0x290 [xen_kbdfront]
 [] xenkbd_probe+0x2f2/0x324 [xen_kbdfront]
 [] xenbus_dev_probe+0x77/0x130
 [] xenbus_frontend_dev_probe+0x47/0x50
 [] driver_probe_device+0x89/0x230
 [] __driver_attach+0x9b/0xa0
 [] ? driver_probe_device+0x230/0x230
 [] ? driver_probe_device+0x230/0x230
 [] bus_for_each_dev+0x8c/0xb0
 [] driver_attach+0x19/0x20
 [] bus_add_driver+0x1a0/0x220
 [] driver_register+0x5f/0xf0
 [] xenbus_register_driver_common+0x15/0x20
 [] xenbus_register_frontend+0x23/0x40
 [] ? 0xffffffffa0014fff
 [] xenkbd_init+0x2b/0x1000 [xen_kbdfront]
 [] do_one_initcall+0x49/0x170
.. snip..

which is hardly nice. This patch fixes this by having each PV driver
check for:

 - if running in PV, then it is fine to execute (as that is their
   native environment).
 - if running in HVM, check if the user wanted 'xen_emul_unplug=never',
   in which case bail out and don't load any PV drivers.
 - if running in HVM, and if the PCI device 5853:0001 (xen_platform_pci)
   does not exist, then bail out and don't load PV drivers.
 - (v2) if running in HVM, and if the user wanted
   'xen_emul_unplug=ide-disks', then bail out for all PV devices
   _except_ the block one. Ditto for the network one ('nics').
 - (v2) if running in HVM, and if the user wanted
   'xen_emul_unplug=unnecessary', then load the block PV driver and also
   set up the legacy IDE paths. In (v3) make it actually load PV drivers.
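To make the intended call sites concrete, here is a minimal sketch of
how a frontend's module init guards itself with the new helpers
(illustrative only, not part of the diff below; 'examplefront' is a
made-up driver name):

	/* Hypothetical frontend init using the helper added below. */
	static int __init examplefront_init(void)
	{
		if (!xen_domain())
			return -ENODEV;

		/*
		 * Covers both xen_emul_unplug=never and a missing
		 * 5853:0001 platform device: without the platform
		 * driver the grant tables were never initialized.
		 */
		if (!xen_has_pv_devices())
			return -ENODEV;

		return xenbus_register_frontend(&examplefront_driver);
	}
	module_init(examplefront_init);

Block frontends use xen_has_pv_disk_devices() and network frontends
xen_has_pv_nic_devices() instead, so the partial 'ide-disks'/'nics'
unplug modes still work.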
Reported-by: Sander Eikelenboom
Reported-and-Tested-by: Fabio Fantoni
Signed-off-by: Konrad Rzeszutek Wilk
[v2: Add extra logic to handle the myriad ways 'xen_emul_unplug' can be
 used, per Ian's and Stefano's suggestion]
[v3: Make the unnecessary case work properly]
[v4: s/disks/ide-disks/ spotted by Fabio]
Reviewed-by: Stefano Stabellini
Acked-by: Bjorn Helgaas [for PCI parts]
CC: stable@vger.kernel.org
---
 arch/x86/xen/platform-pci-unplug.c         | 74 ++++++++++++++++++++++
 drivers/block/xen-blkfront.c               |  4 +-
 drivers/char/tpm/xen-tpmfront.c            |  4 ++
 drivers/input/misc/xen-kbdfront.c          |  4 ++
 drivers/net/xen-netfront.c                 |  2 +-
 drivers/pci/xen-pcifront.c                 |  4 ++
 drivers/video/xen-fbfront.c                |  4 ++
 drivers/xen/xenbus/xenbus_probe_frontend.c |  2 +-
 include/xen/platform_pci.h                 | 23 +++++++
 9 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ffe..ab84ac198a9a 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -69,6 +69,80 @@ static int check_platform_magic(void)
 	return 0;
 }
 
+bool xen_has_pv_devices()
+{
+	if (!xen_domain())
+		return false;
+
+	/* PV domains always have them. */
+	if (xen_pv_domain())
+		return true;
+
+	/* And user has xen_platform_pci=0 set in guest config as
+	 * driver did not modify the value. */
+	if (xen_platform_pci_unplug == 0)
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+		return true;
+
+	/* This is an odd one - we are going to run legacy
+	 * and PV drivers at the same time. */
+	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+		return true;
+
+	/* And the caller has to follow with xen_pv_{disk,nic}_devices
+	 * to be certain which driver can load. */
+	return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+	/* HVM domains might or might not */
+	if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+		return true;
+
+	return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+	return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+	return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+				   XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+	if (!xen_domain())
+		return false;
+
+	/* N.B. This is only ever used in HVM mode */
+	if (xen_pv_domain())
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
 void xen_unplug_emulated_devices(void)
 {
 	int r;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index c4a4c9006288..f9c43f91f03e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1356,7 +1356,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 		char *type;
 		int len;
 		/* no unplug has been done: do not hook devices != xen vbds */
-		if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
+		if (xen_has_pv_and_legacy_disk_devices()) {
 			int major;
 
 			if (!VDEV_IS_EXTENDED(vdevice))
@@ -2079,7 +2079,7 @@ static int __init xlblk_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+	if (!xen_has_pv_disk_devices())
 		return -ENODEV;
 
 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c
index c8ff4df81779..62e7d383fa64 100644
--- a/drivers/char/tpm/xen-tpmfront.c
+++ b/drivers/char/tpm/xen-tpmfront.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include "tpm.h"
+#include <xen/platform_pci.h>
 
 struct tpm_private {
 	struct tpm_chip *chip;
@@ -421,6 +422,9 @@ static int __init xen_tpmfront_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (!xen_has_pv_devices())
+		return -ENODEV;
+
 	return xenbus_register_frontend(&tpmfront_driver);
 }
 module_init(xen_tpmfront_init);
diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c
index e21c1816a8f9..fbfdc10573be 100644
--- a/drivers/input/misc/xen-kbdfront.c
+++ b/drivers/input/misc/xen-kbdfront.c
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include <xen/platform_pci.h>
 
 struct xenkbd_info {
 	struct input_dev *kbd;
@@ -380,6 +381,9 @@ static int __init xenkbd_init(void)
 	if (xen_initial_domain())
 		return -ENODEV;
 
+	if (!xen_has_pv_devices())
+		return -ENODEV;
+
 	return xenbus_register_frontend(&xenkbd_driver);
 }
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index e59acb1daa23..2ab82fe75ede 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -2115,7 +2115,7 @@ static int __init netif_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+	if (!xen_has_pv_nic_devices())
 		return -ENODEV;
 
 	pr_info("Initialising Xen virtual ethernet driver\n");
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index f7197a790341..eae7cd9fde7b 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <xen/platform_pci.h>
 #include
 
 #define INVALID_GRANT_REF (0)
@@ -1138,6 +1139,9 @@ static int __init pcifront_init(void)
 	if (!xen_pv_domain() || xen_initial_domain())
 		return -ENODEV;
 
+	if (!xen_has_pv_devices())
+		return -ENODEV;
+
 	pci_frontend_registrar(1 /* enable */);
 
 	return xenbus_register_frontend(&xenpci_driver);
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index cd005c227a23..4b2d3ab870f3 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include <xen/platform_pci.h>
 
 struct xenfb_info {
 	unsigned char *fb;
@@ -699,6 +700,9 @@ static int __init xenfb_init(void)
 	if (xen_initial_domain())
 		return -ENODEV;
 
+	if (!xen_has_pv_devices())
+		return -ENODEV;
+
 	return xenbus_register_frontend(&xenfb_driver);
 }
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
index 129bf84c19ec..cb385c10d2b1 100644
--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init);
 #ifndef MODULE
 static int __init boot_wait_for_devices(void)
 {
-	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+	if (!xen_has_pv_devices())
 		return -ENODEV;
 
 	ready_to_wait_for_devices = 1;
diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h
index 438c256c274b..b49eeab0262e 100644
--- a/include/xen/platform_pci.h
+++ b/include/xen/platform_pci.h
@@ -48,4 +48,27 @@ static inline int xen_must_unplug_disks(void) {
 
 extern int xen_platform_pci_unplug;
 
+#if defined(CONFIG_XEN_PVHVM)
+extern bool xen_has_pv_devices(void);
+extern bool xen_has_pv_disk_devices(void);
+extern bool xen_has_pv_nic_devices(void);
+extern bool xen_has_pv_and_legacy_disk_devices(void);
+#else
+static inline bool xen_has_pv_devices(void)
+{
+	return IS_ENABLED(CONFIG_XEN);
+}
+static inline bool xen_has_pv_disk_devices(void)
+{
+	return IS_ENABLED(CONFIG_XEN);
+}
+static inline bool xen_has_pv_nic_devices(void)
+{
+	return IS_ENABLED(CONFIG_XEN);
+}
+static inline bool xen_has_pv_and_legacy_disk_devices(void)
+{
+	return false;
+}
+#endif
 #endif /* _XEN_PLATFORM_PCI_H */

From 6f6c15ef912465b3aaafe709f39bd6026a8b3e72 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Wed, 11 Dec 2013 15:22:01 -0500
Subject: [PATCH 02/52] xen/pvhvm: Remove the xen_platform_pci int.

Since we have xen_has_pv_devices, xen_has_pv_disk_devices,
xen_has_pv_nic_devices, and xen_has_pv_and_legacy_disk_devices to
figure out the different 'unplug' behaviors, let's use those instead
of this single int.

Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/platform-pci-unplug.c | 5 ++---
 include/xen/platform_pci.h         | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index ab84ac198a9a..a8261716d58d 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -30,10 +30,9 @@
 #define XEN_PLATFORM_ERR_PROTOCOL -2
 #define XEN_PLATFORM_ERR_BLACKLIST -3
 
-/* store the value of xen_emul_unplug after the unplug is done */
-int xen_platform_pci_unplug;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
 #ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
 static int xen_emul_unplug;
 
 static int check_platform_magic(void)
diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h
index b49eeab0262e..5c52b5583917 100644
--- a/include/xen/platform_pci.h
+++ b/include/xen/platform_pci.h
@@ -46,8 +46,6 @@ static inline int xen_must_unplug_disks(void) {
 #endif
 }
 
-extern int xen_platform_pci_unplug;
-
 #if defined(CONFIG_XEN_PVHVM)
 extern bool xen_has_pv_devices(void);
 extern bool xen_has_pv_disk_devices(void);

From 72f28071f14fd9b6cc03aaf83b057d169d817411 Mon Sep 17 00:00:00 2001
From: Ian Campbell
Date: Wed, 11 Dec 2013 12:03:17 +0000
Subject: [PATCH 03/52] xen: balloon: enable for ARM

Since c275a57f5ec3 "xen/balloon: Set balloon's initial state to number
of existing RAM pages" the balloon driver appears to work fine on ARM
as far as I can tell.

Prior to that commit it was broken because on ARM RAM doesn't typically
start at zero, effectively leaving a big MMIO hole at the start. This
would cause the balloon driver to give away all of RAM at start of day,
which is rather inconvenient.
It was already enabled (or rather not excluded) on ARM64. Commit
c1d15f5c8bc1170dafe16e988e55437245966dfe "xen/balloon: Seperate the
auto-translate logic properly (v2)" added the proper plumbing to work
with ARM and PVH type guests.

Signed-off-by: Ian Campbell
Cc: Konrad Rzeszutek Wilk
Cc: Boris Ostrovsky
Cc: David Vrabel
Acked-by: Stefano Stabellini
[v2: Added the bit about PVH]
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/xen/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index c794ea182140..b9ea2abe5628 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -3,7 +3,6 @@ menu "Xen driver support"
 
 config XEN_BALLOON
 	bool "Xen memory balloon driver"
-	depends on !ARM
 	default y
 	help
 	  The balloon driver allows the Xen domain to request more memory from

From 02bcf053e9c5dfbb541b8e27a8eeb962a54d577b Mon Sep 17 00:00:00 2001
From: Wei Liu
Date: Fri, 3 Jan 2014 14:03:35 +0000
Subject: [PATCH 04/52] asm/xen/page.h: remove redundant semicolon

Signed-off-by: Wei Liu
Signed-off-by: Konrad Rzeszutek Wilk
Acked-by: Stefano Stabellini
---
 arch/arm/include/asm/xen/page.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h
index 75579a9d6f76..ac6789aad059 100644
--- a/arch/arm/include/asm/xen/page.h
+++ b/arch/arm/include/asm/xen/page.h
@@ -117,6 +117,6 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	return __set_phys_to_machine(pfn, mfn);
 }
 
-#define xen_remap(cookie, size) ioremap_cached((cookie), (size));
+#define xen_remap(cookie, size) ioremap_cached((cookie), (size))
 
 #endif /* _ASM_ARM_XEN_PAGE_H */

From 9346c2a8defab777d1fba6bcc284f6ada181fe96 Mon Sep 17 00:00:00 2001
From: Jie Liu
Date: Wed, 13 Nov 2013 20:59:58 +0800
Subject: [PATCH 05/52] xen: simplify balloon_first_page() with
 list_first_entry_or_null()

Replace the code logic at balloon_first_page() by calling
list_first_entry_or_null() directly. Since there is only one user of
that routine, we can just remove it.

Signed-off-by: Jie Liu
Signed-off-by: Konrad Rzeszutek Wilk
Reviewed-by: David Vrabel
---
 drivers/xen/balloon.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 4c02e2b94103..37d06ea624aa 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -157,13 +157,6 @@ static struct page *balloon_retrieve(bool prefer_highmem)
 	return page;
 }
 
-static struct page *balloon_first_page(void)
-{
-	if (list_empty(&ballooned_pages))
-		return NULL;
-	return list_entry(ballooned_pages.next, struct page, lru);
-}
-
 static struct page *balloon_next_page(struct page *page)
 {
 	struct list_head *next = page->lru.next;
@@ -328,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
-	page = balloon_first_page();
+	page = list_first_entry_or_null(&ballooned_pages, struct page, lru);
 	for (i = 0; i < nr_pages; i++) {
 		if (!page) {
 			nr_pages = i;

From b7ef4a6dd35d1b47db72fbd1a31c8fd0da7a74f3 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Tue, 31 Dec 2013 20:46:27 +0100
Subject: [PATCH 06/52] xen/pci: Fix build on non-x86

We can't include <asm/pci_x86.h> if this isn't x86, and we only need
it if CONFIG_PCI_MMCONFIG is enabled.
Fixes: 8deb3eb1461e ('xen/mcfg: Call PHYSDEVOP_pci_mmcfg_reserved for MCFG areas.')
Signed-off-by: Ben Hutchings
Signed-off-by: Konrad Rzeszutek Wilk
Reviewed-by: David Vrabel
Acked-by: Ian Campbell
---
 drivers/xen/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index 188825122aae..dd9c249ea311 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -26,7 +26,9 @@
 #include
 #include
 #include "../pci/pci.h"
+#ifdef CONFIG_PCI_MMCONFIG
 #include <asm/pci_x86.h>
+#endif
 
 static bool __read_mostly pci_seg_supported = true;

From 872951850666689e931e567ebdc7c483135d14cf Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Tue, 12 Mar 2013 18:28:04 +0000
Subject: [PATCH 07/52] xen/events: refactor retrigger_dynirq() and
 resend_irq_on_evtchn()

These two functions did the same thing with different parameters, so
put the common bits in retrigger_evtchn().

This changes the return value of resend_irq_on_evtchn(), but the only
caller (in arch/ia64/xen/irq_xen.c) ignored the return value so this
is fine.

Signed-off-by: David Vrabel
Reviewed-by: Konrad Rzeszutek Wilk
Reviewed-by: Boris Ostrovsky
---
 drivers/xen/events.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 4035e833ea26..ddcdbb508dab 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -1558,13 +1558,13 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
 	return rebind_irq_to_cpu(data->irq, tcpu);
 }
 
-int resend_irq_on_evtchn(unsigned int irq)
+static int retrigger_evtchn(int evtchn)
 {
-	int masked, evtchn = evtchn_from_irq(irq);
+	int masked;
 	struct shared_info *s = HYPERVISOR_shared_info;
 
 	if (!VALID_EVTCHN(evtchn))
-		return 1;
+		return 0;
 
 	masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask));
 	sync_set_bit(evtchn, BM(s->evtchn_pending));
@@ -1574,6 +1574,11 @@ int resend_irq_on_evtchn(unsigned int irq)
 	return 1;
 }
 
+int resend_irq_on_evtchn(unsigned int irq)
+{
+	return retrigger_evtchn(evtchn_from_irq(irq));
+}
+
 static void enable_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
@@ -1608,21 +1613,7 @@ static void mask_ack_dynirq(struct irq_data *data)
 
 static int retrigger_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(data->irq);
-	struct shared_info *sh = HYPERVISOR_shared_info;
-	int ret = 0;
-
-	if (VALID_EVTCHN(evtchn)) {
-		int masked;
-
-		masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask));
-		sync_set_bit(evtchn, BM(sh->evtchn_pending));
-		if (!masked)
-			unmask_evtchn(evtchn);
-		ret = 1;
-	}
-
-	return ret;
+	return retrigger_evtchn(evtchn_from_irq(data->irq));
 }
 
 static void restore_pirqs(void)

From fc087e10734a4d3e40693fc099461ec1270b3fff Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Wed, 13 Mar 2013 13:20:52 +0000
Subject: [PATCH 08/52] xen/events: remove unnecessary
 init_evtchn_cpu_bindings()

Because the guest-side binding of an event to a VCPU (i.e., setting
the local per-cpu masks) is always explicitly done after an event
channel is bound to a port, there is no need to initialize all
possible events as bound to VCPU 0 at start of day or after a resume.
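The invariant this relies on is visible at the tail of every bind path;
a simplified sketch mirroring bind_virq_to_irq() (not new code;
example_finish_bind is a made-up name):

	/*
	 * Every successful bind already ends like this: the port is
	 * explicitly bound to a CPU, so a start-of-day pass binding
	 * everything to VCPU 0 is redundant.
	 */
	static void example_finish_bind(unsigned irq, int evtchn,
					unsigned cpu, unsigned virq)
	{
		xen_irq_info_virq_init(cpu, irq, evtchn, virq);
		bind_evtchn_to_cpu(evtchn, cpu); /* sets per-cpu masks */
	}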
Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ddcdbb508dab..1e2c74bcd0c8 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -334,24 +334,6 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) info_for_irq(irq)->cpu = cpu; } -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, NR_EVENT_CHANNELS/8); -} - static inline void clear_evtchn(int port) { struct shared_info *s = HYPERVISOR_shared_info; @@ -1778,8 +1760,6 @@ void xen_irq_resume(void) unsigned int cpu, evtchn; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) mask_evtchn(evtchn); @@ -1890,8 +1870,6 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) evtchn_to_irq[i] = -1; - init_evtchn_cpu_bindings(); - /* No event channels are 'live' right now. */ for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); From 3f70fa828249e3f37883be98f5b4d08e947f55b0 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 7 Mar 2013 15:50:27 +0000 Subject: [PATCH 09/52] xen/events: introduce test_and_set_mask() In preparation for adding event channel port ops, add test_and_set_mask(). Signed-off-by: Wei Liu Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 1e2c74bcd0c8..359e983d97e4 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -352,6 +352,12 @@ static inline int test_evtchn(int port) return sync_test_bit(port, BM(&s->evtchn_pending[0])); } +static inline int test_and_set_mask(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq @@ -1493,7 +1499,6 @@ void rebind_evtchn_irq(int evtchn, int irq) /* Rebind an evtchn so that it gets delivered to a specific cpu */ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { - struct shared_info *s = HYPERVISOR_shared_info; struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); int masked; @@ -1516,7 +1521,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) * Mask the event while changing the VCPU binding to prevent * it being delivered on an unexpected VCPU. 
*/ - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + masked = test_and_set_mask(evtchn); /* * If this fails, it usually just indicates that we're dealing with a @@ -1548,7 +1553,7 @@ static int retrigger_evtchn(int evtchn) if (!VALID_EVTCHN(evtchn)) return 0; - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + masked = test_and_set_mask(evtchn); sync_set_bit(evtchn, BM(s->evtchn_pending)); if (!masked) unmask_evtchn(evtchn); From 76ec8d64ce50acc8a159740b08a721b7259f9ae7 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 7 Mar 2013 15:50:28 +0000 Subject: [PATCH 10/52] xen/events: replace raw bit ops with functions In preparation for adding event channel port ops, use set_evtchn() instead of sync_set_bit(). Signed-off-by: Wei Liu Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 359e983d97e4..fec5da4ff3a0 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1548,13 +1548,12 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, static int retrigger_evtchn(int evtchn) { int masked; - struct shared_info *s = HYPERVISOR_shared_info; if (!VALID_EVTCHN(evtchn)) return 0; masked = test_and_set_mask(evtchn); - sync_set_bit(evtchn, BM(s->evtchn_pending)); + set_evtchn(evtchn); if (!masked) unmask_evtchn(evtchn); From d2ba3166f23baa53f5ee9c5c2ca43b42fb4e9e62 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 7 Aug 2013 14:32:12 +0100 Subject: [PATCH 11/52] xen/events: move drivers/xen/events.c into drivers/xen/events/ events.c will be split into multiple files so move it into its own directory. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/Makefile | 3 ++- drivers/xen/events/Makefile | 3 +++ drivers/xen/{events.c => events/events_base.c} | 0 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/events/Makefile rename drivers/xen/{events.c => events/events_base.c} (100%) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 14fe79d8634a..d75c811bfa56 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,8 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o events.o balloon.o manage.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 000000000000..f0bc6071fd84 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,3 @@ +obj-y += events.o + +events-y += events_base.o diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c similarity index 100% rename from drivers/xen/events.c rename to drivers/xen/events/events_base.c From 9a489f45a155fe96b9b55fbbef2b757ef7737cfc Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 13 Mar 2013 15:29:25 +0000 Subject: [PATCH 12/52] xen/events: move 2-level specific code into its own file In preparation for alternative event channel ABIs, move all the functions accessing the shared data structures into their own file. 
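For orientation, the 2-level ABI being moved searches a two-level
bitmap: a per-VCPU selector word says which words of the shared
evtchn_pending[] array may have work, and each of those words holds the
pending events themselves. A simplified sketch of that scan (assuming
plain unsigned long words and a hypothetical handle_port() helper; the
real code below must use find_first_bit() and friends because
xen_ulong_t can be wider than unsigned long, and it also filters by the
per-cpu mask):

	unsigned long l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
	while (l1) {
		unsigned w = __ffs(l1);		/* level 1: word index */
		l1 &= ~(1UL << w);
		unsigned long l2 = s->evtchn_pending[w] & ~s->evtchn_mask[w];
		while (l2) {
			unsigned b = __ffs(l2);	/* level 2: event bit */
			l2 &= ~(1UL << b);
			handle_port(w * BITS_PER_LONG + b);
		}
	}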
Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/Makefile | 1 + drivers/xen/events/events_2l.c | 348 ++++++++++++++++++++++++ drivers/xen/events/events_base.c | 379 ++------------------------- drivers/xen/events/events_internal.h | 74 ++++++ include/xen/events.h | 2 + 5 files changed, 442 insertions(+), 362 deletions(-) create mode 100644 drivers/xen/events/events_2l.c create mode 100644 drivers/xen/events/events_internal.h diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile index f0bc6071fd84..08179fe04612 100644 --- a/drivers/xen/events/Makefile +++ b/drivers/xen/events/Makefile @@ -1,3 +1,4 @@ obj-y += events.o events-y += events_base.o +events-y += events_2l.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 000000000000..a77e98d025fa --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,348 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +void xen_evtchn_port_bind_to_cpu(struct irq_info *info, int cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +int test_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +int test_and_set_mask(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. 
+ */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +void xen_evtchn_handle_events(int cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct irq_desc *desc; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. 
+ */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = evtchn_to_irq[port]; + + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + evtchn_to_irq[i], + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? 
"" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fec5da4ff3a0..8771b740e30f 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -59,6 +59,8 @@ #include #include +#include "events_internal.h" + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -73,72 +75,12 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - physical IRQ, GSI, flags, and owner domain - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int *evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif static bool (*pirq_needs_eoi)(unsigned irq); -/* - * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be - * careful to only use bitops which allow for this (e.g - * test_bit/find_first_bit and friends but not __ffs) and to pass - * BITS_PER_EVTCHN_WORD as the bitmask length. - */ -#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) -/* - * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t - * array. Primarily to avoid long lines (hence the terse name). - */ -#define BM(x) (unsigned long *)(x) -/* Find the first set bit in a evtchn mask */ -#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) - -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); - /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -149,7 +91,7 @@ static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } @@ -230,7 +172,7 @@ static void xen_irq_info_pirq_init(unsigned irq, /* * Accessors for packed IRQ information. 
*/ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -244,6 +186,11 @@ unsigned irq_from_evtchn(unsigned int evtchn) } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -279,12 +226,12 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { int irq = evtchn_to_irq[evtchn]; unsigned ret = 0; @@ -310,55 +257,21 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline xen_ulong_t active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { int irq = evtchn_to_irq[chn]; + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif - clear_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)))); - set_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu))); + xen_evtchn_port_bind_to_cpu(info, cpu); - info_for_irq(irq)->cpu = cpu; + info->cpu = cpu; } -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); -} - -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); -} - -static inline int test_and_set_mask(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); -} - - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -376,63 +289,6 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_mask[0])); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else { - /* - * Need to clear the mask before checking pending to - * avoid a race with an event becoming pending. - * - * EVTCHNOP_unmask will only trigger an upcall if the - * mask bit was set, so if a hypercall is needed - * remask the event. 
- */ - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - - if (unlikely(evtchn_pending && xen_hvm_domain())) { - sync_set_bit(port, BM(&s->evtchn_mask[0])); - do_hypercall = 1; - } - } - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, - BM(&vcpu_info->evtchn_pending_sel))) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; @@ -1216,222 +1072,21 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); } -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - xen_ulong_t pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } - - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, BM(sh->evtchn_pending))) { - int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", - !sync_test_bit(i, BM(sh->evtchn_mask)) - ? 
"" : " globally-masked", - sync_test_bit(i, BM(cpu_evtchn)) - ? "" : " locally-masked"); - } - } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; -} - static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i, irq; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - xen_ulong_t pending_words; - xen_ulong_t pending_bits; - struct irq_desc *desc; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; - /* - * Master flag must be cleared /before/ clearing - * selector flag. xchg_xen_ulong must contain an - * appropriate barrier. - */ - if ((irq = per_cpu(virq_to_irq, cpu)[VIRQ_TIMER]) != -1) { - int evtchn = evtchn_from_irq(irq); - word_idx = evtchn / BITS_PER_LONG; - pending_bits = evtchn % BITS_PER_LONG; - if (active_evtchns(cpu, s, word_idx) & (1ULL << pending_bits)) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - } - - pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - xen_ulong_t words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = EVTCHN_FIRST_BIT(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - /* - * We scan the starting word in two parts. - * - * 1st time: start in the middle, scanning the - * upper bits. - * - * 2nd time: scan the whole word (not just the - * parts skipped in the first pass) -- if an - * event in the previously scanned bits is - * pending again it would just be scanned on - * the next loop anyway. - */ - if (word_idx == start_word_idx) { - if (i == 0) - bit_idx = start_bit_idx; - } - - do { - xen_ulong_t bits; - int port; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = EVTCHN_FIRST_BIT(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_EVTCHN_WORD); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. 
*/ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 000000000000..79ac70bbbd26 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,74 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned short evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) + +extern int *evtchn_to_irq; + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +void xen_evtchn_port_bind_to_cpu(struct irq_info *info, int cpu); + +void clear_evtchn(int port); +void set_evtchn(int port); +int test_evtchn(int port); +int test_and_set_mask(int port); +void mask_evtchn(int port); +void unmask_evtchn(int port); + +void xen_evtchn_handle_events(int cpu); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/include/xen/events.h b/include/xen/events.h index c9ea10ee2273..32ae0f263749 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -73,6 +73,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout); /* Determine the IRQ which is bound to an event channel */ unsigned irq_from_evtchn(unsigned int evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq); +unsigned int evtchn_from_irq(unsigned irq); /* Xen HVM evtchn vector callback */ void xen_hvm_callback_vector(void); From ab9a1cca3d172876ae9d5edb63abce7986045597 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 14 Mar 2013 12:49:19 +0000 Subject: [PATCH 13/52] xen/events: add struct evtchn_ops for the low-level port operations evtchn_ops contains the low-level operations that access the shared data structures. This allows alternate ABIs to be supported. 
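A hedged sketch of how an alternate ABI would plug in (illustrative
names only; this patch itself only converts the existing 2-level code,
which installs its own table from xen_evtchn_2l_init()):

	/* An alternate ABI supplies its own low-level port operations. */
	static const struct evtchn_ops evtchn_ops_example = {
		.bind_to_cpu       = example_bind_to_cpu,
		.clear_pending     = example_clear_pending,
		.set_pending       = example_set_pending,
		.is_pending        = example_is_pending,
		.test_and_set_mask = example_test_and_set_mask,
		.mask              = example_mask,
		.unmask            = example_unmask,
		.handle_events     = example_handle_events,
	};

	void __init xen_evtchn_example_init(void)
	{
		evtchn_ops = &evtchn_ops_example;
	}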
Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_2l.c | 33 +++++++++++---- drivers/xen/events/events_base.c | 4 ++ drivers/xen/events/events_internal.h | 63 ++++++++++++++++++++++++---- 3 files changed, 84 insertions(+), 16 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index a77e98d025fa..e55677cca745 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -41,43 +41,43 @@ static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], cpu_evtchn_mask); -void xen_evtchn_port_bind_to_cpu(struct irq_info *info, int cpu) +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) { clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); } -void clear_evtchn(int port) +static void evtchn_2l_clear_pending(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; sync_clear_bit(port, BM(&s->evtchn_pending[0])); } -void set_evtchn(int port) +static void evtchn_2l_set_pending(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; sync_set_bit(port, BM(&s->evtchn_pending[0])); } -int test_evtchn(int port) +static bool evtchn_2l_is_pending(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; return sync_test_bit(port, BM(&s->evtchn_pending[0])); } -int test_and_set_mask(int port) +static bool evtchn_2l_test_and_set_mask(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); } -void mask_evtchn(int port) +static void evtchn_2l_mask(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; sync_set_bit(port, BM(&s->evtchn_mask[0])); } -void unmask_evtchn(int port) +static void evtchn_2l_unmask(unsigned port) { struct shared_info *s = HYPERVISOR_shared_info; unsigned int cpu = get_cpu(); @@ -153,7 +153,7 @@ static inline xen_ulong_t active_evtchns(unsigned int cpu, * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -void xen_evtchn_handle_events(int cpu) +static void evtchn_2l_handle_events(unsigned cpu) { int irq; xen_ulong_t pending_words; @@ -346,3 +346,20 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } + +static const struct evtchn_ops evtchn_ops_2l = { + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 8771b740e30f..7c7b744cd13d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -61,6 +61,8 @@ #include "events_internal.h" +const struct evtchn_ops *evtchn_ops; + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. 
@@ -1523,6 +1525,8 @@ void __init xen_init_IRQ(void) { int i; + xen_evtchn_2l_init(); + evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); BUG_ON(!evtchn_to_irq); diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 79ac70bbbd26..ba8142f0c635 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -54,21 +54,68 @@ struct irq_info { #define PIRQ_NEEDS_EOI (1 << 0) #define PIRQ_SHAREABLE (1 << 1) +struct evtchn_ops { + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); +}; + +extern const struct evtchn_ops *evtchn_ops; + extern int *evtchn_to_irq; struct irq_info *info_for_irq(unsigned irq); unsigned cpu_from_irq(unsigned irq); unsigned cpu_from_evtchn(unsigned int evtchn); -void xen_evtchn_port_bind_to_cpu(struct irq_info *info, int cpu); +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} -void clear_evtchn(int port); -void set_evtchn(int port); -int test_evtchn(int port); -int test_and_set_mask(int port); -void mask_evtchn(int port); -void unmask_evtchn(int port); +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} -void xen_evtchn_handle_events(int cpu); +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +void xen_evtchn_2l_init(void); #endif /* #ifndef __EVENTS_INTERNAL_H__ */ From 96d4c5881806ebb993a3d84991af9c96fa9cd576 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 18 Mar 2013 15:50:17 +0000 Subject: [PATCH 14/52] xen/events: allow setup of irq_info to fail The FIFO-based event ABI requires additional setup of newly bound events (it may need to expand the event array) and this setup may fail. xen_irq_info_common_init() is a useful place to put this setup so allow this call to fail. This call and the other similar calls are renamed to be *_setup() to reflect that they may now fail. This failure can only occur with new event channels not on rebind. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 156 ++++++++++++++++++------------- 1 file changed, 91 insertions(+), 65 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 7c7b744cd13d..4f7d94abe82c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -99,7 +99,7 @@ struct irq_info *info_for_irq(unsigned irq) } /* Constructors for packed IRQ information. 
*/ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, unsigned short evtchn, @@ -116,45 +116,47 @@ static void xen_irq_info_common_init(struct irq_info *info, evtchn_to_irq[evtchn] = irq; irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return 0; } -static void xen_irq_info_evtchn_init(unsigned irq, +static int xen_irq_info_evtchn_setup(unsigned irq, unsigned short evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, unsigned short evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, unsigned short evtchn, unsigned short virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, +static int xen_irq_info_pirq_setup(unsigned irq, unsigned short evtchn, unsigned short pirq, unsigned short gsi, @@ -163,12 +165,12 @@ static void xen_irq_info_pirq_init(unsigned irq, { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); } /* @@ -521,6 +523,47 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) + [ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + /* * Do not make any assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq argument. @@ -536,6 +579,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -563,8 +607,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? 
PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -624,7 +673,9 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, name); - xen_irq_info_pirq_init(irq, 0, pirq, 0, domid, 0); + ret = xen_irq_info_pirq_setup(irq, 0, pirq, 0, domid, 0); + if (ret < 0) + goto error_irq; ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -632,8 +683,8 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif @@ -703,9 +754,11 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -719,7 +772,12 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); @@ -736,6 +794,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -755,8 +814,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { struct irq_info *info = info_for_irq(irq); @@ -835,7 +898,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { @@ -851,50 +919,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. 
*/ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1142,7 +1168,7 @@ void rebind_evtchn_irq(int evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1317,7 +1343,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1341,7 +1367,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } From 083858758f67bb20ef6be5bc8442be91cca8ee2d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 18 Mar 2013 16:54:57 +0000 Subject: [PATCH 15/52] xen/events: add an evtchn_op for port setup Add a hook for port-specific setup and call it from xen_irq_info_common_setup(). The FIFO-based ABI may need to perform additional setup (expanding the event array) before a bound event channel can start to receive events. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 2 +- drivers/xen/events/events_internal.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 4f7d94abe82c..929eccb77270 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -117,7 +117,7 @@ static int xen_irq_info_common_setup(struct irq_info *info, irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); - return 0; + return xen_evtchn_port_setup(info); } static int xen_irq_info_evtchn_setup(unsigned irq, diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index ba8142f0c635..dc9650265e04 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -55,6 +55,7 @@ struct irq_info { #define PIRQ_SHAREABLE (1 << 1) struct evtchn_ops { + int (*setup)(struct irq_info *info); void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); void (*clear_pending)(unsigned port); @@ -75,6 +76,17 @@ struct irq_info *info_for_irq(unsigned irq); unsigned cpu_from_irq(unsigned irq); unsigned cpu_from_evtchn(unsigned int evtchn); +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, unsigned cpu) { From d0b075ffeede257342c3afdbeadd2fda8504ecee Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 17 Oct 2013 15:23:15 +0100 Subject: [PATCH 16/52] xen/events: Refactor evtchn_to_irq array to be dynamically allocated Refactor the static evtchn_to_irq array to be dynamically allocated, by implementing get and set functions for accesses to the array. Two new port ops are added: max_channels (maximum supported number of event channels) and nr_channels (number of currently usable event channels).
For the 2-level ABI, these numbers are both the same as the shared data structure is a fixed size. For the FIFO ABI, these will be different as the event array is expanded dynamically. This allows more than 65000 event channels so an unsigned short is no longer sufficient for an event channel port number and unsigned int is used instead. Signed-off-by: Malcolm Crossley Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_2l.c | 11 +- drivers/xen/events/events_base.c | 175 +++++++++++++++++++-------- drivers/xen/events/events_internal.h | 18 ++- 3 files changed, 149 insertions(+), 55 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index e55677cca745..ecb402a149e3 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -41,6 +41,11 @@ static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], cpu_evtchn_mask); +static unsigned evtchn_2l_max_channels(void) +{ + return NR_EVENT_CHANNELS; +} + static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) { clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); @@ -238,7 +243,7 @@ static void evtchn_2l_handle_events(unsigned cpu) /* Process port. */ port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; + irq = get_evtchn_to_irq(port); if (irq != -1) { desc = irq_to_desc(irq); @@ -332,7 +337,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) int word_idx = i / BITS_PER_EVTCHN_WORD; printk(" %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, - evtchn_to_irq[i], + get_evtchn_to_irq(i), sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) ? "" : " l2-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) @@ -348,6 +353,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) } static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, .bind_to_cpu = evtchn_2l_bind_to_cpu, .clear_pending = evtchn_2l_clear_pending, .set_pending = evtchn_2l_set_pending, diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 929eccb77270..a6906665de53 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -77,12 +77,16 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -int *evtchn_to_irq; +int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif static bool (*pirq_needs_eoi)(unsigned irq); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) + /* Xen will never allocate port zero for any purpose. 
*/ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -92,6 +96,61 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ struct irq_info *info_for_irq(unsigned irq) { @@ -102,9 +161,10 @@ struct irq_info *info_for_irq(unsigned irq) static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -113,7 +173,9 @@ static int xen_irq_info_common_setup(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); @@ -121,7 +183,7 @@ static int xen_irq_info_common_setup(struct irq_info *info, } static int xen_irq_info_evtchn_setup(unsigned irq, - unsigned short evtchn) + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); @@ -130,7 +192,7 @@ static int xen_irq_info_evtchn_setup(unsigned irq, static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); @@ -144,8 +206,8 @@ static int xen_irq_info_ipi_setup(unsigned cpu, static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); @@ -157,9 +219,9 @@ static int xen_irq_info_virq_setup(unsigned cpu, } static int xen_irq_info_pirq_setup(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, + unsigned evtchn, + unsigned pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { @@ -173,6 +235,12 @@ static int xen_irq_info_pirq_setup(unsigned irq, return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); } +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; +} + /* * Accessors for packed IRQ information. 
*/ @@ -186,7 +254,7 @@ unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); @@ -237,7 +305,7 @@ unsigned cpu_from_irq(unsigned irq) unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -263,7 +331,7 @@ static bool pirq_needs_eoi_flag(unsigned irq) static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); @@ -386,6 +454,18 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(port, 0); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -458,7 +538,13 @@ static unsigned int __startup_pirq(unsigned int irq) pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -476,10 +562,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -487,14 +572,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -525,7 +604,6 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi); static void __unbind_from_irq(unsigned int irq) { - struct evtchn_close close; int evtchn = evtchn_from_irq(irq); struct irq_info *info = irq_get_handler_data(irq); @@ -536,27 +614,22 @@ static void __unbind_from_irq(unsigned int irq) } if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); switch (type_from_irq(irq)) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; break; default: break; } - /* Closed ports are implicitly re-bound to VCPU0. 
*/ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; + xen_irq_info_cleanup(info); } BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); @@ -760,9 +833,12 @@ int bind_evtchn_to_irq(unsigned int evtchn) int irq; int ret; + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; + mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); @@ -852,7 +928,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -1022,7 +1098,7 @@ EXPORT_SYMBOL_GPL(unbind_from_irqhandler); int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1047,12 +1123,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1076,7 +1152,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1163,7 +1239,7 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); @@ -1448,15 +1524,14 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1553,14 +1628,12 @@ void __init xen_init_IRQ(void) xen_evtchn_2l_init(); - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; /* No event channels are 'live' right now. 
*/ - for (i = 0; i < NR_EVENT_CHANNELS; i++) + for (i = 0; i < xen_evtchn_nr_channels(); i++) mask_evtchn(i); pirq_needs_eoi = pirq_needs_eoi_flag; diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index dc9650265e04..a3d9aeceda1a 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -35,7 +35,7 @@ struct irq_info { int refcnt; enum xen_irq_type type; /* type */ unsigned irq; - unsigned short evtchn; /* event channel */ + unsigned int evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ union { @@ -55,6 +55,9 @@ struct irq_info { #define PIRQ_SHAREABLE (1 << 1) struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + int (*setup)(struct irq_info *info); void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); @@ -70,12 +73,23 @@ struct evtchn_ops { extern const struct evtchn_ops *evtchn_ops; -extern int *evtchn_to_irq; +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); struct irq_info *info_for_irq(unsigned irq); unsigned cpu_from_irq(unsigned irq); unsigned cpu_from_evtchn(unsigned int evtchn); +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +static inline unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} + /* * Do any ABI specific setup for a bound event channel before it can * be unmasked and used. From fd21069dfe31a4b20f5ef580006abe72d1660f5b Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 5 Sep 2013 18:11:38 +0100 Subject: [PATCH 17/52] xen/events: add xen_evtchn_mask_all() Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a6906665de53..c6d64f1e191c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -344,6 +344,14 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) info->cpu = cpu; } +static void xen_evtchn_mask_all(void) +{ + unsigned int evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -1520,12 +1528,11 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) @@ -1624,8 +1631,6 @@ void xen_callback_vector(void) {} void __init xen_init_IRQ(void) { - int i; - xen_evtchn_2l_init(); evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), @@ -1633,8 +1638,7 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. 
*/ - for (i = 0; i < xen_evtchn_nr_channels(); i++) - mask_evtchn(i); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; From 0dc0064add422bc0ef5165ebe9ece3052bbd457d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 23 Sep 2013 21:03:38 +0100 Subject: [PATCH 18/52] xen/evtchn: support more than 4096 ports Remove the check during unbind for NR_EVENT_CHANNELS as this limits support to less than 4096 ports. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 13 +++++++++++++ drivers/xen/events/events_internal.h | 5 ----- drivers/xen/evtchn.c | 2 +- include/xen/events.h | 2 ++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c6d64f1e191c..9d0d88cf74af 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -952,6 +952,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index a3d9aeceda1a..2862e1cccf1c 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -85,11 +85,6 @@ static inline unsigned xen_evtchn_max_channels(void) return evtchn_ops->max_channels(); } -static inline unsigned xen_evtchn_nr_channels(void) -{ - return evtchn_ops->nr_channels(); -} - /* * Do any ABI specific setup for a bound event channel before it can * be unmasked and used. diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 5de2063e16d3..00f40f051d95 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -417,7 +417,7 @@ static long evtchn_ioctl(struct file *file, break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; rc = -ENOTCONN; diff --git a/include/xen/events.h b/include/xen/events.h index 32ae0f263749..55b42cc997f6 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -7,6 +7,8 @@ #include #include +unsigned xen_evtchn_nr_channels(void); + int bind_evtchn_to_irq(unsigned int evtchn); int bind_evtchn_to_irqhandler(unsigned int evtchn, irq_handler_t handler, From bf2bbe07f13846a90d4447521d87566d6f87bc0e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 15 Mar 2013 10:55:41 +0000 Subject: [PATCH 19/52] xen/events: Add the hypervisor interface for the FIFO-based event channels Add the hypercall sub-ops and the structures for the shared data used in the FIFO-based event channel ABI. The design document for this new ABI is available here: http://xenbits.xen.org/people/dvrabel/event-channels-H.pdf In summary, events are reported using a per-domain shared event array of event words. Each event word has PENDING, LINKED and MASKED bits and a LINK field for pointing to the next event in the event queue. There are 16 event queues (with different priorities) per-VCPU. Key advantages of this new ABI include: - Support for over 100,000 events (2^17). - 16 different event priorities. 
- Improved fairness in event latency through the use of FIFOs. The ABI is available in Xen 4.4 and later. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_2l.c | 8 ++-- include/xen/interface/event_channel.h | 68 +++++++++++++++++++++++++++ include/xen/interface/xen.h | 6 --- 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index ecb402a149e3..d7ff91757307 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -38,12 +38,12 @@ /* Find the first set bit in a evtchn mask */ #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], cpu_evtchn_mask); static unsigned evtchn_2l_max_channels(void) { - return NR_EVENT_CHANNELS; + return EVTCHN_2L_NR_CHANNELS; } static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) @@ -316,7 +316,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) i % 8 == 0 ? "\n " : " "); printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), cpu_evtchn[i], i % 8 == 0 ? "\n " : " "); @@ -332,7 +332,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) } printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; printk(" %d: event %d -> irq %d%s%s%s\n", diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h index f4942921e202..7e6acef5415b 100644 --- a/include/xen/interface/event_channel.h +++ b/include/xen/interface/event_channel.h @@ -190,6 +190,39 @@ struct evtchn_reset { }; typedef struct evtchn_reset evtchn_reset_t; +/* + * EVTCHNOP_init_control: initialize the control block for the FIFO ABI. + */ +#define EVTCHNOP_init_control 11 +struct evtchn_init_control { + /* IN parameters. */ + uint64_t control_gfn; + uint32_t offset; + uint32_t vcpu; + /* OUT parameters. */ + uint8_t link_bits; + uint8_t _pad[7]; +}; + +/* + * EVTCHNOP_expand_array: add an additional page to the event array. + */ +#define EVTCHNOP_expand_array 12 +struct evtchn_expand_array { + /* IN parameters. */ + uint64_t array_gfn; +}; + +/* + * EVTCHNOP_set_priority: set the priority for an event channel. + */ +#define EVTCHNOP_set_priority 13 +struct evtchn_set_priority { + /* IN parameters. */ + uint32_t port; + uint32_t priority; +}; + struct evtchn_op { uint32_t cmd; /* EVTCHNOP_* */ union { @@ -207,4 +240,39 @@ struct evtchn_op { }; DEFINE_GUEST_HANDLE_STRUCT(evtchn_op); +/* + * 2-level ABI + */ + +#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) + +/* + * FIFO ABI + */ + +/* Events may have priorities from 0 (highest) to 15 (lowest). 
*/ +#define EVTCHN_FIFO_PRIORITY_MAX 0 +#define EVTCHN_FIFO_PRIORITY_DEFAULT 7 +#define EVTCHN_FIFO_PRIORITY_MIN 15 + +#define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1) + +typedef uint32_t event_word_t; + +#define EVTCHN_FIFO_PENDING 31 +#define EVTCHN_FIFO_MASKED 30 +#define EVTCHN_FIFO_LINKED 29 +#define EVTCHN_FIFO_BUSY 28 + +#define EVTCHN_FIFO_LINK_BITS 17 +#define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1) + +#define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS) + +struct evtchn_fifo_control_block { + uint32_t ready; + uint32_t _rsvd; + event_word_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 53ec4167bd0b..0cd5ca333fac 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -281,12 +281,6 @@ struct multicall_entry { }; DEFINE_GUEST_HANDLE_STRUCT(multicall_entry); -/* - * Event channel endpoints per domain: - * 1024 if a long is 32 bits; 4096 if a long is 64 bits. - */ -#define NR_EVENT_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) - struct vcpu_time_info { /* * Updates to the following values are preceded and followed From 6ccecb0fbc0494c7221459e6358a016f3281a0ca Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 23 Sep 2013 12:47:26 +0100 Subject: [PATCH 20/52] xen/events: allow event channel priority to be set Add xen_set_irq_priority() to set an event channel's priority. This function will only work with event channel ABIs that support priority (i.e., the FIFO-based ABI). Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 17 +++++++++++++++++ include/xen/events.h | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 9d0d88cf74af..e9001fef4ffd 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1117,6 +1117,23 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN.
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { int irq = get_evtchn_to_irq(evtchn); diff --git a/include/xen/events.h b/include/xen/events.h index 55b42cc997f6..c9c85cf84895 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -39,6 +39,11 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, */ void unbind_from_irqhandler(unsigned int irq, void *dev_id); +#define XEN_IRQ_PRIORITY_MAX EVTCHN_FIFO_PRIORITY_MAX +#define XEN_IRQ_PRIORITY_DEFAULT EVTCHN_FIFO_PRIORITY_DEFAULT +#define XEN_IRQ_PRIORITY_MIN EVTCHN_FIFO_PRIORITY_MIN +int xen_set_irq_priority(unsigned irq, unsigned priority); + /* * Allow extra references to event channels exposed to userspace by evtchn */ From 8785c67663b6ba023c7c6d61d37d8e08c00d86a8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 23 Sep 2013 12:52:21 +0100 Subject: [PATCH 21/52] xen/x86: set VIRQ_TIMER priority to maximum Commit bee980d9e (xen/events: Handle VIRQ_TIMER before any other hardirq in event loop) effectively made the VIRQ_TIMER the highest priority event when using the 2-level ABI. Set the VIRQ_TIMER priority to the highest so this behaviour is retained when using the FIFO-based ABI. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- arch/x86/xen/time.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 12a1ca707b94..7b78f88c1707 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -446,6 +446,7 @@ void xen_setup_timer(int cpu) IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| IRQF_FORCE_RESUME, name, NULL); + (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); memcpy(evt, xen_clockevent, sizeof(*evt)); From 1fe565517b57676884349dccfd6ce853ec338636 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 15 Mar 2013 13:02:35 +0000 Subject: [PATCH 22/52] xen/events: use the FIFO-based ABI if available Implement all the event channel port ops for the FIFO-based ABI. If the hypervisor supports the FIFO-based ABI, enable it by initializing the control block for the boot VCPU and subsequent VCPUs as they are brought up and on resume. The event array is expanded as required when event ports are setup. The 'xen.fifo_events=0' command line option may be used to disable use of the FIFO-based ABI. 
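[Reader aid, not part of the patch: every FIFO port op below operates on a 32-bit event word whose layout was added in patch 19. A minimal standalone sketch of that layout, using only the constants defined there (word_is_pending and word_link are hypothetical helper names, for illustration only):

	#include <stdbool.h>
	#include <stdint.h>

	typedef uint32_t event_word_t;

	#define EVTCHN_FIFO_PENDING	31	/* event is pending */
	#define EVTCHN_FIFO_MASKED	30	/* event is masked */
	#define EVTCHN_FIFO_LINKED	29	/* word is on a queue */
	#define EVTCHN_FIFO_BUSY	28	/* spin while set when clearing MASKED */
	#define EVTCHN_FIFO_LINK_BITS	17
	#define EVTCHN_FIFO_LINK_MASK	((1 << EVTCHN_FIFO_LINK_BITS) - 1)

	static bool word_is_pending(event_word_t w)
	{
		return w & (1u << EVTCHN_FIFO_PENDING);
	}

	static uint32_t word_link(event_word_t w)
	{
		/* Port of the next event in the queue; 0 means end of queue. */
		return w & EVTCHN_FIFO_LINK_MASK;
	}

The real helpers in events_fifo.c below use sync_test_bit()/sync_cmpxchg() on these same bits, since the words are shared with the hypervisor.]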
Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- drivers/xen/events/Makefile | 1 + drivers/xen/events/events_base.c | 14 +- drivers/xen/events/events_fifo.c | 426 +++++++++++++++++++++++++++ drivers/xen/events/events_internal.h | 8 + 4 files changed, 448 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/events/events_fifo.c diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile index 08179fe04612..62be55cd981d 100644 --- a/drivers/xen/events/Makefile +++ b/drivers/xen/events/Makefile @@ -2,3 +2,4 @@ obj-y += events.o events-y += events_base.o events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index e9001fef4ffd..1d16185e82b2 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1563,6 +1563,7 @@ void xen_irq_resume(void) /* New event-channel space is not 'live' yet. */ xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) @@ -1659,9 +1660,20 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - xen_evtchn_2l_init(); + int ret = -EINVAL; + + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), sizeof(*evtchn_to_irq), GFP_KERNEL); diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 000000000000..e2bf9571f7fe --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,426 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +#define BM(w) ((unsigned long *)(w)) + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret = -ENOMEM; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) + goto error; + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. 
*/ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_MASKED, BM(word)); +} + +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + struct irq_desc *desc; + + irq = get_evtchn_to_irq(port); + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, uint32_t *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. 
+ */ + if (head == 0) + clear_bit(priority, BM(ready)); + + if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word)) + && !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word))) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + uint32_t ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. + */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int __cpuinit evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int __cpuinit evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier __cpuinitdata = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 2862e1cccf1c..677f41a0fff9 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -69,6 +69,7 @@ struct evtchn_ops { void (*unmask)(unsigned port); void (*handle_events)(unsigned cpu); + void (*resume)(void); }; extern const struct evtchn_ops *evtchn_ops; @@ -137,6 +138,13 @@ static inline void xen_evtchn_handle_events(unsigned cpu) { return evtchn_ops->handle_events(cpu); } +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); #endif /* #ifndef __EVENTS_INTERNAL_H__ */ From fc590efe667338f7da250cf2d2eb6fdd486e1b97 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:09:28 -0500 Subject: [PATCH 23/52] xen/p2m: Check for auto-xlat when doing mfn_to_local_pfn. Most of the functions in page.h are prefaced with if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; Except mfn_to_local_pfn. At first sight, the function should work without this patch - as 'mfn_to_pfn' has a similar check. But there is no such check in the 'get_phys_to_machine' function - so we would crash in there. This fixes it by following the convention of having the check for auto-xlat in these static functions. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/include/asm/xen/page.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index b913915e8e63..4a092ccdd147 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -167,7 +167,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) */ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { - unsigned long pfn = mfn_to_pfn(mfn); + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + pfn = mfn_to_pfn(mfn); if (get_phys_to_machine(pfn) != mfn) return -1; /* force !pfn_valid() */ return pfn; From ddc416cbc4e34f52bebca027de1c099bd30795f8 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:39:56 -0500 Subject: [PATCH 24/52] xen/pvh/x86: Define what a PVH guest is (v3). Which is a PV guest with auto page translation enabled and with vector callback. It is a cross between PVHVM and PV. The Xen side defines PVH as (from docs/misc/pvh-readme.txt, with modifications): "* the guest uses auto translate: - p2m is managed by Xen - pagetables are owned by the guest - mmu_update hypercall not available * it uses event callback and not vlapic emulation, * IDT is native, so set_trap_table hcall is also N/A for a PVH guest. For a full list of hcalls supported for PVH, see pvh_hypercall64_table in arch/x86/hvm/hvm.c in xen. From the ABI perspective, it's mostly a PV guest with auto translate, although it does use hvm_op for setting callback vector."
Also we use the PV cpuid, albeit we can use the HVM (native) cpuid. However, we do have a fair bit of filtering in the xen_cpuid and we can piggyback on that until the hypervisor/toolstack filters the appropriate cpuids. Once that is done we can swap over to use the native one. We set up a Kconfig entry that is disabled by default and cannot be enabled. Note that on ARM the concept of PVH is non-existent. As Ian put it: "an ARM guest is neither PV nor HVM nor PVHVM. It's a bit like PVH but is different also (it's further towards the H end of the spectrum than even PVH).". As such these options (PVHVM, PVH) are never enabled nor seen on ARM compilations. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 5 +++++ include/xen/xen.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 1a3c76505649..e7d05900efac 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -51,3 +51,8 @@ config XEN_DEBUG_FS Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. +config XEN_PVH + bool "Support for running as a PVH guest" + depends on X86_64 && XEN && BROKEN + select XEN_PVHVM + def_bool n diff --git a/include/xen/xen.h b/include/xen/xen.h index a74d4362c4f8..0c0e3ef4c45d 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -29,4 +29,18 @@ extern enum xen_domain_type xen_domain_type; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ +#ifdef CONFIG_XEN_PVH +/* This functionality exists only for x86. The XEN_PVHVM support exists + * only in x86 world - hence on ARM it will be always disabled. + * N.B. ARM guests are neither PV nor HVM nor PVHVM. + * It's a bit like PVH but is different also (it's further towards the H + * end of the spectrum than even PVH). + */ +#include +#define xen_pvh_domain() (xen_pv_domain() && \ + xen_feature(XENFEAT_auto_translated_physmap) && \ + xen_have_vector_callback) +#else +#define xen_pvh_domain() (0) +#endif #endif /* _XEN_XEN_H */ From d285d68314af49c4456b71d248e355dd33ae375c Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:45:31 -0500 Subject: [PATCH 25/52] xen/pvh: Early bootup changes in PV code (v4). We don't use the filtering that 'xen_cpuid' is doing because the hypervisor treats 'XEN_EMULATE_PREFIX' as an invalid instruction. This means that all of the filtering will have to be done in the hypervisor/toolstack. Without the filtering we expose to the guest the: - cpu topology (sockets, cores, etc); - the APERF (which the generic scheduler likes to use), see 5e626254206a709c6e937f3dda69bf26c7344f6f "xen/setup: filter APERFMPERF cpuid feature out" - and the inability to figure out whether MWAIT_LEAF should be exposed or not. See df88b2d96e36d9a9e325bfcd12eb45671cbbc937 "xen/enlighten: Disable MWAIT_LEAF so that acpi-pad won't be loaded." - x2apic, see 4ea9b9aca90cfc71e6872ed3522356755162932c "xen: mask x2APIC feature in PV" We also check for vector callback early on, as it is a required feature. PVH also runs at default kernel IOPL. Finally, pure PV settings are moved to a separate function that is only called for pure PV, i.e., PV with pvmmu. They are also #ifdef with CONFIG_XEN_PVMMU.
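[Reader aid, not part of the patch: the PVH-specific paths below all key off the xen_pvh_domain() predicate introduced in the previous patch. Expanded, the macro amounts to the following sketch (is_pvh_guest is a hypothetical name for illustration; the real test is the macro in include/xen/xen.h above):

	/* Sketch of what xen_pvh_domain() evaluates to on x86. */
	static bool is_pvh_guest(void)
	{
		return xen_pv_domain() &&				/* booted via the PV entry path... */
			xen_feature(XENFEAT_auto_translated_physmap) &&	/* ...with a Xen-managed p2m... */
			xen_have_vector_callback;			/* ...and the HVM callback vector. */
	}

]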
Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 48 ++++++++++++++++++++++++++++------------ arch/x86/xen/setup.c | 18 ++++++++++----- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fa6ade76ef3f..eb0efc2f9d3c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -262,8 +263,9 @@ static void __init xen_banner(void) struct xen_extraversion extra; HYPERVISOR_xen_version(XENVER_extraversion, &extra); - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - pv_info.name); + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); @@ -433,7 +435,7 @@ static void __init xen_init_cpuid_mask(void) ax = 1; cx = 0; - xen_cpuid(&ax, &bx, &cx, &dx); + cpuid(1, &ax, &bx, &cx, &dx); xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) | @@ -1420,6 +1422,19 @@ static void __init xen_setup_stackprotector(void) pv_cpu_ops.load_gdt = xen_load_gdt; } +static void __init xen_pvh_early_guest_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + +#ifdef CONFIG_X86_32 + BUG(); /* PVH: Implement proper support. */ +#endif +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1431,13 +1446,16 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; + xen_setup_features(); + xen_pvh_early_guest_init(); xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; + if (!xen_pvh_domain()) + pv_cpu_ops = xen_cpu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -1469,8 +1487,6 @@ asmlinkage void __init xen_start_kernel(void) /* Work out if we support NX */ x86_configure_nx(); - xen_setup_features(); - /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); @@ -1548,14 +1564,18 @@ asmlinkage void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* We used to do this in xen_arch_setup, but that is too late on AMD - * were early_cpu_init (run before ->arch_setup()) calls early_amd_init - * which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); + /* PVH: runs at default kernel iopl of 0 */ + if (!xen_pvh_domain()) { + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. 
+ */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + } #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 68c054f59de6..2137c5101dac 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -563,16 +563,13 @@ void xen_enable_nmi(void) BUG(); #endif } -void __init xen_arch_setup(void) +void __init xen_pvmmu_arch_setup(void) { - xen_panic_handler_init(); - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) @@ -581,6 +578,15 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); xen_enable_nmi(); +} + +/* This function is not called for HVM domains */ +void __init xen_arch_setup(void) +{ + xen_panic_handler_init(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_pvmmu_arch_setup(); + #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); From 696fd7c5b2ecb31b339019ced4fe15a3f9e7419a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Sun, 15 Dec 2013 12:37:46 -0500 Subject: [PATCH 26/52] xen/pvh: Don't setup P2M tree. P2M is not available for PVH. Fortunately for us the P2M code already has most of the support for auto-xlat guests, thanks to commit 3d24bbd7dddbea54358a9795abaf051b0f18973c "grant-table: call set_phys_to_machine after mapping grant refs" which: " introduces set_phys_to_machine calls for auto_translated guests (even on x86) in gnttab_map_refs and gnttab_unmap_refs. translated by swiotlb-xen... " so we don't need to muck much. With the above-mentioned commit "you'll get set_phys_to_machine calls from gnttab_map_refs and gnttab_unmap_refs but PVH guests won't do anything with them" (Stefano Stabellini), which is OK - we want them to be NOPs. This is because we assume that an "IOMMU is always present on the platform and Xen is going to make the appropriate IOMMU pagetable changes in the hypercall implementation of GNTTABOP_map_grant_ref and GNTTABOP_unmap_grant_ref, then everything should be transparent from the PVH privileged point of view and DMA transfers involving foreign pages keep working with no issues. Otherwise we would need a P2M (and an M2P) for the PVH privileged domain to track these foreign pages .. (see arch/arm/xen/p2m.c)." (Stefano Stabellini). We still have to inhibit the building of the P2M tree. That had been done in the past by not calling xen_build_dynamic_phys_to_machine (which sets up the P2M tree and gives us the virtual addresses to access it). But we were missing a check for xen_build_mfn_list_list - which was continuing to set up the P2M tree and would blow up trying to get the virtual address of p2m_missing (which would have been set up by xen_build_dynamic_phys_to_machine). Hence a check is needed to not call xen_build_mfn_list_list when running in auto-xlat mode. Instead of replicating the check for auto-xlat in enlighten.c, do it in the p2m.c code. The reason is that xen_build_mfn_list_list is also called in xen_arch_post_suspend without any checks for auto-xlat.
So for PVH or PV with auto-xlat - we would needlessly allocate space for a P2M tree. Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 3 +-- arch/x86/xen/p2m.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index eb0efc2f9d3c..23ead298edbd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1488,8 +1488,7 @@ asmlinkage void __init xen_start_kernel(void) x86_configure_nx(); /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_build_dynamic_phys_to_machine(); + xen_build_dynamic_phys_to_machine(); /* * Set up kernel GDT and segment registers, mainly so that diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2ae8699e8767..fb7ee0a4f9d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void) { unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); @@ -346,10 +349,15 @@ void xen_setup_mfn_list_list(void) /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long *mfn_list; + unsigned long max_pfn; unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + mfn_list = (unsigned long *)xen_start_info->mfn_list; + max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); From 32df75cd148b43e007848ddbfdb1ea25535114cb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 31 Dec 2013 12:37:52 -0500 Subject: [PATCH 27/52] xen/mmu/p2m: Refactor the xen_pagetable_init code (v2). The revectoring and copying of the P2M only happen when !auto-xlat and on 64-bit builds. It is not obvious from the code, so let's have separate 32 and 64-bit functions. We also invert the check for auto-xlat to make the code flow simpler. Suggested-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 70 ++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ce563be09cc1..c140efffe37e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr, * instead of somewhere later and be confusing. */ xen_mc_flush(); } -#endif -static void __init xen_pagetable_init(void) +static void __init xen_pagetable_p2m_copy(void) { -#ifdef CONFIG_X86_64 unsigned long size; unsigned long addr; -#endif - paging_init(); - xen_setup_shared_info(); -#ifdef CONFIG_X86_64 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long new_mfn_list; + unsigned long new_mfn_list; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY!
*/ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } else + return; - /* On 32-bit, we get zero so this never gets executed. */ - new_mfn_list = xen_revector_p2m_tree(); - if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); - addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ - size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; - } else - goto skip; - } /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page @@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void) * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); #endif -skip: +} +#endif + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_copy(); #endif xen_post_allocator_init(); } From b621e157ba48fb7d36945405de68c5fa25e7b73c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 3 Jan 2014 14:08:39 -0500 Subject: [PATCH 28/52] xen/mmu: Cleanup xen_pagetable_p2m_copy a bit. Stefano noticed that the code runs only under 64-bit, so the comments about 32-bit are pointless. We also change the handling of the condition where xen_revector_p2m_tree returns the same value - because it could not allocate a swath of space to put the new P2M in, or because it had already been called once. In such a case we return early from the function. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c140efffe37e..9d74249542c5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1209,29 +1209,29 @@ static void __init xen_pagetable_p2m_copy(void) size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - /* On 32-bit, we get zero so this never gets executed.
*/ new_mfn_list = xen_revector_p2m_tree(); - if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); - addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ - size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; - } else + /* No memory or already called. */ + if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) return; + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page From 4e44e44b0bd25bc4ed23232f06fc7275f1e4e38d Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 12:41:27 -0500 Subject: [PATCH 29/52] xen/pvh: MMU changes for PVH (v2) .. which are surprisingly small compared to the amount for PV code. PVH mostly uses native mmu ops; we leave the generic (native_*) ones in place for the majority and just overwrite the baremetal ones with those we need. At startup, we are running with pre-allocated page-tables courtesy of the tool-stack. But we still need to graft them into the initial Linux pagetables. However, there is no need to unpin/pin them or change them to R/O or R/W. Note that xen_pagetable_init, due to 7836fec9d0994cc9c9150c5a33f0eb0eb08a335a "xen/mmu/p2m: Refactor the xen_pagetable_init code.", does not need any changes - we just need to make sure that xen_post_allocator_init does not alter the pvops from the default native ones.
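The recurring guard in the hunks below can be read as the following idiom (an illustrative condensation, not a literal hunk from this patch):

	/* PVH: auto-translated, so the hardware/EPT owns the translation;
	 * skip the PV pagetable work and keep the native (baremetal) ops. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;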
Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 75 ++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9d74249542c5..490ddb354590 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1757,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); + /* For PVH no need to set R/O or R/W to pin them or unpin them. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1867,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * but that's enough to get __va working. We need to fill in the rest * of the physical mapping once some sort of allocator has been set * up. + * NOTE: for PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1888,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); - - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_i[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); @@ -1922,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) copy_page(level2_fixmap_pgt, l2); /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. 
*/ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -2107,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void __init xen_post_allocator_init(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; From 76bcceff0bfbded075c9703ec5413a9bf50ef8c4 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 3 Jan 2014 09:48:08 -0500 Subject: [PATCH 30/52] xen/pvh/mmu: Use PV TLB instead of native. We also optimize one - the TLB flush. The native operation would needlessly IPI offline VCPUs causing extra wakeups. Using the Xen one avoids that and lets the hypervisor determine which VCPU needs the TLB flush. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 490ddb354590..c1d406f35523 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2222,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + + /* Optimization - we can use the HVM one but it has no idea which + * VCPUs are descheduled - which means that it will needlessly IPI + * them. Xen knows so let it do the job. 
+ */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + return; + } pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); From 4dd322bc3b25be40dfa91e8ac483b846f3e8dffc Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 14:02:44 -0500 Subject: [PATCH 31/52] xen/pvh: Setup up shared_info. For PVHVM the shared_info structure is provided in the same way as for normal PV guests (see include/xen/interface/xen.h). That is, during bootup we get 'xen_start_info' via the %esi register in startup_xen. Then later we extract the 'shared_info' from said structure (in xen_setup_shared_info) and start using it. The 'xen_setup_shared_info' is all set up to work with auto-xlat guests, but there are two functions which it calls that are not: xen_setup_mfn_list_list and xen_setup_vcpu_info_placement. This patch modifies the P2M code (xen_setup_mfn_list_list) while the "Piggyback on PVHVM for event channels" patch modifies xen_setup_vcpu_info_placement. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fb7ee0a4f9d0..696c694986d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -339,6 +339,9 @@ void __ref xen_build_mfn_list_list(void) void xen_setup_mfn_list_list(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = From 8d656bbe43aee6d1be6b49fcf8acbc04588472bc Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 13:03:37 -0500 Subject: [PATCH 32/52] xen/pvh: Load GDT/GS in early PV bootup code for BSP. During early bootup we start life using the Xen provided GDT, which means that we are running with the %cs segment set to FLAT_KERNEL_CS (FLAT_RING3_CS64 0xe033, GDT index 261). But for PVH we want to use the HVM type mechanism for segment operations. As such we need to switch to the HVM one and also reload ourselves with the __KERNEL_CS:eip to run in the proper GDT and segment. For HVM this is usually done in 'secondary_startup_64' (in head_64.S), but since we are not taking that bootup path (we start in PV - xen_start_kernel) we need to do that in the early PV bootup paths. For good measure we also zero out the %fs, %ds, and %es (not strictly needed as Xen has already cleared them for us). The %gs is loaded by 'switch_to_new_gdt'. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel --- arch/x86/xen/enlighten.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 23ead298edbd..1170d00879d5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1410,8 +1410,43 @@ static void __init xen_boot_params_init_edd(void) * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. */ -static void __init xen_setup_stackprotector(void) +static void __init xen_setup_gdt(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_X86_64 + unsigned long dummy; + + switch_to_new_gdt(0); /* GDT and GS set */ + + /* We are switching of the Xen provided GDT to our HVM mode + * GDT. The new GDT has __KERNEL_CS with CS.L = 1 + * and we are jumping to reload it.
+ */ + asm volatile ("pushq %0\n" + "leaq 1f(%%rip),%0\n" + "pushq %0\n" + "lretq\n" + "1:\n" + : "=&r" (dummy) : "0" (__KERNEL_CS)); + + /* + * While not needed, we also set the %es, %ds, and %fs + * to zero. We don't care about %ss as it is NULL. + * Strictly speaking this is not needed as Xen zeros those + * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) + * + * Linux zeros them in cpu_init() and in secondary_startup_64 + * (for BSP). + */ + loadsegment(es, 0); + loadsegment(ds, 0); + loadsegment(fs, 0); +#else + /* PVH: TODO Implement. */ + BUG(); +#endif + return; /* PVH does not need any PV GDT ops. */ + } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1494,7 +1529,7 @@ asmlinkage void __init xen_start_kernel(void) * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ - xen_setup_stackprotector(); + xen_setup_gdt(); xen_init_irq_ops(); xen_init_cpuid_mask(); From 5840c84b16aad223d5305d8a569ea55de4120d67 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 11:48:08 -0500 Subject: [PATCH 33/52] xen/pvh: Secondary VCPU bringup (non-bootup CPUs) The VCPU bringup protocol follows the PV one with certain twists. From xen/include/public/arch-x86/xen.h: Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise for HVM and PVH guests, not all information in this structure is updated: - For HVM guests, the structures read include: fpu_ctxt (if VGCT_I387_VALID is set), flags, user_regs, debugreg[*] - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to set cr3. All other fields not used should be set to 0. This is what we do. We piggyback on 'xen_setup_gdt' - but modify it a bit - we need to call 'load_percpu_segment' so that 'switch_to_new_gdt' can load the per-cpu data structures. It has no effect on VCPU0. We also piggyback on the %rdi register to pass in the CPU number - so that when we boot up a new CPU, cpu_bringup_and_idle receives the CPU number as its first parameter (via %rdi for 64-bit). Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 11 ++++++--- arch/x86/xen/smp.c | 49 +++++++++++++++++++++++++++------------- arch/x86/xen/xen-ops.h | 1 + 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1170d00879d5..2eca6187fc92 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1409,14 +1409,19 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. + * + * Note, that it is __ref because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. */ -static void __init xen_setup_gdt(void) +void __ref xen_setup_gdt(int cpu) { if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_X86_64 unsigned long dummy; - switch_to_new_gdt(0); /* GDT and GS set */ + load_percpu_segment(cpu); /* We need to access per-cpu area */ + switch_to_new_gdt(cpu); /* GDT and GS set */ /* We are switching of the Xen provided GDT to our HVM mode * GDT. The new GDT has __KERNEL_CS with CS.L = 1 @@ -1529,7 +1534,7 @@ asmlinkage void __init xen_start_kernel(void) /* * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed.
*/ - xen_setup_gdt(); + xen_setup_gdt(0); xen_init_irq_ops(); xen_init_cpuid_mask(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c36b325abd83..5e46190133b2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -73,9 +73,11 @@ static void cpu_bringup(void) touch_softlockup_watchdog(); preempt_disable(); - xen_enable_sysenter(); - xen_enable_syscall(); - + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } cpu = smp_processor_id(); smp_store_cpu_info(cpu); cpu_data(cpu).x86_max_cores = 1; @@ -97,8 +99,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -static void cpu_bringup_and_idle(void) +/* Note: cpu parameter is only relevant for PVH */ +static void cpu_bringup_and_idle(int cpu) { +#ifdef CONFIG_X86_64 + if (xen_feature(XENFEAT_auto_translated_physmap) && + xen_feature(XENFEAT_supervisor_mode_kernel)) + xen_setup_gdt(cpu); +#endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); } @@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); if (xen_pv_domain()) { - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); #ifdef CONFIG_X86_32 /* @@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 + /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#else - ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; xen_copy_trap_info(ctxt->trap_ctxt); @@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 } - ctxt->user_regs.cs = __KERNEL_CS; +#else + } else + /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with + * %rdi having the cpu number - which means are passing in + * as the first parameter the cpu. Subtle! 
+ */ + ctxt->user_regs.rdi = cpu; +#endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 95f8c6142328..9059c24ed564 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); +void xen_setup_gdt(int cpu); #endif /* XEN_OPS_H */ From 9103bb0f8240b2a55aac3ff7ecba9c7dcf66b08b Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 13:02:58 -0500 Subject: [PATCH 34/52] xen/pvh: Update E820 to work with PVH (v2) In xen_add_extra_mem() we can skip updating P2M as it's managed by Xen. PVH maps the entire IO space, but only RAM pages need to be repopulated. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/setup.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2137c5101dac..dd5f905e33d5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include "mmu.h" #include "xen-ops.h" #include "vdso.h" @@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + xen_max_p2m_pfn = PFN_DOWN(start + size); for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, .domid = DOMID_SELF }; unsigned long len = 0; + int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap); unsigned long pfn; int ret; @@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, continue; frame = mfn; } else { - if (mfn != INVALID_P2M_ENTRY) + if (!xlated_phys && mfn != INVALID_P2M_ENTRY) continue; frame = pfn; } @@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start, static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { + /* + * Xen already ballooned out the E820 non RAM regions for us + * and set them up properly in EPT. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return end - start; + return xen_do_chunk(start, end, true); } @@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk( * (except for the ISA region which must be 1:1 mapped) to * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + + /* + * PVH E820 matches the hypervisor's P2M which means we need to + * account for the proper values of *release and *identity. + */ + for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) && + pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { pte_t pte = __pte_ma(0); if (pfn < PFN_UP(ISA_END_ADDRESS)) From 2771374d47220c7ec271281437625e9519505bb2 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Wed, 11 Dec 2013 15:36:51 -0500 Subject: [PATCH 35/52] xen/pvh: Piggyback on PVHVM for event channels (v2) PVH is a PV guest with a twist - there are certain things that work in it like HVM and some like PV. 
There is a similar mode - PVHVM where we run in HVM mode with PV code enabled - and this patch explores that. The most notable PV interfaces are the XenBus and event channels. We will piggyback on how the event channel mechanism is used in PVHVM - that is we want the normal native IRQ mechanism and we will install a vector (hvm callback) for which we will call the event channel mechanism. This means that from a pvops perspective, we can use native_irq_ops instead of the Xen PV specific. Albeit in the future we could support pirq_eoi_map. But that is a feature request that can be shared with PVHVM. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 5 +++-- arch/x86/xen/irq.c | 5 ++++- drivers/xen/events/events_base.c | 14 +++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2eca6187fc92..a4e2f30917a3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1144,8 +1144,9 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); /* xen_vcpu_setup managed to place the vcpu_info within the - percpu area for all cpus, so make use of it */ - if (have_vcpu_info_placement) { + * percpu area for all cpus, so make use of it. Note that for + * PVH we want to use native IRQ mechanism. */ + if (have_vcpu_info_placement && !xen_pvh_domain()) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 0da7f863056f..76ca326105f7 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { void __init xen_init_irq_ops(void) { - pv_irq_ops = xen_irq_ops; + /* For PVH we use default pv_irq_ops settings. 
*/ + if (!xen_feature(XENFEAT_hvm_callback_vector)) + pv_irq_ops = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 1d16185e82b2..4672e003c0ad 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1685,8 +1685,15 @@ void __init xen_init_IRQ(void) pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1695,13 +1702,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; From 7f256020cc599bc0b736c57d702b864dbbefcefb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 31 Dec 2013 15:55:39 -0500 Subject: [PATCH 36/52] xen/grants: Remove gnttab_max_grant_frames dependency on gnttab_init. The function gnttab_max_grant_frames() returns the maximum number of frames (pages) of grants we can have. Unfortunately it was dependent on gnttab_init() having been run beforehand to initialize the boot max value (boot_max_nr_grant_frames). This meant that users of gnttab_max_grant_frames would always get a zero value if they called it before gnttab_init() - such as 'platform_pci_init' (drivers/xen/platform-pci.c). Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- drivers/xen/grant-table.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index aa846a48f400..99399cb0fd1c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -62,7 +62,6 @@ static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); @@ -827,6 +826,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -1227,13 +1231,12 @@ int gnttab_init(void) gnttab_request_version(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor.
*/ BUG_ON(grefs_per_grant_frame == 0); - max_nr_glist_frames = (boot_max_nr_grant_frames * + max_nr_glist_frames = (gnttab_max_grant_frames() * grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), From 456847533b9ad18baa6685946a2f1e1fa9c05c34 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 31 Dec 2013 16:33:31 -0500 Subject: [PATCH 37/52] xen/grant-table: Refactor gnttab_init We have this odd scenario where for the PV paths we take a shortcut but for the HVM paths we first ioremap xen_hvm_resume_frames and then assign it to gnttab_shared.addr. This is needed because gnttab_map uses gnttab_shared.addr. Instead of having: if (pv) return gnttab_map if (hvm) ... gnttab_map let's move the HVM part before the gnttab_map and remove the first call to gnttab_map. Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- drivers/xen/grant-table.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 99399cb0fd1c..e69c7780c208 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1173,10 +1173,7 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); if (gnttab_shared.addr == NULL) { @@ -1185,10 +1182,7 @@ static int gnttab_setup(void) return -ENOMEM; } } - - gnttab_map(0, nr_grant_frames - 1); - - return 0; + return gnttab_map(0, nr_grant_frames - 1); } int gnttab_resume(void) From efaf30a3357872cf0fc7d555b1f9968ec71535d3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 6 Jan 2014 10:40:36 -0500 Subject: [PATCH 38/52] xen/grant: Implement a grant frame array struct (v3). The 'xen_hvm_resume_frames' used to be an 'unsigned long' and contain the virtual address of the grants. That was OK for most architectures (PVHVM, ARM) where the grants are contiguous in memory. That however is not the case for PVH - there we would have to do a PFN lookup for each virtual address. Instead of doing that, let's make it a structure which will contain the array of PFNs, the virtual address and the count of said PFNs. Also provide generic functions - gnttab_setup_auto_xlat_frames and gnttab_free_auto_xlat_frames - to populate said structure with appropriate values for PVHVM and ARM. To round it off, change the name from 'xen_hvm_resume_frames' to a more descriptive one - 'xen_auto_xlat_grant_frames'. For PVH, in the patch "xen/pvh: Piggyback on PVHVM for grant driver" we will populate the 'xen_auto_xlat_grant_frames' ourselves. v2 moves the xen_remap into gnttab_setup_auto_xlat_frames and also introduces xen_unmap for gnttab_free_auto_xlat_frames.
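Condensed from the ARM and platform-pci hunks below, the expected calling pattern for the new helpers is (illustrative only; error handling trimmed):

	grant_frames = res.start;	/* physical base of the grant region */
	if (gnttab_setup_auto_xlat_frames(grant_frames))
		return -ENOMEM;		/* fills in pfn[], count and vaddr */
	rc = gnttab_init();		/* gnttab_map() can now consume
					 * xen_auto_xlat_grant_frames */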
Suggested-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v3: Based on top of 'asm/xen/page.h: remove redundant semicolon'] Acked-by: Stefano Stabellini --- arch/arm/include/asm/xen/page.h | 1 + arch/arm/xen/enlighten.c | 9 +++-- arch/x86/include/asm/xen/page.h | 1 + drivers/xen/grant-table.c | 58 +++++++++++++++++++++++++++++---- drivers/xen/platform-pci.c | 10 ++++-- include/xen/grant_table.h | 9 ++++- 6 files changed, 75 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index ac6789aad059..709c4b4d2f1d 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -118,5 +118,6 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) } #define xen_remap(cookie, size) ioremap_cached((cookie), (size)) +#define xen_unmap(cookie) iounmap((cookie)) #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 85501238b425..2162172c0ddc 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -208,6 +208,7 @@ static int __init xen_guest_init(void) const char *version = NULL; const char *xen_prefix = "xen,xen-"; struct resource res; + unsigned long grant_frames; node = of_find_compatible_node(NULL, NULL, "xen,xen"); if (!node) { @@ -224,10 +225,10 @@ static int __init xen_guest_init(void) } if (of_address_to_resource(node, GRANT_TABLE_PHYSADDR, &res)) return 0; - xen_hvm_resume_frames = res.start; + grant_frames = res.start; xen_events_irq = irq_of_parse_and_map(node, 0); pr_info("Xen %s support found, events_irq=%d gnttab_frame_pfn=%lx\n", - version, xen_events_irq, (xen_hvm_resume_frames >> PAGE_SHIFT)); + version, xen_events_irq, (grant_frames >> PAGE_SHIFT)); xen_domain_type = XEN_HVM_DOMAIN; xen_setup_features(); @@ -265,6 +266,10 @@ static int __init xen_guest_init(void) if (xen_vcpu_info == NULL) return -ENOMEM; + if (gnttab_setup_auto_xlat_frames(grant_frames)) { + free_percpu(xen_vcpu_info); + return -ENOMEM; + } gnttab_init(); if (!xen_initial_domain()) xenbus_probe(NULL); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4a092ccdd147..3e276eb23d1b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -227,5 +227,6 @@ void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); #define xen_remap(cookie, size) ioremap((cookie), (size)); +#define xen_unmap(cookie) iounmap((cookie)) #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e69c7780c208..44b75ccfbbff 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -65,8 +65,7 @@ static unsigned int nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -838,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(unsigned long addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", + addr); + return -ENOMEM; + } 
+ pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -1068,6 +1112,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -1076,7 +1121,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1174,11 +1219,10 @@ static int gnttab_setup(void) return -ENOSYS; if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { - gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", - xen_hvm_resume_frames); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2f3528e93cb9..f1947ac218d9 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev, long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,16 @@ static int platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + if (gnttab_setup_auto_xlat_frames(grant_frames)) + goto out; ret = gnttab_init(); if (ret) - goto out; + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 694dcaf266e6..5acb1e4ac0d3 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -178,8 +178,15 @@ int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, grant_status_t **__shared); void arch_gnttab_unmap(void *shared, unsigned long nr_gframes); -extern unsigned long xen_hvm_resume_frames; +struct grant_frames { + xen_pfn_t *pfn; + unsigned int count; + void *vaddr; +}; +extern struct grant_frames 
xen_auto_xlat_grant_frames; unsigned int gnttab_max_grant_frames(void); +int gnttab_setup_auto_xlat_frames(unsigned long addr); +void gnttab_free_auto_xlat_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) From 6926f6d6109714aab7b26df7099b12555e36676f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 3 Jan 2014 10:20:18 -0500 Subject: [PATCH 39/52] xen/pvh: Piggyback on PVHVM for grant driver (v4) In PVH the shared grant frame is the PFN and not the MFN, hence it's mapped via the same code path as HVM. The allocation of the grant frame is done differently - we do not use the early platform-pci driver and have an ioremap area - instead we use balloon memory and stitch all of the non-contiguous pages into a virtualized area. That means when we call the hypervisor to replace the GMFN with a XENMAPSPACE_grant_table type, we need to look up the old PFN for every iteration instead of assuming a flat contiguous PFN allocation. Lastly, we only use v1 for grants. This is because PVHVM is not able to use v2 due to no XENMEM_add_to_physmap calls on the error status page (see commit 69e8f430e243d657c2053f097efebc2e2cd559f0 xen/granttable: Disable grant v2 for HVM domains.) Until that is implemented this workaround has to be in place. Also, per suggestions by Stefano, utilize the PVHVM paths as they share common functionality. v2 of this patch moves most of the PVH code out into the arch/x86/xen/grant-table driver and only minimally touches the generic driver. v3, v4: fix up some of the code due to earlier patches. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/grant-table.c | 62 ++++++++++++++++++++++++++++++++++++++ drivers/xen/gntdev.c | 2 +- drivers/xen/grant-table.c | 9 +++--- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 3a5f55d51907..2d719799fd6a 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -125,3 +125,65 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); } +#ifdef CONFIG_XEN_PVH +#include +#include +#include +static int __init xlated_setup_gnttab_pages(void) +{ + struct page **pages; + xen_pfn_t *pfns; + int rc; + unsigned int i; + unsigned long nr_grant_frames = gnttab_max_grant_frames(); + + BUG_ON(nr_grant_frames == 0); + pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); + if (!pfns) { + kfree(pages); + return -ENOMEM; + } + rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + if (rc) { + pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + kfree(pages); + kfree(pfns); + return rc; + } + for (i = 0; i < nr_grant_frames; i++) + pfns[i] = page_to_pfn(pages[i]); + + rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, + &xen_auto_xlat_grant_frames.vaddr); + + kfree(pages); + if (rc) { + pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + free_xenballooned_pages(nr_grant_frames, pages); + kfree(pfns); + return rc; + } + + xen_auto_xlat_grant_frames.pfn = pfns; + xen_auto_xlat_grant_frames.count = nr_grant_frames; + + return 0; +} + +static int __init xen_pvh_gnttab_setup(void) +{ + if (!xen_pvh_domain()) + return -ENODEV; + + return xlated_setup_gnttab_pages(); +} +/* Call it _before_ __gnttab_init as we
need to initialize the + * xen_auto_xlat_grant_frames first. */ +core_initcall(xen_pvh_gnttab_setup); +#endif diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index e41c79c986ea..073b4a19a8b0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -846,7 +846,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 44b75ccfbbff..1d5fbce4acb7 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1108,7 +1108,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; @@ -1184,7 +1184,7 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) + if (xen_feature(XENFEAT_auto_translated_physmap)) gsv.version = 1; else gsv.version = 2; @@ -1327,5 +1327,6 @@ static int __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); From be3e9cf33094210a0723fdc841e1abfd0ddc1007 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 13:57:35 -0500 Subject: [PATCH 40/52] xen/pvh: Piggyback on PVHVM XenBus. PVH is a PV guest with a twist - there are certain things that work in it like HVM and some like PV. For the XenBus mechanism we want to use the PVHVM mechanism. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- drivers/xen/xenbus/xenbus_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index ec097d6f964d..01d59e66565d 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "xenbus_probe.h" @@ -743,7 +744,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; From 4e903a20da51ed2329c1b9c182dba74f47ac2ca8 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 11:16:25 -0500 Subject: [PATCH 41/52] xen/pvh: Support ParaVirtualized Hardware extensions (v3). PVH allows a PV Linux guest to utilize hardware extended capabilities, such as running MMU updates in an HVM container. The Xen side defines PVH as (from docs/misc/pvh-readme.txt, with modifications): "* the guest uses auto translate: - p2m is managed by Xen - pagetables are owned by the guest - mmu_update hypercall not available * it uses event callback and not vlapic emulation, * IDT is native, so set_trap_table hcall is also N/A for a PVH guest. For a full list of hcalls supported for PVH, see pvh_hypercall64_table in arch/x86/hvm/hvm.c in xen. From the ABI perspective, it's mostly a PV guest with auto translate, although it does use hvm_op for setting the callback vector." Use .ascii and .asciz to define the xen feature string.
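To make the assembler behaviour concrete (a sketch, not part of the patch): .ascii emits its bytes with no terminator while .asciz appends a NUL, so

	.ascii "!writable_page_tables|pae_pgdir_above_4gb"
	.asciz "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"

produces a single NUL-terminated feature string, whereas using .asciz for both pieces would embed a stray NUL in the middle of the list.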
Note that the PVH string must be on a single line (not multiple lines with \) to keep the assembler from putting a null char after each string before the \. This patch allows it to be configured and enabled. We also introduce the 'XEN_ELFNOTE_SUPPORTED_FEATURES' ELF note to tell the hypervisor that 'hvm_callback_vector' is what the kernel needs. We cannot put it in 'XEN_ELFNOTE_FEATURES' as older hypervisors parse fields they don't understand as errors and refuse to load the kernel. This work-around fixes the problem. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/xen-head.S | 25 ++++++++++++++++++++++++- include/xen/interface/elfnote.h | 13 +++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e7d05900efac..d88bfd66aaab 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,6 +53,6 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN && BROKEN + depends on X86_64 && XEN select XEN_PVHVM def_bool n diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7faed5869e5b..485b69585540 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -11,8 +11,28 @@ #include #include +#include #include +#ifdef CONFIG_XEN_PVH +#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" +/* Note the lack of 'hvm_callback_vector'. Older hypervisor will + * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in + * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. + */ +#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_auto_translated_physmap) | \ + (1 << XENFEAT_supervisor_mode_kernel) | \ + (1 << XENFEAT_hvm_callback_vector)) +/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that + * up regardless whether this CONFIG option is enabled or not, but it + * clarifies what the right flags need to be. + */ +#else +#define PVH_FEATURES_STR "" +#define PVH_FEATURES (0) +#endif + __INIT ENTRY(startup_xen) cld @@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 0360b15f4883..6f4eae328ca7 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -140,6 +140,19 @@ */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 +/* + * The features supported by this kernel (numeric). + * + * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a + * kernel to specify support for features that older hypervisors don't + * know about. The set of features 4.2 and newer hypervisors will + * consider supported by the kernel is the combination of the sets + * specified through this and the string note.
+ * + * LEGACY: FEATURES + */ +#define XEN_ELFNOTE_SUPPORTED_FEATURES 17 + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* From 11c7ff17c9b6dbf3a4e4f36be30ad531a6cf0ec9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 6 Jan 2014 10:44:39 -0500 Subject: [PATCH 42/52] xen/grant-table: Force to use v1 of grants. We have the framework to use v2, but there are no backends that actually use it. The end result is that on PV we use v2 grants and on PVHVM v1. v1 has a capacity of 512 grants per page (8-byte entries) while v2 has 256 grants per page (16-byte entries). This means we lose about 50% of the capacity under v2 - and if we want more than 16 VIFs (each VIF takes 512 grants, i.e. two v2 grant pages), we hit the per-guest maximum of 32 grant frames. Oracle-bug: 16039922 CC: annie.li@oracle.com CC: msw@amazon.com Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel --- drivers/xen/grant-table.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1d5fbce4acb7..1ce1c40331f3 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1184,10 +1184,8 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_feature(XENFEAT_auto_translated_physmap)) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; From 89c3cf52c76ff3d7b129156f4b8943af517d9db2 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Thu, 5 Dec 2013 19:34:05 +0800 Subject: [PATCH 43/52] xen: Use dev_is_pci() to check whether it is a PCI device Use the standard PCI macro dev_is_pci() instead of directly comparing against pci_bus_type to check whether a device is a PCI device. Signed-off-by: Yijing Wang Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jan Beulich --- drivers/xen/dbgp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index f3ccc80a455f..8145a59fd9f6 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -19,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct pci_dev *pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); From 0869a642326e786544ef3e307175dcec150fe3dc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 7 Jan 2014 09:56:06 -0500 Subject: [PATCH 44/52] xen/pvh: Fix compile issues with xen_pvh_domain() Oddly enough it compiles with my ancient compiler, but with the supplied .config it does blow up. The fix is easy enough. Reported-by: kbuild test robot Reported-by: Jim Davis Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/grant-table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 2d719799fd6a..103c93f874b2 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -128,6 +128,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) #ifdef CONFIG_XEN_PVH #include #include +#include #include static int __init xlated_setup_gnttab_pages(void) { From 5602aba808aa5440e4b8fa4340079cb2047651ee Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Jan 2014 21:37:09 +0800 Subject: [PATCH 45/52] xen/pvh: remove duplicated include from enlighten.c Remove duplicated include.
Signed-off-by: Wei Yongjun Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a4e2f30917a3..b6d61c353fe5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include From 89b9e08f186a203c250872a663c9eab09cdc583a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Jan 2014 21:11:05 +0800 Subject: [PATCH 46/52] xen-platform: fix error return code in platform_pci_init() Fix to return a negative error code from the error handling case instead of 0; otherwise the error condition can't be reflected in the return value. Signed-off-by: Wei Yongjun Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/platform-pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index f1947ac218d9..a1361c312c06 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -156,7 +156,8 @@ static int platform_pci_init(struct pci_dev *pdev, max_nr_gframes = gnttab_max_grant_frames(); grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - if (gnttab_setup_auto_xlat_frames(grant_frames)) + ret = gnttab_setup_auto_xlat_frames(grant_frames); + if (ret) goto out; ret = gnttab_init(); if (ret) From be1403b9e66bea9d64db9198256cb27532a870b1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Jan 2014 21:11:25 +0800 Subject: [PATCH 47/52] xen/evtchn_fifo: fix error return code in evtchn_fifo_setup() Fix to return -ENOMEM from the error handling case instead of 0 (overwritten with 0 by the HYPERVISOR_event_channel_op call); otherwise the error condition can't be reflected in the return value. Signed-off-by: Wei Yongjun Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel --- drivers/xen/events/events_fifo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index e2bf9571f7fe..5b2c039f16c5 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -109,7 +109,7 @@ static int evtchn_fifo_setup(struct irq_info *info) { unsigned port = info->evtchn; unsigned new_array_pages; - int ret = -ENOMEM; + int ret; new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; @@ -124,8 +124,10 @@ static int evtchn_fifo_setup(struct irq_info *info) array_page = event_array[event_array_pages]; if (!array_page) { array_page = (void *)__get_free_page(GFP_KERNEL); - if (array_page == NULL) + if (array_page == NULL) { + ret = -ENOMEM; goto error; + } event_array[event_array_pages] = array_page; } From b1a3b1c8a8d963424c4699efa64dd8986b2f76d7 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 3 Jan 2014 19:02:09 +0000 Subject: [PATCH 48/52] xen/fb: allow xenfb initialization for hvm guests There is no reason why an HVM guest shouldn't be allowed to use xenfb. As a matter of fact, ARM guests (HVM from the Linux point of view) can use xenfb. Given that no Xen toolstack configures a xenfb backend for x86 HVM guests, they are not affected.
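(Editorial aside: a minimal model, not the kernel's actual implementation, of why relaxing the guard from xen_pv_domain() to xen_domain() admits HVM and ARM guests; the enum and helpers below are simplified assumptions modeled on the kernel's xen_domain_type logic.)

	/* Simplified sketch of the guard helpers involved in this patch. */
	#include <stdbool.h>
	#include <stdio.h>

	enum xen_domain_type { XEN_NATIVE, XEN_PV_DOMAIN, XEN_HVM_DOMAIN };
	static enum xen_domain_type xen_domain_type = XEN_HVM_DOMAIN;

	static bool xen_domain(void)    { return xen_domain_type != XEN_NATIVE; }
	static bool xen_pv_domain(void) { return xen_domain_type == XEN_PV_DOMAIN; }

	int main(void)
	{
		/* Old guard: an HVM (or ARM) guest was rejected with -ENODEV. */
		printf("xen_pv_domain(): %d\n", xen_pv_domain()); /* 0 */
		/* New guard: any Xen guest may initialize xenfb. */
		printf("xen_domain():    %d\n", xen_domain());    /* 1 */
		return 0;
	}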
Please note that at this time QEMU needs a few outstanding fixes to provide xenfb on ARM: http://marc.info/?l=qemu-devel&m=138739419700837&w=2 Signed-off-by: Stefano Stabellini Acked-by: David Vrabel CC: boris.ostrovsky@oracle.com CC: plagnioj@jcrosoft.com CC: tomi.valkeinen@ti.com CC: linux-fbdev@vger.kernel.org CC: konrad.wilk@oracle.com Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Tomi Valkeinen --- drivers/video/xen-fbfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 4b2d3ab870f3..901014bbc821 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -693,7 +693,7 @@ static DEFINE_XENBUS_DRIVER(xenfb, , static int __init xenfb_init(void) { - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; /* Nothing to do if running in dom0. */ From 0db6991dd233396da766076caef71f36b4f96c21 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 10 Jan 2014 09:50:08 -0500 Subject: [PATCH 49/52] xen: delete new instances of __cpuinit usage Commit 1fe565517b57676884349dccfd6ce853ec338636 ("xen/events: use the FIFO-based ABI if available") added new instances of __cpuinit macro usage. We removed this a couple of versions ago; we now want to remove the compat no-op stubs. Introducing new users is not what we want to see at this point in time, as it will break once the stubs are gone. Cc: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Signed-off-by: Paul Gortmaker Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events/events_fifo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 5b2c039f16c5..1de2a191b395 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -359,7 +359,7 @@ static const struct evtchn_ops evtchn_ops_fifo = { .resume = evtchn_fifo_resume, }; -static int __cpuinit evtchn_fifo_init_control_block(unsigned cpu) +static int evtchn_fifo_init_control_block(unsigned cpu) { struct page *control_block = NULL; struct evtchn_init_control init_control; @@ -386,7 +386,7 @@ static int __cpuinit evtchn_fifo_init_control_block(unsigned cpu) return ret; } -static int __cpuinit evtchn_fifo_cpu_notification(struct notifier_block *self, +static int evtchn_fifo_cpu_notification(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -404,7 +404,7 @@ static int __cpuinit evtchn_fifo_cpu_notification(struct notifier_block *self, return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; } -static struct notifier_block evtchn_fifo_cpu_notifier __cpuinitdata = { +static struct notifier_block evtchn_fifo_cpu_notifier = { .notifier_call = evtchn_fifo_cpu_notification, }; From 54d44eb3c7f6f893ae3b3cfb545145aa337976ca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Jan 2014 10:45:35 -0500 Subject: [PATCH 50/52] xen/pvh: Use 'depends' instead of 'select'. Using 'select' enables the selected CONFIG option without checking its dependencies. That meant we would inadvertently turn on CONFIG_XEN_PVHVM while its core dependency (CONFIG_PCI) was turned off.
This patch fixes the warnings and compile failures: warning: (XEN_PVH) selects XEN_PVHVM which has unmet direct dependencies (HYPERVISOR_GUEST && XEN && PCI && X86_LOCAL_APIC) Reported-by: Jim Davis Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d88bfd66aaab..01b90261fa38 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,6 +53,5 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN - select XEN_PVHVM + depends on X86_64 && XEN && XEN_PVHVM def_bool n From ea70ba3ab99a10f1eda4ab6473ea11d6acfd57f5 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 17 Jan 2014 11:51:51 +0000 Subject: [PATCH 51/52] MAINTAINERS: add git repository for Xen Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d5e4ff328cc7..49d308f2542c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9544,6 +9544,7 @@ M: Konrad Rzeszutek Wilk M: Boris Ostrovsky M: David Vrabel L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +T: git git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git S: Supported F: arch/x86/xen/ F: drivers/*/xen-*front.c From c9f6e9977e38de15da96b732a8dec0ef56cbf977 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 20 Jan 2014 09:20:07 -0500 Subject: [PATCH 52/52] xen/pvh: Set X86_CR0_WP and others in CR0 (v2) Otherwise some user-space applications that use 'clone' with CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID end up hitting an assert in glibc, manifested by: general protection ip:7f80720d364c sp:7fff98fd8a80 error:0 in libc-2.13.so[7f807209e000+180000] This is due to the nature of said operations, which set and clear the child TID. "In the successful one I can see that the page table of the parent process has been updated successfully to use a different physical page, so the write of the tid on that page only affects the child... On the other hand, in the failed case, the write seems to happen before the copy of the original page is done, so both the parent and the child end up with the same value (because the parent copies the page after the write of the child tid has already happened)." (Roger's analysis). The root cause is Xen commit 51e2cac257ec8b4080d89f0855c498cbbd76a5e5 "x86/pvh: set only minimal cr0 and cr4 flags in order to use paging", in which X86_CR0_WP was removed, so the copy-on-write machinery of the Linux kernel was not operating properly. While fixing that, also update the rest of the CR0 flags to be in line with what a baremetal Linux kernel would set them to. Baremetal Linux, in 'secondary_startup_64', sets: X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | X86_CR0_PG The hypervisor for HVM type guests (which PVH partly is) sets: X86_CR0_PE | X86_CR0_ET | X86_CR0_TS For PVH it specifically sets: X86_CR0_PG Which means we need to set the rest: X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM to have full parity.
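(Editorial aside: the flag arithmetic above can be checked mechanically. The three sets are quoted from the commit message; the CR0 bit values are the standard x86 ones from processor-flags.h, and the program itself is purely illustrative.)

	/* Verifies: baremetal set minus (HVM-provided set | PVH's PG)
	 * equals MP | NE | WP | AM, as the commit message states. */
	#include <stdio.h>

	#define X86_CR0_PE 0x00000001UL
	#define X86_CR0_MP 0x00000002UL
	#define X86_CR0_TS 0x00000008UL
	#define X86_CR0_ET 0x00000010UL
	#define X86_CR0_NE 0x00000020UL
	#define X86_CR0_WP 0x00010000UL
	#define X86_CR0_AM 0x00040000UL
	#define X86_CR0_PG 0x80000000UL

	int main(void)
	{
		unsigned long baremetal = X86_CR0_PE | X86_CR0_MP | X86_CR0_ET |
					  X86_CR0_NE | X86_CR0_WP | X86_CR0_AM |
					  X86_CR0_PG;
		unsigned long xen_hvm = X86_CR0_PE | X86_CR0_ET | X86_CR0_TS;
		unsigned long xen_pvh = X86_CR0_PG;
		unsigned long missing = baremetal & ~(xen_hvm | xen_pvh);

		/* Prints 0x50022 == MP | NE | WP | AM. */
		printf("missing CR0 flags: %#lx\n", missing);
		return 0;
	}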
Signed-off-by: Roger Pau Monne Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk [v1: Took out the cr4 writes to be a separate patch] [v2: 0-DAY kernel found xen_setup_gdt to be missing a static] --- arch/x86/xen/enlighten.c | 33 ++++++++++++++++++++++++++++++--- arch/x86/xen/smp.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b6d61c353fe5..a4d7b647867f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1414,7 +1414,7 @@ static void __init xen_boot_params_init_edd(void) * is PVH which is not going to use xen_load_gdt_boot or other * __init functions. */ -void __ref xen_setup_gdt(int cpu) +static void __ref xen_setup_gdt(int cpu) { if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_X86_64 @@ -1462,13 +1462,40 @@ void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } +/* + * A PV guest starts with default flags that are not set for PVH; set them + * here as soon as possible. + */ +static void xen_pvh_set_cr_flags(int cpu) +{ + + /* Some of these are set up in 'secondary_startup_64'. The others: + * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests + * (whose codepaths PVH shares), while X86_CR0_PG is for PVH. */ + write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); +} + +/* + * Note that it is __ref because the only caller of this after init + * is PVH, which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +void __ref xen_pvh_secondary_vcpu_init(int cpu) +{ + xen_setup_gdt(cpu); + xen_pvh_set_cr_flags(cpu); +} + static void __init xen_pvh_early_guest_init(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) return; - if (xen_feature(XENFEAT_hvm_callback_vector)) - xen_have_vector_callback = 1; + if (!xen_feature(XENFEAT_hvm_callback_vector)) + return; + + xen_have_vector_callback = 1; + xen_pvh_set_cr_flags(0); #ifdef CONFIG_X86_32 BUG(); /* PVH: Implement proper support. */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5e46190133b2..a18eadd8bb40 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -105,7 +105,7 @@ static void cpu_bringup_and_idle(int cpu) #ifdef CONFIG_X86_64 if (xen_feature(XENFEAT_auto_translated_physmap) && xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_setup_gdt(cpu); + xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9059c24ed564..1cb6f4c37300 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -123,5 +123,5 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_setup_gdt(int cpu); +void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */