Merge branch 'stable/for-jens-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-4.5/drivers

Konrad writes:

The pull is based on converting the backend driver into a multiqueue
driver and exposing more than one queue to the frontend. As such we had
to modify the frontend and also fix a bunch of bugs around this.

The original work is based on Arianna Avanzini's work as an OPW intern.
Bob took over the work and had been massaging it for quite some time.

Also included are features for 64KB page support for ARM and various
bug-fixes.
This commit is contained in:
Jens Axboe 2016-01-13 08:20:36 -07:00
commit 038a75afc5
5 changed files with 1308 additions and 726 deletions

View File

@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants, MODULE_PARM_DESC(max_persistent_grants,
"Maximum number of grants to map persistently"); "Maximum number of grants to map persistently");
/*
* Maximum number of rings/queues blkback supports, allow as many queues as there
* are CPUs if user has not specified a value.
*
* Exposed as the "max_queues" module parameter (mode 0644).
*/
unsigned int xenblk_max_queues;
module_param_named(max_queues, xenblk_max_queues, uint, 0644);
/*
* The two literals below are concatenated by the compiler, so the first one
* must end with a space to keep the sentences separated in the description.
*/
MODULE_PARM_DESC(max_queues,
"Maximum number of hardware queues per virtual disk. " \
"By default it is the number of online CPUs.");
/* /*
* Maximum order of pages to be used for the shared ring between front and * Maximum order of pages to be used for the shared ring between front and
* backend, 4KB page granularity is used. * backend, 4KB page granularity is used.
@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
/* Number of free pages to remove on each call to gnttab_free_pages */ /* Number of free pages to remove on each call to gnttab_free_pages */
#define NUM_BATCH_FREE_PAGES 10 #define NUM_BATCH_FREE_PAGES 10
static inline int get_free_page(struct xen_blkif *blkif, struct page **page) static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
if (list_empty(&blkif->free_pages)) { if (list_empty(&ring->free_pages)) {
BUG_ON(blkif->free_pages_num != 0); BUG_ON(ring->free_pages_num != 0);
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
return gnttab_alloc_pages(1, page); return gnttab_alloc_pages(1, page);
} }
BUG_ON(blkif->free_pages_num == 0); BUG_ON(ring->free_pages_num == 0);
page[0] = list_first_entry(&blkif->free_pages, struct page, lru); page[0] = list_first_entry(&ring->free_pages, struct page, lru);
list_del(&page[0]->lru); list_del(&page[0]->lru);
blkif->free_pages_num--; ring->free_pages_num--;
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
return 0; return 0;
} }
static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
int num) int num)
{ {
unsigned long flags; unsigned long flags;
int i; int i;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
for (i = 0; i < num; i++) for (i = 0; i < num; i++)
list_add(&page[i]->lru, &blkif->free_pages); list_add(&page[i]->lru, &ring->free_pages);
blkif->free_pages_num += num; ring->free_pages_num += num;
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
} }
static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
{ {
/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
struct page *page[NUM_BATCH_FREE_PAGES]; struct page *page[NUM_BATCH_FREE_PAGES];
unsigned int num_pages = 0; unsigned int num_pages = 0;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
while (blkif->free_pages_num > num) { while (ring->free_pages_num > num) {
BUG_ON(list_empty(&blkif->free_pages)); BUG_ON(list_empty(&ring->free_pages));
page[num_pages] = list_first_entry(&blkif->free_pages, page[num_pages] = list_first_entry(&ring->free_pages,
struct page, lru); struct page, lru);
list_del(&page[num_pages]->lru); list_del(&page[num_pages]->lru);
blkif->free_pages_num--; ring->free_pages_num--;
if (++num_pages == NUM_BATCH_FREE_PAGES) { if (++num_pages == NUM_BATCH_FREE_PAGES) {
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
gnttab_free_pages(num_pages, page); gnttab_free_pages(num_pages, page);
spin_lock_irqsave(&blkif->free_pages_lock, flags); spin_lock_irqsave(&ring->free_pages_lock, flags);
num_pages = 0; num_pages = 0;
} }
} }
spin_unlock_irqrestore(&blkif->free_pages_lock, flags); spin_unlock_irqrestore(&ring->free_pages_lock, flags);
if (num_pages != 0) if (num_pages != 0)
gnttab_free_pages(num_pages, page); gnttab_free_pages(num_pages, page);
} }
#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
static int do_block_io_op(struct xen_blkif *blkif); static int do_block_io_op(struct xen_blkif_ring *ring);
static int dispatch_rw_block_io(struct xen_blkif *blkif, static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req); struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id, static void make_response(struct xen_blkif_ring *ring, u64 id,
unsigned short op, int st); unsigned short op, int st);
#define foreach_grant_safe(pos, n, rbtree, node) \ #define foreach_grant_safe(pos, n, rbtree, node) \
@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
/* /*
* We don't need locking around the persistent grant helpers * We don't need locking around the persistent grant helpers
* because blkback uses a single-thread for each backed, so we * because blkback uses a single-thread for each backend, so we
* can be sure that this functions will never be called recursively. * can be sure that this functions will never be called recursively.
* *
* The only exception to that is put_persistent_grant, that can be called * The only exception to that is put_persistent_grant, that can be called
@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
* bit operations to modify the flags of a persistent grant and to count * bit operations to modify the flags of a persistent grant and to count
* the number of used grants. * the number of used grants.
*/ */
static int add_persistent_gnt(struct xen_blkif *blkif, static int add_persistent_gnt(struct xen_blkif_ring *ring,
struct persistent_gnt *persistent_gnt) struct persistent_gnt *persistent_gnt)
{ {
struct rb_node **new = NULL, *parent = NULL; struct rb_node **new = NULL, *parent = NULL;
struct persistent_gnt *this; struct persistent_gnt *this;
struct xen_blkif *blkif = ring->blkif;
if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
if (!blkif->vbd.overflow_max_grants) if (!blkif->vbd.overflow_max_grants)
blkif->vbd.overflow_max_grants = 1; blkif->vbd.overflow_max_grants = 1;
return -EBUSY; return -EBUSY;
} }
/* Figure out where to put new node */ /* Figure out where to put new node */
new = &blkif->persistent_gnts.rb_node; new = &ring->persistent_gnts.rb_node;
while (*new) { while (*new) {
this = container_of(*new, struct persistent_gnt, node); this = container_of(*new, struct persistent_gnt, node);
@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
/* Add new node and rebalance tree. */ /* Add new node and rebalance tree. */
rb_link_node(&(persistent_gnt->node), parent, new); rb_link_node(&(persistent_gnt->node), parent, new);
rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
blkif->persistent_gnt_c++; ring->persistent_gnt_c++;
atomic_inc(&blkif->persistent_gnt_in_use); atomic_inc(&ring->persistent_gnt_in_use);
return 0; return 0;
} }
static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
grant_ref_t gref) grant_ref_t gref)
{ {
struct persistent_gnt *data; struct persistent_gnt *data;
struct rb_node *node = NULL; struct rb_node *node = NULL;
node = blkif->persistent_gnts.rb_node; node = ring->persistent_gnts.rb_node;
while (node) { while (node) {
data = container_of(node, struct persistent_gnt, node); data = container_of(node, struct persistent_gnt, node);
@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
return NULL; return NULL;
} }
set_bit(PERSISTENT_GNT_ACTIVE, data->flags); set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
atomic_inc(&blkif->persistent_gnt_in_use); atomic_inc(&ring->persistent_gnt_in_use);
return data; return data;
} }
} }
return NULL; return NULL;
} }
static void put_persistent_gnt(struct xen_blkif *blkif, static void put_persistent_gnt(struct xen_blkif_ring *ring,
struct persistent_gnt *persistent_gnt) struct persistent_gnt *persistent_gnt)
{ {
if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
pr_alert_ratelimited("freeing a grant already unused\n"); pr_alert_ratelimited("freeing a grant already unused\n");
set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
atomic_dec(&blkif->persistent_gnt_in_use); atomic_dec(&ring->persistent_gnt_in_use);
} }
static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
unsigned int num) unsigned int num)
{ {
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
segs_to_unmap = 0; segs_to_unmap = 0;
} }
@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt; struct persistent_gnt *persistent_gnt;
int segs_to_unmap = 0; int segs_to_unmap = 0;
struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
struct gntab_unmap_queue_data unmap_data; struct gntab_unmap_queue_data unmap_data;
unmap_data.pages = pages; unmap_data.pages = pages;
unmap_data.unmap_ops = unmap; unmap_data.unmap_ops = unmap;
unmap_data.kunmap_ops = NULL; unmap_data.kunmap_ops = NULL;
while(!list_empty(&blkif->persistent_purge_list)) { while(!list_empty(&ring->persistent_purge_list)) {
persistent_gnt = list_first_entry(&blkif->persistent_purge_list, persistent_gnt = list_first_entry(&ring->persistent_purge_list,
struct persistent_gnt, struct persistent_gnt,
remove_node); remove_node);
list_del(&persistent_gnt->remove_node); list_del(&persistent_gnt->remove_node);
@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
segs_to_unmap = 0; segs_to_unmap = 0;
} }
kfree(persistent_gnt); kfree(persistent_gnt);
@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
if (segs_to_unmap > 0) { if (segs_to_unmap > 0) {
unmap_data.count = segs_to_unmap; unmap_data.count = segs_to_unmap;
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
put_free_pages(blkif, pages, segs_to_unmap); put_free_pages(ring, pages, segs_to_unmap);
} }
} }
static void purge_persistent_gnt(struct xen_blkif *blkif) static void purge_persistent_gnt(struct xen_blkif_ring *ring)
{ {
struct persistent_gnt *persistent_gnt; struct persistent_gnt *persistent_gnt;
struct rb_node *n; struct rb_node *n;
@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
bool scan_used = false, clean_used = false; bool scan_used = false, clean_used = false;
struct rb_root *root; struct rb_root *root;
if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
(blkif->persistent_gnt_c == xen_blkif_max_pgrants && (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
!blkif->vbd.overflow_max_grants)) { !ring->blkif->vbd.overflow_max_grants)) {
return; goto out;
} }
if (work_busy(&blkif->persistent_purge_work)) { if (work_busy(&ring->persistent_purge_work)) {
pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
return; goto out;
} }
num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
num_clean = min(blkif->persistent_gnt_c, num_clean); num_clean = min(ring->persistent_gnt_c, num_clean);
if ((num_clean == 0) || if ((num_clean == 0) ||
(num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
return; goto out;
/* /*
* At this point, we can assure that there will be no calls * At this point, we can assure that there will be no calls
@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
pr_debug("Going to purge %u persistent grants\n", num_clean); pr_debug("Going to purge %u persistent grants\n", num_clean);
BUG_ON(!list_empty(&blkif->persistent_purge_list)); BUG_ON(!list_empty(&ring->persistent_purge_list));
root = &blkif->persistent_gnts; root = &ring->persistent_gnts;
purge_list: purge_list:
foreach_grant_safe(persistent_gnt, n, root, node) { foreach_grant_safe(persistent_gnt, n, root, node) {
BUG_ON(persistent_gnt->handle == BUG_ON(persistent_gnt->handle ==
@ -414,7 +425,7 @@ purge_list:
rb_erase(&persistent_gnt->node, root); rb_erase(&persistent_gnt->node, root);
list_add(&persistent_gnt->remove_node, list_add(&persistent_gnt->remove_node,
&blkif->persistent_purge_list); &ring->persistent_purge_list);
if (--num_clean == 0) if (--num_clean == 0)
goto finished; goto finished;
} }
@ -435,30 +446,32 @@ finished:
goto purge_list; goto purge_list;
} }
blkif->persistent_gnt_c -= (total - num_clean); ring->persistent_gnt_c -= (total - num_clean);
blkif->vbd.overflow_max_grants = 0; ring->blkif->vbd.overflow_max_grants = 0;
/* We can defer this work */ /* We can defer this work */
schedule_work(&blkif->persistent_purge_work); schedule_work(&ring->persistent_purge_work);
pr_debug("Purged %u/%u\n", (total - num_clean), total); pr_debug("Purged %u/%u\n", (total - num_clean), total);
out:
return; return;
} }
/* /*
* Retrieve from the 'pending_reqs' a free pending_req structure to be used. * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
*/ */
static struct pending_req *alloc_req(struct xen_blkif *blkif) static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
{ {
struct pending_req *req = NULL; struct pending_req *req = NULL;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&blkif->pending_free_lock, flags); spin_lock_irqsave(&ring->pending_free_lock, flags);
if (!list_empty(&blkif->pending_free)) { if (!list_empty(&ring->pending_free)) {
req = list_entry(blkif->pending_free.next, struct pending_req, req = list_entry(ring->pending_free.next, struct pending_req,
free_list); free_list);
list_del(&req->free_list); list_del(&req->free_list);
} }
spin_unlock_irqrestore(&blkif->pending_free_lock, flags); spin_unlock_irqrestore(&ring->pending_free_lock, flags);
return req; return req;
} }
@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
* Return the 'pending_req' structure back to the freepool. We also * Return the 'pending_req' structure back to the freepool. We also
* wake up the thread if it was waiting for a free page. * wake up the thread if it was waiting for a free page.
*/ */
static void free_req(struct xen_blkif *blkif, struct pending_req *req) static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
{ {
unsigned long flags; unsigned long flags;
int was_empty; int was_empty;
spin_lock_irqsave(&blkif->pending_free_lock, flags); spin_lock_irqsave(&ring->pending_free_lock, flags);
was_empty = list_empty(&blkif->pending_free); was_empty = list_empty(&ring->pending_free);
list_add(&req->free_list, &blkif->pending_free); list_add(&req->free_list, &ring->pending_free);
spin_unlock_irqrestore(&blkif->pending_free_lock, flags); spin_unlock_irqrestore(&ring->pending_free_lock, flags);
if (was_empty) if (was_empty)
wake_up(&blkif->pending_free_wq); wake_up(&ring->pending_free_wq);
} }
/* /*
@ -556,10 +569,10 @@ abort:
/* /*
* Notification from the guest OS. * Notification from the guest OS.
*/ */
static void blkif_notify_work(struct xen_blkif *blkif) static void blkif_notify_work(struct xen_blkif_ring *ring)
{ {
blkif->waiting_reqs = 1; ring->waiting_reqs = 1;
wake_up(&blkif->wq); wake_up(&ring->wq);
} }
irqreturn_t xen_blkif_be_int(int irq, void *dev_id) irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
* SCHEDULER FUNCTIONS * SCHEDULER FUNCTIONS
*/ */
static void print_stats(struct xen_blkif *blkif) static void print_stats(struct xen_blkif_ring *ring)
{ {
pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
" | ds %4llu | pg: %4u/%4d\n", " | ds %4llu | pg: %4u/%4d\n",
current->comm, blkif->st_oo_req, current->comm, ring->st_oo_req,
blkif->st_rd_req, blkif->st_wr_req, ring->st_rd_req, ring->st_wr_req,
blkif->st_f_req, blkif->st_ds_req, ring->st_f_req, ring->st_ds_req,
blkif->persistent_gnt_c, ring->persistent_gnt_c,
xen_blkif_max_pgrants); xen_blkif_max_pgrants);
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
blkif->st_rd_req = 0; ring->st_rd_req = 0;
blkif->st_wr_req = 0; ring->st_wr_req = 0;
blkif->st_oo_req = 0; ring->st_oo_req = 0;
blkif->st_ds_req = 0; ring->st_ds_req = 0;
} }
int xen_blkif_schedule(void *arg) int xen_blkif_schedule(void *arg)
{ {
struct xen_blkif *blkif = arg; struct xen_blkif_ring *ring = arg;
struct xen_blkif *blkif = ring->blkif;
struct xen_vbd *vbd = &blkif->vbd; struct xen_vbd *vbd = &blkif->vbd;
unsigned long timeout; unsigned long timeout;
int ret; int ret;
xen_blkif_get(blkif); xen_blkif_get(blkif);
set_freezable();
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
if (try_to_freeze()) if (try_to_freeze())
continue; continue;
@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = msecs_to_jiffies(LRU_INTERVAL);
timeout = wait_event_interruptible_timeout( timeout = wait_event_interruptible_timeout(
blkif->wq, ring->wq,
blkif->waiting_reqs || kthread_should_stop(), ring->waiting_reqs || kthread_should_stop(),
timeout); timeout);
if (timeout == 0) if (timeout == 0)
goto purge_gnt_list; goto purge_gnt_list;
timeout = wait_event_interruptible_timeout( timeout = wait_event_interruptible_timeout(
blkif->pending_free_wq, ring->pending_free_wq,
!list_empty(&blkif->pending_free) || !list_empty(&ring->pending_free) ||
kthread_should_stop(), kthread_should_stop(),
timeout); timeout);
if (timeout == 0) if (timeout == 0)
goto purge_gnt_list; goto purge_gnt_list;
blkif->waiting_reqs = 0; ring->waiting_reqs = 0;
smp_mb(); /* clear flag *before* checking for work */ smp_mb(); /* clear flag *before* checking for work */
ret = do_block_io_op(blkif); ret = do_block_io_op(ring);
if (ret > 0) if (ret > 0)
blkif->waiting_reqs = 1; ring->waiting_reqs = 1;
if (ret == -EACCES) if (ret == -EACCES)
wait_event_interruptible(blkif->shutdown_wq, wait_event_interruptible(ring->shutdown_wq,
kthread_should_stop()); kthread_should_stop());
purge_gnt_list: purge_gnt_list:
if (blkif->vbd.feature_gnt_persistent && if (blkif->vbd.feature_gnt_persistent &&
time_after(jiffies, blkif->next_lru)) { time_after(jiffies, ring->next_lru)) {
purge_persistent_gnt(blkif); purge_persistent_gnt(ring);
blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
} }
/* Shrink if we have more than xen_blkif_max_buffer_pages */ /* Shrink if we have more than xen_blkif_max_buffer_pages */
shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
if (log_stats && time_after(jiffies, blkif->st_print)) if (log_stats && time_after(jiffies, ring->st_print))
print_stats(blkif); print_stats(ring);
} }
/* Drain pending purge work */ /* Drain pending purge work */
flush_work(&blkif->persistent_purge_work); flush_work(&ring->persistent_purge_work);
if (log_stats) if (log_stats)
print_stats(blkif); print_stats(ring);
blkif->xenblkd = NULL; ring->xenblkd = NULL;
xen_blkif_put(blkif); xen_blkif_put(blkif);
return 0; return 0;
@ -658,22 +673,22 @@ purge_gnt_list:
/* /*
* Remove persistent grants and empty the pool of free pages * Remove persistent grants and empty the pool of free pages
*/ */
void xen_blkbk_free_caches(struct xen_blkif *blkif) void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
{ {
/* Free all persistent grant pages */ /* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
free_persistent_gnts(blkif, &blkif->persistent_gnts, free_persistent_gnts(ring, &ring->persistent_gnts,
blkif->persistent_gnt_c); ring->persistent_gnt_c);
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
blkif->persistent_gnt_c = 0; ring->persistent_gnt_c = 0;
/* Since we are shutting down remove all pages from the buffer */ /* Since we are shutting down remove all pages from the buffer */
shrink_free_pagepool(blkif, 0 /* All */); shrink_free_pagepool(ring, 0 /* All */);
} }
static unsigned int xen_blkbk_unmap_prepare( static unsigned int xen_blkbk_unmap_prepare(
struct xen_blkif *blkif, struct xen_blkif_ring *ring,
struct grant_page **pages, struct grant_page **pages,
unsigned int num, unsigned int num,
struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *unmap_ops,
@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
if (pages[i]->persistent_gnt != NULL) { if (pages[i]->persistent_gnt != NULL) {
put_persistent_gnt(blkif, pages[i]->persistent_gnt); put_persistent_gnt(ring, pages[i]->persistent_gnt);
continue; continue;
} }
if (pages[i]->handle == BLKBACK_INVALID_HANDLE) if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
{ {
struct pending_req* pending_req = (struct pending_req*) (data->data); struct pending_req *pending_req = (struct pending_req *)(data->data);
struct xen_blkif *blkif = pending_req->blkif; struct xen_blkif_ring *ring = pending_req->ring;
struct xen_blkif *blkif = ring->blkif;
/* BUG_ON used to reproduce existing behaviour, /* BUG_ON used to reproduce existing behaviour,
but is this the best way to deal with this? */ but is this the best way to deal with this? */
BUG_ON(result); BUG_ON(result);
put_free_pages(blkif, data->pages, data->count); put_free_pages(ring, data->pages, data->count);
make_response(blkif, pending_req->id, make_response(ring, pending_req->id,
pending_req->operation, pending_req->status); pending_req->operation, pending_req->status);
free_req(blkif, pending_req); free_req(ring, pending_req);
/* /*
* Make sure the request is freed before releasing blkif, * Make sure the request is freed before releasing blkif,
* or there could be a race between free_req and the * or there could be a race between free_req and the
@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
* pending_free_wq if there's a drain going on, but it has * pending_free_wq if there's a drain going on, but it has
* to be taken into account if the current model is changed. * to be taken into account if the current model is changed.
*/ */
if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
complete(&blkif->drain_complete); complete(&blkif->drain_complete);
} }
xen_blkif_put(blkif); xen_blkif_put(blkif);
@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
static void xen_blkbk_unmap_and_respond(struct pending_req *req) static void xen_blkbk_unmap_and_respond(struct pending_req *req)
{ {
struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
struct xen_blkif *blkif = req->blkif; struct xen_blkif_ring *ring = req->ring;
struct grant_page **pages = req->segments; struct grant_page **pages = req->segments;
unsigned int invcount; unsigned int invcount;
invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
req->unmap, req->unmap_pages); req->unmap, req->unmap_pages);
work->data = req; work->data = req;
@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
* of hypercalls, but since this is only used in error paths there's * of hypercalls, but since this is only used in error paths there's
* no real need. * no real need.
*/ */
static void xen_blkbk_unmap(struct xen_blkif *blkif, static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
struct grant_page *pages[], struct grant_page *pages[],
int num) int num)
{ {
@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
while (num) { while (num) {
unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
unmap, unmap_pages); unmap, unmap_pages);
if (invcount) { if (invcount) {
ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
BUG_ON(ret); BUG_ON(ret);
put_free_pages(blkif, unmap_pages, invcount); put_free_pages(ring, unmap_pages, invcount);
} }
pages += batch; pages += batch;
num -= batch; num -= batch;
} }
} }
static int xen_blkbk_map(struct xen_blkif *blkif, static int xen_blkbk_map(struct xen_blkif_ring *ring,
struct grant_page *pages[], struct grant_page *pages[],
int num, bool ro) int num, bool ro)
{ {
@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
int ret = 0; int ret = 0;
int last_map = 0, map_until = 0; int last_map = 0, map_until = 0;
int use_persistent_gnts; int use_persistent_gnts;
struct xen_blkif *blkif = ring->blkif;
use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
@ -806,10 +823,11 @@ again:
for (i = map_until; i < num; i++) { for (i = map_until; i < num; i++) {
uint32_t flags; uint32_t flags;
if (use_persistent_gnts) if (use_persistent_gnts) {
persistent_gnt = get_persistent_gnt( persistent_gnt = get_persistent_gnt(
blkif, ring,
pages[i]->gref); pages[i]->gref);
}
if (persistent_gnt) { if (persistent_gnt) {
/* /*
@ -819,7 +837,7 @@ again:
pages[i]->page = persistent_gnt->page; pages[i]->page = persistent_gnt->page;
pages[i]->persistent_gnt = persistent_gnt; pages[i]->persistent_gnt = persistent_gnt;
} else { } else {
if (get_free_page(blkif, &pages[i]->page)) if (get_free_page(ring, &pages[i]->page))
goto out_of_memory; goto out_of_memory;
addr = vaddr(pages[i]->page); addr = vaddr(pages[i]->page);
pages_to_gnt[segs_to_map] = pages[i]->page; pages_to_gnt[segs_to_map] = pages[i]->page;
@ -852,7 +870,7 @@ again:
BUG_ON(new_map_idx >= segs_to_map); BUG_ON(new_map_idx >= segs_to_map);
if (unlikely(map[new_map_idx].status != 0)) { if (unlikely(map[new_map_idx].status != 0)) {
pr_debug("invalid buffer -- could not remap it\n"); pr_debug("invalid buffer -- could not remap it\n");
put_free_pages(blkif, &pages[seg_idx]->page, 1); put_free_pages(ring, &pages[seg_idx]->page, 1);
pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
ret |= 1; ret |= 1;
goto next; goto next;
@ -862,7 +880,7 @@ again:
continue; continue;
} }
if (use_persistent_gnts && if (use_persistent_gnts &&
blkif->persistent_gnt_c < xen_blkif_max_pgrants) { ring->persistent_gnt_c < xen_blkif_max_pgrants) {
/* /*
* We are using persistent grants, the grant is * We are using persistent grants, the grant is
* not mapped but we might have room for it. * not mapped but we might have room for it.
@ -880,7 +898,7 @@ again:
persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->gnt = map[new_map_idx].ref;
persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->handle = map[new_map_idx].handle;
persistent_gnt->page = pages[seg_idx]->page; persistent_gnt->page = pages[seg_idx]->page;
if (add_persistent_gnt(blkif, if (add_persistent_gnt(ring,
persistent_gnt)) { persistent_gnt)) {
kfree(persistent_gnt); kfree(persistent_gnt);
persistent_gnt = NULL; persistent_gnt = NULL;
@ -888,7 +906,7 @@ again:
} }
pages[seg_idx]->persistent_gnt = persistent_gnt; pages[seg_idx]->persistent_gnt = persistent_gnt;
pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
persistent_gnt->gnt, blkif->persistent_gnt_c, persistent_gnt->gnt, ring->persistent_gnt_c,
xen_blkif_max_pgrants); xen_blkif_max_pgrants);
goto next; goto next;
} }
@ -913,7 +931,7 @@ next:
out_of_memory: out_of_memory:
pr_alert("%s: out of memory\n", __func__); pr_alert("%s: out of memory\n", __func__);
put_free_pages(blkif, pages_to_gnt, segs_to_map); put_free_pages(ring, pages_to_gnt, segs_to_map);
return -ENOMEM; return -ENOMEM;
} }
@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
{ {
int rc; int rc;
rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
pending_req->nr_segs, pending_req->nr_segs,
(pending_req->operation != BLKIF_OP_READ)); (pending_req->operation != BLKIF_OP_READ));
@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
struct phys_req *preq) struct phys_req *preq)
{ {
struct grant_page **pages = pending_req->indirect_pages; struct grant_page **pages = pending_req->indirect_pages;
struct xen_blkif *blkif = pending_req->blkif; struct xen_blkif_ring *ring = pending_req->ring;
int indirect_grefs, rc, n, nseg, i; int indirect_grefs, rc, n, nseg, i;
struct blkif_request_segment *segments = NULL; struct blkif_request_segment *segments = NULL;
@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
for (i = 0; i < indirect_grefs; i++) for (i = 0; i < indirect_grefs; i++)
pages[i]->gref = req->u.indirect.indirect_grefs[i]; pages[i]->gref = req->u.indirect.indirect_grefs[i];
rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
if (rc) if (rc)
goto unmap; goto unmap;
@ -972,15 +990,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
unmap: unmap:
if (segments) if (segments)
kunmap_atomic(segments); kunmap_atomic(segments);
xen_blkbk_unmap(blkif, pages, indirect_grefs); xen_blkbk_unmap(ring, pages, indirect_grefs);
return rc; return rc;
} }
static int dispatch_discard_io(struct xen_blkif *blkif, static int dispatch_discard_io(struct xen_blkif_ring *ring,
struct blkif_request *req) struct blkif_request *req)
{ {
int err = 0; int err = 0;
int status = BLKIF_RSP_OKAY; int status = BLKIF_RSP_OKAY;
struct xen_blkif *blkif = ring->blkif;
struct block_device *bdev = blkif->vbd.bdev; struct block_device *bdev = blkif->vbd.bdev;
unsigned long secure; unsigned long secure;
struct phys_req preq; struct phys_req preq;
@ -997,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
goto fail_response; goto fail_response;
} }
blkif->st_ds_req++; ring->st_ds_req++;
secure = (blkif->vbd.discard_secure && secure = (blkif->vbd.discard_secure &&
(req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@ -1013,26 +1032,28 @@ fail_response:
} else if (err) } else if (err)
status = BLKIF_RSP_ERROR; status = BLKIF_RSP_ERROR;
make_response(blkif, req->u.discard.id, req->operation, status); make_response(ring, req->u.discard.id, req->operation, status);
xen_blkif_put(blkif); xen_blkif_put(blkif);
return err; return err;
} }
static int dispatch_other_io(struct xen_blkif *blkif, static int dispatch_other_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req) struct pending_req *pending_req)
{ {
free_req(blkif, pending_req); free_req(ring, pending_req);
make_response(blkif, req->u.other.id, req->operation, make_response(ring, req->u.other.id, req->operation,
BLKIF_RSP_EOPNOTSUPP); BLKIF_RSP_EOPNOTSUPP);
return -EIO; return -EIO;
} }
static void xen_blk_drain_io(struct xen_blkif *blkif) static void xen_blk_drain_io(struct xen_blkif_ring *ring)
{ {
struct xen_blkif *blkif = ring->blkif;
atomic_set(&blkif->drain, 1); atomic_set(&blkif->drain, 1);
do { do {
if (atomic_read(&blkif->inflight) == 0) if (atomic_read(&ring->inflight) == 0)
break; break;
wait_for_completion_interruptible_timeout( wait_for_completion_interruptible_timeout(
&blkif->drain_complete, HZ); &blkif->drain_complete, HZ);
@ -1053,12 +1074,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
(error == -EOPNOTSUPP)) { (error == -EOPNOTSUPP)) {
pr_debug("flush diskcache op failed, not supported\n"); pr_debug("flush diskcache op failed, not supported\n");
xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
pending_req->status = BLKIF_RSP_EOPNOTSUPP; pending_req->status = BLKIF_RSP_EOPNOTSUPP;
} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
(error == -EOPNOTSUPP)) { (error == -EOPNOTSUPP)) {
pr_debug("write barrier op failed, not supported\n"); pr_debug("write barrier op failed, not supported\n");
xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
pending_req->status = BLKIF_RSP_EOPNOTSUPP; pending_req->status = BLKIF_RSP_EOPNOTSUPP;
} else if (error) { } else if (error) {
pr_debug("Buffer not up-to-date at end of operation," pr_debug("Buffer not up-to-date at end of operation,"
@ -1092,9 +1113,9 @@ static void end_block_io_op(struct bio *bio)
* and transmute it to the block API to hand it over to the proper block disk. * and transmute it to the block API to hand it over to the proper block disk.
*/ */
static int static int
__do_block_io_op(struct xen_blkif *blkif) __do_block_io_op(struct xen_blkif_ring *ring)
{ {
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings = &ring->blk_rings;
struct blkif_request req; struct blkif_request req;
struct pending_req *pending_req; struct pending_req *pending_req;
RING_IDX rc, rp; RING_IDX rc, rp;
@ -1107,7 +1128,7 @@ __do_block_io_op(struct xen_blkif *blkif)
if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
rc = blk_rings->common.rsp_prod_pvt; rc = blk_rings->common.rsp_prod_pvt;
pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
rp, rc, rp - rc, blkif->vbd.pdevice); rp, rc, rp - rc, ring->blkif->vbd.pdevice);
return -EACCES; return -EACCES;
} }
while (rc != rp) { while (rc != rp) {
@ -1120,14 +1141,14 @@ __do_block_io_op(struct xen_blkif *blkif)
break; break;
} }
pending_req = alloc_req(blkif); pending_req = alloc_req(ring);
if (NULL == pending_req) { if (NULL == pending_req) {
blkif->st_oo_req++; ring->st_oo_req++;
more_to_do = 1; more_to_do = 1;
break; break;
} }
switch (blkif->blk_protocol) { switch (ring->blkif->blk_protocol) {
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
break; break;
@ -1151,16 +1172,16 @@ __do_block_io_op(struct xen_blkif *blkif)
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_INDIRECT: case BLKIF_OP_INDIRECT:
if (dispatch_rw_block_io(blkif, &req, pending_req)) if (dispatch_rw_block_io(ring, &req, pending_req))
goto done; goto done;
break; break;
case BLKIF_OP_DISCARD: case BLKIF_OP_DISCARD:
free_req(blkif, pending_req); free_req(ring, pending_req);
if (dispatch_discard_io(blkif, &req)) if (dispatch_discard_io(ring, &req))
goto done; goto done;
break; break;
default: default:
if (dispatch_other_io(blkif, &req, pending_req)) if (dispatch_other_io(ring, &req, pending_req))
goto done; goto done;
break; break;
} }
@ -1173,13 +1194,13 @@ done:
} }
static int static int
do_block_io_op(struct xen_blkif *blkif) do_block_io_op(struct xen_blkif_ring *ring)
{ {
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings = &ring->blk_rings;
int more_to_do; int more_to_do;
do { do {
more_to_do = __do_block_io_op(blkif); more_to_do = __do_block_io_op(ring);
if (more_to_do) if (more_to_do)
break; break;
@ -1192,7 +1213,7 @@ do_block_io_op(struct xen_blkif *blkif)
* Transmutation of the 'struct blkif_request' to a proper 'struct bio' * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
* and call the 'submit_bio' to pass it to the underlying storage. * and call the 'submit_bio' to pass it to the underlying storage.
*/ */
static int dispatch_rw_block_io(struct xen_blkif *blkif, static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
struct blkif_request *req, struct blkif_request *req,
struct pending_req *pending_req) struct pending_req *pending_req)
{ {
@ -1220,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
switch (req_operation) { switch (req_operation) {
case BLKIF_OP_READ: case BLKIF_OP_READ:
blkif->st_rd_req++; ring->st_rd_req++;
operation = READ; operation = READ;
break; break;
case BLKIF_OP_WRITE: case BLKIF_OP_WRITE:
blkif->st_wr_req++; ring->st_wr_req++;
operation = WRITE_ODIRECT; operation = WRITE_ODIRECT;
break; break;
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
drain = true; drain = true;
case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_FLUSH_DISKCACHE:
blkif->st_f_req++; ring->st_f_req++;
operation = WRITE_FLUSH; operation = WRITE_FLUSH;
break; break;
default: default:
@ -1255,7 +1276,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
preq.nr_sects = 0; preq.nr_sects = 0;
pending_req->blkif = blkif; pending_req->ring = ring;
pending_req->id = req->u.rw.id; pending_req->id = req->u.rw.id;
pending_req->operation = req_operation; pending_req->operation = req_operation;
pending_req->status = BLKIF_RSP_OKAY; pending_req->status = BLKIF_RSP_OKAY;
@ -1282,12 +1303,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
goto fail_response; goto fail_response;
} }
if (xen_vbd_translate(&preq, blkif, operation) != 0) { if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
operation == READ ? "read" : "write", operation == READ ? "read" : "write",
preq.sector_number, preq.sector_number,
preq.sector_number + preq.nr_sects, preq.sector_number + preq.nr_sects,
blkif->vbd.pdevice); ring->blkif->vbd.pdevice);
goto fail_response; goto fail_response;
} }
@ -1299,7 +1320,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
if (((int)preq.sector_number|(int)seg[i].nsec) & if (((int)preq.sector_number|(int)seg[i].nsec) &
((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
pr_debug("Misaligned I/O request from domain %d\n", pr_debug("Misaligned I/O request from domain %d\n",
blkif->domid); ring->blkif->domid);
goto fail_response; goto fail_response;
} }
} }
@ -1308,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* issue the WRITE_FLUSH. * issue the WRITE_FLUSH.
*/ */
if (drain) if (drain)
xen_blk_drain_io(pending_req->blkif); xen_blk_drain_io(pending_req->ring);
/* /*
* If we have failed at this point, we need to undo the M2P override, * If we have failed at this point, we need to undo the M2P override,
@ -1323,8 +1344,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* This corresponding xen_blkif_put is done in __end_block_io_op, or * This corresponding xen_blkif_put is done in __end_block_io_op, or
* below (in "!bio") if we are handling a BLKIF_OP_DISCARD. * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
*/ */
xen_blkif_get(blkif); xen_blkif_get(ring->blkif);
atomic_inc(&blkif->inflight); atomic_inc(&ring->inflight);
for (i = 0; i < nseg; i++) { for (i = 0; i < nseg; i++) {
while ((bio == NULL) || while ((bio == NULL) ||
@ -1372,19 +1393,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
blk_finish_plug(&plug); blk_finish_plug(&plug);
if (operation == READ) if (operation == READ)
blkif->st_rd_sect += preq.nr_sects; ring->st_rd_sect += preq.nr_sects;
else if (operation & WRITE) else if (operation & WRITE)
blkif->st_wr_sect += preq.nr_sects; ring->st_wr_sect += preq.nr_sects;
return 0; return 0;
fail_flush: fail_flush:
xen_blkbk_unmap(blkif, pending_req->segments, xen_blkbk_unmap(ring, pending_req->segments,
pending_req->nr_segs); pending_req->nr_segs);
fail_response: fail_response:
/* Haven't submitted any bio's yet. */ /* Haven't submitted any bio's yet. */
make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
free_req(blkif, pending_req); free_req(ring, pending_req);
msleep(1); /* back off a bit */ msleep(1); /* back off a bit */
return -EIO; return -EIO;
@ -1402,21 +1423,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
/* /*
* Put a response on the ring on how the operation fared. * Put a response on the ring on how the operation fared.
*/ */
static void make_response(struct xen_blkif *blkif, u64 id, static void make_response(struct xen_blkif_ring *ring, u64 id,
unsigned short op, int st) unsigned short op, int st)
{ {
struct blkif_response resp; struct blkif_response resp;
unsigned long flags; unsigned long flags;
union blkif_back_rings *blk_rings = &blkif->blk_rings; union blkif_back_rings *blk_rings;
int notify; int notify;
resp.id = id; resp.id = id;
resp.operation = op; resp.operation = op;
resp.status = st; resp.status = st;
spin_lock_irqsave(&blkif->blk_ring_lock, flags); spin_lock_irqsave(&ring->blk_ring_lock, flags);
blk_rings = &ring->blk_rings;
/* Place on the response ring for the relevant domain. */ /* Place on the response ring for the relevant domain. */
switch (blkif->blk_protocol) { switch (ring->blkif->blk_protocol) {
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
&resp, sizeof(resp)); &resp, sizeof(resp));
@ -1434,9 +1456,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
} }
blk_rings->common.rsp_prod_pvt++; blk_rings->common.rsp_prod_pvt++;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
if (notify) if (notify)
notify_remote_via_irq(blkif->irq); notify_remote_via_irq(ring->irq);
} }
static int __init xen_blkif_init(void) static int __init xen_blkif_init(void)
@ -1452,6 +1474,9 @@ static int __init xen_blkif_init(void)
xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
} }
if (xenblk_max_queues == 0)
xenblk_max_queues = num_online_cpus();
rc = xen_blkif_interface_init(); rc = xen_blkif_interface_init();
if (rc) if (rc)
goto failed_init; goto failed_init;

View File

@ -46,6 +46,7 @@
#include <xen/interface/io/protocols.h> #include <xen/interface/io/protocols.h>
extern unsigned int xen_blkif_max_ring_order; extern unsigned int xen_blkif_max_ring_order;
extern unsigned int xenblk_max_queues;
/* /*
* This is the maximum number of segments that would be allowed in indirect * This is the maximum number of segments that would be allowed in indirect
* requests. This value will also be passed to the frontend. * requests. This value will also be passed to the frontend.
@ -269,68 +270,79 @@ struct persistent_gnt {
struct list_head remove_node; struct list_head remove_node;
}; };
struct xen_blkif { /* Per-ring information. */
/* Unique identifier for this interface. */ struct xen_blkif_ring {
domid_t domid;
unsigned int handle;
/* Physical parameters of the comms window. */ /* Physical parameters of the comms window. */
unsigned int irq; unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
union blkif_back_rings blk_rings; union blkif_back_rings blk_rings;
void *blk_ring; void *blk_ring;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
/* Private fields. */ /* Private fields. */
spinlock_t blk_ring_lock; spinlock_t blk_ring_lock;
atomic_t refcnt;
wait_queue_head_t wq; wait_queue_head_t wq;
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
atomic_t inflight; atomic_t inflight;
/* One thread per one blkif. */ /* One thread per blkif ring. */
struct task_struct *xenblkd; struct task_struct *xenblkd;
unsigned int waiting_reqs; unsigned int waiting_reqs;
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
atomic_t persistent_gnt_in_use;
unsigned long next_lru;
/* used by the kworker that offload work from the persistent purge */
struct list_head persistent_purge_list;
struct work_struct persistent_purge_work;
/* buffer of free pages to map grant refs */
spinlock_t free_pages_lock;
int free_pages_num;
struct list_head free_pages;
/* List of all 'pending_req' available */ /* List of all 'pending_req' available */
struct list_head pending_free; struct list_head pending_free;
/* And its spinlock. */ /* And its spinlock. */
spinlock_t pending_free_lock; spinlock_t pending_free_lock;
wait_queue_head_t pending_free_wq; wait_queue_head_t pending_free_wq;
/* statistics */ /* Tree to store persistent grants. */
spinlock_t pers_gnts_lock;
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
atomic_t persistent_gnt_in_use;
unsigned long next_lru;
/* Statistics. */
unsigned long st_print; unsigned long st_print;
unsigned long long st_rd_req; unsigned long long st_rd_req;
unsigned long long st_wr_req; unsigned long long st_wr_req;
unsigned long long st_oo_req; unsigned long long st_oo_req;
unsigned long long st_f_req; unsigned long long st_f_req;
unsigned long long st_ds_req; unsigned long long st_ds_req;
unsigned long long st_rd_sect; unsigned long long st_rd_sect;
unsigned long long st_wr_sect; unsigned long long st_wr_sect;
/* Used by the kworker that offload work from the persistent purge. */
struct list_head persistent_purge_list;
struct work_struct persistent_purge_work;
/* Buffer of free pages to map grant refs. */
spinlock_t free_pages_lock;
int free_pages_num;
struct list_head free_pages;
struct work_struct free_work; struct work_struct free_work;
/* Thread shutdown wait queue. */ /* Thread shutdown wait queue. */
wait_queue_head_t shutdown_wq; wait_queue_head_t shutdown_wq;
unsigned int nr_ring_pages; struct xen_blkif *blkif;
};
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
unsigned int handle;
/* Comms information. */
enum blkif_protocol blk_protocol;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
atomic_t refcnt;
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
struct work_struct free_work;
unsigned int nr_ring_pages;
/* All rings for this device. */
struct xen_blkif_ring *rings;
unsigned int nr_rings;
}; };
struct seg_buf { struct seg_buf {
@ -352,7 +364,7 @@ struct grant_page {
* response queued for it, with the saved 'id' passed back. * response queued for it, with the saved 'id' passed back.
*/ */
struct pending_req { struct pending_req {
struct xen_blkif *blkif; struct xen_blkif_ring *ring;
u64 id; u64 id;
int nr_segs; int nr_segs;
atomic_t pendcnt; atomic_t pendcnt;
@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id); irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg); int xen_blkif_schedule(void *arg);
int xen_blkif_purge_persistent(void *arg); int xen_blkif_purge_persistent(void *arg);
void xen_blkbk_free_caches(struct xen_blkif *blkif); void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state); struct backend_info *be, int state);

View File

@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
{ {
int err; int err;
char name[BLKBACK_NAME_LEN]; char name[BLKBACK_NAME_LEN];
struct xen_blkif_ring *ring;
int i;
/* Not ready to connect? */ /* Not ready to connect? */
if (!blkif->irq || !blkif->vbd.bdev) if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
return; return;
/* Already connected? */ /* Already connected? */
@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
} }
invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); for (i = 0; i < blkif->nr_rings; i++) {
if (IS_ERR(blkif->xenblkd)) { ring = &blkif->rings[i];
err = PTR_ERR(blkif->xenblkd); ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
blkif->xenblkd = NULL; if (IS_ERR(ring->xenblkd)) {
xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); err = PTR_ERR(ring->xenblkd);
return; ring->xenblkd = NULL;
xenbus_dev_fatal(blkif->be->dev, err,
"start %s-%d xenblkd", name, i);
goto out;
}
} }
return;
out:
while (--i >= 0) {
ring = &blkif->rings[i];
kthread_stop(ring->xenblkd);
}
return;
}
/*
 * Allocate and initialise the per-ring state for @blkif.
 *
 * blkif->nr_rings must already hold the number of rings to create.  On
 * success blkif->rings points to an array of nr_rings zero-filled
 * struct xen_blkif_ring entries whose locks, wait queues, lists and purge
 * work item have been initialised, and each ring holds one reference on
 * @blkif (taken with xen_blkif_get(); xen_blkif_disconnect() drops one
 * reference per ring).
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
{
	unsigned int r;

	/*
	 * kcalloc() rather than kzalloc(n * size): same zeroed memory, but the
	 * element-count multiplication is checked for overflow.  The zero fill
	 * matters: the persistent-grant tree root, the grant/page counters,
	 * the inflight/in-use atomics and the statistics of every ring are
	 * not initialised explicitly below and rely on starting out as 0/NULL.
	 */
	blkif->rings = kcalloc(blkif->nr_rings, sizeof(*blkif->rings),
			       GFP_KERNEL);
	if (!blkif->rings)
		return -ENOMEM;

	for (r = 0; r < blkif->nr_rings; r++) {
		struct xen_blkif_ring *ring = &blkif->rings[r];

		spin_lock_init(&ring->blk_ring_lock);
		init_waitqueue_head(&ring->wq);
		INIT_LIST_HEAD(&ring->pending_free);
		INIT_LIST_HEAD(&ring->persistent_purge_list);
		INIT_WORK(&ring->persistent_purge_work,
			  xen_blkbk_unmap_purged_grants);
		spin_lock_init(&ring->free_pages_lock);
		INIT_LIST_HEAD(&ring->free_pages);

		spin_lock_init(&ring->pending_free_lock);
		init_waitqueue_head(&ring->pending_free_wq);
		init_waitqueue_head(&ring->shutdown_wq);
		/*
		 * NOTE(review): struct xen_blkif_ring also declares
		 * pers_gnts_lock, which is not initialised here - confirm it
		 * is unused, or add a spin_lock_init() for it.
		 */

		/* Back pointer used by the ring code to reach shared state. */
		ring->blkif = blkif;
		ring->st_print = jiffies;
		/* One reference on the interface per ring. */
		xen_blkif_get(blkif);
	}

	return 0;
}
static struct xen_blkif *xen_blkif_alloc(domid_t domid) static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
blkif->domid = domid; blkif->domid = domid;
spin_lock_init(&blkif->blk_ring_lock);
atomic_set(&blkif->refcnt, 1); atomic_set(&blkif->refcnt, 1);
init_waitqueue_head(&blkif->wq);
init_completion(&blkif->drain_complete); init_completion(&blkif->drain_complete);
atomic_set(&blkif->drain, 0);
blkif->st_print = jiffies;
blkif->persistent_gnts.rb_node = NULL;
spin_lock_init(&blkif->free_pages_lock);
INIT_LIST_HEAD(&blkif->free_pages);
INIT_LIST_HEAD(&blkif->persistent_purge_list);
blkif->free_pages_num = 0;
atomic_set(&blkif->persistent_gnt_in_use, 0);
atomic_set(&blkif->inflight, 0);
INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
INIT_LIST_HEAD(&blkif->pending_free);
INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
spin_lock_init(&blkif->pending_free_lock);
init_waitqueue_head(&blkif->pending_free_wq);
init_waitqueue_head(&blkif->shutdown_wq);
return blkif; return blkif;
} }
static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
unsigned int nr_grefs, unsigned int evtchn) unsigned int nr_grefs, unsigned int evtchn)
{ {
int err; int err;
struct xen_blkif *blkif = ring->blkif;
/* Already connected through? */ /* Already connected through? */
if (blkif->irq) if (ring->irq)
return 0; return 0;
err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
&blkif->blk_ring); &ring->blk_ring);
if (err < 0) if (err < 0)
return err; return err;
@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
case BLKIF_PROTOCOL_NATIVE: case BLKIF_PROTOCOL_NATIVE:
{ {
struct blkif_sring *sring; struct blkif_sring *sring;
sring = (struct blkif_sring *)blkif->blk_ring; sring = (struct blkif_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.native, sring, BACK_RING_INIT(&ring->blk_rings.native, sring,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
case BLKIF_PROTOCOL_X86_32: case BLKIF_PROTOCOL_X86_32:
{ {
struct blkif_x86_32_sring *sring_x86_32; struct blkif_x86_32_sring *sring_x86_32;
sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
case BLKIF_PROTOCOL_X86_64: case BLKIF_PROTOCOL_X86_64:
{ {
struct blkif_x86_64_sring *sring_x86_64; struct blkif_x86_64_sring *sring_x86_64;
sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
XEN_PAGE_SIZE * nr_grefs); XEN_PAGE_SIZE * nr_grefs);
break; break;
} }
@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
xen_blkif_be_int, 0, xen_blkif_be_int, 0,
"blkif-backend", blkif); "blkif-backend", ring);
if (err < 0) { if (err < 0) {
xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
blkif->blk_rings.common.sring = NULL; ring->blk_rings.common.sring = NULL;
return err; return err;
} }
blkif->irq = err; ring->irq = err;
return 0; return 0;
} }
@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
static int xen_blkif_disconnect(struct xen_blkif *blkif) static int xen_blkif_disconnect(struct xen_blkif *blkif)
{ {
struct pending_req *req, *n; struct pending_req *req, *n;
int i = 0, j; unsigned int j, r;
if (blkif->xenblkd) { for (r = 0; r < blkif->nr_rings; r++) {
kthread_stop(blkif->xenblkd); struct xen_blkif_ring *ring = &blkif->rings[r];
wake_up(&blkif->shutdown_wq); unsigned int i = 0;
blkif->xenblkd = NULL;
if (ring->xenblkd) {
kthread_stop(ring->xenblkd);
wake_up(&ring->shutdown_wq);
ring->xenblkd = NULL;
}
/* The above kthread_stop() guarantees that at this point we
* don't have any discard_io or other_io requests. So, checking
* for inflight IO is enough.
*/
if (atomic_read(&ring->inflight) > 0)
return -EBUSY;
if (ring->irq) {
unbind_from_irqhandler(ring->irq, ring);
ring->irq = 0;
}
if (ring->blk_rings.common.sring) {
xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
ring->blk_rings.common.sring = NULL;
}
/* Remove all persistent grants and the cache of ballooned pages. */
xen_blkbk_free_caches(ring);
/* Check that there is no request in use */
list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
kfree(req->segments[j]);
for (j = 0; j < MAX_INDIRECT_PAGES; j++)
kfree(req->indirect_pages[j]);
kfree(req);
i++;
}
BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
BUG_ON(!list_empty(&ring->persistent_purge_list));
BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
BUG_ON(!list_empty(&ring->free_pages));
BUG_ON(ring->free_pages_num != 0);
BUG_ON(ring->persistent_gnt_c != 0);
WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
xen_blkif_put(blkif);
} }
/* The above kthread_stop() guarantees that at this point we
* don't have any discard_io or other_io requests. So, checking
* for inflight IO is enough.
*/
if (atomic_read(&blkif->inflight) > 0)
return -EBUSY;
if (blkif->irq) {
unbind_from_irqhandler(blkif->irq, blkif);
blkif->irq = 0;
}
if (blkif->blk_rings.common.sring) {
xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
blkif->blk_rings.common.sring = NULL;
}
/* Remove all persistent grants and the cache of ballooned pages. */
xen_blkbk_free_caches(blkif);
/* Check that there is no request in use */
list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
kfree(req->segments[j]);
for (j = 0; j < MAX_INDIRECT_PAGES; j++)
kfree(req->indirect_pages[j]);
kfree(req);
i++;
}
WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
blkif->nr_ring_pages = 0; blkif->nr_ring_pages = 0;
/*
* blkif->rings was allocated in connect_ring, so we should free it in
* here.
*/
kfree(blkif->rings);
blkif->rings = NULL;
blkif->nr_rings = 0;
return 0; return 0;
} }
@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
xen_vbd_free(&blkif->vbd); xen_vbd_free(&blkif->vbd);
/* Make sure everything is drained before shutting down */ /* Make sure everything is drained before shutting down */
BUG_ON(blkif->persistent_gnt_c != 0);
BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
BUG_ON(blkif->free_pages_num != 0);
BUG_ON(!list_empty(&blkif->persistent_purge_list));
BUG_ON(!list_empty(&blkif->free_pages));
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
kmem_cache_free(xen_blkif_cachep, blkif); kmem_cache_free(xen_blkif_cachep, blkif);
} }
@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
* sysfs interface for VBD I/O requests * sysfs interface for VBD I/O requests
*/ */
#define VBD_SHOW(name, format, args...) \ #define VBD_SHOW_ALLRING(name, format) \
static ssize_t show_##name(struct device *_dev, \ static ssize_t show_##name(struct device *_dev, \
struct device_attribute *attr, \ struct device_attribute *attr, \
char *buf) \ char *buf) \
{ \ { \
struct xenbus_device *dev = to_xenbus_device(_dev); \ struct xenbus_device *dev = to_xenbus_device(_dev); \
struct backend_info *be = dev_get_drvdata(&dev->dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \
struct xen_blkif *blkif = be->blkif; \
unsigned int i; \
unsigned long long result = 0; \
\ \
return sprintf(buf, format, ##args); \ if (!blkif->rings) \
goto out; \
\
for (i = 0; i < blkif->nr_rings; i++) { \
struct xen_blkif_ring *ring = &blkif->rings[i]; \
\
result += ring->st_##name; \
} \
\
out: \
return sprintf(buf, format, result); \
} \ } \
static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); VBD_SHOW_ALLRING(oo_req, "%llu\n");
VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); VBD_SHOW_ALLRING(rd_req, "%llu\n");
VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); VBD_SHOW_ALLRING(wr_req, "%llu\n");
VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); VBD_SHOW_ALLRING(f_req, "%llu\n");
VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); VBD_SHOW_ALLRING(ds_req, "%llu\n");
VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); VBD_SHOW_ALLRING(rd_sect, "%llu\n");
VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); VBD_SHOW_ALLRING(wr_sect, "%llu\n");
static struct attribute *xen_vbdstat_attrs[] = { static struct attribute *xen_vbdstat_attrs[] = {
&dev_attr_oo_req.attr, &dev_attr_oo_req.attr,
@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
.attrs = xen_vbdstat_attrs, .attrs = xen_vbdstat_attrs,
}; };
/*
 * VBD_SHOW(name, format, args...) - define a read-only sysfs attribute.
 *
 * Expands to a show_<name>() callback plus the matching
 * DEVICE_ATTR(name, S_IRUGO, ...) definition.  The callback looks up the
 * backend_info stored as driver data of the xenbus device and formats
 * @args into the sysfs buffer using @format.
 *
 * The local variable must stay named "be": the @args expressions supplied
 * at the expansion sites (e.g. be->major, be->mode) refer to it by name.
 *
 * Output is bounded to PAGE_SIZE with scnprintf() so that a long "%s"
 * argument cannot write past the buffer; the return value is the number
 * of bytes actually placed in @buf.
 */
#define VBD_SHOW(name, format, args...)					\
	static ssize_t show_##name(struct device *_dev,			\
				   struct device_attribute *attr,	\
				   char *buf)				\
	{								\
		struct xenbus_device *dev = to_xenbus_device(_dev);	\
		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
									\
		return scnprintf(buf, PAGE_SIZE, format, ##args);	\
	}								\
	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
VBD_SHOW(mode, "%s\n", be->mode); VBD_SHOW(mode, "%s\n", be->mode);
@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
dev_set_drvdata(&dev->dev, NULL); dev_set_drvdata(&dev->dev, NULL);
if (be->blkif) { if (be->blkif)
xen_blkif_disconnect(be->blkif); xen_blkif_disconnect(be->blkif);
xen_blkif_put(be->blkif);
}
/* Put the reference we set in xen_blkif_alloc(). */
xen_blkif_put(be->blkif);
kfree(be->mode); kfree(be->mode);
kfree(be); kfree(be);
return 0; return 0;
@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
goto fail; goto fail;
} }
/* Multi-queue: advertise how many queues are supported by us.*/
err = xenbus_printf(XBT_NIL, dev->nodename,
"multi-queue-max-queues", "%u", xenblk_max_queues);
if (err)
pr_warn("Error writing multi-queue-max-queues\n");
/* setup back pointer */ /* setup back pointer */
be->blkif->be = be; be->blkif->be = be;
@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
} }
err = connect_ring(be); err = connect_ring(be);
if (err) if (err) {
/*
* Clean up so that memory resources can be used by
* other devices. connect_ring already reported the error.
*/
xen_blkif_disconnect(be->blkif);
break; break;
}
xen_update_blkif_status(be->blkif); xen_update_blkif_status(be->blkif);
break; break;
@ -825,50 +902,43 @@ again:
xenbus_transaction_end(xbt, 1); xenbus_transaction_end(xbt, 1);
} }
/*
static int connect_ring(struct backend_info *be) * Each ring may have multi pages, depends on "ring-page-order".
*/
static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
{ {
struct xenbus_device *dev = be->dev;
unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
unsigned int evtchn, nr_grefs, ring_page_order;
unsigned int pers_grants;
char protocol[64] = "";
struct pending_req *req, *n; struct pending_req *req, *n;
int err, i, j; int err, i, j;
struct xen_blkif *blkif = ring->blkif;
struct xenbus_device *dev = blkif->be->dev;
unsigned int ring_page_order, nr_grefs, evtchn;
pr_debug("%s %s\n", __func__, dev->otherend); err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
&evtchn); &evtchn);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/event-channel", xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
dev->otherend);
return err; return err;
} }
pr_info("event-channel %u\n", evtchn);
err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
&ring_page_order); &ring_page_order);
if (err != 1) { if (err != 1) {
err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
"%u", &ring_ref[0]);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/ring-ref", xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
dev->otherend);
return err; return err;
} }
nr_grefs = 1; nr_grefs = 1;
pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
ring_ref[0]);
} else { } else {
unsigned int i; unsigned int i;
if (ring_page_order > xen_blkif_max_ring_order) { if (ring_page_order > xen_blkif_max_ring_order) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
dev->otherend, ring_page_order, dir, ring_page_order,
xen_blkif_max_ring_order); xen_blkif_max_ring_order);
return err; return err;
} }
@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
char ring_ref_name[RINGREF_NAME_LEN]; char ring_ref_name[RINGREF_NAME_LEN];
snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
"%u", &ring_ref[i]); "%u", &ring_ref[i]);
if (err != 1) { if (err != 1) {
err = -EINVAL; err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/%s", xenbus_dev_fatal(dev, err, "reading %s/%s",
dev->otherend, ring_ref_name); dir, ring_ref_name);
return err; return err;
} }
pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
} }
} }
blkif->nr_ring_pages = nr_grefs;
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
else {
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -1;
}
err = xenbus_gather(XBT_NIL, dev->otherend,
"feature-persistent", "%u",
&pers_grants, NULL);
if (err)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
be->blkif->nr_ring_pages = nr_grefs;
pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
pers_grants ? "persistent grants" : "");
for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
req = kzalloc(sizeof(*req), GFP_KERNEL); req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req) if (!req)
goto fail; goto fail;
list_add_tail(&req->free_list, &be->blkif->pending_free); list_add_tail(&req->free_list, &ring->pending_free);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
if (!req->segments[j]) if (!req->segments[j])
@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
} }
/* Map the shared frame, irq etc. */ /* Map the shared frame, irq etc. */
err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
if (err) { if (err) {
xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
return err; return err;
@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
return 0; return 0;
fail: fail:
list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
list_del(&req->free_list); list_del(&req->free_list);
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
if (!req->segments[j]) if (!req->segments[j])
@ -962,6 +1003,93 @@ fail:
kfree(req); kfree(req);
} }
return -ENOMEM; return -ENOMEM;
}
/*
 * connect_ring - negotiate protocol/features with the frontend and set up
 * every ring it requested.
 *
 * Reads "protocol", "feature-persistent" and "multi-queue-num-queues" from
 * the frontend's xenstore directory (dev->otherend), allocates the rings via
 * xen_blkif_alloc_rings() and then reads the per-ring references:
 *  - one ring:      keys live directly under dev->otherend;
 *  - several rings: keys live under "<otherend>/queue-<N>", N from zero.
 *
 * Returns 0 on success, -ENOSYS for an unknown protocol or an invalid queue
 * count, -ENOMEM if an allocation fails, or the error returned by
 * read_per_ring_refs().
 */
static int connect_ring(struct backend_info *be)
{
	struct xenbus_device *dev = be->dev;
	unsigned int pers_grants;
	char protocol[64] = "";
	int err;
	unsigned int i;
	char *xspath;
	size_t xspathsize;
	/*
	 * Room for "/queue-" plus the longest decimal unsigned int and the
	 * terminating NUL, so that no valid ring index can be truncated.
	 */
	const size_t xenstore_path_ext_size = sizeof("/queue-4294967295");
	unsigned int requested_num_queues = 0;

	pr_debug("%s %s\n", __func__, dev->otherend);

	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
			    "%63s", protocol, NULL);
	if (err)
		strcpy(protocol, "unspecified, assuming default");
	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
	else {
		/*
		 * err is 0 on this path (the gather succeeded), so report
		 * the errno we are about to return instead.
		 */
		xenbus_dev_fatal(dev, -ENOSYS, "unknown fe protocol %s",
				 protocol);
		return -ENOSYS;
	}

	err = xenbus_gather(XBT_NIL, dev->otherend,
			    "feature-persistent", "%u",
			    &pers_grants, NULL);
	if (err)
		pers_grants = 0;

	be->blkif->vbd.feature_gnt_persistent = pers_grants;
	be->blkif->vbd.overflow_max_grants = 0;

	/*
	 * Read the number of hardware queues from frontend.
	 * A frontend that did not write the key gets a single queue.
	 */
	err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
			   "%u", &requested_num_queues);
	if (err < 0) {
		requested_num_queues = 1;
	} else {
		if (requested_num_queues > xenblk_max_queues
		    || requested_num_queues == 0) {
			/*
			 * Buggy or malicious guest.  err holds the (positive)
			 * scanf match count here, not an errno, so pass the
			 * real error code to xenbus_dev_fatal().
			 */
			xenbus_dev_fatal(dev, -ENOSYS,
					 "guest requested %u queues, exceeding the maximum of %u.",
					 requested_num_queues, xenblk_max_queues);
			return -ENOSYS;
		}
	}
	be->blkif->nr_rings = requested_num_queues;
	if (xen_blkif_alloc_rings(be->blkif))
		return -ENOMEM;

	pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
		 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
		 pers_grants ? "persistent grants" : "");

	if (be->blkif->nr_rings == 1)
		return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);

	xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
	xspath = kmalloc(xspathsize, GFP_KERNEL);
	if (!xspath) {
		xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
		return -ENOMEM;
	}

	for (i = 0; i < be->blkif->nr_rings; i++) {
		/* snprintf() always NUL-terminates; no pre-clearing needed. */
		snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
		err = read_per_ring_refs(&be->blkif->rings[i], xspath);
		if (err) {
			kfree(xspath);
			return err;
		}
	}
	kfree(xspath);

	return 0;
}
static const struct xenbus_device_id xen_blkbk_ids[] = { static const struct xenbus_device_id xen_blkbk_ids[] = {

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,54 @@
typedef uint16_t blkif_vdev_t; typedef uint16_t blkif_vdev_t;
typedef uint64_t blkif_sector_t; typedef uint64_t blkif_sector_t;
/*
* Multiple hardware queues/rings:
* If supported, the backend will write the key "multi-queue-max-queues" to
* the directory for that vbd, and set its value to the maximum supported
* number of queues.
* Frontends that are aware of this feature and wish to use it can write the
* key "multi-queue-num-queues" with the number they wish to use, which must be
* greater than zero, and no more than the value reported by the backend in
* "multi-queue-max-queues".
*
* For frontends requesting just one queue, the usual event-channel and
* ring-ref keys are written as before, simplifying the backend processing
* to avoid distinguishing between a frontend that doesn't understand the
* multi-queue feature, and one that does, but requested only one queue.
*
* Frontends requesting two or more queues must not write the toplevel
* event-channel and ring-ref keys, instead writing those keys under sub-keys
* having the name "queue-N" where N is the integer ID of the queue/ring to
* which those keys belong. Queues are indexed from zero.
* For example, a frontend with two queues must write the following set of
* queue-related keys:
*
* /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
* /local/domain/1/device/vbd/0/queue-0 = ""
* /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
* /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
* /local/domain/1/device/vbd/0/queue-1 = ""
* /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
* /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
*
* It is also possible to use multiple queues/rings together with the
* multi-page ring buffer feature.
* For example, a frontend that requests two queues/rings, each with a ring
* buffer of two pages, must write the following set of related keys:
*
* /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
* /local/domain/1/device/vbd/0/ring-page-order = "1"
* /local/domain/1/device/vbd/0/queue-0 = ""
* /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
* /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
* /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
* /local/domain/1/device/vbd/0/queue-1 = ""
* /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
* /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
* /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
*
*/
/* /*
* REQUEST CODES. * REQUEST CODES.
*/ */