From aaa8e2b34c35d67abf1892cd62ea4e7565ca262c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 15 Oct 2010 13:16:53 +0200 Subject: [PATCH 01/27] drbd: consolidate explicit drbd_md_sync into drbd_create_new_uuid Every code path changing the current UUID needs to get it on stable storage anyways. Flush it to disk right there, remove the now obsolte explicit drbd_md_sync statements in the other code paths. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 3 ++- drivers/block/drbd/drbd_nl.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8bfedc7164f..8d029b14e7c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1267,7 +1267,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); - drbd_md_sync(mdev); } spin_lock_irq(&mdev->req_lock); _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); @@ -3659,6 +3658,8 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) get_random_bytes(&val, sizeof(u64)); _drbd_uuid_set(mdev, UI_CURRENT, val); + /* get it to stable storage _now_ */ + drbd_md_sync(mdev); } void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 87925e97e61..c498c4827de 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1953,7 +1953,6 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); - drbd_md_sync(mdev); } drbd_suspend_io(mdev); reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); From 3beec1d446fba335f07787636920892dd3b2c658 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Oct 2010 13:31:48 +0200 Subject: [PATCH 02/27] drbd: tag a few error messages with "assert failed" If those messages ever get logged, clearly state that they are actually failed ASSERTS, so our regression tests can pick them up from the logs more easily. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c07c370c4c8..e0e0bf6f16a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -114,11 +114,11 @@ struct drbd_conf; #define D_ASSERT(exp) if (!(exp)) \ dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) -#define ERR_IF(exp) if (({ \ - int _b = (exp) != 0; \ - if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ - __func__, #exp, __FILE__, __LINE__); \ - _b; \ +#define ERR_IF(exp) if (({ \ + int _b = (exp) != 0; \ + if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \ + __func__, #exp, __FILE__, __LINE__); \ + _b; \ })) /* Defines to control fault insertion */ From 82f59cc6353889b426cf13b6596d5a3d100fa09e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Sat, 16 Oct 2010 12:13:47 +0200 Subject: [PATCH 03/27] drbd: fix potential deadlock on detach If we have contention in drbd_al_begin_iod (heavy randon IO), an administrative request to detach the disk may deadlock for similar reasons as the recently fixed deadlock if detaching because of IO-error. The approach taken here is to either go through the intermediate cleanup state D_FAILED, or first lock out application io, don't just go directly to D_DISKLESS. We need an additional state bit (WAS_IO_ERROR) to distinguish the -> D_FAILED because of IO-error from other failures. Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS. If only attaching, ldev may be missing still, but would be referenced from within the after_state_ch for -> D_FAILED, potentially dereferencing a NULL pointer. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 20 +++-- drivers/block/drbd/drbd_main.c | 138 +++++++++++++++++------------ drivers/block/drbd/drbd_nl.c | 16 +++- drivers/block/drbd/drbd_receiver.c | 2 +- 4 files changed, 113 insertions(+), 63 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e0e0bf6f16a..03c15e317c3 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -852,7 +852,8 @@ enum { BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ - GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ + GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ + WAS_IO_ERROR, /* Local disk failed returned IO error */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -1281,6 +1282,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); extern void drbd_go_diskless(struct drbd_conf *mdev); +extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout @@ -1798,17 +1800,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, case EP_PASS_ON: if (!forcedetach) { if (__ratelimit(&drbd_ratelimit_state)) - dev_err(DEV, "Local IO failed in %s." - "Passing error on...\n", where); + dev_err(DEV, "Local IO failed in %s.\n", where); break; } /* NOTE fall through to detach case if forcedetach set */ case EP_DETACH: case EP_CALL_HELPER: + set_bit(WAS_IO_ERROR, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); - dev_err(DEV, "Local IO failed in %s." - "Detaching...\n", where); + dev_err(DEV, + "Local IO failed in %s. Detaching...\n", where); } break; } @@ -2127,7 +2129,11 @@ static inline void put_ldev(struct drbd_conf *mdev) __release(local); D_ASSERT(i >= 0); if (i == 0) { + if (mdev->state.disk == D_DISKLESS) + /* even internal references gone, safe to destroy */ + drbd_ldev_destroy(mdev); if (mdev->state.disk == D_FAILED) + /* all application IO references gone. */ drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); } @@ -2138,6 +2144,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat { int io_allowed; + /* never get a reference while D_DISKLESS */ + if (mdev->state.disk == D_DISKLESS) + return 0; + atomic_inc(&mdev->local_cnt); io_allowed = (mdev->state.disk >= mins); if (!io_allowed) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8d029b14e7c..992c3aecdf7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -834,6 +834,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) ns.conn = os.conn; + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + ns.disk = D_DISKLESS; + + /* if we are only D_ATTACHING yet, + * we can (and should) go directly to D_DISKLESS. */ + if (ns.disk == D_FAILED && os.disk == D_ATTACHING) + ns.disk = D_DISKLESS; + /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -1055,7 +1064,15 @@ int __drbd_set_state(struct drbd_conf *mdev, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) set_bit(DEVICE_DYING, &mdev->flags); - mdev->state.i = ns.i; + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&mdev->local_cnt); + + mdev->state = ns; wake_up(&mdev->misc_wait); wake_up(&mdev->state_wait); @@ -1363,63 +1380,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); - /* first half of local IO error */ - if (os.disk > D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh = EP_PASS_ON; + /* first half of local IO error, failure to attach, + * or administrative detach */ + if (os.disk != D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh; + int was_io_error; + /* corresponding get_ldev was in __drbd_set_state, to serialize + * our cleanup here with the transition to D_DISKLESS, + * so it is safe to dreference ldev here. */ + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that my disk is broken.\n"); + dev_warn(DEV, "Notified peer that I am detaching my disk\n"); else - dev_err(DEV, "Sending state for drbd_io_error() failed\n"); + dev_err(DEV, "Sending state for detaching disk failed\n"); drbd_rs_cancel_all(mdev); - if (get_ldev_if_state(mdev, D_FAILED)) { - eh = mdev->ldev->dc.on_io_error; - put_ldev(mdev); - } - if (eh == EP_CALL_HELPER) + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + put_ldev(mdev); + + if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(mdev, "local-io-error"); } + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (mdev->state.disk != D_DISKLESS) + dev_err(DEV, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(mdev->state.disk)); - /* second half of local IO error handling, - * after local_cnt references have reached zero: */ - if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - } + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); - if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { - /* We must still be diskless, - * re-attach has to be serialized with this! */ - if (mdev->state.disk != D_DISKLESS) - dev_err(DEV, - "ASSERT FAILED: disk is %s while going diskless\n", - drbd_disk_str(mdev->state.disk)); - - /* we cannot assert local_cnt == 0 here, as get_ldev_if_state - * will inc/dec it frequently. Since we became D_DISKLESS, no - * one has touched the protected members anymore, though, so we - * are safe to free them here. */ if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I detached my disk.\n"); + dev_warn(DEV, "Notified peer that I'm now diskless.\n"); else - dev_err(DEV, "Sending state for detach failed\n"); - - lc_destroy(mdev->resync); - mdev->resync = NULL; - lc_destroy(mdev->act_log); - mdev->act_log = NULL; - __no_warn(local, - drbd_free_bc(mdev->ldev); - mdev->ldev = NULL;); - - if (mdev->md_io_tmpp) { - __free_page(mdev->md_io_tmpp); - mdev->md_io_tmpp = NULL; - } + dev_err(DEV, "Sending state for being diskless failed\n"); + /* corresponding get_ldev in __drbd_set_state + * this may finaly trigger drbd_ldev_destroy. */ + put_ldev(mdev); } /* Disks got bigger while they were detached */ @@ -2897,7 +2915,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) D_ASSERT(list_empty(&mdev->resync_work.list)); D_ASSERT(list_empty(&mdev->unplug_work.list)); D_ASSERT(list_empty(&mdev->go_diskless.list)); - } @@ -3756,19 +3773,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } +void drbd_ldev_destroy(struct drbd_conf *mdev) +{ + lc_destroy(mdev->resync); + mdev->resync = NULL; + lc_destroy(mdev->act_log); + mdev->act_log = NULL; + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); + + if (mdev->md_io_tmpp) { + __free_page(mdev->md_io_tmpp); + mdev->md_io_tmpp = NULL; + } + clear_bit(GO_DISKLESS, &mdev->flags); +} + static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) { D_ASSERT(mdev->state.disk == D_FAILED); /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will * inc/dec it frequently. Once we are D_DISKLESS, no one will touch - * the protected members anymore, though, so in the after_state_ch work - * it will be safe to free them. */ + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ drbd_force_state(mdev, NS(disk, D_DISKLESS)); - /* We need to wait for return of references checked out while we still - * have been D_FAILED, though (drbd_md_sync, bitmap io). */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - - clear_bit(GO_DISKLESS, &mdev->flags); return 1; } @@ -3777,9 +3806,6 @@ void drbd_go_diskless(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_FAILED); if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) drbd_queue_work(&mdev->data.work, &mdev->go_diskless); - /* don't drbd_queue_work_front, - * we need to serialize with the after_state_ch work - * of the -> D_FAILED transition. */ } /** diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c498c4827de..0cba7d3d2b5 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -870,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp retcode = ERR_DISK_CONFIGURED; goto fail; } + /* It may just now have detached because of IO error. Make sure + * drbd_ldev_destroy is done already, we may end up here very fast, + * e.g. if someone calls attach from the on-io-error handler, + * to realize a "hot spare" feature (not that I'd recommend that) */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); @@ -1262,7 +1267,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp force_diskless_dec: put_ldev(mdev); force_diskless: - drbd_force_state(mdev, NS(disk, D_DISKLESS)); + drbd_force_state(mdev, NS(disk, D_FAILED)); drbd_md_sync(mdev); release_bdev2_fail: if (nbc) @@ -1285,10 +1290,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp return 0; } +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. + * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); + if (mdev->state.disk == D_DISKLESS) + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + drbd_resume_io(mdev); return 0; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6ec922c623a..04a823b01da 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3363,7 +3363,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned if (ns.conn == C_MASK) { ns.conn = C_CONNECTED; if (mdev->state.disk == D_NEGOTIATING) { - drbd_force_state(mdev, NS(disk, D_DISKLESS)); + drbd_force_state(mdev, NS(disk, D_FAILED)); } else if (peer_state.disk == D_NEGOTIATING) { dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); peer_state.disk = D_DISKLESS; From 6719fb036cea56a5ee9d0ac912ed8c7cabb27f49 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 18 Oct 2010 23:04:07 +0200 Subject: [PATCH 04/27] drbd: fix potential data divergence after multiple failures If we get an IO-error during an activity log transaction, if we failed to write the bitmap of the evicted extent, we must not write the transaction itself. If we failed to write the transaction, we must not even submit the corresponding bio, as its extent is not yet marked in the activity log. Otherwise, if this was a disconneted Primary (degraded cluster), which now lost its disk as well, and we later re-attach the same backend storage, we possibly "forget" to resync some parts of the disk that potentially have been changed. On the receiving side, when receiving from a peer with unhealthy disk, checking for pdsk == D_DISKLESS is not enough, we need to set out of sync and do AL transactions for everything pdsk < D_INCONSISTENT on the receiving side. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 26 +++++++++++++++++++++----- drivers/block/drbd/drbd_receiver.c | 3 ++- drivers/block/drbd/drbd_req.c | 19 ++++++++++++++----- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index ac04ef97eac..bd925180a2b 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -284,18 +284,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) u32 xor_sum = 0; if (!get_ldev(mdev)) { - dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); + dev_err(DEV, + "disk is %s, cannot start al transaction (-%d +%d)\n", + drbd_disk_str(mdev->state.disk), evicted, new_enr); complete(&((struct update_al_work *)w)->event); return 1; } /* do we have to do a bitmap write, first? * TODO reduce maximum latency: * submit both bios, then wait for both, - * instead of doing two synchronous sector writes. */ + * instead of doing two synchronous sector writes. + * For now, we must not write the transaction, + * if we cannot write out the bitmap of the evicted extent. */ if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); - mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ + /* The bitmap write may have failed, causing a state change. */ + if (mdev->state.disk < D_INCONSISTENT) { + dev_err(DEV, + "disk is %s, cannot write al transaction (-%d +%d)\n", + drbd_disk_str(mdev->state.disk), evicted, new_enr); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } + + mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ buffer = (struct al_transaction *)page_address(mdev->md_io_page); buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); @@ -739,7 +753,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev) unsigned int enr; unsigned long add = 0; char ppb[10]; - int i; + int i, tmp; wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); @@ -747,7 +761,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev) enr = lc_element_by_index(mdev->act_log, i)->lc_number; if (enr == LC_FREE) continue; - add += drbd_bm_ALe_set_all(mdev, enr); + tmp = drbd_bm_ALe_set_all(mdev, enr); + dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr); + add += tmp; } lc_unlock(mdev->act_log); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 04a823b01da..1146faa7ae3 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1995,10 +1995,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned break; } - if (mdev->state.pdsk == D_DISKLESS) { + if (mdev->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ drbd_set_out_of_sync(mdev, e->sector, e->size); e->flags |= EE_CALL_AL_COMPLETE_IO; + e->flags &= ~EE_MAY_SET_IN_SYNC; drbd_al_begin_io(mdev, e->sector); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9e91a2545fc..d26b213dbf1 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -942,12 +942,21 @@ allocate_barrier: if (local) { req->private_bio->bi_bdev = mdev->ldev->backing_bdev; - if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) + /* State may have changed since we grabbed our reference on the + * mdev->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(mdev)) { + if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + put_ldev(mdev); + } else bio_endio(req->private_bio, -EIO); - else - generic_make_request(req->private_bio); } /* we need to plug ALWAYS since we possibly need to kick lo_dev. From bc571b8cb930ea78207851dd38b5a435fcb8891c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 21 Oct 2010 18:07:31 +0200 Subject: [PATCH 05/27] drbd: fix a misleading printk This codepath used to be called only for failed kmalloc GFP_ATOMIC, but is now also triggered by other things. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 108d58015cd..1a2d2f0759b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -925,7 +925,7 @@ out: drbd_md_sync(mdev); if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { - dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); + dev_info(DEV, "Writing the whole bitmap\n"); drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); } From fb2c7a10eec051317ff091b2cb2d73c5ecd98c19 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Oct 2010 12:08:13 +0200 Subject: [PATCH 06/27] drbd: rate limit an error message If we don't rate limit it, and you happen to log err level messages via serial console, an IO error on a disconnected Primary may cause serious unresponsiveness. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d26b213dbf1..31d04b17c45 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -813,7 +813,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) mdev->state.conn >= C_CONNECTED)); if (!(local || remote) && !is_susp(mdev->state)) { - dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } From 8825f7c3e5c7b251b49fc594658a96f59417ee16 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 21 Oct 2010 17:21:19 +0200 Subject: [PATCH 07/27] drbd: Silenced an assert That assertion's condition needed adjustment for today's semantics Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 2 +- include/linux/drbd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 31d04b17c45..5c225485355 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -258,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) if (!hlist_unhashed(&req->colision)) hlist_del(&req->colision); else - D_ASSERT((s & RQ_NET_MASK) == 0); + D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); /* for writes we need to do some extra housekeeping */ if (rw == WRITE) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 9b2a0158f39..ef44c7a0638 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.9rc2" +#define REL_VERSION "8.3.9" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 95 From 2451fc3b2bd3a7205270da75a21dde0d5d7c96a2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 24 Aug 2010 13:43:11 +0200 Subject: [PATCH 08/27] drbd: Removed the BIO_RW_BARRIER support form the receiver/epoch code Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 12 -- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_nl.c | 4 +- drivers/block/drbd/drbd_proc.c | 1 - drivers/block/drbd/drbd_receiver.c | 212 ++++------------------------- drivers/block/drbd/drbd_worker.c | 21 --- 6 files changed, 33 insertions(+), 219 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 03c15e317c3..1b915fd9278 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -749,17 +749,12 @@ struct drbd_epoch { /* drbd_epoch flag bits */ enum { - DE_BARRIER_IN_NEXT_EPOCH_ISSUED, - DE_BARRIER_IN_NEXT_EPOCH_DONE, - DE_CONTAINS_A_BARRIER, DE_HAVE_BARRIER_NUMBER, - DE_IS_FINISHING, }; enum epoch_event { EV_PUT, EV_GOT_BARRIER_NR, - EV_BARRIER_DONE, EV_BECAME_LAST, EV_CLEANUP = 32, /* used as flag */ }; @@ -801,11 +796,6 @@ enum { __EE_CALL_AL_COMPLETE_IO, __EE_MAY_SET_IN_SYNC, - /* This epoch entry closes an epoch using a barrier. - * On sucessful completion, the epoch is released, - * and the P_BARRIER_ACK send. */ - __EE_IS_BARRIER, - /* In case a barrier failed, * we need to resubmit without the barrier flag. */ __EE_RESUBMITTED, @@ -820,7 +810,6 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) @@ -948,7 +937,6 @@ enum write_ordering_e { WO_none, WO_drain_io, WO_bdev_flush, - WO_bio_barrier }; struct fifo_buffer { diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 992c3aecdf7..8e0d707df23 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2858,7 +2858,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) drbd_thread_init(mdev, &mdev->asender, drbd_asender); mdev->agreed_pro_version = PRO_VERSION_MAX; - mdev->write_ordering = WO_bio_barrier; + mdev->write_ordering = WO_bdev_flush; mdev->resync_wenr = LC_FREE; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 0cba7d3d2b5..899878fcf97 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1117,8 +1117,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp nbc = NULL; resync_lru = NULL; - mdev->write_ordering = WO_bio_barrier; - drbd_bump_write_ordering(mdev, WO_bio_barrier); + mdev->write_ordering = WO_bdev_flush; + drbd_bump_write_ordering(mdev, WO_bdev_flush); if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &mdev->flags); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index ad325c5d0ce..7e6ac307e2d 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -158,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v) [WO_none] = 'n', [WO_drain_io] = 'd', [WO_bdev_flush] = 'f', - [WO_bio_barrier] = 'b', }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1146faa7ae3..2952c1277b1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -49,11 +49,6 @@ #include "drbd_vli.h" -struct flush_work { - struct drbd_work w; - struct drbd_epoch *epoch; -}; - enum finish_epoch { FE_STILL_LIVE, FE_DESTROYED, @@ -66,16 +61,6 @@ static int drbd_do_auth(struct drbd_conf *mdev); static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); static int e_end_block(struct drbd_conf *, struct drbd_work *, int); -static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) -{ - struct drbd_epoch *prev; - spin_lock(&mdev->epoch_lock); - prev = list_entry(epoch->list.prev, struct drbd_epoch, list); - if (prev == epoch || prev == mdev->current_epoch) - prev = NULL; - spin_unlock(&mdev->epoch_lock); - return prev; -} #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) @@ -981,7 +966,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi return TRUE; } -static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +static void drbd_flush(struct drbd_conf *mdev) { int rv; @@ -997,24 +982,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d } put_ldev(mdev); } - - return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); -} - -static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) -{ - struct flush_work *fw = (struct flush_work *)w; - struct drbd_epoch *epoch = fw->epoch; - - kfree(w); - - if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) - drbd_flush_after_epoch(mdev, epoch); - - drbd_may_finish_epoch(mdev, epoch, EV_PUT | - (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); - - return 1; } /** @@ -1027,15 +994,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev) { - int finish, epoch_size; + int epoch_size; struct drbd_epoch *next_epoch; - int schedule_flush = 0; enum finish_epoch rv = FE_STILL_LIVE; spin_lock(&mdev->epoch_lock); do { next_epoch = NULL; - finish = 0; epoch_size = atomic_read(&epoch->epoch_size); @@ -1045,16 +1010,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, break; case EV_GOT_BARRIER_NR: set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); - - /* Special case: If we just switched from WO_bio_barrier to - WO_bdev_flush we should not finish the current epoch */ - if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && - mdev->write_ordering != WO_bio_barrier && - epoch == mdev->current_epoch) - clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); - break; - case EV_BARRIER_DONE: - set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); break; case EV_BECAME_LAST: /* nothing to do*/ @@ -1063,23 +1018,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && - epoch->list.prev == &mdev->current_epoch->list && - !test_bit(DE_IS_FINISHING, &epoch->flags)) { - /* Nearly all conditions are met to finish that epoch... */ - if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || - mdev->write_ordering == WO_none || - (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || - ev & EV_CLEANUP) { - finish = 1; - set_bit(DE_IS_FINISHING, &epoch->flags); - } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && - mdev->write_ordering == WO_bio_barrier) { - atomic_inc(&epoch->active); - schedule_flush = 1; - } - } - if (finish) { + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); @@ -1102,6 +1041,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, /* atomic_set(&epoch->active, 0); is already zero */ if (rv == FE_STILL_LIVE) rv = FE_RECYCLED; + wake_up(&mdev->ee_wait); } } @@ -1113,22 +1053,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, spin_unlock(&mdev->epoch_lock); - if (schedule_flush) { - struct flush_work *fw; - fw = kmalloc(sizeof(*fw), GFP_ATOMIC); - if (fw) { - fw->w.cb = w_flush; - fw->epoch = epoch; - drbd_queue_work(&mdev->data.work, &fw->w); - } else { - dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); - set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); - /* That is not a recursion, only one level */ - drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); - drbd_may_finish_epoch(mdev, epoch, EV_PUT); - } - } - return rv; } @@ -1144,19 +1068,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) [WO_none] = "none", [WO_drain_io] = "drain", [WO_bdev_flush] = "flush", - [WO_bio_barrier] = "barrier", }; pwo = mdev->write_ordering; wo = min(pwo, wo); - if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) - wo = WO_bdev_flush; if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) wo = WO_drain_io; if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) wo = WO_none; mdev->write_ordering = wo; - if (pwo != mdev->write_ordering || wo == WO_bio_barrier) + if (pwo != mdev->write_ordering || wo == WO_bdev_flush) dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); } @@ -1192,7 +1113,7 @@ next_bio: bio->bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; /* we special case some flags in the multi-bio case, see below - * (REQ_UNPLUG, REQ_HARDBARRIER) */ + * (REQ_UNPLUG) */ bio->bi_rw = rw; bio->bi_private = e; bio->bi_end_io = drbd_endio_sec; @@ -1226,11 +1147,6 @@ next_bio: bio->bi_rw &= ~REQ_UNPLUG; drbd_generic_make_request(mdev, fault_type, bio); - - /* strip off REQ_HARDBARRIER, - * unless it is the first or last bio */ - if (bios && bios->bi_next) - bios->bi_rw &= ~REQ_HARDBARRIER; } while (bios); maybe_kick_lo(mdev); return 0; @@ -1244,45 +1160,9 @@ fail: return -ENOMEM; } -/** - * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set - * @mdev: DRBD device. - * @w: work object. - * @cancel: The connection will be closed anyways (unused in this callback) - */ -int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) -{ - struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, - (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) - so that we can finish that epoch in drbd_may_finish_epoch(). - That is necessary if we already have a long chain of Epochs, before - we realize that REQ_HARDBARRIER is actually not supported */ - - /* As long as the -ENOTSUPP on the barrier is reported immediately - that will never trigger. If it is reported late, we will just - print that warning and continue correctly for all future requests - with WO_bdev_flush */ - if (previous_epoch(mdev, e->epoch)) - dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); - - /* we still have a local reference, - * get_ldev was done in receive_Data. */ - - e->w.cb = e_end_block; - if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) { - /* drbd_submit_ee fails for one reason only: - * if was not able to allocate sufficient bios. - * requeue, try again later. */ - e->w.cb = w_e_reissue; - drbd_queue_work(&mdev->data.work, &e->w); - } - return 1; -} - static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - int rv, issue_flush; + int rv; struct p_barrier *p = &mdev->data.rbuf.barrier; struct drbd_epoch *epoch; @@ -1300,44 +1180,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign * Therefore we must send the barrier_ack after the barrier request was * completed. */ switch (mdev->write_ordering) { - case WO_bio_barrier: case WO_none: if (rv == FE_RECYCLED) return TRUE; - break; + + /* receiver context, in the writeout path of the other node. + * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + else + dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); + /* Fall through */ case WO_bdev_flush: case WO_drain_io: - if (rv == FE_STILL_LIVE) { - set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); - drbd_wait_ee_list_empty(mdev, &mdev->active_ee); - rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); - } - if (rv == FE_RECYCLED) - return TRUE; - - /* The asender will send all the ACKs and barrier ACKs out, since - all EEs moved from the active_ee to the done_ee. We need to - provide a new epoch object for the EEs that come in soon */ - break; - } - - /* receiver context, in the writeout path of the other node. - * avoid potential distributed deadlock */ - epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); - if (!epoch) { - dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); - issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); drbd_wait_ee_list_empty(mdev, &mdev->active_ee); - if (issue_flush) { - rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); - if (rv == FE_RECYCLED) - return TRUE; + drbd_flush(mdev); + + if (atomic_read(&mdev->current_epoch->epoch_size)) { + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; } - drbd_wait_ee_list_empty(mdev, &mdev->done_ee); + epoch = mdev->current_epoch; + wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0); + + D_ASSERT(atomic_read(&epoch->active) == 0); + D_ASSERT(epoch->flags == 0); return TRUE; + default: + dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); + return FALSE; } epoch->flags = 0; @@ -1652,15 +1528,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; sector_t sector = e->sector; - struct drbd_epoch *epoch; int ok = 1, pcmd; - if (e->flags & EE_IS_BARRIER) { - epoch = previous_epoch(mdev, e->epoch); - if (epoch) - drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); - } - if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { if (likely((e->flags & EE_WAS_ERROR) == 0)) { pcmd = (mdev->state.conn >= C_SYNC_SOURCE && @@ -1817,27 +1686,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned e->epoch = mdev->current_epoch; atomic_inc(&e->epoch->epoch_size); atomic_inc(&e->epoch->active); - - if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { - struct drbd_epoch *epoch; - /* Issue a barrier if we start a new epoch, and the previous epoch - was not a epoch containing a single request which already was - a Barrier. */ - epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); - if (epoch == e->epoch) { - set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - rw |= REQ_HARDBARRIER; - e->flags |= EE_IS_BARRIER; - } else { - if (atomic_read(&epoch->epoch_size) > 1 || - !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { - set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); - set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - rw |= REQ_HARDBARRIER; - e->flags |= EE_IS_BARRIER; - } - } - } spin_unlock(&mdev->epoch_lock); dp_flags = be32_to_cpu(p->dp_flags); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1a2d2f0759b..b0551ba7ad0 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -102,12 +102,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) put_ldev(mdev); } -static int is_failed_barrier(int ee_flags) -{ - return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED)) - == (EE_IS_BARRIER|EE_WAS_ERROR); -} - /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) @@ -119,21 +113,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo int is_syncer_req; int do_al_complete_io; - /* if this is a failed barrier request, disable use of barriers, - * and schedule for resubmission */ - if (is_failed_barrier(e->flags)) { - drbd_bump_write_ordering(mdev, WO_bdev_flush); - spin_lock_irqsave(&mdev->req_lock, flags); - list_del(&e->w.list); - e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED; - e->w.cb = w_e_reissue; - /* put_ldev actually happens below, once we come here again. */ - __release(local); - spin_unlock_irqrestore(&mdev->req_lock, flags); - drbd_queue_work(&mdev->data.work, &e->w); - return; - } - D_ASSERT(e->block_id != ID_VACANT); /* after we moved e to done_ee, From a8a4e51e6965db84d2af041370ea2ab6232aa4f1 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Aug 2010 10:21:04 +0200 Subject: [PATCH 09/27] drbd: REQ_HARDBARRIER -> REQ_FUA transition for meta data accesses Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 16 ++-------------- drivers/block/drbd/drbd_int.h | 7 +++---- drivers/block/drbd/drbd_nl.c | 4 ++-- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index bd925180a2b..ba95cba192b 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -78,11 +78,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, init_completion(&md_io.event); md_io.error = 0; - if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) - rw |= REQ_HARDBARRIER; + if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) + rw |= REQ_FUA; rw |= REQ_UNPLUG | REQ_SYNC; - retry: bio = bio_alloc(GFP_NOIO, 1); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; @@ -100,17 +99,6 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, wait_for_completion(&md_io.event); ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; - /* check for unsupported barrier op. - * would rather check on EOPNOTSUPP, but that is not reliable. - * don't try again for ANY return value != 0 */ - if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) { - /* Try again with no barrier */ - dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); - set_bit(MD_NO_BARRIER, &mdev->flags); - rw &= ~REQ_HARDBARRIER; - bio_put(bio); - goto retry; - } out: bio_put(bio); return ok; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1b915fd9278..575bfba1b0d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -835,8 +835,7 @@ enum { NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, - MD_NO_BARRIER, /* meta data device does not support barriers, - so don't even try */ + MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ @@ -2404,13 +2403,13 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) { int r; - if (test_bit(MD_NO_BARRIER, &mdev->flags)) + if (test_bit(MD_NO_FUA, &mdev->flags)) return; r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); if (r) { - set_bit(MD_NO_BARRIER, &mdev->flags); + set_bit(MD_NO_FUA, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); } } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 899878fcf97..29e5c70e4e2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1103,9 +1103,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (nbc->dc.no_md_flush) - set_bit(MD_NO_BARRIER, &mdev->flags); + set_bit(MD_NO_FUA, &mdev->flags); else - clear_bit(MD_NO_BARRIER, &mdev->flags); + clear_bit(MD_NO_FUA, &mdev->flags); /* Point of no return reached. * Devices and memory are no longer released by error cleanup below. From 650789c87f16dcdf1dd0a67ac7461b7537534855 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Aug 2010 10:47:17 +0200 Subject: [PATCH 10/27] drbd: Removed checks for REQ_HARDBARRIER on incomming BIOs Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 - drivers/block/drbd/drbd_main.c | 5 ----- drivers/block/drbd/drbd_req.c | 14 -------------- 3 files changed, 20 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 575bfba1b0d..0c527d28162 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -832,7 +832,6 @@ enum { * Gets cleared when the state.conn * goes into C_CONNECTED state. */ WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ - NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8e0d707df23..d7072bf0630 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2788,11 +2788,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) drbd_set_defaults(mdev); - /* for now, we do NOT yet support it, - * even though we start some framework - * to eventually support barriers */ - set_bit(NO_BARRIER_SUPP, &mdev->flags); - atomic_set(&mdev->ap_bio_cnt, 0); atomic_set(&mdev->ap_pending_cnt, 0); atomic_set(&mdev->rs_pending_cnt, 0); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5c225485355..11a75d32a2e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1032,20 +1032,6 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) return 0; } - /* Reject barrier requests if we know the underlying device does - * not support them. - * XXX: Need to get this info from peer as well some how so we - * XXX: reject if EITHER side/data/metadata area does not support them. - * - * because of those XXX, this is not yet enabled, - * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. - */ - if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) { - /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * what we "blindly" assume: */ From afa842fa641e11a025725883b04d1e144e6bad39 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Fri, 22 Oct 2010 14:21:07 -0500 Subject: [PATCH 11/27] cciss: fix board status waiting code After a reset, we should first wait for the board to become "not ready", and then wait for it to become "ready", instead of immediately waiting for it to become "ready", and do this waiting *after* restoring PCI config space registers. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 43 +++++++++++++++++++++++++++++++++++-------- drivers/block/cciss.h | 4 ++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index f09e6df15aa..fd08644bf2a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4006,18 +4006,31 @@ static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, return -ENODEV; } -static int __devinit cciss_wait_for_board_ready(ctlr_info_t *h) +static int __devinit cciss_wait_for_board_state(struct pci_dev *pdev, + void __iomem *vaddr, int wait_for_ready) +#define BOARD_READY 1 +#define BOARD_NOT_READY 0 { - int i; + int i, iterations; u32 scratchpad; - for (i = 0; i < CCISS_BOARD_READY_ITERATIONS; i++) { - scratchpad = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); - if (scratchpad == CCISS_FIRMWARE_READY) - return 0; + if (wait_for_ready) + iterations = CCISS_BOARD_READY_ITERATIONS; + else + iterations = CCISS_BOARD_NOT_READY_ITERATIONS; + + for (i = 0; i < iterations; i++) { + scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET); + if (wait_for_ready) { + if (scratchpad == CCISS_FIRMWARE_READY) + return 0; + } else { + if (scratchpad != CCISS_FIRMWARE_READY) + return 0; + } msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); } - dev_warn(&h->pdev->dev, "board not ready, timed out.\n"); + dev_warn(&pdev->dev, "board not ready, timed out.\n"); return -ENODEV; } @@ -4183,7 +4196,7 @@ static int __devinit cciss_pci_init(ctlr_info_t *h) err = -ENOMEM; goto err_out_free_res; } - err = cciss_wait_for_board_ready(h); + err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY); if (err) goto err_out_free_res; err = cciss_find_cfgtables(h); @@ -4534,6 +4547,20 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) need a little pause here */ msleep(CCISS_POST_RESET_PAUSE_MSECS); + /* Wait for board to become not ready, then ready. */ + dev_info(&pdev->dev, "Waiting for board to become ready.\n"); + rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); + if (rc) /* Don't bail, might be E500, etc. which can't be reset */ + dev_warn(&pdev->dev, + "failed waiting for board to become not ready\n"); + rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); + if (rc) { + dev_warn(&pdev->dev, + "failed waiting for board to become ready\n"); + goto unmap_cfgtable; + } + dev_info(&pdev->dev, "board ready.\n"); + /* Controller should be in simple mode at this point. If it's not, * It means we're on one of those controllers which doesn't support * the doorbell reset method and on which the PCI power management reset diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index ae340ffc8f8..4b8933d778f 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -200,10 +200,14 @@ struct ctlr_info * the above. */ #define CCISS_BOARD_READY_WAIT_SECS (120) +#define CCISS_BOARD_NOT_READY_WAIT_SECS (10) #define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) #define CCISS_BOARD_READY_ITERATIONS \ ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ CCISS_BOARD_READY_POLL_INTERVAL_MSECS) +#define CCISS_BOARD_NOT_READY_ITERATIONS \ + ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ + CCISS_BOARD_READY_POLL_INTERVAL_MSECS) #define CCISS_POST_RESET_PAUSE_MSECS (3000) #define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) #define CCISS_POST_RESET_NOOP_RETRIES (12) From f442e64b93e16dba6bf9ab7e8dc5a90f6bcd8a85 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Fri, 22 Oct 2010 14:21:12 -0500 Subject: [PATCH 12/27] cciss: Use kernel provided PCI state save and restore functions and use the doorbell reset method if available (which doesn't lock up the controller if you properly save and restore all the PCI registers that you're supposed to.) Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 73 +++++++++---------------------------------- 1 file changed, 15 insertions(+), 58 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index fd08644bf2a..2e547bddc5a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4361,36 +4361,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u #define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) #define cciss_noop(p) cciss_message(p, 3, 0) -static __devinit int cciss_reset_msi(struct pci_dev *pdev) -{ -/* the #defines are stolen from drivers/pci/msi.h. */ -#define msi_control_reg(base) (base + PCI_MSI_FLAGS) -#define PCI_MSIX_FLAGS_ENABLE (1 << 15) - - int pos; - u16 control = 0; - - pos = pci_find_capability(pdev, PCI_CAP_ID_MSI); - if (pos) { - pci_read_config_word(pdev, msi_control_reg(pos), &control); - if (control & PCI_MSI_FLAGS_ENABLE) { - dev_info(&pdev->dev, "resetting MSI\n"); - pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE); - } - } - - pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); - if (pos) { - pci_read_config_word(pdev, msi_control_reg(pos), &control); - if (control & PCI_MSIX_FLAGS_ENABLE) { - dev_info(&pdev->dev, "resetting MSI-X\n"); - pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE); - } - } - - return 0; -} - static int cciss_controller_hard_reset(struct pci_dev *pdev, void * __iomem vaddr, bool use_doorbell) { @@ -4445,17 +4415,17 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, * states or using the doorbell register. */ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) { - u16 saved_config_space[32]; u64 cfg_offset; u32 cfg_base_addr; u64 cfg_base_addr_index; void __iomem *vaddr; unsigned long paddr; u32 misc_fw_support, active_transport; - int rc, i; + int rc; CfgTable_struct __iomem *cfgtable; bool use_doorbell; u32 board_id; + u16 command_register; /* For controllers as old a the p600, this is very nearly * the same thing as @@ -4465,14 +4435,6 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) * pci_set_power_state(pci_dev, PCI_D0); * pci_restore_state(pci_dev); * - * but we can't use these nice canned kernel routines on - * kexec, because they also check the MSI/MSI-X state in PCI - * configuration space and do the wrong thing when it is - * set/cleared. Also, the pci_save/restore_state functions - * violate the ordering requirements for restoring the - * configuration space from the CCISS document (see the - * comment below). So we roll our own .... - * * For controllers newer than the P600, the pci power state * method of resetting doesn't work so we have another way * using the doorbell register. @@ -4491,8 +4453,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) return -ENODEV; } - for (i = 0; i < 32; i++) - pci_read_config_word(pdev, 2*i, &saved_config_space[i]); + /* Save the PCI command register */ + pci_read_config_word(pdev, 4, &command_register); + /* Turn the board off. This is so that later pci_restore_state() + * won't turn the board on before the rest of config space is ready. + */ + pci_disable_device(pdev); + pci_save_state(pdev); /* find the first memory BAR, so we can find the cfg table */ rc = cciss_pci_find_memory_BAR(pdev, &paddr); @@ -4527,21 +4494,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); if (rc) goto unmap_cfgtable; - - /* Restore the PCI configuration space. The Open CISS - * Specification says, "Restore the PCI Configuration - * Registers, offsets 00h through 60h. It is important to - * restore the command register, 16-bits at offset 04h, - * last. Do not restore the configuration status register, - * 16-bits at offset 06h." Note that the offset is 2*i. - */ - for (i = 0; i < 32; i++) { - if (i == 2 || i == 3) - continue; - pci_write_config_word(pdev, 2*i, saved_config_space[i]); + pci_restore_state(pdev); + rc = pci_enable_device(pdev); + if (rc) { + dev_warn(&pdev->dev, "failed to enable device.\n"); + goto unmap_cfgtable; } - wmb(); - pci_write_config_word(pdev, 4, saved_config_space[2]); + pci_write_config_word(pdev, 4, command_register); /* Some devices (notably the HP Smart Array 5i Controller) need a little pause here */ @@ -4601,8 +4560,6 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) return 0; /* just try to do the kdump anyhow. */ if (rc) return -ENODEV; - if (cciss_reset_msi(pdev)) - return -ENODEV; /* Now try to get the controller to respond to a no-op */ for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { From 186fb9cf6a1154bc9b071adfd72fcf256285eb26 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Fri, 22 Oct 2010 14:21:17 -0500 Subject: [PATCH 13/27] cciss: limit commands allocated on reset_devices This is to conserve memory in a memory-limited kdump scenario Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 2e547bddc5a..9e761b994b4 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4079,6 +4079,11 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h) static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) { h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands)); + + /* Limit commands in memory limited kdump scenario. */ + if (reset_devices && h->max_commands > 32) + h->max_commands = 32; + if (h->max_commands < 16) { dev_warn(&h->pdev->dev, "Controller reports " "max supported commands of %d, an obvious lie. " From 332c2f80a894d349bfb95fae00daf74477d4afcd Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Fri, 22 Oct 2010 14:21:22 -0500 Subject: [PATCH 14/27] cciss: use usleep_range not msleep for small sleeps Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9e761b994b4..c792a6080d5 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3785,7 +3785,7 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h) for (i = 0; i < MAX_CONFIG_WAIT; i++) { if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) break; - msleep(10); + usleep_range(10000, 20000); } } From 4205df34003eec4371020872cdfa228ffae5bd6a Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Sat, 23 Oct 2010 18:47:31 +0200 Subject: [PATCH 15/27] cciss: remove controllers supported by hpsa We would prefer not to have any overlap between the two drivers. Remove the cciss_allow_hpsa option, as it it is no longer needed. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 45 ++++--------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index c792a6080d5..39631cbccaf 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -66,11 +66,6 @@ MODULE_VERSION("3.6.26"); MODULE_LICENSE("GPL"); static DEFINE_MUTEX(cciss_mutex); -static int cciss_allow_hpsa; -module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(cciss_allow_hpsa, - "Prevent cciss driver from accessing hardware known to be " - " supported by the hpsa driver"); #include "cciss_cmd.h" #include "cciss.h" @@ -98,19 +93,6 @@ static const struct pci_device_id cciss_pci_device_id[] = { {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3350}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3351}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3352}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3353}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3354}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3355}, {0,} }; @@ -131,6 +113,8 @@ static struct board_type products[] = { {0x409D0E11, "Smart Array 6400 EM", &SA5_access}, {0x40910E11, "Smart Array 6i", &SA5_access}, {0x3225103C, "Smart Array P600", &SA5_access}, + {0x3223103C, "Smart Array P800", &SA5_access}, + {0x3234103C, "Smart Array P400", &SA5_access}, {0x3235103C, "Smart Array P400i", &SA5_access}, {0x3211103C, "Smart Array E200i", &SA5_access}, {0x3212103C, "Smart Array E200", &SA5_access}, @@ -138,24 +122,7 @@ static struct board_type products[] = { {0x3214103C, "Smart Array E200i", &SA5_access}, {0x3215103C, "Smart Array E200i", &SA5_access}, {0x3237103C, "Smart Array E500", &SA5_access}, -/* controllers below this line are also supported by the hpsa driver. */ -#define HPSA_BOUNDARY 0x3223103C - {0x3223103C, "Smart Array P800", &SA5_access}, - {0x3234103C, "Smart Array P400", &SA5_access}, - {0x323D103C, "Smart Array P700m", &SA5_access}, - {0x3241103C, "Smart Array P212", &SA5_access}, - {0x3243103C, "Smart Array P410", &SA5_access}, - {0x3245103C, "Smart Array P410i", &SA5_access}, - {0x3247103C, "Smart Array P411", &SA5_access}, - {0x3249103C, "Smart Array P812", &SA5_access}, - {0x324A103C, "Smart Array P712m", &SA5_access}, - {0x324B103C, "Smart Array P711m", &SA5_access}, - {0x3350103C, "Smart Array", &SA5_access}, - {0x3351103C, "Smart Array", &SA5_access}, - {0x3352103C, "Smart Array", &SA5_access}, - {0x3353103C, "Smart Array", &SA5_access}, - {0x3354103C, "Smart Array", &SA5_access}, - {0x3355103C, "Smart Array", &SA5_access}, + {0x323d103c, "Smart Array P700M", &SA5_access}, }; /* How long to wait (in milliseconds) for board to go into simple mode */ @@ -3969,13 +3936,9 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id) *board_id = ((subsystem_device_id << 16) & 0xffff0000) | subsystem_vendor_id; - for (i = 0; i < ARRAY_SIZE(products); i++) { - /* Stand aside for hpsa driver on request */ - if (cciss_allow_hpsa && products[i].board_id == HPSA_BOUNDARY) - return -ENODEV; + for (i = 0; i < ARRAY_SIZE(products); i++) if (*board_id == products[i].board_id) return i; - } dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", *board_id); return -ENODEV; From 9284bcf4e335e5f18a8bc7b26461c33ab60d0689 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 08:10:18 -0600 Subject: [PATCH 16/27] block: check for proper length of iov entries in blk_rq_map_user_iov() Ensure that we pass down properly validated iov segments before calling into the mapping or copy functions. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-map.c b/block/blk-map.c index d4a586d8691..5d5dbe47c22 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,6 +205,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; break; } + if (!iov[i].iov_len) + return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) From 9f864c80913467312c7b8690e41fb5ebd1b50e92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:31:42 -0600 Subject: [PATCH 17/27] block: take care not to overflow when calculating total iov length Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10eb5b..4f4230b79bb 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->iovec_count) { const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *iov; + struct sg_iovec *sg_iov; + struct iovec *iov; + int i; - iov = kmalloc(size, GFP_KERNEL); - if (!iov) { + sg_iov = kmalloc(size, GFP_KERNEL); + if (!sg_iov) { ret = -ENOMEM; goto out; } - if (copy_from_user(iov, hdr->dxferp, size)) { - kfree(iov); + if (copy_from_user(sg_iov, hdr->dxferp, size)) { + kfree(sg_iov); ret = -EFAULT; goto out; } + /* + * Sum up the vecs, making sure they don't overflow + */ + iov = (struct iovec *) sg_iov; + iov_data_len = 0; + for (i = 0; i < hdr->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) { + kfree(sg_iov); + ret = -EINVAL; + goto out; + } + iov_data_len += iov[i].iov_len; + } + /* SG_IO howto says that the shorter of the two wins */ - iov_data_len = iov_length((struct iovec *)iov, - hdr->iovec_count); if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten((struct iovec *)iov, + hdr->iovec_count = iov_shorten(iov, hdr->iovec_count, hdr->dxfer_len); iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(iov); + kfree(sg_iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); From f3f63c1c28bc861a931fac283b5bc3585efb8967 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:46:56 -0600 Subject: [PATCH 18/27] block: limit vec count in bio_kmalloc() and bio_alloc_map_data() Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/bio.c b/fs/bio.c index 8abb2dfb2e7..8317a2c106b 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; From cb4644cac4a2797afc847e6c92736664d4b0ea34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Nov 2010 14:36:25 +0100 Subject: [PATCH 19/27] bio: take care not overflow page count when mapping/copying user data If the iovec is being set up in a way that causes uaddr + PAGE_SIZE to overflow, we could end up attempting to map a huge number of pages. Check for this invalid input type. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/bio.c b/fs/bio.c index 8317a2c106b..4bd454fa844 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -834,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -962,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -989,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { From 90fdb0b98a62d78a0650b9fd3ddc58a48f71d740 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Nov 2010 14:29:13 +0100 Subject: [PATCH 20/27] cciss: fix proc warning on attempt to remove non-existant directory Randy reports that he gets the following stack trace when removing the cciss module: [ 109.164277] Pid: 3463, comm: rmmod Not tainted 2.6.37-rc1 #7 [ 109.164280] Call Trace: [ 109.164292] [] warn_slowpath_common+0xc6/0xf3 [ 109.164299] [] warn_slowpath_fmt+0x5b/0x6b [ 109.164307] [] ? _raw_spin_unlock+0x40/0x4b [ 109.164313] [] remove_proc_entry+0x156/0x35e [ 109.164320] [] ? do_raw_spin_unlock+0xff/0x10f [ 109.164327] [] ? trace_hardirqs_on+0x10/0x4a [ 109.164333] [] ? _raw_spin_unlock_irq+0x4c/0x7b [ 109.164339] [] ? wait_for_common+0x145/0x15e [ 109.164345] [] ? default_wake_function+0x0/0x22 [ 109.164357] [] cciss_cleanup+0xa9/0xc7 [cciss] [ 109.164365] [] sys_delete_module+0x2d6/0x368 [ 109.164371] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 109.164377] [] ? audit_syscall_entry+0x172/0x1a5 [ 109.164383] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 109.164389] [] system_call_fastpath+0x16/0x1b [ 109.164394] ---[ end trace 88e8568246ed0b1d ]--- which will happen if you don't actually have an HP CISS adapter, since it'll do an uncondional removal of a proc directory it never attempted to create in that case. Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 2cc4dda4627..2cdbc247d0a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4936,7 +4936,8 @@ static void __exit cciss_cleanup(void) } } kthread_stop(cciss_scan_thread); - remove_proc_entry("driver/cciss", NULL); + if (proc_cciss) + remove_proc_entry("driver/cciss", NULL); bus_unregister(&cciss_bus_type); } From 77304d2abac6101f7249754ffdd4421258877ab0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Nov 2010 14:39:12 +0100 Subject: [PATCH 21/27] block: read i_size with i_size_read() Convert direct reads of an inode's i_size to using i_size_read(). i_size_{read,write} use a seqcount to protect reads from accessing incomple writes. Concurrent i_size_write()s require mutual exclussion to protect the seqcount that is used by i_size_{read,write}. But i_size_read() callers do not need to use additional locking. Signed-off-by: Mike Snitzer Acked-by: NeilBrown Acked-by: Lars Ellenberg Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/compat_ioctl.c | 4 ++-- block/ioctl.c | 6 +++--- drivers/block/drbd/drbd_int.h | 2 +- drivers/md/md.c | 20 ++++++++++---------- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f0834e2f572..17fcb83670c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1351,7 +1351,7 @@ static void handle_bad_sector(struct bio *bio) bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } @@ -1404,7 +1404,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { sector_t sector = bio->bi_sector; diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b74dc..58c6ee5b010 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -744,13 +744,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return compat_put_ulong(arg, size >> 9); case BLKGETSIZE64_32: - return compat_put_u64(arg, bdev->bd_inode->i_size); + return compat_put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESETUP32: case BLKTRACESTART: /* compatible */ diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d46..38aa194f63e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -125,7 +125,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, start >>= 9; len >>= 9; - if (start + len > (bdev->bd_inode->i_size >> 9)) + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; if (secure) flags |= BLKDEV_DISCARD_SECURE; @@ -307,12 +307,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ret = blkdev_reread_part(bdev); break; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return put_ulong(arg, size >> 9); case BLKGETSIZE64: - return put_u64(arg, bdev->bd_inode->i_size); + return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9bdcf4393c0..2637f499f77 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1874,7 +1874,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) static inline sector_t drbd_get_capacity(struct block_device *bdev) { /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ - return bdev ? bdev->bd_inode->i_size >> 9 : 0; + return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; } /** diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e957f3140a..324a3663fcd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -706,7 +706,7 @@ static struct mdk_personality *find_pers(int level, char *clevel) /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct block_device *bdev) { - sector_t num_sectors = bdev->bd_inode->i_size / 512; + sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -1386,7 +1386,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) */ switch(minor_version) { case 0: - sb_start = rdev->bdev->bd_inode->i_size >> 9; + sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; sb_start -= 8*2; sb_start &= ~(sector_t)(4*2-1); break; @@ -1472,7 +1472,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) ret = 0; } if (minor_version) - rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - + rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - le64_to_cpu(sb->data_offset); else rdev->sectors = rdev->sb_start; @@ -1680,7 +1680,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ - max_sectors = rdev->bdev->bd_inode->i_size >> 9; + max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; max_sectors -= rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; @@ -1690,7 +1690,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) } else { /* minor version 0; superblock after data */ sector_t sb_start; - sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; + sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) @@ -2584,7 +2584,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (!sectors) return -EBUSY; } else if (!sectors) - sectors = (rdev->bdev->bd_inode->i_size >> 9) - + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - rdev->data_offset; } if (sectors < my_mddev->dev_sectors) @@ -2797,7 +2797,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING "md: %s has zero or unknown size, marking faulty!\n", @@ -5235,8 +5235,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!mddev->persistent) { printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - } else + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); rdev->sectors = rdev->sb_start; @@ -5306,7 +5306,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) if (mddev->persistent) rdev->sb_start = calc_dev_sboffset(rdev->bdev); else - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; rdev->sectors = rdev->sb_start; From a014741c0adfb8fb79952939ca087cf03d272bb9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 8 Nov 2010 14:42:40 +0100 Subject: [PATCH 22/27] block: ioctl: fix information leak to userland Structure hd_geometry is copied to userland with 4 padding bytes between cylinders and start fields uninitialized on 64-bit platforms. It leads to leaking of contents of kernel stack memory. Currently there is no memset() in real implementations of getgeo() in drivers/block/, so it makes sense to have memset() in blkdev_ioctl(). Signed-off-by: Vasiliy Kulikov Signed-off-by: Jens Axboe --- block/ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/ioctl.c b/block/ioctl.c index 38aa194f63e..3d866d0037f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -242,6 +242,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, * We need to set the startsect first, the driver may * want to override it. */ + memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) From 1447399b3e34af016c368b4178db7ef0e04e15b0 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 9 Nov 2010 21:33:02 +0100 Subject: [PATCH 23/27] ioprio: fix RCU locking around task dereference With 2.6.37-rc1, I observe sys_ioprio_set not taking the RCU lock [1] across access to the task credentials. Inspecting the code in fs/ioprio.c, the tasklist_lock is held for read across the __task_cred call, which is presumably sufficient to prevent the task credentials becoming stale. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:419 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by start-stop-daem/2246: #0: (tasklist_lock){.?.?..}, at: [] sys_ioprio_set+0x8a/0x400 stack backtrace: Pid: 2246, comm: start-stop-daem Not tainted 2.6.37-rc1-330cd+ #2 Call Trace: [] lockdep_rcu_dereference+0xa4/0xc0 [] find_task_by_pid_ns+0x81/0x90 [] find_task_by_vpid+0x1d/0x20 [] sys_ioprio_set+0x3f0/0x400 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Take the RCU lock for read across acquiring the pointer to the task credentials and dereferencing it. Signed-off-by: Daniel J Blueman Fixed up by Jens to fix missing rcu_read_unlock() on mismatches. Signed-off-by: Jens Axboe --- fs/ioprio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb92dcc..8def14e24c3 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -139,7 +139,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != who) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == who; + rcu_read_unlock(); + if (!match) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -232,7 +237,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != user->uid) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == user->uid; + rcu_read_unlock(); + if (!match) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) From f85acd81aa623e3dcf268c90e5cd8ecf36830984 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 9 Nov 2010 21:26:56 +0100 Subject: [PATCH 24/27] ioprio: rcu_read_lock/unlock protect find_task_by_vpid call (V2) Commit 4221a9918e38b7494cee341dda7b7b4bb8c04bde "Add RCU check for find_task_by_vpid()" introduced rcu_lockdep_assert to find_task_by_pid_ns= Assertion failed in sys_ioprio_get. The patch is fixing assertion failure in ioprio_set as well. kernel/pid.c:419 invoked rcu_dereference_check() without protection! stack backtrace: Pid: 4254, comm: iotop Not tainted Call Trace: [] lockdep_rcu_dereference+0xaa/0xb2 [] find_task_by_pid_ns+0x4f/0x68 [] find_task_by_vpid+0x1d/0x1f [] sys_ioprio_get+0x50/0x2da [] system_call_fastpath+0x16/0x1b V2: rcu critical section expanded according to comment by Paul E. McKenney Signed-off-by: Sergey Senozhatsky Acked-by: Paul E. McKenney Signed-off-by: Jens Axboe --- fs/ioprio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ioprio.c b/fs/ioprio.c index 8def14e24c3..2f7d05c8992 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -205,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = get_task_ioprio(p); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: [PATCH 25/27] block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/elevator.c | 4 ++-- drivers/block/aoe/aoeblk.c | 3 --- drivers/block/loop.c | 6 ------ drivers/block/xen-blkfront.c | 2 -- drivers/scsi/scsi_error.c | 18 +++++------------- drivers/usb/storage/uas.c | 5 +---- include/linux/bio.h | 4 ---- include/linux/blk_types.h | 6 ++---- include/linux/blkdev.h | 3 +-- kernel/trace/blktrace.c | 4 ---- 11 files changed, 11 insertions(+), 51 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 17fcb83670c..4ce953f1b39 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1194,13 +1194,6 @@ static int __make_request(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; int rw_flags; - /* REQ_HARDBARRIER is no more */ - if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, - "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/elevator.c b/block/elevator.c index 282e8308f7e..2569512830d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; + stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -691,7 +691,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 541e1887996..528f6318ded 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -180,9 +180,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) BUG(); bio_endio(bio, -ENXIO); return 0; - } else if (bio->bi_rw & REQ_HARDBARRIER) { - bio_endio(bio, -EOPNOTSUPP); - return 0; } else if (bio->bi_io_vec == NULL) { printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); BUG(); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1e5284ef65f..7ea0bea2f7e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -481,12 +481,6 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) if (bio_rw(bio) == WRITE) { struct file *file = lo->lo_backing_file; - /* REQ_HARDBARRIER is deprecated */ - if (bio->bi_rw & REQ_HARDBARRIER) { - ret = -EOPNOTSUPP; - goto out; - } - if (bio->bi_rw & REQ_FLUSH) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 06e2812ba12..255035cfc88 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -289,8 +289,6 @@ static int blkif_queue_request(struct request *req) ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & REQ_HARDBARRIER) - ring_req->operation = BLKIF_OP_WRITE_BARRIER; ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 1de30eb83bb..f3cf924a2cd 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -320,19 +320,11 @@ static int scsi_check_sense(struct scsi_cmnd *scmd) "changed. The Linux SCSI layer does not " "automatically adjust these parameters.\n"); - if (scmd->request->cmd_flags & REQ_HARDBARRIER) - /* - * barrier requests should always retry on UA - * otherwise block will get a spurious error - */ - return NEEDS_RETRY; - else - /* - * for normal (non barrier) commands, pass the - * UA upwards for a determination in the - * completion functions - */ - return SUCCESS; + /* + * Pass the UA upwards for a determination in the completion + * functions. + */ + return SUCCESS; /* these three are not supported */ case COPY_ABORTED: diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 2054b1e25a6..d1268191acb 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -331,10 +331,7 @@ static struct urb *uas_alloc_cmd_urb(struct uas_dev_info *devinfo, gfp_t gfp, iu->iu_id = IU_ID_COMMAND; iu->tag = cpu_to_be16(stream_id); - if (sdev->ordered_tags && (cmnd->request->cmd_flags & REQ_HARDBARRIER)) - iu->prio_attr = UAS_ORDERED_TAG; - else - iu->prio_attr = UAS_SIMPLE_TAG; + iu->prio_attr = UAS_SIMPLE_TAG; iu->len = len; int_to_scsilun(sdev->lun, &iu->lun); memcpy(iu->cdb, cmnd->cmnd, cmnd->cmd_len); diff --git a/include/linux/bio.h b/include/linux/bio.h index ba679992d39..35dcdb3589b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -66,10 +66,6 @@ #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_empty_barrier(bio) \ - ((bio->bi_rw & REQ_HARDBARRIER) && \ - !bio_has_data(bio) && \ - !(bio->bi_rw & REQ_DISCARD)) static inline unsigned int bio_cur_bytes(struct bio *bio) { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0437ab6bb54..46ad5197537 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -122,7 +122,6 @@ enum rq_flag_bits { __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_HARDBARRIER, /* may not be passed by drive either */ __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_DISCARD, /* request to discard sectors */ @@ -159,7 +158,6 @@ enum rq_flag_bits { #define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) #define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) #define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) -#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) #define REQ_SYNC (1 << __REQ_SYNC) #define REQ_META (1 << __REQ_META) #define REQ_DISCARD (1 << __REQ_DISCARD) @@ -168,8 +166,8 @@ enum rq_flag_bits { #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ - (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \ - REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) + (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_DISCARD | \ + REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define REQ_CLONE_MASK REQ_COMMON_MASK #define REQ_UNPLUG (1 << __REQ_UNPLUG) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5027a599077..aae86fd10c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -552,8 +552,7 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \ - REQ_FLUSH | REQ_FUA) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) From 17a9e7bbae178d1326e4631ab6350a272349c99d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 11 Nov 2010 12:09:59 +0100 Subject: [PATCH 26/27] Documentation: remove anticipatory scheduler info Remove anticipatory block I/O scheduler info from Documentation/ since the code has been deleted. Signed-off-by: Randy Dunlap Reported-by: "Robert P. J. Day" Cc: Jens Axboe Signed-off-by: Jens Axboe --- Documentation/block/switching-sched.txt | 8 ++++---- Documentation/kernel-parameters.txt | 2 +- Documentation/rbtree.txt | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt index d5af3f63081..71cfbdc0f74 100644 --- a/Documentation/block/switching-sched.txt +++ b/Documentation/block/switching-sched.txt @@ -16,7 +16,7 @@ you can do so by typing: As of the Linux 2.6.10 kernel, it is now possible to change the IO scheduler for a given block device on the fly (thus making it possible, for instance, to set the CFQ scheduler for the system default, but -set a specific device to use the anticipatory or noop schedulers - which +set a specific device to use the deadline or noop schedulers - which can improve that device's throughput). To set a specific scheduler, simply do this: @@ -31,7 +31,7 @@ a "cat /sys/block/DEV/queue/scheduler" - the list of valid names will be displayed, with the currently selected scheduler in brackets: # cat /sys/block/hda/queue/scheduler -noop anticipatory deadline [cfq] -# echo anticipatory > /sys/block/hda/queue/scheduler +noop deadline [cfq] +# echo deadline > /sys/block/hda/queue/scheduler # cat /sys/block/hda/queue/scheduler -noop [anticipatory] deadline cfq +noop [deadline] cfq diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ed45e9802aa..92e83e53148 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -706,7 +706,7 @@ and is between 256 and 4096 characters. It is defined in the file arch/x86/kernel/cpu/cpufreq/elanfreq.c. elevator= [IOSCHED] - Format: {"anticipatory" | "cfq" | "deadline" | "noop"} + Format: {"cfq" | "deadline" | "noop"} See Documentation/block/as-iosched.txt and Documentation/block/deadline-iosched.txt for details. diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 221f38be98f..19f8278c385 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -21,8 +21,8 @@ three rotations, respectively, to balance the tree), with slightly slower To quote Linux Weekly News: There are a number of red-black trees in use in the kernel. - The anticipatory, deadline, and CFQ I/O schedulers all employ - rbtrees to track requests; the packet CD/DVD driver does the same. + The deadline and CFQ I/O schedulers employ rbtrees to + track requests; the packet CD/DVD driver does the same. The high-resolution timer code uses an rbtree to organize outstanding timer requests. The ext3 filesystem tracks directory entries in a red-black tree. Virtual memory areas (VMAs) are tracked with red-black From cedb4a7d9f6aedb0dce94d6285b69dcb3c10fa05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Nov 2010 13:37:54 +0100 Subject: [PATCH 27/27] block: remove unused copy_io_context() Reported-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/blk-ioc.c | 14 -------------- include/linux/iocontext.h | 1 - 2 files changed, 15 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c55c40..3c7a339fe38 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -153,20 +153,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) } EXPORT_SYMBOL(get_io_context); -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_long_read(&src->refcount) == 0); - atomic_long_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 3e70b21884a..b2eee896dcb 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -76,7 +76,6 @@ int put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); -void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else static inline void exit_io_context(struct task_struct *task) {