Merge branch 'for-3.16/drivers' of git://git.kernel.dk/linux-block into next

Pull block driver changes from Jens Axboe: "Now that the core bits are in, here's the pull request for the driver related changes for 3.16. Nothing out of the ordinary here, mostly business as usual. There are a few pulls of for-3.16/core into this branch, which were done when the blk-mq was modified after the mtip32xx conversion was put in. The pull request contains: - skd and cciss converted to use pci_enable_msix_exact(). From Alexander Gordeev. - A few mtip32xx fixes from Asai @ Micron. - The conversion of mtip32xx from make_request_fn to blk-mq, and a later small fix for that conversion on quiescing for non-queued IO. From me. - A fix for bsg to use an exported function to check whether this driver is request based or not. Needed updating for blk-mq, which is request based, but does not have a request_fn hook. From me. - Small floppy bug fix from Jiri. - A series of cleanups for the cdrom uniform layer from Joe Perches. Gets rid of various old ugly macros, making the code conform more to the modern coding style. - A series of patches for drbd from the drbd crew (Lars Ellenberg and Philipp Reisner). - A use-after-free fix for null_blk from Ming Lei. - Also from Ming Lei is a performance patch for virtio-blk, which can net us a 3x win on kvm platforms where world notification is expensive. - Ming Lei also fixed a stall issue in virtio-blk, due to a race between queue start/stop and resource limits. - A small batch of fixes for xen-blk{back,front} from Olaf Hering and Valentin Priescu" * 'for-3.16/drivers' of git://git.kernel.dk/linux-block: (54 commits) block: virtio_blk: don't hold spin lock during world switch xen-blkback: defer freeing blkif to avoid blocking xenwatch xen blkif.h: fix comment typo in discard-alignment xen/blkback: disable discard feature if requested by toolstack xen-blkfront: remove type check from blkfront_setup_discard floppy: do not corrupt bio.bi_flags when reading block 0 mtip32xx: move error handling to service thread virtio_blk: fix race between start and stop queue mtip32xx: stop block hardware queues before quiescing IO mtip32xx: blk_mq_init_queue() returns an ERR_PTR mtip32xx: convert to use blk-mq cdrom: Remove unnecessary prototype for cdrom_get_disc_info cdrom: Remove unnecessary prototype for cdrom_mrw_exit cdrom: Remove cdrom_count_tracks prototype cdrom: Remove cdrom_get_next_writeable prototype cdrom: Remove cdrom_get_last_written prototype cdrom: Move mmc_ioctls above cdrom_ioctl to remove unnecessary prototype cdrom: Remove unnecessary sanitize_format prototype cdrom: Remove unnecessary check_for_audio_disc prototype cdrom: Remove prototype for open_for_data ...
2014-06-02 13:57:01 -07:00 · 2014-06-02 13:57:01 -07:00 · 80081ec309
--- a/block/bsg.c
+++ b/block/bsg.c
@ -1008,7 +1008,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 	/*
 	 * we need a proper transport to send commands, not a stacked device
 	 */
-	if (!q->request_fn)
+	if (!queue_is_rq_based(q))
 		return 0;

 	bcd = &q->bsg_dev;
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
 		goto default_int_mode;

 	if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
-		err = pci_enable_msix(h->pdev, cciss_msix_entries, 4);
+		err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
 		if (!err) {
 			h->intr[0] = cciss_msix_entries[0].vector;
 			h->intr[1] = cciss_msix_entries[1].vector;
@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
 			h->intr[3] = cciss_msix_entries[3].vector;
 			h->msix_vector = 1;
 			return;
-		}
-		if (err > 0) {
-			dev_warn(&h->pdev->dev,
-				"only %d MSI-X vectors available\n", err);
 		} else {
 			dev_warn(&h->pdev->dev,
 				"MSI-X init failed %d\n", err);
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@ -29,7 +29,6 @@
 #include <linux/drbd_limits.h>
 #include <linux/dynamic_debug.h>
 #include "drbd_int.h"
-#include "drbd_wrappers.h"


 enum al_transaction_types {
@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd

 	BUG_ON(!bdev->md_bdev);

-	drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+	dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
 	     current->comm, current->pid, __func__,
 	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
 	     (void*)_RET_IP_ );
@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
 	return _al_get(device, first, true);
 }

-static
 bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
 {
 	/* for bios crossing activity log extent boundaries,
@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
 	int wake_up = 0;
 	unsigned long flags;

-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
 	if (size == 0)
 		return 0;

-	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "sector: %llus, size: %d\n",
 			(unsigned long long)sector, size);
 		return 0;
@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
 	unsigned int enr = BM_SECT_TO_EXT(sector);
 	struct bm_extent *bm_ext;
 	int i, sig;
-	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
-			 200 times -> 20 seconds. */
+	bool sa;

 retry:
 	sig = wait_event_interruptible(device->al_wait,
@ -1035,12 +1032,15 @@ retry:
 	if (test_bit(BME_LOCKED, &bm_ext->flags))
 		return 0;

+	/* step aside only while we are above c-min-rate; unless disabled. */
+	sa = drbd_rs_c_min_rate_throttle(device);
+
 	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
 		sig = wait_event_interruptible(device->al_wait,
 					       !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
-					       test_bit(BME_PRIORITY, &bm_ext->flags));
+					       (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));

-		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
+		if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
 			spin_lock_irq(&device->al_lock);
 			if (lc_put(device->resync, &bm_ext->lce) == 0) {
 				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
@ -1052,9 +1052,6 @@ retry:
 				return -EINTR;
 			if (schedule_timeout_interruptible(HZ/10))
 				return -EINTR;
-			if (sa && --sa == 0)
-				drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec."
-					 "Resync stalled?\n");
 			goto retry;
 		}
 	}
@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
 	sector_t esector, nr_sectors;
 	int wake_up = 0;

-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@ -382,6 +382,12 @@ enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_MAY_SET_IN_SYNC,

+	/* is this a TRIM aka REQ_DISCARD? */
+	__EE_IS_TRIM,
+	/* our lower level cannot handle trim,
+	 * and we want to fall back to zeroout instead */
+	__EE_IS_TRIM_USE_ZEROOUT,
+
 	/* In case a barrier failed,
 	 * we need to resubmit without the barrier flag. */
 	__EE_RESUBMITTED,
@ -405,7 +411,9 @@ enum {
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
-#define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_IS_TRIM             (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
 #define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
@ -579,6 +587,7 @@ struct drbd_resource {
 	struct list_head resources;
 	struct res_opts res_opts;
 	struct mutex conf_update;	/* mutex for ready-copy-update of net_conf and disk_conf */
+	struct mutex adm_mutex;		/* mutex to serialize administrative requests */
 	spinlock_t req_lock;

 	unsigned susp:1;		/* IO suspended by user */
@ -609,6 +618,7 @@ struct drbd_connection {
 	struct drbd_socket data;	/* data/barrier/cstate/parameter packets */
 	struct drbd_socket meta;	/* ping/ack (metadata) packets */
 	int agreed_pro_version;		/* actually used protocol version */
+	u32 agreed_features;
 	unsigned long last_received;	/* in jiffies, either socket */
 	unsigned int ko_count;

@ -814,6 +824,28 @@ struct drbd_device {
 	struct submit_worker submit;
 };

+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED		(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
 static inline struct drbd_device *minor_to_device(unsigned int minor)
 {
 	return (struct drbd_device *)idr_find(&drbd_devices, minor);
@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor)

 static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
 {
-	return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
+	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
 }

 #define for_each_resource(resource, _resources) \
@ -1139,6 +1171,12 @@ struct bm_extent {
 #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
 #define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */

+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE	AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
 extern int  drbd_bm_init(struct drbd_device *device);
 extern int  drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
 extern void drbd_bm_cleanup(struct drbd_device *device);
@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 extern rwlock_t global_state_lock;

 extern int conn_lowest_minor(struct drbd_connection *connection);
-enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
 extern void drbd_destroy_device(struct kref *kref);
-extern void drbd_delete_device(struct drbd_device *mdev);
+extern void drbd_delete_device(struct drbd_device *device);

 extern struct drbd_resource *drbd_create_resource(const char *name);
 extern void drbd_free_resource(struct drbd_resource *resource);
@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);


 /* drbd_nl.c */
-extern int drbd_msg_put_info(const char *info);
+extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
 extern void drbd_suspend_io(struct drbd_device *device);
 extern void drbd_resume_io(struct drbd_device *device);
 extern char *ppsize(char *buf, unsigned long long size);
@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
 extern int drbd_khelper(struct drbd_device *device, char *cmd);

 /* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
 extern int drbd_worker(struct drbd_thread *thi);
 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
 void drbd_resync_after_changed(struct drbd_device *device);
@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 extern void start_resync_timer_fn(unsigned long data);

+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
-extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
 extern int drbd_submit_peer_request(struct drbd_device *,
 				    struct drbd_peer_request *, const unsigned,
 				    const int);
 extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
 extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
 						     sector_t, unsigned int,
+						     bool,
 						     gfp_t) __must_hold(local);
 extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
 				 int);
@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock)
 			(char*)&val, sizeof(val));
 }

+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+					sector_t size)
+{
+	/* set_capacity(device->this_bdev->bd_disk, size); */
+	set_capacity(device->vdisk, size);
+	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       device_to_minor(device));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (drbd_insert_fault(device, fault_type))
+		bio_endio(bio, -EIO);
+	else
+		generic_make_request(bio);
+}
+
 void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);

 /* drbd_proc.c */
@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s);
 extern const char *drbd_role_str(enum drbd_role s);

 /* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device)

 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
-	return list_first_entry(&resource->connections,
+	return list_first_entry_or_null(&resource->connections,
 				struct drbd_connection, connections);
 }

--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b
 		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
 }

-/* Used to send write requests
- * R_PRIMARY -> Peer	(P_DATA)
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer	(P_DATA, P_TRIM)
 */
 int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
 {
@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 			dp_flags |= DP_SEND_WRITE_ACK;
 	}
 	p->dp_flags = cpu_to_be32(dp_flags);
+
+	if (dp_flags & DP_DISCARD) {
+		struct p_trim *t = (struct p_trim*)p;
+		t->size = cpu_to_be32(req->i.size);
+		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+		goto out;
+	}
+
+	/* our digest is still only over the payload.
+	 * TRIM does not carry any payload. */
 	if (dgs)
 		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
 	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 		     ... Be noisy about digest too large ...
 		} */
 	}
+out:
 	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */

 	return err;
@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	INIT_LIST_HEAD(&resource->connections);
 	list_add_tail_rcu(&resource->resources, &drbd_resources);
 	mutex_init(&resource->conf_update);
+	mutex_init(&resource->adm_mutex);
 	spin_lock_init(&resource->req_lock);
 	return resource;

@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device)
 	return 0;
 }

-enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr)
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
 {
+	struct drbd_resource *resource = adm_ctx->resource;
 	struct drbd_connection *connection;
 	struct drbd_device *device;
 	struct drbd_peer_device *peer_device, *tmp_peer_device;
 	struct gendisk *disk;
 	struct request_queue *q;
 	int id;
+	int vnr = adm_ctx->volume;
 	enum drbd_ret_code err = ERR_NOMEM;

 	device = minor_to_device(minor);
@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 	if (id < 0) {
 		if (id == -ENOSPC) {
 			err = ERR_MINOR_EXISTS;
-			drbd_msg_put_info("requested minor exists already");
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
 		}
 		goto out_no_minor_idr;
 	}
@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 	if (id < 0) {
 		if (id == -ENOSPC) {
 			err = ERR_MINOR_EXISTS;
-			drbd_msg_put_info("requested minor exists already");
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
 		}
 		goto out_idr_remove_minor;
 	}
@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 		if (id < 0) {
 			if (id == -ENOSPC) {
 				err = ERR_INVALID_REQUEST;
-				drbd_msg_put_info("requested volume exists already");
+				drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
 			}
 			goto out_idr_remove_from_resource;
 		}
@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i

 	if (init_submitter(device)) {
 		err = ERR_NOMEM;
-		drbd_msg_put_info("unable to create submit workqueue");
+		drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
 		goto out_idr_remove_vol;
 	}

--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
--- a/drivers/block/drbd/drbd_nla.c
+++ b/drivers/block/drbd/drbd_nla.c
@ -1,4 +1,3 @@
-#include "drbd_wrappers.h"
 #include <linux/kernel.h>
 #include <net/netlink.h>
 #include <linux/drbd_genl_api.h>
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	/* ------------------------ ~18s average ------------------------ */
 	i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
 	dt = (jiffies - device->rs_mark_time[i]) / HZ;
-	if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+	if (dt > 180)
 		stalled = 1;

 	if (!dt)
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@ -54,6 +54,11 @@ enum drbd_packet {
 	P_CONN_ST_CHG_REPLY   = 0x2b, /* meta sock: Connection side state req reply */
 	P_RETRY_WRITE	      = 0x2c, /* Protocol C: retry conflicting write request */
 	P_PROTOCOL_UPDATE     = 0x2d, /* data sock: is used in established connections */
+        /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+	/* REQ_DISCARD. We used "discard" in different contexts before,
+	 * which is why I chose TRIM here, to disambiguate. */
+	P_TRIM                = 0x31,

 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
@ -119,6 +124,11 @@ struct p_data {
 	u32	    dp_flags;
 } __packed;

+struct p_trim {
+	struct p_data p_data;
+	u32	    size;	/* == bio->bi_size */
+} __packed;
+
 /*
 * commands which share a struct:
 *  p_block_ack:
@ -150,6 +160,8 @@ struct p_block_req {
 *   ReportParams
 */

+#define FF_TRIM      1
+
 struct p_connection_features {
 	u32 protocol_min;
 	u32 feature_flags;
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@ -46,9 +46,10 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
-
 #include "drbd_vli.h"

+#define PRO_FEATURES (FF_TRIM)
+
 struct packet_info {
 	enum drbd_packet cmd;
 	unsigned int size;
@ -65,7 +66,7 @@ enum finish_epoch {
 static int drbd_do_features(struct drbd_connection *connection);
 static int drbd_do_auth(struct drbd_connection *connection);
 static int drbd_disconnected(struct drbd_peer_device *);
-
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_work *, int);

@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
 * @retry:	whether to retry, if not enough pages are available right now
 *
 * Tries to allocate number pages, first from our own page pool, then from
- * the kernel, unless this allocation would exceed the max_buffers setting.
+ * the kernel.
 * Possibly retry until DRBD frees sufficient pages somewhere else.
 *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
 * Returns a page chain linked via page->private.
 */
 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
 	struct page *page = NULL;
 	struct net_conf *nc;
 	DEFINE_WAIT(wait);
-	int mxb;
+	unsigned int mxb;

-	/* Yes, we may run up to @number over max_buffers. If we
-	 * follow it strictly, the admin will get it wrong anyways. */
 	rcu_read_lock();
 	nc = rcu_dereference(peer_device->connection->net_conf);
 	mxb = nc ? nc->max_buffers : 1000000;
@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
 			break;
 		}

-		schedule();
+		if (schedule_timeout(HZ/10) == 0)
+			mxb = UINT_MAX;
 	}
 	finish_wait(&drbd_pp_wait, &wait);

@ -331,7 +339,7 @@ You must not have the req_lock:

 struct drbd_peer_request *
 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-		    unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
+		    unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_peer_request *peer_req;
@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 		return NULL;
 	}

-	if (data_size) {
+	if (has_payload && data_size) {
 		page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
 		if (!page)
 			goto fail;
@ -1026,24 +1034,27 @@ randomize:
 	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
 		return -1;

+	/* Prevent a race between resync-handshake and
+	 * being promoted to Primary.
+	 *
+	 * Grab and release the state mutex, so we know that any current
+	 * drbd_set_role() is finished, and any incoming drbd_set_role
+	 * will see the STATE_SENT flag, and wait for it to be cleared.
+	 */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_lock(peer_device->device->state_mutex);
+
 	set_bit(STATE_SENT, &connection->flags);

+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_unlock(peer_device->device->state_mutex);
+
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
 		kref_get(&device->kref);
 		rcu_read_unlock();

-		/* Prevent a race between resync-handshake and
-		 * being promoted to Primary.
-		 *
-		 * Grab and release the state mutex, so we know that any current
-		 * drbd_set_role() is finished, and any incoming drbd_set_role
-		 * will see the STATE_SENT flag, and wait for it to be cleared.
-		 */
-		mutex_lock(device->state_mutex);
-		mutex_unlock(device->state_mutex);
-
 		if (discard_my_data)
 			set_bit(DISCARD_MY_DATA, &device->flags);
 		else
@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device,
 	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
 	int err = -ENOMEM;

+	if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+		/* wait for all pending IO completions, before we start
+		 * zeroing things out. */
+		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+			sector, ds >> 9, GFP_NOIO))
+			peer_req->flags |= EE_WAS_ERROR;
+		drbd_endio_write_sec_final(peer_req);
+		return 0;
+	}
+
+	if (peer_req->flags & EE_IS_TRIM)
+		nr_pages = 0; /* discards don't have any payload. */
+
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 next_bio:
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	if (!bio) {
-		drbd_err(device, "submit_ee: Allocation of a bio failed\n");
+		drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
 		goto fail;
 	}
 	/* > peer_req->i.sector, unless this is the first bio */
@ -1340,6 +1365,11 @@ next_bio:
 	bios = bio;
 	++n_bios;

+	if (rw & REQ_DISCARD) {
+		bio->bi_iter.bi_size = ds;
+		goto submit;
+	}
+
 	page_chain_for_each(page) {
 		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
 		if (!bio_add_page(bio, page, len, 0)) {
@ -1360,8 +1390,9 @@ next_bio:
 		sector += len >> 9;
 		--nr_pages;
 	}
-	D_ASSERT(device, page == NULL);
 	D_ASSERT(device, ds == 0);
+submit:
+	D_ASSERT(device, page == NULL);

 	atomic_set(&peer_req->pending_bios, n_bios);
 	do {
@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 * and from receive_Data */
 static struct drbd_peer_request *
 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-	      int data_size) __must_hold(local)
+	      struct packet_info *pi) __must_hold(local)
 {
 	struct drbd_device *device = peer_device->device;
 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
 	struct drbd_peer_request *peer_req;
 	struct page *page;
 	int dgs, ds, err;
+	int data_size = pi->size;
 	void *dig_in = peer_device->connection->int_dig_in;
 	void *dig_vv = peer_device->connection->int_dig_vv;
 	unsigned long *data;
+	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;

 	dgs = 0;
-	if (peer_device->connection->peer_integrity_tfm) {
+	if (!trim && peer_device->connection->peer_integrity_tfm) {
 		dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
 		/*
 		 * FIXME: Receive the incoming digest into the receive buffer
@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 		data_size -= dgs;
 	}

+	if (trim) {
+		D_ASSERT(peer_device, data_size == 0);
+		data_size = be32_to_cpu(trim->size);
+	}
+
 	if (!expect(IS_ALIGNED(data_size, 512)))
 		return NULL;
-	if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+	/* prepare for larger trim requests. */
+	if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
 		return NULL;

 	/* even though we trust out peer,
@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO);
+	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
 	if (!peer_req)
 		return NULL;

-	if (!data_size)
+	if (trim)
 		return peer_req;

 	ds = data_size;
@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused)
 }

 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
-			    int data_size) __releases(local)
+			    struct packet_info *pi) __releases(local)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_peer_request *peer_req;

-	peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size);
+	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
 	if (!peer_req)
 		goto fail;

@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	list_add(&peer_req->w.list, &device->sync_ee);
 	spin_unlock_irq(&device->resource->req_lock);

-	atomic_add(data_size >> 9, &device->rs_sect_ev);
+	atomic_add(pi->size >> 9, &device->rs_sect_ev);
 	if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
 		return 0;

@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
 		/* data is submitted to disk within recv_resync_read.
 		 * corresponding put_ldev done below on error,
 		 * or in drbd_peer_request_endio. */
-		err = recv_resync_read(peer_device, sector, pi->size);
+		err = recv_resync_read(peer_device, sector, pi);
 	} else {
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_err(device, "Can not write resync data to local disk.\n");
@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 */

 	sector = be64_to_cpu(p->sector);
-	peer_req = read_in_block(peer_device, p->block_id, sector, pi->size);
+	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
 	if (!peer_req) {
 		put_ldev(device);
 		return -EIO;
@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *

 	dp_flags = be32_to_cpu(p->dp_flags);
 	rw |= wire_flags_to_bio(dp_flags);
-	if (peer_req->pages == NULL) {
+	if (pi->cmd == P_TRIM) {
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+		peer_req->flags |= EE_IS_TRIM;
+		if (!blk_queue_discard(q))
+			peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, rw & REQ_DISCARD);
+		D_ASSERT(peer_device, peer_req->pages == NULL);
+	} else if (peer_req->pages == NULL) {
 		D_ASSERT(device, peer_req->i.size == 0);
 		D_ASSERT(device, dp_flags & DP_FLUSH);
 	}
@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 		update_peer_seq(peer_device, peer_seq);
 		spin_lock_irq(&device->resource->req_lock);
 	}
-	list_add(&peer_req->w.list, &device->active_ee);
+	/* if we use the zeroout fallback code, we process synchronously
+	 * and we wait for all pending requests, respectively wait for
+	 * active_ee to become empty in drbd_submit_peer_request();
+	 * better not add ourselves here. */
+	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+		list_add(&peer_req->w.list, &device->active_ee);
 	spin_unlock_irq(&device->resource->req_lock);

 	if (device->state.conn == C_SYNC_TARGET)
@ -2313,14 +2365,33 @@ out_interrupted:
 * The current sync rate used here uses only the most recent two step marks,
 * to have a short time average so we can react faster.
 */
-int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+{
+	struct lc_element *tmp;
+	bool throttle = true;
+
+	if (!drbd_rs_c_min_rate_throttle(device))
+		return false;
+
+	spin_lock_irq(&device->al_lock);
+	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags))
+			throttle = false;
+		/* Do not slow down if app IO is already waiting for this extent */
+	}
+	spin_unlock_irq(&device->al_lock);
+
+	return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 {
 	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
 	unsigned long db, dt, dbdt;
-	struct lc_element *tmp;
-	int curr_events;
-	int throttle = 0;
 	unsigned int c_min_rate;
+	int curr_events;

 	rcu_read_lock();
 	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
@ -2328,24 +2399,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)

 	/* feature disabled? */
 	if (c_min_rate == 0)
-		return 0;
-
-	spin_lock_irq(&device->al_lock);
-	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
-	if (tmp) {
-		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
-		if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
-			spin_unlock_irq(&device->al_lock);
-			return 0;
-		}
-		/* Do not slow down if app IO is already waiting for this extent */
-	}
-	spin_unlock_irq(&device->al_lock);
+		return false;

 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&device->rs_sect_ev);
-
 	if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
 		dbdt = Bit2KB(db/dt);

 		if (dbdt > c_min_rate)
-			throttle = 1;
+			return true;
 	}
-	return throttle;
+	return false;
 }

-
 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;
@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO);
+	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+			true /* has real payload */, GFP_NOIO);
 	if (!peer_req) {
 		put_ldev(device);
 		return -ENOMEM;
@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 		put_ldev(device);
 	}

+	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+	drbd_reconsider_max_bio_size(device);
+	/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+	   drbd_reconsider_max_bio_size(), we can be sure that after
+	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(device)) {
 		dd = drbd_determine_dev_size(device, ddsf, NULL);
@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 		drbd_set_my_capacity(device, p_size);
 	}

-	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-	drbd_reconsider_max_bio_size(device);
-
 	if (get_ldev(device)) {
 		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
 			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = {
 	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
 	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
 	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
 };

 static void drbdd(struct drbd_connection *connection)
@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection)
 	memset(p, 0, sizeof(*p));
 	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
 	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	p->feature_flags = cpu_to_be32(PRO_FEATURES);
 	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
 }

@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection)
 		goto incompat;

 	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);

 	drbd_info(connection, "Handshake successful: "
 	     "Agreed network protocol version %d\n", connection->agreed_pro_version);

+	drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+		  connection->agreed_features & FF_TRIM ? " " : " not ");
+
 	return 1;

 incompat:
@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}

+	if (pi.size < CHALLENGE_LEN) {
+		drbd_err(connection, "AuthChallenge payload too small.\n");
+		rv = -1;
+		goto fail;
+	}
+
 	peers_ch = kmalloc(pi.size, GFP_NOIO);
 	if (peers_ch == NULL) {
 		drbd_err(connection, "kmalloc of peers_ch failed\n");
@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}

+	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+		drbd_err(connection, "Peer presented the same challenge!\n");
+		rv = -1;
+		goto fail;
+	}
+
 	resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
 	response = kmalloc(resp_size, GFP_NOIO);
 	if (response == NULL) {
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
 		break;

+	case DISCARD_COMPLETED_NOTSUPP:
+	case DISCARD_COMPLETED_WITH_ERROR:
+		/* I'd rather not detach from local disk just because it
+		 * failed a REQ_DISCARD. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
 	case QUEUE_FOR_NET_READ:
 		/* READ or READA, and
 		 * no local disk,
@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws)
 		if (list_empty(&incoming))
 			break;

+skip_fast_path:
 		wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
 		/* Maybe more was queued, while we prepared the transaction?
 		 * Try to stuff them into this transaction as well.
@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws)
 			list_del_init(&req->tl_requests);
 			drbd_send_and_submit(device, req);
 		}
+
+		/* If all currently hot activity log extents are kept busy by
+		 * incoming requests, we still must not totally starve new
+		 * requests to cold extents. In that case, prepare one request
+		 * in blocking mode. */
+		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+			list_del_init(&req->tl_requests);
+			req->rq_state |= RQ_IN_ACT_LOG;
+			if (!drbd_al_begin_io_prepare(device, &req->i)) {
+				/* Corresponding extent was hot after all? */
+				drbd_send_and_submit(device, req);
+			} else {
+				/* Found a request to a cold extent.
+				 * Put on "pending" list,
+				 * and try to cumulate with more. */
+				list_add(&req->tl_requests, &pending);
+				goto skip_fast_path;
+			}
+		}
 	}
 }

@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	return limit;
 }

-static struct drbd_request *find_oldest_request(struct drbd_connection *connection)
+static void find_oldest_requests(
+		struct drbd_connection *connection,
+		struct drbd_device *device,
+		struct drbd_request **oldest_req_waiting_for_peer,
+		struct drbd_request **oldest_req_waiting_for_disk)
 {
-	/* Walk the transfer log,
-	 * and find the oldest not yet completed request */
 	struct drbd_request *r;
+	*oldest_req_waiting_for_peer = NULL;
+	*oldest_req_waiting_for_disk = NULL;
 	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-		if (atomic_read(&r->completion_ref))
-			return r;
+		const unsigned s = r->rq_state;
+		if (!*oldest_req_waiting_for_peer
+		&& ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
+			*oldest_req_waiting_for_peer = r;
+
+		if (!*oldest_req_waiting_for_disk
+		&& (s & RQ_LOCAL_PENDING) && r->device == device)
+			*oldest_req_waiting_for_disk = r;
+
+		if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
+			break;
 	}
-	return NULL;
 }

 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 	struct drbd_connection *connection = first_peer_device(device)->connection;
-	struct drbd_request *req; /* oldest request */
+	struct drbd_request *req_disk, *req_peer; /* oldest request */
 	struct net_conf *nc;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data)
 	now = jiffies;

 	spin_lock_irq(&device->resource->req_lock);
-	req = find_oldest_request(connection);
-	if (!req) {
+	find_oldest_requests(connection, device, &req_peer, &req_disk);
+	if (req_peer == NULL && req_disk == NULL) {
 		spin_unlock_irq(&device->resource->req_lock);
 		mod_timer(&device->request_timer, now + et);
 		return;
@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data)
 	 * ~198 days with 250 HZ, we have a window where the timeout would need
 	 * to expire twice (worst case) to become effective. Good enough.
 	 */
-	if (ent && req->rq_state & RQ_NET_PENDING &&
-		 time_after(now, req->start_time + ent) &&
+	if (ent && req_peer &&
+		 time_after(now, req_peer->start_time + ent) &&
 		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
 		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
 		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
 	}
-	if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device &&
-		 time_after(now, req->start_time + dt) &&
+	if (dt && req_disk &&
+		 time_after(now, req_disk->start_time + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
 		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
 		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
 	}
-	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
+
+	/* Reschedule timer for the nearest not already expired timeout.
+	 * Fallback to now + min(effective network timeout, disk timeout). */
+	ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
+		? req_peer->start_time + ent : now + et;
+	dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
+		? req_disk->start_time + dt : now + et;
+	nt = time_before(ent, dt) ? ent : dt;
 	spin_unlock_irq(&connection->resource->req_lock);
 	mod_timer(&device->request_timer, nt);
 }
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@ -30,7 +30,6 @@
 #include <linux/slab.h>
 #include <linux/drbd.h>
 #include "drbd_int.h"
-#include "drbd_wrappers.h"

 /* The request callbacks will be called in irq context by the IDE drivers,
   and in Softirqs/Tasklets/BH context by the SCSI drivers,
@ -111,11 +110,14 @@ enum drbd_req_event {
 	BARRIER_ACKED, /* in protocol A and B */
 	DATA_RECEIVED, /* (remote read) */

+	COMPLETED_OK,
 	READ_COMPLETED_WITH_ERROR,
 	READ_AHEAD_COMPLETED_WITH_ERROR,
 	WRITE_COMPLETED_WITH_ERROR,
+	DISCARD_COMPLETED_NOTSUPP,
+	DISCARD_COMPLETED_WITH_ERROR,
+
 	ABORT_DISK_IO,
-	COMPLETED_OK,
 	RESEND,
 	FAIL_FROZEN_DISK_IO,
 	RESTART_FROZEN_DISK_IO,
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
 static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
 static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns,
-				       enum sanitize_state_warnings *warn);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn);

 static inline bool is_susp(union drbd_state s)
 {
@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask,

 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	os = drbd_read_state(device);
-	ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
 	rv = is_valid_transition(os, ns);
 	if (rv >= SS_SUCCESS)
 		rv = SS_UNKNOWN_ERROR;  /* cont waiting, otherwise fail. */
@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask,

 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	os = drbd_read_state(device);
-	ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
 	rv = is_valid_transition(os, ns);
 	if (rv < SS_SUCCESS) {
 		spin_unlock_irqrestore(&device->resource->req_lock, flags);
@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st
 * When we loose connection, we have to set the state of the peers disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns,
-				       enum sanitize_state_warnings *warn)
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
 {
 	enum drbd_fencing_p fp;
 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st
 	}

 	if (fp == FP_STONITH &&
-	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED))
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
 		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

 	if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
-	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
 		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

 	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,

 	os = drbd_read_state(device);

-	ns = sanitize_state(device, ns, &ssw);
+	ns = sanitize_state(device, os, ns, &ssw);
 	if (ns.i == os.i)
 		return SS_NOTHING_TO_DO;

@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
 		os = drbd_read_state(device);
-		ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+		ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);

 		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
 			ns.disk = os.disk;
@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
 		number_of_volumes++;
 		os = drbd_read_state(device);
 		ns = apply_mask_val(os, mask, val);
-		ns = sanitize_state(device, ns, NULL);
+		ns = sanitize_state(device, os, ns, NULL);

 		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
 			ns.disk = os.disk;
@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
 static enum drbd_state_rv
 _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
 {
-	enum drbd_state_rv rv;
+	enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;

 	if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
-		return SS_CW_SUCCESS;
+		rv = SS_CW_SUCCESS;

 	if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
-		return SS_CW_FAILED_BY_PEER;
+		rv = SS_CW_FAILED_BY_PEER;

-	rv = conn_is_valid_transition(connection, mask, val, 0);
-	if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
-		rv = SS_UNKNOWN_ERROR; /* continue waiting */
+	err = conn_is_valid_transition(connection, mask, val, 0);
+	if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+		return rv;

-	return rv;
+	return err;
 }

 enum drbd_state_rv
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele

 /* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
-static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
 {
 	unsigned long flags = 0;
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel

 	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);

-	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+	/* FIXME do we want to detach for failed REQ_DISCARD?
+	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+	if (peer_req->flags & EE_WAS_ERROR)
 		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
 	spin_unlock_irqrestore(&device->resource->req_lock, flags);

@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error)
 	struct drbd_device *device = peer_req->peer_device->device;
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 	int is_write = bio_data_dir(bio) == WRITE;
+	int is_discard = !!(bio->bi_rw & REQ_DISCARD);

 	if (error && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
-				is_write ? "write" : "read", error,
+				is_write ? (is_discard ? "discard" : "write")
+					: "read", error,
 				(unsigned long long)peer_req->i.sector);
 	if (!error && !uptodate) {
 		if (__ratelimit(&drbd_ratelimit_state))
@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error)

 	/* to avoid recursion in __req_mod */
 	if (unlikely(error)) {
-		what = (bio_data_dir(bio) == WRITE)
+		if (bio->bi_rw & REQ_DISCARD)
+			what = (error == -EOPNOTSUPP)
+				? DISCARD_COMPLETED_NOTSUPP
+				: DISCARD_COMPLETED_WITH_ERROR;
+		else
+			what = (bio_data_dir(bio) == WRITE)
 			? WRITE_COMPLETED_WITH_ERROR
 			: (bio_rw(bio) == READ)
 			  ? READ_COMPLETED_WITH_ERROR
@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
-				       size, GFP_TRY);
+				       size, true /* has real payload */, GFP_TRY);
 	if (!peer_req)
 		goto defer;

@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 	return fb;
 }

-static int drbd_rs_controller(struct drbd_device *device)
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
 	struct disk_conf *dc;
-	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
 	unsigned int want;     /* The number of sectors we want in the proxy */
 	int req_sect; /* Number of sectors to request in this turn */
 	int correction; /* Number of sectors more we need in the proxy*/
@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device)
 	int max_sect;
 	struct fifo_buffer *plan;

-	sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
-	device->rs_in_flight -= sect_in;
-
 	dc = rcu_dereference(device->ldev->disk_conf);
 	plan = rcu_dereference(device->rs_plan_s);

@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device)

 static int drbd_rs_number_requests(struct drbd_device *device)
 {
-	int number;
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	int number, mxb;
+
+	sect_in = atomic_xchg(&device->rs_sect_in, 0);
+	device->rs_in_flight -= sect_in;

 	rcu_read_lock();
+	mxb = drbd_get_max_buffers(device) / 2;
 	if (rcu_dereference(device->rs_plan_s)->size) {
-		number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
+		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
 		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
 	} else {
 		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	}
 	rcu_read_unlock();

-	/* ignore the amount of pending requests, the resync controller should
-	 * throttle down to incoming reply rate soon enough anyways. */
+	/* Don't have more than "max-buffers"/2 in-flight.
+	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+	 * potentially causing a distributed deadlock on congestion during
+	 * online-verify or (checksum-based) resync, if max-buffers,
+	 * socket buffer sizes and resync rate settings are mis-configured. */
+	if (mxb - device->rs_in_flight < number)
+		number = mxb - device->rs_in_flight;
+
 	return number;
 }

@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel)

 	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
 	number = drbd_rs_number_requests(device);
-	if (number == 0)
+	if (number <= 0)
 		goto requeue;

 	for (i = 0; i < number; i++) {
@ -647,7 +663,7 @@ next_sector:
 		 */
 		align = 1;
 		rollback_i = i;
-		for (;;) {
+		while (i < number) {
 			if (size + BM_BLOCK_SIZE > max_bio_size)
 				break;

@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	}
 	clear_bit(B_RS_H_DONE, &device->flags);

-	write_lock_irq(&global_state_lock);
+	/* req_lock: serialize with drbd_send_and_submit() and others
+	 * global_state_lock: for stable sync-after dependencies */
+	spin_lock_irq(&device->resource->req_lock);
+	write_lock(&global_state_lock);
 	/* Did some connection breakage or IO error race with us? */
 	if (device->state.conn < C_CONNECTED
 	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
-		write_unlock_irq(&global_state_lock);
+		write_unlock(&global_state_lock);
+		spin_unlock_irq(&device->resource->req_lock);
 		mutex_unlock(device->state_mutex);
 		return;
 	}
@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		}
 		_drbd_pause_after(device);
 	}
-	write_unlock_irq(&global_state_lock);
+	write_unlock(&global_state_lock);
+	spin_unlock_irq(&device->resource->req_lock);

 	if (r == SS_SUCCESS) {
 		/* reset rs_last_bcast when a resync or verify is started,
@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	mutex_unlock(device->state_mutex);
 }

-/* If the resource already closed the current epoch, but we did not
- * (because we have not yet seen new requests), we should send the
- * corresponding barrier now.  Must be checked within the same spinlock
- * that is used to check for new requests. */
-static bool need_to_send_barrier(struct drbd_connection *connection)
-{
-	if (!connection->send.seen_any_write_yet)
-		return false;
-
-	/* Skip barriers that do not contain any writes.
-	 * This may happen during AHEAD mode. */
-	if (!connection->send.current_epoch_writes)
-		return false;
-
-	/* ->req_lock is held when requests are queued on
-	 * connection->sender_work, and put into ->transfer_log.
-	 * It is also held when ->current_tle_nr is increased.
-	 * So either there are already new requests queued,
-	 * and corresponding barriers will be send there.
-	 * Or nothing new is queued yet, so the difference will be 1.
-	 */
-	if (atomic_read(&connection->current_tle_nr) !=
-	    connection->send.current_epoch_nr + 1)
-		return false;
-
-	return true;
-}
-
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
 	spin_lock_irq(&queue->q_lock);
@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 			spin_unlock_irq(&connection->resource->req_lock);
 			break;
 		}
-		send_barrier = need_to_send_barrier(connection);
+
+		/* We found nothing new to do, no to-be-communicated request,
+		 * no other work item.  We may still need to close the last
+		 * epoch.  Next incoming request epoch will be connection ->
+		 * current transfer log epoch number.  If that is different
+		 * from the epoch of the last request we communicated, it is
+		 * safe to send the epoch separating barrier now.
+		 */
+		send_barrier =
+			atomic_read(&connection->current_tle_nr) !=
+			connection->send.current_epoch_nr;
 		spin_unlock_irq(&connection->resource->req_lock);
-		if (send_barrier) {
-			drbd_send_barrier(connection);
-			connection->send.current_epoch_nr++;
-		}
+
+		if (send_barrier)
+			maybe_send_barrier(connection,
+					connection->send.current_epoch_nr + 1);
 		schedule();
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@ -1,54 +0,0 @@
-#ifndef _DRBD_WRAPPERS_H
-#define _DRBD_WRAPPERS_H
-
-#include <linux/ctype.h>
-#include <linux/mm.h>
-#include "drbd_int.h"
-
-/* see get_sb_bdev and bd_claim */
-extern char *drbd_sec_holder;
-
-/* sets the number of 512 byte sectors of our virtual device */
-static inline void drbd_set_my_capacity(struct drbd_device *device,
-					sector_t size)
-{
-	/* set_capacity(device->this_bdev->bd_disk, size); */
-	set_capacity(device->vdisk, size);
-	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
-}
-
-#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
-
-/* bi_end_io handlers */
-extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_peer_request_endio(struct bio *bio, int error);
-extern void drbd_request_endio(struct bio *bio, int error);
-
-/*
- * used to submit our private bio
- */
-static inline void drbd_generic_make_request(struct drbd_device *device,
-					     int fault_type, struct bio *bio)
-{
-	__release(local);
-	if (!bio->bi_bdev) {
-		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-				"bio->bi_bdev == NULL\n",
-		       device_to_minor(device));
-		dump_stack();
-		bio_endio(bio, -ENODEV);
-		return;
-	}
-
-	if (drbd_insert_fault(device, fault_type))
-		bio_endio(bio, -EIO);
-	else
-		generic_make_request(bio);
-}
-
-#ifndef __CHECKER__
-# undef __cond_lock
-# define __cond_lock(x,c) (c)
-#endif
-
-#endif
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@ -3812,7 +3812,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 	bio.bi_iter.bi_size = size;
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = 0;
-	bio.bi_flags = (1 << BIO_QUIET);
+	bio.bi_flags |= (1 << BIO_QUIET);
 	bio.bi_private = &cbdata;
 	bio.bi_end_io = floppy_rb0_cb;

--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@ -40,9 +40,11 @@
 #define MTIP_MAX_RETRIES	2

 /* Various timeout values in ms */
-#define MTIP_NCQ_COMMAND_TIMEOUT_MS       5000
-#define MTIP_IOCTL_COMMAND_TIMEOUT_MS     5000
-#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS  5000
+#define MTIP_NCQ_CMD_TIMEOUT_MS      15000
+#define MTIP_IOCTL_CMD_TIMEOUT_MS    5000
+#define MTIP_INT_CMD_TIMEOUT_MS      5000
+#define MTIP_QUIESCE_IO_TIMEOUT_MS   (MTIP_NCQ_CMD_TIMEOUT_MS * \
+				     (MTIP_MAX_RETRIES + 1))

 /* check for timeouts every 500ms */
 #define MTIP_TIMEOUT_CHECK_PERIOD	500
@ -331,12 +333,8 @@ struct mtip_cmd {
 	 */
 	void (*comp_func)(struct mtip_port *port,
 				int tag,
-				void *data,
+				struct mtip_cmd *cmd,
 				int status);
-	/* Additional callback function that may be called by comp_func() */
-	void (*async_callback)(void *data, int status);
-
-	void *async_data; /* Addl. data passed to async_callback() */

 	int scatter_ents; /* Number of scatter list entries used */

@ -347,10 +345,6 @@ struct mtip_cmd {
 	int retries; /* The number of retries left for this command. */

 	int direction; /* Data transfer direction */
-
-	unsigned long comp_time; /* command completion time, in jiffies */
-
-	atomic_t active; /* declares if this command sent to the drive. */
 };

 /* Structure used to describe a port. */
@ -436,12 +430,6 @@ struct mtip_port {
 	 * or error handling is active
 	 */
 	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
-	/*
-	 * Array of command slots. Structure includes pointers to the
-	 * command header and command table, and completion function and data
-	 * pointers.
-	 */
-	struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
 	/* Used by mtip_service_thread to wait for an event */
 	wait_queue_head_t svc_wait;
 	/*
@ -452,13 +440,7 @@ struct mtip_port {
 	/*
 	 * Timer used to complete commands that have been active for too long.
 	 */
-	struct timer_list cmd_timer;
 	unsigned long ic_pause_timer;
-	/*
-	 * Semaphore used to block threads if there are no
-	 * command slots available.
-	 */
-	struct semaphore cmd_slot;

 	/* Semaphore to control queue depth of unaligned IOs */
 	struct semaphore cmd_slot_unal;
@ -485,6 +467,8 @@ struct driver_data {

 	struct request_queue *queue; /* Our request queue. */

+	struct blk_mq_tag_set tags; /* blk_mq tags */
+
 	struct mtip_port *port; /* Pointer to the port data structure. */

 	unsigned product_type; /* magic value declaring the product type */
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@ -203,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
 		entry = llist_reverse_order(entry);
 		do {
 			cmd = container_of(entry, struct nullb_cmd, ll_list);
-			end_cmd(cmd);
 			entry = entry->next;
+			end_cmd(cmd);
 		} while (entry);
 	}

--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@ -3944,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev)
 	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
 		entries[i].entry = i;

-	rc = pci_enable_msix_range(pdev, entries,
-				   SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT);
-	if (rc < 0) {
+	rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
+	if (rc) {
 		pr_err("(%s): failed to enable MSI-X %d\n",
 		       skd_name(skdev), rc);
 		goto msix_out;
 	}

-	skdev->msix_count = rc;
+	skdev->msix_count = SKD_MAX_MSIX_COUNT;
 	skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
 				      skdev->msix_count, GFP_KERNEL);
 	if (!skdev->msix_entries) {
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@ -162,6 +162,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 	unsigned int num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
+	bool notify = false;

 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);

@ -214,10 +215,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}

-	if (last)
-		virtqueue_kick(vblk->vq);
-
+	if (last && virtqueue_kick_prepare(vblk->vq))
+		notify = true;
 	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+
+	if (notify)
+		virtqueue_notify(vblk->vq);
 	return BLK_MQ_RQ_QUEUE_OK;
 }

--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@ -314,7 +314,7 @@ struct xen_blkif {
 	unsigned long long			st_rd_sect;
 	unsigned long long			st_wr_sect;

-	wait_queue_head_t	waiting_to_free;
+	struct work_struct	free_work;
 	/* Thread shutdown wait queue. */
 	wait_queue_head_t	shutdown_wq;
 };
@ -361,7 +361,7 @@ struct pending_req {
 #define xen_blkif_put(_b)				\
 	do {						\
 		if (atomic_dec_and_test(&(_b)->refcnt))	\
-			wake_up(&(_b)->waiting_to_free);\
+			schedule_work(&(_b)->free_work);\
 	} while (0)

 struct phys_req {
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@ -35,12 +35,26 @@ static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
 static void backend_changed(struct xenbus_watch *, const char **,
 			    unsigned int);
+static void xen_blkif_free(struct xen_blkif *blkif);
+static void xen_vbd_free(struct xen_vbd *vbd);

 struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
 {
 	return be->dev;
 }

+/*
+ * The last request could free the device from softirq context and
+ * xen_blkif_free() can sleep.
+ */
+static void xen_blkif_deferred_free(struct work_struct *work)
+{
+	struct xen_blkif *blkif;
+
+	blkif = container_of(work, struct xen_blkif, free_work);
+	xen_blkif_free(blkif);
+}
+
 static int blkback_name(struct xen_blkif *blkif, char *buf)
 {
 	char *devpath, *devname;
@ -121,7 +135,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	init_completion(&blkif->drain_complete);
 	atomic_set(&blkif->drain, 0);
 	blkif->st_print = jiffies;
-	init_waitqueue_head(&blkif->waiting_to_free);
 	blkif->persistent_gnts.rb_node = NULL;
 	spin_lock_init(&blkif->free_pages_lock);
 	INIT_LIST_HEAD(&blkif->free_pages);
@ -132,6 +145,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);

 	INIT_LIST_HEAD(&blkif->pending_free);
+	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);

 	for (i = 0; i < XEN_BLKIF_REQS; i++) {
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
@ -231,7 +245,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
 	return 0;
 }

-static void xen_blkif_disconnect(struct xen_blkif *blkif)
+static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	if (blkif->xenblkd) {
 		kthread_stop(blkif->xenblkd);
@ -239,9 +253,12 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 		blkif->xenblkd = NULL;
 	}

-	atomic_dec(&blkif->refcnt);
-	wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
-	atomic_inc(&blkif->refcnt);
+	/* The above kthread_stop() guarantees that at this point we
+	 * don't have any discard_io or other_io requests. So, checking
+	 * for inflight IO is enough.
+	 */
+	if (atomic_read(&blkif->inflight) > 0)
+		return -EBUSY;

 	if (blkif->irq) {
 		unbind_from_irqhandler(blkif->irq, blkif);
@ -252,6 +269,8 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
 		blkif->blk_rings.common.sring = NULL;
 	}
+
+	return 0;
 }

 static void xen_blkif_free(struct xen_blkif *blkif)
@ -259,8 +278,8 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	struct pending_req *req, *n;
 	int i = 0, j;

-	if (!atomic_dec_and_test(&blkif->refcnt))
-		BUG();
+	xen_blkif_disconnect(blkif);
+	xen_vbd_free(&blkif->vbd);

 	/* Remove all persistent grants and the cache of ballooned pages. */
 	xen_blkbk_free_caches(blkif);
@ -449,16 +468,15 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 		be->backend_watch.node = NULL;
 	}

+	dev_set_drvdata(&dev->dev, NULL);
+
 	if (be->blkif) {
 		xen_blkif_disconnect(be->blkif);
-		xen_vbd_free(&be->blkif->vbd);
-		xen_blkif_free(be->blkif);
-		be->blkif = NULL;
+		xen_blkif_put(be->blkif);
 	}

 	kfree(be->mode);
 	kfree(be);
-	dev_set_drvdata(&dev->dev, NULL);
 	return 0;
 }

@ -481,10 +499,15 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info
 	struct xenbus_device *dev = be->dev;
 	struct xen_blkif *blkif = be->blkif;
 	int err;
-	int state = 0;
+	int state = 0, discard_enable;
 	struct block_device *bdev = be->blkif->vbd.bdev;
 	struct request_queue *q = bdev_get_queue(bdev);

+	err = xenbus_scanf(XBT_NIL, dev->nodename, "discard-enable", "%d",
+			   &discard_enable);
+	if (err == 1 && !discard_enable)
+		return;
+
 	if (blk_queue_discard(q)) {
 		err = xenbus_printf(xbt, dev->nodename,
 			"discard-granularity", "%u",
@ -700,7 +723,11 @@ static void frontend_changed(struct xenbus_device *dev,
 		 * Enforce precondition before potential leak point.
 		 * xen_blkif_disconnect() is idempotent.
 		 */
-		xen_blkif_disconnect(be->blkif);
+		err = xen_blkif_disconnect(be->blkif);
+		if (err) {
+			xenbus_dev_fatal(dev, err, "pending I/O");
+			break;
+		}

 		err = connect_ring(be);
 		if (err)
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@ -1635,36 +1635,24 @@ blkfront_closing(struct blkfront_info *info)
 static void blkfront_setup_discard(struct blkfront_info *info)
 {
 	int err;
-	char *type;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int discard_secure;

-	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
-	if (IS_ERR(type))
-		return;
-
-	info->feature_secdiscard = 0;
-	if (strncmp(type, "phy", 3) == 0) {
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			"discard-granularity", "%u", &discard_granularity,
-			"discard-alignment", "%u", &discard_alignment,
-			NULL);
-		if (!err) {
-			info->feature_discard = 1;
-			info->discard_granularity = discard_granularity;
-			info->discard_alignment = discard_alignment;
-		}
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "discard-secure", "%d", &discard_secure,
-			    NULL);
-		if (!err)
-			info->feature_secdiscard = discard_secure;
-
-	} else if (strncmp(type, "file", 4) == 0)
-		info->feature_discard = 1;
-
-	kfree(type);
+	info->feature_discard = 1;
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		"discard-granularity", "%u", &discard_granularity,
+		"discard-alignment", "%u", &discard_alignment,
+		NULL);
+	if (!err) {
+		info->discard_granularity = discard_granularity;
+		info->discard_alignment = discard_alignment;
+	}
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		    "discard-secure", "%d", &discard_secure,
+		    NULL);
+	if (!err)
+		info->feature_secdiscard = !!discard_secure;
 }

 static int blkfront_setup_indirect(struct blkfront_info *info)
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@ -620,6 +620,15 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)

 #define rq_data_dir(rq)		(((rq)->cmd_flags & 1) != 0)

+/*
+ * Driver can handle struct request, if it either has an old style
+ * request_fn defined, or is blk-mq based.
+ */
+static inline bool queue_is_rq_based(struct request_queue *q)
+{
+	return q->request_fn || q->mq_ops;
+}
+
 static inline unsigned int blk_queue_cluster(struct request_queue *q)
 {
 	return q->limits.cluster;
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@ -86,7 +86,7 @@ typedef uint64_t blkif_sector_t;
 *     Interface%20manuals/100293068c.pdf
 * The backend can optionally provide three extra XenBus attributes to
 * further optimize the discard functionality:
- * 'discard-aligment' - Devices that support discard functionality may
+ * 'discard-alignment' - Devices that support discard functionality may
 * internally allocate space in units that are bigger than the exported
 * logical block size. The discard-alignment parameter indicates how many bytes
 * the beginning of the partition is offset from the internal allocation unit's