drbd: fix potential data divergence after multiple failures

If we get an IO-error during an activity log transaction, if we failed to write the bitmap of the evicted extent, we must not write the transaction itself. If we failed to write the transaction, we must not even submit the corresponding bio, as its extent is not yet marked in the activity log. Otherwise, if this was a disconneted Primary (degraded cluster), which now lost its disk as well, and we later re-attach the same backend storage, we possibly "forget" to resync some parts of the disk that potentially have been changed. On the receiving side, when receiving from a peer with unhealthy disk, checking for pdsk == D_DISKLESS is not enough, we need to set out of sync and do AL transactions for everything pdsk < D_INCONSISTENT on the receiving side. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
2010-10-18 23:04:07 +02:00 · 2010-10-18 23:04:07 +02:00 · 6719fb036c
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@ -284,18 +284,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	u32 xor_sum = 0;

 	if (!get_ldev(mdev)) {
-		dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+		dev_err(DEV,
+			"disk is %s, cannot start al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
 		complete(&((struct update_al_work *)w)->event);
 		return 1;
 	}
 	/* do we have to do a bitmap write, first?
 	 * TODO reduce maximum latency:
 	 * submit both bios, then wait for both,
-	 * instead of doing two synchronous sector writes. */
+	 * instead of doing two synchronous sector writes.
+	 * For now, we must not write the transaction,
+	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
 		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);

-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+	/* The bitmap write may have failed, causing a state change. */
+	if (mdev->state.disk < D_INCONSISTENT) {
+		dev_err(DEV,
+			"disk is %s, cannot write al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
+
+	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
 	buffer = (struct al_transaction *)page_address(mdev->md_io_page);

 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@ -739,7 +753,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 	unsigned int enr;
 	unsigned long add = 0;
 	char ppb[10];
-	int i;
+	int i, tmp;

 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));

@ -747,7 +761,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
 		if (enr == LC_FREE)
 			continue;
-		add += drbd_bm_ALe_set_all(mdev, enr);
+		tmp = drbd_bm_ALe_set_all(mdev, enr);
+		dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+		add += tmp;
 	}

 	lc_unlock(mdev->act_log);
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@ -1995,10 +1995,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		break;
 	}

-	if (mdev->state.pdsk == D_DISKLESS) {
+	if (mdev->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
 		drbd_set_out_of_sync(mdev, e->sector, e->size);
 		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		e->flags &= ~EE_MAY_SET_IN_SYNC;
 		drbd_al_begin_io(mdev, e->sector);
 	}

--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@ -942,12 +942,21 @@ allocate_barrier:
 	if (local) {
 		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;

-		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-				     : rw == READ  ? DRBD_FAULT_DT_RD
-				     :               DRBD_FAULT_DT_RA))
+		/* State may have changed since we grabbed our reference on the
+		 * mdev->ldev member. Double check, and short-circuit to endio.
+		 * In case the last activity log transaction failed to get on
+		 * stable storage, and this is a WRITE, we may not even submit
+		 * this bio. */
+		if (get_ldev(mdev)) {
+			if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+					     : rw == READ  ? DRBD_FAULT_DT_RD
+					     :               DRBD_FAULT_DT_RA))
+				bio_endio(req->private_bio, -EIO);
+			else
+				generic_make_request(req->private_bio);
+			put_ldev(mdev);
+		} else
 			bio_endio(req->private_bio, -EIO);
-		else
-			generic_make_request(req->private_bio);
 	}

 	/* we need to plug ALWAYS since we possibly need to kick lo_dev.