2019-05-31 11:09:56 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2006-01-16 19:50:04 +03:00
|
|
|
/*
|
|
|
|
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
|
2008-04-29 21:35:48 +04:00
|
|
|
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
|
2006-01-16 19:50:04 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/buffer_head.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/delay.h>
|
2006-10-02 19:38:25 +04:00
|
|
|
#include <linux/bio.h>
|
2006-02-28 01:23:27 +03:00
|
|
|
#include <linux/gfs2_ondisk.h>
|
2006-01-16 19:50:04 +03:00
|
|
|
|
|
|
|
#include "gfs2.h"
|
2006-02-28 01:23:27 +03:00
|
|
|
#include "incore.h"
|
2006-01-16 19:50:04 +03:00
|
|
|
#include "glock.h"
|
|
|
|
#include "glops.h"
|
|
|
|
#include "inode.h"
|
|
|
|
#include "log.h"
|
|
|
|
#include "lops.h"
|
|
|
|
#include "meta_io.h"
|
|
|
|
#include "rgrp.h"
|
|
|
|
#include "trans.h"
|
2006-02-28 01:23:27 +03:00
|
|
|
#include "util.h"
|
2011-04-14 17:09:52 +04:00
|
|
|
#include "trace_gfs2.h"
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2009-04-20 11:16:26 +04:00
|
|
|
static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2009-04-20 11:16:26 +04:00
|
|
|
struct buffer_head *bh, *head;
|
|
|
|
int nr_underway = 0;
|
2016-11-01 19:00:38 +03:00
|
|
|
int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2009-04-20 11:16:26 +04:00
|
|
|
BUG_ON(!PageLocked(page));
|
|
|
|
BUG_ON(!page_has_buffers(page));
|
|
|
|
|
|
|
|
head = page_buffers(page);
|
|
|
|
bh = head;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (!buffer_mapped(bh))
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* If it's a fully non-blocking write attempt and we cannot
|
|
|
|
* lock the buffer then redirty the page. Note that this can
|
2012-07-25 19:12:13 +04:00
|
|
|
* potentially cause a busy-wait loop from flusher thread and kswapd
|
2009-04-20 11:16:26 +04:00
|
|
|
* activity, but those code paths have their own higher-level
|
|
|
|
* throttling.
|
|
|
|
*/
|
2010-10-27 01:21:26 +04:00
|
|
|
if (wbc->sync_mode != WB_SYNC_NONE) {
|
2009-04-20 11:16:26 +04:00
|
|
|
lock_buffer(bh);
|
|
|
|
} else if (!trylock_buffer(bh)) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (test_clear_buffer_dirty(bh)) {
|
|
|
|
mark_buffer_async_write(bh);
|
|
|
|
} else {
|
|
|
|
unlock_buffer(bh);
|
|
|
|
}
|
|
|
|
} while ((bh = bh->b_this_page) != head);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The page and its buffers are protected by PageWriteback(), so we can
|
|
|
|
* drop the bh refcounts early.
|
|
|
|
*/
|
|
|
|
BUG_ON(PageWriteback(page));
|
|
|
|
set_page_writeback(page);
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct buffer_head *next = bh->b_this_page;
|
|
|
|
if (buffer_async_write(bh)) {
|
2016-06-05 22:31:43 +03:00
|
|
|
submit_bh(REQ_OP_WRITE, write_flags, bh);
|
2009-04-20 11:16:26 +04:00
|
|
|
nr_underway++;
|
|
|
|
}
|
|
|
|
bh = next;
|
|
|
|
} while (bh != head);
|
|
|
|
unlock_page(page);
|
|
|
|
|
|
|
|
if (nr_underway == 0)
|
|
|
|
end_page_writeback(page);
|
|
|
|
|
2010-05-12 01:35:34 +04:00
|
|
|
return 0;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
|
|
|
|
2009-12-08 15:12:13 +03:00
|
|
|
const struct address_space_operations gfs2_meta_aops = {
|
2006-01-16 19:50:04 +03:00
|
|
|
.writepage = gfs2_aspace_writepage,
|
2006-07-11 17:46:33 +04:00
|
|
|
.releasepage = gfs2_releasepage,
|
2006-01-16 19:50:04 +03:00
|
|
|
};
|
|
|
|
|
2014-03-31 20:48:27 +04:00
|
|
|
const struct address_space_operations gfs2_rgrp_aops = {
|
|
|
|
.writepage = gfs2_aspace_writepage,
|
|
|
|
.releasepage = gfs2_releasepage,
|
|
|
|
};
|
|
|
|
|
2006-01-16 19:50:04 +03:00
|
|
|
/**
|
2008-05-21 20:03:22 +04:00
|
|
|
* gfs2_getbuf - Get a buffer with a given address space
|
2006-11-23 19:16:32 +03:00
|
|
|
* @gl: the glock
|
2006-01-16 19:50:04 +03:00
|
|
|
* @blkno: the block number (filesystem scope)
|
|
|
|
* @create: 1 if the buffer should be created
|
|
|
|
*
|
|
|
|
* Returns: the buffer
|
|
|
|
*/
|
|
|
|
|
2008-05-21 20:03:22 +04:00
|
|
|
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2009-12-08 15:12:13 +03:00
|
|
|
struct address_space *mapping = gfs2_glock2aspace(gl);
|
2015-03-16 19:52:05 +03:00
|
|
|
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
|
2006-01-16 19:50:04 +03:00
|
|
|
struct page *page;
|
|
|
|
struct buffer_head *bh;
|
|
|
|
unsigned int shift;
|
|
|
|
unsigned long index;
|
|
|
|
unsigned int bufnum;
|
|
|
|
|
2013-12-06 20:19:54 +04:00
|
|
|
if (mapping == NULL)
|
|
|
|
mapping = &sdp->sd_aspace;
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
|
2006-01-16 19:50:04 +03:00
|
|
|
index = blkno >> shift; /* convert block to page */
|
|
|
|
bufnum = blkno - (index << shift); /* block buf index within page */
|
|
|
|
|
|
|
|
if (create) {
|
|
|
|
for (;;) {
|
2006-11-23 19:16:32 +03:00
|
|
|
page = grab_cache_page(mapping, index);
|
2006-01-16 19:50:04 +03:00
|
|
|
if (page)
|
|
|
|
break;
|
|
|
|
yield();
|
|
|
|
}
|
|
|
|
} else {
|
2014-06-05 03:10:31 +04:00
|
|
|
page = find_get_page_flags(mapping, index,
|
|
|
|
FGP_LOCK|FGP_ACCESSED);
|
2006-01-16 19:50:04 +03:00
|
|
|
if (!page)
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!page_has_buffers(page))
|
|
|
|
create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
|
|
|
|
|
|
|
|
/* Locate header for our buffer within our page */
|
|
|
|
for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
|
|
|
|
/* Do nothing */;
|
|
|
|
get_bh(bh);
|
|
|
|
|
|
|
|
if (!buffer_mapped(bh))
|
|
|
|
map_bh(bh, sdp->sd_vfs, blkno);
|
|
|
|
|
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
|
|
|
return bh;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void meta_prep_new(struct buffer_head *bh)
|
|
|
|
{
|
|
|
|
struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
|
|
|
|
|
|
|
|
lock_buffer(bh);
|
|
|
|
clear_buffer_dirty(bh);
|
|
|
|
set_buffer_uptodate(bh);
|
|
|
|
unlock_buffer(bh);
|
|
|
|
|
|
|
|
mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* gfs2_meta_new - Get a block
|
|
|
|
* @gl: The glock associated with this block
|
|
|
|
* @blkno: The block number
|
|
|
|
*
|
|
|
|
* Returns: The buffer
|
|
|
|
*/
|
|
|
|
|
2006-09-04 20:49:07 +04:00
|
|
|
struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
|
|
|
struct buffer_head *bh;
|
2008-05-21 20:03:22 +04:00
|
|
|
bh = gfs2_getbuf(gl, blkno, CREATE);
|
2006-01-16 19:50:04 +03:00
|
|
|
meta_prep_new(bh);
|
|
|
|
return bh;
|
|
|
|
}
|
|
|
|
|
2015-11-13 16:44:57 +03:00
|
|
|
static void gfs2_meta_read_endio(struct bio *bio)
|
2015-11-12 00:00:35 +03:00
|
|
|
{
|
2015-11-13 16:44:57 +03:00
|
|
|
struct bio_vec *bvec;
|
2019-02-15 14:13:19 +03:00
|
|
|
struct bvec_iter_all iter_all;
|
2015-11-13 16:44:57 +03:00
|
|
|
|
2019-04-25 10:03:00 +03:00
|
|
|
bio_for_each_segment_all(bvec, bio, iter_all) {
|
2015-11-13 16:44:57 +03:00
|
|
|
struct page *page = bvec->bv_page;
|
|
|
|
struct buffer_head *bh = page_buffers(page);
|
|
|
|
unsigned int len = bvec->bv_len;
|
|
|
|
|
|
|
|
while (bh_offset(bh) < bvec->bv_offset)
|
|
|
|
bh = bh->b_this_page;
|
|
|
|
do {
|
|
|
|
struct buffer_head *next = bh->b_this_page;
|
|
|
|
len -= bh->b_size;
|
2017-06-03 10:38:06 +03:00
|
|
|
bh->b_end_io(bh, !bio->bi_status);
|
2015-11-13 16:44:57 +03:00
|
|
|
bh = next;
|
|
|
|
} while (bh && len);
|
|
|
|
}
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
2015-11-12 00:00:35 +03:00
|
|
|
|
2015-11-13 16:44:57 +03:00
|
|
|
/*
|
|
|
|
* Submit several consecutive buffer head I/O requests as a single bio I/O
|
|
|
|
* request. (See submit_bh_wbc.)
|
|
|
|
*/
|
2016-06-05 22:31:56 +03:00
|
|
|
static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
|
|
|
|
int num)
|
2015-11-13 16:44:57 +03:00
|
|
|
{
|
2016-08-16 14:25:22 +03:00
|
|
|
while (num > 0) {
|
|
|
|
struct buffer_head *bh = *bhs;
|
|
|
|
struct bio *bio;
|
|
|
|
|
|
|
|
bio = bio_alloc(GFP_NOIO, num);
|
|
|
|
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
|
2017-08-23 20:10:32 +03:00
|
|
|
bio_set_dev(bio, bh->b_bdev);
|
2016-08-16 14:25:22 +03:00
|
|
|
while (num > 0) {
|
|
|
|
bh = *bhs;
|
|
|
|
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
|
|
|
|
BUG_ON(bio->bi_iter.bi_size == 0);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
bhs++;
|
|
|
|
num--;
|
|
|
|
}
|
|
|
|
bio->bi_end_io = gfs2_meta_read_endio;
|
|
|
|
bio_set_op_attrs(bio, op, op_flags);
|
|
|
|
submit_bio(bio);
|
2015-11-12 00:00:35 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-01-16 19:50:04 +03:00
|
|
|
/**
|
|
|
|
* gfs2_meta_read - Read a block from disk
|
|
|
|
* @gl: The glock covering the block
|
|
|
|
* @blkno: The block number
|
2006-09-22 01:05:23 +04:00
|
|
|
* @flags: flags
|
2006-01-16 19:50:04 +03:00
|
|
|
* @bhp: the place where the buffer is returned (NULL on failure)
|
|
|
|
*
|
|
|
|
* Returns: errno
|
|
|
|
*/
|
|
|
|
|
2006-09-04 20:49:07 +04:00
|
|
|
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
|
2015-11-12 00:00:35 +03:00
|
|
|
int rahead, struct buffer_head **bhp)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2015-03-16 19:52:05 +03:00
|
|
|
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
|
2015-11-13 16:44:57 +03:00
|
|
|
struct buffer_head *bh, *bhs[2];
|
|
|
|
int num = 0;
|
2009-04-07 17:13:01 +04:00
|
|
|
|
gfs2: Force withdraw to replay journals and wait for it to finish
When a node withdraws from a file system, it often leaves its journal
in an incomplete state. This is especially true when the withdraw is
caused by io errors writing to the journal. Before this patch, a
withdraw would try to write a "shutdown" record to the journal, tell
dlm it's done with the file system, and none of the other nodes
know about the problem. Later, when the problem is fixed and the
withdrawn node is rebooted, it would then discover that its own
journal was incomplete, and replay it. However, replaying it at this
point is almost guaranteed to introduce corruption because the other
nodes are likely to have used affected resource groups that appeared
in the journal since the time of the withdraw. Replaying the journal
later will overwrite any changes made, and not through any fault of
dlm, which was instructed during the withdraw to release those
resources.
This patch makes file system withdraws seen by the entire cluster.
Withdrawing nodes dequeue their journal glock to allow recovery.
The remaining nodes check all the journals to see if they are
clean or in need of replay. They try to replay dirty journals, but
only the journals of withdrawn nodes will be "not busy" and
therefore available for replay.
Until the journal replay is complete, no i/o related glocks may be
given out, to ensure that the replay does not cause the
aforementioned corruption: We cannot allow any journal replay to
overwrite blocks associated with a glock once it is held.
The "live" glock which is now used to signal when a withdraw
occurs. When a withdraw occurs, the node signals its withdraw by
dequeueing the "live" glock and trying to enqueue it in EX mode,
thus forcing the other nodes to all see a demote request, by way
of a "1CB" (one callback) try lock. The "live" glock is not
granted in EX; the callback is only just used to indicate a
withdraw has occurred.
Note that all nodes in the cluster must wait for the recovering
node to finish replaying the withdrawing node's journal before
continuing. To this end, it checks that the journals are clean
multiple times in a retry loop.
Also note that the withdraw function may be called from a wide
variety of situations, and therefore, we need to take extra
precautions to make sure pointers are valid before using them in
many circumstances.
We also need to take care when glocks decide to withdraw, since
the withdraw code now uses glocks.
Also, before this patch, if a process encountered an error and
decided to withdraw, if another process was already withdrawing,
the second withdraw would be silently ignored, which set it free
to unlock its glocks. That's correct behavior if the original
withdrawer encounters further errors down the road. But if
secondary waiters don't wait for the journal replay, unlocking
glocks will allow other nodes to use them, despite the fact that
the journal containing those blocks is being replayed. The
replay needs to finish before our glocks are released to other
nodes. IOW, secondary withdraws need to wait for the first
withdraw to finish.
For example, if an rgrp glock is unlocked by a process that didn't
wait for the first withdraw, a journal replay could introduce file
system corruption by replaying a rgrp block that has already been
granted to a different cluster node.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
2020-01-28 22:23:45 +03:00
|
|
|
if (unlikely(gfs2_withdrawn(sdp)) &&
|
2020-04-22 22:14:30 +03:00
|
|
|
(!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) {
|
2012-06-18 11:31:31 +04:00
|
|
|
*bhp = NULL;
|
2009-04-07 17:13:01 +04:00
|
|
|
return -EIO;
|
2012-06-18 11:31:31 +04:00
|
|
|
}
|
2009-04-07 17:13:01 +04:00
|
|
|
|
|
|
|
*bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
|
|
|
|
|
|
|
|
lock_buffer(bh);
|
|
|
|
if (buffer_uptodate(bh)) {
|
|
|
|
unlock_buffer(bh);
|
2015-11-13 16:44:57 +03:00
|
|
|
flags &= ~DIO_WAIT;
|
|
|
|
} else {
|
|
|
|
bh->b_end_io = end_buffer_read_sync;
|
|
|
|
get_bh(bh);
|
|
|
|
bhs[num++] = bh;
|
2009-04-07 17:13:01 +04:00
|
|
|
}
|
2015-11-13 16:44:57 +03:00
|
|
|
|
|
|
|
if (rahead) {
|
|
|
|
bh = gfs2_getbuf(gl, blkno + 1, CREATE);
|
|
|
|
|
|
|
|
lock_buffer(bh);
|
|
|
|
if (buffer_uptodate(bh)) {
|
|
|
|
unlock_buffer(bh);
|
|
|
|
brelse(bh);
|
|
|
|
} else {
|
|
|
|
bh->b_end_io = end_buffer_read_sync;
|
|
|
|
bhs[num++] = bh;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-11-01 16:40:10 +03:00
|
|
|
gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num);
|
2009-04-07 17:13:01 +04:00
|
|
|
if (!(flags & DIO_WAIT))
|
|
|
|
return 0;
|
|
|
|
|
2015-11-13 16:44:57 +03:00
|
|
|
bh = *bhp;
|
2009-04-07 17:13:01 +04:00
|
|
|
wait_on_buffer(bh);
|
|
|
|
if (unlikely(!buffer_uptodate(bh))) {
|
|
|
|
struct gfs2_trans *tr = current->journal_info;
|
2017-01-25 20:50:47 +03:00
|
|
|
if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
|
2018-06-07 13:56:46 +03:00
|
|
|
gfs2_io_error_bh_wd(sdp, bh);
|
2009-04-07 17:13:01 +04:00
|
|
|
brelse(bh);
|
2012-06-18 11:31:31 +04:00
|
|
|
*bhp = NULL;
|
2009-04-07 17:13:01 +04:00
|
|
|
return -EIO;
|
2006-09-22 01:05:23 +04:00
|
|
|
}
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
return 0;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2006-09-22 01:05:23 +04:00
|
|
|
* gfs2_meta_wait - Reread a block from disk
|
2006-01-16 19:50:04 +03:00
|
|
|
* @sdp: the filesystem
|
2006-09-22 01:05:23 +04:00
|
|
|
* @bh: The block to wait for
|
2006-01-16 19:50:04 +03:00
|
|
|
*
|
|
|
|
* Returns: errno
|
|
|
|
*/
|
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2019-11-14 17:52:15 +03:00
|
|
|
if (unlikely(gfs2_withdrawn(sdp)))
|
2006-01-16 19:50:04 +03:00
|
|
|
return -EIO;
|
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
wait_on_buffer(bh);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
if (!buffer_uptodate(bh)) {
|
|
|
|
struct gfs2_trans *tr = current->journal_info;
|
2017-01-25 20:50:47 +03:00
|
|
|
if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
|
2018-06-07 13:56:46 +03:00
|
|
|
gfs2_io_error_bh_wd(sdp, bh);
|
2006-09-22 01:05:23 +04:00
|
|
|
return -EIO;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
2019-11-14 17:52:15 +03:00
|
|
|
if (unlikely(gfs2_withdrawn(sdp)))
|
2006-09-22 01:05:23 +04:00
|
|
|
return -EIO;
|
2006-01-16 19:50:04 +03:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-02 19:53:35 +03:00
|
|
|
void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
|
2007-09-17 13:59:52 +04:00
|
|
|
{
|
2009-12-08 15:12:13 +03:00
|
|
|
struct address_space *mapping = bh->b_page->mapping;
|
|
|
|
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
|
2007-09-17 13:59:52 +04:00
|
|
|
struct gfs2_bufdata *bd = bh->b_private;
|
2016-05-02 19:53:35 +03:00
|
|
|
struct gfs2_trans *tr = current->journal_info;
|
2013-12-13 17:31:06 +04:00
|
|
|
int was_pinned = 0;
|
2009-12-08 15:12:13 +03:00
|
|
|
|
2007-09-17 13:59:52 +04:00
|
|
|
if (test_clear_buffer_pinned(bh)) {
|
2011-04-14 17:09:52 +04:00
|
|
|
trace_gfs2_pin(bd, 0);
|
GFS2: Various gfs2_logd improvements
This patch contains various tweaks to how log flushes and active item writeback
work. gfs2_logd is now managed by a waitqueue, and gfs2_log_reseve now waits
for gfs2_logd to do the log flushing. Multiple functions were rewritten to
remove the need to call gfs2_log_lock(). Instead of using one test to see if
gfs2_logd had work to do, there are now seperate tests to check if there
are two many buffers in the incore log or if there are two many items on the
active items list.
This patch is a port of a patch Steve Whitehouse wrote about a year ago, with
some minor changes. Since gfs2_ail1_start always submits all the active items,
it no longer needs to keep track of the first ai submitted, so this has been
removed. In gfs2_log_reserve(), the order of the calls to
prepare_to_wait_exclusive() and wake_up() when firing off the logd thread has
been switched. If it called wake_up first there was a small window for a race,
where logd could run and return before gfs2_log_reserve was ready to get woken
up. If gfs2_logd ran, but did not free up enough blocks, gfs2_log_reserve()
would be left waiting for gfs2_logd to eventualy run because it timed out.
Finally, gt_logd_secs, which controls how long to wait before gfs2_logd times
out, and flushes the log, can now be set on mount with ar_commit.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2010-05-04 23:29:16 +04:00
|
|
|
atomic_dec(&sdp->sd_log_pinned);
|
2012-05-01 20:00:34 +04:00
|
|
|
list_del_init(&bd->bd_list);
|
2016-05-02 19:53:35 +03:00
|
|
|
if (meta == REMOVE_META)
|
2007-09-17 13:59:52 +04:00
|
|
|
tr->tr_num_buf_rm++;
|
2014-02-22 01:55:33 +04:00
|
|
|
else
|
2007-09-17 13:59:52 +04:00
|
|
|
tr->tr_num_databuf_rm++;
|
2017-01-25 20:50:47 +03:00
|
|
|
set_bit(TR_TOUCHED, &tr->tr_flags);
|
2013-12-13 17:31:06 +04:00
|
|
|
was_pinned = 1;
|
2007-09-17 13:59:52 +04:00
|
|
|
brelse(bh);
|
|
|
|
}
|
|
|
|
if (bd) {
|
2011-03-14 15:40:29 +03:00
|
|
|
spin_lock(&sdp->sd_ail_lock);
|
GFS2: replace gfs2_ail structure with gfs2_trans
In order to allow transactions and log flushes to happen at the same
time, gfs2 needs to move the transaction accounting and active items
list code into the gfs2_trans structure. As a first step toward this,
this patch removes the gfs2_ail structure, and handles the active items
list in the gfs_trans structure. This keeps gfs2 from allocating an ail
structure on log flushes, and gives us a struture that can later be used
to store the transaction accounting outside of the gfs2 superblock
structure.
With this patch, at the end of a transaction, gfs2 will add the
gfs2_trans structure to the superblock if there is not one already.
This structure now has the active items fields that were previously in
gfs2_ail. This is not necessary in the case where the transaction was
simply used to add revokes, since these are never written outside of the
journal, and thus, don't need an active items list.
Also, in order to make sure that the transaction structure is not
removed while it's still in use by gfs2_trans_end, unlocking the
sd_log_flush_lock has to happen slightly later in ending the
transaction.
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2013-04-06 05:31:46 +04:00
|
|
|
if (bd->bd_tr) {
|
2007-09-17 13:59:52 +04:00
|
|
|
gfs2_trans_add_revoke(sdp, bd);
|
2013-12-13 17:31:06 +04:00
|
|
|
} else if (was_pinned) {
|
|
|
|
bh->b_private = NULL;
|
|
|
|
kmem_cache_free(gfs2_bufdata_cachep, bd);
|
2007-09-17 13:59:52 +04:00
|
|
|
}
|
2011-03-14 15:40:29 +03:00
|
|
|
spin_unlock(&sdp->sd_ail_lock);
|
2007-09-17 13:59:52 +04:00
|
|
|
}
|
|
|
|
clear_buffer_dirty(bh);
|
|
|
|
clear_buffer_uptodate(bh);
|
|
|
|
}
|
|
|
|
|
2006-01-16 19:50:04 +03:00
|
|
|
/**
|
|
|
|
* gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
|
|
|
|
* @ip: the inode who owns the buffers
|
|
|
|
* @bstart: the first buffer in the run
|
|
|
|
* @blen: the number of buffers in the run
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2006-09-04 20:49:07 +04:00
|
|
|
void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2006-06-14 23:32:57 +04:00
|
|
|
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
|
2006-01-16 19:50:04 +03:00
|
|
|
struct buffer_head *bh;
|
|
|
|
|
|
|
|
while (blen) {
|
2008-05-21 20:03:22 +04:00
|
|
|
bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
|
2006-01-16 19:50:04 +03:00
|
|
|
if (bh) {
|
2007-09-03 14:01:33 +04:00
|
|
|
lock_buffer(bh);
|
|
|
|
gfs2_log_lock(sdp);
|
2016-05-02 19:53:35 +03:00
|
|
|
gfs2_remove_from_journal(bh, REMOVE_META);
|
2007-09-03 14:01:33 +04:00
|
|
|
gfs2_log_unlock(sdp);
|
2006-01-16 19:50:04 +03:00
|
|
|
unlock_buffer(bh);
|
|
|
|
brelse(bh);
|
|
|
|
}
|
|
|
|
|
|
|
|
bstart++;
|
|
|
|
blen--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* gfs2_meta_indirect_buffer - Get a metadata buffer
|
|
|
|
* @ip: The GFS2 inode
|
|
|
|
* @height: The level of this buf in the metadata (indir addr) tree (if any)
|
|
|
|
* @num: The block number (device relative) of the buffer
|
|
|
|
* @bhp: the buffer is returned here
|
|
|
|
*
|
|
|
|
* Returns: errno
|
|
|
|
*/
|
|
|
|
|
2006-09-04 20:49:07 +04:00
|
|
|
int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
|
2012-05-10 16:33:55 +04:00
|
|
|
struct buffer_head **bhp)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2006-09-22 01:05:23 +04:00
|
|
|
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
|
|
|
|
struct gfs2_glock *gl = ip->i_gl;
|
2007-10-15 19:29:05 +04:00
|
|
|
struct buffer_head *bh;
|
|
|
|
int ret = 0;
|
2012-05-10 16:33:55 +04:00
|
|
|
u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
|
2015-11-12 00:00:35 +03:00
|
|
|
int rahead = 0;
|
|
|
|
|
|
|
|
if (num == ip->i_no_addr)
|
|
|
|
rahead = ip->i_rahead;
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2015-11-12 00:00:35 +03:00
|
|
|
ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
|
2012-05-10 16:33:55 +04:00
|
|
|
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
|
|
|
|
brelse(bh);
|
|
|
|
ret = -EIO;
|
2017-07-03 19:37:02 +03:00
|
|
|
} else {
|
|
|
|
*bhp = bh;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
2007-10-15 19:29:05 +04:00
|
|
|
return ret;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* gfs2_meta_ra - start readahead on an extent of a file
|
|
|
|
* @gl: the glock the blocks belong to
|
|
|
|
* @dblock: the starting disk block
|
|
|
|
* @extlen: the number of blocks in the extent
|
|
|
|
*
|
2006-09-22 01:05:23 +04:00
|
|
|
* returns: the first buffer in the extent
|
2006-01-16 19:50:04 +03:00
|
|
|
*/
|
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
|
2006-01-16 19:50:04 +03:00
|
|
|
{
|
2015-03-16 19:52:05 +03:00
|
|
|
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
|
2006-01-16 19:50:04 +03:00
|
|
|
struct buffer_head *first_bh, *bh;
|
2006-09-04 20:49:07 +04:00
|
|
|
u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
|
2006-02-27 20:00:42 +03:00
|
|
|
sdp->sd_sb.sb_bsize_shift;
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
BUG_ON(!extlen);
|
|
|
|
|
|
|
|
if (max_ra < 1)
|
|
|
|
max_ra = 1;
|
2006-01-16 19:50:04 +03:00
|
|
|
if (extlen > max_ra)
|
|
|
|
extlen = max_ra;
|
|
|
|
|
2008-05-21 20:03:22 +04:00
|
|
|
first_bh = gfs2_getbuf(gl, dblock, CREATE);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
|
|
|
if (buffer_uptodate(first_bh))
|
|
|
|
goto out;
|
2006-09-22 01:05:23 +04:00
|
|
|
if (!buffer_locked(first_bh))
|
gfs2: add flag REQ_PRIO for metadata I/O
When gfs2 does metadata I/O, only REQ_META is used as a metadata hint of
the bio. But flag REQ_META is just a hint for block trace, not for block
layer code to handle a bio as metadata request.
For some of metadata I/Os of gfs2, A REQ_PRIO flag on the metadata bio
would be very informative to block layer code. For example, if bcache is
used as a I/O cache for gfs2, it will be possible for bcache code to get
the hint and cache the pre-fetched metadata blocks on cache device. This
behavior may be helpful to improve metadata I/O performance if the
following requests hit the cache.
Here are the locations in gfs2 code where a REQ_PRIO flag should be added,
- All places where REQ_READAHEAD is used, gfs2 code uses this flag for
metadata read ahead.
- In gfs2_meta_rq() where the first metadata block is read in.
- In gfs2_write_buf_to_page(), read in quota metadata blocks to have them
up to date.
These metadata blocks are probably to be accessed again in future, adding
a REQ_PRIO flag may have bcache to keep such metadata in fast cache
device. For system without a cache layer, REQ_PRIO can still provide hint
to block layer to handle metadata requests more properly.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
2017-07-21 15:48:22 +03:00
|
|
|
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
|
|
|
dblock++;
|
|
|
|
extlen--;
|
|
|
|
|
|
|
|
while (extlen) {
|
2008-05-21 20:03:22 +04:00
|
|
|
bh = gfs2_getbuf(gl, dblock, CREATE);
|
2006-01-16 19:50:04 +03:00
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
if (!buffer_uptodate(bh) && !buffer_locked(bh))
|
gfs2: add flag REQ_PRIO for metadata I/O
When gfs2 does metadata I/O, only REQ_META is used as a metadata hint of
the bio. But flag REQ_META is just a hint for block trace, not for block
layer code to handle a bio as metadata request.
For some of metadata I/Os of gfs2, A REQ_PRIO flag on the metadata bio
would be very informative to block layer code. For example, if bcache is
used as a I/O cache for gfs2, it will be possible for bcache code to get
the hint and cache the pre-fetched metadata blocks on cache device. This
behavior may be helpful to improve metadata I/O performance if the
following requests hit the cache.
Here are the locations in gfs2 code where a REQ_PRIO flag should be added,
- All places where REQ_READAHEAD is used, gfs2 code uses this flag for
metadata read ahead.
- In gfs2_meta_rq() where the first metadata block is read in.
- In gfs2_write_buf_to_page(), read in quota metadata blocks to have them
up to date.
These metadata blocks are probably to be accessed again in future, adding
a REQ_PRIO flag may have bcache to keep such metadata in fast cache
device. For system without a cache layer, REQ_PRIO can still provide hint
to block layer to handle metadata requests more properly.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
2017-07-21 15:48:22 +03:00
|
|
|
ll_rw_block(REQ_OP_READ,
|
|
|
|
REQ_RAHEAD | REQ_META | REQ_PRIO,
|
|
|
|
1, &bh);
|
2006-09-22 01:05:23 +04:00
|
|
|
brelse(bh);
|
2006-01-16 19:50:04 +03:00
|
|
|
dblock++;
|
|
|
|
extlen--;
|
2006-09-22 01:05:23 +04:00
|
|
|
if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
|
|
|
|
goto out;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
|
|
|
|
2006-09-22 01:05:23 +04:00
|
|
|
wait_on_buffer(first_bh);
|
2006-09-04 20:04:26 +04:00
|
|
|
out:
|
2006-09-22 01:05:23 +04:00
|
|
|
return first_bh;
|
2006-01-16 19:50:04 +03:00
|
|
|
}
|
|
|
|
|