2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* linux/include/linux/ext3_fs_i.h
|
|
|
|
*
|
|
|
|
* Copyright (C) 1992, 1993, 1994, 1995
|
|
|
|
* Remy Card (card@masi.ibp.fr)
|
|
|
|
* Laboratoire MASI - Institut Blaise Pascal
|
|
|
|
* Universite Pierre et Marie Curie (Paris VI)
|
|
|
|
*
|
|
|
|
* from
|
|
|
|
*
|
|
|
|
* linux/include/linux/minix_fs_i.h
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_EXT3_FS_I
|
|
|
|
#define _LINUX_EXT3_FS_I
|
|
|
|
|
|
|
|
#include <linux/rwsem.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/seqlock.h>
|
2006-03-23 14:00:42 +03:00
|
|
|
#include <linux/mutex.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
|
[PATCH] ext3_fsblk_t: filesystem, group blocks and bug fixes
Some of the in-kernel ext3 block variables are treated as signed 4-byte
ints, which limits an ext3 filesystem to 8TB (with a 4k block size). While
trying to fix them, the ext3 code turned out to be quite confusing: some
blocks are filesystem-wide blocks, while others are group-relative offsets
that need to be signed values (as -1 has a special meaning). So it seems saner
to define two types of physical blocks: filesystem-wide blocks and
group-relative blocks. The following patches clarify these two types of
blocks in the ext3 code, and fix the type bugs which limit the current 32-bit
ext3 filesystem to 8TB.
With this series of patches and the percpu counter data type changes in the mm
tree, we are able to extend the ext3 filesystem limit to 16TB.
This work is also a prerequisite for the recent >32 bit ext3 work, and makes
it a lot easier for the kernel to address 48-bit ext3 blocks: simply redefine
ext3_fsblk_t from unsigned long to sector_t and redefine the format string for
ext3 filesystem blocks correspondingly.
Two RFC with a series patches have been posted to ext2-devel list and have
been reviewed and discussed:
http://marc.theaimsgroup.com/?l=ext2-devel&m=114722190816690&w=2
http://marc.theaimsgroup.com/?l=ext2-devel&m=114784919525942&w=2
Patches are tested on both 32 bit machine and 64 bit machine, <8TB ext3 and
>8TB ext3 filesystem(with the latest to be released e2fsprogs-1.39). Tests
includes overnight fsx, tiobench, dbench and fsstress.
This patch:
Defines ext3_fsblk_t and ext3_grpblk_t, and the printk format string for
filesystem wide blocks.
This patch classifies the block-group-relative blocks and the ext3_fsblk_t
blocks wherever both occur in the same function, which used to be confusing.
It also includes kernel bug fixes for filesystem-wide in-kernel block
variables. Some filesystem-wide blocks are currently treated as int/unsigned
int in the kernel, especially in the ext3 block allocation and reservation
code. This patch fixes those bugs by converting those variables to the
ext3_fsblk_t (unsigned long) type.
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-25 16:48:06 +04:00
|
|
|
/*
 * Data type for the offset of a block within its block group.
 * Signed, unlike ext3_fsblk_t.
 */
typedef int ext3_grpblk_t;

/* Data type for a filesystem-wide block number. */
typedef unsigned long ext3_fsblk_t;

/* Format conversion matching ext3_fsblk_t (unsigned long). */
#define E3FSBLK "%lu"
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
struct ext3_reserve_window {
|
2006-06-25 16:48:07 +04:00
|
|
|
ext3_fsblk_t _rsv_start; /* First byte reserved */
|
|
|
|
ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
struct ext3_reserve_window_node {
|
2006-09-27 12:49:35 +04:00
|
|
|
struct rb_node rsv_node;
|
2005-04-17 02:20:36 +04:00
|
|
|
__u32 rsv_goal_size;
|
|
|
|
__u32 rsv_alloc_hit;
|
|
|
|
struct ext3_reserve_window rsv_window;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ext3_block_alloc_info {
|
|
|
|
/* information about reservation window */
|
|
|
|
struct ext3_reserve_window_node rsv_window_node;
|
|
|
|
/*
|
|
|
|
* was i_next_alloc_block in ext3_inode_info
|
|
|
|
* is the logical (file-relative) number of the
|
|
|
|
* most-recently-allocated block in this file.
|
|
|
|
* We use this for detecting linearly ascending allocation requests.
|
|
|
|
*/
|
|
|
|
__u32 last_alloc_logical_block;
|
|
|
|
/*
|
|
|
|
* Was i_next_alloc_goal in ext3_inode_info
|
|
|
|
* is the *physical* companion to i_next_alloc_block.
|
2007-05-09 10:57:56 +04:00
|
|
|
* it the physical block number of the block which was most-recentl
|
2005-04-17 02:20:36 +04:00
|
|
|
* allocated to this file. This give us the goal (target) for the next
|
|
|
|
* allocation when we detect linearly ascending requests.
|
|
|
|
*/
|
2006-06-25 16:48:07 +04:00
|
|
|
ext3_fsblk_t last_alloc_physical_block;
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Shorthand for the end points of an embedded reservation window: on any
 * object with an rsv_window member (such as struct
 * ext3_reserve_window_node), obj.rsv_start and obj.rsv_end expand to
 * obj.rsv_window._rsv_start and obj.rsv_window._rsv_end.
 */
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
|
|
|
|
|
|
|
|
/*
|
|
|
|
* third extended file system inode data in memory
|
|
|
|
*/
|
|
|
|
struct ext3_inode_info {
|
|
|
|
__le32 i_data[15]; /* unconverted */
|
|
|
|
__u32 i_flags;
|
|
|
|
#ifdef EXT3_FRAGMENTS
|
|
|
|
__u32 i_faddr;
|
|
|
|
__u8 i_frag_no;
|
|
|
|
__u8 i_frag_size;
|
|
|
|
#endif
|
2006-06-25 16:48:07 +04:00
|
|
|
ext3_fsblk_t i_file_acl;
|
2005-04-17 02:20:36 +04:00
|
|
|
__u32 i_dir_acl;
|
|
|
|
__u32 i_dtime;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* i_block_group is the number of the block group which contains
|
|
|
|
* this file's inode. Constant across the lifetime of the inode,
|
|
|
|
* it is ued for making block allocation decisions - we try to
|
|
|
|
* place a file's data blocks near its inode block, and new inodes
|
|
|
|
* near to their parent directory's inode.
|
|
|
|
*/
|
|
|
|
__u32 i_block_group;
|
2010-03-30 01:30:19 +04:00
|
|
|
unsigned long i_state_flags; /* Dynamic state flags for ext3 */
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* block reservation info */
|
|
|
|
struct ext3_block_alloc_info *i_block_alloc_info;
|
|
|
|
|
|
|
|
__u32 i_dir_start_lookup;
|
|
|
|
#ifdef CONFIG_EXT3_FS_XATTR
|
|
|
|
/*
|
|
|
|
* Extended attributes can be read independently of the main file
|
2006-01-10 02:59:24 +03:00
|
|
|
* data. Taking i_mutex even when reading would cause contention
|
2005-04-17 02:20:36 +04:00
|
|
|
* between readers of EAs and writers of regular file data, so
|
|
|
|
* instead we synchronize on xattr_sem when reading or changing
|
|
|
|
* EAs.
|
|
|
|
*/
|
|
|
|
struct rw_semaphore xattr_sem;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct list_head i_orphan; /* unlinked but open inodes */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* i_disksize keeps track of what the inode size is ON DISK, not
|
|
|
|
* in memory. During truncate, i_size is set to the new size by
|
|
|
|
* the VFS prior to calling ext3_truncate(), but the filesystem won't
|
|
|
|
* set i_disksize to 0 until the truncate is actually under way.
|
|
|
|
*
|
|
|
|
* The intent is that i_disksize always represents the blocks which
|
|
|
|
* are used by this file. This allows recovery to restart truncate
|
|
|
|
* on orphans if we crash during truncate. We actually write i_disksize
|
|
|
|
* into the on-disk inode when writing inodes out, instead of i_size.
|
|
|
|
*
|
|
|
|
* The only time when i_disksize and i_size may be different is when
|
|
|
|
* a truncate is in progress. The only things which change i_disksize
|
|
|
|
* are ext3_get_block (growth) and ext3_truncate (shrinkth).
|
|
|
|
*/
|
|
|
|
loff_t i_disksize;
|
|
|
|
|
|
|
|
/* on-disk additional length */
|
|
|
|
__u16 i_extra_isize;
|
|
|
|
|
|
|
|
/*
|
2006-03-23 14:00:42 +03:00
|
|
|
* truncate_mutex is for serialising ext3_truncate() against
|
2005-04-17 02:20:36 +04:00
|
|
|
* ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
|
|
|
|
* data tree are chopped off during truncate. We can't do that in
|
|
|
|
* ext3 because whenever we perform intermediate commits during
|
|
|
|
* truncate, the inode and all the metadata blocks *must* be in a
|
|
|
|
* consistent state which allows truncation of the orphans to restart
|
|
|
|
* during recovery. Hence we must fix the get_block-vs-truncate race
|
2006-03-23 14:00:42 +03:00
|
|
|
* by other means, so we have truncate_mutex.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2006-03-23 14:00:42 +03:00
|
|
|
struct mutex truncate_mutex;
|
2009-10-16 21:26:15 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Transactions that contain inode's metadata needed to complete
|
|
|
|
* fsync and fdatasync, respectively.
|
|
|
|
*/
|
|
|
|
atomic_t i_sync_tid;
|
|
|
|
atomic_t i_datasync_tid;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
struct inode vfs_inode;
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* _LINUX_EXT3_FS_I */
|