2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* fs/dcache.c
|
|
|
|
*
|
|
|
|
* Complete reimplementation
|
|
|
|
* (C) 1997 Thomas Schoebel-Theuer,
|
|
|
|
* with heavy changes by Linus Torvalds
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Notes on the allocation strategy:
|
|
|
|
*
|
|
|
|
* The dcache is a master of the icache - whenever a dcache entry
|
|
|
|
* exists, the inode will always exist. "iput()" is done either when
|
|
|
|
* the dcache entry is deleted or garbage collected.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/fs.h>
|
2005-08-08 21:52:16 +04:00
|
|
|
#include <linux/fsnotify.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/hash.h>
|
|
|
|
#include <linux/cache.h>
|
2011-11-17 08:57:37 +04:00
|
|
|
#include <linux/export.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <linux/security.h>
|
|
|
|
#include <linux/seqlock.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/bootmem.h>
|
2009-03-30 03:50:06 +04:00
|
|
|
#include <linux/fs_struct.h>
|
2009-07-16 17:44:29 +04:00
|
|
|
#include <linux/hardirq.h>
|
2011-01-07 09:50:05 +03:00
|
|
|
#include <linux/bit_spinlock.h>
|
|
|
|
#include <linux/rculist_bl.h>
|
2011-05-20 23:50:29 +04:00
|
|
|
#include <linux/prefetch.h>
|
2011-08-16 18:31:30 +04:00
|
|
|
#include <linux/ratelimit.h>
|
2013-08-28 04:18:00 +04:00
|
|
|
#include <linux/list_lru.h>
|
2015-02-14 01:39:45 +03:00
|
|
|
#include <linux/kasan.h>
|
|
|
|
|
2006-09-30 22:52:18 +04:00
|
|
|
#include "internal.h"
|
2011-11-24 04:26:23 +04:00
|
|
|
#include "mount.h"
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-01-07 09:49:30 +03:00
|
|
|
/*
|
|
|
|
* Usage:
|
2011-01-07 09:50:06 +03:00
|
|
|
* dcache->d_inode->i_lock protects:
|
2014-10-27 02:19:16 +03:00
|
|
|
* - i_dentry, d_u.d_alias, d_inode of aliases
|
2011-01-07 09:50:05 +03:00
|
|
|
* dcache_hash_bucket lock protects:
|
|
|
|
* - the dcache hash table
|
|
|
|
* s_anon bl list spinlock protects:
|
|
|
|
* - the s_anon list (see __d_drop)
|
2013-08-28 04:17:55 +04:00
|
|
|
* dentry->d_sb->s_dentry_lru_lock protects:
|
2011-01-07 09:49:31 +03:00
|
|
|
* - the dcache lru lists and counters
|
|
|
|
* d_lock protects:
|
|
|
|
* - d_flags
|
|
|
|
* - d_name
|
|
|
|
* - d_lru
|
2011-01-07 09:49:32 +03:00
|
|
|
* - d_count
|
2011-01-07 09:49:33 +03:00
|
|
|
* - d_unhashed()
|
2011-01-07 09:49:34 +03:00
|
|
|
* - d_parent and d_subdirs
|
|
|
|
* - childrens' d_child and d_parent
|
2014-10-27 02:19:16 +03:00
|
|
|
* - d_u.d_alias, d_inode
|
2011-01-07 09:49:30 +03:00
|
|
|
*
|
|
|
|
* Ordering:
|
2011-01-07 09:50:06 +03:00
|
|
|
* dentry->d_inode->i_lock
|
2011-01-07 09:49:38 +03:00
|
|
|
* dentry->d_lock
|
2013-08-28 04:17:55 +04:00
|
|
|
* dentry->d_sb->s_dentry_lru_lock
|
2011-01-07 09:50:05 +03:00
|
|
|
* dcache_hash_bucket lock
|
|
|
|
* s_anon lock
|
2011-01-07 09:49:30 +03:00
|
|
|
*
|
2011-01-07 09:49:33 +03:00
|
|
|
* If there is an ancestor relationship:
|
|
|
|
* dentry->d_parent->...->d_parent->d_lock
|
|
|
|
* ...
|
|
|
|
* dentry->d_parent->d_lock
|
|
|
|
* dentry->d_lock
|
|
|
|
*
|
|
|
|
* If no ancestor relationship:
|
2011-01-07 09:49:30 +03:00
|
|
|
* if (dentry1 < dentry2)
|
|
|
|
* dentry1->d_lock
|
|
|
|
* dentry2->d_lock
|
|
|
|
*/
|
2006-03-26 13:37:24 +04:00
|
|
|
int sysctl_vfs_cache_pressure __read_mostly = 100;
|
2005-04-17 02:20:36 +04:00
|
|
|
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
|
|
|
|
|
[PATCH] audit: watching subtrees
New kind of audit rule predicates: "object is visible in given subtree".
The part that can be sanely implemented, that is. Limitations:
* if you have hardlink from outside of tree, you'd better watch
it too (or just watch the object itself, obviously)
* if you mount something under a watched tree, tell audit
that new chunk should be added to watched subtrees
* if you umount something in a watched tree and it's still mounted
elsewhere, you will get matches on events happening there. New command
tells audit to recalculate the trees, trimming such sources of false
positives.
Note that it's _not_ about path - if something mounted in several places
(multiple mount, bindings, different namespaces, etc.), the match does
_not_ depend on which one we are using for access.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2007-07-22 16:04:18 +04:00
|
|
|
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-01-07 09:49:37 +03:00
|
|
|
EXPORT_SYMBOL(rename_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2006-12-07 07:33:20 +03:00
|
|
|
static struct kmem_cache *dentry_cache __read_mostly;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the single most critical data structure when it comes
|
|
|
|
* to the dcache: the hashtable for lookups. Somebody should try
|
|
|
|
* to make this good - I've just made it work.
|
|
|
|
*
|
|
|
|
* This hash-function tries to avoid losing too many bits of hash
|
|
|
|
* information, yet avoid using a prime hash-size or similar.
|
|
|
|
*/
|
|
|
|
|
2006-03-26 13:37:24 +04:00
|
|
|
static unsigned int d_hash_mask __read_mostly;
|
|
|
|
static unsigned int d_hash_shift __read_mostly;
|
2011-01-07 09:50:05 +03:00
|
|
|
|
2011-04-24 09:32:03 +04:00
|
|
|
static struct hlist_bl_head *dentry_hashtable __read_mostly;
|
2011-01-07 09:50:05 +03:00
|
|
|
|
2012-03-03 02:23:30 +04:00
|
|
|
static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
|
vfs: get rid of batshit-insane pointless dentry hash calculations
For some odd historical reason, the final mixing round for the dentry
cache hash table lookup had an insane "xor with big constant" logic. In
two places.
The big constant that is being xor'ed is GOLDEN_RATIO_PRIME, which is a
fairly random-looking number that is designed to be *multiplied* with so
that the bits get spread out over a whole long-word.
But xor'ing with it is insane. It doesn't really even change the hash -
it really only shifts the hash around in the hash table. To make
matters worse, the insane big constant is different on 32-bit and 64-bit
builds, even though the name hash bits we use are always 32-bit (and the
bits from the pointer we mix in effectively are too).
It's all total voodoo programming, in other words.
Now, some testing and analysis of the hash chains shows that the rest of
the hash function seems to be fairly good. It does pick the right bits
of the parent dentry pointer, for example, and while it's generally a
bad idea to use an xor to mix down the upper bits (because if there is a
repeating pattern, the xor can cause "destructive interference"), it
seems to not have been a disaster.
For example, replacing the hash with the normal "hash_long()" code (that
uses the GOLDEN_RATIO_PRIME constant correctly, btw) actually just makes
the hash worse. The hand-picked hash knew which bits of the pointer had
the highest entropy, and hash_long() ends up mixing bits less optimally
at least in some trivial tests.
So the hash function overall seems fine, it just has that really odd
"shift result around by a constant xor".
So get rid of the silly xor, and replace the down-mixing of the bits
with an add instead of an xor that tends to not have the same kind of
destructive interference issues. Some stats on the resulting hash
chains shows that they look statistically identical before and after,
but the code is simpler and no longer makes you go "WTF?".
Also, the incoming hash really is just "unsigned int", not a long, and
there's no real point to worry about the high 26 bits of the dentry
pointer for the 64-bit case, because they are all going to be identical
anyway.
So also change the hashing to be done in the more natural 'unsigned int'
that is the real size of the actual hashed data anyway.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-20 03:19:53 +04:00
|
|
|
unsigned int hash)
|
2011-01-07 09:50:05 +03:00
|
|
|
{
|
vfs: get rid of batshit-insane pointless dentry hash calculations
For some odd historical reason, the final mixing round for the dentry
cache hash table lookup had an insane "xor with big constant" logic. In
two places.
The big constant that is being xor'ed is GOLDEN_RATIO_PRIME, which is a
fairly random-looking number that is designed to be *multiplied* with so
that the bits get spread out over a whole long-word.
But xor'ing with it is insane. It doesn't really even change the hash -
it really only shifts the hash around in the hash table. To make
matters worse, the insane big constant is different on 32-bit and 64-bit
builds, even though the name hash bits we use are always 32-bit (and the
bits from the pointer we mix in effectively are too).
It's all total voodoo programming, in other words.
Now, some testing and analysis of the hash chains shows that the rest of
the hash function seems to be fairly good. It does pick the right bits
of the parent dentry pointer, for example, and while it's generally a
bad idea to use an xor to mix down the upper bits (because if there is a
repeating pattern, the xor can cause "destructive interference"), it
seems to not have been a disaster.
For example, replacing the hash with the normal "hash_long()" code (that
uses the GOLDEN_RATIO_PRIME constant correctly, btw) actually just makes
the hash worse. The hand-picked hash knew which bits of the pointer had
the highest entropy, and hash_long() ends up mixing bits less optimally
at least in some trivial tests.
So the hash function overall seems fine, it just has that really odd
"shift result around by a constant xor".
So get rid of the silly xor, and replace the down-mixing of the bits
with an add instead of an xor that tends to not have the same kind of
destructive interference issues. Some stats on the resulting hash
chains shows that they look statistically identical before and after,
but the code is simpler and no longer makes you go "WTF?".
Also, the incoming hash really is just "unsigned int", not a long, and
there's no real point to worry about the high 26 bits of the dentry
pointer for the 64-bit case, because they are all going to be identical
anyway.
So also change the hashing to be done in the more natural 'unsigned int'
that is the real size of the actual hashed data anyway.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-20 03:19:53 +04:00
|
|
|
hash += (unsigned long) parent / L1_CACHE_BYTES;
|
vfs: fix bad hashing of dentries
Josef Bacik found a performance regression between 3.2 and 3.10 and
narrowed it down to commit bfcfaa77bdf0 ("vfs: use 'unsigned long'
accesses for dcache name comparison and hashing"). He reports:
"The test case is essentially
for (i = 0; i < 1000000; i++)
mkdir("a$i");
On xfs on a fio card this goes at about 20k dir/sec with 3.2, and 12k
dir/sec with 3.10. This is because we spend waaaaay more time in
__d_lookup on 3.10 than in 3.2.
The new hashing function for strings is suboptimal for <
sizeof(unsigned long) string names (and hell even > sizeof(unsigned
long) string names that I've tested). I broke out the old hashing
function and the new one into a userspace helper to get real numbers
and this is what I'm getting:
Old hash table had 1000000 entries, 0 dupes, 0 max dupes
New hash table had 12628 entries, 987372 dupes, 900 max dupes
We had 11400 buckets with a p50 of 30 dupes, p90 of 240 dupes, p99 of 567 dupes for the new hash
My test does the hash, and then does the d_hash into a integer pointer
array the same size as the dentry hash table on my system, and then
just increments the value at the address we got to see how many
entries we overlap with.
As you can see the old hash function ended up with all 1 million
entries in their own bucket, whereas the new one they are only
distributed among ~12.5k buckets, which is why we're using so much
more CPU in __d_lookup".
The reason for this hash regression is two-fold:
- On 64-bit architectures the down-mixing of the original 64-bit
word-at-a-time hash into the final 32-bit hash value is very
simplistic and suboptimal, and just adds the two 32-bit parts
together.
In particular, because there is no bit shuffling and the mixing
boundary is also a byte boundary, similar character patterns in the
low and high word easily end up just canceling each other out.
- the old byte-at-a-time hash mixed each byte into the final hash as it
hashed the path component name, resulting in the low bits of the hash
generally being a good source of hash data. That is not true for the
word-at-a-time case, and the hash data is distributed among all the
bits.
The fix is the same in both cases: do a better job of mixing the bits up
and using as much of the hash data as possible. We already have the
"hash_32|64()" functions to do that.
Reported-by: Josef Bacik <jbacik@fb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-09-13 22:30:10 +04:00
|
|
|
return dentry_hashtable + hash_32(hash, d_hash_shift);
|
2011-01-07 09:50:05 +03:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* Statistics gathering. */
|
|
|
|
struct dentry_stat_t dentry_stat = {
|
|
|
|
.age_limit = 45,
|
|
|
|
};
|
|
|
|
|
fs: bump inode and dentry counters to long
This series reworks our current object cache shrinking infrastructure in
two main ways:
* Noticing that a lot of users copy and paste their own version of LRU
lists for objects, we put some effort in providing a generic version.
It is modeled after the filesystem users: dentries, inodes, and xfs
(for various tasks), but we expect that other users could benefit in
the near future with little or no modification. Let us know if you
have any issues.
* The underlying list_lru being proposed automatically and
transparently keeps the elements in per-node lists, and is able to
manipulate the node lists individually. Given this infrastructure, we
are able to modify the up-to-now hammer called shrink_slab to proceed
with node-reclaim instead of always searching memory from all over like
it has been doing.
Per-node lru lists are also expected to lead to less contention in the lru
locks on multi-node scans, since we are now no longer fighting for a
global lock. The locks usually disappear from the profilers with this
change.
Although we have no official benchmarks for this version - be our guest to
independently evaluate this - earlier versions of this series were
performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.
With this infrastructure in place, we can use the list_lru entry point to
provide memcg isolation and per-memcg targeted reclaim. Historically,
those two pieces of work have been posted together. This version presents
only the infrastructure work, deferring the memcg work for a later time,
so we can focus on getting this part tested. You can see more about the
history of such work at http://lwn.net/Articles/552769/
Dave Chinner (18):
dcache: convert dentry_stat.nr_unused to per-cpu counters
dentry: move to per-sb LRU locks
dcache: remove dentries from LRU before putting on dispose list
mm: new shrinker API
shrinker: convert superblock shrinkers to new API
list: add a new LRU list type
inode: convert inode lru list to generic lru list code.
dcache: convert to use new lru list infrastructure
list_lru: per-node list infrastructure
shrinker: add node awareness
fs: convert inode and dentry shrinking to be node aware
xfs: convert buftarg LRU to generic code
xfs: rework buffer dispose list tracking
xfs: convert dquot cache lru to list_lru
fs: convert fs shrinkers to new scan/count API
drivers: convert shrinkers to new count/scan API
shrinker: convert remaining shrinkers to count/scan API
shrinker: Kill old ->shrink API.
Glauber Costa (7):
fs: bump inode and dentry counters to long
super: fix calculation of shrinkable objects for small numbers
list_lru: per-node API
vmscan: per-node deferred work
i915: bail out earlier when shrinker cannot acquire mutex
hugepage: convert huge zero page shrinker to new shrinker API
list_lru: dynamically adjust node arrays
This patch:
There are situations in very large machines in which we can have a large
quantity of dirty inodes, unused dentries, etc. This is particularly true
when umounting a filesystem, where eventually since every live object will
eventually be discarded.
Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset. So we believe it is time for a change. This
patch just moves int to longs. Machines where it matters should have a
big long anyway.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:17:53 +04:00
|
|
|
static DEFINE_PER_CPU(long, nr_dentry);
|
2013-08-28 04:17:54 +04:00
|
|
|
static DEFINE_PER_CPU(long, nr_dentry_unused);
|
2010-10-10 13:36:23 +04:00
|
|
|
|
|
|
|
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
|
2013-08-28 04:17:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here we resort to our own counters instead of using generic per-cpu counters
|
|
|
|
* for consistency with what the vfs inode code does. We are expected to harvest
|
|
|
|
* better code and performance by having our own specialized counters.
|
|
|
|
*
|
|
|
|
* Please note that the loop is done over all possible CPUs, not over all online
|
|
|
|
* CPUs. The reason for this is that we don't want to play games with CPUs going
|
|
|
|
* on and off. If one of them goes off, we will just keep their counters.
|
|
|
|
*
|
|
|
|
* glommer: See cffbc8a for details, and if you ever intend to change this,
|
|
|
|
* please update all vfs counters to match.
|
|
|
|
*/
|
fs: bump inode and dentry counters to long
This series reworks our current object cache shrinking infrastructure in
two main ways:
* Noticing that a lot of users copy and paste their own version of LRU
lists for objects, we put some effort in providing a generic version.
It is modeled after the filesystem users: dentries, inodes, and xfs
(for various tasks), but we expect that other users could benefit in
the near future with little or no modification. Let us know if you
have any issues.
* The underlying list_lru being proposed automatically and
transparently keeps the elements in per-node lists, and is able to
manipulate the node lists individually. Given this infrastructure, we
are able to modify the up-to-now hammer called shrink_slab to proceed
with node-reclaim instead of always searching memory from all over like
it has been doing.
Per-node lru lists are also expected to lead to less contention in the lru
locks on multi-node scans, since we are now no longer fighting for a
global lock. The locks usually disappear from the profilers with this
change.
Although we have no official benchmarks for this version - be our guest to
independently evaluate this - earlier versions of this series were
performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.
With this infrastructure in place, we can use the list_lru entry point to
provide memcg isolation and per-memcg targeted reclaim. Historically,
those two pieces of work have been posted together. This version presents
only the infrastructure work, deferring the memcg work for a later time,
so we can focus on getting this part tested. You can see more about the
history of such work at http://lwn.net/Articles/552769/
Dave Chinner (18):
dcache: convert dentry_stat.nr_unused to per-cpu counters
dentry: move to per-sb LRU locks
dcache: remove dentries from LRU before putting on dispose list
mm: new shrinker API
shrinker: convert superblock shrinkers to new API
list: add a new LRU list type
inode: convert inode lru list to generic lru list code.
dcache: convert to use new lru list infrastructure
list_lru: per-node list infrastructure
shrinker: add node awareness
fs: convert inode and dentry shrinking to be node aware
xfs: convert buftarg LRU to generic code
xfs: rework buffer dispose list tracking
xfs: convert dquot cache lru to list_lru
fs: convert fs shrinkers to new scan/count API
drivers: convert shrinkers to new count/scan API
shrinker: convert remaining shrinkers to count/scan API
shrinker: Kill old ->shrink API.
Glauber Costa (7):
fs: bump inode and dentry counters to long
super: fix calculation of shrinkable objects for small numbers
list_lru: per-node API
vmscan: per-node deferred work
i915: bail out earlier when shrinker cannot acquire mutex
hugepage: convert huge zero page shrinker to new shrinker API
list_lru: dynamically adjust node arrays
This patch:
There are situations in very large machines in which we can have a large
quantity of dirty inodes, unused dentries, etc. This is particularly true
when umounting a filesystem, where eventually since every live object will
eventually be discarded.
Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset. So we believe it is time for a change. This
patch just moves int to longs. Machines where it matters should have a
big long anyway.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:17:53 +04:00
|
|
|
static long get_nr_dentry(void)
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:19 +03:00
|
|
|
{
|
|
|
|
int i;
|
fs: bump inode and dentry counters to long
This series reworks our current object cache shrinking infrastructure in
two main ways:
* Noticing that a lot of users copy and paste their own version of LRU
lists for objects, we put some effort in providing a generic version.
It is modeled after the filesystem users: dentries, inodes, and xfs
(for various tasks), but we expect that other users could benefit in
the near future with little or no modification. Let us know if you
have any issues.
* The underlying list_lru being proposed automatically and
transparently keeps the elements in per-node lists, and is able to
manipulate the node lists individually. Given this infrastructure, we
are able to modify the up-to-now hammer called shrink_slab to proceed
with node-reclaim instead of always searching memory from all over like
it has been doing.
Per-node lru lists are also expected to lead to less contention in the lru
locks on multi-node scans, since we are now no longer fighting for a
global lock. The locks usually disappear from the profilers with this
change.
Although we have no official benchmarks for this version - be our guest to
independently evaluate this - earlier versions of this series were
performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.
With this infrastructure in place, we can use the list_lru entry point to
provide memcg isolation and per-memcg targeted reclaim. Historically,
those two pieces of work have been posted together. This version presents
only the infrastructure work, deferring the memcg work for a later time,
so we can focus on getting this part tested. You can see more about the
history of such work at http://lwn.net/Articles/552769/
Dave Chinner (18):
dcache: convert dentry_stat.nr_unused to per-cpu counters
dentry: move to per-sb LRU locks
dcache: remove dentries from LRU before putting on dispose list
mm: new shrinker API
shrinker: convert superblock shrinkers to new API
list: add a new LRU list type
inode: convert inode lru list to generic lru list code.
dcache: convert to use new lru list infrastructure
list_lru: per-node list infrastructure
shrinker: add node awareness
fs: convert inode and dentry shrinking to be node aware
xfs: convert buftarg LRU to generic code
xfs: rework buffer dispose list tracking
xfs: convert dquot cache lru to list_lru
fs: convert fs shrinkers to new scan/count API
drivers: convert shrinkers to new count/scan API
shrinker: convert remaining shrinkers to count/scan API
shrinker: Kill old ->shrink API.
Glauber Costa (7):
fs: bump inode and dentry counters to long
super: fix calculation of shrinkable objects for small numbers
list_lru: per-node API
vmscan: per-node deferred work
i915: bail out earlier when shrinker cannot acquire mutex
hugepage: convert huge zero page shrinker to new shrinker API
list_lru: dynamically adjust node arrays
This patch:
There are situations in very large machines in which we can have a large
quantity of dirty inodes, unused dentries, etc. This is particularly true
when umounting a filesystem, where eventually since every live object will
eventually be discarded.
Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset. So we believe it is time for a change. This
patch just moves int to longs. Machines where it matters should have a
big long anyway.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:17:53 +04:00
|
|
|
long sum = 0;
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:19 +03:00
|
|
|
for_each_possible_cpu(i)
|
|
|
|
sum += per_cpu(nr_dentry, i);
|
|
|
|
return sum < 0 ? 0 : sum;
|
|
|
|
}
|
|
|
|
|
2013-08-28 04:17:54 +04:00
|
|
|
static long get_nr_dentry_unused(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
long sum = 0;
|
|
|
|
for_each_possible_cpu(i)
|
|
|
|
sum += per_cpu(nr_dentry_unused, i);
|
|
|
|
return sum < 0 ? 0 : sum;
|
|
|
|
}
|
|
|
|
|
2014-06-07 01:38:05 +04:00
|
|
|
int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
|
2010-10-10 13:36:23 +04:00
|
|
|
size_t *lenp, loff_t *ppos)
|
|
|
|
{
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:19 +03:00
|
|
|
dentry_stat.nr_dentry = get_nr_dentry();
|
2013-08-28 04:17:54 +04:00
|
|
|
dentry_stat.nr_unused = get_nr_dentry_unused();
|
fs: bump inode and dentry counters to long
This series reworks our current object cache shrinking infrastructure in
two main ways:
* Noticing that a lot of users copy and paste their own version of LRU
lists for objects, we put some effort in providing a generic version.
It is modeled after the filesystem users: dentries, inodes, and xfs
(for various tasks), but we expect that other users could benefit in
the near future with little or no modification. Let us know if you
have any issues.
* The underlying list_lru being proposed automatically and
transparently keeps the elements in per-node lists, and is able to
manipulate the node lists individually. Given this infrastructure, we
are able to modify the up-to-now hammer called shrink_slab to proceed
with node-reclaim instead of always searching memory from all over like
it has been doing.
Per-node lru lists are also expected to lead to less contention in the lru
locks on multi-node scans, since we are now no longer fighting for a
global lock. The locks usually disappear from the profilers with this
change.
Although we have no official benchmarks for this version - be our guest to
independently evaluate this - earlier versions of this series were
performance tested (details at
http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no
visible performance regressions while yielding a better qualitative
behavior in NUMA machines.
With this infrastructure in place, we can use the list_lru entry point to
provide memcg isolation and per-memcg targeted reclaim. Historically,
those two pieces of work have been posted together. This version presents
only the infrastructure work, deferring the memcg work for a later time,
so we can focus on getting this part tested. You can see more about the
history of such work at http://lwn.net/Articles/552769/
Dave Chinner (18):
dcache: convert dentry_stat.nr_unused to per-cpu counters
dentry: move to per-sb LRU locks
dcache: remove dentries from LRU before putting on dispose list
mm: new shrinker API
shrinker: convert superblock shrinkers to new API
list: add a new LRU list type
inode: convert inode lru list to generic lru list code.
dcache: convert to use new lru list infrastructure
list_lru: per-node list infrastructure
shrinker: add node awareness
fs: convert inode and dentry shrinking to be node aware
xfs: convert buftarg LRU to generic code
xfs: rework buffer dispose list tracking
xfs: convert dquot cache lru to list_lru
fs: convert fs shrinkers to new scan/count API
drivers: convert shrinkers to new count/scan API
shrinker: convert remaining shrinkers to count/scan API
shrinker: Kill old ->shrink API.
Glauber Costa (7):
fs: bump inode and dentry counters to long
super: fix calculation of shrinkable objects for small numbers
list_lru: per-node API
vmscan: per-node deferred work
i915: bail out earlier when shrinker cannot acquire mutex
hugepage: convert huge zero page shrinker to new shrinker API
list_lru: dynamically adjust node arrays
This patch:
There are situations in very large machines in which we can have a large
quantity of dirty inodes, unused dentries, etc. This is particularly true
when umounting a filesystem, where eventually since every live object will
eventually be discarded.
Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset. So we believe it is time for a change. This
patch just moves int to longs. Machines where it matters should have a
big long anyway.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:17:53 +04:00
|
|
|
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
2010-10-10 13:36:23 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-03-05 03:51:42 +04:00
|
|
|
/*
|
|
|
|
* Compare 2 name strings, return 0 if they match, otherwise non-zero.
|
|
|
|
* The strings are both count bytes long, and count is non-zero.
|
|
|
|
*/
|
2012-05-03 21:16:43 +04:00
|
|
|
#ifdef CONFIG_DCACHE_WORD_ACCESS
|
|
|
|
|
|
|
|
#include <asm/word-at-a-time.h>
|
|
|
|
/*
|
|
|
|
* NOTE! 'cs' and 'scount' come from a dentry, so it has a
|
|
|
|
* aligned allocation for this particular component. We don't
|
|
|
|
* strictly need the load_unaligned_zeropad() safety, but it
|
|
|
|
* doesn't hurt either.
|
|
|
|
*
|
|
|
|
* In contrast, 'ct' and 'tcount' can be from a pathname, and do
|
|
|
|
* need the careful unaligned handling.
|
|
|
|
*/
|
2012-05-10 23:19:19 +04:00
|
|
|
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
|
2012-03-05 03:51:42 +04:00
|
|
|
{
|
2012-03-06 23:16:17 +04:00
|
|
|
unsigned long a,b,mask;
|
|
|
|
|
|
|
|
for (;;) {
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
a = *(unsigned long *)cs;
|
2012-05-03 21:16:43 +04:00
|
|
|
b = load_unaligned_zeropad(ct);
|
2012-03-06 23:16:17 +04:00
|
|
|
if (tcount < sizeof(unsigned long))
|
|
|
|
break;
|
|
|
|
if (unlikely(a != b))
|
|
|
|
return 1;
|
|
|
|
cs += sizeof(unsigned long);
|
|
|
|
ct += sizeof(unsigned long);
|
|
|
|
tcount -= sizeof(unsigned long);
|
|
|
|
if (!tcount)
|
|
|
|
return 0;
|
|
|
|
}
|
2013-12-12 21:40:21 +04:00
|
|
|
mask = bytemask_from_count(tcount);
|
2012-03-06 23:16:17 +04:00
|
|
|
return unlikely(!!((a ^ b) & mask));
|
2012-05-03 21:16:43 +04:00
|
|
|
}
|
|
|
|
|
2012-03-06 23:16:17 +04:00
|
|
|
#else
|
2012-05-03 21:16:43 +04:00
|
|
|
|
2012-05-10 23:19:19 +04:00
|
|
|
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
|
2012-05-03 21:16:43 +04:00
|
|
|
{
|
2012-03-05 03:51:42 +04:00
|
|
|
do {
|
|
|
|
if (*cs != *ct)
|
|
|
|
return 1;
|
|
|
|
cs++;
|
|
|
|
ct++;
|
|
|
|
tcount--;
|
|
|
|
} while (tcount);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-05-03 21:16:43 +04:00
|
|
|
#endif
|
|
|
|
|
2012-05-10 23:19:19 +04:00
|
|
|
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
|
|
|
|
{
|
2012-05-22 03:14:04 +04:00
|
|
|
const unsigned char *cs;
|
2012-05-10 23:19:19 +04:00
|
|
|
/*
|
|
|
|
* Be careful about RCU walk racing with rename:
|
|
|
|
* use ACCESS_ONCE to fetch the name pointer.
|
|
|
|
*
|
|
|
|
* NOTE! Even if a rename will mean that the length
|
|
|
|
* was not loaded atomically, we don't care. The
|
|
|
|
* RCU walk will check the sequence count eventually,
|
|
|
|
* and catch it. And we won't overrun the buffer,
|
|
|
|
* because we're reading the name pointer atomically,
|
|
|
|
* and a dentry name is guaranteed to be properly
|
|
|
|
* terminated with a NUL byte.
|
|
|
|
*
|
|
|
|
* End result: even if 'len' is wrong, we'll exit
|
|
|
|
* early because the data cannot match (there can
|
|
|
|
* be no NUL in the ct/tcount data)
|
|
|
|
*/
|
2012-05-22 03:14:04 +04:00
|
|
|
cs = ACCESS_ONCE(dentry->d_name.name);
|
|
|
|
smp_read_barrier_depends();
|
|
|
|
return dentry_string_cmp(cs, ct, tcount);
|
2012-05-10 23:19:19 +04:00
|
|
|
}
|
|
|
|
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
struct external_name {
|
|
|
|
union {
|
|
|
|
atomic_t count;
|
|
|
|
struct rcu_head head;
|
|
|
|
} u;
|
|
|
|
unsigned char name[];
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct external_name *external_name(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
return container_of(dentry->d_name.name, struct external_name, name[0]);
|
|
|
|
}
|
|
|
|
|
2010-10-10 13:36:22 +04:00
|
|
|
static void __d_free(struct rcu_head *head)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-10-10 13:36:22 +04:00
|
|
|
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
|
|
|
|
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
kmem_cache_free(dentry_cache, dentry);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __d_free_external(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
|
|
|
|
kfree(external_name(dentry));
|
2005-04-17 02:20:36 +04:00
|
|
|
kmem_cache_free(dentry_cache, dentry);
|
|
|
|
}
|
|
|
|
|
2014-10-12 20:45:37 +04:00
|
|
|
static inline int dname_external(const struct dentry *dentry)
|
|
|
|
{
|
|
|
|
return dentry->d_name.name != dentry->d_iname;
|
|
|
|
}
|
|
|
|
|
2015-03-05 17:09:22 +03:00
|
|
|
/*
|
|
|
|
* Make sure other CPUs see the inode attached before the type is set.
|
|
|
|
*/
|
|
|
|
static inline void __d_set_inode_and_type(struct dentry *dentry,
|
|
|
|
struct inode *inode,
|
|
|
|
unsigned type_flags)
|
|
|
|
{
|
|
|
|
unsigned flags;
|
|
|
|
|
|
|
|
dentry->d_inode = inode;
|
|
|
|
smp_wmb();
|
|
|
|
flags = READ_ONCE(dentry->d_flags);
|
|
|
|
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
|
|
|
|
flags |= type_flags;
|
|
|
|
WRITE_ONCE(dentry->d_flags, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ideally, we want to make sure that other CPUs see the flags cleared before
|
|
|
|
* the inode is detached, but this is really a violation of RCU principles
|
|
|
|
* since the ordering suggests we should always set inode before flags.
|
|
|
|
*
|
|
|
|
* We should instead replace or discard the entire dentry - but that sucks
|
|
|
|
* performancewise on mass deletion/rename.
|
|
|
|
*/
|
|
|
|
static inline void __d_clear_type_and_inode(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
unsigned flags = READ_ONCE(dentry->d_flags);
|
|
|
|
|
|
|
|
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
|
|
|
|
WRITE_ONCE(dentry->d_flags, flags);
|
|
|
|
smp_wmb();
|
|
|
|
dentry->d_inode = NULL;
|
|
|
|
}
|
|
|
|
|
2014-04-30 07:40:14 +04:00
|
|
|
static void dentry_free(struct dentry *dentry)
|
|
|
|
{
|
2014-10-27 02:19:16 +03:00
|
|
|
WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
if (unlikely(dname_external(dentry))) {
|
|
|
|
struct external_name *p = external_name(dentry);
|
|
|
|
if (likely(atomic_dec_and_test(&p->u.count))) {
|
|
|
|
call_rcu(&dentry->d_u.d_rcu, __d_free_external);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2014-04-30 07:40:14 +04:00
|
|
|
/* if dentry was never visible to RCU, immediate free is OK */
|
|
|
|
if (!(dentry->d_flags & DCACHE_RCUACCESS))
|
|
|
|
__d_free(&dentry->d_u.d_rcu);
|
|
|
|
else
|
|
|
|
call_rcu(&dentry->d_u.d_rcu, __d_free);
|
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
/**
|
2015-06-11 15:46:46 +03:00
|
|
|
* dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
|
2011-01-23 07:16:06 +03:00
|
|
|
* @dentry: the target dentry
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
* After this call, in-progress rcu-walk path lookup will fail. This
|
|
|
|
* should be called after unhashing, and after changing d_inode (if
|
|
|
|
* the dentry has not already been unhashed).
|
|
|
|
*/
|
2015-06-11 15:46:46 +03:00
|
|
|
static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
{
|
2015-06-11 15:46:46 +03:00
|
|
|
lockdep_assert_held(&dentry->d_lock);
|
|
|
|
/* Go through am invalidation barrier */
|
|
|
|
write_seqcount_invalidate(&dentry->d_seq);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Release the dentry's inode, using the filesystem
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
* d_iput() operation if defined. Dentry has no refcount
|
|
|
|
* and is unhashed.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2006-01-15 00:20:43 +03:00
|
|
|
static void dentry_iput(struct dentry * dentry)
|
2008-06-23 20:11:52 +04:00
|
|
|
__releases(dentry->d_lock)
|
2011-01-07 09:50:06 +03:00
|
|
|
__releases(dentry->d_inode->i_lock)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
if (inode) {
|
2015-03-05 17:09:22 +03:00
|
|
|
__d_clear_type_and_inode(dentry);
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_del_init(&dentry->d_u.d_alias);
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-09-20 06:54:29 +04:00
|
|
|
if (!inode->i_nlink)
|
|
|
|
fsnotify_inoderemove(inode);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (dentry->d_op && dentry->d_op->d_iput)
|
|
|
|
dentry->d_op->d_iput(dentry, inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
|
|
|
} else {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
/*
|
|
|
|
* Release the dentry's inode, using the filesystem
|
|
|
|
* d_iput() operation if defined. dentry remains in-use.
|
|
|
|
*/
|
|
|
|
static void dentry_unlink_inode(struct dentry * dentry)
|
|
|
|
__releases(dentry->d_lock)
|
2011-01-07 09:50:06 +03:00
|
|
|
__releases(dentry->d_inode->i_lock)
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
2015-03-05 17:09:22 +03:00
|
|
|
__d_clear_type_and_inode(dentry);
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_del_init(&dentry->d_u.d_alias);
|
2015-06-11 15:46:46 +03:00
|
|
|
dentry_rcuwalk_invalidate(dentry);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
if (!inode->i_nlink)
|
|
|
|
fsnotify_inoderemove(inode);
|
|
|
|
if (dentry->d_op && dentry->d_op->d_iput)
|
|
|
|
dentry->d_op->d_iput(dentry, inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
|
2013-09-14 06:55:10 +04:00
|
|
|
/*
|
|
|
|
* The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
|
|
|
|
* is in use - which includes both the "real" per-superblock
|
|
|
|
* LRU list _and_ the DCACHE_SHRINK_LIST use.
|
|
|
|
*
|
|
|
|
* The DCACHE_SHRINK_LIST bit is set whenever the dentry is
|
|
|
|
* on the shrink list (ie not on the superblock LRU list).
|
|
|
|
*
|
|
|
|
* The per-cpu "nr_dentry_unused" counters are updated with
|
|
|
|
* the DCACHE_LRU_LIST bit.
|
|
|
|
*
|
|
|
|
* These helper functions make sure we always follow the
|
|
|
|
* rules. d_lock must be held by the caller.
|
|
|
|
*/
|
|
|
|
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
|
|
|
|
static void d_lru_add(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, 0);
|
|
|
|
dentry->d_flags |= DCACHE_LRU_LIST;
|
|
|
|
this_cpu_inc(nr_dentry_unused);
|
|
|
|
WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void d_lru_del(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
|
|
|
|
dentry->d_flags &= ~DCACHE_LRU_LIST;
|
|
|
|
this_cpu_dec(nr_dentry_unused);
|
|
|
|
WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void d_shrink_del(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
|
|
|
|
list_del_init(&dentry->d_lru);
|
|
|
|
dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
|
|
|
|
this_cpu_dec(nr_dentry_unused);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
|
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, 0);
|
|
|
|
list_add(&dentry->d_lru, list);
|
|
|
|
dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
|
|
|
|
this_cpu_inc(nr_dentry_unused);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These can only be called under the global LRU lock, ie during the
|
|
|
|
* callback for freeing the LRU list. "isolate" removes it from the
|
|
|
|
* LRU lists entirely, while shrink_move moves it to the indicated
|
|
|
|
* private list.
|
|
|
|
*/
|
2015-02-13 01:59:35 +03:00
|
|
|
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
|
2013-09-14 06:55:10 +04:00
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
|
|
|
|
dentry->d_flags &= ~DCACHE_LRU_LIST;
|
|
|
|
this_cpu_dec(nr_dentry_unused);
|
2015-02-13 01:59:35 +03:00
|
|
|
list_lru_isolate(lru, &dentry->d_lru);
|
2013-09-14 06:55:10 +04:00
|
|
|
}
|
|
|
|
|
2015-02-13 01:59:35 +03:00
|
|
|
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
|
|
|
|
struct list_head *list)
|
2013-09-14 06:55:10 +04:00
|
|
|
{
|
|
|
|
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
|
|
|
|
dentry->d_flags |= DCACHE_SHRINK_LIST;
|
2015-02-13 01:59:35 +03:00
|
|
|
list_lru_isolate_move(lru, &dentry->d_lru, list);
|
2013-09-14 06:55:10 +04:00
|
|
|
}
|
|
|
|
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
/*
|
2013-08-28 04:18:00 +04:00
|
|
|
* dentry_lru_(add|del)_list) must be called with d_lock held.
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
*/
|
|
|
|
static void dentry_lru_add(struct dentry *dentry)
|
|
|
|
{
|
2013-09-14 06:55:10 +04:00
|
|
|
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
|
|
|
|
d_lru_add(dentry);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:30 +03:00
|
|
|
/**
|
|
|
|
* d_drop - drop a dentry
|
|
|
|
* @dentry: dentry to drop
|
|
|
|
*
|
|
|
|
* d_drop() unhashes the entry from the parent dentry hashes, so that it won't
|
|
|
|
* be found through a VFS lookup any more. Note that this is different from
|
|
|
|
* deleting the dentry - d_delete will try to mark the dentry negative if
|
|
|
|
* possible, giving a successful _negative_ lookup, while d_drop will
|
|
|
|
* just make the cache lookup fail.
|
|
|
|
*
|
|
|
|
* d_drop() is used mainly for stuff that wants to invalidate a dentry for some
|
|
|
|
* reason (NFS timeouts or autofs deletes).
|
|
|
|
*
|
|
|
|
* __d_drop requires dentry->d_lock.
|
|
|
|
*/
|
|
|
|
void __d_drop(struct dentry *dentry)
|
|
|
|
{
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCEES bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddently that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 18:58:46 +04:00
|
|
|
if (!d_unhashed(dentry)) {
|
2013-10-04 19:09:01 +04:00
|
|
|
struct hlist_bl_head *b;
|
dcache: use IS_ROOT to decide where dentry is hashed
Every hashed dentry is either hashed in the dentry_hashtable, or a
superblock's s_anon list.
__d_drop() assumes it can determine which is the case by checking
DCACHE_DISCONNECTED; this is not true.
It is true that when DCACHE_DISCONNECTED is cleared, the dentry is not
only hashed on dentry_hashtable, but is fully connected to its parents
back to the root.
But the converse is *not* true: fs/exportfs/expfs.c:reconnect_path()
attempts to connect a directory (found by filehandle lookup) back to
root by ascending to parents and performing lookups one at a time. It
does not clear DCACHE_DISCONNECTED until it's done, and that is not at
all an atomic process.
In particular, it is possible for DCACHE_DISCONNECTED to be set on a
dentry which is hashed on the dentry_hashtable.
Instead, use IS_ROOT() to check which hash chain a dentry is on. This
*does* work:
Dentries are hashed only by:
- d_obtain_alias, which adds an IS_ROOT() dentry to sb_anon.
- __d_rehash, called by _d_rehash: hashes to the dentry's
parent, and all callers of _d_rehash appear to have d_parent
set to a "real" parent.
- __d_rehash, called by __d_move: rehashes the moved dentry to
hash chain determined by target, and assigns target's d_parent
to its d_parent, before dropping the dentry's d_lock.
Therefore I believe it's safe for a holder of a dentry's d_lock to
assume that it is hashed on sb_anon if and only if IS_ROOT(dentry) is
true.
I believe the incorrect assumption about DCACHE_DISCONNECTED was
originally introduced by ceb5bdc2d246 "fs: dcache per-bucket dcache hash
locking".
Also add a comment while we're here.
Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-06-28 20:10:55 +04:00
|
|
|
/*
|
|
|
|
* Hashed dentries are normally on the dentry hashtable,
|
|
|
|
* with the exception of those newly allocated by
|
|
|
|
* d_obtain_alias, which are always IS_ROOT:
|
|
|
|
*/
|
|
|
|
if (unlikely(IS_ROOT(dentry)))
|
2013-10-04 19:09:01 +04:00
|
|
|
b = &dentry->d_sb->s_anon;
|
|
|
|
else
|
|
|
|
b = d_hash(dentry->d_parent, dentry->d_name.hash);
|
|
|
|
|
|
|
|
hlist_bl_lock(b);
|
|
|
|
__hlist_bl_del(&dentry->d_hash);
|
|
|
|
dentry->d_hash.pprev = NULL;
|
|
|
|
hlist_bl_unlock(b);
|
2015-06-11 15:46:46 +03:00
|
|
|
dentry_rcuwalk_invalidate(dentry);
|
2011-01-07 09:49:30 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__d_drop);
|
|
|
|
|
|
|
|
void d_drop(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
__d_drop(dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_drop);
|
|
|
|
|
2014-05-28 21:51:12 +04:00
|
|
|
static void __dentry_kill(struct dentry *dentry)
|
2011-01-07 09:49:48 +03:00
|
|
|
{
|
dentry_kill(): don't try to remove from shrink list
If the victim in on the shrink list, don't remove it from there.
If shrink_dentry_list() manages to remove it from the list before
we are done - fine, we'll just free it as usual. If not - mark
it with new flag (DCACHE_MAY_FREE) and leave it there.
Eventually, shrink_dentry_list() will get to it, remove the sucker
from shrink list and call dentry_kill(dentry, 0). Which is where
we'll deal with freeing.
Since now dentry_kill(dentry, 0) may happen after or during
dentry_kill(dentry, 1), we need to recognize that (by seeing
DCACHE_DENTRY_KILLED already set), unlock everything
and either free the sucker (in case DCACHE_MAY_FREE has been
set) or leave it for ongoing dentry_kill(dentry, 1) to deal with.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-01 18:30:00 +04:00
|
|
|
struct dentry *parent = NULL;
|
|
|
|
bool can_free = true;
|
|
|
|
if (!IS_ROOT(dentry))
|
2011-01-07 09:49:48 +03:00
|
|
|
parent = dentry->d_parent;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
|
vfs: use lockred "dead" flag to mark unrecoverably dead dentries
This simplifies the RCU to refcounting code in particular.
I was originally intending to leave this for later, but walking through
all the dput() logic (see previous commit), I realized that the dput()
"might_sleep()" check was misleadingly weak. And I removed it as
misleading, both for performance profiling and for debugging.
However, the might_sleep() debugging case is actually true: the final
dput() can indeed sleep, if the inode of the dentry that you are
releasing ends up sleeping at iput time (see dentry_iput()). So the
problem with the might_sleep() in dput() wasn't that it wasn't true, it
was that it wasn't actually testing and triggering on the interesting
case.
In particular, just about *any* dput() can indeed sleep, if you happen
to race with another thread deleting the file in question, and you then
lose the race to the be the last dput() for that file. But because it's
a very rare race, the debugging code would never trigger it in practice.
Why is this problematic? The new d_rcu_to_refcount() (see commit
15570086b590: "vfs: reimplement d_rcu_to_refcount() using
lockref_get_or_lock()") does a dput() for the failure case, and it does
it under the RCU lock. So potentially sleeping really is a bug.
But there's no way I'm going to fix this with the previous complicated
"lockref_get_or_lock()" interface. And rather than revert to the old
and crufty nested dentry locking code (which did get this right by
delaying the reference count updates until they were verified to be
safe), let's make forward progress.
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-09 00:46:52 +04:00
|
|
|
/*
|
|
|
|
* The dentry is now unrecoverably dead to the world.
|
|
|
|
*/
|
|
|
|
lockref_mark_dead(&dentry->d_lockref);
|
|
|
|
|
2011-10-28 21:02:42 +04:00
|
|
|
/*
|
|
|
|
* inform the fs via d_prune that this dentry is about to be
|
|
|
|
* unhashed and destroyed.
|
|
|
|
*/
|
2014-05-30 19:39:02 +04:00
|
|
|
if (dentry->d_flags & DCACHE_OP_PRUNE)
|
2013-04-15 10:13:21 +04:00
|
|
|
dentry->d_op->d_prune(dentry);
|
|
|
|
|
2014-04-30 07:42:52 +04:00
|
|
|
if (dentry->d_flags & DCACHE_LRU_LIST) {
|
|
|
|
if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
|
|
|
|
d_lru_del(dentry);
|
|
|
|
}
|
2011-01-07 09:49:48 +03:00
|
|
|
/* if it was on the hash then remove it */
|
|
|
|
__d_drop(dentry);
|
2014-10-27 02:31:10 +03:00
|
|
|
__list_del_entry(&dentry->d_child);
|
2014-04-29 23:45:28 +04:00
|
|
|
/*
|
|
|
|
* Inform d_walk() that we are no longer attached to the
|
|
|
|
* dentry tree
|
|
|
|
*/
|
|
|
|
dentry->d_flags |= DCACHE_DENTRY_KILLED;
|
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
dentry_iput(dentry);
|
|
|
|
/*
|
|
|
|
* dentry_iput drops the locks, at which point nobody (except
|
|
|
|
* transient RCU lookups) can reach this dentry.
|
|
|
|
*/
|
2015-01-10 02:19:03 +03:00
|
|
|
BUG_ON(dentry->d_lockref.count > 0);
|
2014-04-29 23:45:28 +04:00
|
|
|
this_cpu_dec(nr_dentry);
|
|
|
|
if (dentry->d_op && dentry->d_op->d_release)
|
|
|
|
dentry->d_op->d_release(dentry);
|
|
|
|
|
dentry_kill(): don't try to remove from shrink list
If the victim in on the shrink list, don't remove it from there.
If shrink_dentry_list() manages to remove it from the list before
we are done - fine, we'll just free it as usual. If not - mark
it with new flag (DCACHE_MAY_FREE) and leave it there.
Eventually, shrink_dentry_list() will get to it, remove the sucker
from shrink list and call dentry_kill(dentry, 0). Which is where
we'll deal with freeing.
Since now dentry_kill(dentry, 0) may happen after or during
dentry_kill(dentry, 1), we need to recognize that (by seeing
DCACHE_DENTRY_KILLED already set), unlock everything
and either free the sucker (in case DCACHE_MAY_FREE has been
set) or leave it for ongoing dentry_kill(dentry, 1) to deal with.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-01 18:30:00 +04:00
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
|
|
|
|
dentry->d_flags |= DCACHE_MAY_FREE;
|
|
|
|
can_free = false;
|
|
|
|
}
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
if (likely(can_free))
|
|
|
|
dentry_free(dentry);
|
2014-05-28 21:51:12 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Finish off a dentry we've decided to kill.
|
|
|
|
* dentry->d_lock must be held, returns with it unlocked.
|
|
|
|
* If ref is non-zero, then decrement the refcount too.
|
|
|
|
* Returns dentry requiring refcount drop, or NULL if we're done.
|
|
|
|
*/
|
2014-05-29 17:18:26 +04:00
|
|
|
static struct dentry *dentry_kill(struct dentry *dentry)
|
2014-05-28 21:51:12 +04:00
|
|
|
__releases(dentry->d_lock)
|
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
struct dentry *parent = NULL;
|
|
|
|
|
|
|
|
if (inode && unlikely(!spin_trylock(&inode->i_lock)))
|
|
|
|
goto failed;
|
|
|
|
|
|
|
|
if (!IS_ROOT(dentry)) {
|
|
|
|
parent = dentry->d_parent;
|
|
|
|
if (unlikely(!spin_trylock(&parent->d_lock))) {
|
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
goto failed;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
__dentry_kill(dentry);
|
2014-04-29 23:45:28 +04:00
|
|
|
return parent;
|
2014-05-28 21:51:12 +04:00
|
|
|
|
|
|
|
failed:
|
2014-05-29 17:18:26 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
cpu_relax();
|
2014-05-28 21:51:12 +04:00
|
|
|
return dentry; /* try again with same dentry */
|
2011-01-07 09:49:48 +03:00
|
|
|
}
|
|
|
|
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
static inline struct dentry *lock_parent(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct dentry *parent = dentry->d_parent;
|
|
|
|
if (IS_ROOT(dentry))
|
|
|
|
return NULL;
|
2015-01-10 02:19:03 +03:00
|
|
|
if (unlikely(dentry->d_lockref.count < 0))
|
2014-06-12 08:29:13 +04:00
|
|
|
return NULL;
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
if (likely(spin_trylock(&parent->d_lock)))
|
|
|
|
return parent;
|
|
|
|
rcu_read_lock();
|
2014-06-12 08:29:13 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
again:
|
|
|
|
parent = ACCESS_ONCE(dentry->d_parent);
|
|
|
|
spin_lock(&parent->d_lock);
|
|
|
|
/*
|
|
|
|
* We can't blindly lock dentry until we are sure
|
|
|
|
* that we won't violate the locking order.
|
|
|
|
* Any changes of dentry->d_parent must have
|
|
|
|
* been done with parent->d_lock held, so
|
|
|
|
* spin_lock() above is enough of a barrier
|
|
|
|
* for checking if it's still our child.
|
|
|
|
*/
|
|
|
|
if (unlikely(parent != dentry->d_parent)) {
|
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
if (parent != dentry)
|
2014-05-31 20:13:21 +04:00
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
else
|
|
|
|
parent = NULL;
|
|
|
|
return parent;
|
|
|
|
}
|
|
|
|
|
2015-01-10 02:19:03 +03:00
|
|
|
/*
|
|
|
|
* Try to do a lockless dput(), and return whether that was successful.
|
|
|
|
*
|
|
|
|
* If unsuccessful, we return false, having already taken the dentry lock.
|
|
|
|
*
|
|
|
|
* The caller needs to hold the RCU read lock, so that the dentry is
|
|
|
|
* guaranteed to stay around even if the refcount goes down to zero!
|
|
|
|
*/
|
|
|
|
static inline bool fast_dput(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
unsigned int d_flags;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have a d_op->d_delete() operation, we sould not
|
freeing unlinked file indefinitely delayed
Normally opening a file, unlinking it and then closing will have
the inode freed upon close() (provided that it's not otherwise busy and
has no remaining links, of course). However, there's one case where that
does *not* happen. Namely, if you open it by fhandle with cold dcache,
then unlink() and close().
In normal case you get d_delete() in unlink(2) notice that dentry
is busy and unhash it; on the final dput() it will be forcibly evicted from
dcache, triggering iput() and inode removal. In this case, though, we end
up with *two* dentries - disconnected (created by open-by-fhandle) and
regular one (used by unlink()). The latter will have its reference to inode
dropped just fine, but the former will not - it's considered hashed (it
is on the ->s_anon list), so it will stay around until the memory pressure
will finally do it in. As the result, we have the final iput() delayed
indefinitely. It's trivial to reproduce -
void flush_dcache(void)
{
system("mount -o remount,rw /");
}
static char buf[20 * 1024 * 1024];
main()
{
int fd;
union {
struct file_handle f;
char buf[MAX_HANDLE_SZ];
} x;
int m;
x.f.handle_bytes = sizeof(x);
chdir("/root");
mkdir("foo", 0700);
fd = open("foo/bar", O_CREAT | O_RDWR, 0600);
close(fd);
name_to_handle_at(AT_FDCWD, "foo/bar", &x.f, &m, 0);
flush_dcache();
fd = open_by_handle_at(AT_FDCWD, &x.f, O_RDWR);
unlink("foo/bar");
write(fd, buf, sizeof(buf));
system("df ."); /* 20Mb eaten */
close(fd);
system("df ."); /* should've freed those 20Mb */
flush_dcache();
system("df ."); /* should be the same as #2 */
}
will spit out something like
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 283282 21692 93% /
- inode gets freed only when dentry is finally evicted (here we trigger
than by remount; normally it would've happened in response to memory
pressure hell knows when).
Cc: stable@vger.kernel.org # v2.6.38+; earlier ones need s/kill_it/unhash_it/
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-07-08 04:42:38 +03:00
|
|
|
* let the dentry count go to zero, so use "put_or_lock".
|
2015-01-10 02:19:03 +03:00
|
|
|
*/
|
|
|
|
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
|
|
|
|
return lockref_put_or_lock(&dentry->d_lockref);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* .. otherwise, we can try to just decrement the
|
|
|
|
* lockref optimistically.
|
|
|
|
*/
|
|
|
|
ret = lockref_put_return(&dentry->d_lockref);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the lockref_put_return() failed due to the lock being held
|
|
|
|
* by somebody else, the fast path has failed. We will need to
|
|
|
|
* get the lock, and then check the count again.
|
|
|
|
*/
|
|
|
|
if (unlikely(ret < 0)) {
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (dentry->d_lockref.count > 1) {
|
|
|
|
dentry->d_lockref.count--;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we weren't the last ref, we're done.
|
|
|
|
*/
|
|
|
|
if (ret)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Careful, careful. The reference count went down
|
|
|
|
* to zero, but we don't hold the dentry lock, so
|
|
|
|
* somebody else could get it again, and do another
|
|
|
|
* dput(), and we need to not race with that.
|
|
|
|
*
|
|
|
|
* However, there is a very special and common case
|
|
|
|
* where we don't care, because there is nothing to
|
|
|
|
* do: the dentry is still hashed, it does not have
|
|
|
|
* a 'delete' op, and it's referenced and already on
|
|
|
|
* the LRU list.
|
|
|
|
*
|
|
|
|
* NOTE! Since we aren't locked, these values are
|
|
|
|
* not "stable". However, it is sufficient that at
|
|
|
|
* some point after we dropped the reference the
|
|
|
|
* dentry was hashed and the flags had the proper
|
|
|
|
* value. Other dentry users may have re-gotten
|
|
|
|
* a reference to the dentry and change that, but
|
|
|
|
* our work is done - we can leave the dentry
|
|
|
|
* around with a zero refcount.
|
|
|
|
*/
|
|
|
|
smp_rmb();
|
|
|
|
d_flags = ACCESS_ONCE(dentry->d_flags);
|
freeing unlinked file indefinitely delayed
Normally opening a file, unlinking it and then closing will have
the inode freed upon close() (provided that it's not otherwise busy and
has no remaining links, of course). However, there's one case where that
does *not* happen. Namely, if you open it by fhandle with cold dcache,
then unlink() and close().
In normal case you get d_delete() in unlink(2) notice that dentry
is busy and unhash it; on the final dput() it will be forcibly evicted from
dcache, triggering iput() and inode removal. In this case, though, we end
up with *two* dentries - disconnected (created by open-by-fhandle) and
regular one (used by unlink()). The latter will have its reference to inode
dropped just fine, but the former will not - it's considered hashed (it
is on the ->s_anon list), so it will stay around until the memory pressure
will finally do it in. As the result, we have the final iput() delayed
indefinitely. It's trivial to reproduce -
void flush_dcache(void)
{
system("mount -o remount,rw /");
}
static char buf[20 * 1024 * 1024];
main()
{
int fd;
union {
struct file_handle f;
char buf[MAX_HANDLE_SZ];
} x;
int m;
x.f.handle_bytes = sizeof(x);
chdir("/root");
mkdir("foo", 0700);
fd = open("foo/bar", O_CREAT | O_RDWR, 0600);
close(fd);
name_to_handle_at(AT_FDCWD, "foo/bar", &x.f, &m, 0);
flush_dcache();
fd = open_by_handle_at(AT_FDCWD, &x.f, O_RDWR);
unlink("foo/bar");
write(fd, buf, sizeof(buf));
system("df ."); /* 20Mb eaten */
close(fd);
system("df ."); /* should've freed those 20Mb */
flush_dcache();
system("df ."); /* should be the same as #2 */
}
will spit out something like
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 283282 21692 93% /
- inode gets freed only when dentry is finally evicted (here we trigger
than by remount; normally it would've happened in response to memory
pressure hell knows when).
Cc: stable@vger.kernel.org # v2.6.38+; earlier ones need s/kill_it/unhash_it/
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-07-08 04:42:38 +03:00
|
|
|
d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
|
2015-01-10 02:19:03 +03:00
|
|
|
|
|
|
|
/* Nothing to do? Dropping the reference was all we needed? */
|
|
|
|
if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Not the fast normal case? Get the lock. We've already decremented
|
|
|
|
* the refcount, but we'll need to re-check the situation after
|
|
|
|
* getting the lock.
|
|
|
|
*/
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Did somebody else grab a reference to it in the meantime, and
|
|
|
|
* we're no longer the last user after all? Alternatively, somebody
|
|
|
|
* else could have killed it and marked it dead. Either way, we
|
|
|
|
* don't need to do anything else.
|
|
|
|
*/
|
|
|
|
if (dentry->d_lockref.count) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Re-get the reference we optimistically dropped. We hold the
|
|
|
|
* lock, and we just tested that it was zero, so we can just
|
|
|
|
* set it to 1.
|
|
|
|
*/
|
|
|
|
dentry->d_lockref.count = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* This is dput
|
|
|
|
*
|
|
|
|
* This is complicated by the fact that we do not want to put
|
|
|
|
* dentries that are no longer on any hash chain on the unused
|
|
|
|
* list: we'd much rather just get rid of them immediately.
|
|
|
|
*
|
|
|
|
* However, that implies that we have to traverse the dentry
|
|
|
|
* tree upwards to the parents which might _also_ now be
|
|
|
|
* scheduled for deletion (it may have been only waiting for
|
|
|
|
* its last child to go away).
|
|
|
|
*
|
|
|
|
* This tail recursion is done by hand as we don't want to depend
|
|
|
|
* on the compiler to always get this right (gcc generally doesn't).
|
|
|
|
* Real recursion would eat up our stack space.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dput - release a dentry
|
|
|
|
* @dentry: dentry to release
|
|
|
|
*
|
|
|
|
* Release a dentry. This will drop the usage count and if appropriate
|
|
|
|
* call the dentry unlink method as well as removing it from the queues and
|
|
|
|
* releasing its resources. If the parent dentries were scheduled for release
|
|
|
|
* they too may now get deleted.
|
|
|
|
*/
|
|
|
|
void dput(struct dentry *dentry)
|
|
|
|
{
|
vfs: reorganize dput() memory accesses
This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.
The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before really need to.
More importantly, the debugging test in question is _wrong_, and has
hidden bugs. It's true that we can only sleep when the count goes down
to zero, but the test as-is hides the much more subtle bug that happens
if we race with somebody else deleting the file.
Anyway we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()") which annotates the
costs better. So remove the misleading debug code.
The other was an unnecessary access to the cacheline that contains the
d_lru list, just to check whether we already were on the LRU list or
not. This is exactly what we have d_flags for, so that we can avoid
touching extra cache lines for the common case. So just add another bit
for "is this dentry on the LRU".
Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.
This makes the profiles look much saner.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-09 00:26:18 +04:00
|
|
|
if (unlikely(!dentry))
|
2005-04-17 02:20:36 +04:00
|
|
|
return;
|
|
|
|
|
|
|
|
repeat:
|
2015-01-10 02:19:03 +03:00
|
|
|
rcu_read_lock();
|
|
|
|
if (likely(fast_dput(dentry))) {
|
|
|
|
rcu_read_unlock();
|
2005-04-17 02:20:36 +04:00
|
|
|
return;
|
2015-01-10 02:19:03 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Slow case: now with the dentry lock held */
|
|
|
|
rcu_read_unlock();
|
2005-04-17 02:20:36 +04:00
|
|
|
|
vfs: reorganize dput() memory accesses
This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.
The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before really need to.
More importantly, the debugging test in question is _wrong_, and has
hidden bugs. It's true that we can only sleep when the count goes down
to zero, but the test as-is hides the much more subtle bug that happens
if we race with somebody else deleting the file.
Anyway we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()") which annotates the
costs better. So remove the misleading debug code.
The other was an unnecessary access to the cacheline that contains the
d_lru list, just to check whether we already were on the LRU list or
not. This is exactly what we have d_flags for, so that we can avoid
touching extra cache lines for the common case. So just add another bit
for "is this dentry on the LRU".
Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.
This makes the profiles look much saner.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-09 00:26:18 +04:00
|
|
|
/* Unreachable? Get rid of it */
|
|
|
|
if (unlikely(d_unhashed(dentry)))
|
|
|
|
goto kill_it;
|
|
|
|
|
freeing unlinked file indefinitely delayed
Normally opening a file, unlinking it and then closing will have
the inode freed upon close() (provided that it's not otherwise busy and
has no remaining links, of course). However, there's one case where that
does *not* happen. Namely, if you open it by fhandle with cold dcache,
then unlink() and close().
In normal case you get d_delete() in unlink(2) notice that dentry
is busy and unhash it; on the final dput() it will be forcibly evicted from
dcache, triggering iput() and inode removal. In this case, though, we end
up with *two* dentries - disconnected (created by open-by-fhandle) and
regular one (used by unlink()). The latter will have its reference to inode
dropped just fine, but the former will not - it's considered hashed (it
is on the ->s_anon list), so it will stay around until the memory pressure
will finally do it in. As the result, we have the final iput() delayed
indefinitely. It's trivial to reproduce -
void flush_dcache(void)
{
system("mount -o remount,rw /");
}
static char buf[20 * 1024 * 1024];
main()
{
int fd;
union {
struct file_handle f;
char buf[MAX_HANDLE_SZ];
} x;
int m;
x.f.handle_bytes = sizeof(x);
chdir("/root");
mkdir("foo", 0700);
fd = open("foo/bar", O_CREAT | O_RDWR, 0600);
close(fd);
name_to_handle_at(AT_FDCWD, "foo/bar", &x.f, &m, 0);
flush_dcache();
fd = open_by_handle_at(AT_FDCWD, &x.f, O_RDWR);
unlink("foo/bar");
write(fd, buf, sizeof(buf));
system("df ."); /* 20Mb eaten */
close(fd);
system("df ."); /* should've freed those 20Mb */
flush_dcache();
system("df ."); /* should be the same as #2 */
}
will spit out something like
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 303843 1131 100% /
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/root 322023 283282 21692 93% /
- inode gets freed only when dentry is finally evicted (here we trigger
than by remount; normally it would've happened in response to memory
pressure hell knows when).
Cc: stable@vger.kernel.org # v2.6.38+; earlier ones need s/kill_it/unhash_it/
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-07-08 04:42:38 +03:00
|
|
|
if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
|
|
|
|
goto kill_it;
|
|
|
|
|
vfs: reorganize dput() memory accesses
This is me being a bit OCD after all the dentry optimization work this
merge window: profiles end up showing 'dput()' as a rather expensive
operation, and there were two unrelated bad reasons for that.
The first reason was reading d_lockref.count for debugging purposes,
which touches the lockref cacheline (for reads) before really need to.
More importantly, the debugging test in question is _wrong_, and has
hidden bugs. It's true that we can only sleep when the count goes down
to zero, but the test as-is hides the much more subtle bug that happens
if we race with somebody else deleting the file.
Anyway we _will_ touch that cacheline, but let's do it for a write and
in the right routine (ie in "lockref_put_or_lock()") which annotates the
costs better. So remove the misleading debug code.
The other was an unnecessary access to the cacheline that contains the
d_lru list, just to check whether we already were on the LRU list or
not. This is exactly what we have d_flags for, so that we can avoid
touching extra cache lines for the common case. So just add another bit
for "is this dentry on the LRU".
Finally, mark the tests properly likely/unlikely, so that the common
fast-paths are dense in the instruction stream.
This makes the profiles look much saner.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-09 00:26:18 +04:00
|
|
|
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
|
2005-04-17 02:20:36 +04:00
|
|
|
if (dentry->d_op->d_delete(dentry))
|
2011-01-07 09:49:40 +03:00
|
|
|
goto kill_it;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-10-10 13:36:24 +04:00
|
|
|
|
2013-11-01 02:43:02 +04:00
|
|
|
if (!(dentry->d_flags & DCACHE_REFERENCED))
|
|
|
|
dentry->d_flags |= DCACHE_REFERENCED;
|
2010-10-10 13:36:26 +04:00
|
|
|
dentry_lru_add(dentry);
|
2010-10-10 13:36:24 +04:00
|
|
|
|
2013-08-29 05:24:59 +04:00
|
|
|
dentry->d_lockref.count--;
|
2011-01-07 09:49:40 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
return;
|
|
|
|
|
2007-05-08 11:23:46 +04:00
|
|
|
kill_it:
|
2014-05-29 17:18:26 +04:00
|
|
|
dentry = dentry_kill(dentry);
|
2007-05-08 11:23:46 +04:00
|
|
|
if (dentry)
|
|
|
|
goto repeat;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(dput);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
|
2011-01-07 09:49:38 +03:00
|
|
|
/* This must be called with d_lock held */
|
2011-01-07 09:49:43 +03:00
|
|
|
static inline void __dget_dlock(struct dentry *dentry)
|
2011-01-07 09:49:31 +03:00
|
|
|
{
|
2013-08-29 05:24:59 +04:00
|
|
|
dentry->d_lockref.count++;
|
2011-01-07 09:49:31 +03:00
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:43 +03:00
|
|
|
static inline void __dget(struct dentry *dentry)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2013-08-29 05:24:59 +04:00
|
|
|
lockref_get(&dentry->d_lockref);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:32 +03:00
|
|
|
struct dentry *dget_parent(struct dentry *dentry)
|
|
|
|
{
|
2013-09-02 22:29:22 +04:00
|
|
|
int gotref;
|
2011-01-07 09:49:32 +03:00
|
|
|
struct dentry *ret;
|
|
|
|
|
2013-09-02 22:29:22 +04:00
|
|
|
/*
|
|
|
|
* Do optimistic parent lookup without any
|
|
|
|
* locking.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
|
|
|
ret = ACCESS_ONCE(dentry->d_parent);
|
|
|
|
gotref = lockref_get_not_zero(&ret->d_lockref);
|
|
|
|
rcu_read_unlock();
|
|
|
|
if (likely(gotref)) {
|
|
|
|
if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
|
|
|
|
return ret;
|
|
|
|
dput(ret);
|
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:32 +03:00
|
|
|
repeat:
|
2011-01-07 09:49:44 +03:00
|
|
|
/*
|
|
|
|
* Don't need rcu_dereference because we re-check it was correct under
|
|
|
|
* the lock.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2011-01-07 09:49:32 +03:00
|
|
|
ret = dentry->d_parent;
|
2011-01-07 09:49:44 +03:00
|
|
|
spin_lock(&ret->d_lock);
|
|
|
|
if (unlikely(ret != dentry->d_parent)) {
|
|
|
|
spin_unlock(&ret->d_lock);
|
|
|
|
rcu_read_unlock();
|
2011-01-07 09:49:32 +03:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 09:49:44 +03:00
|
|
|
rcu_read_unlock();
|
2013-08-29 05:24:59 +04:00
|
|
|
BUG_ON(!ret->d_lockref.count);
|
|
|
|
ret->d_lockref.count++;
|
2011-01-07 09:49:32 +03:00
|
|
|
spin_unlock(&ret->d_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dget_parent);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* d_find_alias - grab a hashed alias of inode
|
|
|
|
* @inode: inode in question
|
|
|
|
*
|
|
|
|
* If inode has a hashed alias, or is a directory and has any alias,
|
|
|
|
* acquire the reference to alias and return it. Otherwise return NULL.
|
|
|
|
* Notice that if inode is a directory there can be only one alias and
|
|
|
|
* it can be unhashed only if it has no children, or if it is the root
|
2014-02-13 04:08:06 +04:00
|
|
|
* of a filesystem, or if the directory was renamed and d_revalidate
|
|
|
|
* was the first vfs operation to notice.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is not testing of IS_ROOT anythere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-exisiting dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 13:16:16 +04:00
|
|
|
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
|
2014-01-16 20:15:51 +04:00
|
|
|
* any other hashed alias over that one.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2014-01-16 20:15:51 +04:00
|
|
|
static struct dentry *__d_find_alias(struct inode *inode)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2011-01-07 09:49:33 +03:00
|
|
|
struct dentry *alias, *discon_alias;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-01-07 09:49:33 +03:00
|
|
|
again:
|
|
|
|
discon_alias = NULL;
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
|
2011-01-07 09:49:33 +03:00
|
|
|
spin_lock(&alias->d_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is not testing of IS_ROOT anythere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-exisiting dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 13:16:16 +04:00
|
|
|
if (IS_ROOT(alias) &&
|
2011-01-07 09:49:33 +03:00
|
|
|
(alias->d_flags & DCACHE_DISCONNECTED)) {
|
2005-04-17 02:20:36 +04:00
|
|
|
discon_alias = alias;
|
2014-01-16 20:15:51 +04:00
|
|
|
} else {
|
2011-01-07 09:49:43 +03:00
|
|
|
__dget_dlock(alias);
|
2011-01-07 09:49:33 +03:00
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
return alias;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
}
|
|
|
|
if (discon_alias) {
|
|
|
|
alias = discon_alias;
|
|
|
|
spin_lock(&alias->d_lock);
|
|
|
|
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
|
2014-01-17 02:17:31 +04:00
|
|
|
__dget_dlock(alias);
|
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
return alias;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2011-01-07 09:49:33 +03:00
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
goto again;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2011-01-07 09:49:33 +03:00
|
|
|
return NULL;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:33 +03:00
|
|
|
struct dentry *d_find_alias(struct inode *inode)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2006-03-25 14:06:36 +03:00
|
|
|
struct dentry *de = NULL;
|
|
|
|
|
2012-06-09 21:51:19 +04:00
|
|
|
if (!hlist_empty(&inode->i_dentry)) {
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_lock(&inode->i_lock);
|
2014-01-16 20:15:51 +04:00
|
|
|
de = __d_find_alias(inode);
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2006-03-25 14:06:36 +03:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
return de;
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_find_alias);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to kill dentries associated with this inode.
|
|
|
|
* WARNING: you must own a reference to inode.
|
|
|
|
*/
|
|
|
|
void d_prune_aliases(struct inode *inode)
|
|
|
|
{
|
2005-09-10 11:27:07 +04:00
|
|
|
struct dentry *dentry;
|
2005-04-17 02:20:36 +04:00
|
|
|
restart:
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_lock(&inode->i_lock);
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_lock(&dentry->d_lock);
|
2013-08-29 05:24:59 +04:00
|
|
|
if (!dentry->d_lockref.count) {
|
2014-05-30 19:25:30 +04:00
|
|
|
struct dentry *parent = lock_parent(dentry);
|
|
|
|
if (likely(!dentry->d_lockref.count)) {
|
|
|
|
__dentry_kill(dentry);
|
2014-11-19 10:50:34 +03:00
|
|
|
dput(parent);
|
2014-05-30 19:25:30 +04:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_prune_aliases);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-10-10 13:36:25 +04:00
|
|
|
static void shrink_dentry_list(struct list_head *list)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2014-04-30 00:13:18 +04:00
|
|
|
struct dentry *dentry, *parent;
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
|
2014-05-02 23:38:39 +04:00
|
|
|
while (!list_empty(list)) {
|
2014-05-28 21:59:13 +04:00
|
|
|
struct inode *inode;
|
2014-05-02 23:38:39 +04:00
|
|
|
dentry = list_entry(list->prev, struct dentry, d_lru);
|
2011-01-07 09:49:47 +03:00
|
|
|
spin_lock(&dentry->d_lock);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
parent = lock_parent(dentry);
|
|
|
|
|
2013-08-28 04:17:55 +04:00
|
|
|
/*
|
|
|
|
* The dispose list is isolated and dentries are not accounted
|
|
|
|
* to the LRU here, so we can simply remove it from the list
|
|
|
|
* here regardless of whether it is referenced or not.
|
|
|
|
*/
|
2013-09-14 06:55:10 +04:00
|
|
|
d_shrink_del(dentry);
|
2013-08-28 04:17:55 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* We found an inuse dentry which was not removed from
|
2013-08-28 04:17:55 +04:00
|
|
|
* the LRU because of laziness during lookup. Do not free it.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2015-01-10 02:19:03 +03:00
|
|
|
if (dentry->d_lockref.count > 0) {
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
continue;
|
|
|
|
}
|
2011-01-07 09:49:48 +03:00
|
|
|
|
2014-05-28 17:48:44 +04:00
|
|
|
|
|
|
|
if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
|
|
|
|
bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
2014-05-28 17:48:44 +04:00
|
|
|
if (can_free)
|
|
|
|
dentry_free(dentry);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-05-28 21:59:13 +04:00
|
|
|
inode = dentry->d_inode;
|
|
|
|
if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
|
2013-09-14 06:55:10 +04:00
|
|
|
d_shrink_add(dentry, list);
|
2013-08-28 04:17:55 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
2014-04-30 00:13:18 +04:00
|
|
|
continue;
|
2013-08-28 04:17:55 +04:00
|
|
|
}
|
2014-05-28 21:59:13 +04:00
|
|
|
|
|
|
|
__dentry_kill(dentry);
|
shrink_dentry_list(): take parent's ->d_lock earlier
The cause of livelocks there is that we are taking ->d_lock on
dentry and its parent in the wrong order, forcing us to use
trylock on the parent's one. d_walk() takes them in the right
order, and unfortunately it's not hard to create a situation
when shrink_dentry_list() can't make progress since trylock
keeps failing, and shrink_dcache_parent() or check_submounts_and_drop()
keeps calling d_walk() disrupting the very shrink_dentry_list() it's
waiting for.
Solution is straightforward - if that trylock fails, let's unlock
the dentry itself and take locks in the right order. We need to
stabilize ->d_parent without holding ->d_lock, but that's doable
using RCU. And we'd better do that in the very beginning of the
loop in shrink_dentry_list(), since the checks on refcount, etc.
would need to be redone anyway.
That deals with a half of the problem - killing dentries on the
shrink list itself. Another one (dropping their parents) is
in the next commit.
locking parent is interesting - it would be easy to do rcu_read_lock(),
lock whatever we think is a parent, lock dentry itself and check
if the parent is still the right one. Except that we need to check
that *before* locking the dentry, or we are risking taking ->d_lock
out of order. Fortunately, once the D1 is locked, we can check if
D2->d_parent is equal to D1 without the need to lock D2; D2->d_parent
can start or stop pointing to D1 only under D1->d_lock, so taking
D1->d_lock is enough. In other words, the right solution is
rcu_read_lock/lock what looks like parent right now/check if it's
still our parent/rcu_read_unlock/lock the child.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-05-29 16:54:52 +04:00
|
|
|
|
2014-04-30 00:13:18 +04:00
|
|
|
/*
|
|
|
|
* We need to prune ancestors too. This is necessary to prevent
|
|
|
|
* quadratic behavior of shrink_dcache_parent(), but is also
|
|
|
|
* expected to be beneficial in reducing dentry cache
|
|
|
|
* fragmentation.
|
|
|
|
*/
|
|
|
|
dentry = parent;
|
2014-05-29 17:11:45 +04:00
|
|
|
while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
|
|
|
|
parent = lock_parent(dentry);
|
|
|
|
if (dentry->d_lockref.count != 1) {
|
|
|
|
dentry->d_lockref.count--;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
inode = dentry->d_inode; /* can't be NULL */
|
|
|
|
if (unlikely(!spin_trylock(&inode->i_lock))) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
cpu_relax();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
__dentry_kill(dentry);
|
|
|
|
dentry = parent;
|
|
|
|
}
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
}
|
2010-10-10 13:36:25 +04:00
|
|
|
}
|
|
|
|
|
2015-02-13 01:59:35 +03:00
|
|
|
static enum lru_status dentry_lru_isolate(struct list_head *item,
|
|
|
|
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
|
2013-08-28 04:18:00 +04:00
|
|
|
{
|
|
|
|
struct list_head *freeable = arg;
|
|
|
|
struct dentry *dentry = container_of(item, struct dentry, d_lru);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we are inverting the lru lock/dentry->d_lock here,
|
|
|
|
* so use a trylock. If we fail to get the lock, just skip
|
|
|
|
* it
|
|
|
|
*/
|
|
|
|
if (!spin_trylock(&dentry->d_lock))
|
|
|
|
return LRU_SKIP;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Referenced dentries are still in use. If they have active
|
|
|
|
* counts, just remove them from the LRU. Otherwise give them
|
|
|
|
* another pass through the LRU.
|
|
|
|
*/
|
|
|
|
if (dentry->d_lockref.count) {
|
2015-02-13 01:59:35 +03:00
|
|
|
d_lru_isolate(lru, dentry);
|
2013-08-28 04:18:00 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return LRU_REMOVED;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dentry->d_flags & DCACHE_REFERENCED) {
|
|
|
|
dentry->d_flags &= ~DCACHE_REFERENCED;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The list move itself will be made by the common LRU code. At
|
|
|
|
* this point, we've dropped the dentry->d_lock but keep the
|
|
|
|
* lru lock. This is safe to do, since every list movement is
|
|
|
|
* protected by the lru lock even if both locks are held.
|
|
|
|
*
|
|
|
|
* This is guaranteed by the fact that all LRU management
|
|
|
|
* functions are intermediated by the LRU API calls like
|
|
|
|
* list_lru_add and list_lru_del. List movement in this file
|
|
|
|
* only ever occur through this functions or through callbacks
|
|
|
|
* like this one, that are called from the LRU API.
|
|
|
|
*
|
|
|
|
* The only exceptions to this are functions like
|
|
|
|
* shrink_dentry_list, and code that first checks for the
|
|
|
|
* DCACHE_SHRINK_LIST flag. Those are guaranteed to be
|
|
|
|
* operating only with stack provided lists after they are
|
|
|
|
* properly isolated from the main list. It is thus, always a
|
|
|
|
* local access.
|
|
|
|
*/
|
|
|
|
return LRU_ROTATE;
|
|
|
|
}
|
|
|
|
|
2015-02-13 01:59:35 +03:00
|
|
|
d_lru_shrink_move(lru, dentry, freeable);
|
2013-08-28 04:18:00 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
|
|
|
|
return LRU_REMOVED;
|
|
|
|
}
|
|
|
|
|
2010-10-10 13:36:25 +04:00
|
|
|
/**
|
2011-08-23 12:56:24 +04:00
|
|
|
* prune_dcache_sb - shrink the dcache
|
|
|
|
* @sb: superblock
|
list_lru: introduce list_lru_shrink_{count,walk}
Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support. That means when we hit the limit we will get ENOMEM w/o any
chance to recover. What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup. This is what
this patch set is intended to do.
Basically, it does two things. First, it introduces the notion of
per-memcg slab shrinker. A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be
passed the memory cgroup to scan from in shrink_control->memcg. For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.
Secondly, this patch set makes the list_lru structure per-memcg. It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru. Then the
list_lru will automatically distribute objects among per-memcg lists
basing on which cgroup the object is accounted to. This way to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.
As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit. Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.
This patch (of 9):
NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists. Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:
count = list_lru_count_node(lru, sc->nid);
freed = list_lru_walk_node(lru, sc->nid, isolate_func,
isolate_arg, &sc->nr_to_scan);
where sc is an instance of the shrink_control structure passed to it
from vmscan.
To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.
This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-13 01:58:47 +03:00
|
|
|
* @sc: shrink control, passed to list_lru_shrink_walk()
|
2011-08-23 12:56:24 +04:00
|
|
|
*
|
list_lru: introduce list_lru_shrink_{count,walk}
Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support. That means when we hit the limit we will get ENOMEM w/o any
chance to recover. What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup. This is what
this patch set is intended to do.
Basically, it does two things. First, it introduces the notion of
per-memcg slab shrinker. A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be
passed the memory cgroup to scan from in shrink_control->memcg. For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.
Secondly, this patch set makes the list_lru structure per-memcg. It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru. Then the
list_lru will automatically distribute objects among per-memcg lists
basing on which cgroup the object is accounted to. This way to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.
As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit. Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.
This patch (of 9):
NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists. Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:
count = list_lru_count_node(lru, sc->nid);
freed = list_lru_walk_node(lru, sc->nid, isolate_func,
isolate_arg, &sc->nr_to_scan);
where sc is an instance of the shrink_control structure passed to it
from vmscan.
To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.
This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-13 01:58:47 +03:00
|
|
|
* Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
|
|
|
|
* is done when we need more memory and called from the superblock shrinker
|
2011-08-23 12:56:24 +04:00
|
|
|
* function.
|
2010-10-10 13:36:25 +04:00
|
|
|
*
|
2011-08-23 12:56:24 +04:00
|
|
|
* This function may fail to free any resources if all the dentries are in
|
|
|
|
* use.
|
2010-10-10 13:36:25 +04:00
|
|
|
*/
|
list_lru: introduce list_lru_shrink_{count,walk}
Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support. That means when we hit the limit we will get ENOMEM w/o any
chance to recover. What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup. This is what
this patch set is intended to do.
Basically, it does two things. First, it introduces the notion of
per-memcg slab shrinker. A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be
passed the memory cgroup to scan from in shrink_control->memcg. For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.
Secondly, this patch set makes the list_lru structure per-memcg. It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru. Then the
list_lru will automatically distribute objects among per-memcg lists
basing on which cgroup the object is accounted to. This way to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.
As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit. Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.
This patch (of 9):
NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists. Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:
count = list_lru_count_node(lru, sc->nid);
freed = list_lru_walk_node(lru, sc->nid, isolate_func,
isolate_arg, &sc->nr_to_scan);
where sc is an instance of the shrink_control structure passed to it
from vmscan.
To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.
This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-13 01:58:47 +03:00
|
|
|
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
|
2010-10-10 13:36:25 +04:00
|
|
|
{
|
2013-08-28 04:18:00 +04:00
|
|
|
LIST_HEAD(dispose);
|
|
|
|
long freed;
|
2010-10-10 13:36:25 +04:00
|
|
|
|
list_lru: introduce list_lru_shrink_{count,walk}
Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support. That means when we hit the limit we will get ENOMEM w/o any
chance to recover. What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup. This is what
this patch set is intended to do.
Basically, it does two things. First, it introduces the notion of
per-memcg slab shrinker. A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be
passed the memory cgroup to scan from in shrink_control->memcg. For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.
Secondly, this patch set makes the list_lru structure per-memcg. It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru. Then the
list_lru will automatically distribute objects among per-memcg lists
basing on which cgroup the object is accounted to. This way to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.
As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit. Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.
This patch (of 9):
NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists. Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:
count = list_lru_count_node(lru, sc->nid);
freed = list_lru_walk_node(lru, sc->nid, isolate_func,
isolate_arg, &sc->nr_to_scan);
where sc is an instance of the shrink_control structure passed to it
from vmscan.
To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.
This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-13 01:58:47 +03:00
|
|
|
freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
|
|
|
|
dentry_lru_isolate, &dispose);
|
2013-08-28 04:18:00 +04:00
|
|
|
shrink_dentry_list(&dispose);
|
2013-08-28 04:17:57 +04:00
|
|
|
return freed;
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
}
|
2011-01-07 09:49:31 +03:00
|
|
|
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
|
2015-02-13 01:59:35 +03:00
|
|
|
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
|
2013-08-28 04:17:55 +04:00
|
|
|
{
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
struct list_head *freeable = arg;
|
|
|
|
struct dentry *dentry = container_of(item, struct dentry, d_lru);
|
2013-08-28 04:17:55 +04:00
|
|
|
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
/*
|
|
|
|
* we are inverting the lru lock/dentry->d_lock here,
|
|
|
|
* so use a trylock. If we fail to get the lock, just skip
|
|
|
|
* it
|
|
|
|
*/
|
|
|
|
if (!spin_trylock(&dentry->d_lock))
|
|
|
|
return LRU_SKIP;
|
|
|
|
|
2015-02-13 01:59:35 +03:00
|
|
|
d_lru_shrink_move(lru, dentry, freeable);
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 09:49:47 +03:00
|
|
|
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
return LRU_REMOVED;
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 08:27:13 +04:00
|
|
|
}
|
|
|
|
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* shrink_dcache_sb - shrink dcache for a superblock
|
|
|
|
* @sb: superblock
|
|
|
|
*
|
2010-10-10 13:36:25 +04:00
|
|
|
* Shrink the dcache for the specified super block. This is used to free
|
|
|
|
* the dcache before unmounting a file system.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2010-10-10 13:36:25 +04:00
|
|
|
void shrink_dcache_sb(struct super_block *sb)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
long freed;
|
|
|
|
|
|
|
|
do {
|
|
|
|
LIST_HEAD(dispose);
|
|
|
|
|
|
|
|
freed = list_lru_walk(&sb->s_dentry_lru,
|
|
|
|
dentry_lru_isolate_shrink, &dispose, UINT_MAX);
|
2010-10-10 13:36:25 +04:00
|
|
|
|
list_lru: remove special case function list_lru_dispose_all.
The list_lru implementation has one function, list_lru_dispose_all, with
only one user (the dentry code). At first, such function appears to make
sense because we are really not interested in the result of isolating each
dentry separately - all of them are going away anyway. However, it's
implementation is buggy in the following way:
When we call list_lru_dispose_all in fs/dcache.c, we scan all dentries
marking them with DCACHE_SHRINK_LIST. However, this is done without the
nlru->lock taken. The imediate result of that is that someone else may
add or remove the dentry from the LRU at the same time. When list_lru_del
happens in that scenario we will see an element that is not yet marked
with DCACHE_SHRINK_LIST (even though it will be in the future) and
obviously remove it from an lru where the element no longer is. Since
list_lru_dispose_all will in effect count down nlru's nr_items and
list_lru_del will do the same, this will lead to an imbalance.
The solution for this would not be so simple: we can obviously just keep
the lru_lock taken, but then we have no guarantees that we will be able to
acquire the dentry lock (dentry->d_lock). To properly solve this, we need
a communication mechanism between the lru and dentry code, so they can
coordinate this with each other.
Such mechanism already exists in the form of the list_lru_walk_cb
callback. So it is possible to construct a dcache-side prune function
that does the right thing only by calling list_lru_walk in a loop until no
more dentries are available.
With only one user, plus the fact that a sane solution for the problem
would involve boucing between dcache and list_lru anyway, I see little
justification to keep the special case list_lru_dispose_all in tree.
Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-08-28 04:18:03 +04:00
|
|
|
this_cpu_sub(nr_dentry_unused, freed);
|
|
|
|
shrink_dentry_list(&dispose);
|
|
|
|
} while (freed > 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(shrink_dcache_sb);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
/**
|
|
|
|
* enum d_walk_ret - action to talke during tree walk
|
|
|
|
* @D_WALK_CONTINUE: contrinue walk
|
|
|
|
* @D_WALK_QUIT: quit walk
|
|
|
|
* @D_WALK_NORETRY: quit when retry is needed
|
|
|
|
* @D_WALK_SKIP: skip this dentry and its children
|
|
|
|
*/
|
|
|
|
enum d_walk_ret {
|
|
|
|
D_WALK_CONTINUE,
|
|
|
|
D_WALK_QUIT,
|
|
|
|
D_WALK_NORETRY,
|
|
|
|
D_WALK_SKIP,
|
|
|
|
};
|
2011-03-16 01:29:21 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
2013-09-05 13:44:35 +04:00
|
|
|
* d_walk - walk the dentry tree
|
|
|
|
* @parent: start of walk
|
|
|
|
* @data: data passed to @enter() and @finish()
|
|
|
|
* @enter: callback when first entering the dentry
|
|
|
|
* @finish: callback when successfully finished the walk
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
2013-09-05 13:44:35 +04:00
|
|
|
* The @enter() and @finish() callbacks are called with d_lock held.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2013-09-05 13:44:35 +04:00
|
|
|
static void d_walk(struct dentry *parent, void *data,
|
|
|
|
enum d_walk_ret (*enter)(void *, struct dentry *),
|
|
|
|
void (*finish)(void *))
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2011-01-07 09:49:37 +03:00
|
|
|
struct dentry *this_parent;
|
2005-04-17 02:20:36 +04:00
|
|
|
struct list_head *next;
|
2013-09-09 23:22:25 +04:00
|
|
|
unsigned seq = 0;
|
2013-09-05 13:44:35 +04:00
|
|
|
enum d_walk_ret ret;
|
|
|
|
bool retry = true;
|
2011-01-07 09:49:37 +03:00
|
|
|
|
2011-01-07 09:49:39 +03:00
|
|
|
again:
|
2013-09-09 23:22:25 +04:00
|
|
|
read_seqbegin_or_lock(&rename_lock, &seq);
|
2011-01-07 09:49:39 +03:00
|
|
|
this_parent = parent;
|
2011-01-07 09:49:34 +03:00
|
|
|
spin_lock(&this_parent->d_lock);
|
2013-09-05 13:44:35 +04:00
|
|
|
|
|
|
|
ret = enter(data, this_parent);
|
|
|
|
switch (ret) {
|
|
|
|
case D_WALK_CONTINUE:
|
|
|
|
break;
|
|
|
|
case D_WALK_QUIT:
|
|
|
|
case D_WALK_SKIP:
|
|
|
|
goto out_unlock;
|
|
|
|
case D_WALK_NORETRY:
|
|
|
|
retry = false;
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
repeat:
|
|
|
|
next = this_parent->d_subdirs.next;
|
|
|
|
resume:
|
|
|
|
while (next != &this_parent->d_subdirs) {
|
|
|
|
struct list_head *tmp = next;
|
2014-10-27 02:19:16 +03:00
|
|
|
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
|
2005-04-17 02:20:36 +04:00
|
|
|
next = tmp->next;
|
2011-01-07 09:49:34 +03:00
|
|
|
|
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
2013-09-05 13:44:35 +04:00
|
|
|
|
|
|
|
ret = enter(data, dentry);
|
|
|
|
switch (ret) {
|
|
|
|
case D_WALK_CONTINUE:
|
|
|
|
break;
|
|
|
|
case D_WALK_QUIT:
|
2011-01-07 09:49:34 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2013-09-05 13:44:35 +04:00
|
|
|
goto out_unlock;
|
|
|
|
case D_WALK_NORETRY:
|
|
|
|
retry = false;
|
|
|
|
break;
|
|
|
|
case D_WALK_SKIP:
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
continue;
|
2011-01-07 09:49:34 +03:00
|
|
|
}
|
2013-09-05 13:44:35 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
if (!list_empty(&dentry->d_subdirs)) {
|
2011-01-07 09:49:34 +03:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
|
|
|
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
|
2005-04-17 02:20:36 +04:00
|
|
|
this_parent = dentry;
|
2011-01-07 09:49:34 +03:00
|
|
|
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
|
2005-04-17 02:20:36 +04:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 09:49:34 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* All done at this level ... ascend and resume the search.
|
|
|
|
*/
|
2014-10-27 02:31:10 +03:00
|
|
|
rcu_read_lock();
|
|
|
|
ascend:
|
2005-04-17 02:20:36 +04:00
|
|
|
if (this_parent != parent) {
|
2011-03-16 01:29:21 +03:00
|
|
|
struct dentry *child = this_parent;
|
2013-10-26 01:04:27 +04:00
|
|
|
this_parent = child->d_parent;
|
|
|
|
|
|
|
|
spin_unlock(&child->d_lock);
|
|
|
|
spin_lock(&this_parent->d_lock);
|
|
|
|
|
2014-10-27 02:31:10 +03:00
|
|
|
/* might go back up the wrong parent if we have had a rename. */
|
|
|
|
if (need_seqretry(&rename_lock, seq))
|
2011-01-07 09:49:37 +03:00
|
|
|
goto rename_retry;
|
2015-05-29 06:09:19 +03:00
|
|
|
/* go into the first sibling still alive */
|
|
|
|
do {
|
|
|
|
next = child->d_child.next;
|
2014-10-27 02:31:10 +03:00
|
|
|
if (next == &this_parent->d_subdirs)
|
|
|
|
goto ascend;
|
|
|
|
child = list_entry(next, struct dentry, d_child);
|
2015-05-29 06:09:19 +03:00
|
|
|
} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
|
2013-10-26 01:04:27 +04:00
|
|
|
rcu_read_unlock();
|
2005-04-17 02:20:36 +04:00
|
|
|
goto resume;
|
|
|
|
}
|
2014-10-27 02:31:10 +03:00
|
|
|
if (need_seqretry(&rename_lock, seq))
|
2011-01-07 09:49:37 +03:00
|
|
|
goto rename_retry;
|
2014-10-27 02:31:10 +03:00
|
|
|
rcu_read_unlock();
|
2013-09-05 13:44:35 +04:00
|
|
|
if (finish)
|
|
|
|
finish(data);
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
spin_unlock(&this_parent->d_lock);
|
2013-09-09 23:22:25 +04:00
|
|
|
done_seqretry(&rename_lock, seq);
|
2013-09-05 13:44:35 +04:00
|
|
|
return;
|
2011-01-07 09:49:39 +03:00
|
|
|
|
|
|
|
rename_retry:
|
2014-10-27 02:31:10 +03:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
BUG_ON(seq & 1);
|
2013-09-05 13:44:35 +04:00
|
|
|
if (!retry)
|
|
|
|
return;
|
2013-09-09 23:22:25 +04:00
|
|
|
seq = 1;
|
2011-01-07 09:49:39 +03:00
|
|
|
goto again;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2013-09-05 13:44:35 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Search for at least 1 mount point in the dentry's subdirs.
|
|
|
|
* We descend to the next level whenever the d_subdirs
|
|
|
|
* list is non-empty and continue searching.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
int *ret = data;
|
|
|
|
if (d_mountpoint(dentry)) {
|
|
|
|
*ret = 1;
|
|
|
|
return D_WALK_QUIT;
|
|
|
|
}
|
|
|
|
return D_WALK_CONTINUE;
|
|
|
|
}
|
|
|
|
|
2013-10-20 01:57:07 +04:00
|
|
|
/**
|
|
|
|
* have_submounts - check for mounts over a dentry
|
|
|
|
* @parent: dentry to check.
|
|
|
|
*
|
|
|
|
* Return true if the parent or its subdirectories contain
|
|
|
|
* a mount point
|
|
|
|
*/
|
2013-09-05 13:44:35 +04:00
|
|
|
int have_submounts(struct dentry *parent)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
d_walk(parent, &ret, check_mount, NULL);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(have_submounts);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2013-09-05 16:39:11 +04:00
|
|
|
/*
|
|
|
|
* Called by mount code to set a mountpoint and check if the mountpoint is
|
|
|
|
* reachable (e.g. NFS can unhash a directory dentry and then the complete
|
|
|
|
* subtree can become unreachable).
|
|
|
|
*
|
2014-02-13 21:39:37 +04:00
|
|
|
* Only one of d_invalidate() and d_set_mounted() must succeed. For
|
2013-09-05 16:39:11 +04:00
|
|
|
* this reason take rename_lock and d_lock on dentry and ancestors.
|
|
|
|
*/
|
|
|
|
int d_set_mounted(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct dentry *p;
|
|
|
|
int ret = -ENOENT;
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
|
2014-02-13 21:39:37 +04:00
|
|
|
/* Need exclusion wrt. d_invalidate() */
|
2013-09-05 16:39:11 +04:00
|
|
|
spin_lock(&p->d_lock);
|
|
|
|
if (unlikely(d_unhashed(p))) {
|
|
|
|
spin_unlock(&p->d_lock);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
spin_unlock(&p->d_lock);
|
|
|
|
}
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (!d_unlinked(dentry)) {
|
|
|
|
dentry->d_flags |= DCACHE_MOUNTED;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
out:
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
2012-09-19 00:35:51 +04:00
|
|
|
* Search the dentry child list of the specified parent,
|
2005-04-17 02:20:36 +04:00
|
|
|
* and move any unused dentries to the end of the unused
|
|
|
|
* list for prune_dcache(). We descend to the next level
|
|
|
|
* whenever the d_subdirs list is non-empty and continue
|
|
|
|
* searching.
|
|
|
|
*
|
|
|
|
* It returns zero iff there are no unused children,
|
|
|
|
* otherwise it returns the number of children moved to
|
|
|
|
* the end of the unused list. This may not be the total
|
|
|
|
* number of unused children, because select_parent can
|
|
|
|
* drop the lock and return early due to latency
|
|
|
|
* constraints.
|
|
|
|
*/
|
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
struct select_data {
|
|
|
|
struct dentry *start;
|
|
|
|
struct list_head dispose;
|
|
|
|
int found;
|
|
|
|
};
|
2011-01-07 09:49:31 +03:00
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct select_data *data = _data;
|
|
|
|
enum d_walk_ret ret = D_WALK_CONTINUE;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
if (data->start == dentry)
|
|
|
|
goto out;
|
2011-01-07 09:49:34 +03:00
|
|
|
|
2014-05-03 08:02:25 +04:00
|
|
|
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
|
2013-09-05 13:44:35 +04:00
|
|
|
data->found++;
|
2014-05-03 08:02:25 +04:00
|
|
|
} else {
|
|
|
|
if (dentry->d_flags & DCACHE_LRU_LIST)
|
|
|
|
d_lru_del(dentry);
|
|
|
|
if (!dentry->d_lockref.count) {
|
|
|
|
d_shrink_add(dentry, &data->dispose);
|
|
|
|
data->found++;
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2013-09-05 13:44:35 +04:00
|
|
|
/*
|
|
|
|
* We can return to the caller if we have found some (this
|
|
|
|
* ensures forward progress). We'll be coming back to find
|
|
|
|
* the rest.
|
|
|
|
*/
|
2014-05-03 08:02:25 +04:00
|
|
|
if (!list_empty(&data->dispose))
|
|
|
|
ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
|
2005-04-17 02:20:36 +04:00
|
|
|
out:
|
2013-09-05 13:44:35 +04:00
|
|
|
return ret;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* shrink_dcache_parent - prune dcache
|
|
|
|
* @parent: parent of entries to prune
|
|
|
|
*
|
|
|
|
* Prune the dcache to remove unused children of the parent dentry.
|
|
|
|
*/
|
2013-09-05 13:44:35 +04:00
|
|
|
void shrink_dcache_parent(struct dentry *parent)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2013-09-05 13:44:35 +04:00
|
|
|
for (;;) {
|
|
|
|
struct select_data data;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
INIT_LIST_HEAD(&data.dispose);
|
|
|
|
data.start = parent;
|
|
|
|
data.found = 0;
|
|
|
|
|
|
|
|
d_walk(parent, &data, select_collect, NULL);
|
|
|
|
if (!data.found)
|
|
|
|
break;
|
|
|
|
|
|
|
|
shrink_dentry_list(&data.dispose);
|
fs/dcache.c: add cond_resched() to shrink_dcache_parent()
Call cond_resched() in shrink_dcache_parent() to maintain interactivity.
Before this patch:
void shrink_dcache_parent(struct dentry * parent)
{
while ((found = select_parent(parent, &dispose)) != 0)
shrink_dentry_list(&dispose);
}
select_parent() populates the dispose list with dentries which
shrink_dentry_list() then deletes. select_parent() carefully uses
need_resched() to avoid doing too much work at once. But neither
shrink_dcache_parent() nor its called functions call cond_resched(). So
once need_resched() is set select_parent() will return single dentry
dispose list which is then deleted by shrink_dentry_list(). This is
inefficient when there are a lot of dentry to process. This can cause
softlockup and hurts interactivity on non preemptable kernels.
This change adds cond_resched() in shrink_dcache_parent(). The benefit
of this is that need_resched() is quickly cleared so that future calls
to select_parent() are able to efficiently return a big batch of dentry.
These additional cond_resched() do not seem to impact performance, at
least for the workload below.
Here is a program which can cause soft lockup if other system activity
sets need_resched().
int main()
{
struct rlimit rlim;
int i;
int f[100000];
char buf[20];
struct timeval t1, t2;
double diff;
/* cleanup past run */
system("rm -rf x");
/* boost nfile rlimit */
rlim.rlim_cur = 200000;
rlim.rlim_max = 200000;
if (setrlimit(RLIMIT_NOFILE, &rlim))
err(1, "setrlimit");
/* make directory for files */
if (mkdir("x", 0700))
err(1, "mkdir");
if (gettimeofday(&t1, NULL))
err(1, "gettimeofday");
/* populate directory with open files */
for (i = 0; i < 100000; i++) {
snprintf(buf, sizeof(buf), "x/%d", i);
f[i] = open(buf, O_CREAT);
if (f[i] == -1)
err(1, "open");
}
/* close some of the files */
for (i = 0; i < 85000; i++)
close(f[i]);
/* unlink all files, even open ones */
system("rm -rf x");
if (gettimeofday(&t2, NULL))
err(1, "gettimeofday");
diff = (((double)t2.tv_sec * 1000000 + t2.tv_usec) -
((double)t1.tv_sec * 1000000 + t1.tv_usec));
printf("done: %g elapsed\n", diff/1e6);
return 0;
}
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-01 02:26:48 +04:00
|
|
|
cond_resched();
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(shrink_dcache_parent);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-05-03 04:36:10 +04:00
|
|
|
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
|
2013-11-08 21:31:16 +04:00
|
|
|
{
|
2014-05-03 04:36:10 +04:00
|
|
|
/* it has busy descendents; complain about those instead */
|
|
|
|
if (!list_empty(&dentry->d_subdirs))
|
|
|
|
return D_WALK_CONTINUE;
|
2013-11-08 21:31:16 +04:00
|
|
|
|
2014-05-03 04:36:10 +04:00
|
|
|
/* root with refcount 1 is fine */
|
|
|
|
if (dentry == _data && dentry->d_lockref.count == 1)
|
|
|
|
return D_WALK_CONTINUE;
|
|
|
|
|
|
|
|
printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
|
|
|
|
" still in use (%d) [unmount of %s %s]\n",
|
2013-11-08 21:31:16 +04:00
|
|
|
dentry,
|
|
|
|
dentry->d_inode ?
|
|
|
|
dentry->d_inode->i_ino : 0UL,
|
2014-05-03 04:36:10 +04:00
|
|
|
dentry,
|
2013-11-08 21:31:16 +04:00
|
|
|
dentry->d_lockref.count,
|
|
|
|
dentry->d_sb->s_type->name,
|
|
|
|
dentry->d_sb->s_id);
|
2014-05-03 04:36:10 +04:00
|
|
|
WARN_ON(1);
|
|
|
|
return D_WALK_CONTINUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_one_tree(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
shrink_dcache_parent(dentry);
|
|
|
|
d_walk(dentry, dentry, umount_check, NULL);
|
|
|
|
d_drop(dentry);
|
|
|
|
dput(dentry);
|
2013-11-08 21:31:16 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* destroy the dentries attached to a superblock on unmounting
|
|
|
|
*/
|
|
|
|
void shrink_dcache_for_umount(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
|
2014-05-03 04:36:10 +04:00
|
|
|
WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
|
2013-11-08 21:31:16 +04:00
|
|
|
|
|
|
|
dentry = sb->s_root;
|
|
|
|
sb->s_root = NULL;
|
2014-05-03 04:36:10 +04:00
|
|
|
do_one_tree(dentry);
|
2013-11-08 21:31:16 +04:00
|
|
|
|
|
|
|
while (!hlist_bl_empty(&sb->s_anon)) {
|
2014-05-03 04:36:10 +04:00
|
|
|
dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
|
|
|
|
do_one_tree(dentry);
|
2013-11-08 21:31:16 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
struct detach_data {
|
|
|
|
struct select_data select;
|
|
|
|
struct dentry *mountpoint;
|
|
|
|
};
|
|
|
|
static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
|
2013-09-05 13:44:36 +04:00
|
|
|
{
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
struct detach_data *data = _data;
|
2013-09-05 13:44:36 +04:00
|
|
|
|
|
|
|
if (d_mountpoint(dentry)) {
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
__dget_dlock(dentry);
|
|
|
|
data->mountpoint = dentry;
|
2013-09-05 13:44:36 +04:00
|
|
|
return D_WALK_QUIT;
|
|
|
|
}
|
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
return select_collect(&data->select, dentry);
|
2013-09-05 13:44:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void check_and_drop(void *_data)
|
|
|
|
{
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
struct detach_data *data = _data;
|
2013-09-05 13:44:36 +04:00
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
if (!data->mountpoint && !data->select.found)
|
|
|
|
__d_drop(data->select.start);
|
2013-09-05 13:44:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2014-02-13 21:39:37 +04:00
|
|
|
* d_invalidate - detach submounts, prune dcache, and drop
|
|
|
|
* @dentry: dentry to invalidate (aka detach, prune and drop)
|
|
|
|
*
|
|
|
|
* no dcache lock.
|
2013-09-05 13:44:36 +04:00
|
|
|
*
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
* The final d_drop is done as an atomic operation relative to
|
|
|
|
* rename_lock ensuring there are no races with d_set_mounted. This
|
|
|
|
* ensures there are no unhashed dentries on the path to a mountpoint.
|
2013-09-05 13:44:36 +04:00
|
|
|
*/
|
2014-02-13 21:46:25 +04:00
|
|
|
void d_invalidate(struct dentry *dentry)
|
2013-09-05 13:44:36 +04:00
|
|
|
{
|
2014-02-13 21:39:37 +04:00
|
|
|
/*
|
|
|
|
* If it's already been dropped, return OK.
|
|
|
|
*/
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (d_unhashed(dentry)) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2014-02-13 21:46:25 +04:00
|
|
|
return;
|
2014-02-13 21:39:37 +04:00
|
|
|
}
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
|
2013-09-05 13:44:36 +04:00
|
|
|
/* Negative dentries can be dropped without further checks */
|
|
|
|
if (!dentry->d_inode) {
|
|
|
|
d_drop(dentry);
|
2014-02-13 21:46:25 +04:00
|
|
|
return;
|
2013-09-05 13:44:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
struct detach_data data;
|
2013-09-05 13:44:36 +04:00
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
data.mountpoint = NULL;
|
|
|
|
INIT_LIST_HEAD(&data.select.dispose);
|
|
|
|
data.select.start = dentry;
|
|
|
|
data.select.found = 0;
|
|
|
|
|
|
|
|
d_walk(dentry, &data, detach_and_collect, check_and_drop);
|
2013-09-05 13:44:36 +04:00
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
if (data.select.found)
|
|
|
|
shrink_dentry_list(&data.select.dispose);
|
2013-09-05 13:44:36 +04:00
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
if (data.mountpoint) {
|
|
|
|
detach_mounts(data.mountpoint);
|
|
|
|
dput(data.mountpoint);
|
|
|
|
}
|
2013-09-05 13:44:36 +04:00
|
|
|
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
if (!data.mountpoint && !data.select.found)
|
2013-09-05 13:44:36 +04:00
|
|
|
break;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
}
|
2014-02-13 21:39:37 +04:00
|
|
|
EXPORT_SYMBOL(d_invalidate);
|
2013-09-05 13:44:36 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
2011-07-07 23:03:58 +04:00
|
|
|
* __d_alloc - allocate a dcache entry
|
|
|
|
* @sb: filesystem it will belong to
|
2005-04-17 02:20:36 +04:00
|
|
|
* @name: qstr of the name
|
|
|
|
*
|
|
|
|
* Allocates a dentry. It returns %NULL if there is insufficient memory
|
|
|
|
* available. On a success the dentry is returned. The name passed in is
|
|
|
|
* copied and the copy passed in may be reused after this call.
|
|
|
|
*/
|
|
|
|
|
2011-07-07 23:03:58 +04:00
|
|
|
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
char *dname;
|
|
|
|
|
2007-10-16 12:25:52 +04:00
|
|
|
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (!dentry)
|
|
|
|
return NULL;
|
|
|
|
|
2012-05-22 03:14:04 +04:00
|
|
|
/*
|
|
|
|
* We guarantee that the inline name is always NUL-terminated.
|
|
|
|
* This way the memcpy() done by the name switching in rename
|
|
|
|
* will still always have a NUL at the end, even if we might
|
|
|
|
* be overwriting an internal NUL character
|
|
|
|
*/
|
|
|
|
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
if (name->len > DNAME_INLINE_LEN-1) {
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
size_t size = offsetof(struct external_name, name[1]);
|
|
|
|
struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
|
|
|
|
if (!p) {
|
2005-04-17 02:20:36 +04:00
|
|
|
kmem_cache_free(dentry_cache, dentry);
|
|
|
|
return NULL;
|
|
|
|
}
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
atomic_set(&p->u.count, 1);
|
|
|
|
dname = p->name;
|
2015-02-14 01:39:45 +03:00
|
|
|
if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
|
|
|
|
kasan_unpoison_shadow(dname,
|
|
|
|
round_up(name->len + 1, sizeof(unsigned long)));
|
2005-04-17 02:20:36 +04:00
|
|
|
} else {
|
|
|
|
dname = dentry->d_iname;
|
|
|
|
}
|
|
|
|
|
|
|
|
dentry->d_name.len = name->len;
|
|
|
|
dentry->d_name.hash = name->hash;
|
|
|
|
memcpy(dname, name->name, name->len);
|
|
|
|
dname[name->len] = 0;
|
|
|
|
|
2012-05-22 03:14:04 +04:00
|
|
|
/* Make sure we always see the terminating NUL character */
|
|
|
|
smp_wmb();
|
|
|
|
dentry->d_name.name = dname;
|
|
|
|
|
2013-08-29 05:24:59 +04:00
|
|
|
dentry->d_lockref.count = 1;
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCEES bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddently that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 18:58:46 +04:00
|
|
|
dentry->d_flags = 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_lock_init(&dentry->d_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
seqcount_init(&dentry->d_seq);
|
2005-04-17 02:20:36 +04:00
|
|
|
dentry->d_inode = NULL;
|
2011-07-07 23:03:58 +04:00
|
|
|
dentry->d_parent = dentry;
|
|
|
|
dentry->d_sb = sb;
|
2005-04-17 02:20:36 +04:00
|
|
|
dentry->d_op = NULL;
|
|
|
|
dentry->d_fsdata = NULL;
|
2011-01-07 09:50:05 +03:00
|
|
|
INIT_HLIST_BL_NODE(&dentry->d_hash);
|
2005-04-17 02:20:36 +04:00
|
|
|
INIT_LIST_HEAD(&dentry->d_lru);
|
|
|
|
INIT_LIST_HEAD(&dentry->d_subdirs);
|
2014-10-27 02:19:16 +03:00
|
|
|
INIT_HLIST_NODE(&dentry->d_u.d_alias);
|
|
|
|
INIT_LIST_HEAD(&dentry->d_child);
|
2011-07-07 23:03:58 +04:00
|
|
|
d_set_d_op(dentry, dentry->d_sb->s_d_op);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:19 +03:00
|
|
|
this_cpu_inc(nr_dentry);
|
2010-10-10 13:36:23 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
return dentry;
|
|
|
|
}
|
2011-07-07 23:03:58 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_alloc - allocate a dcache entry
|
|
|
|
* @parent: parent of entry to allocate
|
|
|
|
* @name: qstr of the name
|
|
|
|
*
|
|
|
|
* Allocates a dentry. It returns %NULL if there is insufficient memory
|
|
|
|
* available. On a success the dentry is returned. The name passed in is
|
|
|
|
* copied and the copy passed in may be reused after this call.
|
|
|
|
*/
|
|
|
|
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
|
|
|
|
{
|
|
|
|
struct dentry *dentry = __d_alloc(parent->d_sb, name);
|
|
|
|
if (!dentry)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
spin_lock(&parent->d_lock);
|
|
|
|
/*
|
|
|
|
* don't need child lock because it is not subject
|
|
|
|
* to concurrency here
|
|
|
|
*/
|
|
|
|
__dget_dlock(parent);
|
|
|
|
dentry->d_parent = parent;
|
2014-10-27 02:19:16 +03:00
|
|
|
list_add(&dentry->d_child, &parent->d_subdirs);
|
2011-07-07 23:03:58 +04:00
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
|
|
|
|
return dentry;
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_alloc);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2012-06-30 00:20:47 +04:00
|
|
|
/**
|
|
|
|
* d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
|
|
|
|
* @sb: the superblock
|
|
|
|
* @name: qstr of the name
|
|
|
|
*
|
|
|
|
* For a filesystem that just pins its dentries in memory and never
|
|
|
|
* performs lookups at all, return an unhashed IS_ROOT dentry.
|
|
|
|
*/
|
2011-01-07 09:50:07 +03:00
|
|
|
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
|
|
|
|
{
|
2012-06-30 00:20:47 +04:00
|
|
|
return __d_alloc(sb, name);
|
2011-01-07 09:50:07 +03:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_alloc_pseudo);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
struct dentry *d_alloc_name(struct dentry *parent, const char *name)
|
|
|
|
{
|
|
|
|
struct qstr q;
|
|
|
|
|
|
|
|
q.name = name;
|
|
|
|
q.len = strlen(name);
|
|
|
|
q.hash = full_name_hash(q.name, q.len);
|
|
|
|
return d_alloc(parent, &q);
|
|
|
|
}
|
2009-09-30 04:09:42 +04:00
|
|
|
EXPORT_SYMBOL(d_alloc_name);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-01-07 09:49:55 +03:00
|
|
|
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
|
|
|
|
{
|
2011-01-15 00:26:18 +03:00
|
|
|
WARN_ON_ONCE(dentry->d_op);
|
|
|
|
WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
|
2011-01-07 09:49:55 +03:00
|
|
|
DCACHE_OP_COMPARE |
|
|
|
|
DCACHE_OP_REVALIDATE |
|
2013-02-20 20:19:05 +04:00
|
|
|
DCACHE_OP_WEAK_REVALIDATE |
|
2015-06-18 16:32:31 +03:00
|
|
|
DCACHE_OP_DELETE |
|
|
|
|
DCACHE_OP_SELECT_INODE));
|
2011-01-07 09:49:55 +03:00
|
|
|
dentry->d_op = op;
|
|
|
|
if (!op)
|
|
|
|
return;
|
|
|
|
if (op->d_hash)
|
|
|
|
dentry->d_flags |= DCACHE_OP_HASH;
|
|
|
|
if (op->d_compare)
|
|
|
|
dentry->d_flags |= DCACHE_OP_COMPARE;
|
|
|
|
if (op->d_revalidate)
|
|
|
|
dentry->d_flags |= DCACHE_OP_REVALIDATE;
|
2013-02-20 20:19:05 +04:00
|
|
|
if (op->d_weak_revalidate)
|
|
|
|
dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
|
2011-01-07 09:49:55 +03:00
|
|
|
if (op->d_delete)
|
|
|
|
dentry->d_flags |= DCACHE_OP_DELETE;
|
2011-10-28 21:02:42 +04:00
|
|
|
if (op->d_prune)
|
|
|
|
dentry->d_flags |= DCACHE_OP_PRUNE;
|
2015-06-18 16:32:31 +03:00
|
|
|
if (op->d_select_inode)
|
|
|
|
dentry->d_flags |= DCACHE_OP_SELECT_INODE;
|
2011-01-07 09:49:55 +03:00
|
|
|
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_set_d_op);
|
|
|
|
|
2015-01-29 15:02:28 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* d_set_fallthru - Mark a dentry as falling through to a lower layer
|
|
|
|
* @dentry - The dentry to mark
|
|
|
|
*
|
|
|
|
* Mark a dentry as falling through to the lower layer (as set with
|
|
|
|
* d_pin_lower()). This flag may be recorded on the medium.
|
|
|
|
*/
|
|
|
|
void d_set_fallthru(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
dentry->d_flags |= DCACHE_FALLTHRU;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_set_fallthru);
|
|
|
|
|
2013-09-12 22:22:53 +04:00
|
|
|
static unsigned d_flags_for_inode(struct inode *inode)
|
|
|
|
{
|
2015-01-29 15:02:29 +03:00
|
|
|
unsigned add_flags = DCACHE_REGULAR_TYPE;
|
2013-09-12 22:22:53 +04:00
|
|
|
|
|
|
|
if (!inode)
|
|
|
|
return DCACHE_MISS_TYPE;
|
|
|
|
|
|
|
|
if (S_ISDIR(inode->i_mode)) {
|
|
|
|
add_flags = DCACHE_DIRECTORY_TYPE;
|
|
|
|
if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
|
|
|
|
if (unlikely(!inode->i_op->lookup))
|
|
|
|
add_flags = DCACHE_AUTODIR_TYPE;
|
|
|
|
else
|
|
|
|
inode->i_opflags |= IOP_LOOKUP;
|
|
|
|
}
|
2015-01-29 15:02:29 +03:00
|
|
|
goto type_determined;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
|
|
|
|
if (unlikely(inode->i_op->follow_link)) {
|
2013-09-12 22:22:53 +04:00
|
|
|
add_flags = DCACHE_SYMLINK_TYPE;
|
2015-01-29 15:02:29 +03:00
|
|
|
goto type_determined;
|
|
|
|
}
|
|
|
|
inode->i_opflags |= IOP_NOFOLLOW;
|
2013-09-12 22:22:53 +04:00
|
|
|
}
|
|
|
|
|
2015-01-29 15:02:29 +03:00
|
|
|
if (unlikely(!S_ISREG(inode->i_mode)))
|
|
|
|
add_flags = DCACHE_SPECIAL_TYPE;
|
|
|
|
|
|
|
|
type_determined:
|
2013-09-12 22:22:53 +04:00
|
|
|
if (unlikely(IS_AUTOMOUNT(inode)))
|
|
|
|
add_flags |= DCACHE_NEED_AUTOMOUNT;
|
|
|
|
return add_flags;
|
|
|
|
}
|
|
|
|
|
2008-10-16 02:50:28 +04:00
|
|
|
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
|
|
|
|
{
|
2013-09-12 22:22:53 +04:00
|
|
|
unsigned add_flags = d_flags_for_inode(inode);
|
|
|
|
|
2011-01-07 09:49:35 +03:00
|
|
|
spin_lock(&dentry->d_lock);
|
2013-09-12 22:22:53 +04:00
|
|
|
if (inode)
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
|
2015-03-05 17:09:22 +03:00
|
|
|
__d_set_inode_and_type(dentry, inode, add_flags);
|
2015-06-11 15:46:46 +03:00
|
|
|
dentry_rcuwalk_invalidate(dentry);
|
2011-01-07 09:49:35 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2008-10-16 02:50:28 +04:00
|
|
|
fsnotify_d_instantiate(dentry, inode);
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* d_instantiate - fill in inode information for a dentry
|
|
|
|
* @entry: dentry to complete
|
|
|
|
* @inode: inode to attach to this dentry
|
|
|
|
*
|
|
|
|
* Fill in inode information in the entry.
|
|
|
|
*
|
|
|
|
* This turns negative dentries into productive full members
|
|
|
|
* of society.
|
|
|
|
*
|
|
|
|
* NOTE! This assumes that the inode count has been incremented
|
|
|
|
* (or otherwise set) by the caller to indicate that it is now
|
|
|
|
* in use by the dcache.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_instantiate(struct dentry *entry, struct inode * inode)
|
|
|
|
{
|
2014-10-27 02:19:16 +03:00
|
|
|
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
|
2011-01-07 09:50:06 +03:00
|
|
|
if (inode)
|
|
|
|
spin_lock(&inode->i_lock);
|
2008-10-16 02:50:28 +04:00
|
|
|
__d_instantiate(entry, inode);
|
2011-01-07 09:50:06 +03:00
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
security_d_instantiate(entry, inode);
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_instantiate);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_instantiate_unique - instantiate a non-aliased dentry
|
|
|
|
* @entry: dentry to instantiate
|
|
|
|
* @inode: inode to attach to this dentry
|
|
|
|
*
|
|
|
|
* Fill in inode information in the entry. On success, it returns NULL.
|
|
|
|
* If an unhashed alias of "entry" already exists, then we return the
|
2006-01-10 07:52:51 +03:00
|
|
|
* aliased dentry instead and drop one reference to inode.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
|
|
|
* Note that in order to avoid conflicts with rename() etc, the caller
|
|
|
|
* had better be holding the parent directory semaphore.
|
2006-01-10 07:52:51 +03:00
|
|
|
*
|
|
|
|
* This also assumes that the inode count has been incremented
|
|
|
|
* (or otherwise set) by the caller to indicate that it is now
|
|
|
|
* in use by the dcache.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2006-08-23 04:06:07 +04:00
|
|
|
static struct dentry *__d_instantiate_unique(struct dentry *entry,
|
|
|
|
struct inode *inode)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct dentry *alias;
|
|
|
|
int len = entry->d_name.len;
|
|
|
|
const char *name = entry->d_name.name;
|
|
|
|
unsigned int hash = entry->d_name.hash;
|
|
|
|
|
2006-08-23 04:06:07 +04:00
|
|
|
if (!inode) {
|
2008-10-16 02:50:28 +04:00
|
|
|
__d_instantiate(entry, NULL);
|
2006-08-23 04:06:07 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
|
2011-01-07 09:49:36 +03:00
|
|
|
/*
|
|
|
|
* Don't need alias->d_lock here, because aliases with
|
|
|
|
* d_parent == entry->d_parent are not subject to name or
|
|
|
|
* parent changes, because the parent inode i_mutex is held.
|
|
|
|
*/
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
if (alias->d_name.hash != hash)
|
2005-04-17 02:20:36 +04:00
|
|
|
continue;
|
|
|
|
if (alias->d_parent != entry->d_parent)
|
|
|
|
continue;
|
2012-05-10 23:37:10 +04:00
|
|
|
if (alias->d_name.len != len)
|
|
|
|
continue;
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
if (dentry_cmp(alias, name, len))
|
2005-04-17 02:20:36 +04:00
|
|
|
continue;
|
2011-01-07 09:49:43 +03:00
|
|
|
__dget(alias);
|
2005-04-17 02:20:36 +04:00
|
|
|
return alias;
|
|
|
|
}
|
2006-08-23 04:06:07 +04:00
|
|
|
|
2008-10-16 02:50:28 +04:00
|
|
|
__d_instantiate(entry, inode);
|
2005-04-17 02:20:36 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2006-08-23 04:06:07 +04:00
|
|
|
|
|
|
|
struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct dentry *result;
|
|
|
|
|
2014-10-27 02:19:16 +03:00
|
|
|
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
|
2006-08-23 04:06:07 +04:00
|
|
|
|
2011-01-07 09:50:06 +03:00
|
|
|
if (inode)
|
|
|
|
spin_lock(&inode->i_lock);
|
2006-08-23 04:06:07 +04:00
|
|
|
result = __d_instantiate_unique(entry, inode);
|
2011-01-07 09:50:06 +03:00
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
2006-08-23 04:06:07 +04:00
|
|
|
|
|
|
|
if (!result) {
|
|
|
|
security_d_instantiate(entry, inode);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(!d_unhashed(result));
|
|
|
|
iput(inode);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
EXPORT_SYMBOL(d_instantiate_unique);
|
|
|
|
|
2013-10-01 18:44:54 +04:00
|
|
|
/**
|
|
|
|
* d_instantiate_no_diralias - instantiate a non-aliased dentry
|
|
|
|
* @entry: dentry to complete
|
|
|
|
* @inode: inode to attach to this dentry
|
|
|
|
*
|
|
|
|
* Fill in inode information in the entry. If a directory alias is found, then
|
|
|
|
* return an error (and drop inode). Together with d_materialise_unique() this
|
|
|
|
* guarantees that a directory inode may never have more than one alias.
|
|
|
|
*/
|
|
|
|
int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
|
|
|
|
{
|
2014-10-27 02:19:16 +03:00
|
|
|
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
|
2013-10-01 18:44:54 +04:00
|
|
|
|
|
|
|
spin_lock(&inode->i_lock);
|
|
|
|
if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
iput(inode);
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
__d_instantiate(entry, inode);
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
security_d_instantiate(entry, inode);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_instantiate_no_diralias);
|
|
|
|
|
2012-01-09 01:49:21 +04:00
|
|
|
struct dentry *d_make_root(struct inode *root_inode)
|
|
|
|
{
|
|
|
|
struct dentry *res = NULL;
|
|
|
|
|
|
|
|
if (root_inode) {
|
2012-05-11 00:14:12 +04:00
|
|
|
static const struct qstr name = QSTR_INIT("/", 1);
|
2012-01-09 01:49:21 +04:00
|
|
|
|
|
|
|
res = __d_alloc(root_inode->i_sb, &name);
|
|
|
|
if (res)
|
|
|
|
d_instantiate(res, root_inode);
|
|
|
|
else
|
|
|
|
iput(root_inode);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_make_root);
|
|
|
|
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
static struct dentry * __d_find_any_alias(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct dentry *alias;
|
|
|
|
|
2012-06-09 21:51:19 +04:00
|
|
|
if (hlist_empty(&inode->i_dentry))
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
return NULL;
|
2014-10-27 02:19:16 +03:00
|
|
|
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
__dget(alias);
|
|
|
|
return alias;
|
|
|
|
}
|
|
|
|
|
2012-01-10 21:04:37 +04:00
|
|
|
/**
|
|
|
|
* d_find_any_alias - find any alias for a given inode
|
|
|
|
* @inode: inode to find an alias for
|
|
|
|
*
|
|
|
|
* If any aliases exist for the given inode, take and return a
|
|
|
|
* reference for one of them. If no aliases exist, return %NULL.
|
|
|
|
*/
|
|
|
|
struct dentry *d_find_any_alias(struct inode *inode)
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
{
|
|
|
|
struct dentry *de;
|
|
|
|
|
|
|
|
spin_lock(&inode->i_lock);
|
|
|
|
de = __d_find_any_alias(inode);
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
return de;
|
|
|
|
}
|
2012-01-10 21:04:37 +04:00
|
|
|
EXPORT_SYMBOL(d_find_any_alias);
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
|
2014-08-01 01:59:02 +04:00
|
|
|
static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
|
2008-08-11 17:48:57 +04:00
|
|
|
{
|
2012-11-09 04:09:37 +04:00
|
|
|
static const struct qstr anonstring = QSTR_INIT("/", 1);
|
2008-08-11 17:49:12 +04:00
|
|
|
struct dentry *tmp;
|
|
|
|
struct dentry *res;
|
2013-09-12 22:22:53 +04:00
|
|
|
unsigned add_flags;
|
2008-08-11 17:48:57 +04:00
|
|
|
|
|
|
|
if (!inode)
|
2008-08-11 17:49:04 +04:00
|
|
|
return ERR_PTR(-ESTALE);
|
2008-08-11 17:48:57 +04:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
res = d_find_any_alias(inode);
|
2008-08-11 17:49:12 +04:00
|
|
|
if (res)
|
|
|
|
goto out_iput;
|
|
|
|
|
2011-07-07 23:03:58 +04:00
|
|
|
tmp = __d_alloc(inode->i_sb, &anonstring);
|
2008-08-11 17:49:12 +04:00
|
|
|
if (!tmp) {
|
|
|
|
res = ERR_PTR(-ENOMEM);
|
|
|
|
goto out_iput;
|
2008-08-11 17:48:57 +04:00
|
|
|
}
|
2011-01-07 09:49:38 +03:00
|
|
|
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_lock(&inode->i_lock);
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-18 23:45:09 +03:00
|
|
|
res = __d_find_any_alias(inode);
|
2008-08-11 17:49:12 +04:00
|
|
|
if (res) {
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2008-08-11 17:49:12 +04:00
|
|
|
dput(tmp);
|
|
|
|
goto out_iput;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* attach a disconnected dentry */
|
2014-02-15 02:35:37 +04:00
|
|
|
add_flags = d_flags_for_inode(inode);
|
|
|
|
|
|
|
|
if (disconnected)
|
|
|
|
add_flags |= DCACHE_DISCONNECTED;
|
2013-09-12 22:22:53 +04:00
|
|
|
|
2008-08-11 17:49:12 +04:00
|
|
|
spin_lock(&tmp->d_lock);
|
2015-03-05 17:09:22 +03:00
|
|
|
__d_set_inode_and_type(tmp, inode, add_flags);
|
2014-10-27 02:19:16 +03:00
|
|
|
hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
|
2011-04-25 22:01:36 +04:00
|
|
|
hlist_bl_lock(&tmp->d_sb->s_anon);
|
2011-01-07 09:50:05 +03:00
|
|
|
hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
|
2011-04-25 22:01:36 +04:00
|
|
|
hlist_bl_unlock(&tmp->d_sb->s_anon);
|
2008-08-11 17:49:12 +04:00
|
|
|
spin_unlock(&tmp->d_lock);
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2010-11-19 04:52:55 +03:00
|
|
|
security_d_instantiate(tmp, inode);
|
2008-08-11 17:49:12 +04:00
|
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
|
|
out_iput:
|
2010-11-19 04:52:55 +03:00
|
|
|
if (res && !IS_ERR(res))
|
|
|
|
security_d_instantiate(res, inode);
|
2008-08-11 17:49:12 +04:00
|
|
|
iput(inode);
|
|
|
|
return res;
|
2008-08-11 17:48:57 +04:00
|
|
|
}
|
2014-02-15 02:35:37 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
|
|
|
|
* @inode: inode to allocate the dentry for
|
|
|
|
*
|
|
|
|
* Obtain a dentry for an inode resulting from NFS filehandle conversion or
|
|
|
|
* similar open by handle operations. The returned dentry may be anonymous,
|
|
|
|
* or may have a full name (if the inode was already in the cache).
|
|
|
|
*
|
|
|
|
* When called on a directory inode, we must ensure that the inode only ever
|
|
|
|
* has one dentry. If a dentry is found, that is returned instead of
|
|
|
|
* allocating a new one.
|
|
|
|
*
|
|
|
|
* On successful return, the reference to the inode has been transferred
|
|
|
|
* to the dentry. In case of an error the reference on the inode is released.
|
|
|
|
* To make it easier to use in export operations a %NULL or IS_ERR inode may
|
|
|
|
* be passed in and the error will be propagated to the return value,
|
|
|
|
* with a %NULL @inode replaced by ERR_PTR(-ESTALE).
|
|
|
|
*/
|
|
|
|
struct dentry *d_obtain_alias(struct inode *inode)
|
|
|
|
{
|
|
|
|
return __d_obtain_alias(inode, 1);
|
|
|
|
}
|
2009-02-28 01:02:59 +03:00
|
|
|
EXPORT_SYMBOL(d_obtain_alias);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-02-15 02:35:37 +04:00
|
|
|
/**
|
|
|
|
* d_obtain_root - find or allocate a dentry for a given inode
|
|
|
|
* @inode: inode to allocate the dentry for
|
|
|
|
*
|
|
|
|
* Obtain an IS_ROOT dentry for the root of a filesystem.
|
|
|
|
*
|
|
|
|
* We must ensure that directory inodes only ever have one dentry. If a
|
|
|
|
* dentry is found, that is returned instead of allocating a new one.
|
|
|
|
*
|
|
|
|
* On successful return, the reference to the inode has been transferred
|
|
|
|
* to the dentry. In case of an error the reference on the inode is
|
|
|
|
* released. A %NULL or IS_ERR inode may be passed in and will be the
|
|
|
|
* error will be propagate to the return value, with a %NULL @inode
|
|
|
|
* replaced by ERR_PTR(-ESTALE).
|
|
|
|
*/
|
|
|
|
struct dentry *d_obtain_root(struct inode *inode)
|
|
|
|
{
|
|
|
|
return __d_obtain_alias(inode, 0);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_obtain_root);
|
|
|
|
|
2008-05-21 10:50:46 +04:00
|
|
|
/**
|
|
|
|
* d_add_ci - lookup or allocate new dentry with case-exact name
|
|
|
|
* @inode: the inode case-insensitive lookup has found
|
|
|
|
* @dentry: the negative dentry that was passed to the parent's lookup func
|
|
|
|
* @name: the case-exact name to be associated with the returned dentry
|
|
|
|
*
|
|
|
|
* This is to avoid filling the dcache with case-insensitive names to the
|
|
|
|
* same inode, only the actual correct case is stored in the dcache for
|
|
|
|
* case-insensitive filesystems.
|
|
|
|
*
|
|
|
|
* For a case-insensitive lookup match and if the the case-exact dentry
|
|
|
|
* already exists in in the dcache, use it and return it.
|
|
|
|
*
|
|
|
|
* If no entry exists with the exact case name, allocate new dentry with
|
|
|
|
* the exact case, and return the spliced entry.
|
|
|
|
*/
|
2008-08-08 01:49:07 +04:00
|
|
|
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
|
2008-05-21 10:50:46 +04:00
|
|
|
struct qstr *name)
|
|
|
|
{
|
|
|
|
struct dentry *found;
|
|
|
|
struct dentry *new;
|
|
|
|
|
2009-01-05 21:10:37 +03:00
|
|
|
/*
|
|
|
|
* First check if a dentry matching the name already exists,
|
|
|
|
* if not go ahead and create it now.
|
|
|
|
*/
|
2008-05-21 10:50:46 +04:00
|
|
|
found = d_hash_and_lookup(dentry->d_parent, name);
|
|
|
|
if (!found) {
|
|
|
|
new = d_alloc(dentry->d_parent, name);
|
|
|
|
if (!new) {
|
2013-02-12 08:20:37 +04:00
|
|
|
found = ERR_PTR(-ENOMEM);
|
2014-10-13 02:46:26 +04:00
|
|
|
} else {
|
|
|
|
found = d_splice_alias(inode, new);
|
|
|
|
if (found) {
|
|
|
|
dput(new);
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
return new;
|
2008-05-21 10:50:46 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
iput(inode);
|
2013-02-12 08:20:37 +04:00
|
|
|
return found;
|
2008-05-21 10:50:46 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_add_ci);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
/*
|
|
|
|
* Do the slow-case of the dentry name compare.
|
|
|
|
*
|
|
|
|
* Unlike the dentry_cmp() function, we need to atomically
|
2013-05-22 02:22:44 +04:00
|
|
|
* load the name and length information, so that the
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
* filesystem can rely on them, and can use the 'name' and
|
|
|
|
* 'len' information without worrying about walking off the
|
|
|
|
* end of memory etc.
|
|
|
|
*
|
|
|
|
* Thus the read_seqcount_retry() and the "duplicate" info
|
|
|
|
* in arguments (the low-level filesystem should not look
|
|
|
|
* at the dentry inode or name contents directly, since
|
|
|
|
* rename can change them while we're in RCU mode).
|
|
|
|
*/
|
|
|
|
enum slow_d_compare {
|
|
|
|
D_COMP_OK,
|
|
|
|
D_COMP_NOMATCH,
|
|
|
|
D_COMP_SEQRETRY,
|
|
|
|
};
|
|
|
|
|
|
|
|
static noinline enum slow_d_compare slow_dentry_cmp(
|
|
|
|
const struct dentry *parent,
|
|
|
|
struct dentry *dentry,
|
|
|
|
unsigned int seq,
|
|
|
|
const struct qstr *name)
|
|
|
|
{
|
|
|
|
int tlen = dentry->d_name.len;
|
|
|
|
const char *tname = dentry->d_name.name;
|
|
|
|
|
|
|
|
if (read_seqcount_retry(&dentry->d_seq, seq)) {
|
|
|
|
cpu_relax();
|
|
|
|
return D_COMP_SEQRETRY;
|
|
|
|
}
|
2013-05-22 02:22:44 +04:00
|
|
|
if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
return D_COMP_NOMATCH;
|
|
|
|
return D_COMP_OK;
|
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
/**
|
|
|
|
* __d_lookup_rcu - search for a dentry (racy, store-free)
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
2012-03-19 08:23:05 +04:00
|
|
|
* @seqp: returns d_seq value at the point where the dentry was found
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
* Returns: dentry, or NULL
|
|
|
|
*
|
|
|
|
* __d_lookup_rcu is the dcache lookup function for rcu-walk name
|
|
|
|
* resolution (store-free path walking) design described in
|
|
|
|
* Documentation/filesystems/path-lookup.txt.
|
|
|
|
*
|
|
|
|
* This is not to be used outside core vfs.
|
|
|
|
*
|
|
|
|
* __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
|
|
|
|
* held, and rcu_read_lock held. The returned dentry must not be stored into
|
|
|
|
* without taking d_lock and checking d_seq sequence count against @seq
|
|
|
|
* returned here.
|
|
|
|
*
|
2013-09-02 22:38:06 +04:00
|
|
|
* A refcount may be taken on the found dentry with the d_rcu_to_refcount
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
* function.
|
|
|
|
*
|
|
|
|
* Alternatively, __d_lookup_rcu may be called again to look up the child of
|
|
|
|
* the returned dentry, so long as its parent's seqlock is checked after the
|
|
|
|
* child is looked up. Thus, an interlocking stepping of sequence lock checks
|
|
|
|
* is formed, giving integrity down the path walk.
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
*
|
|
|
|
* NOTE! The caller *has* to check the resulting dentry against the sequence
|
|
|
|
* number we've returned before using any of the resulting dentry state!
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
*/
|
2012-03-03 02:23:30 +04:00
|
|
|
struct dentry *__d_lookup_rcu(const struct dentry *parent,
|
|
|
|
const struct qstr *name,
|
2013-05-22 02:22:44 +04:00
|
|
|
unsigned *seqp)
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
{
|
2012-05-11 00:14:12 +04:00
|
|
|
u64 hashlen = name->hash_len;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
const unsigned char *str = name->name;
|
2012-05-11 00:14:12 +04:00
|
|
|
struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
|
2011-01-07 09:50:05 +03:00
|
|
|
struct hlist_bl_node *node;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
struct dentry *dentry;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: There is significant duplication with __d_lookup_rcu which is
|
|
|
|
* required to prevent single threaded performance regressions
|
|
|
|
* especially on architectures where smp_rmb (in seqcounts) are costly.
|
|
|
|
* Keep the two functions in sync.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash list is protected using RCU.
|
|
|
|
*
|
|
|
|
* Carefully use d_seq when comparing a candidate dentry, to avoid
|
|
|
|
* races with d_move().
|
|
|
|
*
|
|
|
|
* It is possible that concurrent renames can mess up our list
|
|
|
|
* walk here and result in missing our dentry, resulting in the
|
|
|
|
* false-negative result. d_lookup() protects against concurrent
|
|
|
|
* renames using rename_lock seqlock.
|
|
|
|
*
|
2011-01-22 09:31:32 +03:00
|
|
|
* See Documentation/filesystems/path-lookup.txt for more details.
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
*/
|
2011-04-24 09:32:03 +04:00
|
|
|
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
|
2012-03-03 02:23:30 +04:00
|
|
|
unsigned seq;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
|
|
|
|
seqretry:
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
/*
|
|
|
|
* The dentry sequence count protects us from concurrent
|
2013-05-22 02:22:44 +04:00
|
|
|
* renames, and thus protects parent and name fields.
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
*
|
|
|
|
* The caller must perform a seqcount check in order
|
2013-05-22 02:22:44 +04:00
|
|
|
* to do anything useful with the returned dentry.
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
*
|
|
|
|
* NOTE! We do a "raw" seqcount_begin here. That means that
|
|
|
|
* we don't wait for the sequence count to stabilize if it
|
|
|
|
* is in the middle of a sequence change. If we do the slow
|
|
|
|
* dentry compare, we will do seqretries until it is stable,
|
|
|
|
* and if we end up with a successful lookup, we actually
|
|
|
|
* want to exit RCU lookup anyway.
|
|
|
|
*/
|
|
|
|
seq = raw_seqcount_begin(&dentry->d_seq);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
if (dentry->d_parent != parent)
|
|
|
|
continue;
|
2012-05-22 05:48:10 +04:00
|
|
|
if (d_unhashed(dentry))
|
|
|
|
continue;
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
|
2011-08-07 09:41:50 +04:00
|
|
|
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
|
2012-05-11 00:14:12 +04:00
|
|
|
if (dentry->d_name.hash != hashlen_hash(hashlen))
|
|
|
|
continue;
|
2013-05-22 02:22:44 +04:00
|
|
|
*seqp = seq;
|
|
|
|
switch (slow_dentry_cmp(parent, dentry, seq, name)) {
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
case D_COMP_OK:
|
|
|
|
return dentry;
|
|
|
|
case D_COMP_NOMATCH:
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
continue;
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
default:
|
|
|
|
goto seqretry;
|
|
|
|
}
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
}
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
|
2012-05-11 00:14:12 +04:00
|
|
|
if (dentry->d_name.hash_len != hashlen)
|
2012-05-10 23:37:10 +04:00
|
|
|
continue;
|
2013-05-22 02:22:44 +04:00
|
|
|
*seqp = seq;
|
2012-05-11 00:14:12 +04:00
|
|
|
if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
return dentry;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* d_lookup - search for a dentry
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 22:37:34 +04:00
|
|
|
* Returns: dentry, or NULL
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 22:37:34 +04:00
|
|
|
* d_lookup searches the children of the parent dentry for the name in
|
|
|
|
* question. If the dentry is found its reference count is incremented and the
|
|
|
|
* dentry is returned. The caller must use dput to free the entry when it has
|
|
|
|
* finished using it. %NULL is returned if the dentry does not exist.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2013-01-25 03:29:34 +04:00
|
|
|
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
struct dentry *dentry;
|
2011-01-07 09:49:37 +03:00
|
|
|
unsigned seq;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-08-11 06:46:53 +04:00
|
|
|
do {
|
|
|
|
seq = read_seqbegin(&rename_lock);
|
|
|
|
dentry = __d_lookup(parent, name);
|
|
|
|
if (dentry)
|
2005-04-17 02:20:36 +04:00
|
|
|
break;
|
|
|
|
} while (read_seqretry(&rename_lock, seq));
|
|
|
|
return dentry;
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_lookup);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
/**
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 22:37:34 +04:00
|
|
|
* __d_lookup - search for a dentry (racy)
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
|
|
|
* Returns: dentry, or NULL
|
|
|
|
*
|
|
|
|
* __d_lookup is like d_lookup, however it may (rarely) return a
|
|
|
|
* false-negative result due to unrelated rename activity.
|
|
|
|
*
|
|
|
|
* __d_lookup is slightly faster by avoiding rename_lock read seqlock,
|
|
|
|
* however it must be used carefully, eg. with a following d_lookup in
|
|
|
|
* the case of failure.
|
|
|
|
*
|
|
|
|
* __d_lookup callers must be commented.
|
|
|
|
*/
|
2013-01-25 03:27:00 +04:00
|
|
|
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
unsigned int len = name->len;
|
|
|
|
unsigned int hash = name->hash;
|
|
|
|
const unsigned char *str = name->name;
|
2011-04-24 09:32:03 +04:00
|
|
|
struct hlist_bl_head *b = d_hash(parent, hash);
|
2011-01-07 09:50:05 +03:00
|
|
|
struct hlist_bl_node *node;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
struct dentry *found = NULL;
|
2005-11-07 11:59:17 +03:00
|
|
|
struct dentry *dentry;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
/*
|
|
|
|
* Note: There is significant duplication with __d_lookup_rcu which is
|
|
|
|
* required to prevent single threaded performance regressions
|
|
|
|
* especially on architectures where smp_rmb (in seqcounts) are costly.
|
|
|
|
* Keep the two functions in sync.
|
|
|
|
*/
|
|
|
|
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 22:37:34 +04:00
|
|
|
/*
|
|
|
|
* The hash list is protected using RCU.
|
|
|
|
*
|
|
|
|
* Take d_lock when comparing a candidate dentry, to avoid races
|
|
|
|
* with d_move().
|
|
|
|
*
|
|
|
|
* It is possible that concurrent renames can mess up our list
|
|
|
|
* walk here and result in missing our dentry, resulting in the
|
|
|
|
* false-negative result. d_lookup() protects against concurrent
|
|
|
|
* renames using rename_lock seqlock.
|
|
|
|
*
|
2011-01-22 09:31:32 +03:00
|
|
|
* See Documentation/filesystems/path-lookup.txt for more details.
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 22:37:34 +04:00
|
|
|
*/
|
2005-04-17 02:20:36 +04:00
|
|
|
rcu_read_lock();
|
|
|
|
|
2011-04-24 09:32:03 +04:00
|
|
|
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
if (dentry->d_name.hash != hash)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (dentry->d_parent != parent)
|
|
|
|
goto next;
|
Fix NULL pointer dereference in proc_sys_compare
The VFS interface for the 'd_compare()' is a bit special (read: 'odd'),
because it really just essentially replaces a memcmp(). The filesystem
is supposed to just compare the two names with whatever case-independent
or other function.
And when I say 'is supposed to', I obviously mean that 'procfs does odd
things, and actually looks at the dentry that we don't even pass down,
rather than just the name'. Which results in problems, because we
actually call d_compare before we have even verified that the dentry is
still hashed at all.
And that causes a problm since the inode that procfs looks at may have
been free'd and the d_inode pointer is NULL. procfs just assumes that
all dentries are positive, since procfs itself never generates a
negative one. But memory pressure will still result in the dentry
getting torn down, and as it is removed by RCU, it still remains visible
on some lists - and to d_compare.
If the filesystem just did a name comparison, we wouldn't care. And we
could just fix procfs to know about negative dentries too. But rather
than have the low-level filesystems know about internal VFS details,
just move the check for a unhashed dentry up a bit, so that we will only
call d_compare on dentries that are still active.
The actual oops this caused didn't look like a NULL pointer dereference
because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)'
to get at its internal proc_inode information from the inode pointer,
and accessed a field below the inode. So the oops would look something
like
BUG: unable to handle kernel paging request at fffffffffffffff0
IP: [<ffffffff802bc6c6>] proc_sys_compare+0x36/0x50
and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and
ppc64 (Hugh Dickins).
Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-of-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 18:42:57 +04:00
|
|
|
if (d_unhashed(dentry))
|
|
|
|
goto next;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* It is safe to compare names since d_move() cannot
|
|
|
|
* change the qstr (protected by d_lock).
|
|
|
|
*/
|
2011-01-07 09:49:55 +03:00
|
|
|
if (parent->d_flags & DCACHE_OP_COMPARE) {
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
int tlen = dentry->d_name.len;
|
|
|
|
const char *tname = dentry->d_name.name;
|
2013-05-22 02:22:44 +04:00
|
|
|
if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
|
2005-04-17 02:20:36 +04:00
|
|
|
goto next;
|
|
|
|
} else {
|
2012-05-10 23:37:10 +04:00
|
|
|
if (dentry->d_name.len != len)
|
|
|
|
goto next;
|
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-05 01:59:14 +04:00
|
|
|
if (dentry_cmp(dentry, str, len))
|
2005-04-17 02:20:36 +04:00
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
2013-08-29 05:24:59 +04:00
|
|
|
dentry->d_lockref.count++;
|
Fix NULL pointer dereference in proc_sys_compare
The VFS interface for the 'd_compare()' is a bit special (read: 'odd'),
because it really just essentially replaces a memcmp(). The filesystem
is supposed to just compare the two names with whatever case-independent
or other function.
And when I say 'is supposed to', I obviously mean that 'procfs does odd
things, and actually looks at the dentry that we don't even pass down,
rather than just the name'. Which results in problems, because we
actually call d_compare before we have even verified that the dentry is
still hashed at all.
And that causes a problm since the inode that procfs looks at may have
been free'd and the d_inode pointer is NULL. procfs just assumes that
all dentries are positive, since procfs itself never generates a
negative one. But memory pressure will still result in the dentry
getting torn down, and as it is removed by RCU, it still remains visible
on some lists - and to d_compare.
If the filesystem just did a name comparison, we wouldn't care. And we
could just fix procfs to know about negative dentries too. But rather
than have the low-level filesystems know about internal VFS details,
just move the check for a unhashed dentry up a bit, so that we will only
call d_compare on dentries that are still active.
The actual oops this caused didn't look like a NULL pointer dereference
because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)'
to get at its internal proc_inode information from the inode pointer,
and accessed a field below the inode. So the oops would look something
like
BUG: unable to handle kernel paging request at fffffffffffffff0
IP: [<ffffffff802bc6c6>] proc_sys_compare+0x36/0x50
and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and
ppc64 (Hugh Dickins).
Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-of-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 18:42:57 +04:00
|
|
|
found = dentry;
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
break;
|
|
|
|
next:
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2006-03-31 14:31:43 +04:00
|
|
|
/**
|
|
|
|
* d_hash_and_lookup - hash the qstr then search for a dentry
|
|
|
|
* @dir: Directory to search in
|
|
|
|
* @name: qstr of name we wish to find
|
|
|
|
*
|
2013-02-12 08:20:37 +04:00
|
|
|
* On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
|
2006-03-31 14:31:43 +04:00
|
|
|
*/
|
|
|
|
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Check for a fs-specific hash function. Note that we must
|
|
|
|
* calculate the standard hash first, as the d_op->d_hash()
|
|
|
|
* routine may choose to leave the hash value unchanged.
|
|
|
|
*/
|
|
|
|
name->hash = full_name_hash(name->name, name->len);
|
2011-01-07 09:49:55 +03:00
|
|
|
if (dir->d_flags & DCACHE_OP_HASH) {
|
2013-05-22 02:22:44 +04:00
|
|
|
int err = dir->d_op->d_hash(dir, name);
|
2013-02-12 08:20:37 +04:00
|
|
|
if (unlikely(err < 0))
|
|
|
|
return ERR_PTR(err);
|
2006-03-31 14:31:43 +04:00
|
|
|
}
|
2013-02-12 08:20:37 +04:00
|
|
|
return d_lookup(dir, name);
|
2006-03-31 14:31:43 +04:00
|
|
|
}
|
2013-02-12 08:20:37 +04:00
|
|
|
EXPORT_SYMBOL(d_hash_and_lookup);
|
2006-03-31 14:31:43 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* When a file is deleted, we have two options:
|
|
|
|
* - turn this dentry into a negative dentry
|
|
|
|
* - unhash this dentry and free it.
|
|
|
|
*
|
|
|
|
* Usually, we want to just turn this into
|
|
|
|
* a negative dentry, but if anybody else is
|
|
|
|
* currently using the dentry or the inode
|
|
|
|
* we can't do that and we fall back on removing
|
|
|
|
* it from the hash queues and waiting for
|
|
|
|
* it to be deleted later when it has no users
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* d_delete - delete a dentry
|
|
|
|
* @dentry: The dentry to delete
|
|
|
|
*
|
|
|
|
* Turn the dentry into a negative dentry if possible, otherwise
|
|
|
|
* remove it from the hash queues so it can be deleted later
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_delete(struct dentry * dentry)
|
|
|
|
{
|
2011-01-07 09:50:06 +03:00
|
|
|
struct inode *inode;
|
2005-08-08 21:52:16 +04:00
|
|
|
int isdir = 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Are we the only user?
|
|
|
|
*/
|
2011-01-07 09:49:42 +03:00
|
|
|
again:
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 09:50:06 +03:00
|
|
|
inode = dentry->d_inode;
|
|
|
|
isdir = S_ISDIR(inode->i_mode);
|
2013-08-29 05:24:59 +04:00
|
|
|
if (dentry->d_lockref.count == 1) {
|
2012-09-19 18:49:51 +04:00
|
|
|
if (!spin_trylock(&inode->i_lock)) {
|
2011-01-07 09:49:42 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
cpu_relax();
|
|
|
|
goto again;
|
|
|
|
}
|
2010-05-22 00:11:04 +04:00
|
|
|
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
dentry_unlink_inode(dentry);
|
2005-08-08 21:52:16 +04:00
|
|
|
fsnotify_nameremove(dentry, isdir);
|
2005-04-17 02:20:36 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!d_unhashed(dentry))
|
|
|
|
__d_drop(dentry);
|
|
|
|
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-08-08 21:52:16 +04:00
|
|
|
|
|
|
|
fsnotify_nameremove(dentry, isdir);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_delete);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-04-24 09:32:03 +04:00
|
|
|
static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2011-01-07 09:50:05 +03:00
|
|
|
BUG_ON(!d_unhashed(entry));
|
2011-04-25 22:01:36 +04:00
|
|
|
hlist_bl_lock(b);
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCEES bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddently that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 18:58:46 +04:00
|
|
|
entry->d_flags |= DCACHE_RCUACCESS;
|
2011-04-24 09:32:03 +04:00
|
|
|
hlist_bl_add_head_rcu(&entry->d_hash, b);
|
2011-04-25 22:01:36 +04:00
|
|
|
hlist_bl_unlock(b);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2006-08-23 04:06:07 +04:00
|
|
|
static void _d_rehash(struct dentry * entry)
|
|
|
|
{
|
|
|
|
__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* d_rehash - add an entry back to the hash
|
|
|
|
* @entry: dentry to add to the hash
|
|
|
|
*
|
|
|
|
* Adds a dentry to the hash according to its name.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_rehash(struct dentry * entry)
|
|
|
|
{
|
|
|
|
spin_lock(&entry->d_lock);
|
2006-08-23 04:06:07 +04:00
|
|
|
_d_rehash(entry);
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_unlock(&entry->d_lock);
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_rehash);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-01-07 09:49:26 +03:00
|
|
|
/**
|
|
|
|
* dentry_update_name_case - update case insensitive dentry with a new name
|
|
|
|
* @dentry: dentry to be updated
|
|
|
|
* @name: new name
|
|
|
|
*
|
|
|
|
* Update a case insensitive dentry with new case of name.
|
|
|
|
*
|
|
|
|
* dentry must have been returned by d_lookup with name @name. Old and new
|
|
|
|
* name lengths must match (ie. no d_compare which allows mismatched name
|
|
|
|
* lengths).
|
|
|
|
*
|
|
|
|
* Parent inode i_mutex must be held over d_lookup and into this call (to
|
|
|
|
* keep renames and concurrent inserts, and readdir(2) away).
|
|
|
|
*/
|
|
|
|
void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
|
|
|
|
{
|
2011-04-15 18:34:26 +04:00
|
|
|
BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
|
2011-01-07 09:49:26 +03:00
|
|
|
BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
|
|
|
|
|
|
|
|
spin_lock(&dentry->d_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
write_seqcount_begin(&dentry->d_seq);
|
2011-01-07 09:49:26 +03:00
|
|
|
memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
write_seqcount_end(&dentry->d_seq);
|
2011-01-07 09:49:26 +03:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dentry_update_name_case);
|
|
|
|
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
static void swap_names(struct dentry *dentry, struct dentry *target)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
if (unlikely(dname_external(target))) {
|
|
|
|
if (unlikely(dname_external(dentry))) {
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Both external: swap the pointers
|
|
|
|
*/
|
2009-01-08 05:09:14 +03:00
|
|
|
swap(target->d_name.name, dentry->d_name.name);
|
2005-04-17 02:20:36 +04:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* dentry:internal, target:external. Steal target's
|
|
|
|
* storage and make target internal.
|
|
|
|
*/
|
2007-10-22 03:41:38 +04:00
|
|
|
memcpy(target->d_iname, dentry->d_name.name,
|
|
|
|
dentry->d_name.len + 1);
|
2005-04-17 02:20:36 +04:00
|
|
|
dentry->d_name.name = target->d_name.name;
|
|
|
|
target->d_name.name = target->d_iname;
|
|
|
|
}
|
|
|
|
} else {
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
if (unlikely(dname_external(dentry))) {
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* dentry:external, target:internal. Give dentry's
|
|
|
|
* storage to target and make dentry internal
|
|
|
|
*/
|
|
|
|
memcpy(dentry->d_iname, target->d_name.name,
|
|
|
|
target->d_name.len + 1);
|
|
|
|
target->d_name.name = dentry->d_name.name;
|
|
|
|
dentry->d_name.name = dentry->d_iname;
|
|
|
|
} else {
|
|
|
|
/*
|
2014-04-01 19:08:43 +04:00
|
|
|
* Both are internal.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2014-04-01 19:08:43 +04:00
|
|
|
unsigned int i;
|
|
|
|
BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
|
2014-09-05 20:16:01 +04:00
|
|
|
kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
|
|
|
|
kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
|
2014-04-01 19:08:43 +04:00
|
|
|
for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
|
|
|
|
swap(((long *) &dentry->d_iname)[i],
|
|
|
|
((long *) &target->d_iname)[i]);
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
}
|
2014-09-24 23:27:39 +04:00
|
|
|
swap(dentry->d_name.hash_len, target->d_name.hash_len);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
static void copy_name(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
struct external_name *old_name = NULL;
|
|
|
|
if (unlikely(dname_external(dentry)))
|
|
|
|
old_name = external_name(dentry);
|
|
|
|
if (unlikely(dname_external(target))) {
|
|
|
|
atomic_inc(&external_name(target)->u.count);
|
|
|
|
dentry->d_name = target->d_name;
|
|
|
|
} else {
|
|
|
|
memcpy(dentry->d_iname, target->d_name.name,
|
|
|
|
target->d_name.len + 1);
|
|
|
|
dentry->d_name.name = dentry->d_iname;
|
|
|
|
dentry->d_name.hash_len = target->d_name.hash_len;
|
|
|
|
}
|
|
|
|
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
|
|
|
|
kfree_rcu(old_name, u.head);
|
|
|
|
}
|
|
|
|
|
2011-01-07 09:49:34 +03:00
|
|
|
static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* XXXX: do we really need to take target->d_lock?
|
|
|
|
*/
|
|
|
|
if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
|
|
|
|
spin_lock(&target->d_parent->d_lock);
|
|
|
|
else {
|
|
|
|
if (d_ancestor(dentry->d_parent, target->d_parent)) {
|
|
|
|
spin_lock(&dentry->d_parent->d_lock);
|
|
|
|
spin_lock_nested(&target->d_parent->d_lock,
|
|
|
|
DENTRY_D_LOCK_NESTED);
|
|
|
|
} else {
|
|
|
|
spin_lock(&target->d_parent->d_lock);
|
|
|
|
spin_lock_nested(&dentry->d_parent->d_lock,
|
|
|
|
DENTRY_D_LOCK_NESTED);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (target < dentry) {
|
|
|
|
spin_lock_nested(&target->d_lock, 2);
|
|
|
|
spin_lock_nested(&dentry->d_lock, 3);
|
|
|
|
} else {
|
|
|
|
spin_lock_nested(&dentry->d_lock, 2);
|
|
|
|
spin_lock_nested(&target->d_lock, 3);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-27 07:11:15 +04:00
|
|
|
static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
|
2011-01-07 09:49:34 +03:00
|
|
|
{
|
|
|
|
if (target->d_parent != dentry->d_parent)
|
|
|
|
spin_unlock(&dentry->d_parent->d_lock);
|
|
|
|
if (target->d_parent != target)
|
|
|
|
spin_unlock(&target->d_parent->d_lock);
|
2014-09-27 07:11:15 +04:00
|
|
|
spin_unlock(&target->d_lock);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 09:49:34 +03:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
2011-01-07 09:49:34 +03:00
|
|
|
* When switching names, the actual string doesn't strictly have to
|
|
|
|
* be preserved in the target - because we're dropping the target
|
|
|
|
* anyway. As such, we can just do a simple memcpy() to copy over
|
2014-09-24 22:14:33 +04:00
|
|
|
* the new name before we switch, unless we are going to rehash
|
|
|
|
* it. Note that if we *do* unhash the target, we are not allowed
|
|
|
|
* to rehash it without giving it a new name/hash key - whether
|
|
|
|
* we swap or overwrite the names here, resulting name won't match
|
|
|
|
* the reality in filesystem; it's only there for d_path() purposes.
|
|
|
|
* Note that all of this is happening under rename_lock, so the
|
|
|
|
* any hash lookup seeing it in the middle of manipulations will
|
|
|
|
* be discarded anyway. So we do not care what happens to the hash
|
|
|
|
* key in that case.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2006-10-21 21:24:20 +04:00
|
|
|
/*
|
2011-07-13 05:42:24 +04:00
|
|
|
* __d_move - move a dentry
|
2005-04-17 02:20:36 +04:00
|
|
|
* @dentry: entry to move
|
|
|
|
* @target: new dentry
|
2014-04-01 19:08:43 +04:00
|
|
|
* @exchange: exchange the two dentries
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
|
|
|
* Update the dcache to reflect the move of a file name. Negative
|
2011-07-26 21:33:16 +04:00
|
|
|
* dcache entries should not be moved in this way. Caller must hold
|
|
|
|
* rename_lock, the i_mutex of the source and target directories,
|
|
|
|
* and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2014-04-01 19:08:43 +04:00
|
|
|
static void __d_move(struct dentry *dentry, struct dentry *target,
|
|
|
|
bool exchange)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
if (!dentry->d_inode)
|
|
|
|
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
|
|
|
|
|
2011-01-07 09:49:34 +03:00
|
|
|
BUG_ON(d_ancestor(dentry, target));
|
|
|
|
BUG_ON(d_ancestor(target, dentry));
|
|
|
|
|
|
|
|
dentry_lock_for_move(dentry, target);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
write_seqcount_begin(&dentry->d_seq);
|
2013-10-08 02:51:59 +04:00
|
|
|
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
|
2011-01-07 09:50:05 +03:00
|
|
|
/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move the dentry to the target hash queue. Don't bother checking
|
|
|
|
* for the same hash queue because of how unlikely it is.
|
|
|
|
*/
|
|
|
|
__d_drop(dentry);
|
2011-01-07 09:49:30 +03:00
|
|
|
__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-04-01 19:08:43 +04:00
|
|
|
/*
|
|
|
|
* Unhash the target (d_delete() is not usable here). If exchanging
|
|
|
|
* the two dentries, then rehash onto the other's hash queue.
|
|
|
|
*/
|
2005-04-17 02:20:36 +04:00
|
|
|
__d_drop(target);
|
2014-04-01 19:08:43 +04:00
|
|
|
if (exchange) {
|
|
|
|
__d_rehash(target,
|
|
|
|
d_hash(dentry->d_parent, dentry->d_name.hash));
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* Switch the names.. */
|
Allow sharing external names after __d_move()
* external dentry names get a small structure prepended to them
(struct external_name).
* it contains an atomic refcount, matching the number of struct dentry
instances that have ->d_name.name pointing to that external name. The
first thing free_dentry() does is decrementing refcount of external name,
so the instances that are between the call of free_dentry() and
RCU-delayed actual freeing do not contribute.
* __d_move(x, y, false) makes the name of x equal to the name of y,
external or not. If y has an external name, extra reference is grabbed
and put into x->d_name.name. If x used to have an external name, the
reference to the old name is dropped and, should it reach zero, freeing
is scheduled via kfree_rcu().
* free_dentry() in dentry with external name decrements the refcount of
that name and, should it reach zero, does RCU-delayed call that will
free both the dentry and external name. Otherwise it does what it
used to do, except that __d_free() doesn't even look at ->d_name.name;
it simply frees the dentry.
All non-RCU accesses to dentry external name are safe wrt freeing since they
all should happen before free_dentry() is called. RCU accesses might run
into a dentry seen by free_dentry() or into an old name that got already
dropped by __d_move(); however, in both cases dentry must have been
alive and refer to that name at some point after we'd done rcu_read_lock(),
which means that any freeing must be still pending.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:54:27 +04:00
|
|
|
if (exchange)
|
|
|
|
swap_names(dentry, target);
|
|
|
|
else
|
|
|
|
copy_name(dentry, target);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-09-27 07:06:14 +04:00
|
|
|
/* ... and switch them in the tree */
|
2005-04-17 02:20:36 +04:00
|
|
|
if (IS_ROOT(dentry)) {
|
2014-09-27 07:06:14 +04:00
|
|
|
/* splicing a tree */
|
2005-04-17 02:20:36 +04:00
|
|
|
dentry->d_parent = target->d_parent;
|
|
|
|
target->d_parent = target;
|
2014-10-27 02:19:16 +03:00
|
|
|
list_del_init(&target->d_child);
|
|
|
|
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
|
2005-04-17 02:20:36 +04:00
|
|
|
} else {
|
2014-09-27 07:06:14 +04:00
|
|
|
/* swapping two dentries */
|
2009-01-08 05:09:14 +03:00
|
|
|
swap(dentry->d_parent, target->d_parent);
|
2014-10-27 02:19:16 +03:00
|
|
|
list_move(&target->d_child, &target->d_parent->d_subdirs);
|
|
|
|
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
|
2014-09-27 07:06:14 +04:00
|
|
|
if (exchange)
|
|
|
|
fsnotify_d_move(target);
|
|
|
|
fsnotify_d_move(dentry);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 09:49:52 +03:00
|
|
|
write_seqcount_end(&target->d_seq);
|
|
|
|
write_seqcount_end(&dentry->d_seq);
|
|
|
|
|
2014-09-27 07:11:15 +04:00
|
|
|
dentry_unlock_for_move(dentry, target);
|
2011-07-13 05:42:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* d_move - move a dentry
|
|
|
|
* @dentry: entry to move
|
|
|
|
* @target: new dentry
|
|
|
|
*
|
|
|
|
* Update the dcache to reflect the move of a file name. Negative
|
2011-07-26 21:33:16 +04:00
|
|
|
* dcache entries should not be moved in this way. See the locking
|
|
|
|
* requirements for __d_move.
|
2011-07-13 05:42:24 +04:00
|
|
|
*/
|
|
|
|
void d_move(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
write_seqlock(&rename_lock);
|
2014-04-01 19:08:43 +04:00
|
|
|
__d_move(dentry, target, false);
|
2005-04-17 02:20:36 +04:00
|
|
|
write_sequnlock(&rename_lock);
|
2006-10-21 21:24:20 +04:00
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_move);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-04-01 19:08:43 +04:00
|
|
|
/*
|
|
|
|
* d_exchange - exchange two dentries
|
|
|
|
* @dentry1: first dentry
|
|
|
|
* @dentry2: second dentry
|
|
|
|
*/
|
|
|
|
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
|
|
|
|
{
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
|
|
|
|
WARN_ON(!dentry1->d_inode);
|
|
|
|
WARN_ON(!dentry2->d_inode);
|
|
|
|
WARN_ON(IS_ROOT(dentry1));
|
|
|
|
WARN_ON(IS_ROOT(dentry2));
|
|
|
|
|
|
|
|
__d_move(dentry1, dentry2, true);
|
|
|
|
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
}
|
|
|
|
|
2008-10-16 02:50:28 +04:00
|
|
|
/**
|
|
|
|
* d_ancestor - search for an ancestor
|
|
|
|
* @p1: ancestor dentry
|
|
|
|
* @p2: child dentry
|
|
|
|
*
|
|
|
|
* Returns the ancestor dentry of p2 which is a child of p1, if p1 is
|
|
|
|
* an ancestor of p2, else NULL.
|
2006-10-21 21:24:20 +04:00
|
|
|
*/
|
2008-10-16 02:50:28 +04:00
|
|
|
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
|
2006-10-21 21:24:20 +04:00
|
|
|
{
|
|
|
|
struct dentry *p;
|
|
|
|
|
2008-10-16 02:50:27 +04:00
|
|
|
for (p = p2; !IS_ROOT(p); p = p->d_parent) {
|
2006-10-21 21:24:20 +04:00
|
|
|
if (p->d_parent == p1)
|
2008-10-16 02:50:28 +04:00
|
|
|
return p;
|
2006-10-21 21:24:20 +04:00
|
|
|
}
|
2008-10-16 02:50:28 +04:00
|
|
|
return NULL;
|
2006-10-21 21:24:20 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This helper attempts to cope with remotely renamed directories
|
|
|
|
*
|
|
|
|
* It assumes that the caller is already holding
|
2015-08-15 21:36:41 +03:00
|
|
|
* dentry->d_parent->d_inode->i_mutex, and rename_lock
|
2006-10-21 21:24:20 +04:00
|
|
|
*
|
|
|
|
* Note: If ever the locking in lock_rename() changes, then please
|
|
|
|
* remember to update this too...
|
|
|
|
*/
|
2014-10-13 06:16:02 +04:00
|
|
|
static int __d_unalias(struct inode *inode,
|
2011-01-07 09:50:06 +03:00
|
|
|
struct dentry *dentry, struct dentry *alias)
|
2006-10-21 21:24:20 +04:00
|
|
|
{
|
|
|
|
struct mutex *m1 = NULL, *m2 = NULL;
|
2015-02-10 18:55:53 +03:00
|
|
|
int ret = -ESTALE;
|
2006-10-21 21:24:20 +04:00
|
|
|
|
|
|
|
/* If alias and dentry share a parent, then no extra locks required */
|
|
|
|
if (alias->d_parent == dentry->d_parent)
|
|
|
|
goto out_unalias;
|
|
|
|
|
|
|
|
/* See lock_rename() */
|
|
|
|
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
|
|
|
|
goto out_err;
|
|
|
|
m1 = &dentry->d_sb->s_vfs_rename_mutex;
|
|
|
|
if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
|
|
|
|
goto out_err;
|
|
|
|
m2 = &alias->d_parent->d_inode->i_mutex;
|
|
|
|
out_unalias:
|
vfs: Lazily remove mounts on unlinked files and directories.
With the introduction of mount namespaces and bind mounts it became
possible to access files and directories that on some paths are mount
points but are not mount points on other paths. It is very confusing
when rm -rf somedir returns -EBUSY simply because somedir is mounted
somewhere else. With the addition of user namespaces allowing
unprivileged mounts this condition has gone from annoying to allowing
a DOS attack on other users in the system.
The possibility for mischief is removed by updating the vfs to support
rename, unlink and rmdir on a dentry that is a mountpoint and by
lazily unmounting mountpoints on deleted dentries.
In particular this change allows rename, unlink and rmdir system calls
on a dentry without a mountpoint in the current mount namespace to
succeed, and it allows rename, unlink, and rmdir performed on a
distributed filesystem to update the vfs cache even if when there is a
mount in some namespace on the original dentry.
There are two common patterns of maintaining mounts: Mounts on trusted
paths with the parent directory of the mount point and all ancestory
directories up to / owned by root and modifiable only by root
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
maintained by fusermount.
In the case of mounts in trusted directories owned by root and
modifiable only by root the current parent directory permissions are
sufficient to ensure a mount point on a trusted path is not removed
or renamed by anyone other than root, even if there is a context
where the there are no mount points to prevent this.
In the case of mounts in directories owned by less privileged users
races with users modifying the path of a mount point are already a
danger. fusermount already uses a combination of chdir,
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
removable of global rename, unlink, and rmdir protection really adds
nothing new to consider only a widening of the attack window, and
fusermount is already safe against unprivileged users modifying the
directory simultaneously.
In principle for perfect userspace programs returning -EBUSY for
unlink, rmdir, and rename of dentires that have mounts in the local
namespace is actually unnecessary. Unfortunately not all userspace
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
of dentries that have mounts in the current mount namespace plays an
important role of maintaining consistency with historical behavior and
making imperfect userspace applications hard to exploit.
v2: Remove spurious old_dentry.
v3: Optimized shrink_submounts_and_drop
Removed unsued afs label
v4: Simplified the changes to check_submounts_and_drop
Do not rename check_submounts_and_drop shrink_submounts_and_drop
Document what why we need atomicity in check_submounts_and_drop
Rely on the parent inode mutex to make d_revalidate and d_invalidate
an atomic unit.
v5: Refcount the mountpoint to detach in case of simultaneous
renames.
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-10-02 05:33:48 +04:00
|
|
|
__d_move(alias, dentry, false);
|
2014-10-13 06:16:02 +04:00
|
|
|
ret = 0;
|
2006-10-21 21:24:20 +04:00
|
|
|
out_err:
|
|
|
|
if (m2)
|
|
|
|
mutex_unlock(m2);
|
|
|
|
if (m1)
|
|
|
|
mutex_unlock(m1);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-02-18 23:11:26 +04:00
|
|
|
/**
|
|
|
|
* d_splice_alias - splice a disconnected dentry into the tree if one exists
|
|
|
|
* @inode: the inode which may have a disconnected dentry
|
|
|
|
* @dentry: a negative dentry which we want to point to the inode.
|
|
|
|
*
|
2014-02-18 03:03:57 +04:00
|
|
|
* If inode is a directory and has an IS_ROOT alias, then d_move that in
|
|
|
|
* place of the given dentry and return it, else simply d_add the inode
|
|
|
|
* to the dentry and return NULL.
|
2014-02-18 23:11:26 +04:00
|
|
|
*
|
2014-02-18 02:58:42 +04:00
|
|
|
* If a non-IS_ROOT directory is found, the filesystem is corrupt, and
|
|
|
|
* we should error out: directories can't have multiple aliases.
|
|
|
|
*
|
2014-02-18 23:11:26 +04:00
|
|
|
* This is needed in the lookup routine of any filesystem that is exportable
|
|
|
|
* (via knfsd) so that we can build dcache paths to directories effectively.
|
|
|
|
*
|
|
|
|
* If a dentry was found and moved, then it is returned. Otherwise NULL
|
|
|
|
* is returned. This matches the expected return value of ->lookup.
|
|
|
|
*
|
|
|
|
* Cluster filesystems may call this function with a negative, hashed dentry.
|
|
|
|
* In that case, we know that the inode will be a regular file, and also this
|
|
|
|
* will only occur during atomic_open. So we need to check for the dentry
|
|
|
|
* being already hashed only in the final case.
|
|
|
|
*/
|
|
|
|
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
if (IS_ERR(inode))
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
|
2006-08-23 04:06:07 +04:00
|
|
|
BUG_ON(!d_unhashed(dentry));
|
|
|
|
|
|
|
|
if (!inode) {
|
2008-10-16 02:50:28 +04:00
|
|
|
__d_instantiate(dentry, NULL);
|
2014-10-13 06:16:02 +04:00
|
|
|
goto out;
|
2006-08-23 04:06:07 +04:00
|
|
|
}
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_lock(&inode->i_lock);
|
2006-10-21 21:24:20 +04:00
|
|
|
if (S_ISDIR(inode->i_mode)) {
|
2014-10-13 06:16:02 +04:00
|
|
|
struct dentry *new = __d_find_any_alias(inode);
|
|
|
|
if (unlikely(new)) {
|
2015-08-15 21:36:41 +03:00
|
|
|
/* The reference to new ensures it remains an alias */
|
|
|
|
spin_unlock(&inode->i_lock);
|
2011-07-13 05:42:24 +04:00
|
|
|
write_seqlock(&rename_lock);
|
2014-10-13 06:16:02 +04:00
|
|
|
if (unlikely(d_ancestor(new, dentry))) {
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
dput(new);
|
|
|
|
new = ERR_PTR(-ELOOP);
|
|
|
|
pr_warn_ratelimited(
|
|
|
|
"VFS: Lookup of '%s' in %s %s"
|
|
|
|
" would have caused loop\n",
|
|
|
|
dentry->d_name.name,
|
|
|
|
inode->i_sb->s_type->name,
|
|
|
|
inode->i_sb->s_id);
|
|
|
|
} else if (!IS_ROOT(new)) {
|
|
|
|
int err = __d_unalias(inode, dentry, new);
|
2011-07-13 05:42:24 +04:00
|
|
|
write_sequnlock(&rename_lock);
|
2014-10-13 06:16:02 +04:00
|
|
|
if (err) {
|
|
|
|
dput(new);
|
|
|
|
new = ERR_PTR(err);
|
|
|
|
}
|
2011-07-13 05:42:24 +04:00
|
|
|
} else {
|
2014-10-13 06:16:02 +04:00
|
|
|
__d_move(new, dentry, false);
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
security_d_instantiate(new, inode);
|
2011-08-16 18:31:30 +04:00
|
|
|
}
|
2014-10-13 06:16:02 +04:00
|
|
|
iput(inode);
|
|
|
|
return new;
|
2006-10-21 21:24:20 +04:00
|
|
|
}
|
2006-08-23 04:06:07 +04:00
|
|
|
}
|
2014-10-13 06:16:02 +04:00
|
|
|
/* already taking inode->i_lock, so d_add() by hand */
|
|
|
|
__d_instantiate(dentry, inode);
|
2011-01-07 09:50:06 +03:00
|
|
|
spin_unlock(&inode->i_lock);
|
2014-10-13 06:16:02 +04:00
|
|
|
out:
|
|
|
|
security_d_instantiate(dentry, inode);
|
|
|
|
d_rehash(dentry);
|
|
|
|
return NULL;
|
2006-08-23 04:06:07 +04:00
|
|
|
}
|
2014-10-13 06:16:02 +04:00
|
|
|
EXPORT_SYMBOL(d_splice_alias);
|
2006-08-23 04:06:07 +04:00
|
|
|
|
2008-06-23 20:11:53 +04:00
|
|
|
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
|
2008-03-27 15:06:20 +03:00
|
|
|
{
|
|
|
|
*buflen -= namelen;
|
|
|
|
if (*buflen < 0)
|
|
|
|
return -ENAMETOOLONG;
|
|
|
|
*buffer -= namelen;
|
|
|
|
memcpy(*buffer, str, namelen);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
/**
|
|
|
|
* prepend_name - prepend a pathname in front of current buffer pointer
|
2013-09-12 18:55:35 +04:00
|
|
|
* @buffer: buffer pointer
|
|
|
|
* @buflen: allocated length of the buffer
|
|
|
|
* @name: name string and length qstr structure
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
*
|
|
|
|
* With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
|
|
|
|
* make sure that either the old or the new name pointer and length are
|
|
|
|
* fetched. However, there may be mismatch between length and pointer.
|
|
|
|
* The length cannot be trusted, we need to copy it byte-by-byte until
|
|
|
|
* the length is reached or a null byte is found. It also prepends "/" at
|
|
|
|
* the beginning of the name. The sequence number check at the caller will
|
|
|
|
* retry it again when a d_move() does happen. So any garbage in the buffer
|
|
|
|
* due to mismatched pointer and length will be discarded.
|
missing data dependency barrier in prepend_name()
AFAICS, prepend_name() is broken on SMP alpha. Disclaimer: I don't have
SMP alpha boxen to reproduce it on. However, it really looks like the race
is real.
CPU1: d_path() on /mnt/ramfs/<255-character>/foo
CPU2: mv /mnt/ramfs/<255-character> /mnt/ramfs/<63-character>
CPU2 does d_alloc(), which allocates an external name, stores the name there
including terminating NUL, does smp_wmb() and stores its address in
dentry->d_name.name. It proceeds to d_add(dentry, NULL) and d_move()
old dentry over to that. ->d_name.name value ends up in that dentry.
In the meanwhile, CPU1 gets to prepend_name() for that dentry. It fetches
->d_name.name and ->d_name.len; the former ends up pointing to new name
(64-byte kmalloc'ed array), the latter - 255 (length of the old name).
Nothing to force the ordering there, and normally that would be OK, since we'd
run into the terminating NUL and stop. Except that it's alpha, and we'd need
a data dependency barrier to guarantee that we see that store of NUL
__d_alloc() has done. In a similar situation dentry_cmp() would survive; it
does explicit smp_read_barrier_depends() after fetching ->d_name.name.
prepend_name() doesn't and it risks walking past the end of kmalloc'ed object
and possibly oops due to taking a page fault in kernel mode.
Cc: stable@vger.kernel.org # 3.12+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:46:30 +04:00
|
|
|
*
|
|
|
|
* Data dependency barrier is needed to make sure that we see that terminating
|
|
|
|
* NUL. Alpha strikes again, film at 11...
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
*/
|
2008-06-23 20:11:53 +04:00
|
|
|
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
|
|
|
|
{
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
const char *dname = ACCESS_ONCE(name->name);
|
|
|
|
u32 dlen = ACCESS_ONCE(name->len);
|
|
|
|
char *p;
|
|
|
|
|
missing data dependency barrier in prepend_name()
AFAICS, prepend_name() is broken on SMP alpha. Disclaimer: I don't have
SMP alpha boxen to reproduce it on. However, it really looks like the race
is real.
CPU1: d_path() on /mnt/ramfs/<255-character>/foo
CPU2: mv /mnt/ramfs/<255-character> /mnt/ramfs/<63-character>
CPU2 does d_alloc(), which allocates an external name, stores the name there
including terminating NUL, does smp_wmb() and stores its address in
dentry->d_name.name. It proceeds to d_add(dentry, NULL) and d_move()
old dentry over to that. ->d_name.name value ends up in that dentry.
In the meanwhile, CPU1 gets to prepend_name() for that dentry. It fetches
->d_name.name and ->d_name.len; the former ends up pointing to new name
(64-byte kmalloc'ed array), the latter - 255 (length of the old name).
Nothing to force the ordering there, and normally that would be OK, since we'd
run into the terminating NUL and stop. Except that it's alpha, and we'd need
a data dependency barrier to guarantee that we see that store of NUL
__d_alloc() has done. In a similar situation dentry_cmp() would survive; it
does explicit smp_read_barrier_depends() after fetching ->d_name.name.
prepend_name() doesn't and it risks walking past the end of kmalloc'ed object
and possibly oops due to taking a page fault in kernel mode.
Cc: stable@vger.kernel.org # 3.12+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-09-29 22:46:30 +04:00
|
|
|
smp_read_barrier_depends();
|
|
|
|
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
*buflen -= dlen + 1;
|
2014-03-23 08:28:40 +04:00
|
|
|
if (*buflen < 0)
|
|
|
|
return -ENAMETOOLONG;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
p = *buffer -= dlen + 1;
|
|
|
|
*p++ = '/';
|
|
|
|
while (dlen--) {
|
|
|
|
char c = *dname++;
|
|
|
|
if (!c)
|
|
|
|
break;
|
|
|
|
*p++ = c;
|
|
|
|
}
|
|
|
|
return 0;
|
2008-06-23 20:11:53 +04:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
2010-11-19 02:02:49 +03:00
|
|
|
* prepend_path - Prepend path string to a buffer
|
2008-03-27 15:06:21 +03:00
|
|
|
* @path: the dentry/vfsmount to report
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
* @root: root vfsmnt/dentry
|
2010-08-10 13:41:39 +04:00
|
|
|
* @buffer: pointer to the end of the buffer
|
|
|
|
* @buflen: pointer to buffer length
|
2007-02-13 23:08:18 +03:00
|
|
|
*
|
2013-09-12 18:55:35 +04:00
|
|
|
* The function will first try to write out the pathname without taking any
|
|
|
|
* lock other than the RCU read lock to make sure that dentries won't go away.
|
|
|
|
* It only checks the sequence number of the global rename_lock as any change
|
|
|
|
* in the dentry's d_seq will be preceded by changes in the rename_lock
|
|
|
|
* sequence number. If the sequence number had been changed, it will restart
|
|
|
|
* the whole pathname back-tracing sequence again by taking the rename_lock.
|
|
|
|
* In this case, there is no need to take the RCU read lock as the recursive
|
|
|
|
* parent pointer references will keep the dentry chain alive as long as no
|
|
|
|
* rename operation is performed.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
static int prepend_path(const struct path *path,
|
|
|
|
const struct path *root,
|
2010-08-10 13:41:39 +04:00
|
|
|
char **buffer, int *buflen)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2013-11-13 16:45:40 +04:00
|
|
|
struct dentry *dentry;
|
|
|
|
struct vfsmount *vfsmnt;
|
|
|
|
struct mount *mnt;
|
2010-08-10 13:41:39 +04:00
|
|
|
int error = 0;
|
2013-09-30 06:06:07 +04:00
|
|
|
unsigned seq, m_seq = 0;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
char *bptr;
|
|
|
|
int blen;
|
2008-03-27 15:06:20 +03:00
|
|
|
|
2013-09-09 23:22:25 +04:00
|
|
|
rcu_read_lock();
|
2013-09-30 06:06:07 +04:00
|
|
|
restart_mnt:
|
|
|
|
read_seqbegin_or_lock(&mount_lock, &m_seq);
|
|
|
|
seq = 0;
|
2013-11-13 11:21:51 +04:00
|
|
|
rcu_read_lock();
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
restart:
|
|
|
|
bptr = *buffer;
|
|
|
|
blen = *buflen;
|
2013-09-30 06:06:07 +04:00
|
|
|
error = 0;
|
2013-11-13 16:45:40 +04:00
|
|
|
dentry = path->dentry;
|
|
|
|
vfsmnt = path->mnt;
|
|
|
|
mnt = real_mount(vfsmnt);
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
read_seqbegin_or_lock(&rename_lock, &seq);
|
2010-08-10 13:41:39 +04:00
|
|
|
while (dentry != root->dentry || vfsmnt != root->mnt) {
|
2005-04-17 02:20:36 +04:00
|
|
|
struct dentry * parent;
|
|
|
|
|
|
|
|
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
|
2013-09-30 06:06:07 +04:00
|
|
|
struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
|
2015-08-15 21:36:12 +03:00
|
|
|
/* Escaped? */
|
|
|
|
if (dentry != vfsmnt->mnt_root) {
|
|
|
|
bptr = *buffer;
|
|
|
|
blen = *buflen;
|
|
|
|
error = 3;
|
|
|
|
break;
|
|
|
|
}
|
2007-02-13 23:08:18 +03:00
|
|
|
/* Global root? */
|
2013-09-30 06:06:07 +04:00
|
|
|
if (mnt != parent) {
|
|
|
|
dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
|
|
|
|
mnt = parent;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
vfsmnt = &mnt->mnt;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!error)
|
|
|
|
error = is_mounted(vfsmnt) ? 1 : 2;
|
|
|
|
break;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
parent = dentry->d_parent;
|
|
|
|
prefetch(parent);
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
error = prepend_name(&bptr, &blen, &dentry->d_name);
|
2010-08-10 13:41:39 +04:00
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
dentry = parent;
|
|
|
|
}
|
2013-09-09 23:22:25 +04:00
|
|
|
if (!(seq & 1))
|
|
|
|
rcu_read_unlock();
|
|
|
|
if (need_seqretry(&rename_lock, seq)) {
|
|
|
|
seq = 1;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
goto restart;
|
2013-09-09 23:22:25 +04:00
|
|
|
}
|
|
|
|
done_seqretry(&rename_lock, seq);
|
2013-11-13 11:21:51 +04:00
|
|
|
|
|
|
|
if (!(m_seq & 1))
|
|
|
|
rcu_read_unlock();
|
2013-09-30 06:06:07 +04:00
|
|
|
if (need_seqretry(&mount_lock, m_seq)) {
|
|
|
|
m_seq = 1;
|
|
|
|
goto restart_mnt;
|
|
|
|
}
|
|
|
|
done_seqretry(&mount_lock, m_seq);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
if (error >= 0 && bptr == *buffer) {
|
|
|
|
if (--blen < 0)
|
|
|
|
error = -ENAMETOOLONG;
|
|
|
|
else
|
|
|
|
*--bptr = '/';
|
|
|
|
}
|
|
|
|
*buffer = bptr;
|
|
|
|
*buflen = blen;
|
2013-03-27 02:25:57 +04:00
|
|
|
return error;
|
2010-08-10 13:41:39 +04:00
|
|
|
}
|
2008-06-16 15:28:07 +04:00
|
|
|
|
2010-08-10 13:41:39 +04:00
|
|
|
/**
|
|
|
|
* __d_path - return the path of a dentry
|
|
|
|
* @path: the dentry/vfsmount to report
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
* @root: root vfsmnt/dentry
|
2010-08-15 00:05:31 +04:00
|
|
|
* @buf: buffer to return value in
|
2010-08-10 13:41:39 +04:00
|
|
|
* @buflen: buffer length
|
|
|
|
*
|
2010-08-10 13:41:40 +04:00
|
|
|
* Convert a dentry into an ASCII path name.
|
2010-08-10 13:41:39 +04:00
|
|
|
*
|
|
|
|
* Returns a pointer into the buffer or an error code if the
|
|
|
|
* path was too long.
|
|
|
|
*
|
2010-10-10 13:36:21 +04:00
|
|
|
* "buflen" should be positive.
|
2010-08-10 13:41:39 +04:00
|
|
|
*
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
* If the path is not reachable from the supplied root, return %NULL.
|
2010-08-10 13:41:39 +04:00
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
char *__d_path(const struct path *path,
|
|
|
|
const struct path *root,
|
2010-08-10 13:41:39 +04:00
|
|
|
char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *res = buf + buflen;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
prepend(&res, &buflen, "\0", 1);
|
|
|
|
error = prepend_path(path, root, &res, &buflen);
|
2010-10-10 13:36:21 +04:00
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
if (error < 0)
|
|
|
|
return ERR_PTR(error);
|
|
|
|
if (error > 0)
|
|
|
|
return NULL;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
char *d_absolute_path(const struct path *path,
|
|
|
|
char *buf, int buflen)
|
|
|
|
{
|
|
|
|
struct path root = {};
|
|
|
|
char *res = buf + buflen;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
prepend(&res, &buflen, "\0", 1);
|
|
|
|
error = prepend_path(path, &root, &res, &buflen);
|
|
|
|
|
|
|
|
if (error > 1)
|
|
|
|
error = -EINVAL;
|
|
|
|
if (error < 0)
|
2010-08-10 13:41:39 +04:00
|
|
|
return ERR_PTR(error);
|
|
|
|
return res;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2010-08-10 13:41:40 +04:00
|
|
|
/*
|
|
|
|
* same as __d_path but appends "(deleted)" for unlinked files.
|
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
static int path_with_deleted(const struct path *path,
|
|
|
|
const struct path *root,
|
|
|
|
char **buf, int *buflen)
|
2010-08-10 13:41:40 +04:00
|
|
|
{
|
|
|
|
prepend(buf, buflen, "\0", 1);
|
|
|
|
if (d_unlinked(path->dentry)) {
|
|
|
|
int error = prepend(buf, buflen, " (deleted)", 10);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return prepend_path(path, root, buf, buflen);
|
|
|
|
}
|
|
|
|
|
2010-08-10 13:41:41 +04:00
|
|
|
static int prepend_unreachable(char **buffer, int *buflen)
|
|
|
|
{
|
|
|
|
return prepend(buffer, buflen, "(unreachable)", 13);
|
|
|
|
}
|
|
|
|
|
2013-09-13 00:24:55 +04:00
|
|
|
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
|
|
|
|
{
|
|
|
|
unsigned seq;
|
|
|
|
|
|
|
|
do {
|
|
|
|
seq = read_seqcount_begin(&fs->seq);
|
|
|
|
*root = fs->root;
|
|
|
|
} while (read_seqcount_retry(&fs->seq, seq));
|
|
|
|
}
|
|
|
|
|
2008-02-15 06:38:32 +03:00
|
|
|
/**
|
|
|
|
* d_path - return the path of a dentry
|
2008-02-15 06:38:44 +03:00
|
|
|
* @path: path to report
|
2008-02-15 06:38:32 +03:00
|
|
|
* @buf: buffer to return value in
|
|
|
|
* @buflen: buffer length
|
|
|
|
*
|
|
|
|
* Convert a dentry into an ASCII path name. If the entry has been deleted
|
|
|
|
* the string " (deleted)" is appended. Note that this is ambiguous.
|
|
|
|
*
|
2008-12-02 01:35:00 +03:00
|
|
|
* Returns a pointer into the buffer or an error code if the path was
|
|
|
|
* too long. Note: Callers should use the returned pointer, not the passed
|
|
|
|
* in buffer, to use the name! The implementation often starts at an offset
|
|
|
|
* into the buffer, and may leave 0 bytes at the start.
|
2008-02-15 06:38:32 +03:00
|
|
|
*
|
2008-06-23 20:11:52 +04:00
|
|
|
* "buflen" should be positive.
|
2008-02-15 06:38:32 +03:00
|
|
|
*/
|
2008-06-10 03:40:36 +04:00
|
|
|
char *d_path(const struct path *path, char *buf, int buflen)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-08-10 13:41:40 +04:00
|
|
|
char *res = buf + buflen;
|
2008-02-15 06:34:38 +03:00
|
|
|
struct path root;
|
2010-08-10 13:41:40 +04:00
|
|
|
int error;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-05-08 11:26:18 +04:00
|
|
|
/*
|
|
|
|
* We have various synthetic filesystems that never get mounted. On
|
|
|
|
* these filesystems dentries are never used for lookup purposes, and
|
|
|
|
* thus don't need to be hashed. They also don't need a name until a
|
|
|
|
* user wants to identify the object in /proc/pid/fd/. The little hack
|
|
|
|
* below allows us to generate a name for these objects on demand:
|
vfs: In d_path don't call d_dname on a mount point
Aditya Kali (adityakali@google.com) wrote:
> Commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
> "proc: Fix the namespace inode permission checks." converted
> the namespace files into symlinks. The same commit changed
> the way namespace bind mounts appear in /proc/mounts:
> $ mount --bind /proc/self/ns/ipc /mnt/ipc
> Originally:
> $ cat /proc/mounts | grep ipc
> proc /mnt/ipc proc rw,nosuid,nodev,noexec 0 0
>
> After commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
> $ cat /proc/mounts | grep ipc
> proc ipc:[4026531839] proc rw,nosuid,nodev,noexec 0 0
>
> This breaks userspace which expects the 2nd field in
> /proc/mounts to be a valid path.
The symlink /proc/<pid>/ns/{ipc,mnt,net,pid,user,uts} point to
dentries allocated with d_alloc_pseudo that we can mount, and
that have interesting names printed out with d_dname.
When these files are bind mounted /proc/mounts is not currently
displaying the mount point correctly because d_dname is called instead
of just displaying the path where the file is mounted.
Solve this by adding an explicit check to distinguish mounted pseudo
inodes and unmounted pseudo inodes. Unmounted pseudo inodes always
use mount of their filesstem as the mnt_root in their path making
these two cases easy to distinguish.
CC: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2013-11-09 04:31:29 +04:00
|
|
|
*
|
|
|
|
* Some pseudo inodes are mountable. When they are mounted
|
|
|
|
* path->dentry == path->mnt->mnt_root. In that case don't call d_dname
|
|
|
|
* and instead have d_path return the mounted path.
|
2007-05-08 11:26:18 +04:00
|
|
|
*/
|
vfs: In d_path don't call d_dname on a mount point
Aditya Kali (adityakali@google.com) wrote:
> Commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
> "proc: Fix the namespace inode permission checks." converted
> the namespace files into symlinks. The same commit changed
> the way namespace bind mounts appear in /proc/mounts:
> $ mount --bind /proc/self/ns/ipc /mnt/ipc
> Originally:
> $ cat /proc/mounts | grep ipc
> proc /mnt/ipc proc rw,nosuid,nodev,noexec 0 0
>
> After commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
> $ cat /proc/mounts | grep ipc
> proc ipc:[4026531839] proc rw,nosuid,nodev,noexec 0 0
>
> This breaks userspace which expects the 2nd field in
> /proc/mounts to be a valid path.
The symlink /proc/<pid>/ns/{ipc,mnt,net,pid,user,uts} point to
dentries allocated with d_alloc_pseudo that we can mount, and
that have interesting names printed out with d_dname.
When these files are bind mounted /proc/mounts is not currently
displaying the mount point correctly because d_dname is called instead
of just displaying the path where the file is mounted.
Solve this by adding an explicit check to distinguish mounted pseudo
inodes and unmounted pseudo inodes. Unmounted pseudo inodes always
use mount of their filesstem as the mnt_root in their path making
these two cases easy to distinguish.
CC: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2013-11-09 04:31:29 +04:00
|
|
|
if (path->dentry->d_op && path->dentry->d_op->d_dname &&
|
|
|
|
(!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
|
2008-02-15 06:38:44 +03:00
|
|
|
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
|
2007-05-08 11:26:18 +04:00
|
|
|
|
2013-09-13 00:24:55 +04:00
|
|
|
rcu_read_lock();
|
|
|
|
get_fs_root_rcu(current->fs, &root);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
error = path_with_deleted(path, &root, &res, &buflen);
|
2013-09-13 00:24:55 +04:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
if (error < 0)
|
2010-08-10 13:41:40 +04:00
|
|
|
res = ERR_PTR(error);
|
2005-04-17 02:20:36 +04:00
|
|
|
return res;
|
|
|
|
}
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(d_path);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-05-08 11:26:18 +04:00
|
|
|
/*
|
|
|
|
* Helper function for dentry_operations.d_dname() members
|
|
|
|
*/
|
|
|
|
char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
|
|
|
|
const char *fmt, ...)
|
|
|
|
{
|
|
|
|
va_list args;
|
|
|
|
char temp[64];
|
|
|
|
int sz;
|
|
|
|
|
|
|
|
va_start(args, fmt);
|
|
|
|
sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
|
|
|
|
va_end(args);
|
|
|
|
|
|
|
|
if (sz > sizeof(temp) || sz > buflen)
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
|
|
|
|
|
|
|
buffer += buflen - sz;
|
|
|
|
return memcpy(buffer, temp, sz);
|
|
|
|
}
|
|
|
|
|
2013-08-24 20:08:17 +04:00
|
|
|
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
|
|
|
|
{
|
|
|
|
char *end = buffer + buflen;
|
|
|
|
/* these dentries are never renamed, so d_lock is not needed */
|
|
|
|
if (prepend(&end, &buflen, " (deleted)", 11) ||
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
|
2013-08-24 20:08:17 +04:00
|
|
|
prepend(&end, &buflen, "/", 1))
|
|
|
|
end = ERR_PTR(-ENAMETOOLONG);
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
return end;
|
2013-08-24 20:08:17 +04:00
|
|
|
}
|
2014-01-03 17:09:47 +04:00
|
|
|
EXPORT_SYMBOL(simple_dname);
|
2013-08-24 20:08:17 +04:00
|
|
|
|
2008-03-27 15:06:20 +03:00
|
|
|
/*
|
|
|
|
* Write full pathname from the root of the filesystem into the buffer.
|
|
|
|
*/
|
2014-01-26 21:37:55 +04:00
|
|
|
static char *__dentry_path(struct dentry *d, char *buf, int buflen)
|
2008-03-27 15:06:20 +03:00
|
|
|
{
|
2014-01-26 21:37:55 +04:00
|
|
|
struct dentry *dentry;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
char *end, *retval;
|
|
|
|
int len, seq = 0;
|
|
|
|
int error = 0;
|
2008-03-27 15:06:20 +03:00
|
|
|
|
2014-01-26 21:37:55 +04:00
|
|
|
if (buflen < 2)
|
|
|
|
goto Elong;
|
|
|
|
|
2013-09-09 23:22:25 +04:00
|
|
|
rcu_read_lock();
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
restart:
|
2014-01-26 21:37:55 +04:00
|
|
|
dentry = d;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
end = buf + buflen;
|
|
|
|
len = buflen;
|
|
|
|
prepend(&end, &len, "\0", 1);
|
2008-03-27 15:06:20 +03:00
|
|
|
/* Get '/' right */
|
|
|
|
retval = end-1;
|
|
|
|
*retval = '/';
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
read_seqbegin_or_lock(&rename_lock, &seq);
|
2008-06-23 20:11:53 +04:00
|
|
|
while (!IS_ROOT(dentry)) {
|
|
|
|
struct dentry *parent = dentry->d_parent;
|
2008-03-27 15:06:20 +03:00
|
|
|
|
|
|
|
prefetch(parent);
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
error = prepend_name(&end, &len, &dentry->d_name);
|
|
|
|
if (error)
|
|
|
|
break;
|
2008-03-27 15:06:20 +03:00
|
|
|
|
|
|
|
retval = end;
|
|
|
|
dentry = parent;
|
|
|
|
}
|
2013-09-09 23:22:25 +04:00
|
|
|
if (!(seq & 1))
|
|
|
|
rcu_read_unlock();
|
|
|
|
if (need_seqretry(&rename_lock, seq)) {
|
|
|
|
seq = 1;
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
goto restart;
|
2013-09-09 23:22:25 +04:00
|
|
|
}
|
|
|
|
done_seqretry(&rename_lock, seq);
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
if (error)
|
|
|
|
goto Elong;
|
2010-06-07 06:31:14 +04:00
|
|
|
return retval;
|
|
|
|
Elong:
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
|
|
|
}
|
2011-01-07 09:49:29 +03:00
|
|
|
|
|
|
|
char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
|
|
|
|
{
|
dcache: Translating dentry into pathname without taking rename_lock
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:
8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock
|--42.21%-- d_path
| proc_pid_readlink
| SyS_readlinkat
| SyS_readlink
| system_call
| __GI___readlink
|
|--40.97%-- sys_getcwd
| system_call
| __getcwd
The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.
The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.
This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to take the rename_lock if read_seqretry()
fails for three times.
To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copy
the string byte-by-byte over until it hits a null byte or the end of
string as specified by the length. This should avoid stepping into
invalid memory address. The error cases are left to be handled by
the sequence number check.
The following code re-factoring are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
check.
2. Move the global root check in prepend_path() back to the top of
the while loop.
With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.
When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2013-09-09 20:18:13 +04:00
|
|
|
return __dentry_path(dentry, buf, buflen);
|
2011-01-07 09:49:29 +03:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dentry_path_raw);
|
2010-06-07 06:31:14 +04:00
|
|
|
|
|
|
|
char *dentry_path(struct dentry *dentry, char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *p = NULL;
|
|
|
|
char *retval;
|
|
|
|
|
|
|
|
if (d_unlinked(dentry)) {
|
|
|
|
p = buf + buflen;
|
|
|
|
if (prepend(&p, &buflen, "//deleted", 10) != 0)
|
|
|
|
goto Elong;
|
|
|
|
buflen++;
|
|
|
|
}
|
|
|
|
retval = __dentry_path(dentry, buf, buflen);
|
|
|
|
if (!IS_ERR(retval) && p)
|
|
|
|
*p = '/'; /* restore '/' overriden with '\0' */
|
2008-03-27 15:06:20 +03:00
|
|
|
return retval;
|
|
|
|
Elong:
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
|
|
|
}
|
|
|
|
|
2013-09-12 21:35:47 +04:00
|
|
|
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
|
|
|
|
struct path *pwd)
|
2013-09-12 21:12:47 +04:00
|
|
|
{
|
2013-09-12 21:35:47 +04:00
|
|
|
unsigned seq;
|
|
|
|
|
|
|
|
do {
|
|
|
|
seq = read_seqcount_begin(&fs->seq);
|
|
|
|
*root = fs->root;
|
|
|
|
*pwd = fs->pwd;
|
|
|
|
} while (read_seqcount_retry(&fs->seq, seq));
|
2013-09-12 21:12:47 +04:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* NOTE! The user-level library version returns a
|
|
|
|
* character pointer. The kernel system call just
|
|
|
|
* returns the length of the buffer filled (which
|
|
|
|
* includes the ending '\0' character), or a negative
|
|
|
|
* error value. So libc would do something like
|
|
|
|
*
|
|
|
|
* char *getcwd(char * buf, size_t size)
|
|
|
|
* {
|
|
|
|
* int retval;
|
|
|
|
*
|
|
|
|
* retval = sys_getcwd(buf, size);
|
|
|
|
* if (retval >= 0)
|
|
|
|
* return buf;
|
|
|
|
* errno = -retval;
|
|
|
|
* return NULL;
|
|
|
|
* }
|
|
|
|
*/
|
2009-01-14 16:14:22 +03:00
|
|
|
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2007-02-13 23:08:18 +03:00
|
|
|
int error;
|
2008-02-15 06:34:38 +03:00
|
|
|
struct path pwd, root;
|
2013-09-12 23:40:15 +04:00
|
|
|
char *page = __getname();
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-09-12 21:35:47 +04:00
|
|
|
rcu_read_lock();
|
|
|
|
get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-02-13 23:08:18 +03:00
|
|
|
error = -ENOENT;
|
2009-05-04 03:32:03 +04:00
|
|
|
if (!d_unlinked(pwd.dentry)) {
|
2007-02-13 23:08:18 +03:00
|
|
|
unsigned long len;
|
2013-09-12 23:40:15 +04:00
|
|
|
char *cwd = page + PATH_MAX;
|
|
|
|
int buflen = PATH_MAX;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-08-10 13:41:41 +04:00
|
|
|
prepend(&cwd, &buflen, "\0", 1);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
error = prepend_path(&pwd, &root, &cwd, &buflen);
|
2013-09-12 22:57:01 +04:00
|
|
|
rcu_read_unlock();
|
2007-02-13 23:08:18 +03:00
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
if (error < 0)
|
2007-02-13 23:08:18 +03:00
|
|
|
goto out;
|
|
|
|
|
2010-08-10 13:41:41 +04:00
|
|
|
/* Unreachable from current root */
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 17:43:34 +04:00
|
|
|
if (error > 0) {
|
2010-08-10 13:41:41 +04:00
|
|
|
error = prepend_unreachable(&cwd, &buflen);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-02-13 23:08:18 +03:00
|
|
|
error = -ERANGE;
|
2013-09-12 23:40:15 +04:00
|
|
|
len = PATH_MAX + page - cwd;
|
2007-02-13 23:08:18 +03:00
|
|
|
if (len <= size) {
|
|
|
|
error = len;
|
|
|
|
if (copy_to_user(buf, cwd, len))
|
|
|
|
error = -EFAULT;
|
|
|
|
}
|
2011-01-07 09:49:37 +03:00
|
|
|
} else {
|
2013-09-12 22:57:01 +04:00
|
|
|
rcu_read_unlock();
|
2011-01-07 09:49:37 +03:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
out:
|
2013-09-12 23:40:15 +04:00
|
|
|
__putname(page);
|
2005-04-17 02:20:36 +04:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Test whether new_dentry is a subdirectory of old_dentry.
|
|
|
|
*
|
|
|
|
* Trivially implemented using the dcache structure
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* is_subdir - is new dentry a subdirectory of old_dentry
|
|
|
|
* @new_dentry: new dentry
|
|
|
|
* @old_dentry: old dentry
|
|
|
|
*
|
|
|
|
* Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
|
|
|
|
* Returns 0 otherwise.
|
|
|
|
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
|
|
|
|
*/
|
|
|
|
|
2008-10-16 02:50:28 +04:00
|
|
|
int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
int result;
|
2011-01-07 09:49:37 +03:00
|
|
|
unsigned seq;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-10-16 02:50:28 +04:00
|
|
|
if (new_dentry == old_dentry)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
do {
|
2005-04-17 02:20:36 +04:00
|
|
|
/* for restarting inner loop in case of seq retry */
|
|
|
|
seq = read_seqbegin(&rename_lock);
|
2011-01-07 09:49:37 +03:00
|
|
|
/*
|
|
|
|
* Need rcu_readlock to protect against the d_parent trashing
|
|
|
|
* due to d_move
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2008-10-16 02:50:28 +04:00
|
|
|
if (d_ancestor(old_dentry, new_dentry))
|
2005-04-17 02:20:36 +04:00
|
|
|
result = 1;
|
2008-10-16 02:50:28 +04:00
|
|
|
else
|
|
|
|
result = 0;
|
2011-01-07 09:49:37 +03:00
|
|
|
rcu_read_unlock();
|
2005-04-17 02:20:36 +04:00
|
|
|
} while (read_seqretry(&rename_lock, seq));
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2013-09-05 13:44:35 +04:00
|
|
|
struct dentry *root = data;
|
|
|
|
if (dentry != root) {
|
|
|
|
if (d_unhashed(dentry) || !dentry->d_inode)
|
|
|
|
return D_WALK_SKIP;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2013-09-05 13:44:34 +04:00
|
|
|
if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
|
|
|
|
dentry->d_flags |= DCACHE_GENOCIDE;
|
|
|
|
dentry->d_lockref.count--;
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2013-09-05 13:44:35 +04:00
|
|
|
return D_WALK_CONTINUE;
|
|
|
|
}
|
2011-01-07 09:49:39 +03:00
|
|
|
|
2013-09-05 13:44:35 +04:00
|
|
|
void d_genocide(struct dentry *parent)
|
|
|
|
{
|
|
|
|
d_walk(parent, parent, d_genocide_kill, NULL);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2013-06-07 09:20:27 +04:00
|
|
|
void d_tmpfile(struct dentry *dentry, struct inode *inode)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2013-06-07 09:20:27 +04:00
|
|
|
inode_dec_link_count(inode);
|
|
|
|
BUG_ON(dentry->d_name.name != dentry->d_iname ||
|
2014-10-27 02:19:16 +03:00
|
|
|
!hlist_unhashed(&dentry->d_u.d_alias) ||
|
2013-06-07 09:20:27 +04:00
|
|
|
!d_unlinked(dentry));
|
|
|
|
spin_lock(&dentry->d_parent->d_lock);
|
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
|
|
|
dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
|
|
|
|
(unsigned long long)inode->i_ino);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
spin_unlock(&dentry->d_parent->d_lock);
|
|
|
|
d_instantiate(dentry, inode);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2013-06-07 09:20:27 +04:00
|
|
|
EXPORT_SYMBOL(d_tmpfile);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
static __initdata unsigned long dhash_entries;
|
|
|
|
static int __init set_dhash_entries(char *str)
|
|
|
|
{
|
|
|
|
if (!str)
|
|
|
|
return 0;
|
|
|
|
dhash_entries = simple_strtoul(str, &str, 0);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("dhash_entries=", set_dhash_entries);
|
|
|
|
|
|
|
|
static void __init dcache_init_early(void)
|
|
|
|
{
|
2012-02-09 00:39:07 +04:00
|
|
|
unsigned int loop;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* If hashes are distributed across NUMA nodes, defer
|
|
|
|
* hash allocation until vmalloc space is available.
|
|
|
|
*/
|
|
|
|
if (hashdist)
|
|
|
|
return;
|
|
|
|
|
|
|
|
dentry_hashtable =
|
|
|
|
alloc_large_system_hash("Dentry cache",
|
2011-04-24 09:32:03 +04:00
|
|
|
sizeof(struct hlist_bl_head),
|
2005-04-17 02:20:36 +04:00
|
|
|
dhash_entries,
|
|
|
|
13,
|
|
|
|
HASH_EARLY,
|
|
|
|
&d_hash_shift,
|
|
|
|
&d_hash_mask,
|
2012-05-23 17:33:35 +04:00
|
|
|
0,
|
2005-04-17 02:20:36 +04:00
|
|
|
0);
|
|
|
|
|
2012-02-09 00:39:07 +04:00
|
|
|
for (loop = 0; loop < (1U << d_hash_shift); loop++)
|
2011-04-24 09:32:03 +04:00
|
|
|
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2007-10-17 10:26:30 +04:00
|
|
|
static void __init dcache_init(void)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2012-02-09 00:39:07 +04:00
|
|
|
unsigned int loop;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A constructor could be added for stable state like the lists,
|
|
|
|
* but it is probably not worth it because of the cache nature
|
|
|
|
* of the dcache.
|
|
|
|
*/
|
2007-05-07 01:49:57 +04:00
|
|
|
dentry_cache = KMEM_CACHE(dentry,
|
|
|
|
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* Hash may have been set up in dcache_init_early */
|
|
|
|
if (!hashdist)
|
|
|
|
return;
|
|
|
|
|
|
|
|
dentry_hashtable =
|
|
|
|
alloc_large_system_hash("Dentry cache",
|
2011-04-24 09:32:03 +04:00
|
|
|
sizeof(struct hlist_bl_head),
|
2005-04-17 02:20:36 +04:00
|
|
|
dhash_entries,
|
|
|
|
13,
|
|
|
|
0,
|
|
|
|
&d_hash_shift,
|
|
|
|
&d_hash_mask,
|
2012-05-23 17:33:35 +04:00
|
|
|
0,
|
2005-04-17 02:20:36 +04:00
|
|
|
0);
|
|
|
|
|
2012-02-09 00:39:07 +04:00
|
|
|
for (loop = 0; loop < (1U << d_hash_shift); loop++)
|
2011-04-24 09:32:03 +04:00
|
|
|
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* SLAB cache for __getname() consumers */
|
2006-12-07 07:33:20 +03:00
|
|
|
struct kmem_cache *names_cachep __read_mostly;
|
2010-01-05 23:45:18 +03:00
|
|
|
EXPORT_SYMBOL(names_cachep);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
EXPORT_SYMBOL(d_genocide);
|
|
|
|
|
|
|
|
void __init vfs_caches_init_early(void)
|
|
|
|
{
|
|
|
|
dcache_init_early();
|
|
|
|
inode_init_early();
|
|
|
|
}
|
|
|
|
|
2015-08-07 01:46:20 +03:00
|
|
|
void __init vfs_caches_init(void)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
|
2007-07-20 05:11:58 +04:00
|
|
|
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-10-17 10:26:30 +04:00
|
|
|
dcache_init();
|
|
|
|
inode_init();
|
2015-08-07 01:46:20 +03:00
|
|
|
files_init();
|
|
|
|
files_maxfiles_init();
|
2007-10-17 10:26:30 +04:00
|
|
|
mnt_init();
|
2005-04-17 02:20:36 +04:00
|
|
|
bdev_cache_init();
|
|
|
|
chrdev_init();
|
|
|
|
}
|