From 345e1ae0c6ba54f6a4d32154e80cadc2ee2ef1af Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Aug 2021 23:22:02 +0100
Subject: [PATCH 1/8] afs: Fix missing put on afs_read objects and missing get
 on the key therein

The afs_read objects created by afs_req_issue_op() get leaked because
afs_alloc_read() returns a ref and then afs_fetch_data() gets its own ref
which is released when the operation completes, but the initial ref is
never released.

Fix this by discarding the initial ref at the end of afs_req_issue_op().

This leak also covered another bug whereby a ref isn't got on the key
attached to the read record by afs_req_issue_op().  This isn't a problem as
long as the afs_read req never goes away...

Fix this by calling key_get() in afs_req_issue_op().

This was found by the generic/074 test.  It leaks a bunch of kmalloc-192
objects each time it is run, which can be observed by watching
/proc/slabinfo.

Fixes: f7605fa869cf ("afs: Fix leak of afs_read objects")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163010394740.3035676.8516846193899793357.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/163111665914.283156.3038561975681836591.stgit@warthog.procyon.org.uk/
---
 fs/afs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index db035ae2a134..6688fff14b0b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -295,7 +295,7 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
 	fsreq->subreq	= subreq;
 	fsreq->pos	= subreq->start + subreq->transferred;
 	fsreq->len	= subreq->len   - subreq->transferred;
-	fsreq->key	= subreq->rreq->netfs_priv;
+	fsreq->key	= key_get(subreq->rreq->netfs_priv);
 	fsreq->vnode	= vnode;
 	fsreq->iter	= &fsreq->def_iter;
 
@@ -304,6 +304,7 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
 			fsreq->pos, fsreq->len);
 
 	afs_fetch_data(fsreq->vnode, fsreq);
+	afs_put_read(fsreq);
 }
 
 static int afs_symlink_readpage(struct page *page)

From 581b2027af0018944ba301d68e7af45c6d1128b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 1 Sep 2021 09:15:21 +0100
Subject: [PATCH 2/8] afs: Fix page leak

There's a loop in afs_extend_writeback() that adds extra pages to a write
we want to make to improve the efficiency of the writeback by making it
larger.  This loop stops, however, if we hit a page we can't write back
from immediately, but it doesn't get rid of the page ref we speculatively
acquired.

This was caused by the removal of the cleanup loop when the code switched
from using find_get_pages_contig() to xarray scanning as the latter only
gets a single page at a time, not a batch.

Fix this by putting the page on a ref on an early break from the loop.
Unfortunately, we can't just add that page to the pagevec we're employing
as we'll go through that and add those pages to the RPC call.

This was found by the generic/074 test.  It leaks ~4GiB of RAM each time it
is run - which can be observed with "top".

Fixes: e87b03f5830e ("afs: Prepare for use of THPs")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163111666635.283156.177701903478910460.stgit@warthog.procyon.org.uk/
---
 fs/afs/write.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/afs/write.c b/fs/afs/write.c
index c0534697268e..66b235266893 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -471,13 +471,18 @@ static void afs_extend_writeback(struct address_space *mapping,
 			}
 
 			/* Has the page moved or been split? */
-			if (unlikely(page != xas_reload(&xas)))
+			if (unlikely(page != xas_reload(&xas))) {
+				put_page(page);
 				break;
+			}
 
-			if (!trylock_page(page))
+			if (!trylock_page(page)) {
+				put_page(page);
 				break;
+			}
 			if (!PageDirty(page) || PageWriteback(page)) {
 				unlock_page(page);
+				put_page(page);
 				break;
 			}
 
@@ -487,6 +492,7 @@ static void afs_extend_writeback(struct address_space *mapping,
 			t = afs_page_dirty_to(page, priv);
 			if (f != 0 && !new_content) {
 				unlock_page(page);
+				put_page(page);
 				break;
 			}
 

From 3978d816523991dd86cf9aae88c295230a5ea3b2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 1 Sep 2021 19:22:50 +0100
Subject: [PATCH 3/8] afs: Add missing vnode validation checks

afs_d_revalidate() should only be validating the directory entry it is
given and the directory to which that belongs; it shouldn't be validating
the inode/vnode to which that dentry points.  Besides, validation need to
be done even if we don't call afs_d_revalidate() - which might be the case
if we're starting from a file descriptor.

In order for afs_d_revalidate() to be fixed, validation points must be
added in some other places.  Certain directory operations, such as
afs_unlink(), already check this, but not all and not all file operations
either.

Note that the validation of a vnode not only checks to see if the
attributes we have are correct, but also gets a promise from the server to
notify us if that file gets changed by a third party.

Add the following checks:

 - Check the vnode we're going to make a hard link to.
 - Check the vnode we're going to move/rename.
 - Check the vnode we're going to read from.
 - Check the vnode we're going to write to.
 - Check the vnode we're going to sync.
 - Check the vnode we're going to make a mapped page writable for.

Some of these aren't strictly necessary as we're going to perform a server
operation that might get the attributes anyway from which we can determine
if something changed - though it might not get us a callback promise.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163111667354.283156.12720698333342917516.stgit@warthog.procyon.org.uk/
---
 fs/afs/dir.c   | 11 +++++++++++
 fs/afs/file.c  | 16 +++++++++++++++-
 fs/afs/write.c | 17 +++++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ac829e63c570..a8e3ae55f1f9 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1792,6 +1792,10 @@ static int afs_link(struct dentry *from, struct inode *dir,
 		goto error;
 	}
 
+	ret = afs_validate(vnode, op->key);
+	if (ret < 0)
+		goto error_op;
+
 	afs_op_set_vnode(op, 0, dvnode);
 	afs_op_set_vnode(op, 1, vnode);
 	op->file[0].dv_delta = 1;
@@ -1805,6 +1809,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	op->create.reason	= afs_edit_dir_for_link;
 	return afs_do_sync_operation(op);
 
+error_op:
+	afs_put_operation(op);
 error:
 	d_drop(dentry);
 	_leave(" = %d", ret);
@@ -1989,6 +1995,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	if (IS_ERR(op))
 		return PTR_ERR(op);
 
+	ret = afs_validate(vnode, op->key);
+	op->error = ret;
+	if (ret < 0)
+		goto error;
+
 	afs_op_set_vnode(op, 0, orig_dvnode);
 	afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
 	op->file[0].dv_delta = 1;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6688fff14b0b..4c8d786b53e0 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -24,12 +24,13 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 
 static void afs_readahead(struct readahead_control *ractl);
+static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= afs_file_read_iter,
 	.write_iter	= afs_file_write,
 	.mmap		= afs_file_mmap,
 	.splice_read	= generic_file_splice_read,
@@ -503,3 +504,16 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		vma->vm_ops = &afs_vm_ops;
 	return ret;
 }
+
+static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+	struct afs_file *af = iocb->ki_filp->private_data;
+	int ret;
+
+	ret = afs_validate(vnode, af->key);
+	if (ret < 0)
+		return ret;
+
+	return generic_file_read_iter(iocb, iter);
+}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 66b235266893..32a764c24284 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -807,6 +807,7 @@ int afs_writepages(struct address_space *mapping,
 ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+	struct afs_file *af = iocb->ki_filp->private_data;
 	ssize_t result;
 	size_t count = iov_iter_count(from);
 
@@ -822,6 +823,10 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	if (!count)
 		return 0;
 
+	result = afs_validate(vnode, af->key);
+	if (result < 0)
+		return result;
+
 	result = generic_file_write_iter(iocb, from);
 
 	_leave(" = %zd", result);
@@ -835,13 +840,18 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
  */
 int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	struct inode *inode = file_inode(file);
-	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+	struct afs_file *af = file->private_data;
+	int ret;
 
 	_enter("{%llx:%llu},{n=%pD},%d",
 	       vnode->fid.vid, vnode->fid.vnode, file,
 	       datasync);
 
+	ret = afs_validate(vnode, af->key);
+	if (ret < 0)
+		return ret;
+
 	return file_write_and_wait_range(file, start, end);
 }
 
@@ -855,11 +865,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 	struct file *file = vmf->vma->vm_file;
 	struct inode *inode = file_inode(file);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_file *af = file->private_data;
 	unsigned long priv;
 	vm_fault_t ret = VM_FAULT_RETRY;
 
 	_enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
 
+	afs_validate(vnode, af->key);
+
 	sb_start_pagefault(inode->i_sb);
 
 	/* Wait for the page to be written to the cache before we allow it to

From 63d49d843ef5fffeea069e0ffdfbd2bf40ba01c6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 1 Sep 2021 18:11:33 +0100
Subject: [PATCH 4/8] afs: Fix incorrect triggering of sillyrename on 3rd-party
 invalidation

The AFS filesystem is currently triggering the silly-rename cleanup from
afs_d_revalidate() when it sees that a dentry has been changed by a third
party[1].  It should not be doing this as the cleanup includes deleting the
silly-rename target file on iput.

Fix this by removing the places in the d_revalidate handling that validate
anything other than the directory and the dirent.  It probably should not
be looking to validate the target inode of the dentry also.

This includes removing the point in afs_d_revalidate() where the inode that
a dentry used to point to was marked as being deleted (AFS_VNODE_DELETED).
We don't know it got deleted.  It could have been renamed or it could have
hard links remaining.

This was reproduced by cloning a git repo onto an afs volume on one
machine, switching to another machine and doing "git status", then
switching back to the first and doing "git status".  The second status
would show weird output due to ".git/index" getting deleted by the above
mentioned mechanism.

A simpler way to do it is to do:

	machine 1: touch a
	machine 2: touch b; mv -f b a
	machine 1: stat a

on an afs volume.  The bug shows up as the stat failing with ENOENT and the
file server log showing that machine 1 deleted "a".

Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c4 [1]
Link: https://lore.kernel.org/r/163111668100.283156.3851669884664475428.stgit@warthog.procyon.org.uk/
---
 fs/afs/dir.c | 46 +++++++---------------------------------------
 1 file changed, 7 insertions(+), 39 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a8e3ae55f1f9..4579bbda4634 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1077,9 +1077,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
  */
 static int afs_d_revalidate_rcu(struct dentry *dentry)
 {
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_vnode *dvnode;
 	struct dentry *parent;
-	struct inode *dir, *inode;
+	struct inode *dir;
 	long dir_version, de_version;
 
 	_enter("%p", dentry);
@@ -1109,18 +1109,6 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
 			return -ECHILD;
 	}
 
-	/* Check to see if the vnode referred to by the dentry still
-	 * has a callback.
-	 */
-	if (d_really_is_positive(dentry)) {
-		inode = d_inode_rcu(dentry);
-		if (inode) {
-			vnode = AFS_FS_I(inode);
-			if (!afs_check_validity(vnode))
-				return -ECHILD;
-		}
-	}
-
 	return 1; /* Still valid */
 }
 
@@ -1156,17 +1144,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (IS_ERR(key))
 		key = NULL;
 
-	if (d_really_is_positive(dentry)) {
-		inode = d_inode(dentry);
-		if (inode) {
-			vnode = AFS_FS_I(inode);
-			afs_validate(vnode, key);
-			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-				goto out_bad;
-		}
-	}
-
-	/* lock down the parent dentry so we can peer at it */
+	/* Hold the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
 	dir = AFS_FS_I(d_inode(parent));
 
@@ -1175,7 +1153,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 
 	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
 		_debug("%pd: parent dir deleted", dentry);
-		goto out_bad_parent;
+		goto not_found;
 	}
 
 	/* We only need to invalidate a dentry if the server's copy changed
@@ -1201,12 +1179,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	case 0:
 		/* the filename maps to something */
 		if (d_really_is_negative(dentry))
-			goto out_bad_parent;
+			goto not_found;
 		inode = d_inode(dentry);
 		if (is_bad_inode(inode)) {
 			printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
 			       dentry);
-			goto out_bad_parent;
+			goto not_found;
 		}
 
 		vnode = AFS_FS_I(inode);
@@ -1228,9 +1206,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 			       dentry, fid.unique,
 			       vnode->fid.unique,
 			       vnode->vfs_inode.i_generation);
-			write_seqlock(&vnode->cb_lock);
-			set_bit(AFS_VNODE_DELETED, &vnode->flags);
-			write_sequnlock(&vnode->cb_lock);
 			goto not_found;
 		}
 		goto out_valid;
@@ -1245,7 +1220,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	default:
 		_debug("failed to iterate dir %pd: %d",
 		       parent, ret);
-		goto out_bad_parent;
+		goto not_found;
 	}
 
 out_valid:
@@ -1256,16 +1231,9 @@ out_valid_noupdate:
 	_leave(" = 1 [valid]");
 	return 1;
 
-	/* the dirent, if it exists, now points to a different vnode */
 not_found:
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
-	spin_unlock(&dentry->d_lock);
-
-out_bad_parent:
 	_debug("dropping dentry %pd2", dentry);
 	dput(parent);
-out_bad:
 	key_put(key);
 
 	_leave(" = 0 [bad]");

From 6e0e99d58a6530cf65f10e4bb16630c5be6c254d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Sep 2021 16:43:10 +0100
Subject: [PATCH 5/8] afs: Fix mmap coherency vs 3rd-party changes

Fix the coherency management of mmap'd data such that 3rd-party changes
become visible as soon as possible after the callback notification is
delivered by the fileserver.  This is done by the following means:

 (1) When we break a callback on a vnode specified by the CB.CallBack call
     from the server, we queue a work item (vnode->cb_work) to go and
     clobber all the PTEs mapping to that inode.

     This causes the CPU to trip through the ->map_pages() and
     ->page_mkwrite() handlers if userspace attempts to access the page(s)
     again.

     (Ideally, this would be done in the service handler for CB.CallBack,
     but the server is waiting for our reply before considering, and we
     have a list of vnodes, all of which need breaking - and the process of
     getting the mmap_lock and stripping the PTEs on all CPUs could be
     quite slow.)

 (2) Call afs_validate() from the ->map_pages() handler to check to see if
     the file has changed and to get a new callback promise from the
     server.

Also handle the fileserver telling us that it's dropping all callbacks,
possibly after it's been restarted by sending us a CB.InitCallBackState*
call by the following means:

 (3) Maintain a per-cell list of afs files that are currently mmap'd
     (cell->fs_open_mmaps).

 (4) Add a work item to each server that is invoked if there are any open
     mmaps when CB.InitCallBackState happens.  This work item goes through
     the aforementioned list and invokes the vnode->cb_work work item for
     each one that is currently using this server.

     This causes the PTEs to be cleared, causing ->map_pages() or
     ->page_mkwrite() to be called again, thereby calling afs_validate()
     again.

I've chosen to simply strip the PTEs at the point of notification reception
rather than invalidate all the pages as well because (a) it's faster, (b)
we may get a notification for other reasons than the data being altered (in
which case we don't want to clobber the pagecache) and (c) we need to ask
the server to find out - and I don't want to wait for the reply before
holding up userspace.

This was tested using the attached test program:

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/mman.h>
	int main(int argc, char *argv[])
	{
		size_t size = getpagesize();
		unsigned char *p;
		bool mod = (argc == 3);
		int fd;
		if (argc != 2 && argc != 3) {
			fprintf(stderr, "Format: %s <file> [mod]\n", argv[0]);
			exit(2);
		}
		fd = open(argv[1], mod ? O_RDWR : O_RDONLY);
		if (fd < 0) {
			perror(argv[1]);
			exit(1);
		}

		p = mmap(NULL, size, mod ? PROT_READ|PROT_WRITE : PROT_READ,
			 MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}
		for (;;) {
			if (mod) {
				p[0]++;
				msync(p, size, MS_ASYNC);
				fsync(fd);
			}
			printf("%02x", p[0]);
			fflush(stdout);
			sleep(1);
		}
	}

It runs in two modes: in one mode, it mmaps a file, then sits in a loop
reading the first byte, printing it and sleeping for a second; in the
second mode it mmaps a file, then sits in a loop incrementing the first
byte and flushing, then printing and sleeping.

Two instances of this program can be run on different machines, one doing
the reading and one doing the writing.  The reader should see the changes
made by the writer, but without this patch, they aren't because validity
checking is being done lazily - only on entry to the filesystem.

Testing the InitCallBackState change is more complicated.  The server has
to be taken offline, the saved callback state file removed and then the
server restarted whilst the reading-mode program continues to run.  The
client machine then has to poke the server to trigger the InitCallBackState
call.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163111668833.283156.382633263709075739.stgit@warthog.procyon.org.uk/
---
 fs/afs/callback.c | 42 +++++++++++++++++++++++++++--
 fs/afs/cell.c     |  2 ++
 fs/afs/file.c     | 67 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/afs/internal.h |  8 ++++++
 fs/afs/server.c   |  2 ++
 fs/afs/super.c    |  1 +
 mm/memory.c       |  1 +
 7 files changed, 120 insertions(+), 3 deletions(-)

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 7d9b23d981bf..1dd7543dbf9f 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -20,6 +20,37 @@
 #include <linux/sched.h>
 #include "internal.h"
 
+/*
+ * Handle invalidation of an mmap'd file.  We invalidate all the PTEs referring
+ * to the pages in this file's pagecache, forcing the kernel to go through
+ * ->fault() or ->page_mkwrite() - at which point we can handle invalidation
+ * more fully.
+ */
+void afs_invalidate_mmap_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
+
+	unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+}
+
+void afs_server_init_callback_work(struct work_struct *work)
+{
+	struct afs_server *server = container_of(work, struct afs_server, initcb_work);
+	struct afs_vnode *vnode;
+	struct afs_cell *cell = server->cell;
+
+	down_read(&cell->fs_open_mmaps_lock);
+
+	list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
+		if (vnode->cb_server == server) {
+			clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+			queue_work(system_unbound_wq, &vnode->cb_work);
+		}
+	}
+
+	up_read(&cell->fs_open_mmaps_lock);
+}
+
 /*
  * Allow the fileserver to request callback state (re-)initialisation.
  * Unfortunately, UUIDs are not guaranteed unique.
@@ -29,8 +60,10 @@ void afs_init_callback_state(struct afs_server *server)
 	rcu_read_lock();
 	do {
 		server->cb_s_break++;
-		server = rcu_dereference(server->uuid_next);
-	} while (0);
+		if (!list_empty(&server->cell->fs_open_mmaps))
+			queue_work(system_unbound_wq, &server->initcb_work);
+
+	} while ((server = rcu_dereference(server->uuid_next)));
 	rcu_read_unlock();
 }
 
@@ -49,6 +82,11 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 		if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
 			afs_lock_may_be_available(vnode);
 
+		if (reason != afs_cb_break_for_deleted &&
+		    vnode->status.type == AFS_FTYPE_FILE &&
+		    atomic_read(&vnode->cb_nr_mmap))
+			queue_work(system_unbound_wq, &vnode->cb_work);
+
 		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
 	} else {
 		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 887b673f6223..d88407fb9bc0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -166,6 +166,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 	seqlock_init(&cell->volume_lock);
 	cell->fs_servers = RB_ROOT;
 	seqlock_init(&cell->fs_lock);
+	INIT_LIST_HEAD(&cell->fs_open_mmaps);
+	init_rwsem(&cell->fs_open_mmaps_lock);
 	rwlock_init(&cell->vl_servers_lock);
 	cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 4c8d786b53e0..e6c447ae91f3 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -25,6 +25,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 
 static void afs_readahead(struct readahead_control *ractl);
 static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+static void afs_vm_open(struct vm_area_struct *area);
+static void afs_vm_close(struct vm_area_struct *area);
+static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff);
 
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
@@ -60,8 +63,10 @@ const struct address_space_operations afs_fs_aops = {
 };
 
 static const struct vm_operations_struct afs_vm_ops = {
+	.open		= afs_vm_open,
+	.close		= afs_vm_close,
 	.fault		= filemap_fault,
-	.map_pages	= filemap_map_pages,
+	.map_pages	= afs_vm_map_pages,
 	.page_mkwrite	= afs_page_mkwrite,
 };
 
@@ -492,19 +497,79 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 	return 1;
 }
 
+static void afs_add_open_mmap(struct afs_vnode *vnode)
+{
+	if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
+		down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+
+		list_add_tail(&vnode->cb_mmap_link,
+			      &vnode->volume->cell->fs_open_mmaps);
+
+		up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+	}
+}
+
+static void afs_drop_open_mmap(struct afs_vnode *vnode)
+{
+	if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+		return;
+
+	down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+
+	if (atomic_read(&vnode->cb_nr_mmap) == 0)
+		list_del_init(&vnode->cb_mmap_link);
+
+	up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+	flush_work(&vnode->cb_work);
+}
+
 /*
  * Handle setting up a memory mapping on an AFS file.
  */
 static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	int ret;
 
+	afs_add_open_mmap(vnode);
+
 	ret = generic_file_mmap(file, vma);
 	if (ret == 0)
 		vma->vm_ops = &afs_vm_ops;
+	else
+		afs_drop_open_mmap(vnode);
 	return ret;
 }
 
+static void afs_vm_open(struct vm_area_struct *vma)
+{
+	afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
+}
+
+static void afs_vm_close(struct vm_area_struct *vma)
+{
+	afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
+}
+
+static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
+	struct afs_file *af = vmf->vma->vm_file->private_data;
+
+	switch (afs_validate(vnode, af->key)) {
+	case 0:
+		return filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	case -ENOMEM:
+		return VM_FAULT_OOM;
+	case -EINTR:
+	case -ERESTARTSYS:
+		return VM_FAULT_RETRY;
+	case -ESTALE:
+	default:
+		return VM_FAULT_SIGBUS;
+	}
+}
+
 static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5ed416f4ff33..0deeb76c67d0 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -390,6 +390,8 @@ struct afs_cell {
 	/* Active fileserver interaction state. */
 	struct rb_root		fs_servers;	/* afs_server (by server UUID) */
 	seqlock_t		fs_lock;	/* For fs_servers  */
+	struct rw_semaphore	fs_open_mmaps_lock;
+	struct list_head	fs_open_mmaps;	/* List of vnodes that are mmapped */
 
 	/* VL server list. */
 	rwlock_t		vl_servers_lock; /* Lock on vl_servers */
@@ -503,6 +505,7 @@ struct afs_server {
 	struct hlist_node	addr4_link;	/* Link in net->fs_addresses4 */
 	struct hlist_node	addr6_link;	/* Link in net->fs_addresses6 */
 	struct hlist_node	proc_link;	/* Link in net->fs_proc */
+	struct work_struct	initcb_work;	/* Work for CB.InitCallBackState* */
 	struct afs_server	*gc_next;	/* Next server in manager's list */
 	time64_t		unuse_time;	/* Time at which last unused */
 	unsigned long		flags;
@@ -657,7 +660,10 @@ struct afs_vnode {
 	afs_lock_type_t		lock_type : 8;
 
 	/* outstanding callback notification on this file */
+	struct work_struct	cb_work;	/* Work for mmap'd files */
+	struct list_head	cb_mmap_link;	/* Link in cell->fs_open_mmaps */
 	void			*cb_server;	/* Server with callback/filelock */
+	atomic_t		cb_nr_mmap;	/* Number of mmaps */
 	unsigned int		cb_s_break;	/* Mass break counter on ->server */
 	unsigned int		cb_v_break;	/* Mass break counter on ->volume */
 	unsigned int		cb_break;	/* Break counter on vnode */
@@ -965,6 +971,8 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
 /*
  * callback.c
  */
+extern void afs_invalidate_mmap_work(struct work_struct *);
+extern void afs_server_init_callback_work(struct work_struct *work);
 extern void afs_init_callback_state(struct afs_server *);
 extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
 extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 684a2b02b9ff..6e5b9a19b234 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -235,6 +235,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 	server->addr_version = alist->version;
 	server->uuid = *uuid;
 	rwlock_init(&server->fs_lock);
+	INIT_WORK(&server->initcb_work, afs_server_init_callback_work);
 	init_waitqueue_head(&server->probe_wq);
 	INIT_LIST_HEAD(&server->probe_link);
 	spin_lock_init(&server->probe_lock);
@@ -467,6 +468,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
 	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
 		afs_give_up_callbacks(net, server);
 
+	flush_work(&server->initcb_work);
 	afs_put_server(net, server, afs_server_trace_destroy);
 }
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e38bb1e7a4d2..d110def8aa8e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -698,6 +698,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	vnode->lock_state	= AFS_VNODE_LOCK_NONE;
 
 	init_rwsem(&vnode->rmdir_lock);
+	INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
 
 	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
diff --git a/mm/memory.c b/mm/memory.c
index 25fc46e87214..adf9b9ef8277 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3403,6 +3403,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	i_mmap_unlock_write(mapping);
 }
+EXPORT_SYMBOL_GPL(unmap_mapping_pages);
 
 /**
  * unmap_mapping_range - unmap the portion of all mmaps in the specified

From 4fe6a946823a9bc8619fd16b7ea7d15914a30f22 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Sep 2021 21:51:01 +0100
Subject: [PATCH 6/8] afs: Try to avoid taking RCU read lock when checking
 vnode validity

Try to avoid taking the RCU read lock when checking the validity of a
vnode's callback state.  The only thing it's needed for is to pin the
parent volume's server list whilst we search it to find the record of the
server we're currently using to see if it has been reinitialised (ie. it
sent us a CB.InitCallBackState* RPC).

Do this by the following means:

 (1) Keep an additional per-cell counter (fs_s_break) that's incremented
     each time any of the fileservers in the cell reinitialises.

     Since the new counter can be accessed without RCU from the vnode, we
     can check that first - and only if it differs, get the RCU read lock
     and check the volume's server list.

 (2) Replace afs_get_s_break_rcu() with afs_check_server_good() which now
     indicates whether the callback promise is still expected to be present
     on the server.  This does the checks as described in (1).

 (3) Restructure afs_check_validity() to take account of the change in (2).

     We can also get rid of the valid variable and just use the need_clear
     variable with the addition of the afs_cb_break_no_promise reason.

 (4) afs_check_validity() probably shouldn't be altering vnode->cb_v_break
     and vnode->cb_s_break when it doesn't have cb_lock exclusively locked.

     Move the change to vnode->cb_v_break to __afs_break_callback().

     Delegate the change to vnode->cb_s_break to afs_select_fileserver()
     and set vnode->cb_fs_s_break there also.

 (5) afs_validate() no longer needs to get the RCU read lock around its
     call to afs_check_validity() - and can skip the call entirely if we
     don't have a promise.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163111669583.283156.1397603105683094563.stgit@warthog.procyon.org.uk/
---
 fs/afs/callback.c          |  2 +
 fs/afs/inode.c             | 88 +++++++++++++++++++-------------------
 fs/afs/internal.h          |  2 +
 fs/afs/rotate.c            |  1 +
 include/trace/events/afs.h |  8 +++-
 5 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1dd7543dbf9f..1b4d5809808d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -60,6 +60,7 @@ void afs_init_callback_state(struct afs_server *server)
 	rcu_read_lock();
 	do {
 		server->cb_s_break++;
+		atomic_inc(&server->cell->fs_s_break);
 		if (!list_empty(&server->cell->fs_open_mmaps))
 			queue_work(system_unbound_wq, &server->initcb_work);
 
@@ -77,6 +78,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 	clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
 	if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		vnode->cb_break++;
+		vnode->cb_v_break = vnode->volume->cb_v_break;
 		afs_clear_permits(vnode);
 
 		if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 80b6c8d967d5..126daf9969db 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -587,22 +587,32 @@ static void afs_zap_data(struct afs_vnode *vnode)
 }
 
 /*
- * Get the server reinit counter for a vnode's current server.
+ * Check to see if we have a server currently serving this volume and that it
+ * hasn't been reinitialised or dropped from the list.
  */
-static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break)
+static bool afs_check_server_good(struct afs_vnode *vnode)
 {
-	struct afs_server_list *slist = rcu_dereference(vnode->volume->servers);
+	struct afs_server_list *slist;
 	struct afs_server *server;
+	bool good;
 	int i;
 
+	if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
+		return true;
+
+	rcu_read_lock();
+
+	slist = rcu_dereference(vnode->volume->servers);
 	for (i = 0; i < slist->nr_servers; i++) {
 		server = slist->servers[i].server;
 		if (server == vnode->cb_server) {
-			*_s_break = READ_ONCE(server->cb_s_break);
-			return true;
+			good = (vnode->cb_s_break == server->cb_s_break);
+			rcu_read_unlock();
+			return good;
 		}
 	}
 
+	rcu_read_unlock();
 	return false;
 }
 
@@ -611,57 +621,46 @@ static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break)
  */
 bool afs_check_validity(struct afs_vnode *vnode)
 {
-	struct afs_volume *volume = vnode->volume;
 	enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
 	time64_t now = ktime_get_real_seconds();
-	bool valid;
-	unsigned int cb_break, cb_s_break, cb_v_break;
+	unsigned int cb_break;
 	int seq = 0;
 
 	do {
 		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
-		cb_v_break = READ_ONCE(volume->cb_v_break);
 		cb_break = vnode->cb_break;
 
-		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
-		    afs_get_s_break_rcu(vnode, &cb_s_break)) {
-			if (vnode->cb_s_break != cb_s_break ||
-			    vnode->cb_v_break != cb_v_break) {
-				vnode->cb_s_break = cb_s_break;
-				vnode->cb_v_break = cb_v_break;
-				need_clear = afs_cb_break_for_vsbreak;
-				valid = false;
-			} else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+			if (vnode->cb_v_break != vnode->volume->cb_v_break)
+				need_clear = afs_cb_break_for_v_break;
+			else if (!afs_check_server_good(vnode))
+				need_clear = afs_cb_break_for_s_reinit;
+			else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 				need_clear = afs_cb_break_for_zap;
-				valid = false;
-			} else if (vnode->cb_expires_at - 10 <= now) {
+			else if (vnode->cb_expires_at - 10 <= now)
 				need_clear = afs_cb_break_for_lapsed;
-				valid = false;
-			} else {
-				valid = true;
-			}
 		} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-			valid = true;
+			;
 		} else {
-			vnode->cb_v_break = cb_v_break;
-			valid = false;
+			need_clear = afs_cb_break_no_promise;
 		}
 
 	} while (need_seqretry(&vnode->cb_lock, seq));
 
 	done_seqretry(&vnode->cb_lock, seq);
 
-	if (need_clear != afs_cb_break_no_break) {
-		write_seqlock(&vnode->cb_lock);
-		if (cb_break == vnode->cb_break)
-			__afs_break_callback(vnode, need_clear);
-		else
-			trace_afs_cb_miss(&vnode->fid, need_clear);
-		write_sequnlock(&vnode->cb_lock);
-		valid = false;
-	}
+	if (need_clear == afs_cb_break_no_break)
+		return true;
 
-	return valid;
+	write_seqlock(&vnode->cb_lock);
+	if (need_clear == afs_cb_break_no_promise)
+		vnode->cb_v_break = vnode->volume->cb_v_break;
+	else if (cb_break == vnode->cb_break)
+		__afs_break_callback(vnode, need_clear);
+	else
+		trace_afs_cb_miss(&vnode->fid, need_clear);
+	write_sequnlock(&vnode->cb_lock);
+	return false;
 }
 
 /*
@@ -675,21 +674,20 @@ bool afs_check_validity(struct afs_vnode *vnode)
  */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
-	bool valid;
 	int ret;
 
 	_enter("{v={%llx:%llu} fl=%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));
 
-	rcu_read_lock();
-	valid = afs_check_validity(vnode);
-	rcu_read_unlock();
+	if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+		if (vnode->vfs_inode.i_nlink)
+			clear_nlink(&vnode->vfs_inode);
+		goto valid;
+	}
 
-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-		clear_nlink(&vnode->vfs_inode);
-
-	if (valid)
+	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+	    afs_check_validity(vnode))
 		goto valid;
 
 	down_write(&vnode->validate_lock);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0deeb76c67d0..c97618855b46 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -392,6 +392,7 @@ struct afs_cell {
 	seqlock_t		fs_lock;	/* For fs_servers  */
 	struct rw_semaphore	fs_open_mmaps_lock;
 	struct list_head	fs_open_mmaps;	/* List of vnodes that are mmapped */
+	atomic_t		fs_s_break;	/* Counter of CB.InitCallBackState messages */
 
 	/* VL server list. */
 	rwlock_t		vl_servers_lock; /* Lock on vl_servers */
@@ -664,6 +665,7 @@ struct afs_vnode {
 	struct list_head	cb_mmap_link;	/* Link in cell->fs_open_mmaps */
 	void			*cb_server;	/* Server with callback/filelock */
 	atomic_t		cb_nr_mmap;	/* Number of mmaps */
+	unsigned int		cb_fs_s_break;	/* Mass server break counter (cell->fs_s_break) */
 	unsigned int		cb_s_break;	/* Mass break counter on ->server */
 	unsigned int		cb_v_break;	/* Mass break counter on ->volume */
 	unsigned int		cb_break;	/* Break counter on vnode */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index d83f13c44b92..79e1a5f6701b 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -374,6 +374,7 @@ selected_server:
 	if (vnode->cb_server != server) {
 		vnode->cb_server = server;
 		vnode->cb_s_break = server->cb_s_break;
+		vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
 		vnode->cb_v_break = vnode->volume->cb_v_break;
 		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 	}
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 9f73ed2cf061..bca73e8c8cde 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -306,11 +306,13 @@ enum afs_flock_operation {
 
 enum afs_cb_break_reason {
 	afs_cb_break_no_break,
+	afs_cb_break_no_promise,
 	afs_cb_break_for_callback,
 	afs_cb_break_for_deleted,
 	afs_cb_break_for_lapsed,
+	afs_cb_break_for_s_reinit,
 	afs_cb_break_for_unlink,
-	afs_cb_break_for_vsbreak,
+	afs_cb_break_for_v_break,
 	afs_cb_break_for_volume_callback,
 	afs_cb_break_for_zap,
 };
@@ -602,11 +604,13 @@ enum afs_cb_break_reason {
 
 #define afs_cb_break_reasons						\
 	EM(afs_cb_break_no_break,		"no-break")		\
+	EM(afs_cb_break_no_promise,		"no-promise")		\
 	EM(afs_cb_break_for_callback,		"break-cb")		\
 	EM(afs_cb_break_for_deleted,		"break-del")		\
 	EM(afs_cb_break_for_lapsed,		"break-lapsed")		\
+	EM(afs_cb_break_for_s_reinit,		"s-reinit")		\
 	EM(afs_cb_break_for_unlink,		"break-unlink")		\
-	EM(afs_cb_break_for_vsbreak,		"break-vs")		\
+	EM(afs_cb_break_for_v_break,		"break-v")		\
 	EM(afs_cb_break_for_volume_callback,	"break-v-cb")		\
 	E_(afs_cb_break_for_zap,		"break-zap")
 

From b537a3c21775075395af475dcc6ef212fcf29db8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 10 Sep 2021 00:01:52 +0100
Subject: [PATCH 7/8] afs: Fix corruption in reads at fpos 2G-4G from an
 OpenAFS server

AFS-3 has two data fetch RPC variants, FS.FetchData and FS.FetchData64, and
Linux's afs client switches between them when talking to a non-YFS server
if the read size, the file position or the sum of the two have the upper 32
bits set of the 64-bit value.

This is a problem, however, since the file position and length fields of
FS.FetchData are *signed* 32-bit values.

Fix this by capturing the capability bits obtained from the fileserver when
it's sent an FS.GetCapabilities RPC, rather than just discarding them, and
then picking out the VICED_CAPABILITY_64BITFILES flag.  This can then be
used to decide whether to use FS.FetchData or FS.FetchData64 - and also
FS.StoreData or FS.StoreData64 - rather than using upper_32_bits() to
switch on the parameter values.

This capabilities flag could also be used to limit the maximum size of the
file, but all servers must be checked for that.

Note that the issue does not exist with FS.StoreData - that uses *unsigned*
32-bit values.  It's also not a problem with Auristor servers as its
YFS.FetchData64 op uses unsigned 64-bit values.

This can be tested by cloning a git repo through an OpenAFS client to an
OpenAFS server and then doing "git status" on it from a Linux afs
client[1].  Provided the clone has a pack file that's in the 2G-4G range,
the git status will show errors like:

	error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
	error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index

This can be observed in the server's FileLog with something like the
following appearing:

Sun Aug 29 19:31:39 2021 SRXAFS_FetchData, Fid = 2303380852.491776.3263114, Host 192.168.11.201:7001, Id 1001
Sun Aug 29 19:31:39 2021 CheckRights: len=0, for host=192.168.11.201:7001
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: Pos 18446744071815340032, Len 3154
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: file size 2400758866
...
Sun Aug 29 19:31:40 2021 SRXAFS_FetchData returns 5

Note the file position of 18446744071815340032.  This is the requested file
position sign-extended.

Fixes: b9b1f8d5930a ("AFS: write support fixes")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
cc: openafs-devel@openafs.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c9 [1]
Link: https://lore.kernel.org/r/951332.1631308745@warthog.procyon.org.uk/
---
 fs/afs/fs_probe.c     |  8 +++++++-
 fs/afs/fsclient.c     | 31 ++++++++++++++++++++-----------
 fs/afs/internal.h     |  1 +
 fs/afs/protocol_afs.h | 15 +++++++++++++++
 fs/afs/protocol_yfs.h |  6 ++++++
 5 files changed, 49 insertions(+), 12 deletions(-)
 create mode 100644 fs/afs/protocol_afs.h

diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index e7e98ad63a91..c0031a3ab42f 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -9,6 +9,7 @@
 #include <linux/slab.h>
 #include "afs_fs.h"
 #include "internal.h"
+#include "protocol_afs.h"
 #include "protocol_yfs.h"
 
 static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
@@ -102,7 +103,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
 	struct afs_addr_list *alist = call->alist;
 	struct afs_server *server = call->server;
 	unsigned int index = call->addr_ix;
-	unsigned int rtt_us = 0;
+	unsigned int rtt_us = 0, cap0;
 	int ret = call->error;
 
 	_enter("%pU,%u", &server->uuid, index);
@@ -159,6 +160,11 @@ responded:
 			clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
 			alist->addrs[index].srx_service = call->service_id;
 		}
+		cap0 = ntohl(call->tmp);
+		if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+			set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
+		else
+			clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
 	}
 
 	if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index dd3f45d906d2..4943413d9c5f 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -456,9 +456,7 @@ void afs_fs_fetch_data(struct afs_operation *op)
 	struct afs_read *req = op->fetch.req;
 	__be32 *bp;
 
-	if (upper_32_bits(req->pos) ||
-	    upper_32_bits(req->len) ||
-	    upper_32_bits(req->pos + req->len))
+	if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
 		return afs_fs_fetch_data64(op);
 
 	_enter("");
@@ -1113,9 +1111,7 @@ void afs_fs_store_data(struct afs_operation *op)
 	       (unsigned long long)op->store.pos,
 	       (unsigned long long)op->store.i_size);
 
-	if (upper_32_bits(op->store.pos) ||
-	    upper_32_bits(op->store.size) ||
-	    upper_32_bits(op->store.i_size))
+	if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
 		return afs_fs_store_data64(op);
 
 	call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData,
@@ -1229,7 +1225,7 @@ static void afs_fs_setattr_size(struct afs_operation *op)
 	       key_serial(op->key), vp->fid.vid, vp->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
-	if (upper_32_bits(attr->ia_size))
+	if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
 		return afs_fs_setattr_size64(op);
 
 	call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData_as_Status,
@@ -1657,20 +1653,33 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
 			return ret;
 
 		count = ntohl(call->tmp);
-
 		call->count = count;
 		call->count2 = count;
-		afs_extract_discard(call, count * sizeof(__be32));
+		if (count == 0) {
+			call->unmarshall = 4;
+			call->tmp = 0;
+			break;
+		}
+
+		/* Extract the first word of the capabilities to call->tmp */
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
 		fallthrough;
 
-		/* Extract capabilities words */
 	case 2:
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
-		/* TODO: Examine capabilities */
+		afs_extract_discard(call, (count - 1) * sizeof(__be32));
+		call->unmarshall++;
+		fallthrough;
+
+		/* Extract remaining capabilities words */
+	case 3:
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
 
 		call->unmarshall++;
 		break;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c97618855b46..b0fe27ae60db 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -520,6 +520,7 @@ struct afs_server {
 #define AFS_SERVER_FL_IS_YFS	16		/* Server is YFS not AFS */
 #define AFS_SERVER_FL_NO_IBULK	17		/* Fileserver doesn't support FS.InlineBulkStatus */
 #define AFS_SERVER_FL_NO_RM2	18		/* Fileserver doesn't support YFS.RemoveFile2 */
+#define AFS_SERVER_FL_HAS_FS64	19		/* Fileserver supports FS.{Fetch,Store}Data64 */
 	atomic_t		ref;		/* Object refcount */
 	atomic_t		active;		/* Active user count */
 	u32			addr_version;	/* Address list version */
diff --git a/fs/afs/protocol_afs.h b/fs/afs/protocol_afs.h
new file mode 100644
index 000000000000..0c39358c8b70
--- /dev/null
+++ b/fs/afs/protocol_afs.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* AFS protocol bits
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+
+#define AFSCAPABILITIESMAX 196 /* Maximum number of words in a capability set */
+
+/* AFS3 Fileserver capabilities word 0 */
+#define AFS3_VICED_CAPABILITY_ERRORTRANS	0x0001 /* Uses UAE errors */
+#define AFS3_VICED_CAPABILITY_64BITFILES	0x0002 /* FetchData64 & StoreData64 supported */
+#define AFS3_VICED_CAPABILITY_WRITELOCKACL	0x0004 /* Can lock a file even without lock perm */
+#define AFS3_VICED_CAPABILITY_SANEACLS		0x0008 /* ACLs reviewed for sanity - don't use */
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index b5bd03b1d3c7..e4cd89c44c46 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -168,3 +168,9 @@ enum yfs_lock_type {
 	yfs_LockMandatoryWrite	= 0x101,
 	yfs_LockMandatoryExtend	= 0x102,
 };
+
+/* RXYFS Viced Capability Flags */
+#define YFS_VICED_CAPABILITY_ERRORTRANS		0x0001 /* Deprecated v0.195 */
+#define YFS_VICED_CAPABILITY_64BITFILES		0x0002 /* Deprecated v0.195 */
+#define YFS_VICED_CAPABILITY_WRITELOCKACL	0x0004 /* Can lock a file even without lock perm */
+#define YFS_VICED_CAPABILITY_SANEACLS		0x0008 /* Deprecated v0.195 */

From 9d37e1cab2a9d2cee2737973fa455e6f89eee46a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 8 Sep 2021 21:55:19 +0100
Subject: [PATCH 8/8] afs: Fix updating of i_blocks on file/dir extension

When an afs file or directory is modified locally such that the total file
size is extended, i_blocks needs to be recalculated too.

Fix this by making afs_write_end() and afs_edit_dir_add() call
afs_set_i_size() rather than setting inode->i_size directly as that also
recalculates inode->i_blocks.

This can be tested by creating and writing into directories and files and
then examining them with du.  Without this change, directories show a 4
blocks (they start out at 2048 bytes) and files show 0 blocks; with this
change, they should show a number of blocks proportional to the file size
rounded up to 1024.

Fixes: 31143d5d515e ("AFS: implement basic file write support")
Fixes: 63a4681ff39c ("afs: Locally edit directory data for mkdir/create/unlink/...")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/163113612442.352844.11162345591911691150.stgit@warthog.procyon.org.uk/
---
 fs/afs/dir_edit.c |  4 ++--
 fs/afs/inode.c    | 10 ----------
 fs/afs/internal.h | 10 ++++++++++
 fs/afs/write.c    |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index f4600c1353ad..540b9fc96824 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -263,7 +263,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 		if (b == nr_blocks) {
 			_debug("init %u", b);
 			afs_edit_init_block(meta, block, b);
-			i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE);
+			afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
 		}
 
 		/* Only lower dir pages have a counter in the header. */
@@ -296,7 +296,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 new_directory:
 	afs_edit_init_block(meta, meta, 0);
 	i_size = AFS_DIR_BLOCK_SIZE;
-	i_size_write(&vnode->vfs_inode, i_size);
+	afs_set_i_size(vnode, i_size);
 	slot = AFS_DIR_RESV_BLOCKS0;
 	page = page0;
 	block = meta;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 126daf9969db..8fcffea2daf5 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -53,16 +53,6 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
 		dump_stack();
 }
 
-/*
- * Set the file size and block count.  Estimate the number of 512 bytes blocks
- * used, rounded up to nearest 1K for consistency with other AFS clients.
- */
-static void afs_set_i_size(struct afs_vnode *vnode, u64 size)
-{
-	i_size_write(&vnode->vfs_inode, size);
-	vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
-}
-
 /*
  * Initialise an inode from the vnode status.
  */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b0fe27ae60db..0ad97a8fc0d4 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1596,6 +1596,16 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
 			(void *)(unsigned long)dir_vp->scb.status.data_version;
 }
 
+/*
+ * Set the file size and block count.  Estimate the number of 512 bytes blocks
+ * used, rounded up to nearest 1K for consistency with other AFS clients.
+ */
+static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
+{
+	i_size_write(&vnode->vfs_inode, size);
+	vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+}
+
 /*
  * Check for a conflicting operation on a directory that we just unlinked from.
  * If someone managed to sneak a link or an unlink in on the file we just
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 32a764c24284..2dfe3b3a53d6 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -137,7 +137,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 		write_seqlock(&vnode->cb_lock);
 		i_size = i_size_read(&vnode->vfs_inode);
 		if (maybe_i_size > i_size)
-			i_size_write(&vnode->vfs_inode, maybe_i_size);
+			afs_set_i_size(vnode, maybe_i_size);
 		write_sequnlock(&vnode->cb_lock);
 	}