Bug 20632

Summary: stacking mount_null causes an error: mount_null: Resource deadlock avoided
Product: Base System Reporter: akr <akr>
Component: kern    Assignee: freebsd-bugs (Nobody) <bugs>
Status: Closed FIXED    
Severity: Affects Only Me    
Priority: Normal    
Version: 4.1-RELEASE   
Hardware: Any   
OS: Any   

Description akr 2000-08-16 04:50:00 UTC
If null file system is stacked by mount_null, following error is caused.

  mount_null: Resource deadlock avoided

Then, umount cannot unmount the file system with same error and some
commands hangs forever.

How-To-Repeat: 
This machine has freshly installed FreeBSD 4.1-RELEASE with the default file
system hierarchy.

coulee# cd /
coulee# df
Filesystem  1K-blocks     Used    Avail Capacity  Mounted on
/dev/ad0s1a     49583    27200    18417    60%    /
/dev/ad0s1f   5521533   637931  4441880    13%    /usr
/dev/ad0s1e     19815      268    17962     1%    /var
procfs              4        4        0   100%    /proc
coulee# ls
.cshrc          cdrom           kernel          root            usr
.profile        compat          kernel.GENERIC  sbin            var
COPYRIGHT       dev             mnt             stand
bin             etc             modules         sys
boot            home            proc            tmp

Try to mount /home to /mnt.  This is no problem.

coulee# mount_null /home /mnt

Try to mount /home to /mnt once more.  This causes an error.

coulee# mount_null /home /mnt
mount_null: Resource deadlock avoided

Try to unmount it.  This causes the same error.

coulee# umount /mnt
umount: unmount of /mnt failed: Resource deadlock avoided

`ls /home' hangs up.

coulee# ls /home
^C^C^C^C
Comment 1 Sheldon Hearn 2000-08-16 09:01:58 UTC
On Wed, 16 Aug 2000 12:45:21 +0900, akr@m17n.org wrote:

> If null file system is stacked by mount_null, following error is caused.
> 
>   mount_null: Resource deadlock avoided
> 
> Then, umount cannot unmount the file system with same error and some
> commands hangs forever.

NULLFS is known to be broken.  You might want to try out the following
patch (unfortunately I forget who sent it, but I'll have a look) and
report back.

Ciao,
Sheldon.

Common subdirectories: ./nullfs.orig/CVS and ./nullfs/CVS
diff -c ./nullfs.orig/null_vfsops.c ./nullfs/null_vfsops.c
*** ./nullfs.orig/null_vfsops.c	Fri Aug  4 16:38:49 2000
--- ./nullfs/null_vfsops.c	Sun Aug  6 19:58:23 2000
***************
*** 57,66 ****
  
  static MALLOC_DEFINE(M_NULLFSMNT, "NULLFS mount", "NULLFS mount structure");
  
- static int	nullfs_fhtovp __P((struct mount *mp, struct fid *fidp,
- 				   struct vnode **vpp));
  static int	nullfs_checkexp __P((struct mount *mp, struct sockaddr *nam,
  				    int *extflagsp, struct ucred **credanonp));
  static int	nullfs_mount __P((struct mount *mp, char *path, caddr_t data,
  				  struct nameidata *ndp, struct proc *p));
  static int	nullfs_quotactl __P((struct mount *mp, int cmd, uid_t uid,
--- 57,69 ----
  
  static MALLOC_DEFINE(M_NULLFSMNT, "NULLFS mount", "NULLFS mount structure");
  
  static int	nullfs_checkexp __P((struct mount *mp, struct sockaddr *nam,
  				    int *extflagsp, struct ucred **credanonp));
+ static int	nullfs_extattrctl __P((struct mount *mp, int cmd,
+ 				       const char *attrname, caddr_t arg,
+ 				       struct proc *p));
+ static int	nullfs_fhtovp __P((struct mount *mp, struct fid *fidp,
+ 				   struct vnode **vpp));
  static int	nullfs_mount __P((struct mount *mp, char *path, caddr_t data,
  				  struct nameidata *ndp, struct proc *p));
  static int	nullfs_quotactl __P((struct mount *mp, int cmd, uid_t uid,
***************
*** 372,378 ****
  	struct proc *p;
  {
  	/*
! 	 * XXX - Assumes no data cached at null layer.
  	 */
  	return (0);
  }
--- 375,383 ----
  	struct proc *p;
  {
  	/*
! 	 * XXX There are no buffers to flush, as we don't
! 	 *     have VOP_BMAP. But there are pages. Don't 
! 	 *     know how to flush pages.
  	 */
  	return (0);
  }
***************
*** 383,390 ****
  	ino_t ino;
  	struct vnode **vpp;
  {
  
! 	return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp);
  }
  
  static int
--- 388,399 ----
  	ino_t ino;
  	struct vnode **vpp;
  {
+ 	int error;
+ 	error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp);
+ 	if (error)
+ 		return (error);
  
! 	return (null_node_create(mp, *vpp, vpp));
  }
  
  static int
***************
*** 393,400 ****
  	struct fid *fidp;
  	struct vnode **vpp;
  {
  
! 	return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, vpp);
  }
  
  static int
--- 402,413 ----
  	struct fid *fidp;
  	struct vnode **vpp;
  {
+ 	int error;
+ 	error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, vpp);
+ 	if (error)
+ 		return (error);
  
! 	return (null_node_create(mp, *vpp, vpp));
  }
  
  static int
diff -c ./nullfs.orig/null_vnops.c ./nullfs/null_vnops.c
*** ./nullfs.orig/null_vnops.c	Fri Aug  4 16:38:49 2000
--- ./nullfs/null_vnops.c	Sun Aug  6 20:05:18 2000
***************
*** 37,43 ****
   *
   * Ancestors:
   *	@(#)lofs_vnops.c	1.2 (Berkeley) 6/18/92
-  * $FreeBSD: src/sys/miscfs/nullfs/null_vnops.c,v 1.38 1999/12/11 16:12:56 eivind Exp $
   *	...and...
   *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
   *
--- 37,42 ----
***************
*** 183,188 ****
--- 182,193 ----
  #include <sys/namei.h>
  #include <sys/malloc.h>
  #include <sys/buf.h>
+ 
+ #include <vm/vm.h>
+ #include <vm/vm_extern.h>
+ #include <vm/vm_object.h>
+ #include <vm/vnode_pager.h>
+ 
  #include <miscfs/nullfs/null.h>
  
  static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
***************
*** 191,203 ****
--- 196,341 ----
  
  static int	null_access __P((struct vop_access_args *ap));
  static int	null_getattr __P((struct vop_getattr_args *ap));
+ static int	null_getpages __P((struct vop_getpages_args *ap));
  static int	null_inactive __P((struct vop_inactive_args *ap));
  static int	null_lock __P((struct vop_lock_args *ap));
  static int	null_lookup __P((struct vop_lookup_args *ap));
+ static int	null_open __P((struct vop_open_args *));
  static int	null_print __P((struct vop_print_args *ap));
+ static int	null_putpages __P((struct vop_putpages_args *));
  static int	null_reclaim __P((struct vop_reclaim_args *ap));
+ static int	null_rename __P((struct vop_rename_args *));
  static int	null_setattr __P((struct vop_setattr_args *ap));
+ static int	null_strategy __P((struct vop_strategy_args *ap));
  static int	null_unlock __P((struct vop_unlock_args *ap));
+ static int	null_write __P((struct vop_write_args *ap));
+ 
+ /*
+  * We handle this to eliminate null FS to lower FS
+  * file moving. Don't know why we don't allow this,
+  * possibly we should.
+  */
+ static int
+ null_rename(ap)
+ 	struct vop_rename_args /* {
+ 		struct vnode *a_fdvp;
+ 		struct vnode *a_fvp;
+ 		struct componentname *a_fcnp;
+ 		struct vnode *a_tdvp;
+ 		struct vnode *a_tvp;
+ 		struct componentname *a_tcnp;
+ 	} */ *ap;
+ {
+ 	struct vnode *tdvp = ap->a_tdvp;
+ 	struct vnode *fvp = ap->a_fvp;
+ 	struct vnode *fdvp = ap->a_fdvp;
+ 	struct vnode *tvp = ap->a_tvp;
+ 
+ 	/* Check for cross-device rename. */
+ 	if ((fvp->v_mount != tdvp->v_mount) ||
+ 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
+ 		if (tdvp == tvp)
+ 			vrele(tdvp);
+ 		else
+ 			vput(tdvp);
+ 		if (tvp)
+ 			vput(tvp);
+ 		vrele(fdvp);
+ 		vrele(fvp);
+ 		return (EXDEV);
+ 	}
+ 	
+ 	return (null_bypass((struct vop_generic_args *)ap));
+ }
+ 
+ /*
+  * We handle this to create vm_object for lowervp as
+  * vn_open does for us.
+  */
+ int
+ null_open(ap)
+ 	struct vop_open_args *ap;
+ {
+ 	int error;
+ 
+ 	/* Process operation */
+ 	error = null_bypass((struct vop_generic_args *)ap);
+ 	if (error)
+ 		return (error);
+ 
+ 	/* Create object, vfs_object_create() check itself if vnode 
+ 	 * already have one or if it can't have one */
+ 	vfs_object_create (NULLVPTOLOWERVP(ap->a_vp), ap->a_p, ap->a_cred);
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Panic VOP_STRATEGY operation. Shouldn't happen, as we
+  * don't have VOP_BMAP.
+  */
+ static int
+ null_strategy(ap)
+ 	struct vop_strategy_args *ap;
+ {
+ 	panic("null_strategy: NOT IMPLEMENTED!");
+ 
+ 	return (EOPNOTSUPP);
+ }
+ 
+ /*
+  * We need to look for file size changes, to keep
+  * vnode_pager up with correct file size.
+  */
+ static int
+ null_write(ap)
+ 	struct vop_write_args /* {
+ 		struct vnode *a_vp;
+ 		struct uio *a_uio;
+ 		int  a_ioflag;
+ 		struct ucred *a_cred;
+ 	} */ *ap;
+ {
+ 	struct vnode *vp = ap->a_vp;
+ 	struct uio *uio = ap->a_uio;
+ 	off_t newsize;
+ 	off_t oldsize;
+ 	int error;
+ 
+ 	/* Process write operation */
+ 	error = null_bypass((struct vop_generic_args *)ap);
+ 
+ 	/* Update vm_object size, if exist and file size changed */
+ 	if (vp->v_object) {
+ 		oldsize = vp->v_object->un_pager.vnp.vnp_size;
+ 		newsize = uio->uio_offset;
+ 		if (newsize > oldsize)
+ 			vnode_pager_setsize(vp, newsize);
+ 	}
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * We use generic (put|get)pages routines and no VOP_BMAP operation,
+  * this force vnode_pager to use VOP_READ and VOP_WRITE
+  * to access pages.
+  */
+ int
+ null_getpages(ap)
+ 	struct vop_getpages_args *ap;
+ {
+ 	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+ 		ap->a_reqpage);
+ }
+ 
+ int
+ null_putpages(ap)
+ 	struct vop_putpages_args *ap;
+ {
+ 	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
+ 		ap->a_sync, ap->a_rtvals);
+ }
  
  /*
   * This is the 10-Apr-92 bypass routine.
***************
*** 389,394 ****
--- 527,534 ----
  
  /*
   * Setattr call. Disallow write attempts if the layer is mounted read-only.
+  * Also look for file size change, and do appropriate vm_object
+  * modification.
   */
  int
  null_setattr(ap)
***************
*** 402,407 ****
--- 542,548 ----
  {
  	struct vnode *vp = ap->a_vp;
  	struct vattr *vap = ap->a_vap;
+ 	int error;
  
    	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
  	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
***************
*** 430,436 ****
  				return (EROFS);
  		}
  	}
! 	return (null_bypass((struct vop_generic_args *)ap));
  }
  
  /*
--- 571,590 ----
  				return (EROFS);
  		}
  	}
! 
! 	error = null_bypass((struct vop_generic_args *)ap);
! 	if (error || vp->v_object == NULL || vap->va_size == VNOVAL)
! 		return (error);
! 
! 	/* If they have changed file size, tell to vnode_pager.
! 	 * XXX If they have shrink the file, do we need to
! 	 *     invalidate buffers with vtruncbuf()? I think
! 	 *     no.
! 	 */
! 	if (vap->va_size != vp->v_object->un_pager.vnp.vnp_size)
! 		vnode_pager_setsize(vp, vap->va_size);
! 
! 	return (error);
  }
  
  /*
***************
*** 449,457 ****
--- 603,618 ----
  
  	if ((error = null_bypass((struct vop_generic_args *)ap)) != 0)
  		return (error);
+ 
+ 	/* Fake fsid, also done in vn_stat(), but... */
+ 	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
+ 
  	return (0);
  }
  
+ /*
+  * Handle to disallow write access if mounted read-only.
+  */
  static int
  null_access(ap)
  	struct vop_access_args /* {
***************
*** 519,528 ****
--- 680,700 ----
  	} */ *ap;
  {
  	vop_nounlock(ap);
+ 
+ 	/* Support unlocking of inactive vnodes */
+ 	if (NULLVPTOLOWERVP(ap->a_vp) == NULLVP)
+ 		return (0);
+ 
  	ap->a_flags &= ~LK_INTERLOCK;
  	return (null_bypass((struct vop_generic_args *)ap));
  }
  
+ /*
+  * We have to vrele lowervp as soon as possible. Otherway
+  * we can never discover that someone has removed file
+  * on lower FS. This technique removes advantages of hash,
+  * but i don't know other way.
+  */
  static int
  null_inactive(ap)
  	struct vop_inactive_args /* {
***************
*** 533,557 ****
  	struct vnode *vp = ap->a_vp;
  	struct null_node *xp = VTONULL(vp);
  	struct vnode *lowervp = xp->null_lowervp;
! 	/*
! 	 * Do nothing (and _don't_ bypass).
! 	 * Wait to vrele lowervp until reclaim,
! 	 * so that until then our null_node is in the
! 	 * cache and reusable.
! 	 * We still have to tell the lower layer the vnode
! 	 * is now inactive though.
! 	 *
! 	 * NEEDSWORK: Someday, consider inactive'ing
! 	 * the lowervp and then trying to reactivate it
! 	 * with capabilities (v_id)
! 	 * like they do in the name lookup cache code.
! 	 * That's too much work for now.
! 	 */
! 	VOP_INACTIVE(lowervp, ap->a_p);
  	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
  	return (0);
  }
  
  static int
  null_reclaim(ap)
  	struct vop_reclaim_args /* {
--- 705,729 ----
  	struct vnode *vp = ap->a_vp;
  	struct null_node *xp = VTONULL(vp);
  	struct vnode *lowervp = xp->null_lowervp;
! 
! 	/* Release lowervp totaly */
! 	VOP_UNLOCK(lowervp, 0, ap->a_p);
! 	vrele (lowervp);
! 
! 	/* Forget all we knew, remove hash entry */
! 	xp->null_lowervp = NULLVP;
! 	LIST_REMOVE(xp, null_hash);
! 
! 	/* Unlock our vnode */
  	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
+ 
  	return (0);
  }
  
+ /*
+  * We can free memory in null_inactive, but we do this
+  * here. (Possible to guard vp->v_data to point somewhere)
+  */
  static int
  null_reclaim(ap)
  	struct vop_reclaim_args /* {
***************
*** 560,578 ****
  	} */ *ap;
  {
  	struct vnode *vp = ap->a_vp;
- 	struct null_node *xp = VTONULL(vp);
- 	struct vnode *lowervp = xp->null_lowervp;
  
- 	/*
- 	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
- 	 * so we can't call VOPs on ourself.
- 	 */
- 	/* After this assignment, this node will not be re-used. */
- 	xp->null_lowervp = NULLVP;
- 	LIST_REMOVE(xp, null_hash);
  	FREE(vp->v_data, M_TEMP);
  	vp->v_data = NULL;
! 	vrele (lowervp);
  	return (0);
  }
  
--- 732,741 ----
  	} */ *ap;
  {
  	struct vnode *vp = ap->a_vp;
  
  	FREE(vp->v_data, M_TEMP);
  	vp->v_data = NULL;
! 
  	return (0);
  }
  
***************
*** 593,607 ****
--- 756,778 ----
  vop_t **null_vnodeop_p;
  static struct vnodeopv_entry_desc null_vnodeop_entries[] = {
  	{ &vop_default_desc,		(vop_t *) null_bypass },
+ 
  	{ &vop_access_desc,		(vop_t *) null_access },
+ 	{ &vop_bmap_desc,		(vop_t *) vop_eopnotsupp },
  	{ &vop_getattr_desc,		(vop_t *) null_getattr },
+ 	{ &vop_getpages_desc,		(vop_t *) null_getpages },
  	{ &vop_inactive_desc,		(vop_t *) null_inactive },
  	{ &vop_lock_desc,		(vop_t *) null_lock },
  	{ &vop_lookup_desc,		(vop_t *) null_lookup },
+ 	{ &vop_open_desc,		(vop_t *) null_open },
  	{ &vop_print_desc,		(vop_t *) null_print },
+ 	{ &vop_putpages_desc,		(vop_t *) null_putpages },
  	{ &vop_reclaim_desc,		(vop_t *) null_reclaim },
+ 	{ &vop_rename_desc,		(vop_t *) null_rename },
  	{ &vop_setattr_desc,		(vop_t *) null_setattr },
+ 	{ &vop_strategy_desc,		(vop_t *) null_strategy },
  	{ &vop_unlock_desc,		(vop_t *) null_unlock },
+ 	{ &vop_write_desc,		(vop_t *) null_write },
  	{ NULL, NULL }
  };
  static struct vnodeopv_desc null_vnodeop_opv_desc =
Comment 2 Sheldon Hearn freebsd_committer freebsd_triage 2000-08-16 09:02:08 UTC
State Changed
From-To: open->feedback

I've sent the originator a patch, the origin of which I'm still 
investigating.
Comment 3 akr 2000-08-16 11:56:20 UTC
In article <14634.966412918@axl.ops.uunet.co.za>,
  Sheldon Hearn <sheldonh@uunet.co.za> writes:

> NULLFS is known to be broken. 

Sorry.  I overlooked the note in mount_null(8).

> You might want to try out the following
> patch (unfortunately I forget who sent it, but I'll have a look) and
> report back.

Unfortunately the patch doesn't fix the problem.
-- 
Tanaka Akira
Comment 4 Sheldon Hearn 2000-08-16 14:18:46 UTC
On 16 Aug 2000 19:56:20 +0900, Tanaka Akira wrote:

> > You might want to try out the following
> > patch (unfortunately I forget who sent it, but I'll have a look) and
> > report back.
> 
> Unfortunately the patch doesn't fix the problem.

I've asked Ustimenko Semen <semen@iclub.nsu.ru>, the author of the
patch, to take a look at your PR.

If you are not yet familiar with kernel debugging and would like to help
Ustimenko investigate this problem, now would be a good time to have a
look at http://www.freebsd.org/handbook/kerneldebug.html .

Ciao,
Sheldon.
Comment 5 Tor Egge 2000-08-16 19:52:29 UTC
>  > Unfortunately the patch doesn't fix the problem.
>  
>  I've asked Ustimenko Semen <semen@iclub.nsu.ru>, the author of the
>  patch, to take a look at your PR.

Here is yet another nullfs patch for -current.  It lacks some
of the features in Ustimenko's patch (e.g. support for nfs exporting
nfs mounted file systems), but it avoids the aliasing problem.


Index: sbin/mount_null/mount_null.c
===================================================================
RCS file: /home/ncvs/src/sbin/mount_null/mount_null.c,v
retrieving revision 1.14
diff -u -r1.14 mount_null.c
--- sbin/mount_null/mount_null.c	2000/07/28 11:54:08	1.14
+++ sbin/mount_null/mount_null.c	2000/08/06 19:16:28
@@ -101,9 +101,11 @@
 	(void)checkpath(argv[0], target);
 	(void)checkpath(argv[1], source);
 
+#if 0
 	if (subdir(target, source) || subdir(source, target))
 		errx(EX_USAGE, "%s (%s) and %s are not distinct paths",
 		    argv[0], target, argv[1]);
+#endif
 
 	args.target = target;
 
Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.267
diff -u -r1.267 vfs_subr.c
--- sys/kern/vfs_subr.c	2000/07/24 05:28:29	1.267
+++ sys/kern/vfs_subr.c	2000/08/06 19:16:46
@@ -711,7 +711,7 @@
 	 */
 	simple_lock(&vp->v_interlock);
 	object = vp->v_object;
-	if (object != NULL) {
+	if (object != NULL && (void *) vp == object->handle) {
 		vm_object_page_remove(object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 	}
@@ -841,6 +841,8 @@
 	int s;
 
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+	if (vp->v_tag == VT_NULL)
+		panic("bgetvp: null node");
 
 	vhold(vp);
 	bp->b_vp = vp;
@@ -1190,6 +1192,8 @@
 			}
 			vn_syncer_add_to_worklist(newvp, delay);
 		}
+		if (newvp->v_tag == VT_NULL)
+			panic("reassignbuf: null node dirty list");
 		bp->b_xflags |= BX_VNDIRTY;
 		tbp = TAILQ_FIRST(listheadp);
 		if (tbp == NULL ||
@@ -1243,6 +1247,8 @@
 			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 		}
 	} else {
+		if (newvp->v_tag == VT_NULL)
+			panic("reassignbuf: null node clean list");
 		bp->b_xflags |= BX_VNCLEAN;
 		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
 		if ((newvp->v_flag & VONWORKLST) &&
@@ -1683,7 +1689,7 @@
 			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
 	}
 
-	if ((obj = vp->v_object) != NULL) {
+	if ((obj = vp->v_object) != NULL && (void *) vp == obj->handle) {
 		if (obj->ref_count == 0) {
 			/*
 			 * vclean() may be called twice. The first time
@@ -2529,7 +2535,8 @@
 
 		simple_lock(&vp->v_interlock);
 		if (vp->v_object &&
-		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
+		    (void *) vp == vp->v_object->handle &&
+		    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
 			if (!vget(vp,
 				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
 				if (vp->v_object) {
@@ -2613,6 +2620,21 @@
 
 	s = splbio();
 	simple_lock(&vnode_free_list_slock);
+	if ((vp->v_flag & VFREE) != 0)
+		panic("vfree called on doomed node");
+	if ((vp->v_flag & VFREE) != 0)
+		panic("vfree called on free node");
+	if (vp->v_holdcnt != 0)
+		panic("vfree called with holdcnt=%d (>0)", vp->v_holdcnt);
+	if (vp->v_usecount != 0)
+		panic("vfree called with usecnt=%d (>0)", vp->v_usecount);
+	if (vp->v_object != NULL &&
+	    vp->v_object->handle == (void *) vp &&
+	    (vp->v_object->resident_page_count != 0 ||
+	     vp->v_object->ref_count != 0))
+	    panic("vfree called with object RPC: %d, RC: %d",
+		  vp->v_object->resident_page_count,
+		  vp->v_object->ref_count);
 	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
 	if (vp->v_flag & VAGE) {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
Index: sys/miscfs/nullfs/null.h
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/null.h,v
retrieving revision 1.13
diff -u -r1.13 null.h
--- sys/miscfs/nullfs/null.h	2000/05/26 02:04:58	1.13
+++ sys/miscfs/nullfs/null.h	2000/08/06 19:16:47
@@ -55,7 +55,15 @@
 	LIST_ENTRY(null_node)	null_hash;	/* Hash list */
 	struct vnode	        *null_lowervp;	/* VREFed once */
 	struct vnode		*null_vnode;	/* Back pointer */
+	int flags;				/* flags (see below) */
+	pid_t lockholder;			/* owner of exclusive lock */
 };
+
+/* Null node flags */
+
+#define NN_LOCK		0x01
+#define NN_WANT		0x02
+#define NN_FSYNC_SKIPPED 0x04
 
 extern int nullfs_init __P((struct vfsconf *vfsp));
 extern int null_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp));
Index: sys/miscfs/nullfs/null_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_subr.c,v
retrieving revision 1.23
diff -u -r1.23 null_subr.c
--- sys/miscfs/nullfs/null_subr.c	2000/05/26 02:04:59	1.23
+++ sys/miscfs/nullfs/null_subr.c	2000/08/06 19:16:47
@@ -45,6 +45,8 @@
 #include <sys/mount.h>
 #include <sys/malloc.h>
 #include <miscfs/nullfs/null.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
 
 #define LOG2_SIZEVNODE 7		/* log2(sizeof struct vnode) */
 #define	NNULLNODECACHE 16
@@ -94,6 +96,7 @@
 	struct null_node_hashhead *hd;
 	struct null_node *a;
 	struct vnode *vp;
+	int retry;
 
 	/*
 	 * Find hash base, and then search the (two-way) linked
@@ -102,6 +105,7 @@
 	 * reference count (but NOT the lower vnode's VREF counter).
 	 */
 	hd = NULL_NHASH(lowervp);
+	retry = 0;
 loop:
 	for (a = hd->lh_first; a != 0; a = a->null_hash.le_next) {
 		if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
@@ -112,6 +116,9 @@
 			 * the lower node.
 			 */
 			if (vget(vp, 0, p)) {
+				retry++;
+				if (retry < 2)
+					goto loop;
 				printf ("null_node_find: vget failed.\n");
 				goto loop;
 			};
@@ -138,6 +145,7 @@
 	struct null_node *xp;
 	struct vnode *othervp, *vp;
 	int error;
+	int waslocked;
 
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
@@ -157,6 +165,8 @@
 	xp->null_vnode = vp;
 	vp->v_data = xp;
 	xp->null_lowervp = lowervp;
+	xp->flags = 0;
+	xp->lockholder = 0;
 	/*
 	 * Before we insert our new node onto the hash chains,
 	 * check to see if someone else has beaten us to it.
@@ -167,12 +177,41 @@
 		FREE(xp, M_TEMP);
 		vp->v_type = VBAD;	/* node is discarded */
 		vp->v_usecount = 0;	/* XXX */
+		if (VSHOULDFREE(vp))
+			vfree(vp);
 		*vpp = othervp;
 		return 0;
 	};
 	VREF(lowervp);   /* Extra VREF will be vrele'd in null_node_create */
 	hd = NULL_NHASH(lowervp);
 	LIST_INSERT_HEAD(hd, xp, null_hash);
+	/*
+	 * Create the lower VM object, if needed
+	 */
+	if ((lowervp->v_type == VREG) &&
+	    ((lowervp->v_object == NULL) ||
+	     (lowervp->v_object->flags & OBJ_DEAD))) {
+		waslocked = VOP_ISLOCKED(lowervp, NULL); /* XXX: Wrong */
+		if (!waslocked) {
+			simple_lock(&lowervp->v_interlock);
+			vn_lock(lowervp,
+				LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY,
+				curproc);
+		}
+		vfs_object_create(lowervp, curproc, curproc->p_ucred);
+		if (!waslocked) {
+			simple_lock(&lowervp->v_interlock);
+			VOP_UNLOCK(lowervp, LK_INTERLOCK, curproc);
+		}
+
+	}
+	vp->v_object = lowervp->v_object;	/* XXX: share object */
+	if (vp->v_object != NULL) {
+		if ((lowervp->v_flag & VOBJBUF) != 0)
+			vp->v_flag |= VOBJBUF;
+		if ((lowervp->v_flag & VTEXT) != 0)
+			vp->v_flag |= VTEXT;
+	}
 	return 0;
 }
 
Index: sys/miscfs/nullfs/null_vfsops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_vfsops.c,v
retrieving revision 1.37
diff -u -r1.37 null_vfsops.c
--- sys/miscfs/nullfs/null_vfsops.c	2000/07/28 11:54:09	1.37
+++ sys/miscfs/nullfs/null_vfsops.c	2000/08/06 19:16:47
@@ -93,7 +93,6 @@
 	struct vnode *nullm_rootvp;
 	struct null_mount *xmp;
 	u_int size;
-	int isvnunlocked = 0;
 
 #ifdef DEBUG
 	printf("nullfs_mount(mp = %p)\n", (void *)mp);
@@ -115,46 +114,23 @@
 		return (error);
 
 	/*
-	 * Unlock lower node to avoid deadlock.
-	 * (XXX) VOP_ISLOCKED is needed?
-	 */
-	if ((mp->mnt_vnodecovered->v_op == null_vnodeop_p) &&
-		VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) {
-		VOP_UNLOCK(mp->mnt_vnodecovered, 0, p);
-		isvnunlocked = 1;
-	}
-	/*
 	 * Find lower node
 	 */
-	NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF,
-		UIO_USERSPACE, args.target, p);
+	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF,
+	       UIO_USERSPACE, args.target, p);
 	error = namei(ndp);
-	/*
-	 * Re-lock vnode.
-	 */
-	if (isvnunlocked && !VOP_ISLOCKED(mp->mnt_vnodecovered, NULL))
-		vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY, p);
-
 	if (error)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 
-	/*
-	 * Sanity check on lower vnode
-	 */
 	lowerrootvp = ndp->ni_vp;
 
-	vrele(ndp->ni_dvp);
-	ndp->ni_dvp = NULLVP;
-
 	/*
-	 * Check multi null mount to avoid `lock against myself' panic.
+	 * Sanity check on lower vnode
 	 */
-	if (lowerrootvp == VTONULL(mp->mnt_vnodecovered)->null_lowervp) {
-#ifdef DEBUG
-		printf("nullfs_mount: multi null mount?\n");
-#endif
-		return (EDEADLK);
+	if (VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) {
+		vput(lowerrootvp);
+		return EBUSY;
 	}
 
 	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
@@ -171,19 +147,18 @@
 	 */
 	error = null_node_create(mp, lowerrootvp, &vp);
 	/*
-	 * Unlock the node (either the lower or the alias)
-	 */
-	VOP_UNLOCK(vp, 0, p);
-	/*
 	 * Make sure the node alias worked
 	 */
 	if (error) {
-		vrele(lowerrootvp);
+		vput(lowerrootvp);
 		free(xmp, M_NULLFSMNT);	/* XXX */
 		return (error);
 	}
-
 	/*
+	 * Unlock the node (either the lower or the alias)
+	 */
+	VOP_UNLOCK(vp, 0, p);
+	/*
 	 * Keep a held reference to the root vnode.
 	 * It is vrele'd in nullfs_unmount.
 	 */
@@ -269,7 +244,8 @@
 	/*
 	 * And blow it away for future re-use
 	 */
-	vgone(nullm_rootvp);
+	if (nullm_rootvp->v_mount == mp)
+		vgone(nullm_rootvp);
 	/*
 	 * Finally, throw away the null_mount structure
 	 */
@@ -297,18 +273,7 @@
 	 */
 	vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
 	VREF(vp);
-	if (VOP_ISLOCKED(vp, NULL)) {
-		/*
-		 * XXX
-		 * Should we check type of node?
-		 */
-#ifdef DEBUG
-		printf("nullfs_root: multi null mount?\n");
-#endif
-		vrele(vp);
-		return (EDEADLK);
-	} else
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 	*vpp = vp;
 	return 0;
 }
Index: sys/miscfs/nullfs/null_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_vnops.c,v
retrieving revision 1.39
diff -u -r1.39 null_vnops.c
--- sys/miscfs/nullfs/null_vnops.c	2000/04/18 15:15:23	1.39
+++ sys/miscfs/nullfs/null_vnops.c	2000/08/06 19:16:51
@@ -182,7 +182,12 @@
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
 #include <miscfs/nullfs/null.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
 
 static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
 SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, 
@@ -191,12 +196,15 @@
 static int	null_access __P((struct vop_access_args *ap));
 static int	null_getattr __P((struct vop_getattr_args *ap));
 static int	null_inactive __P((struct vop_inactive_args *ap));
+static int	null_islocked __P((struct vop_islocked_args *ap));
 static int	null_lock __P((struct vop_lock_args *ap));
 static int	null_lookup __P((struct vop_lookup_args *ap));
 static int	null_print __P((struct vop_print_args *ap));
 static int	null_reclaim __P((struct vop_reclaim_args *ap));
 static int	null_setattr __P((struct vop_setattr_args *ap));
 static int	null_unlock __P((struct vop_unlock_args *ap));
+static int	null_fsync __P((struct vop_fsync_args *ap));
+static int	null_getwritemount __P((struct vop_getwritemount_args *ap));
 
 /*
  * This is the 10-Apr-92 bypass routine.
@@ -236,6 +244,7 @@
 	struct vnode ***vppp;
 	struct vnodeop_desc *descp = ap->a_desc;
 	int reles, i;
+	struct mount *old_mount;
 
 	if (null_bug_bypass)
 		printf ("null_bypass: %s\n", descp->vdesc_name);
@@ -288,6 +297,11 @@
 	 */
 	error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
 
+	if (descp->vdesc_vp_offsets[0] != VDESC_NO_OFFSET &&
+	    old_vps[0] != NULLVP)
+	  old_mount = old_vps[0]->v_mount;
+	else
+	  old_mount = NULL;
 	/*
 	 * Maintain the illusion of call-by-value
 	 * by restoring vnodes in the argument structure
@@ -323,7 +337,7 @@
 		vppp = VOPARG_OFFSETTO(struct vnode***,
 				 descp->vdesc_vpp_offset,ap);
 		if (*vppp)
-			error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp);
+			error = null_node_create(old_mount, **vppp, *vppp);
 	}
 
  out:
@@ -496,10 +510,31 @@
 		struct proc *a_p;
 	} */ *ap;
 {
+	struct null_node *xp;
+	pid_t pid;
 
+	xp = VTONULL(ap->a_vp);
+	if ((xp->flags & NN_LOCK) != 0) {
+		pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid;
+		if (xp->lockholder == pid) {
+			panic("null_lock: recursive lock");
+		} else {
+			if ((ap->a_flags & LK_NOWAIT) != 0)
+				return EBUSY;
+			while ((xp->flags & NN_LOCK) != 0) {
+				xp->flags |= NN_WANT;
+				tsleep(xp, PINOD, "nnlock", 0);
+			}
+		}
+	}
+
 	vop_nolock(ap);
-	if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN)
-		return (0);
+	if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN) {
+		pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid;
+		xp->flags |= NN_LOCK;
+		xp->lockholder = pid;
+		return 0;
+	}
 	ap->a_flags &= ~LK_INTERLOCK;
 	return (null_bypass((struct vop_generic_args *)ap));
 }
@@ -517,28 +552,58 @@
 		struct proc *a_p;
 	} */ *ap;
 {
+	struct vnode *vp = ap->a_vp;
+	struct null_node *xp;
+	pid_t pid;
+
+	xp = VTONULL(vp);
+	if ((xp->flags & NN_LOCK) != 0) {
+		pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid;
+		if (xp->lockholder == pid) {
+			xp->flags &= ~ (NN_LOCK | NN_FSYNC_SKIPPED);
+			xp->lockholder = 0;
+			if ((xp->flags & NN_WANT) != 0) {
+				xp->flags &= ~NN_WANT;
+				wakeup(xp);
+			}
+			return 0;
+		}
+	}
+
 	vop_nounlock(ap);
 	ap->a_flags &= ~LK_INTERLOCK;
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
 static int
+null_islocked(ap)
+	struct vop_islocked_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp;
+	struct null_node *xp;
+
+	vp = ap->a_vp;
+	xp = VTONULL(vp);
+	if (xp == NULL || (xp->flags & NN_LOCK) != 0)
+		return LK_EXCLUSIVE;
+	
+	return (null_bypass((struct vop_generic_args *)ap));
+}
+
+static int
 null_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct proc *a_p;
 	} */ *ap;
 {
-	struct vnode *vp = ap->a_vp;
-	struct null_node *xp = VTONULL(vp);
-	struct vnode *lowervp = xp->null_lowervp;
 	/*
 	 * Do nothing (and _don't_ bypass).
 	 * Wait to vrele lowervp until reclaim,
 	 * so that until then our null_node is in the
 	 * cache and reusable.
-	 * We still have to tell the lower layer the vnode
-	 * is now inactive though.
 	 *
 	 * NEEDSWORK: Someday, consider inactive'ing
 	 * the lowervp and then trying to reactivate it
@@ -546,8 +611,8 @@
 	 * like they do in the name lookup cache code.
 	 * That's too much work for now.
 	 */
-	VOP_INACTIVE(lowervp, ap->a_p);
 	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
+	vrecycle(ap->a_vp, (struct simplelock *) 0, ap->a_p);
 	return (0);
 }
 
@@ -561,21 +626,85 @@
 	struct vnode *vp = ap->a_vp;
 	struct null_node *xp = VTONULL(vp);
 	struct vnode *lowervp = xp->null_lowervp;
+	vm_object_t object;
 
 	/*
 	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
 	 * so we can't call VOPs on ourself.
 	 */
+
+	object = vp->v_object;
+	if (object != lowervp->v_object)
+		panic("null_reclaim: wrong object");
+	if (object != NULL) {
+		if ((void *) lowervp != object->handle)
+			panic("null_reclaim: wrong object handle");
+		vp->v_object = NULL;
+		vp->v_flag &= ~(VTEXT|VOBJBUF);
+		if (lowervp->v_usecount < 1 + object->ref_count)
+			panic("null_reclaim: lowervp->v_usecount too low");
+	}
 	/* After this assignment, this node will not be re-used. */
 	xp->null_lowervp = NULLVP;
 	LIST_REMOVE(xp, null_hash);
 	FREE(vp->v_data, M_TEMP);
 	vp->v_data = NULL;
+	if (lowervp->v_usecount < 1)
+		panic("null_reclaim: lowervp->v_usecount < 1");
 	vrele (lowervp);
 	return (0);
 }
 
 static int
+null_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnodeop_desc *a_desc;
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *vp;
+	struct null_node *xp;
+	pid_t pid; 
+
+	vp = ap->a_vp;
+	xp = VTONULL(vp);
+
+	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd))
+		panic("dirty buffers on null vnode");
+	if (!TAILQ_EMPTY(&vp->v_cleanblkhd))
+		panic("clean buffers on null vnode");
+	
+	if ((xp->flags & NN_LOCK) != 0) {
+		pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid;
+		if (xp->lockholder == pid)
+			if (/*  vp->v_object != NULL &&
+			    vp->v_object->handle != (void *) vp && */
+			    (xp->flags & NN_FSYNC_SKIPPED) == 0) { 
+				xp->flags |= NN_FSYNC_SKIPPED;
+				return 0;
+			}
+	}
+	return (null_bypass((struct vop_generic_args *)ap));
+}
+
+int
+null_getwritemount(ap)
+	struct vop_getwritemount_args /* {
+		struct vnode *a_vp;
+		struct mount **a_mpp;
+	} */ *ap;
+{
+  if (VTONULL(ap->a_vp) == NULL) {
+    return VOP_GETWRITEMOUNT(ap->a_vp->v_mount->mnt_vnodecovered, ap->a_mpp);
+  } else
+    return (null_bypass((struct vop_generic_args *) ap));
+}
+
+
+static int
 null_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
@@ -595,12 +724,15 @@
 	{ &vop_access_desc,		(vop_t *) null_access },
 	{ &vop_getattr_desc,		(vop_t *) null_getattr },
 	{ &vop_inactive_desc,		(vop_t *) null_inactive },
+	{ &vop_islocked_desc,		(vop_t *) null_islocked },
 	{ &vop_lock_desc,		(vop_t *) null_lock },
 	{ &vop_lookup_desc,		(vop_t *) null_lookup },
 	{ &vop_print_desc,		(vop_t *) null_print },
 	{ &vop_reclaim_desc,		(vop_t *) null_reclaim },
 	{ &vop_setattr_desc,		(vop_t *) null_setattr },
 	{ &vop_unlock_desc,		(vop_t *) null_unlock },
+	{ &vop_fsync_desc,		(vop_t *) null_fsync },
+	{ &vop_getwritemount_desc,	(vop_t *) null_getwritemount },
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc null_vnodeop_opv_desc =
Index: sys/vm/vnode_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.124
diff -u -r1.124 vnode_pager.c
--- sys/vm/vnode_pager.c	2000/07/11 22:07:57	1.124
+++ sys/vm/vnode_pager.c	2000/08/06 19:17:02
@@ -154,7 +154,7 @@
 		vp->v_usecount++;
 	} else {
 		object->ref_count++;
-		vp->v_usecount++;
+		((struct vnode *) object->handle)->v_usecount++;
 	}
 
 	vp->v_flag &= ~VOLOCK;


- Tor Egge
Comment 6 Sheldon Hearn 2000-08-17 10:41:46 UTC
Just a quick note to explain that the PR is still in feedback because
Ustimenko has said that he'll try to come up with something that
incorporates both his own and Tor's patch.

Ciao,
Sheldon.
Comment 7 Sheldon Hearn freebsd_committer freebsd_triage 2002-01-17 16:12:05 UTC
State Changed
From-To: feedback->closed

Automatic feedback timeout.  If additional feedback that warrants 
the re-opening of this PR is available but not included in the 
audit trail, please include the feedback in a reply to this message 
(preserving the Subject line) and ask that the PR be re-opened.