We have found out a strange mv(1) behaviour on SMP systems with UFS. If two or more processes are trying to move the same dir from one source, it could break UFS. It works on 7.X/8.X amd64 systems. Actually, i think, it works on all systems with SMP on all platforms. I guess that the problem is somewhere in ./sys/ufs/ufs/ufs_vnops.c because i can't repeat it for an example on ZFS partition. After UFS break you will see this: # mount /dev/da0p2 on / (ufs, local, noatime) devfs on /dev (devfs, local, multilabel) /dev/da0p4 on /data (ufs, local, noatime, soft-updates) # cd /data/test # ls -lai total 40 11540480 drwxr-xr-x 5 root wheel 512 Apr 21 17:30 . 2 drwxr-xr-x 8 root wheel 512 Apr 21 17:29 .. 11543482 drwxr-xr-x 3 root wheel 16384 Apr 21 17:37 dst1 11543483 drwxr-xr-x 3 root wheel 512 Apr 21 17:37 dst2 11540481 drwxr-xr-x 2 root wheel 16384 Apr 21 17:37 src 4 -rw-r--r-- 1 root wheel 738 Apr 21 17:33 test.pl # ls -lai dst1 total 20 11543482 drwxr-xr-x 3 root wheel 16384 Apr 21 17:37 . 11540480 drwxr-xr-x 5 root wheel 512 Apr 21 17:30 .. 11541475 drwx------ 3 root wheel 512 Apr 21 17:36 03qOT # ls -lai dst2 total 6 11543483 drwxr-xr-x 3 root wheel 512 Apr 21 17:37 . 11540480 drwxr-xr-x 5 root wheel 512 Apr 21 17:30 .. 11541475 drwx------ 3 root wheel 512 Apr 21 17:36 03qOT # rm -rf dst1/03qOT rm: dst1/03qOT: Invalid argument # rm -rf dst2/03qOT rm: dst2/03qOT: Invalid argument The only way to fix it is to unmount the partition and run the fsck on it. # fsck -vy /data start /data wait fsck_ufs -y /dev/da0p4 ** /dev/da0p4 ** Last Mounted on /data ** Phase 1 - Check Blocks and Sizes ** Phase 2 - Check Pathnames /test/dst1/03qOT IS AN EXTRANEOUS HARD LINK TO DIRECTORY /test/dst1/03qOT REMOVE? yes BAD INODE NUMBER FOR '..' I=11541475 OWNER=root MODE=40700 SIZE=512 MTIME=Apr 21 17:36 2011 DIR=/test/dst2/03qOT UNEXPECTED SOFT UPDATE INCONSISTENCY FIX? yes ** Phase 3 - Check Connectivity ** Phase 4 - Check Reference Counts LINK COUNT DIR I=11541475 OWNER=root MODE=40700 SIZE=512 MTIME=Apr 21 17:36 2011 COUNT 3 SHOULD BE 2 ADJUST? yes LINK COUNT DIR I=11543482 OWNER=root MODE=40755 SIZE=16384 MTIME=Apr 21 17:37 2011 COUNT 3 SHOULD BE 2 ADJUST? yes ** Phase 5 - Check Cyl groups 7757 files, 87291 used, 134202352 free (256 frags, 16775262 blocks, 0.0% fragmentation) ***** FILE SYSTEM IS CLEAN ***** ***** FILE SYSTEM WAS MODIFIED ***** How-To-Repeat: You can use this script to repeat the problem: #!/usr/local/bin/perl use strict; my $iterations = 50; my $dirs = 1000; mkdir 'src'; for(1..$iterations) { print "Iterations $_\n"; do_test(); } sub do_test { print "Creating $dirs sample directories...\n"; for(1..$dirs) { my $dirname = `mktemp -d src/XXXXX`; chomp $dirname; system("touch $dirname/data && touch $dirname/meta"); print "$_ "; } print "done\n"; my ($pid1, $pid2) = (do_job('src', 'dst1'), do_job('src', 'dst2')); waitpid($pid1, 0); waitpid($pid2, 0); } sub do_job { my ($src, $dst) = @_; my $pid = fork(); return $pid if $pid != 0; system("mkdir -p $dst && ls $src | xargs -I _ mv -n $src/_ $dst"); system("rm -rf $dst/*") == 0 or die; exit(0); }
Responsible Changed From-To: freebsd-bugs->freebsd-fs Over to maintainer(s).
Author: kib Date: Sun Apr 24 11:01:42 2011 New Revision: 220986 URL: http://svn.freebsd.org/changeset/base/220986 Log: Merge the part of r207141 that fixes the locking for ufs_rename() (and r218838 followup). Adopt the SU calls to the stable/8 SU implementation, with the help from Kirk. PR: kern/156545 Reviewed by: mckusick Tested by: pho Modified: stable/8/sys/ufs/ufs/inode.h stable/8/sys/ufs/ufs/ufs_extern.h stable/8/sys/ufs/ufs/ufs_lookup.c stable/8/sys/ufs/ufs/ufs_vnops.c Directory Properties: stable/8/sys/ (props changed) stable/8/sys/amd64/include/xen/ (props changed) stable/8/sys/cddl/contrib/opensolaris/ (props changed) stable/8/sys/contrib/dev/acpica/ (props changed) stable/8/sys/contrib/pf/ (props changed) Modified: stable/8/sys/ufs/ufs/inode.h ============================================================================== --- stable/8/sys/ufs/ufs/inode.h Sun Apr 24 10:47:56 2011 (r220985) +++ stable/8/sys/ufs/ufs/inode.h Sun Apr 24 11:01:42 2011 (r220986) @@ -120,7 +120,7 @@ struct inode { #define IN_CHANGE 0x0002 /* Inode change time update request. */ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ -#define IN_RENAME 0x0010 /* Inode is being renamed. */ +#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ #define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ #define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */ #define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the Modified: stable/8/sys/ufs/ufs/ufs_extern.h ============================================================================== --- stable/8/sys/ufs/ufs/ufs_extern.h Sun Apr 24 10:47:56 2011 (r220985) +++ stable/8/sys/ufs/ufs/ufs_extern.h Sun Apr 24 11:01:42 2011 (r220986) @@ -57,7 +57,7 @@ int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); -int ufs_checkpath(ino_t, struct inode *, struct ucred *); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); @@ -66,9 +66,11 @@ int ufs_extwrite(struct vop_write_args void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, - struct componentname *, struct buf *); + struct componentname *, struct buf *, int); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); Modified: stable/8/sys/ufs/ufs/ufs_lookup.c ============================================================================== --- stable/8/sys/ufs/ufs/ufs_lookup.c Sun Apr 24 10:47:56 2011 (r220985) +++ stable/8/sys/ufs/ufs/ufs_lookup.c Sun Apr 24 11:01:42 2011 (r220986) @@ -76,9 +76,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, C /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) -static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *, - ino_t *); - #ifdef QUOTA static int ufs_lookup_upgrade_lock(struct vnode *vp) @@ -214,11 +211,11 @@ ufs_lookup(ap) } */ *ap; { - return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } -static int -ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ @@ -556,6 +553,8 @@ notfound: return (ENOENT); found: + if (dd_ino != NULL) + *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* @@ -578,11 +577,6 @@ found: if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1); - if (dd_ino != NULL) { - *dd_ino = ino; - return (0); - } - /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. @@ -590,17 +584,6 @@ found: if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - if ((error = VFS_VGET(vdp->v_mount, ino, - LK_EXCLUSIVE, &tdp)) != 0) - return (error); - - error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); - if (error) { - vput(tdp); - return (error); - } - - /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there @@ -617,6 +600,16 @@ found: dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; @@ -648,6 +641,8 @@ found: dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); + if (dd_ino != NULL) + return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -682,6 +677,8 @@ found: cnp->cn_flags |= SAVENAME; return (0); } + if (dd_ino != NULL) + return (0); /* * Step through the translation in the name. We do not `vput' the @@ -713,7 +710,7 @@ found: * to the inode we looked up before vdp lock was * dropped. */ - error = ufs_lookup_(pdp, NULL, cnp, &ino1); + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); @@ -865,12 +862,13 @@ ufs_makedirentry(ip, cnp, newdirp) * soft dependency code). */ int -ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; + int isrename; { struct ucred *cr; struct thread *td; @@ -943,22 +941,30 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdir blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, - dirp->d_ino, newdirbp, 1) == 0) { - bdwrite(bp); + dirp->d_ino, newdirbp, 1)) + dp->i_flag |= IN_NEEDSYNC; +#ifdef JOURNALED_SOFTUPDATES + if (newdirbp) + bdwrite(newdirbp); +#endif + bdwrite(bp); + if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - } - /* We have just allocated a directory block in an - * indirect block. Rather than tracking when it gets - * claimed by the inode, we simply do a VOP_FSYNC - * now to ensure that it is there (in case the user - * does a future fsync). Note that we have to unlock - * the inode for the entry that we just entered, as - * the VOP_FSYNC may need to lock other inodes which - * can lead to deadlock if we also hold a lock on - * the newly entered node. + /* + * We have just allocated a directory block in an + * indirect block. We must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. */ - if ((error = bwrite(bp))) - return (error); + if (isrename) + return (0); if (tvp != NULL) VOP_UNLOCK(tvp, 0); error = VOP_FSYNC(dvp, MNT_WAIT, td); @@ -1099,6 +1105,10 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdir (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); +#ifdef JOURNALED_SOFTUPDATES + if (newdirbp != NULL) + bdwrite(newdirbp); +#endif bdwrite(bp); } else { if (DOINGASYNC(dvp)) { @@ -1116,7 +1126,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdir * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ - if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { + if (isrename == 0 && error == 0 && + dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0); #ifdef UFS_DIRHASH @@ -1386,25 +1397,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct /* * Check if source directory is in the path of the target directory. - * Target is supplied locked, source is unlocked. - * The target is always vput before returning. */ int -ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred) +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { - struct vnode *vp, *vp1; + struct mount *mp; + struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; - vp = ITOV(target); - if (target->i_number == source_ino) { - error = EEXIST; - goto out; - } - error = 0; + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); if (target->i_number == ROOTINO) - goto out; - + return (0); + error = 0; for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) @@ -1415,9 +1426,13 @@ ufs_checkpath(ino_t source_ino, struct i } if (dd_ino == ROOTINO) break; - error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1); - if (error != 0) + if (dd_ino == parent_ino) + break; + error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); + if (error != 0) { + *wait_ino = dd_ino; break; + } /* Recheck that ".." still points to vp1 after relock of vp */ error = ufs_dir_dd_ino(vp, cred, &dd_ino); if (error != 0) { @@ -1429,14 +1444,14 @@ ufs_checkpath(ino_t source_ino, struct i vput(vp1); continue; } - vput(vp); + if (vp != tvp) + vput(vp); vp = vp1; } -out: if (error == ENOTDIR) - printf("checkpath: .. not a directory\n"); - if (vp != NULL) + panic("checkpath: .. not a directory\n"); + if (vp != tvp) vput(vp); return (error); } Modified: stable/8/sys/ufs/ufs/ufs_vnops.c ============================================================================== --- stable/8/sys/ufs/ufs/ufs_vnops.c Sun Apr 24 10:47:56 2011 (r220985) +++ stable/8/sys/ufs/ufs/ufs_vnops.c Sun Apr 24 11:01:42 2011 (r220986) @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/fcntl.h> #include <sys/stat.h> +#include <sys/sysctl.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/mount.h> @@ -114,6 +115,8 @@ static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; static vop_pathconf_t ufsfifo_pathconf; +SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); + /* * A virgin directory (no blushing please). */ @@ -992,7 +995,7 @@ ufs_link(ap) error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); - error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); + error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); } if (error) { @@ -1043,7 +1046,7 @@ ufs_whiteout(ap) newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; - error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); + error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); break; case DELETE: @@ -1062,6 +1065,11 @@ ufs_whiteout(ap) return (error); } +static volatile int rename_restarts; +SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, + __DEVOLATILE(int *, &rename_restarts), 0, + "Times rename had to restart due to lock contention"); + /* * Rename system call. * rename("foo", "bar"); @@ -1101,111 +1109,185 @@ ufs_rename(ap) struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; + struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; - struct inode *ip, *xp, *dp; + struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; - int doingdirectory = 0, oldparent = 0, newparent = 0; + off_t endoff; + int doingdirectory, newparent; int error = 0, ioflag; - ino_t fvp_ino; + struct mount *mp; + ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif + endoff = 0; + mp = tdvp->v_mount; + VOP_UNLOCK(tdvp, 0); + if (tvp && tvp != tdvp) + VOP_UNLOCK(tvp, 0); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; -abortit: - if (tdvp == tvp) - vrele(tdvp); - else - vput(tdvp); - if (tvp) - vput(tvp); - vrele(fdvp); + mp = NULL; + goto releout; + } + error = vfs_busy(mp, 0); + if (error) { + mp = NULL; + goto releout; + } +relock: + /* + * We need to acquire 2 to 4 locks depending on whether tvp is NULL + * and fdvp and tdvp are the same directory. Subsequently we need + * to double-check all paths and in the directory rename case we + * need to verify that we are not creating a directory loop. To + * handle this we acquire all but fdvp using non-blocking + * acquisitions. If we fail to acquire any lock in the path we will + * drop all held locks, acquire the new lock in a blocking fashion, + * and then release it and restart the rename. This acquire/release + * step ensures that we do not spin on a lock waiting for release. + */ + error = vn_lock(fdvp, LK_EXCLUSIVE); + if (error) + goto releout; + if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto releout; + VOP_UNLOCK(tdvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * Re-resolve fvp to be certain it still exists and fetch the + * correct vnode. + */ + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + goto releout; + } + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); vrele(fvp); - return (error); + fvp = nvp; + atomic_add_int(&rename_restarts, 1); + goto relock; + } + vrele(fvp); + fvp = nvp; + /* + * Re-resolve tvp and acquire the vnode lock if present. + */ + error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); + if (error != 0 && error != EJUSTRETURN) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + goto releout; } - + /* + * If tvp disappeared we just carry on. + */ + if (error == EJUSTRETURN && tvp != NULL) { + vrele(tvp); + tvp = NULL; + } + /* + * Get the tvp ino if the lookup succeeded. We may have to restart + * if the non-blocking acquire fails. + */ + if (error == 0) { + nvp = NULL; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (tvp) + vrele(tvp); + tvp = nvp; + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + } + fdp = VTOI(fdvp); + fip = VTOI(fvp); + tdp = VTOI(tdvp); + tip = NULL; + if (tvp) + tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; - goto abortit; + goto unlockout; } - /* * Renaming a file to itself has no effect. The upper layers should - * not call us in that case. Temporarily just warn if they do. + * not call us in that case. However, things could change after + * we drop the locks above. */ if (fvp == tvp) { - printf("ufs_rename: fvp == tvp (can't happen)\n"); error = 0; - goto abortit; + goto unlockout; } - - if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) - goto abortit; - dp = VTOI(fdvp); - ip = VTOI(fvp); - if (ip->i_nlink >= LINK_MAX) { - VOP_UNLOCK(fvp, 0); + doingdirectory = 0; + newparent = 0; + ino = fip->i_number; + if (fip->i_nlink >= LINK_MAX) { error = EMLINK; - goto abortit; + goto unlockout; } - if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) - || (dp->i_flags & APPEND)) { - VOP_UNLOCK(fvp, 0); + if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) + || (fdp->i_flags & APPEND)) { error = EPERM; - goto abortit; + goto unlockout; } - if ((ip->i_mode & IFMT) == IFDIR) { + if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || - (ip->i_flag & IN_RENAME)) { - VOP_UNLOCK(fvp, 0); + fdp == fip || + (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; - goto abortit; + goto unlockout; } - ip->i_flag |= IN_RENAME; - oldparent = dp->i_number; + if (fdp->i_number != tdp->i_number) + newparent = tdp->i_number; doingdirectory = 1; } - vrele(fdvp); - - /* - * When the target exists, both the directory - * and target vnodes are returned locked. - */ - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); - - /* - * 1) Bump link count while we're moving stuff - * around. If we crash somewhere before - * completing our work, the link count - * may be wrong, but correctable. - */ - ip->i_effnlink++; - ip->i_nlink++; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | - DOINGASYNC(fvp)))) != 0) { - VOP_UNLOCK(fvp, 0); - goto bad; + if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || + (tvp != NULL && tvp->v_type == VDIR && + tvp->v_mountedhere != NULL)) { + error = EXDEV; + goto unlockout; } /* @@ -1214,35 +1296,55 @@ abortit: * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so - * as to be able to change "..". We must repeat the call - * to namei, as the parent directory is unlocked by the - * call to checkpath(). - */ - error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); - fvp_ino = ip->i_number; - VOP_UNLOCK(fvp, 0); - if (oldparent != dp->i_number) - newparent = dp->i_number; + * as to be able to change "..". + */ if (doingdirectory && newparent) { - if (error) /* write access check above */ - goto bad; - if (xp != NULL) - vput(tvp); - error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred); + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) - goto out; + goto unlockout; + error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, + &ino); + /* + * We encountered a lock that we have to wait for. Unlock + * everything else and VGET before restarting. + */ + if (ino) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_SHARED, &nvp); + if (error == 0) + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + if (error) + goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); - VREF(tdvp); - error = relookup(tdvp, &tvp, tcnp); - if (error) - goto out; - vrele(tdvp); - dp = VTOI(tdvp); - xp = NULL; - if (tvp) - xp = VTOI(tvp); } + if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || + tdp->i_effnlink == 0) + panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + fip->i_effnlink++; + fip->i_nlink++; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(fip); + error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp))); + if (error) + goto bad; + /* * 2) If target doesn't exist, link the target * to the source and unlink the source. @@ -1250,52 +1352,37 @@ abortit: * entry to reference the source inode and * expunge the original entry's existence. */ - if (xp == NULL) { - if (dp->i_dev != ip->i_dev) + if (tip == NULL) { + if (tdp->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); - /* - * Account for ".." in new directory. - * When source and destination have the same - * parent we don't fool with the link count. - */ if (doingdirectory && newparent) { - if ((nlink_t)dp->i_nlink >= LINK_MAX) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't adjust the link count. The + * actual link modification is completed when + * .. is rewritten below. + */ + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } - dp->i_effnlink++; - dp->i_nlink++; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | - DOINGASYNC(tdvp))); - if (error) - goto bad; } - ufs_makedirentry(ip, tcnp, &newdir); - error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); - if (error) { - if (doingdirectory && newparent) { - dp->i_effnlink--; - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; - if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); - (void)UFS_UPDATE(tdvp, 1); - } + ufs_makedirentry(fip, tcnp, &newdir); + error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); + if (error) goto bad; - } - vput(tdvp); + /* Setup tdvp for directory compaction if needed. */ + if (tdp->i_count && tdp->i_endoff && + tdp->i_endoff < tdp->i_size) + endoff = tdp->i_endoff; } else { - if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ - if (xp->i_number == ip->i_number) + if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller @@ -1303,7 +1390,7 @@ abortit: * destination of the rename. This implements append-only * directories. */ - if ((dp->i_mode & S_ISTXT) && + if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; @@ -1314,9 +1401,9 @@ abortit: * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ - if ((xp->i_mode&IFMT) == IFDIR) { - if ((xp->i_effnlink > 2) || - !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { + if ((tip->i_mode & IFMT) == IFDIR) { + if ((tip->i_effnlink > 2) || + !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } @@ -1329,20 +1416,30 @@ abortit: error = EISDIR; goto bad; } - error = ufs_dirrewrite(dp, xp, ip->i_number, - IFTODT(ip->i_mode), - (doingdirectory && newparent) ? newparent : doingdirectory); - if (error) - goto bad; if (doingdirectory) { if (!newparent) { - dp->i_effnlink--; + tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) - softdep_change_linkcnt(dp); + softdep_change_linkcnt(tdp); } - xp->i_effnlink--; + tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) - softdep_change_linkcnt(xp); + softdep_change_linkcnt(tip); + } + error = ufs_dirrewrite(tdp, tip, fip->i_number, + IFTODT(fip->i_mode), + (doingdirectory && newparent) ? newparent : doingdirectory); + if (error) { + if (doingdirectory) { + if (!newparent) { + tdp->i_effnlink++; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + tip->i_effnlink++; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(tip); + } } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* @@ -1357,115 +1454,107 @@ abortit: * them now. */ if (!newparent) { - dp->i_nlink--; - DIP_SET(dp, i_nlink, dp->i_nlink); - dp->i_flag |= IN_CHANGE; + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; } - xp->i_nlink--; - DIP_SET(xp, i_nlink, xp->i_nlink); - xp->i_flag |= IN_CHANGE; + tip->i_nlink--; + DIP_SET(tip, i_nlink, tip->i_nlink); + tip->i_flag |= IN_CHANGE; ioflag = IO_NORMAL; if (!DOINGASYNC(tvp)) ioflag |= IO_SYNC; + /* Don't go to bad here as the new link exists. */ if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) - goto bad; + goto unlockout; } - vput(tdvp); - vput(tvp); - xp = NULL; } /* - * 3) Unlink the source. + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. */ - fcnp->cn_flags &= ~MODMASK; - fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; - if ((fcnp->cn_flags & SAVESTART) == 0) - panic("ufs_rename: lost from startdir"); - VREF(fdvp); - error = relookup(fdvp, &fvp, fcnp); - if (error == 0) - vrele(fdvp); - if (fvp != NULL) { - xp = VTOI(fvp); - dp = VTOI(fdvp); - } else { - /* - * From name has disappeared. IN_RENAME is not sufficient - * to protect against directory races due to timing windows, - * so we have to remove the panic. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - vrele(ap->a_fvp); - return (0); + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); + if (ino != fip->i_number) + panic("ufs_rename: ino mismatch %d != %d\n", ino, + fip->i_number); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. - */ - if (xp != ip) { - /* - * From name resolves to a different inode. IN_RENAME is - * not sufficient protection against timing window races - * so we can't panic here. XXX the only real way - * to solve this issue is at a much higher level. By the - * time we hit ufs_rename() it's too late. - */ -#if 0 - if (doingdirectory) - panic("ufs_rename: lost dir entry"); -#endif - } else { + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { /* - * If the source is a directory with a - * new parent, the link count of the old - * parent directory must be decremented - * and ".." set to point to the new parent. + * If tip exists we simply use its link, otherwise we must + * add a new one. */ - if (doingdirectory && newparent) { - xp->i_offset = mastertemplate.dot_reclen; - ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); - cache_purge(fdvp); - } - error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; - } - if (dp) - vput(fdvp); - if (xp) - vput(fvp); - vrele(ap->a_fvp); + if (tip == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. */ + if (error) + goto unlockout; + } + fip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); + +unlockout: + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + /* + * If compaction or fsync was requested do it now that other locks + * are no longer needed. + */ + if (error == 0 && endoff != 0) { +#ifdef UFS_DIRHASH + if (tdp->i_dirhash != NULL) + ufsdirhash_dirtrunc(tdp, endoff); +#endif + UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred, + td); + } + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + vput(tdvp); + if (mp) + vfs_unbusy(mp); return (error); bad: - if (xp) - vput(ITOV(xp)); - vput(ITOV(dp)); -out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; - if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { - ip->i_effnlink--; - ip->i_nlink--; - DIP_SET(ip, i_nlink, ip->i_nlink); - ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; - if (DOINGSOFTDEP(fvp)) - softdep_change_linkcnt(ip); - vput(fvp); - } else - vrele(fvp); + fip->i_effnlink--; + fip->i_nlink--; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(fip); + goto unlockout; + +releout: *** DIFF OUTPUT TRUNCATED AT 1000 LINES *** _______________________________________________ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"
State Changed From-To: open->patched HEAD contained fix for the problem, that was commited together with large +J patch. I extracted the lone ufs_rename() rework and merged it into stable/8. Lets see how it works.