diff options
Diffstat (limited to 'fs/xfs/xfs_inode.c')
| -rw-r--r-- | fs/xfs/xfs_inode.c | 1488 | 
1 files changed, 161 insertions, 1327 deletions
| diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a4e3cd8971fc..7dc6f326936c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -42,55 +42,11 @@  #include "xfs_pnfs.h"  #include "xfs_parent.h"  #include "xfs_xattr.h" -#include "xfs_sb.h" +#include "xfs_inode_util.h"  struct kmem_cache *xfs_inode_cache;  /* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( -	struct xfs_inode	*ip) -{ -	/* -	 * No point in aligning allocations if we need to COW to actually -	 * write to them. -	 */ -	if (xfs_is_always_cow_inode(ip)) -		return 0; -	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) -		return ip->i_extsize; -	if (XFS_IS_REALTIME_INODE(ip) && -	    ip->i_mount->m_sb.sb_rextsize > 1) -		return ip->i_mount->m_sb.sb_rextsize; -	return 0; -} - -/* - * Helper function to extract CoW extent size hint from inode. - * Between the extent size hint and the CoW extent size hint, we - * return the greater of the two.  If the value is zero (automatic), - * use the default size. - */ -xfs_extlen_t -xfs_get_cowextsz_hint( -	struct xfs_inode	*ip) -{ -	xfs_extlen_t		a, b; - -	a = 0; -	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) -		a = ip->i_cowextsize; -	b = xfs_get_extsz_hint(ip); - -	a = max(a, b); -	if (a == 0) -		return XFS_DEFAULT_COWEXTSZ_HINT; -	return a; -} - -/*   * These two are wrapper routines around the xfs_ilock() routine used to   * centralize some grungy code.  They are used in places that wish to lock the   * inode solely for reading the extents.  The reason these places can't just @@ -567,55 +523,6 @@ xfs_lock_two_inodes(  	}  } -uint -xfs_ip2xflags( -	struct xfs_inode	*ip) -{ -	uint			flags = 0; - -	if (ip->i_diflags & XFS_DIFLAG_ANY) { -		if (ip->i_diflags & XFS_DIFLAG_REALTIME) -			flags |= FS_XFLAG_REALTIME; -		if (ip->i_diflags & XFS_DIFLAG_PREALLOC) -			flags |= FS_XFLAG_PREALLOC; -		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) -			flags |= FS_XFLAG_IMMUTABLE; -		if (ip->i_diflags & XFS_DIFLAG_APPEND) -			flags |= FS_XFLAG_APPEND; -		if (ip->i_diflags & XFS_DIFLAG_SYNC) -			flags |= FS_XFLAG_SYNC; -		if (ip->i_diflags & XFS_DIFLAG_NOATIME) -			flags |= FS_XFLAG_NOATIME; -		if (ip->i_diflags & XFS_DIFLAG_NODUMP) -			flags |= FS_XFLAG_NODUMP; -		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) -			flags |= FS_XFLAG_RTINHERIT; -		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) -			flags |= FS_XFLAG_PROJINHERIT; -		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) -			flags |= FS_XFLAG_NOSYMLINKS; -		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) -			flags |= FS_XFLAG_EXTSIZE; -		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) -			flags |= FS_XFLAG_EXTSZINHERIT; -		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) -			flags |= FS_XFLAG_NODEFRAG; -		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) -			flags |= FS_XFLAG_FILESTREAM; -	} - -	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { -		if (ip->i_diflags2 & XFS_DIFLAG2_DAX) -			flags |= FS_XFLAG_DAX; -		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) -			flags |= FS_XFLAG_COWEXTSIZE; -	} - -	if (xfs_inode_has_attr_fork(ip)) -		flags |= FS_XFLAG_HASATTR; -	return flags; -} -  /*   * Lookups up an inode from "name". If ci_name is not NULL, then a CI match   * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -657,97 +564,6 @@ out_unlock:  	return error;  } -/* Propagate di_flags from a parent inode to a child inode. */ -static void -xfs_inode_inherit_flags( -	struct xfs_inode	*ip, -	const struct xfs_inode	*pip) -{ -	unsigned int		di_flags = 0; -	xfs_failaddr_t		failaddr; -	umode_t			mode = VFS_I(ip)->i_mode; - -	if (S_ISDIR(mode)) { -		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) -			di_flags |= XFS_DIFLAG_RTINHERIT; -		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { -			di_flags |= XFS_DIFLAG_EXTSZINHERIT; -			ip->i_extsize = pip->i_extsize; -		} -		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) -			di_flags |= XFS_DIFLAG_PROJINHERIT; -	} else if (S_ISREG(mode)) { -		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && -		    xfs_has_realtime(ip->i_mount)) -			di_flags |= XFS_DIFLAG_REALTIME; -		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { -			di_flags |= XFS_DIFLAG_EXTSIZE; -			ip->i_extsize = pip->i_extsize; -		} -	} -	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && -	    xfs_inherit_noatime) -		di_flags |= XFS_DIFLAG_NOATIME; -	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && -	    xfs_inherit_nodump) -		di_flags |= XFS_DIFLAG_NODUMP; -	if ((pip->i_diflags & XFS_DIFLAG_SYNC) && -	    xfs_inherit_sync) -		di_flags |= XFS_DIFLAG_SYNC; -	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && -	    xfs_inherit_nosymlinks) -		di_flags |= XFS_DIFLAG_NOSYMLINKS; -	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && -	    xfs_inherit_nodefrag) -		di_flags |= XFS_DIFLAG_NODEFRAG; -	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) -		di_flags |= XFS_DIFLAG_FILESTREAM; - -	ip->i_diflags |= di_flags; - -	/* -	 * Inode verifiers on older kernels only check that the extent size -	 * hint is an integer multiple of the rt extent size on realtime files. -	 * They did not check the hint alignment on a directory with both -	 * rtinherit and extszinherit flags set.  If the misaligned hint is -	 * propagated from a directory into a new realtime file, new file -	 * allocations will fail due to math errors in the rt allocator and/or -	 * trip the verifiers.  Validate the hint settings in the new file so -	 * that we don't let broken hints propagate. -	 */ -	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, -			VFS_I(ip)->i_mode, ip->i_diflags); -	if (failaddr) { -		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | -				   XFS_DIFLAG_EXTSZINHERIT); -		ip->i_extsize = 0; -	} -} - -/* Propagate di_flags2 from a parent inode to a child inode. */ -static void -xfs_inode_inherit_flags2( -	struct xfs_inode	*ip, -	const struct xfs_inode	*pip) -{ -	xfs_failaddr_t		failaddr; - -	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { -		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; -		ip->i_cowextsize = pip->i_cowextsize; -	} -	if (pip->i_diflags2 & XFS_DIFLAG2_DAX) -		ip->i_diflags2 |= XFS_DIFLAG2_DAX; - -	/* Don't let invalid cowextsize hints propagate. */ -	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, -			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); -	if (failaddr) { -		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; -		ip->i_cowextsize = 0; -	} -} -  /*   * Initialise a newly allocated inode and return the in-core inode to the   * caller locked exclusively. @@ -755,39 +571,15 @@ xfs_inode_inherit_flags2(   * Caller is responsible for unlocking the inode manually upon return   */  int -xfs_init_new_inode( -	struct mnt_idmap	*idmap, +xfs_icreate(  	struct xfs_trans	*tp, -	struct xfs_inode	*pip,  	xfs_ino_t		ino, -	umode_t			mode, -	xfs_nlink_t		nlink, -	dev_t			rdev, -	prid_t			prid, -	bool			init_xattrs, +	const struct xfs_icreate_args *args,  	struct xfs_inode	**ipp)  { -	struct inode		*dir = pip ? VFS_I(pip) : NULL;  	struct xfs_mount	*mp = tp->t_mountp; -	struct xfs_inode	*ip; -	unsigned int		flags; +	struct xfs_inode	*ip = NULL;  	int			error; -	struct timespec64	tv; -	struct inode		*inode; - -	/* -	 * Protect against obviously corrupt allocation btree records. Later -	 * xfs_iget checks will catch re-allocation of other active in-memory -	 * and on-disk inodes. If we don't catch reallocating the parent inode -	 * here we will deadlock in xfs_iget() so we have to do these checks -	 * first. -	 */ -	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { -		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); -		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), -				XFS_SICK_AG_INOBT); -		return -EFSCORRUPTED; -	}  	/*  	 * Get the in-core inode with the lock held exclusively to prevent @@ -798,96 +590,8 @@ xfs_init_new_inode(  		return error;  	ASSERT(ip != NULL); -	inode = VFS_I(ip); -	set_nlink(inode, nlink); -	inode->i_rdev = rdev; -	ip->i_projid = prid; - -	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { -		inode_fsuid_set(inode, idmap); -		inode->i_gid = dir->i_gid; -		inode->i_mode = mode; -	} else { -		inode_init_owner(idmap, inode, dir, mode); -	} - -	/* -	 * If the group ID of the new file does not match the effective group -	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared -	 * (and only if the irix_sgid_inherit compatibility variable is set). -	 */ -	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && -	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) -		inode->i_mode &= ~S_ISGID; - -	ip->i_disk_size = 0; -	ip->i_df.if_nextents = 0; -	ASSERT(ip->i_nblocks == 0); - -	tv = inode_set_ctime_current(inode); -	inode_set_mtime_to_ts(inode, tv); -	inode_set_atime_to_ts(inode, tv); - -	ip->i_extsize = 0; -	ip->i_diflags = 0; - -	if (xfs_has_v3inodes(mp)) { -		inode_set_iversion(inode, 1); -		ip->i_cowextsize = 0; -		ip->i_crtime = tv; -	} - -	flags = XFS_ILOG_CORE; -	switch (mode & S_IFMT) { -	case S_IFIFO: -	case S_IFCHR: -	case S_IFBLK: -	case S_IFSOCK: -		ip->i_df.if_format = XFS_DINODE_FMT_DEV; -		flags |= XFS_ILOG_DEV; -		break; -	case S_IFREG: -	case S_IFDIR: -		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) -			xfs_inode_inherit_flags(ip, pip); -		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) -			xfs_inode_inherit_flags2(ip, pip); -		fallthrough; -	case S_IFLNK: -		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; -		ip->i_df.if_bytes = 0; -		ip->i_df.if_data = NULL; -		break; -	default: -		ASSERT(0); -	} - -	/* -	 * If we need to create attributes immediately after allocating the -	 * inode, initialise an empty attribute fork right now. We use the -	 * default fork offset for attributes here as we don't know exactly what -	 * size or how many attributes we might be adding. We can do this -	 * safely here because we know the data fork is completely empty and -	 * this saves us from needing to run a separate transaction to set the -	 * fork offset in the immediate future. -	 */ -	if (init_xattrs) { -		ip->i_forkoff = xfs_default_attroffset(ip) >> 3; -		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); - -		if (!xfs_has_attr(mp)) { -			spin_lock(&mp->m_sb_lock); -			xfs_add_attr(mp); -			spin_unlock(&mp->m_sb_lock); -			xfs_log_sb(tp); -		} -	} - -	/* -	 * Log the new values stuffed into the inode. -	 */  	xfs_trans_ijoin(tp, ip, 0); -	xfs_trans_log_inode(tp, ip, flags); +	xfs_inode_init(tp, args, ip);  	/* now that we have an i_mode we can setup the inode structure */  	xfs_setup_inode(ip); @@ -896,158 +600,60 @@ xfs_init_new_inode(  	return 0;  } -/* - * Decrement the link count on an inode & log the change.  If this causes the - * link count to go to zero, move the inode to AGI unlinked list so that it can - * be freed when the last active reference goes away via xfs_inactive(). - */ +/* Return dquots for the ids that will be assigned to a new file. */  int -xfs_droplink( -	struct xfs_trans	*tp, -	struct xfs_inode	*ip) -{ -	struct inode		*inode = VFS_I(ip); - -	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - -	if (inode->i_nlink == 0) { -		xfs_info_ratelimited(tp->t_mountp, - "Inode 0x%llx link count dropped below zero.  Pinning link count.", -				ip->i_ino); -		set_nlink(inode, XFS_NLINK_PINNED); -	} -	if (inode->i_nlink != XFS_NLINK_PINNED) -		drop_nlink(inode); - -	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - -	if (inode->i_nlink) -		return 0; - -	return xfs_iunlink(tp, ip); -} - -/* - * Increment the link count on an inode & log the change. - */ -void -xfs_bumplink( -	struct xfs_trans	*tp, -	struct xfs_inode	*ip) -{ -	struct inode		*inode = VFS_I(ip); - -	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - -	if (inode->i_nlink == XFS_NLINK_PINNED - 1) -		xfs_info_ratelimited(tp->t_mountp, - "Inode 0x%llx link count exceeded maximum.  Pinning link count.", -				ip->i_ino); -	if (inode->i_nlink != XFS_NLINK_PINNED) -		inc_nlink(inode); - -	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -} - -#ifdef CONFIG_XFS_LIVE_HOOKS -/* - * Use a static key here to reduce the overhead of directory live update hooks. - * If the compiler supports jump labels, the static branch will be replaced by - * a nop sled when there are no hook users.  Online fsck is currently the only - * caller, so this is a reasonable tradeoff. - * - * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other - * parts of the kernel allocate memory with that lock held, which means that - * XFS callers cannot hold any locks that might be used by memory reclaim or - * writeback when calling the static_branch_{inc,dec} functions. - */ -DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); - -void -xfs_dir_hook_disable(void) -{ -	xfs_hooks_switch_off(&xfs_dir_hooks_switch); -} - -void -xfs_dir_hook_enable(void) -{ -	xfs_hooks_switch_on(&xfs_dir_hooks_switch); -} - -/* Call hooks for a directory update relating to a child dirent update. */ -inline void -xfs_dir_update_hook( -	struct xfs_inode		*dp, -	struct xfs_inode		*ip, -	int				delta, -	const struct xfs_name		*name) -{ -	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { -		struct xfs_dir_update_params	p = { -			.dp		= dp, -			.ip		= ip, -			.delta		= delta, -			.name		= name, -		}; -		struct xfs_mount	*mp = ip->i_mount; - -		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); +xfs_icreate_dqalloc( +	const struct xfs_icreate_args	*args, +	struct xfs_dquot		**udqpp, +	struct xfs_dquot		**gdqpp, +	struct xfs_dquot		**pdqpp) +{ +	struct inode			*dir = VFS_I(args->pip); +	kuid_t				uid = GLOBAL_ROOT_UID; +	kgid_t				gid = GLOBAL_ROOT_GID; +	prid_t				prid = 0; +	unsigned int			flags = XFS_QMOPT_QUOTALL; + +	if (args->idmap) { +		/* +		 * The uid/gid computation code must match what the VFS uses to +		 * assign i_[ug]id.  INHERIT adjusts the gid computation for +		 * setgid/grpid systems. +		 */ +		uid = mapped_fsuid(args->idmap, i_user_ns(dir)); +		gid = mapped_fsgid(args->idmap, i_user_ns(dir)); +		prid = xfs_get_initial_prid(args->pip); +		flags |= XFS_QMOPT_INHERIT;  	} -} -/* Call the specified function during a directory update. */ -int -xfs_dir_hook_add( -	struct xfs_mount	*mp, -	struct xfs_dir_hook	*hook) -{ -	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); -} +	*udqpp = *gdqpp = *pdqpp = NULL; -/* Stop calling the specified function during a directory update. */ -void -xfs_dir_hook_del( -	struct xfs_mount	*mp, -	struct xfs_dir_hook	*hook) -{ -	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +	return xfs_qm_vop_dqalloc(args->pip, uid, gid, prid, flags, udqpp, +			gdqpp, pdqpp);  } -/* Configure directory update hook functions. */ -void -xfs_dir_hook_setup( -	struct xfs_dir_hook	*hook, -	notifier_fn_t		mod_fn) -{ -	xfs_hook_setup(&hook->dirent_hook, mod_fn); -} -#endif /* CONFIG_XFS_LIVE_HOOKS */ -  int  xfs_create( -	struct mnt_idmap	*idmap, -	struct xfs_inode	*dp, +	const struct xfs_icreate_args *args,  	struct xfs_name		*name, -	umode_t			mode, -	dev_t			rdev, -	bool			init_xattrs, -	xfs_inode_t		**ipp) +	struct xfs_inode	**ipp)  { -	int			is_dir = S_ISDIR(mode); +	struct xfs_inode	*dp = args->pip; +	struct xfs_dir_update	du = { +		.dp		= dp, +		.name		= name, +	};  	struct xfs_mount	*mp = dp->i_mount; -	struct xfs_inode	*ip = NULL;  	struct xfs_trans	*tp = NULL; -	int			error; -	bool			unlock_dp_on_error = false; -	prid_t			prid; -	struct xfs_dquot	*udqp = NULL; -	struct xfs_dquot	*gdqp = NULL; -	struct xfs_dquot	*pdqp = NULL; +	struct xfs_dquot	*udqp; +	struct xfs_dquot	*gdqp; +	struct xfs_dquot	*pdqp;  	struct xfs_trans_res	*tres; -	uint			resblks;  	xfs_ino_t		ino; -	struct xfs_parent_args	*ppargs; +	bool			unlock_dp_on_error = false; +	bool			is_dir = S_ISDIR(args->mode); +	uint			resblks; +	int			error;  	trace_xfs_create(dp, name); @@ -1056,15 +662,8 @@ xfs_create(  	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))  		return -EIO; -	prid = xfs_get_initial_prid(dp); - -	/* -	 * Make sure that we have allocated dquot(s) on disk. -	 */ -	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), -			mapped_fsgid(idmap, &init_user_ns), prid, -			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, -			&udqp, &gdqp, &pdqp); +	/* Make sure that we have allocated dquot(s) on disk. */ +	error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);  	if (error)  		return error; @@ -1076,7 +675,7 @@ xfs_create(  		tres = &M_RES(mp)->tr_create;  	} -	error = xfs_parent_start(mp, &ppargs); +	error = xfs_parent_start(mp, &du.ppargs);  	if (error)  		goto out_release_dquots; @@ -1105,10 +704,9 @@ xfs_create(  	 * entry pointing to them, but a directory also the "." entry  	 * pointing to itself.  	 */ -	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); +	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);  	if (!error) -		error = xfs_init_new_inode(idmap, tp, dp, ino, mode, -				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); +		error = xfs_icreate(tp, ino, args, &du.ip);  	if (error)  		goto out_trans_cancel; @@ -1121,38 +719,9 @@ xfs_create(  	 */  	xfs_trans_ijoin(tp, dp, 0); -	error = xfs_dir_createname(tp, dp, name, ip->i_ino, -					resblks - XFS_IALLOC_SPACE_RES(mp)); -	if (error) { -		ASSERT(error != -ENOSPC); +	error = xfs_dir_create_child(tp, resblks, &du); +	if (error)  		goto out_trans_cancel; -	} -	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); -	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - -	if (is_dir) { -		error = xfs_dir_init(tp, ip, dp); -		if (error) -			goto out_trans_cancel; - -		xfs_bumplink(tp, dp); -	} - -	/* -	 * If we have parent pointers, we need to add the attribute containing -	 * the parent information now. -	 */ -	if (ppargs) { -		error = xfs_parent_addname(tp, ppargs, dp, name, ip); -		if (error) -			goto out_trans_cancel; -	} - -	/* -	 * Create ip with a reference from dp, and add '.' and '..' references -	 * if it's a directory. -	 */ -	xfs_dir_update_hook(dp, ip, 1, name);  	/*  	 * If this is a synchronous mount, make sure that the @@ -1167,7 +736,7 @@ xfs_create(  	 * These ids of the inode couldn't have changed since the new  	 * inode has been locked ever since it was created.  	 */ -	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); +	xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);  	error = xfs_trans_commit(tp);  	if (error) @@ -1177,10 +746,10 @@ xfs_create(  	xfs_qm_dqrele(gdqp);  	xfs_qm_dqrele(pdqp); -	*ipp = ip; -	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	*ipp = du.ip; +	xfs_iunlock(du.ip, XFS_ILOCK_EXCL);  	xfs_iunlock(dp, XFS_ILOCK_EXCL); -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);  	return 0;   out_trans_cancel: @@ -1191,13 +760,13 @@ xfs_create(  	 * setup of the inode and release the inode.  This prevents recursive  	 * transactions and deadlocks from xfs_inactive.  	 */ -	if (ip) { -		xfs_iunlock(ip, XFS_ILOCK_EXCL); -		xfs_finish_inode_setup(ip); -		xfs_irele(ip); +	if (du.ip) { +		xfs_iunlock(du.ip, XFS_ILOCK_EXCL); +		xfs_finish_inode_setup(du.ip); +		xfs_irele(du.ip);  	}   out_parent: -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);   out_release_dquots:  	xfs_qm_dqrele(udqp);  	xfs_qm_dqrele(gdqp); @@ -1210,36 +779,28 @@ xfs_create(  int  xfs_create_tmpfile( -	struct mnt_idmap	*idmap, -	struct xfs_inode	*dp, -	umode_t			mode, -	bool			init_xattrs, +	const struct xfs_icreate_args *args,  	struct xfs_inode	**ipp)  { +	struct xfs_inode	*dp = args->pip;  	struct xfs_mount	*mp = dp->i_mount;  	struct xfs_inode	*ip = NULL;  	struct xfs_trans	*tp = NULL; -	int			error; -	prid_t                  prid; -	struct xfs_dquot	*udqp = NULL; -	struct xfs_dquot	*gdqp = NULL; -	struct xfs_dquot	*pdqp = NULL; +	struct xfs_dquot	*udqp; +	struct xfs_dquot	*gdqp; +	struct xfs_dquot	*pdqp;  	struct xfs_trans_res	*tres; -	uint			resblks;  	xfs_ino_t		ino; +	uint			resblks; +	int			error; + +	ASSERT(args->flags & XFS_ICREATE_TMPFILE);  	if (xfs_is_shutdown(mp))  		return -EIO; -	prid = xfs_get_initial_prid(dp); - -	/* -	 * Make sure that we have allocated dquot(s) on disk. -	 */ -	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), -			mapped_fsgid(idmap, &init_user_ns), prid, -			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, -			&udqp, &gdqp, &pdqp); +	/* Make sure that we have allocated dquot(s) on disk. */ +	error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);  	if (error)  		return error; @@ -1251,10 +812,9 @@ xfs_create_tmpfile(  	if (error)  		goto out_release_dquots; -	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); +	error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);  	if (!error) -		error = xfs_init_new_inode(idmap, tp, dp, ino, mode, -				0, 0, prid, init_xattrs, &ip); +		error = xfs_icreate(tp, ino, args, &ip);  	if (error)  		goto out_trans_cancel; @@ -1311,11 +871,15 @@ xfs_link(  	struct xfs_inode	*sip,  	struct xfs_name		*target_name)  { +	struct xfs_dir_update	du = { +		.dp		= tdp, +		.name		= target_name, +		.ip		= sip, +	};  	struct xfs_mount	*mp = tdp->i_mount;  	struct xfs_trans	*tp;  	int			error, nospace_error = 0;  	int			resblks; -	struct xfs_parent_args	*ppargs;  	trace_xfs_link(tdp, target_name); @@ -1334,7 +898,7 @@ xfs_link(  	if (error)  		goto std_return; -	error = xfs_parent_start(mp, &ppargs); +	error = xfs_parent_start(mp, &du.ppargs);  	if (error)  		goto std_return; @@ -1349,7 +913,7 @@ xfs_link(  	 * pointers are enabled because we can't back out if the xattrs must  	 * grow.  	 */ -	if (ppargs && nospace_error) { +	if (du.ppargs && nospace_error) {  		error = nospace_error;  		goto error_return;  	} @@ -1376,47 +940,9 @@ xfs_link(  		}  	} -	if (!resblks) { -		error = xfs_dir_canenter(tp, tdp, target_name); -		if (error) -			goto error_return; -	} - -	/* -	 * Handle initial link state of O_TMPFILE inode -	 */ -	if (VFS_I(sip)->i_nlink == 0) { -		struct xfs_perag	*pag; - -		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino)); -		error = xfs_iunlink_remove(tp, pag, sip); -		xfs_perag_put(pag); -		if (error) -			goto error_return; -	} - -	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, -				   resblks); +	error = xfs_dir_add_child(tp, resblks, &du);  	if (error)  		goto error_return; -	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); -	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - -	xfs_bumplink(tp, sip); - -	/* -	 * If we have parent pointers, we now need to add the parent record to -	 * the attribute fork of the inode. If this is the initial parent -	 * attribute, we need to create it correctly, otherwise we can just add -	 * the parent to the inode. -	 */ -	if (ppargs) { -		error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip); -		if (error) -			goto error_return; -	} - -	xfs_dir_update_hook(tdp, sip, 1, target_name);  	/*  	 * If this is a synchronous mount, make sure that the @@ -1429,7 +955,7 @@ xfs_link(  	error = xfs_trans_commit(tp);  	xfs_iunlock(tdp, XFS_ILOCK_EXCL);  	xfs_iunlock(sip, XFS_ILOCK_EXCL); -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);  	return error;   error_return: @@ -1437,7 +963,7 @@ xfs_link(  	xfs_iunlock(tdp, XFS_ILOCK_EXCL);  	xfs_iunlock(sip, XFS_ILOCK_EXCL);   out_parent: -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);   std_return:  	if (error == -ENOSPC && nospace_error)  		error = nospace_error; @@ -2024,39 +1550,6 @@ out:  }  /* - * In-Core Unlinked List Lookups - * ============================= - * - * Every inode is supposed to be reachable from some other piece of metadata - * with the exception of the root directory.  Inodes with a connection to a - * file descriptor but not linked from anywhere in the on-disk directory tree - * are collectively known as unlinked inodes, though the filesystem itself - * maintains links to these inodes so that on-disk metadata are consistent. - * - * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI - * header contains a number of buckets that point to an inode, and each inode - * record has a pointer to the next inode in the hash chain.  This - * singly-linked list causes scaling problems in the iunlink remove function - * because we must walk that list to find the inode that points to the inode - * being removed from the unlinked hash bucket list. - * - * Hence we keep an in-memory double linked list to link each inode on an - * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer - * based lists would require having 64 list heads in the perag, one for each - * list. This is expensive in terms of memory (think millions of AGs) and cache - * misses on lookups. Instead, use the fact that inodes on the unlinked list - * must be referenced at the VFS level to keep them on the list and hence we - * have an existence guarantee for inodes on the unlinked list. - * - * Given we have an existence guarantee, we can use lockless inode cache lookups - * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode - * for the double linked unlinked list, and we don't need any extra locking to - * keep the list safe as all manipulations are done under the AGI buffer lock. - * Keeping the list up to date does not require memory allocation, just finding - * the XFS inode and updating the next/prev unlinked list aginos. - */ - -/*   * Find an inode on the unlinked list. This does not take references to the   * inode as we have existence guarantees by holding the AGI buffer lock and that   * only unlinked, referenced inodes can be on the unlinked inode list.  If we @@ -2091,75 +1584,11 @@ xfs_iunlink_lookup(  }  /* - * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode - * is not in cache. - */ -static int -xfs_iunlink_update_backref( -	struct xfs_perag	*pag, -	xfs_agino_t		prev_agino, -	xfs_agino_t		next_agino) -{ -	struct xfs_inode	*ip; - -	/* No update necessary if we are at the end of the list. */ -	if (next_agino == NULLAGINO) -		return 0; - -	ip = xfs_iunlink_lookup(pag, next_agino); -	if (!ip) -		return -ENOLINK; - -	ip->i_prev_unlinked = prev_agino; -	return 0; -} - -/* - * Point the AGI unlinked bucket at an inode and log the results.  The caller - * is responsible for validating the old value. - */ -STATIC int -xfs_iunlink_update_bucket( -	struct xfs_trans	*tp, -	struct xfs_perag	*pag, -	struct xfs_buf		*agibp, -	unsigned int		bucket_index, -	xfs_agino_t		new_agino) -{ -	struct xfs_agi		*agi = agibp->b_addr; -	xfs_agino_t		old_value; -	int			offset; - -	ASSERT(xfs_verify_agino_or_null(pag, new_agino)); - -	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); -	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, -			old_value, new_agino); - -	/* -	 * We should never find the head of the list already set to the value -	 * passed in because either we're adding or removing ourselves from the -	 * head of the list. -	 */ -	if (old_value == new_agino) { -		xfs_buf_mark_corrupt(agibp); -		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); -		return -EFSCORRUPTED; -	} - -	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); -	offset = offsetof(struct xfs_agi, agi_unlinked) + -			(sizeof(xfs_agino_t) * bucket_index); -	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); -	return 0; -} - -/*   * Load the inode @next_agino into the cache and set its prev_unlinked pointer   * to @prev_agino.  Caller must hold the AGI to synchronize with other changes   * to the unlinked list.   */ -STATIC int +int  xfs_iunlink_reload_next(  	struct xfs_trans	*tp,  	struct xfs_buf		*agibp, @@ -2215,187 +1644,6 @@ rele:  	return error;  } -static int -xfs_iunlink_insert_inode( -	struct xfs_trans	*tp, -	struct xfs_perag	*pag, -	struct xfs_buf		*agibp, -	struct xfs_inode	*ip) -{ -	struct xfs_mount	*mp = tp->t_mountp; -	struct xfs_agi		*agi = agibp->b_addr; -	xfs_agino_t		next_agino; -	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino); -	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; -	int			error; - -	/* -	 * Get the index into the agi hash table for the list this inode will -	 * go on.  Make sure the pointer isn't garbage and that this inode -	 * isn't already on the list. -	 */ -	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); -	if (next_agino == agino || -	    !xfs_verify_agino_or_null(pag, next_agino)) { -		xfs_buf_mark_corrupt(agibp); -		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); -		return -EFSCORRUPTED; -	} - -	/* -	 * Update the prev pointer in the next inode to point back to this -	 * inode. -	 */ -	error = xfs_iunlink_update_backref(pag, agino, next_agino); -	if (error == -ENOLINK) -		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); -	if (error) -		return error; - -	if (next_agino != NULLAGINO) { -		/* -		 * There is already another inode in the bucket, so point this -		 * inode to the current head of the list. -		 */ -		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); -		if (error) -			return error; -		ip->i_next_unlinked = next_agino; -	} - -	/* Point the head of the list to point to this inode. */ -	ip->i_prev_unlinked = NULLAGINO; -	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); -} - -/* - * This is called when the inode's link count has gone to 0 or we are creating - * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0. - * - * We place the on-disk inode on a list in the AGI.  It will be pulled from this - * list when the inode is freed. - */ -int -xfs_iunlink( -	struct xfs_trans	*tp, -	struct xfs_inode	*ip) -{ -	struct xfs_mount	*mp = tp->t_mountp; -	struct xfs_perag	*pag; -	struct xfs_buf		*agibp; -	int			error; - -	ASSERT(VFS_I(ip)->i_nlink == 0); -	ASSERT(VFS_I(ip)->i_mode != 0); -	trace_xfs_iunlink(ip); - -	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - -	/* Get the agi buffer first.  It ensures lock ordering on the list. */ -	error = xfs_read_agi(pag, tp, 0, &agibp); -	if (error) -		goto out; - -	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); -out: -	xfs_perag_put(pag); -	return error; -} - -static int -xfs_iunlink_remove_inode( -	struct xfs_trans	*tp, -	struct xfs_perag	*pag, -	struct xfs_buf		*agibp, -	struct xfs_inode	*ip) -{ -	struct xfs_mount	*mp = tp->t_mountp; -	struct xfs_agi		*agi = agibp->b_addr; -	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino); -	xfs_agino_t		head_agino; -	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; -	int			error; - -	trace_xfs_iunlink_remove(ip); - -	/* -	 * Get the index into the agi hash table for the list this inode will -	 * go on.  Make sure the head pointer isn't garbage. -	 */ -	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); -	if (!xfs_verify_agino(pag, head_agino)) { -		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -				agi, sizeof(*agi)); -		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); -		return -EFSCORRUPTED; -	} - -	/* -	 * Set our inode's next_unlinked pointer to NULL and then return -	 * the old pointer value so that we can update whatever was previous -	 * to us in the list to point to whatever was next in the list. -	 */ -	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); -	if (error) -		return error; - -	/* -	 * Update the prev pointer in the next inode to point back to previous -	 * inode in the chain. -	 */ -	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, -			ip->i_next_unlinked); -	if (error == -ENOLINK) -		error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, -				ip->i_next_unlinked); -	if (error) -		return error; - -	if (head_agino != agino) { -		struct xfs_inode	*prev_ip; - -		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); -		if (!prev_ip) { -			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); -			return -EFSCORRUPTED; -		} - -		error = xfs_iunlink_log_inode(tp, prev_ip, pag, -				ip->i_next_unlinked); -		prev_ip->i_next_unlinked = ip->i_next_unlinked; -	} else { -		/* Point the head of the list to the next unlinked inode. */ -		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, -				ip->i_next_unlinked); -	} - -	ip->i_next_unlinked = NULLAGINO; -	ip->i_prev_unlinked = 0; -	return error; -} - -/* - * Pull the on-disk inode from the AGI unlinked list. - */ -int -xfs_iunlink_remove( -	struct xfs_trans	*tp, -	struct xfs_perag	*pag, -	struct xfs_inode	*ip) -{ -	struct xfs_buf		*agibp; -	int			error; - -	trace_xfs_iunlink_remove(ip); - -	/* Get the agi buffer first.  It ensures lock ordering on the list. */ -	error = xfs_read_agi(pag, tp, 0, &agibp); -	if (error) -		return error; - -	return xfs_iunlink_remove_inode(tp, pag, agibp, ip); -} -  /*   * Look up the inode number specified and if it is not already marked XFS_ISTALE   * mark it stale. We should only find clean inodes in this lookup that aren't @@ -2614,36 +1862,10 @@ xfs_ifree(  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); -	/* -	 * Free the inode first so that we guarantee that the AGI lock is going -	 * to be taken before we remove the inode from the unlinked list. This -	 * makes the AGI lock -> unlinked list modification order the same as -	 * used in O_TMPFILE creation. -	 */ -	error = xfs_difree(tp, pag, ip->i_ino, &xic); -	if (error) -		goto out; - -	error = xfs_iunlink_remove(tp, pag, ip); +	error = xfs_inode_uninit(tp, pag, ip, &xic);  	if (error)  		goto out; -	/* -	 * Free any local-format data sitting around before we reset the -	 * data fork to extents format.  Note that the attr fork data has -	 * already been freed by xfs_attr_inactive. -	 */ -	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { -		kfree(ip->i_df.if_data); -		ip->i_df.if_data = NULL; -		ip->i_df.if_bytes = 0; -	} - -	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */ -	ip->i_diflags = 0; -	ip->i_diflags2 = mp->m_ino_geo.new_diflags2; -	ip->i_forkoff = 0;		/* mark the attr fork not in use */ -	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;  	if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))  		xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); @@ -2652,13 +1874,6 @@ xfs_ifree(  	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);  	spin_unlock(&iip->ili_lock); -	/* -	 * Bump the generation count so no one will be confused -	 * by reincarnations of this inode. -	 */ -	VFS_I(ip)->i_generation++; -	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -  	if (xic.deleted)  		error = xfs_ifree_cluster(tp, pag, ip, &xic);  out: @@ -2742,13 +1957,17 @@ xfs_remove(  	struct xfs_name		*name,  	struct xfs_inode	*ip)  { +	struct xfs_dir_update	du = { +		.dp		= dp, +		.name		= name, +		.ip		= ip, +	};  	struct xfs_mount	*mp = dp->i_mount;  	struct xfs_trans	*tp = NULL;  	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);  	int			dontcare;  	int                     error = 0;  	uint			resblks; -	struct xfs_parent_args	*ppargs;  	trace_xfs_remove(dp, name); @@ -2765,7 +1984,7 @@ xfs_remove(  	if (error)  		goto std_return; -	error = xfs_parent_start(mp, &ppargs); +	error = xfs_parent_start(mp, &du.ppargs);  	if (error)  		goto std_return; @@ -2788,76 +2007,10 @@ xfs_remove(  		goto out_parent;  	} -	/* -	 * If we're removing a directory perform some additional validation. -	 */ -	if (is_dir) { -		ASSERT(VFS_I(ip)->i_nlink >= 2); -		if (VFS_I(ip)->i_nlink != 2) { -			error = -ENOTEMPTY; -			goto out_trans_cancel; -		} -		if (!xfs_dir_isempty(ip)) { -			error = -ENOTEMPTY; -			goto out_trans_cancel; -		} - -		/* Drop the link from ip's "..".  */ -		error = xfs_droplink(tp, dp); -		if (error) -			goto out_trans_cancel; - -		/* Drop the "." link from ip to self.  */ -		error = xfs_droplink(tp, ip); -		if (error) -			goto out_trans_cancel; - -		/* -		 * Point the unlinked child directory's ".." entry to the root -		 * directory to eliminate back-references to inodes that may -		 * get freed before the child directory is closed.  If the fs -		 * gets shrunk, this can lead to dirent inode validation errors. -		 */ -		if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { -			error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, -					tp->t_mountp->m_sb.sb_rootino, 0); -			if (error) -				goto out_trans_cancel; -		} -	} else { -		/* -		 * When removing a non-directory we need to log the parent -		 * inode here.  For a directory this is done implicitly -		 * by the xfs_droplink call for the ".." entry. -		 */ -		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); -	} -	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - -	/* Drop the link from dp to ip. */ -	error = xfs_droplink(tp, ip); +	error = xfs_dir_remove_child(tp, resblks, &du);  	if (error)  		goto out_trans_cancel; -	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); -	if (error) { -		ASSERT(error != -ENOENT); -		goto out_trans_cancel; -	} - -	/* Remove parent pointer. */ -	if (ppargs) { -		error = xfs_parent_removename(tp, ppargs, dp, name, ip); -		if (error) -			goto out_trans_cancel; -	} - -	/* -	 * Drop the link from dp to ip, and if ip was a directory, remove the -	 * '.' and '..' references since we freed the directory. -	 */ -	xfs_dir_update_hook(dp, ip, -1, name); -  	/*  	 * If this is a synchronous mount, make sure that the  	 * remove transaction goes to disk before returning to @@ -2875,7 +2028,7 @@ xfs_remove(  	xfs_iunlock(ip, XFS_ILOCK_EXCL);  	xfs_iunlock(dp, XFS_ILOCK_EXCL); -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);  	return 0;   out_trans_cancel: @@ -2884,7 +2037,7 @@ xfs_remove(  	xfs_iunlock(ip, XFS_ILOCK_EXCL);  	xfs_iunlock(dp, XFS_ILOCK_EXCL);   out_parent: -	xfs_parent_finish(mp, ppargs); +	xfs_parent_finish(mp, du.ppargs);   std_return:  	return error;  } @@ -2964,160 +2117,6 @@ xfs_sort_inodes(  	}  } -static int -xfs_finish_rename( -	struct xfs_trans	*tp) -{ -	/* -	 * If this is a synchronous mount, make sure that the rename transaction -	 * goes to disk before returning to the user. -	 */ -	if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) -		xfs_trans_set_sync(tp); - -	return xfs_trans_commit(tp); -} - -/* - * xfs_cross_rename() - * - * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall - */ -STATIC int -xfs_cross_rename( -	struct xfs_trans	*tp, -	struct xfs_inode	*dp1, -	struct xfs_name		*name1, -	struct xfs_inode	*ip1, -	struct xfs_parent_args	*ip1_ppargs, -	struct xfs_inode	*dp2, -	struct xfs_name		*name2, -	struct xfs_inode	*ip2, -	struct xfs_parent_args	*ip2_ppargs, -	int			spaceres) -{ -	int			error = 0; -	int			ip1_flags = 0; -	int			ip2_flags = 0; -	int			dp2_flags = 0; - -	/* Swap inode number for dirent in first parent */ -	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); -	if (error) -		goto out_trans_abort; - -	/* Swap inode number for dirent in second parent */ -	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); -	if (error) -		goto out_trans_abort; - -	/* -	 * If we're renaming one or more directories across different parents, -	 * update the respective ".." entries (and link counts) to match the new -	 * parents. -	 */ -	if (dp1 != dp2) { -		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - -		if (S_ISDIR(VFS_I(ip2)->i_mode)) { -			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, -						dp1->i_ino, spaceres); -			if (error) -				goto out_trans_abort; - -			/* transfer ip2 ".." reference to dp1 */ -			if (!S_ISDIR(VFS_I(ip1)->i_mode)) { -				error = xfs_droplink(tp, dp2); -				if (error) -					goto out_trans_abort; -				xfs_bumplink(tp, dp1); -			} - -			/* -			 * Although ip1 isn't changed here, userspace needs -			 * to be warned about the change, so that applications -			 * relying on it (like backup ones), will properly -			 * notify the change -			 */ -			ip1_flags |= XFS_ICHGTIME_CHG; -			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; -		} - -		if (S_ISDIR(VFS_I(ip1)->i_mode)) { -			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, -						dp2->i_ino, spaceres); -			if (error) -				goto out_trans_abort; - -			/* transfer ip1 ".." reference to dp2 */ -			if (!S_ISDIR(VFS_I(ip2)->i_mode)) { -				error = xfs_droplink(tp, dp1); -				if (error) -					goto out_trans_abort; -				xfs_bumplink(tp, dp2); -			} - -			/* -			 * Although ip2 isn't changed here, userspace needs -			 * to be warned about the change, so that applications -			 * relying on it (like backup ones), will properly -			 * notify the change -			 */ -			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; -			ip2_flags |= XFS_ICHGTIME_CHG; -		} -	} - -	/* Schedule parent pointer replacements */ -	if (ip1_ppargs) { -		error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2, -				name2, ip1); -		if (error) -			goto out_trans_abort; -	} - -	if (ip2_ppargs) { -		error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1, -				name1, ip2); -		if (error) -			goto out_trans_abort; -	} - -	if (ip1_flags) { -		xfs_trans_ichgtime(tp, ip1, ip1_flags); -		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); -	} -	if (ip2_flags) { -		xfs_trans_ichgtime(tp, ip2, ip2_flags); -		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); -	} -	if (dp2_flags) { -		xfs_trans_ichgtime(tp, dp2, dp2_flags); -		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); -	} -	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); -	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); - -	/* -	 * Inform our hook clients that we've finished an exchange operation as -	 * follows: removed the source and target files from their directories; -	 * added the target to the source directory; and added the source to -	 * the target directory.  All inodes are locked, so it's ok to model a -	 * rename this way so long as we say we deleted entries before we add -	 * new ones. -	 */ -	xfs_dir_update_hook(dp1, ip1, -1, name1); -	xfs_dir_update_hook(dp2, ip2, -1, name2); -	xfs_dir_update_hook(dp1, ip2, 1, name1); -	xfs_dir_update_hook(dp2, ip1, 1, name2); - -	return xfs_finish_rename(tp); - -out_trans_abort: -	xfs_trans_cancel(tp); -	return error; -} -  /*   * xfs_rename_alloc_whiteout()   * @@ -3133,12 +2132,17 @@ xfs_rename_alloc_whiteout(  	struct xfs_inode	*dp,  	struct xfs_inode	**wip)  { +	struct xfs_icreate_args	args = { +		.idmap		= idmap, +		.pip		= dp, +		.mode		= S_IFCHR | WHITEOUT_MODE, +		.flags		= XFS_ICREATE_TMPFILE, +	};  	struct xfs_inode	*tmpfile;  	struct qstr		name;  	int			error; -	error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, -			xfs_has_parent(dp->i_mount), &tmpfile); +	error = xfs_create_tmpfile(&args, &tmpfile);  	if (error)  		return error; @@ -3178,13 +2182,20 @@ xfs_rename(  	struct xfs_inode	*target_ip,  	unsigned int		flags)  { +	struct xfs_dir_update	du_src = { +		.dp		= src_dp, +		.name		= src_name, +		.ip		= src_ip, +	}; +	struct xfs_dir_update	du_tgt = { +		.dp		= target_dp, +		.name		= target_name, +		.ip		= target_ip, +	}; +	struct xfs_dir_update	du_wip = { };  	struct xfs_mount	*mp = src_dp->i_mount;  	struct xfs_trans	*tp; -	struct xfs_inode	*wip = NULL;		/* whiteout inode */  	struct xfs_inode	*inodes[__XFS_SORT_INODES]; -	struct xfs_parent_args	*src_ppargs = NULL; -	struct xfs_parent_args	*tgt_ppargs = NULL; -	struct xfs_parent_args	*wip_ppargs = NULL;  	int			i;  	int			num_inodes = __XFS_SORT_INODES;  	bool			new_parent = (src_dp != target_dp); @@ -3204,8 +2215,8 @@ xfs_rename(  	 * appropriately.  	 */  	if (flags & RENAME_WHITEOUT) { -		error = xfs_rename_alloc_whiteout(idmap, src_name, -						  target_dp, &wip); +		error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp, +				&du_wip.ip);  		if (error)  			return error; @@ -3213,21 +2224,21 @@ xfs_rename(  		src_name->type = XFS_DIR3_FT_CHRDEV;  	} -	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, -				inodes, &num_inodes); +	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, du_wip.ip, +			inodes, &num_inodes); -	error = xfs_parent_start(mp, &src_ppargs); +	error = xfs_parent_start(mp, &du_src.ppargs);  	if (error)  		goto out_release_wip; -	if (wip) { -		error = xfs_parent_start(mp, &wip_ppargs); +	if (du_wip.ip) { +		error = xfs_parent_start(mp, &du_wip.ppargs);  		if (error)  			goto out_src_ppargs;  	}  	if (target_ip) { -		error = xfs_parent_start(mp, &tgt_ppargs); +		error = xfs_parent_start(mp, &du_tgt.ppargs);  		if (error)  			goto out_wip_ppargs;  	} @@ -3235,7 +2246,7 @@ xfs_rename(  retry:  	nospace_error = 0;  	spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL, -			target_name->len, wip != NULL); +			target_name->len, du_wip.ip != NULL);  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);  	if (error == -ENOSPC) {  		nospace_error = error; @@ -3250,7 +2261,7 @@ retry:  	 * We don't allow reservationless renaming when parent pointers are  	 * enabled because we can't back out if the xattrs must grow.  	 */ -	if (src_ppargs && nospace_error) { +	if (du_src.ppargs && nospace_error) {  		error = nospace_error;  		xfs_trans_cancel(tp);  		goto out_tgt_ppargs; @@ -3282,8 +2293,8 @@ retry:  	xfs_trans_ijoin(tp, src_ip, 0);  	if (target_ip)  		xfs_trans_ijoin(tp, target_ip, 0); -	if (wip) -		xfs_trans_ijoin(tp, wip, 0); +	if (du_wip.ip) +		xfs_trans_ijoin(tp, du_wip.ip, 0);  	/*  	 * If we are using project inheritance, we only allow renames @@ -3298,11 +2309,11 @@ retry:  	/* RENAME_EXCHANGE is unique from here on. */  	if (flags & RENAME_EXCHANGE) { -		error = xfs_cross_rename(tp, src_dp, src_name, src_ip, -				src_ppargs, target_dp, target_name, target_ip, -				tgt_ppargs, spaceres); -		nospace_error = 0; -		goto out_unlock; +		error = xfs_dir_exchange_children(tp, &du_src, &du_tgt, +				spaceres); +		if (error) +			goto out_trans_cancel; +		goto out_commit;  	}  	/* @@ -3335,39 +2346,12 @@ retry:  	 * We don't allow quotaless renaming when parent pointers are enabled  	 * because we can't back out if the xattrs must grow.  	 */ -	if (src_ppargs && nospace_error) { +	if (du_src.ppargs && nospace_error) {  		error = nospace_error;  		goto out_trans_cancel;  	}  	/* -	 * Check for expected errors before we dirty the transaction -	 * so we can return an error without a transaction abort. -	 */ -	if (target_ip == NULL) { -		/* -		 * If there's no space reservation, check the entry will -		 * fit before actually inserting it. -		 */ -		if (!spaceres) { -			error = xfs_dir_canenter(tp, target_dp, target_name); -			if (error) -				goto out_trans_cancel; -		} -	} else { -		/* -		 * If target exists and it's a directory, check that whether -		 * it can be destroyed. -		 */ -		if (S_ISDIR(VFS_I(target_ip)->i_mode) && -		    (!xfs_dir_isempty(target_ip) || -		     (VFS_I(target_ip)->i_nlink > 2))) { -			error = -EEXIST; -			goto out_trans_cancel; -		} -	} - -	/*  	 * Lock the AGI buffers we need to handle bumping the nlink of the  	 * whiteout inode off the unlinked list and to handle dropping the  	 * nlink of the target inode.  Per locking order rules, do this in @@ -3378,7 +2362,7 @@ retry:  	 * target_ip is either null or an empty directory.  	 */  	for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { -		if (inodes[i] == wip || +		if (inodes[i] == du_wip.ip ||  		    (inodes[i] == target_ip &&  		     (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {  			struct xfs_perag	*pag; @@ -3393,188 +2377,29 @@ retry:  		}  	} -	/* -	 * Directory entry creation below may acquire the AGF. Remove -	 * the whiteout from the unlinked list first to preserve correct -	 * AGI/AGF locking order. This dirties the transaction so failures -	 * after this point will abort and log recovery will clean up the -	 * mess. -	 * -	 * For whiteouts, we need to bump the link count on the whiteout -	 * inode. After this point, we have a real link, clear the tmpfile -	 * state flag from the inode so it doesn't accidentally get misused -	 * in future. -	 */ -	if (wip) { -		struct xfs_perag	*pag; - -		ASSERT(VFS_I(wip)->i_nlink == 0); - -		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); -		error = xfs_iunlink_remove(tp, pag, wip); -		xfs_perag_put(pag); -		if (error) -			goto out_trans_cancel; - -		xfs_bumplink(tp, wip); -		VFS_I(wip)->i_state &= ~I_LINKABLE; -	} - -	/* -	 * Set up the target. -	 */ -	if (target_ip == NULL) { -		/* -		 * If target does not exist and the rename crosses -		 * directories, adjust the target directory link count -		 * to account for the ".." reference from the new entry. -		 */ -		error = xfs_dir_createname(tp, target_dp, target_name, -					   src_ip->i_ino, spaceres); -		if (error) -			goto out_trans_cancel; - -		xfs_trans_ichgtime(tp, target_dp, -					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - -		if (new_parent && src_is_directory) { -			xfs_bumplink(tp, target_dp); -		} -	} else { /* target_ip != NULL */ -		/* -		 * Link the source inode under the target name. -		 * If the source inode is a directory and we are moving -		 * it across directories, its ".." entry will be -		 * inconsistent until we replace that down below. -		 * -		 * In case there is already an entry with the same -		 * name at the destination directory, remove it first. -		 */ -		error = xfs_dir_replace(tp, target_dp, target_name, -					src_ip->i_ino, spaceres); -		if (error) -			goto out_trans_cancel; - -		xfs_trans_ichgtime(tp, target_dp, -					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - -		/* -		 * Decrement the link count on the target since the target -		 * dir no longer points to it. -		 */ -		error = xfs_droplink(tp, target_ip); -		if (error) -			goto out_trans_cancel; - -		if (src_is_directory) { -			/* -			 * Drop the link from the old "." entry. -			 */ -			error = xfs_droplink(tp, target_ip); -			if (error) -				goto out_trans_cancel; -		} -	} /* target_ip != NULL */ - -	/* -	 * Remove the source. -	 */ -	if (new_parent && src_is_directory) { -		/* -		 * Rewrite the ".." entry to point to the new -		 * directory. -		 */ -		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, -					target_dp->i_ino, spaceres); -		ASSERT(error != -EEXIST); -		if (error) -			goto out_trans_cancel; -	} - -	/* -	 * We always want to hit the ctime on the source inode. -	 * -	 * This isn't strictly required by the standards since the source -	 * inode isn't really being changed, but old unix file systems did -	 * it and some incremental backup programs won't work without it. -	 */ -	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); -	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); - -	/* -	 * Adjust the link count on src_dp.  This is necessary when -	 * renaming a directory, either within one parent when -	 * the target existed, or across two parent directories. -	 */ -	if (src_is_directory && (new_parent || target_ip != NULL)) { - -		/* -		 * Decrement link count on src_directory since the -		 * entry that's moved no longer points to it. -		 */ -		error = xfs_droplink(tp, src_dp); -		if (error) -			goto out_trans_cancel; -	} - -	/* -	 * For whiteouts, we only need to update the source dirent with the -	 * inode number of the whiteout inode rather than removing it -	 * altogether. -	 */ -	if (wip) -		error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, -					spaceres); -	else -		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, -					   spaceres); - +	error = xfs_dir_rename_children(tp, &du_src, &du_tgt, spaceres, +			&du_wip);  	if (error)  		goto out_trans_cancel; -	/* Schedule parent pointer updates. */ -	if (wip_ppargs) { -		error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name, -				wip); -		if (error) -			goto out_trans_cancel; -	} - -	if (src_ppargs) { -		error = xfs_parent_replacename(tp, src_ppargs, src_dp, -				src_name, target_dp, target_name, src_ip); -		if (error) -			goto out_trans_cancel; -	} - -	if (tgt_ppargs) { -		error = xfs_parent_removename(tp, tgt_ppargs, target_dp, -				target_name, target_ip); -		if (error) -			goto out_trans_cancel; +	if (du_wip.ip) { +		/* +		 * Now we have a real link, clear the "I'm a tmpfile" state +		 * flag from the inode so it doesn't accidentally get misused in +		 * future. +		 */ +		VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;  	} -	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); -	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); -	if (new_parent) -		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - +out_commit:  	/* -	 * Inform our hook clients that we've finished a rename operation as -	 * follows: removed the source and target files from their directories; -	 * that we've added the source to the target directory; and finally -	 * that we've added the whiteout, if there was one.  All inodes are -	 * locked, so it's ok to model a rename this way so long as we say we -	 * deleted entries before we add new ones. +	 * If this is a synchronous mount, make sure that the rename +	 * transaction goes to disk before returning to the user.  	 */ -	if (target_ip) -		xfs_dir_update_hook(target_dp, target_ip, -1, target_name); -	xfs_dir_update_hook(src_dp, src_ip, -1, src_name); -	xfs_dir_update_hook(target_dp, src_ip, 1, target_name); -	if (wip) -		xfs_dir_update_hook(src_dp, wip, 1, src_name); +	if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) +		xfs_trans_set_sync(tp); -	error = xfs_finish_rename(tp); +	error = xfs_trans_commit(tp);  	nospace_error = 0;  	goto out_unlock; @@ -3583,14 +2408,14 @@ out_trans_cancel:  out_unlock:  	xfs_iunlock_rename(inodes, num_inodes);  out_tgt_ppargs: -	xfs_parent_finish(mp, tgt_ppargs); +	xfs_parent_finish(mp, du_tgt.ppargs);  out_wip_ppargs: -	xfs_parent_finish(mp, wip_ppargs); +	xfs_parent_finish(mp, du_wip.ppargs);  out_src_ppargs: -	xfs_parent_finish(mp, src_ppargs); +	xfs_parent_finish(mp, du_src.ppargs);  out_release_wip: -	if (wip) -		xfs_irele(wip); +	if (du_wip.ip) +		xfs_irele(du_wip.ip);  	if (error == -ENOSPC && nospace_error)  		error = nospace_error;  	return error; @@ -3730,6 +2555,7 @@ flush_out:  	iip->ili_last_fields = iip->ili_fields;  	iip->ili_fields = 0;  	iip->ili_fsync_fields = 0; +	set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);  	spin_unlock(&iip->ili_lock);  	/* @@ -4293,3 +3119,11 @@ xfs_inode_alloc_unitsize(  	return XFS_FSB_TO_B(ip->i_mount, blocks);  } + +/* Should we always be using copy on write for file writes? */ +bool +xfs_is_always_cow_inode( +	struct xfs_inode	*ip) +{ +	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); +} |