diff options
Diffstat (limited to 'fs/btrfs/inode.c')
| -rw-r--r-- | fs/btrfs/inode.c | 542 | 
1 files changed, 449 insertions, 93 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da5799c56160..4421954720b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -455,7 +455,7 @@ again:  	/*  	 * skip compression for a small file range(<=blocksize) that -	 * isn't an inline extent, since it dosen't save disk space at all. +	 * isn't an inline extent, since it doesn't save disk space at all.  	 */  	if (total_compressed <= blocksize &&  	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -824,6 +824,7 @@ retry:  						async_extent->ram_size - 1, 0);  			goto out_free_reserve;  		} +		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);  		/*  		 * clear dirty, set writeback and unlock the pages. @@ -861,6 +862,7 @@ retry:  	}  	return;  out_free_reserve: +	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);  out_free:  	extent_clear_unlock_delalloc(inode, async_extent->start, @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,  				goto out_drop_extent_cache;  		} +		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); +  		if (disk_num_bytes < cur_alloc_size)  			break; @@ -1066,6 +1070,7 @@ out:  out_drop_extent_cache:  	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);  out_reserve: +	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);  out_unlock:  	extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1377,6 +1382,9 @@ next_slot:  			 */  			if (csum_exist_in_range(root, disk_bytenr, num_bytes))  				goto out_check; +			if (!btrfs_inc_nocow_writers(root->fs_info, +						     disk_bytenr)) +				goto out_check;  			nocow = 1;  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  			extent_end = found_key.offset + @@ -1391,6 +1399,9 @@ out_check:  			path->slots[0]++;  			if (!nolock && nocow)  				btrfs_end_write_no_snapshoting(root); +			if (nocow) +				btrfs_dec_nocow_writers(root->fs_info, +							disk_bytenr);  			goto next_slot;  		}  		if (!nocow) { @@ -1411,6 +1422,9 @@ out_check:  			if (ret) {  				if (!nolock && nocow)  					btrfs_end_write_no_snapshoting(root); +				if (nocow) +					btrfs_dec_nocow_writers(root->fs_info, +								disk_bytenr);  				goto error;  			}  			cow_start = (u64)-1; @@ -1453,6 +1467,8 @@ out_check:  		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,  					       num_bytes, num_bytes, type); +		if (nocow) +			btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);  		BUG_ON(ret); /* -ENOMEM */  		if (root->root_key.objectid == @@ -1962,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,  {  	WARN_ON((end & (PAGE_SIZE - 1)) == 0);  	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, -				   cached_state, GFP_NOFS); +				   cached_state);  }  /* see btrfs_writepage_start_hook for details on why this is required */ @@ -3103,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&  	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { -		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, -				  GFP_NOFS); +		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);  		return 0;  	} @@ -3256,7 +3271,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  	/* grab metadata reservation from transaction handle */  	if (reserve) {  		ret = btrfs_orphan_reserve_metadata(trans, inode); -		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ +		ASSERT(!ret); +		if (ret) { +			atomic_dec(&root->orphan_inodes); +			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +				  &BTRFS_I(inode)->runtime_flags); +			if (insert) +				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +					  &BTRFS_I(inode)->runtime_flags); +			return ret; +		}  	}  	/* insert an orphan item to track this unlinked/truncated file */ @@ -3706,7 +3730,7 @@ cache_index:  	 * and doesn't have an inode ref with the name "bar" anymore.  	 *  	 * Setting last_unlink_trans to last_trans is a pessimistic approach, -	 * but it guarantees correctness at the expense of ocassional full +	 * but it guarantees correctness at the expense of occasional full  	 * transaction commits on fsync if our inode is a directory, or if our  	 * inode is not a directory, logging its parent unnecessarily.  	 */ @@ -4534,6 +4558,7 @@ delete:  			BUG_ON(ret);  			if (btrfs_should_throttle_delayed_refs(trans, root))  				btrfs_async_run_delayed_refs(root, +							     trans->transid,  					trans->delayed_ref_updates * 2, 0);  			if (be_nice) {  				if (truncate_space_check(trans, root, @@ -4962,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  		 * be instantly completed which will give us extents that need  		 * to be truncated.  If we fail to get an orphan inode down we  		 * could have left over extents that were never meant to live, -		 * so we need to garuntee from this point on that everything +		 * so we need to guarantee from this point on that everything  		 * will be consistent.  		 */  		ret = btrfs_orphan_add(trans, inode); @@ -5232,7 +5257,7 @@ void btrfs_evict_inode(struct inode *inode)  		}  		/* -		 * We can't just steal from the global reserve, we need tomake +		 * We can't just steal from the global reserve, we need to make  		 * sure there is room to do it, if not we need to commit and try  		 * again.  		 */ @@ -5733,6 +5758,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)  	int name_len;  	int is_curr = 0;	/* ctx->pos points to the current index? */  	bool emitted; +	bool put = false;  	/* FIXME, use a real flag for deciding about the key type */  	if (root->fs_info->tree_root == root) @@ -5750,7 +5776,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)  	if (key_type == BTRFS_DIR_INDEX_KEY) {  		INIT_LIST_HEAD(&ins_list);  		INIT_LIST_HEAD(&del_list); -		btrfs_get_delayed_items(inode, &ins_list, &del_list); +		put = btrfs_readdir_get_delayed_items(inode, &ins_list, +						      &del_list);  	}  	key.type = key_type; @@ -5897,8 +5924,8 @@ next:  nopos:  	ret = 0;  err: -	if (key_type == BTRFS_DIR_INDEX_KEY) -		btrfs_put_delayed_items(&ins_list, &del_list); +	if (put) +		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);  	btrfs_free_path(path);  	return ret;  } @@ -6964,7 +6991,18 @@ insert:  		 * existing will always be non-NULL, since there must be  		 * extent causing the -EEXIST.  		 */ -		if (start >= extent_map_end(existing) || +		if (existing->start == em->start && +		    extent_map_end(existing) == extent_map_end(em) && +		    em->block_start == existing->block_start) { +			/* +			 * these two extents are the same, it happens +			 * with inlines especially +			 */ +			free_extent_map(em); +			em = existing; +			err = 0; + +		} else if (start >= extent_map_end(existing) ||  		    start <= existing->start) {  			/*  			 * The existing extent map is the one nearest to @@ -7129,6 +7167,43 @@ out:  	return em;  } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, +						  const u64 start, +						  const u64 len, +						  const u64 orig_start, +						  const u64 block_start, +						  const u64 block_len, +						  const u64 orig_block_len, +						  const u64 ram_bytes, +						  const int type) +{ +	struct extent_map *em = NULL; +	int ret; + +	down_read(&BTRFS_I(inode)->dio_sem); +	if (type != BTRFS_ORDERED_NOCOW) { +		em = create_pinned_em(inode, start, len, orig_start, +				      block_start, block_len, orig_block_len, +				      ram_bytes, type); +		if (IS_ERR(em)) +			goto out; +	} +	ret = btrfs_add_ordered_extent_dio(inode, start, block_start, +					   len, block_len, type); +	if (ret) { +		if (em) { +			free_extent_map(em); +			btrfs_drop_extent_cache(inode, start, +						start + len - 1, 0); +		} +		em = ERR_PTR(ret); +	} + out: +	up_read(&BTRFS_I(inode)->dio_sem); + +	return em; +} +  static struct extent_map *btrfs_new_extent_direct(struct inode *inode,  						  u64 start, u64 len)  { @@ -7144,41 +7219,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,  	if (ret)  		return ERR_PTR(ret); -	/* -	 * Create the ordered extent before the extent map. This is to avoid -	 * races with the fast fsync path that would lead to it logging file -	 * extent items that point to disk extents that were not yet written to. -	 * The fast fsync path collects ordered extents into a local list and -	 * then collects all the new extent maps, so we must create the ordered -	 * extent first and make sure the fast fsync path collects any new -	 * ordered extents after collecting new extent maps as well. -	 * The fsync path simply can not rely on inode_dio_wait() because it -	 * causes deadlock with AIO. -	 */ -	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, -					   ins.offset, ins.offset, 0); -	if (ret) { +	em = btrfs_create_dio_extent(inode, start, ins.offset, start, +				     ins.objectid, ins.offset, ins.offset, +				     ins.offset, 0); +	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); +	if (IS_ERR(em))  		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); -		return ERR_PTR(ret); -	} - -	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, -			      ins.offset, ins.offset, ins.offset, 0); -	if (IS_ERR(em)) { -		struct btrfs_ordered_extent *oe; -		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); -		oe = btrfs_lookup_ordered_extent(inode, start); -		ASSERT(oe); -		if (WARN_ON(!oe)) -			return em; -		set_bit(BTRFS_ORDERED_IOERR, &oe->flags); -		set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); -		btrfs_remove_ordered_extent(inode, oe); -		/* Once for our lookup and once for the ordered extents tree. */ -		btrfs_put_ordered_extent(oe); -		btrfs_put_ordered_extent(oe); -	}  	return em;  } @@ -7408,7 +7455,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  				 cached_state);  		/*  		 * We're concerned with the entire range that we're going to be -		 * doing DIO to, so we need to make sure theres no ordered +		 * doing DIO to, so we need to make sure there's no ordered  		 * extents in this range.  		 */  		ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7570,7 +7617,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	if (current->journal_info) {  		/*  		 * Need to pull our outstanding extents and set journal_info to NULL so -		 * that anything that needs to check if there's a transction doesn't get +		 * that anything that needs to check if there's a transaction doesn't get  		 * confused.  		 */  		dio_data = current->journal_info; @@ -7603,7 +7650,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	 * decompress it, so there will be buffering required no matter what we  	 * do, so go ahead and fallback to buffered.  	 * -	 * We return -ENOTBLK because thats what makes DIO go ahead and go back +	 * We return -ENOTBLK because that's what makes DIO go ahead and go back  	 * to buffered IO.  Don't blame me, this is the price we pay for using  	 * the generic code.  	 */ @@ -7650,24 +7697,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  		block_start = em->block_start + (start - em->start);  		if (can_nocow_extent(inode, start, &len, &orig_start, -				     &orig_block_len, &ram_bytes) == 1) { +				     &orig_block_len, &ram_bytes) == 1 && +		    btrfs_inc_nocow_writers(root->fs_info, block_start)) { +			struct extent_map *em2; + +			em2 = btrfs_create_dio_extent(inode, start, len, +						      orig_start, block_start, +						      len, orig_block_len, +						      ram_bytes, type); +			btrfs_dec_nocow_writers(root->fs_info, block_start);  			if (type == BTRFS_ORDERED_PREALLOC) {  				free_extent_map(em); -				em = create_pinned_em(inode, start, len, -						       orig_start, -						       block_start, len, -						       orig_block_len, -						       ram_bytes, type); -				if (IS_ERR(em)) { -					ret = PTR_ERR(em); -					goto unlock_err; -				} +				em = em2;  			} - -			ret = btrfs_add_ordered_extent_dio(inode, start, -					   block_start, len, len, type); -			if (ret) { -				free_extent_map(em); +			if (em2 && IS_ERR(em2)) { +				ret = PTR_ERR(em2);  				goto unlock_err;  			}  			goto unlock; @@ -9019,7 +9063,7 @@ static int btrfs_truncate(struct inode *inode)  		return ret;  	/* -	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have +	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have  	 * 3 things going on here  	 *  	 * 1) We need to reserve space for our orphan item and the space to @@ -9033,15 +9077,15 @@ static int btrfs_truncate(struct inode *inode)  	 * space reserved in case it uses space during the truncate (thank you  	 * very much snapshotting).  	 * -	 * And we need these to all be seperate.  The fact is we can use alot of +	 * And we need these to all be separate.  The fact is we can use a lot of  	 * space doing the truncate, and we have no earthly idea how much space -	 * we will use, so we need the truncate reservation to be seperate so it +	 * we will use, so we need the truncate reservation to be separate so it  	 * doesn't end up using space reserved for updating the inode or  	 * removing the orphan item.  We also need to be able to stop the  	 * transaction and start a new one, which means we need to be able to  	 * update the inode several times, and we have no idea of knowing how  	 * many times that will be, so we can't just reserve 1 item for the -	 * entirety of the opration, so that has to be done seperately as well. +	 * entirety of the operation, so that has to be done separately as well.  	 * Then there is the orphan item, which does indeed need to be held on  	 * to for the whole operation, and we need nobody to touch this reserved  	 * space except the orphan code. @@ -9230,6 +9274,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	INIT_LIST_HEAD(&ei->delalloc_inodes);  	INIT_LIST_HEAD(&ei->delayed_iput);  	RB_CLEAR_NODE(&ei->rb_node); +	init_rwsem(&ei->dio_sem);  	return inode;  } @@ -9387,10 +9432,281 @@ static int btrfs_getattr(struct vfsmount *mnt,  	return 0;  } +static int btrfs_rename_exchange(struct inode *old_dir, +			      struct dentry *old_dentry, +			      struct inode *new_dir, +			      struct dentry *new_dentry) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = BTRFS_I(old_dir)->root; +	struct btrfs_root *dest = BTRFS_I(new_dir)->root; +	struct inode *new_inode = new_dentry->d_inode; +	struct inode *old_inode = old_dentry->d_inode; +	struct timespec ctime = CURRENT_TIME; +	struct dentry *parent; +	u64 old_ino = btrfs_ino(old_inode); +	u64 new_ino = btrfs_ino(new_inode); +	u64 old_idx = 0; +	u64 new_idx = 0; +	u64 root_objectid; +	int ret; +	bool root_log_pinned = false; +	bool dest_log_pinned = false; + +	/* we only allow rename subvolume link between subvolumes */ +	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) +		return -EXDEV; + +	/* close the race window with snapshot create/destroy ioctl */ +	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) +		down_read(&root->fs_info->subvol_sem); +	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) +		down_read(&dest->fs_info->subvol_sem); + +	/* +	 * We want to reserve the absolute worst case amount of items.  So if +	 * both inodes are subvols and we need to unlink them then that would +	 * require 4 item modifications, but if they are both normal inodes it +	 * would require 5 item modifications, so we'll assume their normal +	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items +	 * should cover the worst case number of items we'll modify. +	 */ +	trans = btrfs_start_transaction(root, 12); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out_notrans; +	} + +	/* +	 * We need to find a free sequence number both in the source and +	 * in the destination directory for the exchange. +	 */ +	ret = btrfs_set_inode_index(new_dir, &old_idx); +	if (ret) +		goto out_fail; +	ret = btrfs_set_inode_index(old_dir, &new_idx); +	if (ret) +		goto out_fail; + +	BTRFS_I(old_inode)->dir_index = 0ULL; +	BTRFS_I(new_inode)->dir_index = 0ULL; + +	/* Reference for the source. */ +	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { +		/* force full log commit if subvolume involved. */ +		btrfs_set_log_full_commit(root->fs_info, trans); +	} else { +		btrfs_pin_log_trans(root); +		root_log_pinned = true; +		ret = btrfs_insert_inode_ref(trans, dest, +					     new_dentry->d_name.name, +					     new_dentry->d_name.len, +					     old_ino, +					     btrfs_ino(new_dir), old_idx); +		if (ret) +			goto out_fail; +	} + +	/* And now for the dest. */ +	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { +		/* force full log commit if subvolume involved. */ +		btrfs_set_log_full_commit(dest->fs_info, trans); +	} else { +		btrfs_pin_log_trans(dest); +		dest_log_pinned = true; +		ret = btrfs_insert_inode_ref(trans, root, +					     old_dentry->d_name.name, +					     old_dentry->d_name.len, +					     new_ino, +					     btrfs_ino(old_dir), new_idx); +		if (ret) +			goto out_fail; +	} + +	/* Update inode version and ctime/mtime. */ +	inode_inc_iversion(old_dir); +	inode_inc_iversion(new_dir); +	inode_inc_iversion(old_inode); +	inode_inc_iversion(new_inode); +	old_dir->i_ctime = old_dir->i_mtime = ctime; +	new_dir->i_ctime = new_dir->i_mtime = ctime; +	old_inode->i_ctime = ctime; +	new_inode->i_ctime = ctime; + +	if (old_dentry->d_parent != new_dentry->d_parent) { +		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); +		btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); +	} + +	/* src is a subvolume */ +	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { +		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; +		ret = btrfs_unlink_subvol(trans, root, old_dir, +					  root_objectid, +					  old_dentry->d_name.name, +					  old_dentry->d_name.len); +	} else { /* src is an inode */ +		ret = __btrfs_unlink_inode(trans, root, old_dir, +					   old_dentry->d_inode, +					   old_dentry->d_name.name, +					   old_dentry->d_name.len); +		if (!ret) +			ret = btrfs_update_inode(trans, root, old_inode); +	} +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail; +	} + +	/* dest is a subvolume */ +	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { +		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; +		ret = btrfs_unlink_subvol(trans, dest, new_dir, +					  root_objectid, +					  new_dentry->d_name.name, +					  new_dentry->d_name.len); +	} else { /* dest is an inode */ +		ret = __btrfs_unlink_inode(trans, dest, new_dir, +					   new_dentry->d_inode, +					   new_dentry->d_name.name, +					   new_dentry->d_name.len); +		if (!ret) +			ret = btrfs_update_inode(trans, dest, new_inode); +	} +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail; +	} + +	ret = btrfs_add_link(trans, new_dir, old_inode, +			     new_dentry->d_name.name, +			     new_dentry->d_name.len, 0, old_idx); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail; +	} + +	ret = btrfs_add_link(trans, old_dir, new_inode, +			     old_dentry->d_name.name, +			     old_dentry->d_name.len, 0, new_idx); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail; +	} + +	if (old_inode->i_nlink == 1) +		BTRFS_I(old_inode)->dir_index = old_idx; +	if (new_inode->i_nlink == 1) +		BTRFS_I(new_inode)->dir_index = new_idx; + +	if (root_log_pinned) { +		parent = new_dentry->d_parent; +		btrfs_log_new_name(trans, old_inode, old_dir, parent); +		btrfs_end_log_trans(root); +		root_log_pinned = false; +	} +	if (dest_log_pinned) { +		parent = old_dentry->d_parent; +		btrfs_log_new_name(trans, new_inode, new_dir, parent); +		btrfs_end_log_trans(dest); +		dest_log_pinned = false; +	} +out_fail: +	/* +	 * If we have pinned a log and an error happened, we unpin tasks +	 * trying to sync the log and force them to fallback to a transaction +	 * commit if the log currently contains any of the inodes involved in +	 * this rename operation (to ensure we do not persist a log with an +	 * inconsistent state for any of these inodes or leading to any +	 * inconsistencies when replayed). If the transaction was aborted, the +	 * abortion reason is propagated to userspace when attempting to commit +	 * the transaction. If the log does not contain any of these inodes, we +	 * allow the tasks to sync it. +	 */ +	if (ret && (root_log_pinned || dest_log_pinned)) { +		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || +		    btrfs_inode_in_log(new_dir, root->fs_info->generation) || +		    btrfs_inode_in_log(old_inode, root->fs_info->generation) || +		    (new_inode && +		     btrfs_inode_in_log(new_inode, root->fs_info->generation))) +		    btrfs_set_log_full_commit(root->fs_info, trans); + +		if (root_log_pinned) { +			btrfs_end_log_trans(root); +			root_log_pinned = false; +		} +		if (dest_log_pinned) { +			btrfs_end_log_trans(dest); +			dest_log_pinned = false; +		} +	} +	ret = btrfs_end_transaction(trans, root); +out_notrans: +	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) +		up_read(&dest->fs_info->subvol_sem); +	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) +		up_read(&root->fs_info->subvol_sem); + +	return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, +				     struct btrfs_root *root, +				     struct inode *dir, +				     struct dentry *dentry) +{ +	int ret; +	struct inode *inode; +	u64 objectid; +	u64 index; + +	ret = btrfs_find_free_ino(root, &objectid); +	if (ret) +		return ret; + +	inode = btrfs_new_inode(trans, root, dir, +				dentry->d_name.name, +				dentry->d_name.len, +				btrfs_ino(dir), +				objectid, +				S_IFCHR | WHITEOUT_MODE, +				&index); + +	if (IS_ERR(inode)) { +		ret = PTR_ERR(inode); +		return ret; +	} + +	inode->i_op = &btrfs_special_inode_operations; +	init_special_inode(inode, inode->i_mode, +		WHITEOUT_DEV); + +	ret = btrfs_init_inode_security(trans, inode, dir, +				&dentry->d_name); +	if (ret) +		goto out; + +	ret = btrfs_add_nondir(trans, dir, dentry, +				inode, 0, index); +	if (ret) +		goto out; + +	ret = btrfs_update_inode(trans, root, inode); +out: +	unlock_new_inode(inode); +	if (ret) +		inode_dec_link_count(inode); +	iput(inode); + +	return ret; +} +  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -			   struct inode *new_dir, struct dentry *new_dentry) +			   struct inode *new_dir, struct dentry *new_dentry, +			   unsigned int flags)  {  	struct btrfs_trans_handle *trans; +	unsigned int trans_num_items;  	struct btrfs_root *root = BTRFS_I(old_dir)->root;  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;  	struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9715,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	u64 root_objectid;  	int ret;  	u64 old_ino = btrfs_ino(old_inode); +	bool log_pinned = false;  	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)  		return -EPERM; @@ -9449,15 +9766,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	 * We want to reserve the absolute worst case amount of items.  So if  	 * both inodes are subvols and we need to unlink them then that would  	 * require 4 item modifications, but if they are both normal inodes it -	 * would require 5 item modifications, so we'll assume their normal +	 * would require 5 item modifications, so we'll assume they are normal  	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items  	 * should cover the worst case number of items we'll modify. +	 * If our rename has the whiteout flag, we need more 5 units for the +	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item +	 * when selinux is enabled).  	 */ -	trans = btrfs_start_transaction(root, 11); +	trans_num_items = 11; +	if (flags & RENAME_WHITEOUT) +		trans_num_items += 5; +	trans = btrfs_start_transaction(root, trans_num_items);  	if (IS_ERR(trans)) { -                ret = PTR_ERR(trans); -                goto out_notrans; -        } +		ret = PTR_ERR(trans); +		goto out_notrans; +	}  	if (dest != root)  		btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  		/* force full log commit if subvolume involved. */  		btrfs_set_log_full_commit(root->fs_info, trans);  	} else { +		btrfs_pin_log_trans(root); +		log_pinned = true;  		ret = btrfs_insert_inode_ref(trans, dest,  					     new_dentry->d_name.name,  					     new_dentry->d_name.len, @@ -9478,14 +9803,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  					     btrfs_ino(new_dir), index);  		if (ret)  			goto out_fail; -		/* -		 * this is an ugly little race, but the rename is required -		 * to make sure that if we crash, the inode is either at the -		 * old name or the new one.  pinning the log transaction lets -		 * us make sure we don't allow a log commit to come in after -		 * we unlink the name but before we add the new name back in. -		 */ -		btrfs_pin_log_trans(root);  	}  	inode_inc_iversion(old_dir); @@ -9552,12 +9869,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (old_inode->i_nlink == 1)  		BTRFS_I(old_inode)->dir_index = index; -	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { +	if (log_pinned) {  		struct dentry *parent = new_dentry->d_parent; +  		btrfs_log_new_name(trans, old_inode, old_dir, parent);  		btrfs_end_log_trans(root); +		log_pinned = false; +	} + +	if (flags & RENAME_WHITEOUT) { +		ret = btrfs_whiteout_for_rename(trans, root, old_dir, +						old_dentry); + +		if (ret) { +			btrfs_abort_transaction(trans, root, ret); +			goto out_fail; +		}  	}  out_fail: +	/* +	 * If we have pinned the log and an error happened, we unpin tasks +	 * trying to sync the log and force them to fallback to a transaction +	 * commit if the log currently contains any of the inodes involved in +	 * this rename operation (to ensure we do not persist a log with an +	 * inconsistent state for any of these inodes or leading to any +	 * inconsistencies when replayed). If the transaction was aborted, the +	 * abortion reason is propagated to userspace when attempting to commit +	 * the transaction. If the log does not contain any of these inodes, we +	 * allow the tasks to sync it. +	 */ +	if (ret && log_pinned) { +		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || +		    btrfs_inode_in_log(new_dir, root->fs_info->generation) || +		    btrfs_inode_in_log(old_inode, root->fs_info->generation) || +		    (new_inode && +		     btrfs_inode_in_log(new_inode, root->fs_info->generation))) +		    btrfs_set_log_full_commit(root->fs_info, trans); + +		btrfs_end_log_trans(root); +		log_pinned = false; +	}  	btrfs_end_transaction(trans, root);  out_notrans:  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9921,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,  			 struct inode *new_dir, struct dentry *new_dentry,  			 unsigned int flags)  { -	if (flags & ~RENAME_NOREPLACE) +	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))  		return -EINVAL; -	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +	if (flags & RENAME_EXCHANGE) +		return btrfs_rename_exchange(old_dir, old_dentry, new_dir, +					  new_dentry); + +	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);  }  static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9942,6 +10297,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  				btrfs_end_transaction(trans, root);  			break;  		} +		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);  		last_alloc = ins.offset;  		ret = insert_reserved_file_extent(trans, inode, @@ -10160,10 +10516,10 @@ static const struct inode_operations btrfs_dir_inode_operations = {  	.symlink	= btrfs_symlink,  	.setattr	= btrfs_setattr,  	.mknod		= btrfs_mknod, -	.setxattr	= btrfs_setxattr, +	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr	= btrfs_listxattr, -	.removexattr	= btrfs_removexattr, +	.removexattr	= generic_removexattr,  	.permission	= btrfs_permission,  	.get_acl	= btrfs_get_acl,  	.set_acl	= btrfs_set_acl, @@ -10184,7 +10540,7 @@ static const struct file_operations btrfs_dir_file_operations = {  	.iterate_shared	= btrfs_real_readdir,  	.unlocked_ioctl	= btrfs_ioctl,  #ifdef CONFIG_COMPAT -	.compat_ioctl	= btrfs_ioctl, +	.compat_ioctl	= btrfs_compat_ioctl,  #endif  	.release        = btrfs_release_file,  	.fsync		= btrfs_sync_file, @@ -10237,10 +10593,10 @@ static const struct address_space_operations btrfs_symlink_aops = {  static const struct inode_operations btrfs_file_inode_operations = {  	.getattr	= btrfs_getattr,  	.setattr	= btrfs_setattr, -	.setxattr	= btrfs_setxattr, +	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr      = btrfs_listxattr, -	.removexattr	= btrfs_removexattr, +	.removexattr	= generic_removexattr,  	.permission	= btrfs_permission,  	.fiemap		= btrfs_fiemap,  	.get_acl	= btrfs_get_acl, @@ -10251,10 +10607,10 @@ static const struct inode_operations btrfs_special_inode_operations = {  	.getattr	= btrfs_getattr,  	.setattr	= btrfs_setattr,  	.permission	= btrfs_permission, -	.setxattr	= btrfs_setxattr, +	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr	= btrfs_listxattr, -	.removexattr	= btrfs_removexattr, +	.removexattr	= generic_removexattr,  	.get_acl	= btrfs_get_acl,  	.set_acl	= btrfs_set_acl,  	.update_time	= btrfs_update_time, @@ -10265,10 +10621,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = {  	.getattr	= btrfs_getattr,  	.setattr	= btrfs_setattr,  	.permission	= btrfs_permission, -	.setxattr	= btrfs_setxattr, +	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr	= btrfs_listxattr, -	.removexattr	= btrfs_removexattr, +	.removexattr	= generic_removexattr,  	.update_time	= btrfs_update_time,  };  |