diff options
Diffstat (limited to 'fs/ext4/inode.c')
| -rw-r--r-- | fs/ext4/inode.c | 492 | 
1 files changed, 337 insertions, 155 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e02..986120f30066 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@  #include <linux/uio.h>  #include <linux/bio.h>  #include <linux/workqueue.h> +#include <linux/kernel.h>  #include "ext4_jbd2.h"  #include "xattr.h" @@ -170,6 +171,9 @@ void ext4_delete_inode(struct inode *inode)  	handle_t *handle;  	int err; +	if (!is_bad_inode(inode)) +		dquot_initialize(inode); +  	if (ext4_should_order_data(inode))  		ext4_begin_ordered_truncate(inode, 0);  	truncate_inode_pages(&inode->i_data, 0); @@ -194,7 +198,7 @@ void ext4_delete_inode(struct inode *inode)  	inode->i_size = 0;  	err = ext4_mark_inode_dirty(handle, inode);  	if (err) { -		ext4_warning(inode->i_sb, __func__, +		ext4_warning(inode->i_sb,  			     "couldn't mark inode dirty (err %d)", err);  		goto stop_handle;  	} @@ -212,7 +216,7 @@ void ext4_delete_inode(struct inode *inode)  		if (err > 0)  			err = ext4_journal_restart(handle, 3);  		if (err != 0) { -			ext4_warning(inode->i_sb, __func__, +			ext4_warning(inode->i_sb,  				     "couldn't extend journal (err %d)", err);  		stop_handle:  			ext4_journal_stop(handle); @@ -323,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode,  		offsets[n++] = i_block & (ptrs - 1);  		final = ptrs;  	} else { -		ext4_warning(inode->i_sb, "ext4_block_to_path", -			     "block %lu > max in inode %lu", +		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",  			     i_block + direct_blocks +  			     indirect_blocks + double_blocks, inode->i_ino);  	} @@ -344,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,  		if (blk &&  		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),  						    blk, 1))) { -			ext4_error(inode->i_sb, function, +			__ext4_error(inode->i_sb, function,  				   "invalid block reference %u "  				   "in inode #%lu", blk, inode->i_ino);  			return -EIO; @@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,  		if (*err)  			goto failed_out; -		BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); +		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { +			EXT4_ERROR_INODE(inode, +					 "current_block %llu + count %lu > %d!", +					 current_block, count, +					 EXT4_MAX_BLOCK_FILE_PHYS); +			*err = -EIO; +			goto failed_out; +		}  		target -= count;  		/* allocate blocks for indirect blocks */ @@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,  		ar.flags = EXT4_MB_HINT_DATA;  	current_block = ext4_mb_new_blocks(handle, &ar, err); -	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); +	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { +		EXT4_ERROR_INODE(inode, +				 "current_block %llu + ar.len %d > %d!", +				 current_block, ar.len, +				 EXT4_MAX_BLOCK_FILE_PHYS); +		*err = -EIO; +		goto failed_out; +	}  	if (*err && (target == blks)) {  		/* @@ -1061,6 +1078,7 @@ void ext4_da_update_reserve_space(struct inode *inode,  	int mdb_free = 0, allocated_meta_blocks = 0;  	spin_lock(&ei->i_block_reservation_lock); +	trace_ext4_da_update_reserve_space(inode, used);  	if (unlikely(used > ei->i_reserved_data_blocks)) {  		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "  			 "with only %d reserved data blocks\n", @@ -1093,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode,  	/* Update quota subsystem */  	if (quota_claim) { -		vfs_dq_claim_block(inode, used); +		dquot_claim_block(inode, used);  		if (mdb_free) -			vfs_dq_release_reservation_block(inode, mdb_free); +			dquot_release_reservation_block(inode, mdb_free);  	} else {  		/*  		 * We did fallocate with an offset that is already delayed @@ -1106,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode,  		 * that  		 */  		if (allocated_meta_blocks) -			vfs_dq_claim_block(inode, allocated_meta_blocks); -		vfs_dq_release_reservation_block(inode, mdb_free + used); +			dquot_claim_block(inode, allocated_meta_blocks); +		dquot_release_reservation_block(inode, mdb_free + used);  	}  	/* @@ -1124,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg,  				sector_t logical, sector_t phys, int len)  {  	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { -		ext4_error(inode->i_sb, msg, +		__ext4_error(inode->i_sb, msg,  			   "inode #%lu logical block %llu mapped to %llu "  			   "(size %d)", inode->i_ino,  			   (unsigned long long) logical, @@ -1306,7 +1324,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,  			 * i_data's format changing.  Force the migrate  			 * to fail by clearing migrate flags  			 */ -			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; +			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);  		}  		/* @@ -1534,6 +1552,8 @@ static void ext4_truncate_failed_write(struct inode *inode)  	ext4_truncate(inode);  } +static int ext4_get_block_write(struct inode *inode, sector_t iblock, +		   struct buffer_head *bh_result, int create);  static int ext4_write_begin(struct file *file, struct address_space *mapping,  			    loff_t pos, unsigned len, unsigned flags,  			    struct page **pagep, void **fsdata) @@ -1575,8 +1595,12 @@ retry:  	}  	*pagep = page; -	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, -				ext4_get_block); +	if (ext4_should_dioread_nolock(inode)) +		ret = block_write_begin(file, mapping, pos, len, flags, pagep, +				fsdata, ext4_get_block_write); +	else +		ret = block_write_begin(file, mapping, pos, len, flags, pagep, +				fsdata, ext4_get_block);  	if (!ret && ext4_should_journal_data(inode)) {  		ret = walk_page_buffers(handle, page_buffers(page), @@ -1793,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file,  	new_i_size = pos + copied;  	if (new_i_size > inode->i_size)  		i_size_write(inode, pos+copied); -	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; +	ext4_set_inode_state(inode, EXT4_STATE_JDATA);  	if (new_i_size > EXT4_I(inode)->i_disksize) {  		ext4_update_i_disksize(inode, new_i_size);  		ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1836,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	struct ext4_inode_info *ei = EXT4_I(inode);  	unsigned long md_needed, md_reserved; +	int ret;  	/*  	 * recalculate the amount of metadata blocks to reserve @@ -1846,6 +1871,7 @@ repeat:  	spin_lock(&ei->i_block_reservation_lock);  	md_reserved = ei->i_reserved_meta_blocks;  	md_needed = ext4_calc_metadata_amount(inode, lblock); +	trace_ext4_da_reserve_space(inode, md_needed);  	spin_unlock(&ei->i_block_reservation_lock);  	/* @@ -1853,11 +1879,12 @@ repeat:  	 * later. Real quota accounting is done at pages writeout  	 * time.  	 */ -	if (vfs_dq_reserve_block(inode, md_needed + 1)) -		return -EDQUOT; +	ret = dquot_reserve_block(inode, md_needed + 1); +	if (ret) +		return ret;  	if (ext4_claim_free_blocks(sbi, md_needed + 1)) { -		vfs_dq_release_reservation_block(inode, md_needed + 1); +		dquot_release_reservation_block(inode, md_needed + 1);  		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {  			yield();  			goto repeat; @@ -1914,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)  	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -	vfs_dq_release_reservation_block(inode, to_free); +	dquot_release_reservation_block(inode, to_free);  }  static void ext4_da_page_release_reservation(struct page *page, @@ -2091,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,  				} else if (buffer_mapped(bh))  					BUG_ON(bh->b_blocknr != pblock); +				if (buffer_uninit(exbh)) +					set_buffer_uninit(bh);  				cur_logical++;  				pblock++;  			} while ((bh = bh->b_this_page) != head); @@ -2133,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,  			break;  		for (i = 0; i < nr_pages; i++) {  			struct page *page = pvec.pages[i]; -			index = page->index; -			if (index > end) +			if (page->index > end)  				break; -			index++; -  			BUG_ON(!PageLocked(page));  			BUG_ON(PageWriteback(page));  			block_invalidatepage(page, 0);  			ClearPageUptodate(page);  			unlock_page(page);  		} +		index = pvec.pages[nr_pages - 1]->index + 1; +		pagevec_release(&pvec);  	}  	return;  } @@ -2220,6 +2248,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)  	 */  	new.b_state = 0;  	get_blocks_flags = EXT4_GET_BLOCKS_CREATE; +	if (ext4_should_dioread_nolock(mpd->inode)) +		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;  	if (mpd->b_state & (1 << BH_Delay))  		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2630,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page,  		ret = err;  	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); -	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; +	ext4_set_inode_state(inode, EXT4_STATE_JDATA);  out:  	return ret;  } +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +  /*   * Note that we don't need to start a transaction unless we're journaling data   * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2682,7 +2715,7 @@ static int ext4_writepage(struct page *page,  	int ret = 0;  	loff_t size;  	unsigned int len; -	struct buffer_head *page_bufs; +	struct buffer_head *page_bufs = NULL;  	struct inode *inode = page->mapping->host;  	trace_ext4_writepage(inode, page); @@ -2758,7 +2791,11 @@ static int ext4_writepage(struct page *page,  	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))  		ret = nobh_writepage(page, noalloc_get_block_write, wbc); -	else +	else if (page_bufs && buffer_uninit(page_bufs)) { +		ext4_set_bh_endio(page_bufs, inode); +		ret = block_write_full_page_endio(page, noalloc_get_block_write, +					    wbc, ext4_end_io_buffer_write); +	} else  		ret = block_write_full_page(page, noalloc_get_block_write,  					    wbc); @@ -3301,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)  		filemap_write_and_wait(mapping);  	} -	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { +	if (EXT4_JOURNAL(inode) && +	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {  		/*  		 * This is a REALLY heavyweight approach, but the use of  		 * bmap on dirty files is expected to be extremely rare: @@ -3320,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)  		 * everything they get.  		 */ -		EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; +		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);  		journal = EXT4_JOURNAL(inode);  		jbd2_journal_lock_updates(journal);  		err = jbd2_journal_flush(journal); @@ -3345,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,  	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);  } +static void ext4_free_io_end(ext4_io_end_t *io) +{ +	BUG_ON(!io); +	if (io->page) +		put_page(io->page); +	iput(io->inode); +	kfree(io); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ +	struct buffer_head *head, *bh; +	unsigned int curr_off = 0; + +	if (!page_has_buffers(page)) +		return; +	head = bh = page_buffers(page); +	do { +		if (offset <= curr_off && test_clear_buffer_uninit(bh) +					&& bh->b_private) { +			ext4_free_io_end(bh->b_private); +			bh->b_private = NULL; +			bh->b_end_io = NULL; +		} +		curr_off = curr_off + bh->b_size; +		bh = bh->b_this_page; +	} while (bh != head); +} +  static void ext4_invalidatepage(struct page *page, unsigned long offset)  {  	journal_t *journal = EXT4_JOURNAL(page->mapping->host);  	/* +	 * free any io_end structure allocated for buffers to be discarded +	 */ +	if (ext4_should_dioread_nolock(page->mapping->host)) +		ext4_invalidatepage_free_endio(page, offset); +	/*  	 * If it's a full truncate we just forget about the pending dirtying  	 */  	if (offset == 0) @@ -3420,7 +3492,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  	}  retry: -	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, +	if (rw == READ && ext4_should_dioread_nolock(inode)) +		ret = blockdev_direct_IO_no_locking(rw, iocb, inode, +				 inode->i_sb->s_bdev, iov, +				 offset, nr_segs, +				 ext4_get_block, NULL); +	else +		ret = blockdev_direct_IO(rw, iocb, inode, +				 inode->i_sb->s_bdev, iov,  				 offset, nr_segs,  				 ext4_get_block, NULL);  	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3436,6 +3515,9 @@ retry:  			 * but cannot extend i_size. Bail out and pretend  			 * the write failed... */  			ret = PTR_ERR(handle); +			if (inode->i_nlink) +				ext4_orphan_del(NULL, inode); +  			goto out;  		}  		if (inode->i_nlink) @@ -3463,75 +3545,63 @@ out:  	return ret;  } -static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock,  		   struct buffer_head *bh_result, int create)  { -	handle_t *handle = NULL; +	handle_t *handle = ext4_journal_current_handle();  	int ret = 0;  	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;  	int dio_credits; +	int started = 0; -	ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", +	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",  		   inode->i_ino, create);  	/* -	 * DIO VFS code passes create = 0 flag for write to -	 * the middle of file. It does this to avoid block -	 * allocation for holes, to prevent expose stale data -	 * out when there is parallel buffered read (which does -	 * not hold the i_mutex lock) while direct IO write has -	 * not completed. DIO request on holes finally falls back -	 * to buffered IO for this reason. -	 * -	 * For ext4 extent based file, since we support fallocate, -	 * new allocated extent as uninitialized, for holes, we -	 * could fallocate blocks for holes, thus parallel -	 * buffered IO read will zero out the page when read on -	 * a hole while parallel DIO write to the hole has not completed. -	 * -	 * when we come here, we know it's a direct IO write to -	 * to the middle of file (<i_size) -	 * so it's safe to override the create flag from VFS. +	 * ext4_get_block in prepare for a DIO write or buffer write. +	 * We allocate an uinitialized extent if blocks haven't been allocated. +	 * The extent will be converted to initialized after IO complete.  	 */ -	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; +	create = EXT4_GET_BLOCKS_IO_CREATE_EXT; -	if (max_blocks > DIO_MAX_BLOCKS) -		max_blocks = DIO_MAX_BLOCKS; -	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); -	handle = ext4_journal_start(inode, dio_credits); -	if (IS_ERR(handle)) { -		ret = PTR_ERR(handle); -		goto out; +	if (!handle) { +		if (max_blocks > DIO_MAX_BLOCKS) +			max_blocks = DIO_MAX_BLOCKS; +		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); +		handle = ext4_journal_start(inode, dio_credits); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			goto out; +		} +		started = 1;  	} +  	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,  			      create);  	if (ret > 0) {  		bh_result->b_size = (ret << inode->i_blkbits);  		ret = 0;  	} -	ext4_journal_stop(handle); +	if (started) +		ext4_journal_stop(handle);  out:  	return ret;  } -static void ext4_free_io_end(ext4_io_end_t *io) -{ -	BUG_ON(!io); -	iput(io->inode); -	kfree(io); -} -static void dump_aio_dio_list(struct inode * inode) +static void dump_completed_IO(struct inode * inode)  {  #ifdef	EXT4_DEBUG  	struct list_head *cur, *before, *after;  	ext4_io_end_t *io, *io0, *io1; +	unsigned long flags; -	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ -		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); +	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ +		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);  		return;  	} -	ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); -	list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ +	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); +	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); +	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){  		cur = &io->list;  		before = cur->prev;  		io0 = container_of(before, ext4_io_end_t, list); @@ -3541,32 +3611,31 @@ static void dump_aio_dio_list(struct inode * inode)  		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",  			    io, inode->i_ino, io0, io1);  	} +	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);  #endif  }  /*   * check a range of space and convert unwritten extents to written.   */ -static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +static int ext4_end_io_nolock(ext4_io_end_t *io)  {  	struct inode *inode = io->inode;  	loff_t offset = io->offset; -	size_t size = io->size; +	ssize_t size = io->size;  	int ret = 0; -	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," +	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"  		   "list->prev 0x%p\n",  	           io, inode->i_ino, io->list.next, io->list.prev);  	if (list_empty(&io->list))  		return ret; -	if (io->flag != DIO_AIO_UNWRITTEN) +	if (io->flag != EXT4_IO_UNWRITTEN)  		return ret; -	if (offset + size <= i_size_read(inode)) -		ret = ext4_convert_unwritten_extents(inode, offset, size); - +	ret = ext4_convert_unwritten_extents(inode, offset, size);  	if (ret < 0) {  		printk(KERN_EMERG "%s: failed to convert unwritten"  			"extents to written extents, error is %d" @@ -3579,50 +3648,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)  	io->flag = 0;  	return ret;  } +  /*   * work on completed aio dio IO, to convert unwritten extents to extents   */ -static void ext4_end_aio_dio_work(struct work_struct *work) +static void ext4_end_io_work(struct work_struct *work)  { -	ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work); -	struct inode *inode = io->inode; -	int ret = 0; +	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work); +	struct inode		*inode = io->inode; +	struct ext4_inode_info	*ei = EXT4_I(inode); +	unsigned long		flags; +	int			ret;  	mutex_lock(&inode->i_mutex); -	ret = ext4_end_aio_dio_nolock(io); -	if (ret >= 0) { -		if (!list_empty(&io->list)) -			list_del_init(&io->list); -		ext4_free_io_end(io); +	ret = ext4_end_io_nolock(io); +	if (ret < 0) { +		mutex_unlock(&inode->i_mutex); +		return;  	} + +	spin_lock_irqsave(&ei->i_completed_io_lock, flags); +	if (!list_empty(&io->list)) +		list_del_init(&io->list); +	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  	mutex_unlock(&inode->i_mutex); +	ext4_free_io_end(io);  } +  /*   * This function is called from ext4_sync_file().   * - * When AIO DIO IO is completed, the work to convert unwritten - * extents to written is queued on workqueue but may not get immediately + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately   * scheduled. When fsync is called, we need to ensure the   * conversion is complete before fsync returns. - * The inode keeps track of a list of completed AIO from DIO path - * that might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents to written. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success.   */ -int flush_aio_dio_completed_IO(struct inode *inode) +int flush_completed_IO(struct inode *inode)  {  	ext4_io_end_t *io; +	struct ext4_inode_info *ei = EXT4_I(inode); +	unsigned long flags;  	int ret = 0;  	int ret2 = 0; -	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) +	if (list_empty(&ei->i_completed_io_list))  		return ret; -	dump_aio_dio_list(inode); -	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ -		io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, +	dump_completed_IO(inode); +	spin_lock_irqsave(&ei->i_completed_io_lock, flags); +	while (!list_empty(&ei->i_completed_io_list)){ +		io = list_entry(ei->i_completed_io_list.next,  				ext4_io_end_t, list);  		/* -		 * Calling ext4_end_aio_dio_nolock() to convert completed +		 * Calling ext4_end_io_nolock() to convert completed  		 * IO to written.  		 *  		 * When ext4_sync_file() is called, run_queue() may already @@ -3635,20 +3718,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)  		 * avoid double converting from both fsync and background work  		 * queue work.  		 */ -		ret = ext4_end_aio_dio_nolock(io); +		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +		ret = ext4_end_io_nolock(io); +		spin_lock_irqsave(&ei->i_completed_io_lock, flags);  		if (ret < 0)  			ret2 = ret;  		else  			list_del_init(&io->list);  	} +	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  	return (ret2 < 0) ? ret2 : 0;  } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)  {  	ext4_io_end_t *io = NULL; -	io = kmalloc(sizeof(*io), GFP_NOFS); +	io = kmalloc(sizeof(*io), flags);  	if (io) {  		igrab(inode); @@ -3656,8 +3742,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)  		io->flag = 0;  		io->offset = 0;  		io->size = 0; -		io->error = 0; -		INIT_WORK(&io->work, ext4_end_aio_dio_work); +		io->page = NULL; +		INIT_WORK(&io->work, ext4_end_io_work);  		INIT_LIST_HEAD(&io->list);  	} @@ -3669,6 +3755,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  {          ext4_io_end_t *io_end = iocb->private;  	struct workqueue_struct *wq; +	unsigned long flags; +	struct ext4_inode_info *ei;  	/* if not async direct IO or dio with 0 bytes write, just return */  	if (!io_end || !size) @@ -3680,7 +3768,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  		  size);  	/* if not aio dio with unwritten extents, just free io and return */ -	if (io_end->flag != DIO_AIO_UNWRITTEN){ +	if (io_end->flag != EXT4_IO_UNWRITTEN){  		ext4_free_io_end(io_end);  		iocb->private = NULL;  		return; @@ -3688,16 +3776,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  	io_end->offset = offset;  	io_end->size = size; +	io_end->flag = EXT4_IO_UNWRITTEN;  	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;  	/* queue the work to convert unwritten extents to written */  	queue_work(wq, &io_end->work);  	/* Add the io_end to per-inode completed aio dio list*/ -	list_add_tail(&io_end->list, -		 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); +	ei = EXT4_I(io_end->inode); +	spin_lock_irqsave(&ei->i_completed_io_lock, flags); +	list_add_tail(&io_end->list, &ei->i_completed_io_list); +	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  	iocb->private = NULL;  } + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ +	ext4_io_end_t *io_end = bh->b_private; +	struct workqueue_struct *wq; +	struct inode *inode; +	unsigned long flags; + +	if (!test_clear_buffer_uninit(bh) || !io_end) +		goto out; + +	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { +		printk("sb umounted, discard end_io request for inode %lu\n", +			io_end->inode->i_ino); +		ext4_free_io_end(io_end); +		goto out; +	} + +	io_end->flag = EXT4_IO_UNWRITTEN; +	inode = io_end->inode; + +	/* Add the io_end to per-inode completed io list*/ +	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); +	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); +	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + +	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; +	/* queue the work to convert unwritten extents to written */ +	queue_work(wq, &io_end->work); +out: +	bh->b_private = NULL; +	bh->b_end_io = NULL; +	clear_buffer_uninit(bh); +	end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ +	ext4_io_end_t *io_end; +	struct page *page = bh->b_page; +	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; +	size_t size = bh->b_size; + +retry: +	io_end = ext4_init_io_end(inode, GFP_ATOMIC); +	if (!io_end) { +		if (printk_ratelimit()) +			printk(KERN_WARNING "%s: allocation fail\n", __func__); +		schedule(); +		goto retry; +	} +	io_end->offset = offset; +	io_end->size = size; +	/* +	 * We need to hold a reference to the page to make sure it +	 * doesn't get evicted before ext4_end_io_work() has a chance +	 * to convert the extent from written to unwritten. +	 */ +	io_end->page = page; +	get_page(io_end->page); + +	bh->b_private = io_end; +	bh->b_end_io = ext4_end_io_buffer_write; +	return 0; +} +  /*   * For ext4 extent files, ext4 will do direct-io write to holes,   * preallocated extents, and those write extend the file, no need to @@ -3751,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  		iocb->private = NULL;  		EXT4_I(inode)->cur_aio_dio = NULL;  		if (!is_sync_kiocb(iocb)) { -			iocb->private = ext4_init_io_end(inode); +			iocb->private = ext4_init_io_end(inode, GFP_NOFS);  			if (!iocb->private)  				return -ENOMEM;  			/* @@ -3767,7 +3924,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  		ret = blockdev_direct_IO(rw, iocb, inode,  					 inode->i_sb->s_bdev, iov,  					 offset, nr_segs, -					 ext4_get_block_dio_write, +					 ext4_get_block_write,  					 ext4_end_io_dio);  		if (iocb->private)  			EXT4_I(inode)->cur_aio_dio = NULL; @@ -3788,8 +3945,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {  			ext4_free_io_end(iocb->private);  			iocb->private = NULL; -		} else if (ret > 0 && (EXT4_I(inode)->i_state & -				       EXT4_STATE_DIO_UNWRITTEN)) { +		} else if (ret > 0 && ext4_test_inode_state(inode, +						EXT4_STATE_DIO_UNWRITTEN)) {  			int err;  			/*  			 * for non AIO case, since the IO is already @@ -3799,7 +3956,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  							     offset, ret);  			if (err < 0)  				ret = err; -			EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; +			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);  		}  		return ret;  	} @@ -4130,18 +4287,27 @@ no_top:   * We release `count' blocks on disk, but (last - first) may be greater   * than `count' because there can be holes in there.   */ -static void ext4_clear_blocks(handle_t *handle, struct inode *inode, -			      struct buffer_head *bh, -			      ext4_fsblk_t block_to_free, -			      unsigned long count, __le32 *first, -			      __le32 *last) +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, +			     struct buffer_head *bh, +			     ext4_fsblk_t block_to_free, +			     unsigned long count, __le32 *first, +			     __le32 *last)  {  	__le32 *p; -	int	flags = EXT4_FREE_BLOCKS_FORGET; +	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;  	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))  		flags |= EXT4_FREE_BLOCKS_METADATA; +	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, +				   count)) { +		ext4_error(inode->i_sb, "inode #%lu: " +			   "attempt to clear blocks %llu len %lu, invalid", +			   inode->i_ino, (unsigned long long) block_to_free, +			   count); +		return 1; +	} +  	if (try_to_extend_transaction(handle, inode)) {  		if (bh) {  			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4160,6 +4326,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,  		*p = 0;  	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); +	return 0;  }  /** @@ -4215,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,  			} else if (nr == block_to_free + count) {  				count++;  			} else { -				ext4_clear_blocks(handle, inode, this_bh, -						  block_to_free, -						  count, block_to_free_p, p); +				if (ext4_clear_blocks(handle, inode, this_bh, +						      block_to_free, count, +						      block_to_free_p, p)) +					break;  				block_to_free = nr;  				block_to_free_p = p;  				count = 1; @@ -4241,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,  		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))  			ext4_handle_dirty_metadata(handle, inode, this_bh);  		else -			ext4_error(inode->i_sb, __func__, +			ext4_error(inode->i_sb,  				   "circular indirect block detected, "  				   "inode=%lu, block=%llu",  				   inode->i_ino, @@ -4281,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,  			if (!nr)  				continue;		/* A hole */ +			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), +						   nr, 1)) { +				ext4_error(inode->i_sb, +					   "indirect mapped block in inode " +					   "#%lu invalid (level %d, blk #%lu)", +					   inode->i_ino, depth, +					   (unsigned long) nr); +				break; +			} +  			/* Go read the buffer for the next level down */  			bh = sb_bread(inode->i_sb, nr); @@ -4289,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,  			 * (should be rare).  			 */  			if (!bh) { -				ext4_error(inode->i_sb, "ext4_free_branches", +				ext4_error(inode->i_sb,  					   "Read failure, inode=%lu, block=%llu",  					   inode->i_ino, nr);  				continue; @@ -4433,8 +4611,10 @@ void ext4_truncate(struct inode *inode)  	if (!ext4_can_truncate(inode))  		return; +	EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; +  	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) -		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; +		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);  	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {  		ext4_ext_truncate(inode); @@ -4604,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode,  	bh = sb_getblk(sb, block);  	if (!bh) { -		ext4_error(sb, "ext4_get_inode_loc", "unable to read " -			   "inode block - inode=%lu, block=%llu", -			   inode->i_ino, block); +		ext4_error(sb, "unable to read inode block - " +			   "inode=%lu, block=%llu", inode->i_ino, block);  		return -EIO;  	}  	if (!buffer_uptodate(bh)) { @@ -4704,9 +4883,8 @@ make_io:  		submit_bh(READ_META, bh);  		wait_on_buffer(bh);  		if (!buffer_uptodate(bh)) { -			ext4_error(sb, __func__, -				   "unable to read inode block - inode=%lu, " -				   "block=%llu", inode->i_ino, block); +			ext4_error(sb, "unable to read inode block - inode=%lu," +				   " block=%llu", inode->i_ino, block);  			brelse(bh);  			return -EIO;  		} @@ -4720,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)  {  	/* We have all inode data except xattrs in memory here. */  	return __ext4_get_inode_loc(inode, iloc, -		!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); +		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));  }  void ext4_set_inode_flags(struct inode *inode) @@ -4814,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  	}  	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); -	ei->i_state = 0; +	ei->i_state_flags = 0;  	ei->i_dir_start_lookup = 0;  	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);  	/* We now have enough fields to check if the inode was active or not. @@ -4897,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  					EXT4_GOOD_OLD_INODE_SIZE +  					ei->i_extra_isize;  			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) -				ei->i_state |= EXT4_STATE_XATTR; +				ext4_set_inode_state(inode, EXT4_STATE_XATTR);  		}  	} else  		ei->i_extra_isize = 0; @@ -4917,8 +5095,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  	ret = 0;  	if (ei->i_file_acl &&  	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { -		ext4_error(sb, __func__, -			   "bad extended attribute block %llu in inode #%lu", +		ext4_error(sb, "bad extended attribute block %llu inode #%lu",  			   ei->i_file_acl, inode->i_ino);  		ret = -EIO;  		goto bad_inode; @@ -4964,8 +5141,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));  	} else {  		ret = -EIO; -		ext4_error(inode->i_sb, __func__, -			   "bogus i_mode (%o) for inode=%lu", +		ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",  			   inode->i_mode, inode->i_ino);  		goto bad_inode;  	} @@ -5037,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle,  	/* For fields not not tracking in the in-memory inode,  	 * initialise them to zero for new inodes. */ -	if (ei->i_state & EXT4_STATE_NEW) +	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))  		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);  	ext4_get_inode_flags(ei); @@ -5101,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle,  					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);  			sb->s_dirt = 1;  			ext4_handle_sync(handle); -			err = ext4_handle_dirty_metadata(handle, inode, +			err = ext4_handle_dirty_metadata(handle, NULL,  					EXT4_SB(sb)->s_sbh);  		}  	} @@ -5130,10 +5306,10 @@ static int ext4_do_update_inode(handle_t *handle,  	}  	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); -	rc = ext4_handle_dirty_metadata(handle, inode, bh); +	rc = ext4_handle_dirty_metadata(handle, NULL, bh);  	if (!err)  		err = rc; -	ei->i_state &= ~EXT4_STATE_NEW; +	ext4_clear_inode_state(inode, EXT4_STATE_NEW);  	ext4_update_inode_fsync_trans(handle, inode, 0);  out_brelse: @@ -5177,7 +5353,7 @@ out_brelse:   * `stuff()' is running, and the new i_size will be lost.  Plus the inode   * will no longer be on the superblock's dirty inode list.   */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)  {  	int err; @@ -5191,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait)  			return -EIO;  		} -		if (!wait) +		if (wbc->sync_mode != WB_SYNC_ALL)  			return 0;  		err = ext4_force_commit(inode->i_sb); @@ -5201,13 +5377,11 @@ int ext4_write_inode(struct inode *inode, int wait)  		err = ext4_get_inode_loc(inode, &iloc);  		if (err)  			return err; -		if (wait) +		if (wbc->sync_mode == WB_SYNC_ALL)  			sync_dirty_buffer(iloc.bh);  		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { -			ext4_error(inode->i_sb, __func__, -				   "IO error syncing inode, " -				   "inode=%lu, block=%llu", -				   inode->i_ino, +			ext4_error(inode->i_sb, "IO error syncing inode, " +				   "inode=%lu, block=%llu", inode->i_ino,  				   (unsigned long long)iloc.bh->b_blocknr);  			err = -EIO;  		} @@ -5249,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  	if (error)  		return error; +	if (ia_valid & ATTR_SIZE) +		dquot_initialize(inode);  	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||  		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {  		handle_t *handle; @@ -5261,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  			error = PTR_ERR(handle);  			goto err_out;  		} -		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; +		error = dquot_transfer(inode, attr);  		if (error) {  			ext4_journal_stop(handle);  			return error; @@ -5288,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  	}  	if (S_ISREG(inode->i_mode) && -	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { +	    attr->ia_valid & ATTR_SIZE && +	    (attr->ia_size < inode->i_size || +	     (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {  		handle_t *handle;  		handle = ext4_journal_start(inode, 3); @@ -5319,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  				goto err_out;  			}  		} +		/* ext4_truncate will clear the flag */ +		if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) +			ext4_truncate(inode);  	}  	rc = inode_setattr(inode, attr); @@ -5557,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode,  	entry = IFIRST(header);  	/* No extended attributes present */ -	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || -		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { +	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || +	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {  		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,  			new_extra_isize);  		EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -5602,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)  	err = ext4_reserve_inode_write(handle, inode, &iloc);  	if (ext4_handle_valid(handle) &&  	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && -	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { +	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {  		/*  		 * We need extra buffer credits since we may write into EA block  		 * with this same handle. If journal_extend fails, then it will @@ -5616,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)  						      sbi->s_want_extra_isize,  						      iloc, handle);  			if (ret) { -				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; +				ext4_set_inode_state(inode, +						     EXT4_STATE_NO_EXPAND);  				if (mnt_count !=  					le16_to_cpu(sbi->s_es->s_mnt_count)) { -					ext4_warning(inode->i_sb, __func__, +					ext4_warning(inode->i_sb,  					"Unable to expand inode %lu. Delete"  					" some EAs or run e2fsck.",  					inode->i_ino); @@ -5641,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)   * i_size has been changed by generic_commit_write() and we thus need   * to include the updated inode in the current transaction.   * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks   * are allocated to the file.   *   * If the inode is marked synchronous, we don't honour that here - doing @@ -5683,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)  			err = jbd2_journal_get_write_access(handle, iloc.bh);  			if (!err)  				err = ext4_handle_dirty_metadata(handle, -								 inode, +								 NULL,  								 iloc.bh);  			brelse(iloc.bh);  		}  |