Diffstat (limited to 'fs/fs-writeback.c')
 -rw-r--r--   fs/fs-writeback.c | 174
 1 file changed, 127 insertions, 47 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 542b02d170f8..8aaa7eec7b74 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -36,10 +36,6 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
 
-struct wb_completion {
-	atomic_t		cnt;
-};
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -61,19 +57,6 @@ struct wb_writeback_work {
 };
 
 /*
- * If one wants to wait for one or more wb_writeback_works, each work's
- * ->done should be set to a wb_completion defined using the following
- * macro.  Once all work items are issued with wb_queue_work(), the caller
- * can wait for the completion of all using wb_wait_for_completion().  Work
- * items which are waited upon aren't freed automatically on completion.
- */
-#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
-	struct wb_completion cmpl = {					\
-		.cnt		= ATOMIC_INIT(1),			\
-	}
-
-
-/*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
  * possible for the worst case time between when an inode has its
@@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb,
 	if (work->auto_free)
 		kfree(work);
 	if (done && atomic_dec_and_test(&done->cnt))
-		wake_up_all(&wb->bdi->wb_waitq);
+		wake_up_all(done->waitq);
 }
 
 static void wb_queue_work(struct bdi_writeback *wb,
@@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb,
 
 /**
  * wb_wait_for_completion - wait for completion of bdi_writeback_works
- * @bdi: bdi work items were issued to
  * @done: target wb_completion
  *
  * Wait for one or more work items issued to @bdi with their ->done field
- * set to @done, which should have been defined with
- * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
- * work items are completed.  Work items which are waited upon aren't freed
+ * set to @done, which should have been initialized with
+ * DEFINE_WB_COMPLETION().  This function returns after all such work items
+ * are completed.  Work items which are waited upon aren't freed
  * automatically on completion.
  */
-static void wb_wait_for_completion(struct backing_dev_info *bdi,
-				   struct wb_completion *done)
+void wb_wait_for_completion(struct wb_completion *done)
 {
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+	wait_event(*done->waitq, !atomic_read(&done->cnt));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-/* parameters for foreign inode detection, see wb_detach_inode() */
+/*
+ * Parameters for foreign inode detection, see wbc_detach_inode() to see
+ * how they're used.
+ *
+ * These paramters are inherently heuristical as the detection target
+ * itself is fuzzy.  All we want to do is detaching an inode from the
+ * current owner if it's being written to by some other cgroups too much.
+ *
+ * The current cgroup writeback is built on the assumption that multiple
+ * cgroups writing to the same inode concurrently is very rare and a mode
+ * of operation which isn't well supported.  As such, the goal is not
+ * taking too long when a different cgroup takes over an inode while
+ * avoiding too aggressive flip-flops from occasional foreign writes.
+ *
+ * We record, very roughly, 2s worth of IO time history and if more than
+ * half of that is foreign, trigger the switch.  The recording is quantized
+ * to 16 slots.  To avoid tiny writes from swinging the decision too much,
+ * writes smaller than 1/8 of avg size are ignored.
+ */
 #define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
 #define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
-#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
 #define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
 
 #define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
@@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 					/* if foreign slots >= 8, switch */
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
+#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
 
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
@@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
 
+	trace_inode_switch_wbs(inode, old_wb, new_wb);
+
 	/*
 	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
 	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
@@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
-	/*
-	 * Avoid starting new switches while sync_inodes_sb() is in
-	 * progress.  Otherwise, if the down_write protected issue path
-	 * blocks heavily, we might end up starting a large number of
-	 * switches which will block on the rwsem.
-	 */
-	if (!down_read_trylock(&bdi->wb_switch_rwsem))
+	/* avoid queueing a new switch if too many are already in flight */
+	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
 		return;
 
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
-		goto out_unlock;
+		return;
 
 	/* find and pin the new wb */
 	rcu_read_lock();
@@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 
 	atomic_inc(&isw_nr_in_flight);
-
-	goto out_unlock;
+	return;
 
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
-out_unlock:
-	up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
@@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc)
 		if (wbc->wb_id != max_id)
 			history |= (1U << slots) - 1;
 
+		if (history)
+			trace_inode_foreign_history(inode, wbc, history);
+
 		/*
 		 * Switch if the current wb isn't the consistent winner.
 		 * If there are multiple closely competing dirtiers, the
@@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 restart:
 	rcu_read_lock();
 	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
-		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
@@ -890,7 +887,7 @@ restart:
 		last_wb = wb;
 		rcu_read_unlock();
 
-		wb_wait_for_completion(bdi, &fallback_work_done);
+		wb_wait_for_completion(&fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
@@ -900,6 +897,89 @@ restart:
 }
 
 /**
+ * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
+ * @bdi_id: target bdi id
+ * @memcg_id: target memcg css id
+ * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
+ * @reason: reason why some writeback work initiated
+ * @done: target wb_completion
+ *
+ * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
+ * with the specified parameters.
+ */
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+			   enum wb_reason reason, struct wb_completion *done)
+{
+	struct backing_dev_info *bdi;
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+	struct wb_writeback_work *work;
+	int ret;
+
+	/* lookup bdi and memcg */
+	bdi = bdi_get_by_id(bdi_id);
+	if (!bdi)
+		return -ENOENT;
+
+	rcu_read_lock();
+	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
+	if (memcg_css && !css_tryget(memcg_css))
+		memcg_css = NULL;
+	rcu_read_unlock();
+	if (!memcg_css) {
+		ret = -ENOENT;
+		goto out_bdi_put;
+	}
+
+	/*
+	 * And find the associated wb.  If the wb isn't there already
+	 * there's nothing to flush, don't create one.
+	 */
+	wb = wb_get_lookup(bdi, memcg_css);
+	if (!wb) {
+		ret = -ENOENT;
+		goto out_css_put;
+	}
+
+	/*
+	 * If @nr is zero, the caller is attempting to write out most of
+	 * the currently dirty pages.  Let's take the current dirty page
+	 * count and inflate it by 25% which should be large enough to
+	 * flush out most dirty pages while avoiding getting livelocked by
+	 * concurrent dirtiers.
+	 */
+	if (!nr) {
+		unsigned long filepages, headroom, dirty, writeback;
+
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
+				      &writeback);
+		nr = dirty * 10 / 8;
+	}
+
+	/* issue the writeback work */
+	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+	if (work) {
+		work->nr_pages = nr;
+		work->sync_mode = WB_SYNC_NONE;
+		work->range_cyclic = 1;
+		work->reason = reason;
+		work->done = done;
+		work->auto_free = 1;
+		wb_queue_work(wb, work);
+		ret = 0;
+	} else {
+		ret = -ENOMEM;
+	}
+
+	wb_put(wb);
+out_css_put:
+	css_put(memcg_css);
+out_bdi_put:
+	bdi_put(bdi);
+	return ret;
+}
+
+/**
  * cgroup_writeback_umount - flush inode wb switches for umount
  *
  * This function is called when a super_block is about to be destroyed and
@@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb)
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
 				     enum wb_reason reason, bool skip_if_busy)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb			= sb,
 		.sync_mode		= WB_SYNC_NONE,
@@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
 		.nr_pages		= nr,
 		.reason			= reason,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
 }
 
 /**
@@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb)
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	/*
 	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb)
 	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
 	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
 	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
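A note on the completion plumbing the conversions above rely on: struct wb_completion and the DEFINE_WB_COMPLETION_ONSTACK() macro are deleted from this file, callers switch to DEFINE_WB_COMPLETION(cmpl, bdi), and wb_wait_for_completion() loses its bdi argument because the wait queue now travels inside the completion itself. The new definitions are not part of this diff (in this series they appear to move to include/linux/backing-dev-defs.h); the following is a minimal sketch reconstructed from how finish_writeback_work() and wb_wait_for_completion() use ->cnt and ->waitq above, not the authoritative header contents.

/*
 * Sketch only: plausible shape of the relocated wb_completion definitions,
 * inferred from the users in this diff.  The real copy lives outside this
 * file and may differ in detail.
 */
#include <linux/atomic.h>
#include <linux/wait.h>

struct wb_completion {
	atomic_t		cnt;	/* 1 initial ref + 1 per queued work */
	wait_queue_head_t	*waitq;	/* e.g. &bdi->wb_waitq */
};

#define WB_COMPLETION_INIT(bdi)		{				\
		.cnt	= ATOMIC_INIT(1),				\
		.waitq	= &(bdi)->wb_waitq,				\
	}

#define DEFINE_WB_COMPLETION(cmpl, bdi)					\
	struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)

The usage pattern in the converted callers is unchanged: declare the completion on the stack, point each queued work's ->done at it, and wb_wait_for_completion(&done) drops the initial reference and sleeps on *done->waitq until finish_writeback_work() has released every per-work reference.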
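The enlarged comment in front of the WB_FRN_* constants describes the foreign-writer heuristic only qualitatively; the concrete numbers fall out of the constants themselves. The stand-alone sketch below (plain userspace C, not kernel code; WB_FRN_HIST_THR_SLOTS is reconstructed from the "if foreign slots >= 8, switch" context line) just prints the derived figures: roughly 125ms of IO time per history slot, so a switch needs about 1s of foreign IO inside the ~2s window, while the new WB_FRN_TIME_CUT_DIV of 8 keeps rounds smaller than 1/8 of the average write from influencing that history at all.

/* Illustrative arithmetic only -- none of this is kernel code. */
#include <stdio.h>

#define WB_FRN_TIME_SHIFT	13			/* 1s = 2^13 time units */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s of history */
#define WB_FRN_HIST_SLOTS	16			/* 16-bit history bitmap */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)	/* switch at >= 8 slots */

int main(void)
{
	unsigned int unit = WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS;

	/* 16384 units / 16 slots = 1024 units, i.e. 1024/8192 s = 125 ms */
	printf("IO time per history slot: %u units (~%u ms)\n",
	       unit, unit * 1000 / (1 << WB_FRN_TIME_SHIFT));

	/* 8 foreign slots ~= 1s of foreign IO inside the 2s window */
	printf("foreign IO needed to trigger a switch: ~%u ms\n",
	       WB_FRN_HIST_THR_SLOTS * unit * 1000 / (1 << WB_FRN_TIME_SHIFT));
	return 0;
}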
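cgroup_writeback_by_id() is added non-static, so it is meant to be driven from outside this file; elsewhere in this series it backs flushing of cgroups that have dirtied a foreign inode. Below is a hedged caller sketch, not code from the patch: flush_foreign_wb() is a hypothetical helper, and the bdi->id field and WB_REASON_FOREIGN_FLUSH come from neighbouring patches in the series rather than from this diff.

/*
 * Hypothetical caller: best-effort flush (nr == 0) of the wb identified by
 * (bdi->id, memcg_id), waiting for the queued work with the same on-stack
 * wb_completion pattern the rest of this file uses.
 */
static int flush_foreign_wb(struct backing_dev_info *bdi, int memcg_id)
{
	DEFINE_WB_COMPLETION(done, bdi);
	int ret;

	ret = cgroup_writeback_by_id(bdi->id, memcg_id, 0,
				     WB_REASON_FOREIGN_FLUSH, &done);
	if (!ret)
		wb_wait_for_completion(&done);
	return ret;
}

Passing nr == 0 takes the best-effort branch above: the wb's current dirty count is inflated by 25% (nr = dirty * 10 / 8), which is meant to flush most of what is dirty right now without getting livelocked by concurrent dirtiers.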