From 624080eded68738daee041ad64672a9d2614754f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 6 Jun 2008 17:50:40 -0400 Subject: jbd2: If a journal checksum error is detected, propagate the error to ext4 If a journal checksum error is detected, the ext4 filesystem will call ext4_error(), and the mount will either continue, become a read-only mount, or cause a kernel panic based on the superblock flags indicating the user's preference of what to do in case of filesystem corruption being detected. Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 05e2b307161a..d147f0f90360 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -919,6 +919,9 @@ struct journal_s struct proc_dir_entry *j_proc_entry; struct transaction_stats_s j_stats; + /* Failed journal commit ID */ + unsigned int j_failed_commit; + /* * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here -- cgit From 43f83a8f9963a11a9c3f41beecc363da21ae3602 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 27 May 2008 01:37:26 -0400 Subject: Input: wm9713 - support five wire panels Signed-off-by: Mark Brown Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wm9713.c | 22 ++++++++++++++++++++++ include/linux/wm97xx.h | 1 + 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/wm9713.c b/drivers/input/touchscreen/wm9713.c index 01278bd7e65c..838458792ea0 100644 --- a/drivers/input/touchscreen/wm9713.c +++ b/drivers/input/touchscreen/wm9713.c @@ -84,6 +84,15 @@ static int delay = 4; module_param(delay, int, 0); MODULE_PARM_DESC(delay, "Set adc sample delay."); +/* + * Set five_wire = 1 to use a 5 wire touchscreen. + * + * NOTE: Five wire mode does not allow for readback of pressure. + */ +static int five_wire; +module_param(five_wire, int, 0); +MODULE_PARM_DESC(five_wire, "Set to '1' to use 5-wire touchscreen."); + /* * Set adc mask function. * @@ -162,6 +171,19 @@ static void wm9713_phy_init(struct wm97xx *wm) 64000 / rpu); } + /* Five wire panel? */ + if (five_wire) { + dig3 |= WM9713_45W; + dev_info(wm->dev, "setting 5-wire touchscreen mode."); + + if (pil) { + dev_warn(wm->dev, + "Pressure measurement not supported in 5 " + "wire mode, disabling\n"); + pil = 0; + } + } + /* touchpanel pressure */ if (pil == 2) { dig3 |= WM9712_PIL; diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h index 4d13732e9cf0..6f69968eab24 100644 --- a/include/linux/wm97xx.h +++ b/include/linux/wm97xx.h @@ -100,6 +100,7 @@ #define WM9713_ADCSEL_Y 0x0004 /* Y measurement */ #define WM9713_ADCSEL_PRES 0x0008 /* Pressure measurement */ #define WM9713_COO 0x0001 /* enable coordinate mode */ +#define WM9713_45W 0x1000 /* set for 5 wire panel */ #define WM9713_PDEN 0x0800 /* measure only when pen down */ #define WM9713_ADCSEL_MASK 0x00fe /* ADC selection mask */ #define WM9713_WAIT 0x0200 /* coordinate wait */ -- cgit From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:54:41 +0200 Subject: Added in MESSAGE notes for blktraces Allows messages to be inserted into blktrace streams. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 14 ++++++++++++++ include/linux/blktrace_api.h | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index b2cbb4e5d767..20e11f354f11 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + static char bt_msg_buf[BLK_TN_MAX_MSG]; + + va_start(args, fmt); + n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cfc3147e5cf9..b7cd8f1eedbe 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -55,6 +55,7 @@ enum blktrace_act { enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ + __BLK_TN_MESSAGE, /* Character string message */ }; @@ -79,6 +80,7 @@ enum blktrace_notify { #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) +#define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 @@ -149,7 +151,28 @@ extern void blk_trace_shutdown(struct request_queue *); extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); +/** + * blk_add_trace_msg - Add a (simple) message to the blktrace stream + * @q: queue the io is for + * @fmt: format to print message in + * args... Variable argument list for format + * + * Description: + * Records a (simple) message onto the blktrace stream. + * + * NOTE: BLK_TN_MAX_MSG characters are output at most. + * NOTE: Can not use 'static inline' due to presence of var args... + * + **/ +#define blk_add_trace_msg(q, fmt, ...) \ + do { \ + struct blk_trace *bt = (q)->blk_trace; \ + if (unlikely(bt)) \ + __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + } while (0) +#define BLK_TN_MAX_MSG 1024 /** * blk_add_trace_rq - Add a trace for a request oriented action @@ -299,6 +322,8 @@ extern int blk_trace_remove(struct request_queue *q); #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) #define blk_trace_remove(q) (-ENOTTY) +#define blk_add_trace_msg(q, fmt, ...) do { } while (0) + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ #endif -- cgit From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:45:33 +0200 Subject: block: make blktrace use per-cpu buffers for message notes Currently it uses a single static char array, but that risks being corrupted when multiple users issue message notes at the same time. Make the buffers dynamically allocated when the trace is setup and make them per-cpu instead. The default max message size of 1k is also very large, the interface is mainly for small text notes. So shrink it to 128 bytes. Signed-off-by: Jens Axboe --- block/blktrace.c | 15 ++++++++++++--- include/linux/blktrace_api.h | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 20e11f354f11..7ae87cc4a163 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; - static char bt_msg_buf[BLK_TN_MAX_MSG]; + char *buf; + preempt_disable(); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); - n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + preempt_enable(); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); + free_percpu(bt->msg_data); kfree(bt); } @@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + ret = -ENOENT; dir = blk_create_tree(buts->name); if (!dir) @@ -406,6 +414,7 @@ err: if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); + free_percpu(bt->msg_data); if (bt->rchan) relay_close(bt->rchan); kfree(bt); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index b7cd8f1eedbe..e3ef903aae88 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -121,6 +121,7 @@ struct blk_trace { int trace_state; struct rchan *rchan; unsigned long *sequence; + unsigned char *msg_data; u16 act_mask; u64 start_lba; u64 end_lba; @@ -172,7 +173,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); if (unlikely(bt)) \ __trace_note_message(bt, fmt, ##__VA_ARGS__); \ } while (0) -#define BLK_TN_MAX_MSG 1024 +#define BLK_TN_MAX_MSG 128 /** * blk_add_trace_rq - Add a trace for a request oriented action -- cgit From 6363ca57c76b7b83639ca8c83fc285fa26a7880e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 May 2008 11:28:57 +0200 Subject: revert ("sched: fair-group: SMP-nice for group scheduling") Yanmin Zhang reported: Comparing with 2.6.25, volanoMark has big regression with kernel 2.6.26-rc1. It's about 50% on my 8-core stoakley, 16-core tigerton, and Itanium Montecito. With bisect, I located the following patch: | 18d95a2832c1392a2d63227a7a6d433cb9f2037e is first bad commit | commit 18d95a2832c1392a2d63227a7a6d433cb9f2037e | Author: Peter Zijlstra | Date: Sat Apr 19 19:45:00 2008 +0200 | | sched: fair-group: SMP-nice for group scheduling Revert it so that we get v2.6.25 behavior. Bisected-by: Yanmin Zhang Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 430 ++++---------------------------------------------- kernel/sched_debug.c | 5 - kernel/sched_fair.c | 124 ++++++--------- kernel/sched_rt.c | 4 - 5 files changed, 75 insertions(+), 489 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a6176f4b..8a888499954e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -766,7 +766,6 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ - int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 3dc13f05b10e..bfb8ad8ed171 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -398,43 +398,6 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; - /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. - */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; - - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; - - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; - } aggregate; -#endif #endif }; @@ -1508,326 +1471,6 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) -{ - return &tg->cfs_rq[sd->first_cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) -{ - struct task_group *parent, *child; - - rcu_read_lock(); - parent = &root_task_group; -down: - (*down)(parent, sd); - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - (*up)(parent, sd); - - child = parent; - parent = parent->parent; - if (parent) - goto up; - rcu_read_unlock(); -} - -/* - * Calculate the aggregate runqueue weight. - */ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - unsigned long task_weight = 0; - int i; - - for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; - task_weight += tg->cfs_rq[i]->task_weight; - } - - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; -} - -/* - * Compute the weight of this group on the given cpus. - */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) - shares += tg->cfs_rq[i]->shares; - - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) - shares = tg->shares; - - aggregate(tg, sd)->shares = shares; -} - -/* - * Compute the load fraction assigned to this group, relies on the aggregate - * weight and this group's parent's load, i.e. top-down. - */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - int i; - - load = 0; - for_each_cpu_mask(i, sd->span) - load += cpu_rq(i)->load.weight; - - } else { - load = aggregate(tg->parent, sd)->load; - - /* - * shares is our weight in the parent's rq so - * shares/parent->rq_weight gives our fraction of the load - */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; - } - - aggregate(tg, sd)->load = load; -} - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) -{ - int boost = 0; - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[tcpu]) - return; - - rq_weight = tg->cfs_rq[tcpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * - */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; - - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; - - __set_se_shares(tg->se[tcpu], shares); -} - -/* - * Re-adjust the weights on the cpu the task came from and on the cpu the - * task went to. - */ -static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, - int scpu, int dcpu) -{ - unsigned long shares; - - shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - if (shares) - tg->cfs_rq[dcpu]->shares += shares; -} - -/* - * Because changing a group's shares changes the weight of the super-group - * we need to walk up the tree and change all shares until we hit the root. - */ -static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, - int scpu, int dcpu) -{ - while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); - tg = tg->parent; - } -} - -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long shares = aggregate(tg, sd)->shares; - int i; - - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); - spin_unlock_irqrestore(&rq->lock, flags); - } - - aggregate_group_shares(tg, sd); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= aggregate(tg, sd)->shares; - if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; - } -} - -/* - * Calculate the accumulative weight and recursive load of each task group - * while walking down the tree. - */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) -{ - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); -} - -/* - * Rebalance the cpu shares while walking back up the tree. - */ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) -{ - aggregate_group_set_shares(tg, sd); -} - -static DEFINE_PER_CPU(spinlock_t, aggregate_lock); - -static void __init init_aggregate(void) -{ - int i; - - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(aggregate_lock, i)); -} - -static int get_aggregate(struct sched_domain *sd) -{ - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) - return 0; - - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); - return 1; -} - -static void put_aggregate(struct sched_domain *sd) -{ - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); -} - -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ - cfs_rq->shares = shares; -} - -#else - -static inline void init_aggregate(void) -{ -} - -static inline int get_aggregate(struct sched_domain *sd) -{ - return 0; -} - -static inline void put_aggregate(struct sched_domain *sd) -{ -} -#endif - #else /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1848,14 +1491,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct rq *rq) +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_load_add(&rq->load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_load_sub(&rq->load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; + inc_load(rq, p); } -static void dec_nr_running(struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1947,7 +1602,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + inc_nr_running(p, rq); } /* @@ -1959,7 +1614,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(rq); + dec_nr_running(p, rq); } /** @@ -2612,7 +2267,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(rq); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3603,12 +3258,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; - int unlock_aggregate; cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); - /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3724,9 +3376,8 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - - goto out; + return -1; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3741,13 +3392,8 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; -out: - if (unlock_aggregate) - put_aggregate(sd); - return ld_moved; + return -1; + return 0; } /* @@ -4934,8 +4580,10 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_load(rq, p); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4945,6 +4593,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7319,7 +6968,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7330,7 +6978,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7342,7 +6989,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7354,7 +7000,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7367,7 +7012,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -8037,7 +7681,6 @@ void __init sched_init(void) } #ifdef CONFIG_SMP - init_aggregate(); init_defrootdomain(); #endif @@ -8602,11 +8245,14 @@ void sched_move_task(struct task_struct *tsk) #endif #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) +static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; int on_rq; + spin_lock_irq(&rq->lock); + on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8616,17 +8262,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); -} -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + spin_unlock_irq(&rq->lock); } static DEFINE_MUTEX(shares_mutex); @@ -8665,13 +8302,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); + for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); - } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5f06118fbc31..8bb713040ac9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); -#endif -#endif } static void print_cpu(struct seq_file *m, int cpu) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0eb0ae879542..f0f25fc12d0a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -540,10 +523,6 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1327,90 +1306,75 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -static unsigned long -__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, - struct cfs_rq *cfs_rq) +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) { - struct rq_iterator cfs_rq_iterator; + struct sched_entity *curr; + struct task_struct *p; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - cfs_rq_iterator.arg = cfs_rq; + if (!cfs_rq->nr_running || !first_fair(cfs_rq)) + return MAX_PRIO; + + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &cfs_rq_iterator); + p = task_of(curr); + + return p->prio; } +#endif -#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { + struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; - - rcu_read_lock(); - list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; - - /* - * empty group - */ - if (!aggregate(tg, sd)->task_weight) - continue; - - rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; - - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; + struct rq_iterator cfs_rq_iterator; - imbalance = (busiest_weight - this_weight) / 2; + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; - if (imbalance < 0) - imbalance = busiest_weight; + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { +#ifdef CONFIG_FAIR_GROUP_SCHED + struct cfs_rq *this_cfs_rq; + long imbalance; + unsigned long maxload; - max_load = max(rem_load, imbalance); - moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, - tg->cfs_rq[busiest_cpu]); + this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - if (!moved_load) + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; + /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ + if (imbalance <= 0) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + /* Don't pull more than imbalance/2 */ + imbalance /= 2; + maxload = min(rem_load_move, imbalance); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); +#else +# define maxload rem_load_move +#endif + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + maxload, sd, idle, all_pinned, + this_best_prio, + &cfs_rq_iterator); - rem_load_move -= moved_load; - if (rem_load_move < 0) + if (rem_load_move <= 0) break; } - rcu_read_unlock(); return max_load_move - rem_load_move; } -#else -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return __load_balance_fair(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); -} -#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 060e87b0cb1c..3432d573205d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } - - dec_cpu_load(rq, p->se.load.weight); } /* -- cgit From ea3f01f8afd3bc5daff915cc4ea5cc5ea9e7d427 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 May 2008 14:32:23 +0200 Subject: sched: re-tune NUMA topologies improve the sysbench ramp-up phase and its peak throughput on a 16way NUMA box, by turning on WAKE_AFFINE: tip/sched tip/sched+wake-affine ------------------------------------------------- 1: 700 830 +15.65% 2: 1465 1391 -5.28% 4: 3017 3105 +2.81% 8: 5100 6021 +15.30% 16: 10725 10745 +0.19% 32: 10135 10150 +0.16% 64: 9338 9240 -1.06% 128: 8599 8252 -4.21% 256: 8475 8144 -4.07% ------------------------------------------------- SUM: 57558 57882 +0.56% this change also improves lat_ctx from 6.69 usecs to 1.11 usec: $ ./lat_ctx -s 0 2 "size=0k ovr=1.19 2 1.11 $ ./lat_ctx -s 0 2 "size=0k ovr=1.22 2 6.69 in sysbench it's an overall win with some weakness at the lots-of-clients side. That happens because we now under-balance this workload a bit. To counter that effect, turn on NEWIDLE: wake-idle wake-idle+newidle ------------------------------------------------- 1: 830 834 +0.43% 2: 1391 1401 +0.65% 4: 3105 3091 -0.43% 8: 6021 6046 +0.42% 16: 10745 10736 -0.08% 32: 10150 10206 +0.55% 64: 9240 9533 +3.08% 128: 8252 8355 +1.24% 256: 8144 8384 +2.87% ------------------------------------------------- SUM: 57882 58591 +1.21% as a bonus this not only improves the many-clients case but also improves the (more important) rampup phase. sysbench is a workload that quickly breaks down if the scheduler over-balances, so since it showed an improvement under NEWIDLE this change is definitely good. --- include/linux/topology.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 4bb7074a2c3a..24f3d2282e11 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -166,7 +166,9 @@ void arch_update_cpu_topology(void); .busy_idx = 3, \ .idle_idx = 3, \ .flags = SD_LOAD_BALANCE \ - | SD_SERIALIZE, \ + | SD_BALANCE_NEWIDLE \ + | SD_WAKE_AFFINE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ } -- cgit From 413c239fad68258157f903b3ffd9bfcc53f5e34b Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 30 May 2008 10:16:40 +1000 Subject: driver-core: prepare for 2.6.27 api change by adding dev_set_name Create the dev_set_name function now so that various subsystems can start changing over to it before other changes in 2.6.27 will make it compulsory. Cc: Kay Sievers Signed-off-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 15 +++++++++++++++ include/linux/device.h | 3 +++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 72eccae4904b..422cfcad486d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -759,6 +759,21 @@ static void device_remove_class_symlinks(struct device *dev) sysfs_remove_link(&dev->kobj, "subsystem"); } +/** + * dev_set_name - set a device name + * @dev: device + */ +int dev_set_name(struct device *dev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + vsnprintf(dev->bus_id, sizeof(dev->bus_id), fmt, vargs); + va_end(vargs); + return 0; +} +EXPORT_SYMBOL_GPL(dev_set_name); + /** * device_add - add device to device hierarchy. * @dev: device. diff --git a/include/linux/device.h b/include/linux/device.h index 14616e80213c..6a2d04c011bc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -385,6 +385,9 @@ static inline const char *dev_name(struct device *dev) return dev->bus_id; } +extern int dev_set_name(struct device *dev, const char *name, ...) + __attribute__((format(printf, 2, 3))); + #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { -- cgit From 3ef536095446552823fc488fec1c5451aab1260d Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 16 May 2008 11:17:03 +0200 Subject: virtio_blk: allow read-only disks Hello Rusty, sometimes it is useful to share a disk (e.g. usr). To avoid file system corruption, the disk should be mounted read-only in that case. This patch adds a new feature flag, that allows the host to specify, if the disk should be considered read-only. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 6 +++++- include/linux/virtio_blk.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c4804f3465db..dd7ea203f940 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -260,6 +260,10 @@ static int virtblk_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); + /* If disk is read-only in the host, the guest should obey */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) + set_disk_ro(vblk->disk, 1); + /* Host must always specify the capacity. */ vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), &cap, sizeof(cap)); @@ -326,7 +330,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, - VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, }; static struct virtio_driver virtio_blk = { diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index d4695a3356d0..b80919fad0ef 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -10,6 +10,7 @@ #define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ #define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ #define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */ +#define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ struct virtio_blk_config { -- cgit From f7f510ec195781c857ab76366a3e1c59e1caae42 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 30 May 2008 15:09:44 -0500 Subject: virtio: An entropy device, as suggested by hpa. Note that by itself, having a "hardware" random generator does very little: you should probably run "rngd" in your guest to feed this into the kernel entropy pool. Included: virtio_rng: dont use vmalloced addresses for virtio If virtio_rng is build as a module, random_data is an address in vmalloc space. As virtio expects guest real addresses, this can cause any kind of funny behaviour, so lets allocate random_data dynamically with kmalloc. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- drivers/char/hw_random/Kconfig | 9 +++ drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/virtio-rng.c | 155 ++++++++++++++++++++++++++++++++++++ include/linux/virtio_rng.h | 8 ++ 4 files changed, 173 insertions(+) create mode 100644 drivers/char/hw_random/virtio-rng.c create mode 100644 include/linux/virtio_rng.h (limited to 'include/linux') diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 8d6c2089d2a8..efd0b4db7c8e 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -112,3 +112,12 @@ config HW_RANDOM_PASEMI If unsure, say Y. +config HW_RANDOM_VIRTIO + tristate "VirtIO Random Number Generator support" + depends on HW_RANDOM && VIRTIO + ---help--- + This driver provides kernel-side support for the virtual Random Number + Generator hardware. + + To compile this driver as a module, choose M here: the + module will be called virtio-rng. If unsure, say N. diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index c8b7300e2fb1..b4940ddbb35f 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_HW_RANDOM_VIA) += via-rng.o obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o obj-$(CONFIG_HW_RANDOM_PASEMI) += pasemi-rng.o +obj-$(CONFIG_HW_RANDOM_VIRTIO) += virtio-rng.o diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c new file mode 100644 index 000000000000..d0e563e4fc39 --- /dev/null +++ b/drivers/char/hw_random/virtio-rng.c @@ -0,0 +1,155 @@ +/* + * Randomness driver for virtio + * Copyright (C) 2007, 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include + +/* The host will fill any buffer we give it with sweet, sweet randomness. We + * give it 64 bytes at a time, and the hwrng framework takes it 4 bytes at a + * time. */ +#define RANDOM_DATA_SIZE 64 + +static struct virtqueue *vq; +static u32 *random_data; +static unsigned int data_left; +static DECLARE_COMPLETION(have_data); + +static void random_recv_done(struct virtqueue *vq) +{ + int len; + + /* We never get spurious callbacks. */ + if (!vq->vq_ops->get_buf(vq, &len)) + BUG(); + + data_left = len / sizeof(random_data[0]); + complete(&have_data); +} + +static void register_buffer(void) +{ + struct scatterlist sg; + + sg_init_one(&sg, random_data, RANDOM_DATA_SIZE); + /* There should always be room for one buffer. */ + if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) != 0) + BUG(); + vq->vq_ops->kick(vq); +} + +/* At least we don't udelay() in a loop like some other drivers. */ +static int virtio_data_present(struct hwrng *rng, int wait) +{ + if (data_left) + return 1; + + if (!wait) + return 0; + + wait_for_completion(&have_data); + return 1; +} + +/* virtio_data_present() must have succeeded before this is called. */ +static int virtio_data_read(struct hwrng *rng, u32 *data) +{ + BUG_ON(!data_left); + + *data = random_data[--data_left]; + + if (!data_left) { + init_completion(&have_data); + register_buffer(); + } + return sizeof(*data); +} + +static struct hwrng virtio_hwrng = { + .name = "virtio", + .data_present = virtio_data_present, + .data_read = virtio_data_read, +}; + +static int virtrng_probe(struct virtio_device *vdev) +{ + int err; + + /* We expect a single virtqueue. */ + vq = vdev->config->find_vq(vdev, 0, random_recv_done); + if (IS_ERR(vq)) + return PTR_ERR(vq); + + err = hwrng_register(&virtio_hwrng); + if (err) { + vdev->config->del_vq(vq); + return err; + } + + register_buffer(); + return 0; +} + +static void virtrng_remove(struct virtio_device *vdev) +{ + vdev->config->reset(vdev); + hwrng_unregister(&virtio_hwrng); + vdev->config->del_vq(vq); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_RNG, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_rng = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtrng_probe, + .remove = __devexit_p(virtrng_remove), +}; + +static int __init init(void) +{ + int err; + + random_data = kmalloc(RANDOM_DATA_SIZE, GFP_KERNEL); + if (!random_data) + return -ENOMEM; + + err = register_virtio_driver(&virtio_rng); + if (err) + kfree(random_data); + return err; +} + +static void __exit fini(void) +{ + kfree(random_data); + unregister_virtio_driver(&virtio_rng); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio random number driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio_rng.h b/include/linux/virtio_rng.h new file mode 100644 index 000000000000..331afb6c9f62 --- /dev/null +++ b/include/linux/virtio_rng.h @@ -0,0 +1,8 @@ +#ifndef _LINUX_VIRTIO_RNG_H +#define _LINUX_VIRTIO_RNG_H +#include + +/* The ID for virtio_rng */ +#define VIRTIO_ID_RNG 4 + +#endif /* _LINUX_VIRTIO_RNG_H */ -- cgit From 7f31fe05000af54e1af81f65a96cab90db8d7ed8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 29 May 2008 11:08:01 +0200 Subject: virtio_config: fix len calculation of config elements Rusty, This patch is a prereq for the virtio_blk blocksize patch, please apply it first. Adding an u32 value to the virtio_blk_config unconvered a small bug the config space defintions: v is a pointer, to we have to use sizeof(*v) instead of sizeof(v). Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 50db245c81ad..71d6c102497e 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -99,7 +99,7 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, * The return value is -ENOENT if the feature doesn't exist. Otherwise * the config value is copied into whatever is pointed to by v. */ #define virtio_config_val(vdev, fbit, offset, v) \ - virtio_config_buf((vdev), (fbit), (offset), (v), sizeof(v)) + virtio_config_buf((vdev), (fbit), (offset), (v), sizeof(*v)) static inline int virtio_config_buf(struct virtio_device *vdev, unsigned int fbit, -- cgit From 7757f09c70af87887dfc195e6d6ddd54f5cc7c39 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 29 May 2008 11:10:01 +0200 Subject: virtio_blk: fix endianess annotations Since commit 72e61eb40b55dd57031ec5971e810649f82b0259 (virtio: change config to guest endian) config space is no longer fixed endian. Lets change the virtio_blk_config variables. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- include/linux/virtio_blk.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index b80919fad0ef..5f79a5f9de79 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -15,14 +15,14 @@ struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ - __le64 capacity; + __u64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ - __le32 size_max; + __u32 size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ - __le32 seg_max; + __u32 seg_max; /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { - __le16 cylinders; + __u16 cylinders; __u8 heads; __u8 sectors; } geometry; -- cgit From b4f68be6c5d507afdcd74f5be3df0b1209cda503 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 30 May 2008 15:09:45 -0500 Subject: virtio: force callback on empty. virtio allows drivers to suppress callbacks (ie. interrupts) for efficiency (no locking, it's just an optimization). There's a similar mechanism for the host to suppress notifications coming from the guest: in that case, we ignore the suppression if the ring is completely full. It turns out that life is simpler if the host similarly ignores callback suppression when the ring is completely empty: the network driver wants to free up old packets in a timely manner, and otherwise has to use a timer to poll. We have to remove the code which ignores interrupts when the driver has disabled them (again, it had no locking and hence was unreliable anyway). Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 7 ------- include/linux/virtio_config.h | 4 ++++ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 96d2567a7df8..72bf8bc09014 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -253,13 +253,6 @@ irqreturn_t vring_interrupt(int irq, void *_vq) if (unlikely(vq->broken)) return IRQ_HANDLED; - /* Other side may have missed us turning off the interrupt, - * but we should preserve disable semantic for virtio users. */ - if (unlikely(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { - pr_debug("virtqueue interrupt after disable for %p\n", vq); - return IRQ_HANDLED; - } - pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 71d6c102497e..f364bbf63c34 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -15,6 +15,10 @@ /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 +/* Do we get callbacks when the ring is completely used, even if we've + * suppressed them? */ +#define VIRTIO_F_NOTIFY_ON_EMPTY 24 + #ifdef __KERNEL__ #include -- cgit From 5adad0133907790c50283bf03271d920d6897043 Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Fri, 30 May 2008 10:40:46 -0400 Subject: Input: rename SW_RADIO to SW_RFKILL_ALL The SW_RADIO code for EV_SW events has a name that is not descriptive enough of its intended function, and could induce someone to think KEY_RADIO is its EV_KEY counterpart, which is false. Rename it to SW_RFKILL_ALL, and document what this event is for. Keep the old name around, to avoid userspace ABI breaks. The SW_RFKILL_ALL event is meant to be used by rfkill master switches. It is not bound to a particular radio switch type, and usually applies to all types. It is semantically tied to master rfkill switches that enable or disable every radio in a system. Signed-off-by: Henrique de Moraes Holschuh Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 28a094fcfe20..e075c4b762fb 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -637,7 +637,9 @@ struct input_absinfo { #define SW_LID 0x00 /* set = lid shut */ #define SW_TABLET_MODE 0x01 /* set = tablet mode */ #define SW_HEADPHONE_INSERT 0x02 /* set = inserted */ -#define SW_RADIO 0x03 /* set = radio enabled */ +#define SW_RFKILL_ALL 0x03 /* rfkill master switch, type "any" + set = radio enabled */ +#define SW_RADIO SW_RFKILL_ALL /* deprecated */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) -- cgit From ca05a99a54db1db5bca72eccb5866d2a86f8517f Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Tue, 27 May 2008 22:05:17 -0700 Subject: capabilities: remain source compatible with 32-bit raw legacy capability support. Source code out there hard-codes a notion of what the _LINUX_CAPABILITY_VERSION #define means in terms of the semantics of the raw capability system calls capget() and capset(). Its unfortunate, but true. Since the confusing header file has been in a released kernel, there is software that is erroneously using 64-bit capabilities with the semantics of 32-bit compatibilities. These recently compiled programs may suffer corruption of their memory when sys_getcap() overwrites more memory than they are coded to expect, and the raising of added capabilities when using sys_capset(). As such, this patch does a number of things to clean up the situation for all. It 1. forces the _LINUX_CAPABILITY_VERSION define to always retain its legacy value. 2. adopts a new #define strategy for the kernel's internal implementation of the preferred magic. 3. deprecates v2 capability magic in favor of a new (v3) magic number. The functionality of v3 is entirely equivalent to v2, the only difference being that the v2 magic causes the kernel to log a "deprecated" warning so the admin can find applications that may be using v2 inappropriately. [User space code continues to be encouraged to use the libcap API which protects the application from details like this. libcap-2.10 is the first to support v3 capabilities.] Fixes issue reported in https://bugzilla.redhat.com/show_bug.cgi?id=447518. Thanks to Bojan Smojver for the report. [akpm@linux-foundation.org: s/depreciate/deprecate/g] [akpm@linux-foundation.org: be robust about put_user size] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. Morgan Cc: Serge E. Hallyn Cc: Bojan Smojver Cc: stable@kernel.org Signed-off-by: Andrew Morton Signed-off-by: Chris Wright --- fs/proc/array.c | 2 +- include/linux/capability.h | 29 ++++++++---- kernel/capability.c | 111 +++++++++++++++++++++++++++++---------------- 3 files changed, 95 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9e3b8c33c24b..797d775e0354 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_printf(m, "%s", header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } seq_printf(m, "\n"); } diff --git a/include/linux/capability.h b/include/linux/capability.h index f4ea0dd9a618..fa830f8de032 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -31,11 +31,11 @@ struct task_struct; #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_U32S_1 1 -#define _LINUX_CAPABILITY_VERSION_2 0x20071026 +#define _LINUX_CAPABILITY_VERSION_2 0x20071026 /* deprecated - use v3 */ #define _LINUX_CAPABILITY_U32S_2 2 -#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_2 -#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_2 +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 typedef struct __user_cap_header_struct { __u32 version; @@ -77,10 +77,23 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; -#ifdef __KERNEL__ +#ifndef __KERNEL__ + +/* + * Backwardly compatible definition for source code - trapped in a + * 32-bit world. If you find you need this, please consider using + * libcap to untrap yourself... + */ +#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_1 +#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_1 + +#else + +#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 +#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 typedef struct kernel_cap_struct { - __u32 cap[_LINUX_CAPABILITY_U32S]; + __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) @@ -351,7 +364,7 @@ typedef struct kernel_cap_struct { */ #define CAP_FOR_EACH_U32(__capi) \ - for (__capi = 0; __capi < _LINUX_CAPABILITY_U32S; ++__capi) + for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ @@ -361,7 +374,7 @@ typedef struct kernel_cap_struct { # define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) -#if _LINUX_CAPABILITY_U32S != 2 +#if _KERNEL_CAPABILITY_U32S != 2 # error Fix up hand-coded capability macro initializers #else /* HAND-CODED capability initializers */ @@ -372,7 +385,7 @@ typedef struct kernel_cap_struct { # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ CAP_FS_MASK_B1 } }) -#endif /* _LINUX_CAPABILITY_U32S != 2 */ +#endif /* _KERNEL_CAPABILITY_U32S != 2 */ #define CAP_INIT_INH_SET CAP_EMPTY_SET diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41ea..cfbe44299488 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -52,6 +52,69 @@ static void warn_legacy_capability_use(void) } } +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + /* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is @@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +167,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +177,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; -- cgit From 63e14626eddb534ab429e9c2b95d3f7038b596b6 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sun, 1 Jun 2008 11:49:32 +0200 Subject: mmc_spi: mmc_spi.h should include linux/interrupts.h Since mmc_spi.h uses irqreturn_t type, it should include appropriate header, otherwise build will break if users didn't include it (some of them do not use interrupts). Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman Signed-off-by: Linus Torvalds --- include/linux/spi/mmc_spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/mmc_spi.h b/include/linux/spi/mmc_spi.h index e9bbe3ebd721..d5ca78b93a3b 100644 --- a/include/linux/spi/mmc_spi.h +++ b/include/linux/spi/mmc_spi.h @@ -1,6 +1,8 @@ #ifndef __LINUX_SPI_MMC_SPI_H #define __LINUX_SPI_MMC_SPI_H +#include + struct device; struct mmc_host; -- cgit From 64e9159f5d2c4edf5fa6425031e556f8fddaf7e6 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 3 Jun 2008 15:18:54 +0100 Subject: serial_core: uart_set_ldisc infrastructure The tty layer provides a callback that is used when the line discipline is changed. Some hardware uses this to configure hardware specific features such as IrDA mode on serial ports. Unfortunately the serial layer does not provide this feature or pass it down to drivers. Blackfin used to hack around this by rewriting the tty ops, but those are now properly shared and const so the hack fails. Instead provide the proper operations. This change plus a follow up from the Blackfin guys is needed to avoid blackfin losing features in this release. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/serial_core.c | 10 ++++++++++ include/linux/serial_core.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 53b03c629aff..951a75ea6e3e 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -1165,6 +1165,15 @@ out: return ret; } +static void uart_set_ldisc(struct tty_struct *tty, int ldisc) +{ + struct uart_state *state = tty->driver_data; + struct uart_port *port = state->port; + + if (port->ops->set_ldisc) + port->ops->set_ldisc(port); +} + static void uart_set_termios(struct tty_struct *tty, struct ktermios *old_termios) { @@ -2288,6 +2297,7 @@ static const struct tty_operations uart_ops = { .unthrottle = uart_unthrottle, .send_xchar = uart_send_xchar, .set_termios = uart_set_termios, + .set_ldisc = uart_set_ldisc, .stop = uart_stop, .start = uart_start, .hangup = uart_hangup, diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d32123ae08ad..d8f31de632c5 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -192,6 +192,7 @@ struct uart_ops { void (*shutdown)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); + void (*set_ldisc)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); int (*set_wake)(struct uart_port *, unsigned int state); -- cgit From 51b77cae0d5aa8e1546fca855dcfe48ddfadfa9c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:01 -0700 Subject: route: Mark unused route cache flags as such. Also removes an obsolete check for the unused flag RTCF_MASQ. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/in_route.h | 12 ++++++------ net/ipv4/route.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/in_route.h b/include/linux/in_route.h index 61f25c30a2a0..b261b8c915f0 100644 --- a/include/linux/in_route.h +++ b/include/linux/in_route.h @@ -10,19 +10,19 @@ #define RTCF_NOPMTUDISC RTM_F_NOPMTUDISC #define RTCF_NOTIFY 0x00010000 -#define RTCF_DIRECTDST 0x00020000 +#define RTCF_DIRECTDST 0x00020000 /* unused */ #define RTCF_REDIRECTED 0x00040000 -#define RTCF_TPROXY 0x00080000 +#define RTCF_TPROXY 0x00080000 /* unused */ -#define RTCF_FAST 0x00200000 -#define RTCF_MASQ 0x00400000 -#define RTCF_SNAT 0x00800000 +#define RTCF_FAST 0x00200000 /* unused */ +#define RTCF_MASQ 0x00400000 /* unused */ +#define RTCF_SNAT 0x00800000 /* unused */ #define RTCF_DOREDIRECT 0x01000000 #define RTCF_DIRECTSRC 0x04000000 #define RTCF_DNAT 0x08000000 #define RTCF_BROADCAST 0x10000000 #define RTCF_MULTICAST 0x20000000 -#define RTCF_REJECT 0x40000000 +#define RTCF_REJECT 0x40000000 /* unused */ #define RTCF_LOCAL 0x80000000 #define RTCF_NAT (RTCF_DNAT|RTCF_SNAT) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df41026b60db..96be336064fb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && + if (out_dev == in_dev && err && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; -- cgit From 1f9d11c7c99da706e33646c3a9080dd5a8ef9a0b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:27 -0700 Subject: route: Mark unused routing attributes as such Also removes an unused policy entry for an attribute which is only used in kernel->user direction. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 4 ++-- net/ipv4/fib_frontend.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 44c81c744538..a2aec2c0cfb5 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -267,10 +267,10 @@ enum rtattr_type_t RTA_PREFSRC, RTA_METRICS, RTA_MULTIPATH, - RTA_PROTOINFO, + RTA_PROTOINFO, /* no longer used */ RTA_FLOW, RTA_CACHEINFO, - RTA_SESSION, + RTA_SESSION, /* no longer used */ RTA_MP_ALGO, /* no longer used */ RTA_TABLE, __RTA_MAX diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0f1557a4ac7a..0b2ac6a3d903 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, - [RTA_PROTOINFO] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, }; -- cgit From ab32cd793dca21eec846a8204390d9594ed994d5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:37:33 -0700 Subject: route: Remove unused ifa_anycast field The field was supposed to allow the creation of an anycast route by assigning an anycast address to an address prefix. It was never implemented so this field is unused and serves no purpose. Remove it. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 - net/ipv4/devinet.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 7009b0cdd06f..c6f51ad52d5b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -117,7 +117,6 @@ struct in_ifaddr __be32 ifa_address; __be32 ifa_mask; __be32 ifa_broadcast; - __be32 ifa_anycast; unsigned char ifa_scope; unsigned char ifa_flags; unsigned char ifa_prefixlen; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6848e4760f34..79a7ef6209ff 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, - [IFA_ANYCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; @@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); - if (tb[IFA_ANYCAST]) - ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]); - if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else @@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; - ifa->ifa_anycast = 0; ifa->ifa_scope = 0; } @@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(4) /* IFA_ANYCAST */ + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ } @@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, if (ifa->ifa_broadcast) NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - if (ifa->ifa_anycast) - NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast); - if (ifa->ifa_label[0]) NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); -- cgit From 4f0ebe3cc57f18ba26317b56b80b108c2848b1de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:50 +0900 Subject: libata: kill unused constants Kill a few unused constants. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 4a92fbafce9d..93e2b89d0c57 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -111,13 +111,10 @@ enum { /* various global constants */ LIBATA_MAX_PRD = ATA_MAX_PRD / 2, LIBATA_DUMB_MAX_PRD = ATA_MAX_PRD / 4, /* Worst case */ - ATA_MAX_PORTS = 8, ATA_DEF_QUEUE = 1, /* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */ ATA_MAX_QUEUE = 32, ATA_TAG_INTERNAL = ATA_MAX_QUEUE - 1, - ATA_MAX_BUS = 2, - ATA_DEF_BUSY_WAIT = 10000, ATA_SHORT_PAUSE = (HZ >> 6) + 1, ATAPI_MAX_DRAIN = 16 << 10, -- cgit From a57c1bade5a0ee5cd8b74502db9cbebb7f5780b2 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 29 May 2008 22:10:58 +0100 Subject: libata-sff: Fix oops reported in kerneloops.org for pnp devices with no ctl - Make ata_sff_altstatus private so nobody uses it by mistake - Drop the 400nS delay from it Add ata_sff_irq_status - encapsulates the IRQ check logic This function keeps the existing behaviour for altstatus using devices. I actually suspect the logic was wrong before the changes but -rc isn't the time to play with that ata_sff_sync - ensure writes hit the device Really we want an io* operation for 'is posted' eg ioisposted(ioaddr) so that we can fix the nasty delay this causes on most systems. - ata_sff_pause - 400nS delay Ensure the command hit the device and delay 400nS - ata_sff_dma_pause Ensure the I/O hit the device and enforce an HDMA1:0 transition delay. Requires altstatus register exists, BUG if not so we don't risk corruption in MWDMA modes. (UDMA the checksum will save your backside in theory) The only other complication then is devices with their own handlers. rb532 can use dma_pause but scc needs to access its own altstatus register for internal errata workarounds so directly call the drivers own altstatus function. Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/libata-sff.c | 115 ++++++++++++++++++++++++++++++++++++++------ drivers/ata/pata_icside.c | 2 +- drivers/ata/pata_rb532_cf.c | 4 +- drivers/ata/pata_scc.c | 5 +- include/linux/libata.h | 16 +----- 5 files changed, 109 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 3c2d2289f85e..90d20c615ef5 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -247,7 +247,7 @@ u8 ata_sff_check_status(struct ata_port *ap) * LOCKING: * Inherited from caller. */ -u8 ata_sff_altstatus(struct ata_port *ap) +static u8 ata_sff_altstatus(struct ata_port *ap) { if (ap->ops->sff_check_altstatus) return ap->ops->sff_check_altstatus(ap); @@ -255,6 +255,93 @@ u8 ata_sff_altstatus(struct ata_port *ap) return ioread8(ap->ioaddr.altstatus_addr); } +/** + * ata_sff_irq_status - Check if the device is busy + * @ap: port where the device is + * + * Determine if the port is currently busy. Uses altstatus + * if available in order to avoid clearing shared IRQ status + * when finding an IRQ source. Non ctl capable devices don't + * share interrupt lines fortunately for us. + * + * LOCKING: + * Inherited from caller. + */ +static u8 ata_sff_irq_status(struct ata_port *ap) +{ + u8 status; + + if (ap->ops->sff_check_altstatus || ap->ioaddr.altstatus_addr) { + status = ata_sff_altstatus(ap); + /* Not us: We are busy */ + if (status & ATA_BUSY) + return status; + } + /* Clear INTRQ latch */ + status = ata_sff_check_status(ap); + return status; +} + +/** + * ata_sff_sync - Flush writes + * @ap: Port to wait for. + * + * CAUTION: + * If we have an mmio device with no ctl and no altstatus + * method this will fail. No such devices are known to exist. + * + * LOCKING: + * Inherited from caller. + */ + +static void ata_sff_sync(struct ata_port *ap) +{ + if (ap->ops->sff_check_altstatus) + ap->ops->sff_check_altstatus(ap); + else if (ap->ioaddr.altstatus_addr) + ioread8(ap->ioaddr.altstatus_addr); +} + +/** + * ata_sff_pause - Flush writes and wait 400nS + * @ap: Port to pause for. + * + * CAUTION: + * If we have an mmio device with no ctl and no altstatus + * method this will fail. No such devices are known to exist. + * + * LOCKING: + * Inherited from caller. + */ + +void ata_sff_pause(struct ata_port *ap) +{ + ata_sff_sync(ap); + ndelay(400); +} + +/** + * ata_sff_dma_pause - Pause before commencing DMA + * @ap: Port to pause for. + * + * Perform I/O fencing and ensure sufficient cycle delays occur + * for the HDMA1:0 transition + */ + +void ata_sff_dma_pause(struct ata_port *ap) +{ + if (ap->ops->sff_check_altstatus || ap->ioaddr.altstatus_addr) { + /* An altstatus read will cause the needed delay without + messing up the IRQ status */ + ata_sff_altstatus(ap); + return; + } + /* There are no DMA controllers without ctl. BUG here to ensure + we never violate the HDMA1:0 transition timing and risk + corruption. */ + BUG(); +} + /** * ata_sff_busy_sleep - sleep until BSY clears, or timeout * @ap: port containing status register to be polled @@ -742,7 +829,7 @@ static void ata_pio_sectors(struct ata_queued_cmd *qc) } else ata_pio_sector(qc); - ata_sff_altstatus(qc->ap); /* flush */ + ata_sff_sync(qc->ap); /* flush */ } /** @@ -763,8 +850,9 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc) WARN_ON(qc->dev->cdb_len < 12); ap->ops->sff_data_xfer(qc->dev, qc->cdb, qc->dev->cdb_len, 1); - ata_sff_altstatus(ap); /* flush */ - + ata_sff_sync(ap); + /* FIXME: If the CDB is for DMA do we need to do the transition delay + or is bmdma_start guaranteed to do it ? */ switch (qc->tf.protocol) { case ATAPI_PROT_PIO: ap->hsm_task_state = HSM_ST; @@ -905,7 +993,7 @@ static void atapi_pio_bytes(struct ata_queued_cmd *qc) if (unlikely(__atapi_pio_bytes(qc, bytes))) goto err_out; - ata_sff_altstatus(ap); /* flush */ + ata_sff_sync(ap); /* flush */ return; @@ -1489,14 +1577,10 @@ inline unsigned int ata_sff_host_intr(struct ata_port *ap, goto idle_irq; } - /* check altstatus */ - status = ata_sff_altstatus(ap); - if (status & ATA_BUSY) - goto idle_irq; - /* check main status, clearing INTRQ */ - status = ap->ops->sff_check_status(ap); - if (unlikely(status & ATA_BUSY)) + /* check main status, clearing INTRQ if needed */ + status = ata_sff_irq_status(ap); + if (status & ATA_BUSY) goto idle_irq; /* ack bmdma irq events */ @@ -2030,7 +2114,7 @@ void ata_sff_error_handler(struct ata_port *ap) ap->ops->bmdma_stop(qc); } - ata_sff_altstatus(ap); + ata_sff_sync(ap); /* FIXME: We don't need this */ ap->ops->sff_check_status(ap); ap->ops->sff_irq_clear(ap); @@ -2203,7 +2287,7 @@ void ata_bmdma_stop(struct ata_queued_cmd *qc) mmio + ATA_DMA_CMD); /* one-PIO-cycle guaranteed wait, per spec, for HDMA1:0 transition */ - ata_sff_altstatus(ap); /* dummy read */ + ata_sff_dma_pause(ap); } /** @@ -2722,7 +2806,8 @@ EXPORT_SYMBOL_GPL(ata_sff_qc_prep); EXPORT_SYMBOL_GPL(ata_sff_dumb_qc_prep); EXPORT_SYMBOL_GPL(ata_sff_dev_select); EXPORT_SYMBOL_GPL(ata_sff_check_status); -EXPORT_SYMBOL_GPL(ata_sff_altstatus); +EXPORT_SYMBOL_GPL(ata_sff_dma_pause); +EXPORT_SYMBOL_GPL(ata_sff_pause); EXPORT_SYMBOL_GPL(ata_sff_busy_sleep); EXPORT_SYMBOL_GPL(ata_sff_wait_ready); EXPORT_SYMBOL_GPL(ata_sff_tf_load); diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c index 17138436423d..cf9e9848f8b5 100644 --- a/drivers/ata/pata_icside.c +++ b/drivers/ata/pata_icside.c @@ -270,7 +270,7 @@ static void pata_icside_bmdma_stop(struct ata_queued_cmd *qc) disable_dma(state->dma); /* see ata_bmdma_stop */ - ata_sff_altstatus(ap); + ata_sff_dma_pause(ap); } static u8 pata_icside_bmdma_status(struct ata_port *ap) diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c index a108d259f19d..f8b3ffc8ae9e 100644 --- a/drivers/ata/pata_rb532_cf.c +++ b/drivers/ata/pata_rb532_cf.c @@ -57,7 +57,9 @@ static inline void rb532_pata_finish_io(struct ata_port *ap) struct ata_host *ah = ap->host; struct rb532_cf_info *info = ah->private_data; - ata_sff_altstatus(ap); + /* FIXME: Keep previous delay. If this is merely a fence then + ata_sff_sync might be sufficient. */ + ata_sff_dma_pause(ap); ndelay(RB500_CF_IO_DELAY); set_irq_type(info->irq, IRQ_TYPE_LEVEL_HIGH); diff --git a/drivers/ata/pata_scc.c b/drivers/ata/pata_scc.c index e965b251ca24..bbf5aa345e68 100644 --- a/drivers/ata/pata_scc.c +++ b/drivers/ata/pata_scc.c @@ -726,7 +726,7 @@ static void scc_bmdma_stop (struct ata_queued_cmd *qc) in_be32(bmid_base + SCC_DMA_CMD) & ~ATA_DMA_START); /* one-PIO-cycle guaranteed wait, per spec, for HDMA1:0 transition */ - ata_sff_altstatus(ap); /* dummy read */ + ata_sff_dma_pause(ap); /* dummy read */ } /** @@ -747,7 +747,8 @@ static u8 scc_bmdma_status (struct ata_port *ap) return host_stat; /* errata A252,A308 workaround: Step4 */ - if ((ata_sff_altstatus(ap) & ATA_ERR) && (int_status & INTSTS_INTRQ)) + if ((scc_check_altstatus(ap) & ATA_ERR) + && (int_status & INTSTS_INTRQ)) return (host_stat | ATA_DMA_INTR); /* errata A308 workaround Step5 */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 93e2b89d0c57..e57e5d08312d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1432,7 +1432,8 @@ extern void ata_sff_qc_prep(struct ata_queued_cmd *qc); extern void ata_sff_dumb_qc_prep(struct ata_queued_cmd *qc); extern void ata_sff_dev_select(struct ata_port *ap, unsigned int device); extern u8 ata_sff_check_status(struct ata_port *ap); -extern u8 ata_sff_altstatus(struct ata_port *ap); +extern void ata_sff_pause(struct ata_port *ap); +extern void ata_sff_dma_pause(struct ata_port *ap); extern int ata_sff_busy_sleep(struct ata_port *ap, unsigned long timeout_pat, unsigned long timeout); extern int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline); @@ -1492,19 +1493,6 @@ extern int ata_pci_sff_init_one(struct pci_dev *pdev, struct scsi_host_template *sht, void *host_priv); #endif /* CONFIG_PCI */ -/** - * ata_sff_pause - Flush writes and pause 400 nanoseconds. - * @ap: Port to wait for. - * - * LOCKING: - * Inherited from caller. - */ -static inline void ata_sff_pause(struct ata_port *ap) -{ - ata_sff_altstatus(ap); - ndelay(400); -} - /** * ata_sff_busy_wait - Wait for a port status register * @ap: Port to wait for. -- cgit From 39028ec69b13712ec1dcd9aa14844bf60f19cb20 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 2 Jun 2008 15:46:51 -0300 Subject: V4L/DVB (7166): [v4l] Add new user class controls and deprecate others These were removed in commit 26d507fcfef7f7d0cd2eec874a87169cc121c835: > -#define V4L2_CID_HCENTER (V4L2_CID_BASE+22) > -#define V4L2_CID_VCENTER (V4L2_CID_BASE+23) > -#define V4L2_CID_LASTP1 (V4L2_CID_BASE+24) /* > last CID + 1 */ > + > +/* Deprecated, use V4L2_CID_PAN_RESET and V4L2_CID_TILT_RESET */ > +#define V4L2_CID_HCENTER_DEPRECATED (V4L2_CID_BASE+22) > +#define V4L2_CID_VCENTER_DEPRECATED (V4L2_CID_BASE+23) But there was no warning in Documentation/feature-removal-schedule.txt and I'm receiving reports that it's breaking userspace apps (the gstreamer-v4l2 plugin breaks in Fedora rawhide). You can't just pull things from the published userspace API like that. Please can we revert the addition of _DEPRECATED to these ioctl definitions. Perhaps we can add a runtime warning if they actually get used? Or a compile-time warning if we can manage that? Signed-off-by: David Woodhouse Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index c1411189ba6c..4a535ea1e123 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -865,9 +865,9 @@ struct v4l2_querymenu #define V4L2_CID_HFLIP (V4L2_CID_BASE+20) #define V4L2_CID_VFLIP (V4L2_CID_BASE+21) -/* Deprecated, use V4L2_CID_PAN_RESET and V4L2_CID_TILT_RESET */ -#define V4L2_CID_HCENTER_DEPRECATED (V4L2_CID_BASE+22) -#define V4L2_CID_VCENTER_DEPRECATED (V4L2_CID_BASE+23) +/* Deprecated; use V4L2_CID_PAN_RESET and V4L2_CID_TILT_RESET */ +#define V4L2_CID_HCENTER (V4L2_CID_BASE+22) +#define V4L2_CID_VCENTER (V4L2_CID_BASE+23) #define V4L2_CID_POWER_LINE_FREQUENCY (V4L2_CID_BASE+24) enum v4l2_power_line_frequency { -- cgit From 2f5997140f22f68f6390c49941150d3fa8a95cb7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 May 2008 12:10:20 -0300 Subject: KVM: migrate PIT timer Migrate the PIT timer to the physical CPU which vcpu0 is scheduled on, similarly to what is done for the LAPIC timers, otherwise PIT interrupts will be delayed until an unrelated event causes an exit. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 14 +++++++++++++- arch/x86/kvm/irq.c | 6 ++++++ arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 2 +- 7 files changed, 25 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7c077a9d9777..f2f5d260874e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,7 +200,6 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - /* FIXME: handle case where the guest is in guest mode */ if (vcpu0 && waitqueue_active(&vcpu0->wq)) { vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(&vcpu0->wq); @@ -237,6 +236,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct hrtimer *timer; + + if (vcpu->vcpu_id != 0 || !pit) + return; + + timer = &pit->pit_state.pit_timer.timer; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} + static void destroy_pit_timer(struct kvm_kpit_timer *pt) { pr_debug("pit: execute del timer!\n"); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ce1f583459b1..76d736b5f664 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); + +void __kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_pit_timer(vcpu); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 1802134b836f..2a15be2275c0 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab22615eee89..6b0d5fa5bab3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe4db11989c..96445f341519 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -608,7 +608,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21338bdb28ff..00acf1301a15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2758,7 +2758,7 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_timers(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 398978972b7a..092b1b25291d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -297,7 +297,7 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) return (gpa_t)gfn << PAGE_SHIFT; } -static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); } -- cgit From 44d1b980c72db0faf35adb082fb2208351803028 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Jun 2008 22:46:18 -0700 Subject: Fix various old email addresses for dwmw2 Although if people have questions about ARCnet, perhaps it's _better_ for them to be mailing dwmw2@cam.ac.uk about it... Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/networking/arcnet.txt | 2 +- arch/frv/kernel/cmode.S | 2 +- arch/frv/kernel/sleep.S | 2 +- arch/frv/mb93090-mb00/pci-dma-nommu.c | 2 +- drivers/mtd/redboot.c | 2 +- fs/afs/callback.c | 2 +- fs/afs/inode.c | 2 +- fs/afs/super.c | 2 +- include/linux/mtd/nand.h | 2 +- include/linux/tty.h | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.txt index 770fc41a78e8..796012540386 100644 --- a/Documentation/networking/arcnet.txt +++ b/Documentation/networking/arcnet.txt @@ -46,7 +46,7 @@ These are the ARCnet drivers for Linux. This new release (2.91) has been put together by David Woodhouse -, in an attempt to tidy up the driver after adding support +, in an attempt to tidy up the driver after adding support for yet another chipset. Now the generic support has been separated from the individual chipset drivers, and the source files aren't quite so packed with #ifdefs! I've changed this file a bit, but kept it in the first person from diff --git a/arch/frv/kernel/cmode.S b/arch/frv/kernel/cmode.S index 81ba28ad2207..53deeb5d7e87 100644 --- a/arch/frv/kernel/cmode.S +++ b/arch/frv/kernel/cmode.S @@ -1,7 +1,7 @@ /* cmode.S: clock mode management * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/frv/kernel/sleep.S b/arch/frv/kernel/sleep.S index c9b2d51ab9ad..f67bf73cd2cc 100644 --- a/arch/frv/kernel/sleep.S +++ b/arch/frv/kernel/sleep.S @@ -1,7 +1,7 @@ /* sleep.S: power saving mode entry * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index 4985466b1a7c..64ee58d748be 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -1,7 +1,7 @@ /* pci-dma-nommu.c: Dynamic DMA mapping support for the FRV * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/drivers/mtd/redboot.c b/drivers/mtd/redboot.c index 47474903263c..c5030f94f04e 100644 --- a/drivers/mtd/redboot.c +++ b/drivers/mtd/redboot.c @@ -295,5 +295,5 @@ module_init(redboot_parser_init); module_exit(redboot_parser_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Red Hat, Inc. - David Woodhouse "); +MODULE_AUTHOR("David Woodhouse "); MODULE_DESCRIPTION("Parsing code for RedBoot Flash Image System (FIS) tables"); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a78d5b236bb1..587ef5123cd8 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse + * Authors: David Woodhouse * David Howells * */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 08db82e1343a..bb47217f6a18 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse + * Authors: David Woodhouse * David Howells * */ diff --git a/fs/afs/super.c b/fs/afs/super.c index 4b572b801d8d..7e3faeef6818 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -10,7 +10,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Authors: David Howells - * David Woodhouse + * David Woodhouse * */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c42bc7f533a5..53ea3dc8b0e8 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1,7 +1,7 @@ /* * linux/include/linux/mtd/nand.h * - * Copyright (c) 2000 David Woodhouse + * Copyright (c) 2000 David Woodhouse * Steven J. Hill * Thomas Gleixner * diff --git a/include/linux/tty.h b/include/linux/tty.h index 7f7121f9c968..324a3b231d40 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -36,7 +36,7 @@ #define N_6PACK 7 #define N_MASC 8 /* Reserved for Mobitex module */ #define N_R3964 9 /* Reserved for Simatic R3964 module */ -#define N_PROFIBUS_FDL 10 /* Reserved for Profibus */ +#define N_PROFIBUS_FDL 10 /* Reserved for Profibus */ #define N_IRDA 11 /* Linux IrDa - http://irda.sourceforge.net/ */ #define N_SMSBLOCK 12 /* SMS block mode - for talking to GSM data */ /* cards about SMS messages */ -- cgit From 3527fb326f07bc8e85cf66d4f987ebeea24e8e4a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 5 Jun 2008 22:46:19 -0700 Subject: lib: export bitrev16 Bluetooth will be able to use this. Signed-off-by: Harvey Harrison Cc: Marcel Holtmann Cc: Dave Young Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitrev.h | 1 + lib/bitrev.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 05e540d6963a..7ffe03f4693d 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -10,6 +10,7 @@ static inline u8 bitrev8(u8 byte) return byte_rev_table[byte]; } +extern u16 bitrev16(u16 in); extern u32 bitrev32(u32 in); #endif /* _LINUX_BITREV_H */ diff --git a/lib/bitrev.c b/lib/bitrev.c index 989aff73f881..3956203456d4 100644 --- a/lib/bitrev.c +++ b/lib/bitrev.c @@ -42,10 +42,11 @@ const u8 byte_rev_table[256] = { }; EXPORT_SYMBOL_GPL(byte_rev_table); -static __always_inline u16 bitrev16(u16 x) +u16 bitrev16(u16 x) { return (bitrev8(x & 0xff) << 8) | bitrev8(x >> 8); } +EXPORT_SYMBOL(bitrev16); /** * bitrev32 - reverse the order of bits in a u32 value -- cgit From 93b071139a956e51c98cdefd50a47981a4eb852e Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 5 Jun 2008 22:46:21 -0700 Subject: introduce memory_read_from_buffer() This patch introduces memory_read_from_buffer(). The only difference between memory_read_from_buffer() and simple_read_from_buffer() is which address space the function copies to. simple_read_from_buffer copies to user space memory. memory_read_from_buffer copies to normal memory. Signed-off-by: Akinobu Mita Cc: Al Viro Cc: Doug Warzecha Cc: Zhang Rui Cc: Matt Domsch Cc: Abhay Salunke Cc: Greg Kroah-Hartman Cc: Markus Rechberger Cc: Kay Sievers Cc: Bob Moore Cc: Thomas Renninger Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: "Antonino A. Daplas" Cc: Krzysztof Helt Cc: Geert Uytterhoeven Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Peter Oberparleiter Cc: Michael Holzheu Cc: Brian King Cc: James E.J. Bottomley Cc: Andrew Vasquez Cc: Seokmann Ju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 18 ++++++++++++++++++ include/linux/fs.h | 5 ++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index b004dfadd891..892d41cb3382 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -528,6 +528,23 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, return count; } +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then @@ -800,6 +817,7 @@ EXPORT_SYMBOL(simple_statfs); EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(memory_read_from_buffer); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index f413085f748e..d490779f18d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2000,7 +2000,10 @@ extern int simple_fill_super(struct super_block *, int, struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); -extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); +extern ssize_t simple_read_from_buffer(void __user *to, size_t count, + loff_t *ppos, const void *from, size_t available); +extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit From 68aa0a206a7a2dd8655a50b36e8274eb87b84544 Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Thu, 5 Jun 2008 22:46:36 -0700 Subject: ipc: restore MSGPOOL original value When posting: [PATCH 1/8] Scaling msgmni to the amount of lowmem (see http://article.gmane.org/gmane.linux.kernel/637849/) I changed the MSGPOOL value to make it fit what is said in the man pages (i.e. a size in bytes). But Michael Kerrisk rightly complained that this change could affect the ABI. So I'm posting this patch to make MSGPOOL expressed back in Kbytes. Michael, on his side, has fixed the man page. Signed-off-by: Nadia Derbey Cc: Pierre Peiffer Cc: Manfred Spraul Acked-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msg.h b/include/linux/msg.h index 6f3b8e79a991..56abf1558fdd 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -64,11 +64,11 @@ struct msginfo { #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ /* unused */ -#define MSGPOOL (MSGMNI * MSGMNB) /* size in bytes of message pool */ +#define MSGPOOL (MSGMNI * MSGMNB / 1024) /* size in kbytes of message pool */ #define MSGTQL MSGMNB /* number of system message headers */ #define MSGMAP MSGMNB /* number of entries in message map */ #define MSGSSZ 16 /* message segment size */ -#define __MSGSEG (MSGPOOL / MSGSSZ) /* max no. of segments */ +#define __MSGSEG ((MSGPOOL * 1024) / MSGSSZ) /* max no. of segments */ #define MSGSEG (__MSGSEG <= 0xffff ? __MSGSEG : 0xffff) #ifdef __KERNEL__ -- cgit From 979b0fea2d9ae5d57237a368d571cbc84655fba6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Jun 2008 22:47:00 -0700 Subject: vm: add kzalloc_node() inline To get zeroed out memory from a particular NUMA node. To be used by sunrpc. Signed-off-by: Jeff Layton Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 805ed4b92f9a..c2ad35016599 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -276,6 +276,17 @@ static inline void *kzalloc(size_t size, gfp_t flags) return kmalloc(size, flags | __GFP_ZERO); } +/** + * kzalloc_node - allocate zeroed memory from a particular memory node. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). + * @node: memory node from which to allocate + */ +static inline void *kzalloc_node(size_t size, gfp_t flags, int node) +{ + return kmalloc_node(size, flags | __GFP_ZERO, node); +} + #ifdef CONFIG_SLABINFO extern const struct seq_operations slabinfo_op; ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); -- cgit From f751aa125d1843ea4a9a264b451fd5b1639fab20 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jun 2008 21:43:10 +0300 Subject: fat_valid_media() isn't for userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 73f20e58b1d586e9f6d3ddc3aad872829aca7743 ("FAT_VALID_MEDIA(): remove pointless test") wrongly added the new fat_valid_media() function to the userspace-visible part of include/linux/msdos_fs.h Move it to the part of include/linux/msdos_fs.h that is not exported to userspace. Reported-by: Onur Küçük Reported-by: S.Çağlar Onur Signed-off-by: Adrian Bunk Acked-by: OGAWA Hirofumi Signed-off-by: Linus Torvalds --- include/linux/msdos_fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index b03b27457413..81cd36b735b0 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -57,12 +57,6 @@ #define MSDOS_DOT ". " /* ".", padded to MSDOS_NAME chars */ #define MSDOS_DOTDOT ".. " /* "..", padded to MSDOS_NAME chars */ -/* media of boot sector */ -static inline int fat_valid_media(u8 media) -{ - return 0xf8 <= media || media == 0xf0; -} - #define FAT_FIRST_ENT(s, x) ((MSDOS_SB(s)->fat_bits == 32 ? 0x0FFFFF00 : \ MSDOS_SB(s)->fat_bits == 16 ? 0xFF00 : 0xF00) | (x)) @@ -334,6 +328,12 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) #endif } +/* media of boot sector */ +static inline int fat_valid_media(u8 media) +{ + return 0xf8 <= media || media == 0xf0; +} + /* fat/cache.c */ extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, -- cgit From 0d5799449f0f373ca12681d86c941ae464146a37 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 4 Jun 2008 08:30:54 +1000 Subject: [POWERPC] Make walk_memory_resource available with MEMORY_HOTPLUG=n The ehea driver was recently changed[1] to use walk_memory_resource() to detect the system's memory layout. However, walk_memory_resource() is available only when memory hotplug is enabled. So CONFIG_EHEA was made to depend on MEMORY_HOTPLUG [2], but it is inappropriate for a network driver to have such a dependency. Make the declaration of walk_memory_resource() and its powerpc implementation (ehea is powerpc-specific) unconditionally available. [1] 48cfb14f8b89d4d5b3df6c16f08b258686fb12ad "ehea: Add DLPAR memory remove support" [2] fb7b6ca2b6b7c23b52be143bdd5f55a23b9780c8 "ehea: Add dependency to Kconfig" Signed-off-by: Nathan Lynch Acked-by: Badari Pulavarty Signed-off-by: Paul Mackerras --- arch/powerpc/mm/mem.c | 3 +-- include/linux/memory_hotplug.h | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f67e118116fa..51f82d83bf14 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,6 +151,7 @@ out: return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ /* * walk_memory_resource() needs to make sure there is no holes in a given @@ -184,8 +185,6 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, } EXPORT_SYMBOL_GPL(walk_memory_resource); -#endif /* CONFIG_MEMORY_HOTPLUG */ - void show_mem(void) { unsigned long total = 0, reserved = 0; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 73e358612eaf..ea9f5ad9ec8e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -77,14 +77,6 @@ extern int __add_pages(struct zone *zone, unsigned long start_pfn, extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -/* - * Walk through all memory which is registered as resource. - * arg is (start_pfn, nr_pages, private_arg_pointer) - */ -extern int walk_memory_resource(unsigned long start_pfn, - unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)); - #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); #else @@ -199,6 +191,14 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) #endif /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Walk through all memory which is registered as resource. + * arg is (start_pfn, nr_pages, private_arg_pointer) + */ +extern int walk_memory_resource(unsigned long start_pfn, + unsigned long nr_pages, void *arg, + int (*func)(unsigned long, unsigned long, void *)); + extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int remove_memory(u64 start, u64 size); -- cgit From dfa7e20cc0d1a7a620def4dce97de1ae5375f99b Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Mon, 9 Jun 2008 11:18:45 -0500 Subject: mm: Minor clean-up of page flags in mm/page_alloc.c Minor source code cleanup of page flags in mm/page_alloc.c. Move the definition of the groups of bits to page-flags.h. The purpose of this clean up is that the next patch will conditionally add a page flag to the groups. Doing that in a header file is cleaner than adding #ifdefs to the C code. Signed-off-by: Russ Anderson Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 24 ++++++++++++++++++++++++ mm/page_alloc.c | 34 +++------------------------------- 2 files changed, 27 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 590cff32415d..f31debfac926 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -306,5 +306,29 @@ static inline void __ClearPageTail(struct page *page) } #endif /* !PAGEFLAGS_EXTENDED */ + +#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ + 1 << PG_buddy | 1 << PG_writeback | \ + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active) + +/* + * Flags checked in bad_page(). Pages on the free list should not have + * these flags set. It they are, there is a problem. + */ +#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | 1 << PG_reclaim | 1 << PG_dirty) + +/* + * Flags checked when a page is freed. Pages being freed should not have + * these flags set. It they are, there is a problem. + */ +#define PAGE_FLAGS_CHECK_AT_FREE (PAGE_FLAGS | 1 << PG_reserved) + +/* + * Flags checked when a page is prepped for return by the page allocator. + * Pages being prepped should not have these flags set. It they are, there + * is a problem. + */ +#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | 1 << PG_reserved | 1 << PG_dirty) + #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e83f02cd2d3..2f552955a02f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -237,16 +237,7 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~(1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_buddy ); + page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); /* -- cgit From 585c5434f0e02ff0ffc567ec223af61e2d8e2e88 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 5 Jun 2008 21:29:49 +0300 Subject: include/linux/ssb/ssb_driver_gige.h typo fix This patch fixes a typo in the name of a config variable. Reported-by: Robert P. J. Day Signed-off-by: Adrian Bunk Reviewed-by: Michael Buesch Signed-off-by: John W. Linville --- include/linux/ssb/ssb_driver_gige.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index 01fbdf5fef22..942e38736901 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -100,7 +100,7 @@ extern char * nvram_get(const char *name); /* Get the device MAC address */ static inline void ssb_gige_get_macaddr(struct pci_dev *pdev, u8 *macaddr) { -#ifdef CONFIG_BCM947XX +#ifdef CONFIG_BCM47XX char *res = nvram_get("et0macaddr"); if (res) memcpy(macaddr, res, 6); -- cgit From 16882c1e962b4be5122fc05aaf2afc10fd9e2d15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jun 2008 21:20:41 +0400 Subject: sched: fix TASK_WAKEKILL vs SIGKILL race schedule() has the special "TASK_INTERRUPTIBLE && signal_pending()" case, this allows us to do current->state = TASK_INTERRUPTIBLE; schedule(); without fear to sleep with pending signal. However, the code like current->state = TASK_KILLABLE; schedule(); is not right, schedule() doesn't take TASK_WAKEKILL into account. This means that mutex_lock_killable(), wait_for_completion_killable(), down_killable(), schedule_timeout_killable() can miss SIGKILL (and btw the second SIGKILL has no effect). Introduce the new helper, signal_pending_state(), and change schedule() to use it. Hopefully it will have more users, that is why the task's state is passed separately. Note this "__TASK_STOPPED | __TASK_TRACED" check in signal_pending_state(). This is needed to preserve the current behaviour (ptrace_notify). I hope this check will be removed soon, but this (afaics good) change needs the separate discussion. The fast path is "(state & (INTERRUPTIBLE | WAKEKILL)) + signal_pending(p)", basically the same that schedule() does now. However, this patch of course bloats schedule(). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ kernel/sched.c | 6 ++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ae0be3c62375..c5d3f847ca8d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2026,6 +2026,19 @@ static inline int fatal_signal_pending(struct task_struct *p) return signal_pending(p) && __fatal_signal_pending(p); } +static inline int signal_pending_state(long state, struct task_struct *p) +{ + if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) + return 0; + if (!signal_pending(p)) + return 0; + + if (state & (__TASK_STOPPED | __TASK_TRACED)) + return 0; + + return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); +} + static inline int need_resched(void) { return unlikely(test_thread_flag(TIF_NEED_RESCHED)); diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed171..2c65bf29d133 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4159,12 +4159,10 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } -- cgit From b76916462d990751882eaeadc75ac8c487d6de1d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 10 Jun 2008 20:56:36 +0200 Subject: ide: remove the ide_etrax100 chipset type I forgot to remove the ide_etrax100 chipset type when removing the ETRAX_IDE driver. Reported-by: Bartlomiej Zolnierkiewicz Signed-off-by: Adrian Bunk Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 5 ++--- drivers/ide/ide-proc.c | 1 - include/linux/ide.h | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 655ec7ef568a..0bccb63d10a1 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1333,8 +1333,7 @@ static void ide_port_init_devices(ide_hwif_t *hwif) static void ide_init_port(ide_hwif_t *hwif, unsigned int port, const struct ide_port_info *d) { - if (d->chipset != ide_etrax100) - hwif->channel = port; + hwif->channel = port; if (d->chipset) hwif->chipset = d->chipset; @@ -1519,7 +1518,7 @@ int ide_device_add_all(u8 *idx, const struct ide_port_info *d) continue; } - if (d->chipset != ide_etrax100 && (i & 1) && mate) { + if ((i & 1) && mate) { hwif->mate = mate; mate->mate = hwif; } diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 8d6ad812a014..55ec7f798772 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -63,7 +63,6 @@ static int proc_ide_read_imodel case ide_pmac: name = "mac-io"; break; case ide_au1xxx: name = "au1xxx"; break; case ide_palm3710: name = "palm3710"; break; - case ide_etrax100: name = "etrax100"; break; case ide_acorn: name = "acorn"; break; default: name = "(unknown)"; break; } diff --git a/include/linux/ide.h b/include/linux/ide.h index f8f195c20da2..9918772bf274 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -153,7 +153,7 @@ enum { ide_unknown, ide_generic, ide_pci, ide_qd65xx, ide_umc8672, ide_ht6560b, ide_rz1000, ide_trm290, ide_cmd646, ide_cy82c693, ide_4drives, - ide_pmac, ide_etrax100, ide_acorn, + ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit From ce4a7d0d48bbaed78ccbb0bafb9229651a40303a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Jun 2008 12:39:35 -0700 Subject: inet{6}_request_sock: Init ->opt and ->pktopts in the constructor Wei Yongjun noticed that we may call reqsk_free on request sock objects where the opt fields may not be initialized, fix it by introducing inet_reqsk_alloc where we initialize ->opt to NULL and set ->pktopts to NULL in inet6_reqsk_alloc. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 4 +++- include/net/inet_sock.h | 10 ++++++++++ net/dccp/ipv4.c | 3 +-- net/dccp/ipv6.c | 1 - net/ipv4/syncookies.c | 3 +-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/syncookies.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 8 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 10b666b61add..cde056e08181 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -396,8 +396,10 @@ static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *op { struct request_sock *req = reqsk_alloc(ops); - if (req != NULL) + if (req != NULL) { inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req); + inet6_rsk(req)->pktopts = NULL; + } return req; } diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a42cd63d241a..9fabe5b38912 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -197,4 +197,14 @@ static inline int inet_iif(const struct sk_buff *skb) return skb->rtable->rt_iif; } +static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) +{ + struct request_sock *req = reqsk_alloc(ops); + + if (req != NULL) + inet_rsk(req)->opt = NULL; + + return req; +} + #endif /* _INET_SOCK_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index c22a3780c14e..37d27bcb361f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -589,7 +589,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&dccp_request_sock_ops); + req = inet_reqsk_alloc(&dccp_request_sock_ops); if (req == NULL) goto drop; @@ -605,7 +605,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; - ireq->opt = NULL; /* * Step 3: Process LISTEN state diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9b1129bb7ece..f7fe2a572d7b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -421,7 +421,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq6 = inet6_rsk(req); ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); - ireq6->pktopts = NULL; if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 73ba98921d64..d182a2a26291 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -285,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, cookie_check_timestamp(&tcp_opt); ret = NULL; - req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) goto out; @@ -301,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; - ireq->opt = NULL; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd601a866c2f..4f8485c67d1a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1285,7 +1285,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp_request_sock_ops); + req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) goto drop; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 938ce4ecde55..3ecc1157994e 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -198,7 +198,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq6 = inet6_rsk(req); treq = tcp_rsk(req); - ireq6->pktopts = NULL; if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 715965f0fac0..cb46749d4c32 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1299,7 +1299,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) treq = inet6_rsk(req); ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); - treq->pktopts = NULL; if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); -- cgit From 2506ece0c0bbd2fc19a4827b96dc52ea47e2ce4a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sun, 8 Jun 2008 20:49:59 +1000 Subject: virtio: Fix typo in virtio_net_hdr comments Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell Signed-off-by: Jeff Garzik --- include/linux/virtio_net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9405aa6cdf26..38c0571820fb 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -38,7 +38,7 @@ struct virtio_net_hdr #define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set __u8 gso_type; __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ - __u16 gso_size; /* Bytes to append to gso_hdr_len per frame */ + __u16 gso_size; /* Bytes to append to hdr_len per frame */ __u16 csum_start; /* Position to start checksumming from */ __u16 csum_offset; /* Offset after that to place checksum */ }; -- cgit From 709772e6e06564ed94ba740de70185ac3d792773 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Tue, 10 Jun 2008 15:44:49 -0700 Subject: net: Fix routing tables with id > 255 for legacy software Most legacy software do not like tables > 255 as rtm_table is u8 so tb_id is sent &0xff and it is possible to mismatch for example table 510 with table 254 (main). This patch introduces RT_TABLE_COMPAT=252 so the code uses it if tb_id > 255. It makes such old applications happy, new ones are still able to use RTA_TABLE to get a proper table id. Signed-off-by: Krzysztof Piotr Oledzki Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + net/ipv4/fib_semantics.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a2aec2c0cfb5..b358c704d102 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -246,6 +246,7 @@ enum rt_class_t { RT_TABLE_UNSPEC=0, /* User defined values */ + RT_TABLE_COMPAT=252, RT_TABLE_DEFAULT=253, RT_TABLE_MAIN=254, RT_TABLE_LOCAL=255, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3b83c34019fc..0d4d72827e4b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -960,7 +960,10 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, rtm->rtm_dst_len = dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = tos; - rtm->rtm_table = tb_id; + if (tb_id < 256) + rtm->rtm_table = tb_id; + else + rtm->rtm_table = RT_TABLE_COMPAT; NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; -- cgit From dcb84f335bee9c9a7781cfc5d74492dccaf066d2 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 19 May 2008 19:09:27 -0400 Subject: cpuidle acpi driver: fix oops on AC<->DC cpuidle and acpi driver interaction bug with the way cpuidle_register_driver() is called. Due to this bug, there will be oops on AC<->DC on some systems, where they support C-states in one DC and not in AC. The current code does ON BOOT: Look at CST and other C-state info to see whether more than C1 is supported. If it is, then acpi processor_idle does a cpuidle_register_driver() call, which internally enables the device. ON CST change notification (AC<->DC) and on suspend-resume: acpi driver temporarily disables device, updates the device with any new C-states, and reenables the device. The problem is is on boot, there are no C2, C3 states supported and we skip the register. Later on AC<->DC, we may get a CST notification and we try to reevaluate CST and enabled the device, without actually registering it. This causes breakage as we try to create /sys fs sub directory, without the parent directory which is created at register time. Thanks to Sanjeev for reporting the problem here. http://bugzilla.kernel.org/show_bug.cgi?id=10394 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- drivers/acpi/processor_idle.c | 13 +++++++------ drivers/cpuidle/cpuidle.c | 40 +++++++++++++++++++++++++++++++++++----- include/linux/cpuidle.h | 1 + 3 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 2dd2c1f3a01c..556ee1585192 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1669,6 +1669,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr) return -EINVAL; } + dev->cpu = pr->id; for (i = 0; i < CPUIDLE_STATE_MAX; i++) { dev->states[i].name[0] = '\0'; dev->states[i].desc[0] = '\0'; @@ -1738,7 +1739,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr) int acpi_processor_cst_has_changed(struct acpi_processor *pr) { - int ret; + int ret = 0; if (boot_option_idle_override) return 0; @@ -1756,8 +1757,10 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr) cpuidle_pause_and_lock(); cpuidle_disable_device(&pr->power.dev); acpi_processor_get_power_info(pr); - acpi_processor_setup_cpuidle(pr); - ret = cpuidle_enable_device(&pr->power.dev); + if (pr->flags.power) { + acpi_processor_setup_cpuidle(pr); + ret = cpuidle_enable_device(&pr->power.dev); + } cpuidle_resume_and_unlock(); return ret; @@ -1813,7 +1816,6 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, if (pr->flags.power) { #ifdef CONFIG_CPU_IDLE acpi_processor_setup_cpuidle(pr); - pr->power.dev.cpu = pr->id; if (cpuidle_register_device(&pr->power.dev)) return -EIO; #endif @@ -1850,8 +1852,7 @@ int acpi_processor_power_exit(struct acpi_processor *pr, return 0; #ifdef CONFIG_CPU_IDLE - if (pr->flags.power) - cpuidle_unregister_device(&pr->power.dev); + cpuidle_unregister_device(&pr->power.dev); #endif pr->flags.power_setup_done = 0; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index fc555a90bb21..23554b676d6e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -38,6 +38,8 @@ static void cpuidle_kick_cpus(void) static void cpuidle_kick_cpus(void) {} #endif +static int __cpuidle_register_device(struct cpuidle_device *dev); + /** * cpuidle_idle_call - the main idle loop * @@ -138,6 +140,12 @@ int cpuidle_enable_device(struct cpuidle_device *dev) if (!dev->state_count) return -EINVAL; + if (dev->registered == 0) { + ret = __cpuidle_register_device(dev); + if (ret) + return ret; + } + if ((ret = cpuidle_add_state_sysfs(dev))) return ret; @@ -232,10 +240,13 @@ static void poll_idle_init(struct cpuidle_device *dev) {} #endif /* CONFIG_ARCH_HAS_CPU_RELAX */ /** - * cpuidle_register_device - registers a CPU's idle PM feature + * __cpuidle_register_device - internal register function called before register + * and enable routines * @dev: the cpu + * + * cpuidle_lock mutex must be held before this is called */ -int cpuidle_register_device(struct cpuidle_device *dev) +static int __cpuidle_register_device(struct cpuidle_device *dev) { int ret; struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu); @@ -247,18 +258,34 @@ int cpuidle_register_device(struct cpuidle_device *dev) init_completion(&dev->kobj_unregister); - mutex_lock(&cpuidle_lock); - poll_idle_init(dev); per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); if ((ret = cpuidle_add_sysfs(sys_dev))) { - mutex_unlock(&cpuidle_lock); module_put(cpuidle_curr_driver->owner); return ret; } + dev->registered = 1; + return 0; +} + +/** + * cpuidle_register_device - registers a CPU's idle PM feature + * @dev: the cpu + */ +int cpuidle_register_device(struct cpuidle_device *dev) +{ + int ret; + + mutex_lock(&cpuidle_lock); + + if ((ret = __cpuidle_register_device(dev))) { + mutex_unlock(&cpuidle_lock); + return ret; + } + cpuidle_enable_device(dev); cpuidle_install_idle_handler(); @@ -278,6 +305,9 @@ void cpuidle_unregister_device(struct cpuidle_device *dev) { struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu); + if (dev->registered == 0) + return; + cpuidle_pause_and_lock(); cpuidle_disable_device(dev); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 51e6b1e520e6..dcf77fa826b5 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -82,6 +82,7 @@ struct cpuidle_state_kobj { }; struct cpuidle_device { + unsigned int registered:1; unsigned int enabled:1; unsigned int cpu; -- cgit From e9fe9e188118a0a34c6200d9b10ea6247f53592d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 9 Jun 2008 16:52:04 -0700 Subject: pnpacpi: fix IRQ flag decoding When decoding IRQ trigger mode and polarity, it is not enough to mask by IORESOURCE_BITS because there are now additional bits defined. For example, if IORESOURCE_IRQ_SHAREABLE was set, we failed to set *triggering and *polarity at all. I can't point to a failure that this patch fixes, but bugs in this area have caused problems when resuming after suspend, for example: http://bugzilla.kernel.org/show_bug.cgi?id=6316 http://bugzilla.kernel.org/show_bug.cgi?id=9487 https://bugs.launchpad.net/ubuntu/+source/linux-source-2.6.22/+bug/152187 This is based on a patch by Tom Jaeger: http://bugzilla.kernel.org/show_bug.cgi?id=9487#c32 [rene.herman@keyaccess.nl: fix comment] Signed-off-by: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- drivers/pnp/pnpacpi/rsparser.c | 16 ++++++++++++---- include/linux/ioport.h | 6 +++--- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 0201c8adfda7..ab09fe6fe1e4 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -56,9 +56,11 @@ static int irq_flags(int triggering, int polarity, int shareable) return flags; } -static void decode_irq_flags(int flag, int *triggering, int *polarity) +static void decode_irq_flags(struct pnp_dev *dev, int flags, int *triggering, + int *polarity) { - switch (flag) { + switch (flags & (IORESOURCE_IRQ_LOWLEVEL | IORESOURCE_IRQ_HIGHLEVEL | + IORESOURCE_IRQ_LOWEDGE | IORESOURCE_IRQ_HIGHEDGE)) { case IORESOURCE_IRQ_LOWLEVEL: *triggering = ACPI_LEVEL_SENSITIVE; *polarity = ACPI_ACTIVE_LOW; @@ -75,6 +77,12 @@ static void decode_irq_flags(int flag, int *triggering, int *polarity) *triggering = ACPI_EDGE_SENSITIVE; *polarity = ACPI_ACTIVE_HIGH; break; + default: + dev_err(&dev->dev, "can't encode invalid IRQ mode %#x\n", + flags); + *triggering = ACPI_EDGE_SENSITIVE; + *polarity = ACPI_ACTIVE_HIGH; + break; } } @@ -790,7 +798,7 @@ static void pnpacpi_encode_irq(struct pnp_dev *dev, struct acpi_resource_irq *irq = &resource->data.irq; int triggering, polarity; - decode_irq_flags(p->flags & IORESOURCE_BITS, &triggering, &polarity); + decode_irq_flags(dev, p->flags, &triggering, &polarity); irq->triggering = triggering; irq->polarity = polarity; if (triggering == ACPI_EDGE_SENSITIVE) @@ -813,7 +821,7 @@ static void pnpacpi_encode_ext_irq(struct pnp_dev *dev, struct acpi_resource_extended_irq *extended_irq = &resource->data.extended_irq; int triggering, polarity; - decode_irq_flags(p->flags & IORESOURCE_BITS, &triggering, &polarity); + decode_irq_flags(dev, p->flags, &triggering, &polarity); extended_irq->producer_consumer = ACPI_CONSUMER; extended_irq->triggering = triggering; extended_irq->polarity = polarity; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index d5d40a9f7929..c6801bffe76d 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -53,14 +53,14 @@ struct resource_list { #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ -/* ISA PnP IRQ specific bits (IORESOURCE_BITS) */ +/* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) -/* ISA PnP DMA specific bits (IORESOURCE_BITS) */ +/* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) #define IORESOURCE_DMA_8BIT (0<<0) #define IORESOURCE_DMA_8AND16BIT (1<<0) @@ -76,7 +76,7 @@ struct resource_list { #define IORESOURCE_DMA_TYPEB (2<<6) #define IORESOURCE_DMA_TYPEF (3<<6) -/* ISA PnP memory I/O specific bits (IORESOURCE_BITS) */ +/* PnP memory I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_MEM_WRITEABLE (1<<0) /* dup: IORESOURCE_READONLY */ #define IORESOURCE_MEM_CACHEABLE (1<<1) /* dup: IORESOURCE_CACHEABLE */ #define IORESOURCE_MEM_RANGELENGTH (1<<2) /* dup: IORESOURCE_RANGELENGTH */ -- cgit From 45aec1ae72fc592f231e9e73ed9ed4d10cfbc0b5 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:22 -0700 Subject: x86: PAT export resource_wc in pci sysfs For the ranges with IORESOURCE_PREFETCH, export a new resource_wc interface in pci /sysfs along with resource (which is uncached). Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Jesse Barnes Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/filesystems/sysfs-pci.txt | 1 + drivers/pci/pci-sysfs.c | 84 ++++++++++++++++++++++++--------- include/linux/pci.h | 1 + 3 files changed, 64 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 5daa2aaec2c5..68ef48839c04 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -36,6 +36,7 @@ files, each with their own function. local_cpus nearby CPU mask (cpumask, ro) resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap) + resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) subsystem_vendor PCI subsystem vendor (ascii, ro) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 271d41cc05ab..9ec7d3977a82 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -489,13 +489,14 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap + * @write_combine: 1 for write_combine mapping * * Use the regular PCI mapping routines to map a PCI resource into userspace. * FIXME: write combining? maybe automatic for prefetchable regions? */ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, int write_combine) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj)); @@ -518,7 +519,21 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; - return pci_mmap_page_range(pdev, vma, mmap_type, 0); + return pci_mmap_page_range(pdev, vma, mmap_type, write_combine); +} + +static int +pci_mmap_resource_uc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 0); +} + +static int +pci_mmap_resource_wc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 1); } /** @@ -541,9 +556,46 @@ pci_remove_resource_files(struct pci_dev *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); kfree(res_attr); } + + res_attr = pdev->res_attr_wc[i]; + if (res_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); + kfree(res_attr); + } } } +static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) +{ + /* allocate attribute structure, piggyback attribute name */ + int name_len = write_combine ? 13 : 10; + struct bin_attribute *res_attr; + int retval; + + res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC); + if (res_attr) { + char *res_attr_name = (char *)(res_attr + 1); + + if (write_combine) { + pdev->res_attr_wc[num] = res_attr; + sprintf(res_attr_name, "resource%d_wc", num); + res_attr->mmap = pci_mmap_resource_wc; + } else { + pdev->res_attr[num] = res_attr; + sprintf(res_attr_name, "resource%d", num); + res_attr->mmap = pci_mmap_resource_uc; + } + res_attr->attr.name = res_attr_name; + res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->size = pci_resource_len(pdev, num); + res_attr->private = &pdev->resource[num]; + retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); + } else + retval = -ENOMEM; + + return retval; +} + /** * pci_create_resource_files - create resource files in sysfs for @dev * @dev: dev in question @@ -557,31 +609,19 @@ static int pci_create_resource_files(struct pci_dev *pdev) /* Expose the PCI resources from this device as files */ for (i = 0; i < PCI_ROM_RESOURCE; i++) { - struct bin_attribute *res_attr; /* skip empty resources */ if (!pci_resource_len(pdev, i)) continue; - /* allocate attribute structure, piggyback attribute name */ - res_attr = kzalloc(sizeof(*res_attr) + 10, GFP_ATOMIC); - if (res_attr) { - char *res_attr_name = (char *)(res_attr + 1); - - pdev->res_attr[i] = res_attr; - sprintf(res_attr_name, "resource%d", i); - res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; - res_attr->size = pci_resource_len(pdev, i); - res_attr->mmap = pci_mmap_resource; - res_attr->private = &pdev->resource[i]; - retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); - if (retval) { - pci_remove_resource_files(pdev); - return retval; - } - } else { - return -ENOMEM; + retval = pci_create_attr(pdev, i, 0); + /* for prefetchable resources, create a WC mappable file */ + if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH) + retval = pci_create_attr(pdev, i, 1); + + if (retval) { + pci_remove_resource_files(pdev); + return retval; } } return 0; diff --git a/include/linux/pci.h b/include/linux/pci.h index 509159bcd4e7..d18b1dd49fab 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -206,6 +206,7 @@ struct pci_dev { struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ int rom_attr_enabled; /* has display of the rom attribute been enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ + struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ #ifdef CONFIG_PCI_MSI struct list_head msi_list; #endif -- cgit From f595ec964daf7f99668039d7303ddedd09a75142 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Jun 2008 10:47:56 +0200 Subject: common implementation of iterative div/mod We have a few instances of the open-coded iterative div/mod loop, used when we don't expcet the dividend to be much bigger than the divisor. Unfortunately modern gcc's have the tendency to strength "reduce" this into a full mod operation, which isn't necessarily any faster, and even if it were, doesn't exist if gcc implements it in libgcc. The workaround is to put a dummy asm statement in the loop to prevent gcc from performing the transformation. This patch creates a single implementation of this loop, and uses it to replace the open-coded versions I know about. Signed-off-by: Jeremy Fitzhardinge Cc: Andrew Morton Cc: john stultz Cc: Segher Boessenkool Cc: Christian Kujau Cc: Robert Hancock Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 13 +++---------- include/linux/math64.h | 2 ++ include/linux/time.h | 11 ++--------- lib/div64.c | 23 +++++++++++++++++++++++ 4 files changed, 30 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5aa241..52b2e3856980 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -150,11 +151,7 @@ static void do_stolen_accounting(void) if (stolen < 0) stolen = 0; - ticks = 0; - while (stolen >= NS_PER_TICK) { - ticks++; - stolen -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; account_steal_time(NULL, ticks); @@ -166,11 +163,7 @@ static void do_stolen_accounting(void) if (blocked < 0) blocked = 0; - ticks = 0; - while (blocked >= NS_PER_TICK) { - ticks++; - blocked -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; account_steal_time(idle_task(smp_processor_id()), ticks); } diff --git a/include/linux/math64.h b/include/linux/math64.h index c1a5f81501ff..177785e1e4a3 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -81,4 +81,6 @@ static inline s64 div_s64(s64 dividend, s32 divisor) } #endif +u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); + #endif /* _LINUX_MATH64_H */ diff --git a/include/linux/time.h b/include/linux/time.h index d32ef0ad4c0a..05f9517a8ed1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -6,6 +6,7 @@ #ifdef __KERNEL__ # include # include +# include #endif #ifndef _STRUCT_TIMESPEC @@ -172,15 +173,7 @@ extern struct timeval ns_to_timeval(const s64 nsec); */ static inline void timespec_add_ns(struct timespec *a, u64 ns) { - ns += a->tv_nsec; - while(unlikely(ns >= NSEC_PER_SEC)) { - /* The following asm() prevents the compiler from - * optimising this loop into a modulo operation. */ - asm("" : "+r"(ns)); - - ns -= NSEC_PER_SEC; - a->tv_sec++; - } + a->tv_sec += iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } #endif /* __KERNEL__ */ diff --git a/lib/div64.c b/lib/div64.c index bb5bd0c0f030..76c01542d3e1 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -98,3 +98,26 @@ EXPORT_SYMBOL(div64_u64); #endif #endif /* BITS_PER_LONG == 32 */ + +/* + * Iterative div/mod for use when dividend is not expected to be much + * bigger than divisor. + */ +u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) +{ + u32 ret = 0; + + while (dividend >= divisor) { + /* The following asm() prevents the compiler from + optimising this loop into a modulo operation. */ + asm("" : "+rm"(dividend)); + + dividend -= divisor; + ret++; + } + + *remainder = dividend; + + return ret; +} +EXPORT_SYMBOL(iter_div_u64_rem); -- cgit From d5e181f78ac753893eb930868a52a4488cd3de0a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Jun 2008 10:47:58 +0200 Subject: add an inlined version of iter_div_u64_rem iter_div_u64_rem is used in the x86-64 vdso, which cannot call other kernel code. For this case, provide the always_inlined version, __iter_div_u64_rem. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/math64.h | 19 +++++++++++++++++++ lib/div64.c | 15 +-------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 177785e1e4a3..c87f1528703a 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -83,4 +83,23 @@ static inline s64 div_s64(s64 dividend, s32 divisor) u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); +static __always_inline u32 +__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) +{ + u32 ret = 0; + + while (dividend >= divisor) { + /* The following asm() prevents the compiler from + optimising this loop into a modulo operation. */ + asm("" : "+rm"(dividend)); + + dividend -= divisor; + ret++; + } + + *remainder = dividend; + + return ret; +} + #endif /* _LINUX_MATH64_H */ diff --git a/lib/div64.c b/lib/div64.c index 76c01542d3e1..a111eb8de9cf 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -105,19 +105,6 @@ EXPORT_SYMBOL(div64_u64); */ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { - u32 ret = 0; - - while (dividend >= divisor) { - /* The following asm() prevents the compiler from - optimising this loop into a modulo operation. */ - asm("" : "+rm"(dividend)); - - dividend -= divisor; - ret++; - } - - *remainder = dividend; - - return ret; + return __iter_div_u64_rem(dividend, divisor, remainder); } EXPORT_SYMBOL(iter_div_u64_rem); -- cgit From 9412e28649d0272df5e4af57bb378926fd4df580 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Jun 2008 10:48:00 +0200 Subject: always_inline timespec_add_ns timespec_add_ns is used from the x86-64 vdso, which cannot call out to other kernel code. Make sure that timespec_add_ns is always inlined (and only uses always_inlined functions) to make sure there are no unexpected calls. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/time.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 05f9517a8ed1..e15206a7e82e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -170,10 +170,13 @@ extern struct timeval ns_to_timeval(const s64 nsec); * timespec_add_ns - Adds nanoseconds to a timespec * @a: pointer to timespec to be incremented * @ns: unsigned nanoseconds value to be added + * + * This must always be inlined because its used from the x86-64 vdso, + * which cannot call other kernel functions. */ -static inline void timespec_add_ns(struct timespec *a, u64 ns) +static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) { - a->tv_sec += iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } #endif /* __KERNEL__ */ -- cgit From ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jun 2008 16:31:35 -0700 Subject: tcp: Revert 'process defer accept as established' changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87 ("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38 ("tcp: Fix slab corruption with ipv6 and tcp6fuzz"). This change causes several problems, first reported by Ingo Molnar as a distcc-over-loopback regression where connections were getting stuck. Ilpo Järvinen first spotted the locking problems. The new function added by this code, tcp_defer_accept_check(), only has the child socket locked, yet it is modifying state of the parent listening socket. Fixing that is non-trivial at best, because we can't simply just grab the parent listening socket lock at this point, because it would create an ABBA deadlock. The normal ordering is parent listening socket --> child socket, but this code path would require the reverse lock ordering. Next is a problem noticed by Vitaliy Gusev, he noted: ---------------------------------------- >--- a/net/ipv4/tcp_timer.c >+++ b/net/ipv4/tcp_timer.c >@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data) > goto death; > } > >+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ goto death; Here socket sk is not attached to listening socket's request queue. tcp_done() will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should release this sk) as socket is not DEAD. Therefore socket sk will be lost for freeing. ---------------------------------------- Finally, Alexey Kuznetsov argues that there might not even be any real value or advantage to these new semantics even if we fix all of the bugs: ---------------------------------------- Hiding from accept() sockets with only out-of-order data only is the only thing which is impossible with old approach. Is this really so valuable? My opinion: no, this is nothing but a new loophole to consume memory without control. ---------------------------------------- So revert this thing for now. Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 ------- include/net/request_sock.h | 4 ++-- include/net/tcp.h | 1 - net/ipv4/inet_connection_sock.c | 11 +++++++--- net/ipv4/tcp.c | 18 ++++++++++------- net/ipv4/tcp_input.c | 45 ----------------------------------------- net/ipv4/tcp_ipv4.c | 8 -------- net/ipv4/tcp_minisocks.c | 32 +++++++++++------------------ net/ipv4/tcp_timer.c | 5 ----- 9 files changed, 33 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 18e62e3d406f..b31b6b74aa28 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,11 +239,6 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } -struct tcp_deferred_accept_info { - struct sock *listen_sk; - struct request_sock *request; -}; - struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -379,8 +374,6 @@ struct tcp_sock { unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; - struct tcp_deferred_accept_info defer_tcp_accept; - unsigned long last_synq_overflow; u32 tso_deferred; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b220b5f624de..0c96e7bed5db 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -115,8 +115,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; - u16 rskq_defer_accept; - /* 2 bytes hole, try to pack */ + u8 rskq_defer_accept; + /* 3 bytes hole, try to pack */ struct listen_sock *listen_opt; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index d448310c82c1..cf54034019d9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -139,7 +139,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define MAX_TCP_ACCEPT_DEFERRED 65535 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 828ea211ff21..045e799d3e1d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if (req->retrans < thresh && - !req->rsk_ops->rtx_syn_ack(parent, req)) { + if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) && + (inet_rsk(req)->acked || + !req->rsk_ops->rtx_syn_ack(parent, req))) { unsigned long timeo; if (req->retrans++ == 0) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab66683b8043..fc54a48fde1e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - if (val < 0) { - err = -EINVAL; - } else { - if (val > MAX_TCP_ACCEPT_DEFERRED) - val = MAX_TCP_ACCEPT_DEFERRED; - icsk->icsk_accept_queue.rskq_defer_accept = val; + icsk->icsk_accept_queue.rskq_defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of + * retransmits */ + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && + val > ((TCP_TIMEOUT_INIT / HZ) << + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = icsk->icsk_accept_queue.rskq_defer_accept; + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eba873e9b560..cad73b7dfef0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) } } -static int tcp_defer_accept_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->defer_tcp_accept.request) { - int queued_data = tp->rcv_nxt - tp->copied_seq; - int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? - tcp_hdr((struct sk_buff *) - sk->sk_receive_queue.prev)->fin : 0; - - if (queued_data && hasfin) - queued_data--; - - if (queued_data && - tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { - if (sock_flag(sk, SOCK_KEEPOPEN)) { - inet_csk_reset_keepalive_timer(sk, - keepalive_time_when(tp)); - } else { - inet_csk_delete_keepalive_timer(sk); - } - - inet_csk_reqsk_queue_add( - tp->defer_tcp_accept.listen_sk, - tp->defer_tcp_accept.request, - sk); - - tp->defer_tcp_accept.listen_sk->sk_data_ready( - tp->defer_tcp_accept.listen_sk, 0); - - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } else if (hasfin || - tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { - tcp_reset(sk); - return -1; - } - } - return 0; -} - static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); @@ -4944,8 +4901,6 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - - tcp_defer_accept_check(sk); return 0; csum_error: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4f8485c67d1a..97a230026e13 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } - if (tp->defer_tcp_accept.request) { - reqsk_free(tp->defer_tcp_accept.request); - sock_put(tp->defer_tcp_accept.listen_sk); - sock_put(sk); - tp->defer_tcp_accept.listen_sk = NULL; - tp->defer_tcp_accept.request = NULL; - } - atomic_dec(&tcp_sockets_allocated); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 019c8c16e5cc..8245247a6ceb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - Both ends (listening sockets) accept the new incoming - connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + inet_rsk(req)->acked = 1; + return NULL; + } + /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO @@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - - /* the accept queue handling is done is est recv slow - * path so lets make sure to start there - */ - tcp_sk(child)->pred_flags = 0; - sock_hold(sk); - sock_hold(child); - tcp_sk(child)->defer_tcp_accept.listen_sk = sk; - tcp_sk(child)->defer_tcp_accept.request = req; - - inet_csk_reset_keepalive_timer(child, - inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); - } else { - inet_csk_reqsk_queue_add(sk, req, child); - } - + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4de68cf5f2aa..63ed9d6830e7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { - tcp_send_active_reset(sk, GFP_ATOMIC); - goto death; - } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; -- cgit From 57d3c64fd8130ebdacd85a36c9656ba5e221f3a3 Mon Sep 17 00:00:00 2001 From: Ben Nizette Date: Thu, 12 Jun 2008 15:21:31 -0700 Subject: proc_fs.h: move struct mm_struct forward-declaration Move the forward-declaration of struct mm_struct a little way up proc_fs.h. This fixes a bunch of "'struct mm_struct' declared inside parameter list" warnings with CONFIG_PROC_FS=n Signed-off-by: Ben Nizette Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 9883bc942262..fff1d27ddb4c 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -9,6 +9,8 @@ struct net; struct completion; +struct mm_struct; + /* * The proc filesystem constants/structures */ @@ -101,8 +103,6 @@ extern spinlock_t proc_subdir_lock; extern void proc_root_init(void); extern void proc_misc_init(void); -struct mm_struct; - void proc_flush_task(struct task_struct *task); struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -- cgit From 24aac480e76c6f5d1391ac05c5e9c0eb9b0cd302 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Thu, 12 Jun 2008 15:21:34 -0700 Subject: cciss: add new hardware support Add support for the next generation of HP Smart Array SAS/SATA controllers. Shipping date is late Fall 2008. Bump the driver version to 3.6.20 to reflect the new hardware support from patch 1 of this set. Signed-off-by: Mike Miller Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cciss.txt | 5 +++++ drivers/block/cciss.c | 21 ++++++++++++++++----- include/linux/pci_ids.h | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cciss.txt b/Documentation/cciss.txt index e65736c6b8bc..63e59b8847c5 100644 --- a/Documentation/cciss.txt +++ b/Documentation/cciss.txt @@ -21,6 +21,11 @@ This driver is known to work with the following cards: * SA E200 * SA E200i * SA E500 + * SA P212 + * SA P410 + * SA P410i + * SA P411 + * SA P812 Detecting drive failures: ------------------------- diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e336b05fe4a7..5f1e1cc6165a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -53,15 +53,16 @@ #include #define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) -#define DRIVER_NAME "HP CISS Driver (v 3.6.14)" -#define DRIVER_VERSION CCISS_DRIVER_VERSION(3,6,14) +#define DRIVER_NAME "HP CISS Driver (v 3.6.20)" +#define DRIVER_VERSION CCISS_DRIVER_VERSION(3, 6, 20) /* Embedded module documentation macros - see modules.h */ MODULE_AUTHOR("Hewlett-Packard Company"); -MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 3.6.14"); +MODULE_DESCRIPTION("Driver for HP Smart Array Controllers"); MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400" - " SA6i P600 P800 P400 P400i E200 E200i E500"); -MODULE_VERSION("3.6.14"); + " SA6i P600 P800 P400 P400i E200 E200i E500 P700m" + " Smart Array G2 Series SAS/SATA Controllers"); +MODULE_VERSION("3.6.20"); MODULE_LICENSE("GPL"); #include "cciss_cmd.h" @@ -90,6 +91,11 @@ static const struct pci_device_id cciss_pci_device_id[] = { {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249}, {PCI_VENDOR_ID_HP, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_RAID << 8, 0xffff << 8, 0}, {0,} @@ -123,6 +129,11 @@ static struct board_type products[] = { {0x3215103C, "Smart Array E200i", &SA5_access, 120}, {0x3237103C, "Smart Array E500", &SA5_access, 512}, {0x323D103C, "Smart Array P700m", &SA5_access, 512}, + {0x3241103C, "Smart Array P212", &SA5_access, 384}, + {0x3243103C, "Smart Array P410", &SA5_access, 384}, + {0x3245103C, "Smart Array P410i", &SA5_access, 384}, + {0x3247103C, "Smart Array P411", &SA5_access, 384}, + {0x3249103C, "Smart Array P812", &SA5_access, 384}, {0xFFFF103C, "Unknown Smart Array", &SA5_access, 120}, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 9b940e644179..eafc9d6d2b35 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -716,6 +716,7 @@ #define PCI_DEVICE_ID_HP_CISSA 0x3220 #define PCI_DEVICE_ID_HP_CISSC 0x3230 #define PCI_DEVICE_ID_HP_CISSD 0x3238 +#define PCI_DEVICE_ID_HP_CISSE 0x323a #define PCI_DEVICE_ID_HP_ZX2_IOC 0x4031 #define PCI_VENDOR_ID_PCTECH 0x1042 -- cgit From 2165009bdf63f79716a36ad545df14c3cdf958b7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Jun 2008 15:21:47 -0700 Subject: pagemap: pass mm into pagewalkers We need this at least for huge page detection for now, because powerpc needs the vm_area_struct to be able to determine whether a virtual address is referring to a huge page (its pmd_huge() doesn't work). It might also come in handy for some of the other users. Signed-off-by: Dave Hansen Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 44 +++++++++++++++++++++++++------------------- include/linux/mm.h | 17 +++++++++-------- mm/pagewalk.c | 42 ++++++++++++++++++++++-------------------- 3 files changed, 56 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 17403629e330..f0df3109343d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -315,9 +315,9 @@ struct mem_size_stats { }; static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct mem_size_stats *mss = private; + struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; @@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; - static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; int ret; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &smaps_walk, &mss); + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ret = show_map(m, v); if (ret) @@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = { }; static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) + unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = private; + struct vm_area_struct *vma = walk->private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + static struct mm_walk clear_refs_walk; + memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); + clear_refs_walk.pmd_entry = clear_refs_pte_range; + clear_refs_walk.mm = mm; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; if (!is_vm_hugetlb_page(vma)) - walk_page_range(mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -528,9 +534,9 @@ static int add_to_pagemap(unsigned long addr, u64 pfn, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; for (addr = start; addr < end; addr += PAGE_SIZE) { @@ -548,9 +554,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) } static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; @@ -675,8 +681,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, * user buffer is tracked in "pm", and the walk * will stop when we hit the end of the buffer. */ - ret = walk_page_range(mm, start_vaddr, end_vaddr, - &pagemap_walk, &pm); + ret = walk_page_range(start_vaddr, end_vaddr, + &pagemap_walk); if (ret == PM_END_OF_BUFFER) ret = 0; /* don't need mmap_sem for these, but this looks cleaner */ diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..586a943cab01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -760,16 +760,17 @@ unsigned long unmap_vmas(struct mmu_gather **tlb, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *); - int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *); - int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *); - int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *); - int (*pte_hole)(unsigned long, unsigned long, void *); + int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); + struct mm_struct *mm; + void *private; }; -int walk_page_range(const struct mm_struct *, unsigned long addr, - unsigned long end, const struct mm_walk *walk, - void *private); +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0afd2387e507..d5878bed7841 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,14 +3,14 @@ #include static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pte_t *pte; int err = 0; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, private); + err = walk->pmd_entry(pmd, addr, next, walk); if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk, private); + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, private); + err = walk->pud_entry(pud, addr, next, walk); if (!err && (walk->pmd_entry || walk->pte_entry)) - err = walk_pmd_range(pud, addr, next, walk, private); + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree - * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * - * Each callback receives an entry pointer, the start and end of the - * associated range, and a caller-supplied private data pointer. + * Each callback receives an entry pointer and the start and end of the + * associated range, and a copy of the original mm_walk for access to + * the ->private or ->mm fields. * * No locks are taken, but the bottom level iterator will map PTE * directories from highmem if necessary. @@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. */ -int walk_page_range(const struct mm_struct *mm, - unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { pgd_t *pgd; unsigned long next; @@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm, if (addr >= end) return err; - pgd = pgd_offset(mm, addr); + if (!walk->mm) + return -EINVAL; + + pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, private); + err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) - err = walk_pud_range(pgd, addr, next, walk, private); + err = walk_pud_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); -- cgit