From 100a8fdbf525bb11796692a713c267be6523a890 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:48 +0100 Subject: thermal: trace: Trace temperature changes Create a new event to trace the temperature of a thermal zone. Using this event trace the temperature changes of the thermal zone every-time it is updated. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- include/trace/events/thermal.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 include/trace/events/thermal.h (limited to 'include') diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h new file mode 100644 index 000000000000..8c5ca96eccd6 --- /dev/null +++ b/include/trace/events/thermal.h @@ -0,0 +1,38 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM thermal + +#if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_THERMAL_H + +#include +#include + +TRACE_EVENT(thermal_temperature, + + TP_PROTO(struct thermal_zone_device *tz), + + TP_ARGS(tz), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, temp_prev) + __field(int, temp) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->temp_prev = tz->last_temperature; + __entry->temp = tz->temperature; + ), + + TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d", + __get_str(thermal_zone), __entry->id, __entry->temp_prev, + __entry->temp) +); + +#endif /* _TRACE_THERMAL_H */ + +/* This part must be outside protection */ +#include -- cgit From 39811569e43a81417bc0ddca3d0c7658c3dcd4b0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:49 +0100 Subject: thermal: trace: Trace when a cooling device's state is updated Introduce and use an event to trace when a cooling device's state is updated. This is useful to follow the effect of governor decisions on cooling devices. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 1 + include/trace/events/thermal.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 6b32391260a0..c74c78d28699 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1291,6 +1291,7 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev) mutex_unlock(&cdev->lock); cdev->ops->set_cur_state(cdev, target); cdev->updated = true; + trace_cdev_update(cdev, target); dev_dbg(&cdev->device, "set to state %lu\n", target); } EXPORT_SYMBOL(thermal_cdev_update); diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 8c5ca96eccd6..894a79ea0686 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -32,6 +32,25 @@ TRACE_EVENT(thermal_temperature, __entry->temp) ); +TRACE_EVENT(cdev_update, + + TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target), + + TP_ARGS(cdev, target), + + TP_STRUCT__entry( + __string(type, cdev->type) + __field(unsigned long, target) + ), + + TP_fast_assign( + __assign_str(type, cdev->type); + __entry->target = target; + ), + + TP_printk("type=%s target=%lu", __get_str(type), __entry->target) +); + #endif /* _TRACE_THERMAL_H */ /* This part must be outside protection */ -- cgit From 208cd822a19e683bc890f6708786f2420e172d76 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:50 +0100 Subject: thermal: trace: Trace when temperature is above a trip point Create a new event to trace when the temperature is above a trip point. Use the trace-point when handling non-critical and critical trip pionts. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- drivers/thermal/fair_share.c | 12 ++++++++++++ drivers/thermal/step_wise.c | 5 ++++- drivers/thermal/thermal_core.c | 2 ++ include/trace/events/thermal.h | 26 ++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c index 944ba2f340c8..6e0a3fbfae86 100644 --- a/drivers/thermal/fair_share.c +++ b/drivers/thermal/fair_share.c @@ -23,6 +23,7 @@ */ #include +#include #include "thermal_core.h" @@ -34,6 +35,7 @@ static int get_trip_level(struct thermal_zone_device *tz) { int count = 0; unsigned long trip_temp; + enum thermal_trip_type trip_type; if (tz->trips == 0 || !tz->ops->get_trip_temp) return 0; @@ -43,6 +45,16 @@ static int get_trip_level(struct thermal_zone_device *tz) if (tz->temperature < trip_temp) break; } + + /* + * count > 0 only if temperature is greater than first trip + * point, in which case, trip_point = count - 1 + */ + if (count > 0) { + tz->ops->get_trip_type(tz, count - 1, &trip_type); + trace_thermal_zone_trip(tz, count - 1, trip_type); + } + return count; } diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index f251521baaa2..3b54c2c226d8 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -23,6 +23,7 @@ */ #include +#include #include "thermal_core.h" @@ -129,8 +130,10 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) trend = get_tz_trend(tz, trip); - if (tz->temperature >= trip_temp) + if (tz->temperature >= trip_temp) { throttle = true; + trace_thermal_zone_trip(tz, trip, trip_type); + } dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n", trip, trip_type, trip_temp, trend, throttle); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index c74c78d28699..454884aa15f7 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -371,6 +371,8 @@ static void handle_critical_trips(struct thermal_zone_device *tz, if (tz->temperature < trip_temp) return; + trace_thermal_zone_trip(tz, trip, trip_type); + if (tz->ops->notify) tz->ops->notify(tz, trip, trip_type); diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 894a79ea0686..0f4f95d63c03 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -51,6 +51,32 @@ TRACE_EVENT(cdev_update, TP_printk("type=%s target=%lu", __get_str(type), __entry->target) ); +TRACE_EVENT(thermal_zone_trip, + + TP_PROTO(struct thermal_zone_device *tz, int trip, + enum thermal_trip_type trip_type), + + TP_ARGS(tz, trip, trip_type), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, trip) + __field(enum thermal_trip_type, trip_type) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->trip = trip; + __entry->trip_type = trip_type; + ), + + TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%d", + __get_str(thermal_zone), __entry->id, __entry->trip, + __entry->trip_type) +); + #endif /* _TRACE_THERMAL_H */ /* This part must be outside protection */ -- cgit From e963bb1de415ab06693357336c1bec664753e1e2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 1 Sep 2014 22:22:13 -0400 Subject: ext4: improve extents status tree trace point This commit improves the trace point of extents status tree. We rename trace_ext4_es_shrink_enter in ext4_es_count() because it is also used in ext4_es_scan() and we can not identify them from the result. Further this commit fixes a variable name in trace point in order to keep consistency with others. Cc: Andreas Dilger Cc: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 6 +++--- include/trace/events/ext4.h | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index bdd400c81533..95da65c60d83 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1019,7 +1019,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1032,14 +1032,14 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int ret, nr_shrunk; ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d4f70a7fe876..849aaba75dc8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2369,7 +2369,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, show_extent_status(__entry->found ? __entry->status : 0)) ); -TRACE_EVENT(ext4_es_shrink_enter, +DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), @@ -2391,26 +2391,38 @@ TRACE_EVENT(ext4_es_shrink_enter, __entry->nr_to_scan, __entry->cache_cnt) ); -TRACE_EVENT(ext4_es_shrink_exit, - TP_PROTO(struct super_block *sb, int shrunk_nr, int cache_cnt), +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), + + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), + + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +TRACE_EVENT(ext4_es_shrink_scan_exit, + TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), - TP_ARGS(sb, shrunk_nr, cache_cnt), + TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) - __field( int, shrunk_nr ) + __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->shrunk_nr = shrunk_nr; + __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), - TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d", + TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->shrunk_nr, __entry->cache_cnt) + __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, -- cgit From eb68d0e2fc5a4e5c06324ea5f485fccbae626d05 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 1 Sep 2014 22:26:49 -0400 Subject: ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger Cc: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +- fs/ext4/extents_status.c | 186 +++++++++++++++++++++++++++++++++++++++++--- fs/ext4/extents_status.h | 13 +++- fs/ext4/super.c | 11 +-- include/trace/events/ext4.h | 31 ++++++++ 5 files changed, 224 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c07f43f8eb93..00fd822ac6e4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -891,6 +891,7 @@ struct ext4_inode_info { struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_lru; + unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_lru_nr; /* protected by i_es_lock */ unsigned long i_touch_when; /* jiffies of last accessing */ @@ -1331,8 +1332,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; - unsigned long s_es_last_sorted; - struct percpu_counter s_extent_cache_cnt; + struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95da65c60d83..09fd57667262 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -11,6 +11,8 @@ */ #include #include +#include +#include #include "ext4.h" #include "extents_status.h" @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, */ if (!ext4_es_is_delayed(es)) { EXT4_I(inode)->i_es_lru_nr++; - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + return es; } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + /* Decrease the lru counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); EXT4_I(inode)->i_es_lru_nr--; - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -729,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es) { struct ext4_es_tree *tree; + struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; @@ -765,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, } out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -931,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; struct list_head *cur, *tmp; LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); spin_lock(&sbi->s_es_lru_lock); retry: @@ -946,7 +966,8 @@ retry: * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. */ - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + if (percpu_counter_read_positive( + &es_stats->es_stats_lru_cnt) == 0) break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); @@ -956,7 +977,7 @@ retry: * time. Normally we try hard to avoid shrinking * precached inodes, but we will as a last resort. */ - if ((sbi->s_es_last_sorted < ei->i_touch_when) || + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || (skip_precached && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED))) { nr_skipped++; @@ -990,7 +1011,7 @@ retry: if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; + es_stats->es_stats_last_sorted = jiffies; ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); /* @@ -1008,6 +1029,22 @@ retry: if (locked_ei && nr_shrunk == 0) nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + nr_skipped, retried); return nr_shrunk; } @@ -1018,7 +1055,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1031,7 +1068,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) @@ -1043,19 +1080,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, return nr_shrunk; } -void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void * +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = seq->private; + struct ext4_es_stats *es_stats = &sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lru_lock); + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lru_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); + if (es_stats->es_stats_last_sorted != 0) + seq_printf(seq, " %u ms last sorted interval\n", + jiffies_to_msecs(jiffies - + es_stats->es_stats_last_sorted)); + if (inode_cnt) + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_es_seq_shrinker_info_ops = { + .start = ext4_es_seq_shrinker_info_start, + .next = ext4_es_seq_shrinker_info_next, + .stop = ext4_es_seq_shrinker_info_stop, + .show = ext4_es_seq_shrinker_info_show, +}; + +static int +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE_DATA(inode); + } + + return ret; +} + +static int +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static const struct file_operations ext4_es_seq_shrinker_info_fops = { + .owner = THIS_MODULE, + .open = ext4_es_seq_shrinker_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ext4_es_seq_shrinker_info_release, +}; + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { + int err; + INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_last_sorted = 0; + sbi->s_es_stats.es_stats_last_sorted = 0; + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0); + if (err) + goto err1; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker); + if (err) + goto err2; + + if (sbi->s_proc) + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, + &ext4_es_seq_shrinker_info_fops, sbi); + + return 0; + +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); unregister_shrinker(&sbi->s_es_shrinker); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f1b62a419920..efd5f970b501 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -64,6 +64,17 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +struct ext4_es_stats { + unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_lru_cnt; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, (pb & ~ES_MASK)); } -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7194a51eb217..487c65b8cff0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_all_nr = 0; ei->i_es_lru_nr = 0; ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; @@ -3890,12 +3890,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sbi); - - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + if (ext4_es_register_shrinker(sbi)) goto failed_mount3; - } sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; @@ -4225,10 +4221,9 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } -failed_mount3: ext4_es_unregister_shrinker(sbi); +failed_mount3: del_timer_sync(&sbi->s_err_report); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 849aaba75dc8..ff4bd1b35246 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2450,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range, __entry->offset, __entry->len) ); +TRACE_EVENT(ext4_es_shrink, + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, + int skip_precached, int nr_skipped, int retried), + + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_shrunk ) + __field( unsigned long long, scan_time ) + __field( int, skip_precached ) + __field( int, nr_skipped ) + __field( int, retried ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nr_shrunk = nr_shrunk; + __entry->scan_time = div_u64(scan_time, 1000); + __entry->skip_precached = skip_precached; + __entry->nr_skipped = nr_skipped; + __entry->retried = retried; + ), + + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " + "nr_skipped %d retried %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, + __entry->scan_time, __entry->skip_precached, + __entry->nr_skipped, __entry->retried) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:04:42 -0400 Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers A buffer cache is allocated from movable area because it is referred for a while and released soon. But some filesystems are taking buffer cache for a long time and it can disturb page migration. New APIs are introduced to allocate buffer cache with user specific flag. *_gfp APIs are for user want to set page allocation flag for page cache allocation. And *_unmovable APIs are for the user wants to allocate page cache from non-movable area. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/buffer.c | 45 +++++++++++++++++++++++++------------------ include/linux/buffer_head.h | 47 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..9a6029e0dd71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1058,7 +1058,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1404,24 +1407,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 324329ceea1e..73b45225a7ca 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -175,12 +175,13 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); -struct buffer_head *__getblk(struct block_device *bdev, sector_t block, - unsigned size); +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); +struct buffer_head *__bread_gfp(struct block_device *, + sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -295,7 +296,13 @@ static inline void bforget(struct buffer_head *bh) static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { - return __bread(sb->s_bdev, block, sb->s_blocksize); + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); +} + +static inline struct buffer_head * +sb_bread_unmovable(struct super_block *sb, sector_t block) +{ + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void @@ -307,7 +314,7 @@ sb_breadahead(struct super_block *sb, sector_t block) static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { - return __getblk(sb->s_bdev, block, sb->s_blocksize); + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * @@ -344,6 +351,36 @@ static inline void lock_buffer(struct buffer_head *bh) __lock_buffer(bh); } +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, 0); +} + +static inline struct buffer_head *__getblk(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); +} + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * The page cache is allocated from movable area so that it can be migrated. + * It returns NULL if the block was unreadable. + */ +static inline struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + return __bread_gfp(bdev, block, size, __GFP_MOVABLE); +} + extern int __set_page_dirty_buffers(struct page *page); #else /* CONFIG_BLOCK */ -- cgit From 047133066e6c2549403fe5a2d619f47ba4212ef5 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:22 -0700 Subject: leds: Reorder include directives Reorder include directives so that they are arranged in alphabetical order. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 13 +++++++------ drivers/leds/led-core.c | 3 ++- include/linux/leds.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index aa29198fca3e..4bd4572efe6e 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -9,16 +9,17 @@ * published by the Free Software Foundation. */ -#include -#include +#include +#include +#include #include +#include +#include #include +#include +#include #include -#include #include -#include -#include -#include #include "leds.h" static struct class *leds_class; diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 71b40d3bf776..50b579ad948e 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -12,10 +12,11 @@ */ #include +#include #include #include +#include #include -#include #include "leds.h" DECLARE_RWSEM(leds_list_lock); diff --git a/include/linux/leds.h b/include/linux/leds.h index e43686472197..4be2d7623d9e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -13,8 +13,8 @@ #define __LINUX_LEDS_H_INCLUDED #include -#include #include +#include #include #include -- cgit From d8082827d8a214343b761f2c4554d2a7d1573d63 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:23 -0700 Subject: leds: make brightness type consistent across whole subsystem Documentations states that brightness units type is enum led_brightness and this is the type used by the led API functions. Adjust the type of brightness variables in the struct led_classdev accordingly. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- include/linux/leds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/leds.h b/include/linux/leds.h index 4be2d7623d9e..f2e1cbc25705 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -31,8 +31,8 @@ enum led_brightness { struct led_classdev { const char *name; - int brightness; - int max_brightness; + enum led_brightness brightness; + enum led_brightness max_brightness; int flags; /* Lower 16 bits reflect status */ -- cgit From 3ef7de5304edf60d0b8674dd7cdacc104e15a93c Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Wed, 20 Aug 2014 06:41:55 -0700 Subject: leds: Improve and export led_update_brightness led_update_brightness helper function used to be exploited only locally in the led-class.c module, where its result was being passed to the brightness_show sysfs callback. With the introduction of v4l2-flash subdevice the same functionality becomes required for reading current brightness from a LED device. This patch adds checking of return value of the brightness_get callback and moves the led_update_brightness() function to the LED subsystem public API. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 6 ------ drivers/leds/led-core.c | 16 ++++++++++++++++ include/linux/leds.h | 10 ++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 71666a40b79a..7440c58b8e6f 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -24,12 +24,6 @@ static struct class *leds_class; -static void led_update_brightness(struct led_classdev *led_cdev) -{ - if (led_cdev->brightness_get) - led_cdev->brightness = led_cdev->brightness_get(led_cdev); -} - static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 50b579ad948e..aaa8eba9099f 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -127,3 +127,19 @@ void led_set_brightness(struct led_classdev *led_cdev, __led_set_brightness(led_cdev, brightness); } EXPORT_SYMBOL(led_set_brightness); + +int led_update_brightness(struct led_classdev *led_cdev) +{ + int ret = 0; + + if (led_cdev->brightness_get) { + ret = led_cdev->brightness_get(led_cdev); + if (ret >= 0) { + led_cdev->brightness = ret; + return 0; + } + } + + return ret; +} +EXPORT_SYMBOL(led_update_brightness); diff --git a/include/linux/leds.h b/include/linux/leds.h index f2e1cbc25705..a57611d0c94e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -140,6 +140,16 @@ extern void led_blink_set_oneshot(struct led_classdev *led_cdev, */ extern void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness brightness); +/** + * led_update_brightness - update LED brightness + * @led_cdev: the LED to query + * + * Get an LED's current brightness and update led_cdev->brightness + * member with the obtained value. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_update_brightness(struct led_classdev *led_cdev); /* * LED Triggers -- cgit From 50849db32a9f529235a84bcc84a6b8e631b1d0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Sep 2014 00:58:12 -0400 Subject: jbd2: simplify calling convention around __jbd2_journal_clean_checkpoint_list __jbd2_journal_clean_checkpoint_list() returns number of buffers it freed but noone was using the value so just stop doing that. This also allows for simplifying the calling convention for journal_clean_once_cp_list(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 56 ++++++++++++++++++++++------------------------------ include/linux/jbd2.h | 2 +- 2 files changed, 25 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 3ab4c5ee12ce..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -421,16 +421,15 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns 1 if we freed the transaction, 0 otherwise. */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +static int journal_clean_one_cp_list(struct journal_head *jh) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret, freed = 0; + int ret; + int freed = 0; - *released = 0; if (!jh) return 0; @@ -441,11 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) ret = __try_to_free_cp_buf(jh); if (!ret) return freed; - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + if (ret == 2) + return 1; + freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -465,53 +462,48 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * Find all the written-back checkpoint buffers in the journal and release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) */ - -int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; - int freed = 0; - int released; transaction = journal->j_checkpoint_transactions; if (!transaction) - goto out; + return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ - if (need_resched()) { - freed += ret; - goto out; - } - if (released) { - freed += ret; + if (need_resched()) + return; + if (ret) continue; - } /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as * we can possibly see not yet submitted buffers on io_list */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - freed += ret; - if (need_resched() || !ret) - goto out; + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); + if (need_resched()) + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; } while (transaction != last_transaction); -out: - return freed; } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0dae71e9971c..704b9a599b26 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1042,7 +1042,7 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -int __jbd2_journal_clean_checkpoint_list(journal_t *journal); +void __jbd2_journal_clean_checkpoint_list(journal_t *journal); int __jbd2_journal_remove_checkpoint(struct journal_head *); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); -- cgit From 7990da71ebfa887ae6fe4464ab0d99ddeb8efacc Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 3 Sep 2014 17:49:32 +0200 Subject: PM / QoS: Add PM_QOS_MEMORY_BANDWIDTH class Also adds a class type PM_QOS_SUM that aggregates the values by summing them. It can be used by memory controllers to calculate the optimum clock frequency based on the bandwidth needs of the different memory clients. Signed-off-by: Tomeu Vizoso Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/power/pm_qos_interface.txt | 4 +++- include/linux/pm_qos.h | 5 ++++- kernel/power/qos.c | 27 ++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); -- cgit From 33940d09937276cd3c81f2874faf43e37c2db0e2 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:12 -0400 Subject: target: encapsulate smp_mb__after_atomic() The target code has a rather generous helping of smp_mb__after_atomic() throughout the code base. Most atomic operations were followed by one and none were preceded by smp_mb__before_atomic(), nor accompanied by a comment explaining the need for a barrier. Instead of trying to prove for every case whether or not it is needed, this patch introduces atomic_inc_mb() and atomic_dec_mb(), which explicitly include the memory barriers before and after the atomic operation. For now they are defined in a target header, although they could be of general use. Most of the existing atomic/mb combinations were replaced by the new helpers. In a few cases the atomic was sandwiched in spin_lock/spin_unlock and I simply removed the barrier. I suspect that in most cases the correct conversion would have been to drop the barrier. I also suspect that a few cases exist where a) the barrier was necessary and b) a second barrier before the atomic would have been necessary and got added by this patch. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/loopback/tcm_loop.c | 6 +-- drivers/target/target_core_alua.c | 33 ++++--------- drivers/target/target_core_device.c | 9 ++-- drivers/target/target_core_pr.c | 90 ++++++++++++---------------------- drivers/target/target_core_transport.c | 18 +++---- drivers/target/target_core_ua.c | 15 ++---- include/target/target_core_base.h | 14 ++++++ 7 files changed, 70 insertions(+), 115 deletions(-) (limited to 'include') diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index 340de9d92b15..a7f6dc646045 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -960,8 +960,7 @@ static int tcm_loop_port_link( struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; - atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tl_tpg->tl_tpg_port_count); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -995,8 +994,7 @@ static void tcm_loop_port_unlink( scsi_remove_device(sd); scsi_device_put(sd); - atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tl_tpg->tl_tpg_port_count); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fbc5ebb5f761..fb87780929d2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -392,8 +392,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) if (tg_pt_id != tg_pt_gp->tg_pt_gp_id) continue; - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -403,8 +402,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) found = true; spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -998,8 +996,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * every I_T nexus other than the I_T nexus on which the SET * TARGET PORT GROUPS command */ - atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&mem->tg_pt_gp_mem_ref_cnt); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1028,8 +1025,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_unlock_bh(&port->sep_alua_lock); spin_lock(&tg_pt_gp->tg_pt_gp_lock); - atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&mem->tg_pt_gp_mem_ref_cnt); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1063,7 +1059,6 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (tg_pt_gp->tg_pt_gp_transition_complete) @@ -1125,7 +1120,6 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1168,7 +1162,6 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1184,8 +1177,7 @@ int core_alua_do_port_transition( l_tg_pt_gp->tg_pt_gp_alua_nacl = l_nacl; rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } /* @@ -1198,8 +1190,7 @@ int core_alua_do_port_transition( lu_gp_mem_list) { dev = lu_gp_mem->lu_gp_mem_dev; - atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1227,8 +1218,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_port = NULL; tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1238,16 +1228,14 @@ int core_alua_do_port_transition( new_state, explicit); spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); if (rc) break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); spin_lock(&lu_gp->lu_gp_lock); - atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1260,8 +1248,7 @@ int core_alua_do_port_transition( core_alua_dump_state(new_state)); } - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index e784284cbc2e..f5057a2f4ed1 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -224,8 +224,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( if (port->sep_rtpi != rtpi) continue; - atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->pr_ref_count); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1388,8 +1387,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); - atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_inc_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1422,8 +1420,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); - atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_dec_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 281d52e3fe99..48a801045176 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -674,8 +674,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( */ spin_lock(&dev->se_port_lock); list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { - atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&port->sep_tg_pt_ref_cnt); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -709,8 +708,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (strcmp(nacl->initiatorname, nacl_tmp->initiatorname)) continue; - atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve_tmp->pr_ref_count); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -722,10 +720,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (ret < 0) { pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); goto out; } /* @@ -739,10 +735,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( nacl_tmp, deve_tmp, NULL, sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -754,8 +748,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_unlock_bh(&port->sep_alua_lock); spin_lock(&dev->se_port_lock); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); } spin_unlock(&dev->se_port_lock); @@ -1109,8 +1102,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (dev->dev_attrib.enforce_pr_isids) continue; } - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1124,8 +1116,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (strcmp(isid, pr_reg->pr_reg_isid)) continue; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1154,8 +1145,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); } static int core_scsi3_check_implicit_release( @@ -1348,8 +1338,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &tpg->tpg_group.cg_item); - atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tpg->tpg_pr_ref_count); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1368,16 +1357,14 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) struct se_portal_group *tpg = nacl->se_tpg; if (nacl->dynamic_node_acl) { - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); return; } configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &nacl->acl_group.cg_item); - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1407,8 +1394,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ if (!lun_acl) { - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); return; } nacl = lun_acl->se_lun_nacl; @@ -1417,8 +1403,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &lun_acl->se_lun_group.cg_item); - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); } static sense_reason_t @@ -1551,15 +1536,13 @@ core_scsi3_decode_spec_i_port( if (!i_str) continue; - atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tmp_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); - atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tmp_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1571,10 +1554,8 @@ core_scsi3_decode_spec_i_port( spin_lock_irq(&tmp_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl( tmp_tpg, i_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&tmp_tpg->acl_node_lock); if (!dest_node_acl) { @@ -1586,8 +1567,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1646,8 +1626,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3167,15 +3146,13 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, if (!dest_tf_ops) continue; - atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dest_se_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); - atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3271,10 +3248,8 @@ after_iport_check: spin_lock_irq(&dest_se_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl(dest_se_tpg, initiator_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&dest_se_tpg->acl_node_lock); if (!dest_node_acl) { @@ -3288,8 +3263,7 @@ after_iport_check: if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3313,8 +3287,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3879,8 +3852,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_tpg = pr_reg->pr_reg_nacl->se_tpg; add_desc_len = 0; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3893,8 +3865,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) pr_warn("SPC-3 PRIN READ_FULL_STATUS ran" " out of buffer: %d\n", cmd->data_length); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); break; } /* @@ -3955,8 +3926,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_nacl, pr_reg, &format_code, &buf[off+4]); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 0b43761ed85f..115632ee3ec8 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -752,8 +752,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); - atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_qf_count); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1721,8 +1720,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) cmd->t_task_cdb[0], cmd->se_ordered_id); return false; case MSG_ORDERED_TAG: - atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_ordered_sync); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1739,8 +1737,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) /* * For SIMPLE and UNTAGGED Task Attribute commands */ - atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->simple_cmds); break; } @@ -1844,8 +1841,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) return; if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { - atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->simple_cmds); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1856,8 +1852,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { - atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_ordered_sync); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1915,8 +1910,7 @@ static void transport_handle_queue_full( { spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); - atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_qf_count); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 101858e245b3..1738b1646988 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -161,8 +161,7 @@ int core_scsi3_ua_allocate( spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -174,8 +173,7 @@ int core_scsi3_ua_allocate( nacl->se_tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, asc, ascq); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } @@ -189,8 +187,7 @@ void core_scsi3_ua_release_all( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); } @@ -250,8 +247,7 @@ void core_scsi3_ua_for_check_condition( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -309,8 +305,7 @@ int core_scsi3_ua_clear_for_request_sense( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 9ec9864ecf38..b106240d8385 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -903,4 +903,18 @@ struct se_wwn { struct config_group fabric_stat_group; }; +static inline void atomic_inc_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_inc(v); + smp_mb__after_atomic(); +} + +static inline void atomic_dec_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_dec(v); + smp_mb__after_atomic(); +} + #endif /* TARGET_CORE_BASE_H */ -- cgit From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:18 -0400 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data ->page_mkwrite() is used by filesystems to allocate blocks under a page which is becoming writeably mmapped in some process' address space. This allows a filesystem to return a page fault if there is not enough space available, user exceeds quota or similar problem happens, rather than silently discarding data later when writepage is called. However VFS fails to call ->page_mkwrite() in all the cases where filesystems need it when blocksize < pagesize. For example when blocksize = 1024, pagesize = 4096 the following is problematic: ftruncate(fd, 0); pwrite(fd, buf, 1024, 0); map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); map[0] = 'a'; ----> page_mkwrite() for index 0 is called ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ mremap(map, 1024, 10000, 0); map[4095] = 'a'; ----> no page_mkwrite() called At the moment ->page_mkwrite() is called, filesystem can allocate only one block for the page because i_size == 1024. Otherwise it would create blocks beyond i_size which is generally undesirable. But later at ->writepage() time, we also need to store data at offset 4095 but we don't have block allocated for it. This patch introduces a helper function filesystems can use to have ->page_mkwrite() called at all the necessary moments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/buffer.c | 3 +++ include/linux/mm.h | 1 + mm/truncate.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 9a6029e0dd71..6dc1475dcb2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock diff --git a/include/linux/mm.h b/include/linux/mm.h index 8981cc882ed2..5005464fe012 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include +#include #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,11 +720,67 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); +/** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit From f14bb039a4e8206439d3e9abd92bc76bd142f243 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:03 -0700 Subject: uio: Export definition of struct uio_device In order to prevent a O(n) search of the filesystem to link up its uio node with its target configuration, TCMU needs to know the minor number that UIO assigned. Expose the definition of this struct so TCMU can access this field. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/uio/uio.c | 12 ------------ include/linux/uio_driver.h | 12 +++++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a673e5b6a2e0..60fa6278fbce 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -28,18 +28,6 @@ #define UIO_MAX_DEVICES (1U << MINORBITS) -struct uio_device { - struct module *owner; - struct device *dev; - int minor; - atomic_t event; - struct fasync_struct *async_queue; - wait_queue_head_t wait; - struct uio_info *info; - struct kobject *map_dir; - struct kobject *portio_dir; -}; - static int uio_major; static struct cdev *uio_cdev; static DEFINE_IDR(uio_idr); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 1ad4724458de..baa81718d985 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -63,7 +63,17 @@ struct uio_port { #define MAX_UIO_PORT_REGIONS 5 -struct uio_device; +struct uio_device { + struct module *owner; + struct device *dev; + int minor; + atomic_t event; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + struct uio_info *info; + struct kobject *map_dir; + struct kobject *portio_dir; +}; /** * struct uio_info - UIO device capabilities -- cgit From 5a17dae422d7de4b776a9753cd4673a343a25b4b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 5 Aug 2014 11:52:11 +0100 Subject: efi: Add efi= parameter parsing to the EFI boot stub We need a way to customize the behaviour of the EFI boot stub, in particular, we need a way to disable the "chunking" workaround, used when reading files from the EFI System Partition. One of my machines doesn't cope well when reading files in 1MB chunks to a buffer above the 4GB mark - it appears that the "chunking" bug workaround triggers another firmware bug. This was only discovered with commit 4bf7111f5016 ("x86/efi: Support initrd loaded above 4G"), and that commit is perfectly valid. The symptom I observed was a corrupt initrd rather than any kind of crash. efi= is now used to specify EFI parameters in two very different execution environments, the EFI boot stub and during kernel boot. There is also a slight performance optimization by enabling efi=nochunk, but that's offset by the fact that you're more likely to run into firmware issues, at least on x86. This is the rationale behind leaving the workaround enabled by default. Also provide some documentation for EFI_READ_CHUNK_SIZE and why we're using the current value of 1MB. Tested-by: Ard Biesheuvel Cc: Roy Franz Cc: Maarten Lankhorst Cc: Leif Lindholm Cc: Borislav Petkov Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 5 ++- arch/x86/boot/compressed/eboot.c | 4 ++ arch/x86/platform/efi/efi.c | 19 +++++++- drivers/firmware/efi/libstub/arm-stub.c | 4 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 62 +++++++++++++++++++++++++- include/linux/efi.h | 2 + 6 files changed, 91 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ae8608ca9f5..67d79597d9d6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -992,10 +992,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map" } + Format: { "old_map", "nochunk" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. + nochunk: disable reading files in "chunks" in the EFI + boot stub, as chunking can cause problems with some + firmware implementations. efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f277184e2ac1..f4bdab1dbf66 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1100,6 +1100,10 @@ struct boot_params *make_boot_params(struct efi_config *c) else initrd_addr_max = hdr->initrd_addr_max; + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + goto fail2; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", initrd_addr_max, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 850da94fef30..a1f745b0bf1d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -943,8 +943,23 @@ static int __init parse_efi_cmdline(char *str) if (*str == '=') str++; - if (!strncmp(str, "old_map", 7)) - set_bit(EFI_OLD_MEMMAP, &efi.flags); + while (*str) { + if (!strncmp(str, "old_map", 7)) { + set_bit(EFI_OLD_MEMMAP, &efi.flags); + str += strlen("old_map"); + } + + /* + * Skip any options we don't understand. Presumably + * they apply to the EFI boot stub. + */ + while (*str && *str != ',') + str++; + + /* If we hit a delimiter, skip it */ + if (*str == ',') + str++; + } return 0; } diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 480339b6b110..75ee05964cbc 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -226,6 +226,10 @@ unsigned long __init efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_image; } + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 32d5cca30f49..a920fec8fe88 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -15,8 +15,23 @@ #include "efistub.h" +/* + * Some firmware implementations have problems reading files in one go. + * A read chunk size of 1MB seems to work for most platforms. + * + * Unfortunately, reading files in chunks triggers *other* bugs on some + * platforms, so we provide a way to disable this workaround, which can + * be done by passing "efi=nochunk" on the EFI boot stub command line. + * + * If you experience issues with initrd images being corrupt it's worth + * trying efi=nochunk, but chunking is enabled by default because there + * are far more machines that require the workaround than those that + * break with it enabled. + */ #define EFI_READ_CHUNK_SIZE (1024 * 1024) +static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; + struct file_info { efi_file_handle_t *handle; u64 size; @@ -281,6 +296,49 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, efi_call_early(free_pages, addr, nr_pages); } +/* + * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi= + * option, e.g. efi=nochunk. + * + * It should be noted that efi= is parsed in two very different + * environments, first in the early boot environment of the EFI boot + * stub, and subsequently during the kernel boot. + */ +efi_status_t efi_parse_options(char *cmdline) +{ + char *str; + + /* + * If no EFI parameters were specified on the cmdline we've got + * nothing to do. + */ + str = strstr(cmdline, "efi="); + if (!str) + return EFI_SUCCESS; + + /* Skip ahead to first argument */ + str += strlen("efi="); + + /* + * Remember, because efi= is also used by the kernel we need to + * skip over arguments we don't understand. + */ + while (*str) { + if (!strncmp(str, "nochunk", 7)) { + str += strlen("nochunk"); + __chunk_size = -1UL; + } + + /* Group words together, delimited by "," */ + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return EFI_SUCCESS; +} /* * Check the cmdline for a LILO-style file= arguments. @@ -423,8 +481,8 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, size = files[j].size; while (size) { unsigned long chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; + if (size > __chunk_size) + chunksize = __chunk_size; else chunksize = size; diff --git a/include/linux/efi.h b/include/linux/efi.h index 45cb4ffdea62..518779fb5e90 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1227,4 +1227,6 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); +efi_status_t efi_parse_options(char *cmdline); + #endif /* _LINUX_EFI_H */ -- cgit From b2e0a54a1296a91b800f316df7bef7d1905e4fd0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:26 +0800 Subject: efi: Move noefi early param code out of x86 arch code noefi param can be used for arches other than X86 later, thus move it out of x86 platform code. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 2 +- arch/x86/platform/efi/efi.c | 10 +--------- drivers/firmware/efi/efi.c | 13 +++++++++++++ include/linux/efi.h | 1 + 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 67d79597d9d6..08df275eee2b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2169,7 +2169,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nodsp [SH] Disable hardware DSP at boot time. - noefi [X86] Disable EFI runtime services support. + noefi Disable EFI runtime services support. noexec [IA-64] diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a1f745b0bf1d..c90d3cd2728c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,14 +70,6 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -static bool disable_runtime __initdata = false; -static int __init setup_noefi(char *arg) -{ - disable_runtime = true; - return 0; -} -early_param("noefi", setup_noefi); - int add_efi_memmap; EXPORT_SYMBOL(add_efi_memmap); @@ -492,7 +484,7 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (disable_runtime || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) return; } if (efi_memmap_init()) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 64ecbb501c50..c8f01a73edb5 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -41,6 +41,19 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static bool disable_runtime; +static int __init setup_noefi(char *arg) +{ + disable_runtime = true; + return 0; +} +early_param("noefi", setup_noefi); + +bool efi_runtime_disabled(void) +{ + return disable_runtime; +} + static struct kobject *efi_kobj; static struct kobject *efivars_kobj; diff --git a/include/linux/efi.h b/include/linux/efi.h index 518779fb5e90..4812ed0b0374 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1229,4 +1229,5 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ -- cgit From 6ccc72b87b83ece31c2a75bbe07f440b0378f7a9 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:27 +0800 Subject: lib: Add a generic cmdline parse function parse_option_str There should be a generic function to parse params like a=b,c Adding parse_option_str in lib/cmdline.c which will return true if there's specified option set in the params. Also updated efi=old_map parsing code to use the new function Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 22 ++-------------------- include/linux/kernel.h | 1 + lib/cmdline.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c90d3cd2728c..c73a7df5d37f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -932,26 +932,8 @@ u64 efi_mem_attributes(unsigned long phys_addr) static int __init parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - while (*str) { - if (!strncmp(str, "old_map", 7)) { - set_bit(EFI_OLD_MEMMAP, &efi.flags); - str += strlen("old_map"); - } - - /* - * Skip any options we don't understand. Presumably - * they apply to the EFI boot stub. - */ - while (*str && *str != ',') - str++; - - /* If we hit a delimiter, skip it */ - if (*str == ',') - str++; - } + if (parse_option_str(str, "old_map")) + set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 95624bed87ef..f66427ef0628 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -407,6 +407,7 @@ int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 76a712e6e20e..8f13cf73c2ec 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -160,3 +160,32 @@ unsigned long long memparse(const char *ptr, char **retptr) return ret; } EXPORT_SYMBOL(memparse); + +/** + * parse_option_str - Parse a string and check an option is set or not + * @str: String to be parsed + * @option: option name + * + * This function parses a string containing a comma-separated list of + * strings like a=b,c. + * + * Return true if there's such option in the string, or return false. + */ +bool parse_option_str(const char *str, const char *option) +{ + while (*str) { + if (!strncmp(str, option, strlen(option))) { + str += strlen(option); + if (!*str || *str == ',') + return true; + } + + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return false; +} -- cgit From 9c97e0bdd4b4ae44577a1b1ec949e782084e9a78 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:19 +0200 Subject: efi: Add macro for EFI_MEMORY_UCE memory attribute Add the following macro from the UEFI spec, for completeness: EFI_MEMORY_UCE Memory cacheability attribute: The memory region supports being configured as not cacheable, exported, and supports the "fetch and add" semaphore mechanism. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 4812ed0b0374..7464032ae00a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -92,6 +92,7 @@ typedef struct { #define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ #define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ #define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ +#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -- cgit From 98d2a6ca14520904a47c46258d3bad02ffcd3f96 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:20 +0200 Subject: efi: Introduce efi_md_typeattr_format() At the moment, there are three architectures debug-printing the EFI memory map at initialization: x86, ia64, and arm64. They all use different format strings, plus the EFI memory type and the EFI memory attributes are similarly hard to decode for a human reader. Introduce a helper __init function that formats the memory type and the memory attributes in a unified way, to a user-provided character buffer. The array "memory_type_name" is copied from the arm64 code, temporarily duplicating it. The (otherwise optional) braces around each string literal in the initializer list are dropped in order to match the kernel coding style more closely. The element size is tightened from 32 to 20 bytes (maximum actual string length + 1) so that we can derive the field width from the element size. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel [ Dropped useless 'register' keyword, which compiler will ignore ] Signed-off-by: Matt Fleming --- drivers/firmware/efi/efi.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 ++++++ 2 files changed, 64 insertions(+) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cebfa36a27ae..8590099ac148 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -445,3 +445,60 @@ int __init efi_get_fdt_params(struct efi_fdt_params *params, int verbose) return ret; } #endif /* CONFIG_EFI_PARAMS_FROM_FDT */ + +static __initdata char memory_type_name[][20] = { + "Reserved", + "Loader Code", + "Loader Data", + "Boot Code", + "Boot Data", + "Runtime Code", + "Runtime Data", + "Conventional Memory", + "Unusable Memory", + "ACPI Reclaim Memory", + "ACPI Memory NVS", + "Memory Mapped I/O", + "MMIO Port Space", + "PAL Code" +}; + +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md) +{ + char *pos; + int type_len; + u64 attr; + + pos = buf; + if (md->type >= ARRAY_SIZE(memory_type_name)) + type_len = snprintf(pos, size, "[type=%u", md->type); + else + type_len = snprintf(pos, size, "[%-*s", + (int)(sizeof(memory_type_name[0]) - 1), + memory_type_name[md->type]); + if (type_len >= size) + return buf; + + pos += type_len; + size -= type_len; + + attr = md->attribute; + if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | + EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP | + EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME)) + snprintf(pos, size, "|attr=0x%016llx]", + (unsigned long long)attr); + else + snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_XP ? "XP" : "", + attr & EFI_MEMORY_RP ? "RP" : "", + attr & EFI_MEMORY_WP ? "WP" : "", + attr & EFI_MEMORY_UCE ? "UCE" : "", + attr & EFI_MEMORY_WB ? "WB" : "", + attr & EFI_MEMORY_WT ? "WT" : "", + attr & EFI_MEMORY_WC ? "WC" : "", + attr & EFI_MEMORY_UC ? "UC" : ""); + return buf; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 7464032ae00a..78b29b133e14 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -887,6 +887,13 @@ extern bool efi_poweroff_required(void); (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/* + * Format an EFI memory descriptor's type and attributes to a user-provided + * character buffer, as per snprintf(), and return the buffer. + */ +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md); + /** * efi_range_is_wc - check the WC bit on an address range * @start: starting kvirt address -- cgit From 6d80dba1c9fe4316ef626980102b92fa30c7845a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 30 Sep 2014 21:58:52 +0100 Subject: efi: Provide a non-blocking SetVariable() operation There are some circumstances that call for trying to write an EFI variable in a non-blocking way. One such scenario is when writing pstore data in efi_pstore_write() via the pstore_dump() kdump callback. Now that we have an EFI runtime spinlock we need a way of aborting if there is contention instead of spinning, since when writing pstore data from the kdump callback, the runtime lock may already be held by the CPU that's running the callback if we crashed in the middle of an EFI variable operation. The situation is sufficiently special that a new EFI variable operation is warranted. Introduce ->set_variable_nonblocking() for this use case. It is an optional EFI backend operation, and need only be implemented by those backends that usually acquire locks to serialize access to EFI variables, as is the case for virt_efi_set_variable() where we now grab the EFI runtime spinlock. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Matthew Garrett Signed-off-by: Matt Fleming --- drivers/firmware/efi/runtime-wrappers.c | 19 +++++++++++++ drivers/firmware/efi/vars.c | 47 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 6 +++++ 3 files changed, 72 insertions(+) (limited to 'include') diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 9694cba665c4..4349206198b2 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -200,6 +200,24 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, return status; } +static efi_status_t +virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; +} + + static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *storage_space, u64 *remaining_space, @@ -287,6 +305,7 @@ void efi_native_runtime_setup(void) efi.get_variable = virt_efi_get_variable; efi.get_next_variable = virt_efi_get_next_variable; efi.set_variable = virt_efi_set_variable; + efi.set_variable_nonblocking = virt_efi_set_variable_nonblocking; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.query_variable_info = virt_efi_query_variable_info; diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 1fa724f31b0e..fa3c66bdc1e5 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -595,6 +595,39 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, } EXPORT_SYMBOL_GPL(efivar_entry_set); +/* + * efivar_entry_set_nonblocking - call set_variable_nonblocking() + * + * This function is guaranteed to not block and is suitable for calling + * from crash/panic handlers. + * + * Crucially, this function will not block if it cannot acquire + * __efivars->lock. Instead, it returns -EBUSY. + */ +static int +efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, + u32 attributes, unsigned long size, void *data) +{ + const struct efivar_operations *ops = __efivars->ops; + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&__efivars->lock, flags)) + return -EBUSY; + + status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + spin_unlock_irqrestore(&__efivars->lock, flags); + return -ENOSPC; + } + + status = ops->set_variable_nonblocking(name, &vendor, attributes, + size, data); + + spin_unlock_irqrestore(&__efivars->lock, flags); + return efi_status_to_err(status); +} + /** * efivar_entry_set_safe - call set_variable() if enough space in firmware * @name: buffer containing the variable name @@ -622,6 +655,20 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, if (!ops->query_variable_store) return -ENOSYS; + /* + * If the EFI variable backend provides a non-blocking + * ->set_variable() operation and we're in a context where we + * cannot block, then we need to use it to avoid live-locks, + * since the implication is that the regular ->set_variable() + * will block. + * + * If no ->set_variable_nonblocking() is provided then + * ->set_variable() is assumed to be non-blocking. + */ + if (!block && ops->set_variable_nonblocking) + return efivar_entry_set_nonblocking(name, vendor, attributes, + size, data); + if (!block) { if (!spin_trylock_irqsave(&__efivars->lock, flags)) return -EBUSY; diff --git a/include/linux/efi.h b/include/linux/efi.h index 78b29b133e14..0949f9c7e872 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,10 @@ typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); +typedef efi_status_t +efi_set_variable_nonblocking_t(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, void *data); + typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); typedef void efi_reset_system_t (int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data); @@ -822,6 +826,7 @@ extern struct efi { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_info_t *query_variable_info; efi_update_capsule_t *update_capsule; efi_query_capsule_caps_t *query_capsule_caps; @@ -1042,6 +1047,7 @@ struct efivar_operations { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_store_t *query_variable_store; }; -- cgit From 7c9e7a6fe11c8dc5b3b9d0e889dde73347247584 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:05 -0700 Subject: target: Add a user-passthrough backstore Add a LIO storage engine that presents commands to userspace for execution. This would allow more complex backstores to be implemented out-of-kernel, and also make experimentation a-la FUSE (but at the SCSI level -- "SUSE"?) possible. It uses a mmap()able UIO device per LUN to share a command ring and data area. The commands are raw SCSI CDBs and iovs for in/out data. The command ring is also reused for returning scsi command status and optional sense data. This implementation is based on Shaohua Li's earlier version but heavily modified. Differences include: * Shared memory allocated by kernel, not locked-down user pages * Single ring for command request and response * Offsets instead of embedded pointers * Generic SCSI CDB passthrough instead of per-cmd specialization in ring format. * Uses UIO device instead of anon_file passed in mailbox. * Optional in-kernel handling of some commands. The main reason for these differences is to permit greater resiliency if the user process dies or hangs. Things not yet implemented (on purpose): * Zero copy. The data area is flexible enough to allow page flipping or backend-allocated pages to be used by fabrics, but it's not clear these are performance wins. Can come later. * Out-of-order command completion by userspace. Possible to add by just allowing userspace to change cmd_id in rsp cmd entries, but currently not supported. * No locks between kernel cmd submission and completion routines. Sounds like it's possible, but this can come later. * Sparse allocation of mmaped area. Current code vmallocs the whole thing. If the mapped area was larger and not fully mapped then the driver would have more freedom to change cmd and data area sizes based on demand. Current code open issues: * The use of idrs may be overkill -- we maybe can replace them with a simple counter to generate cmd_ids, and a hash table to get a cmd_id's associated pointer. * Use of a free-running counter for cmd ring instead of explicit modulo math. This would require power-of-2 cmd ring size. (Add kconfig depends NET - Randy) Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/Kconfig | 7 + drivers/target/Makefile | 1 + drivers/target/target_core_transport.c | 4 + drivers/target/target_core_user.c | 1163 ++++++++++++++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/target_core_user.h | 142 ++++ 6 files changed, 1318 insertions(+) create mode 100644 drivers/target/target_core_user.c create mode 100644 include/uapi/linux/target_core_user.h (limited to 'include') diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index dc2d84ac5a0e..81d44c477a5b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,6 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device +config TCM_USER + tristate "TCM/USER Subsystem Plugin for Linux" + depends on UIO && NET + help + Say Y here to enable the TCM/USER subsystem plugin for a userspace + process to handle requests + source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" source "drivers/target/iscsi/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 85b012d2f89b..bbb4a7d638ef 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o +obj-$(CONFIG_TCM_USER) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 9700ea125268..9ea0d5f03f7a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -232,6 +232,10 @@ void transport_subsystem_check_init(void) if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); + ret = request_module("target_core_user"); + if (ret != 0) + pr_err("Unable to load target_core_user\n"); + sub_api_initialized = 1; } diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c new file mode 100644 index 000000000000..6608ecf94570 --- /dev/null +++ b/drivers/target/target_core_user.c @@ -0,0 +1,1163 @@ +/* + * Copyright (C) 2013 Shaohua Li + * Copyright (C) 2014 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Define a shared-memory interface for LIO to pass SCSI commands and + * data to userspace for processing. This is to allow backends that + * are too complex for in-kernel support to be possible. + * + * It uses the UIO framework to do a lot of the device-creation and + * introspection work for us. + * + * See the .h file for how the ring is laid out. Note that while the + * command ring is defined, the particulars of the data area are + * not. Offset values in the command entry point to other locations + * internal to the mmap()ed area. There is separate space outside the + * command ring for data buffers. This leaves maximum flexibility for + * moving buffer allocations, or even page flipping or other + * allocation techniques, without altering the command ring layout. + * + * SECURITY: + * The user process must be assumed to be malicious. There's no way to + * prevent it breaking the command ring protocol if it wants, but in + * order to prevent other issues we must only ever read *data* from + * the shared memory area, not offsets or sizes. This applies to + * command ring entries as well as the mailbox. Extra code needed for + * this may have a 'UAM' comment. + */ + + +#define TCMU_TIME_OUT (30 * MSEC_PER_SEC) + +#define CMDR_SIZE (16 * 4096) +#define DATA_SIZE (257 * 4096) + +#define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) + +static struct device *tcmu_root_device; + +struct tcmu_hba { + u32 host_id; +}; + +/* User wants all cmds or just some */ +enum passthru_level { + TCMU_PASS_ALL = 0, + TCMU_PASS_IO, + TCMU_PASS_INVALID, +}; + +#define TCMU_CONFIG_LEN 256 + +struct tcmu_dev { + struct se_device se_dev; + + char *name; + struct se_hba *hba; + +#define TCMU_DEV_BIT_OPEN 0 +#define TCMU_DEV_BIT_BROKEN 1 + unsigned long flags; + enum passthru_level pass_level; + + struct uio_info uio_info; + + struct tcmu_mailbox *mb_addr; + size_t dev_size; + u32 cmdr_size; + u32 cmdr_last_cleaned; + /* Offset of data ring from start of mb */ + size_t data_off; + size_t data_size; + /* Ring head + tail values. */ + /* Must add data_off and mb_addr to get the address */ + size_t data_head; + size_t data_tail; + + wait_queue_head_t wait_cmdr; + /* TODO should this be a mutex? */ + spinlock_t cmdr_lock; + + struct idr commands; + spinlock_t commands_lock; + + struct timer_list timeout; + + char dev_config[TCMU_CONFIG_LEN]; +}; + +#define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev) + +#define CMDR_OFF sizeof(struct tcmu_mailbox) + +struct tcmu_cmd { + struct se_cmd *se_cmd; + struct tcmu_dev *tcmu_dev; + + uint16_t cmd_id; + + /* Can't use se_cmd->data_length when cleaning up expired cmds, because if + cmd has been completed then accessing se_cmd is off limits */ + size_t data_length; + + unsigned long deadline; + +#define TCMU_CMD_BIT_EXPIRED 0 + unsigned long flags; +}; + +static struct kmem_cache *tcmu_cmd_cache; + +/* multicast group */ +enum tcmu_multicast_groups { + TCMU_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group tcmu_mcgrps[] = { + [TCMU_MCGRP_CONFIG] = { .name = "config", }, +}; + +/* Our generic netlink family */ +static struct genl_family tcmu_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "TCM-USER", + .version = 1, + .maxattr = TCMU_ATTR_MAX, + .mcgrps = tcmu_mcgrps, + .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), +}; + +static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int cmd_id; + + tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); + if (!tcmu_cmd) + return NULL; + + tcmu_cmd->se_cmd = se_cmd; + tcmu_cmd->tcmu_dev = udev; + tcmu_cmd->data_length = se_cmd->data_length; + + tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&udev->commands_lock); + cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, + USHRT_MAX, GFP_NOWAIT); + spin_unlock_irq(&udev->commands_lock); + idr_preload_end(); + + if (cmd_id < 0) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + tcmu_cmd->cmd_id = cmd_id; + + return tcmu_cmd; +} + +static inline void tcmu_flush_dcache_range(void *vaddr, size_t size) +{ + unsigned long offset = (unsigned long) vaddr & ~PAGE_MASK; + + size = round_up(size+offset, PAGE_SIZE); + vaddr -= offset; + + while (size) { + flush_dcache_page(virt_to_page(vaddr)); + size -= PAGE_SIZE; + } +} + +/* + * Some ring helper functions. We don't assume size is a power of 2 so + * we can't use circ_buf.h. + */ +static inline size_t spc_used(size_t head, size_t tail, size_t size) +{ + int diff = head - tail; + + if (diff >= 0) + return diff; + else + return size + diff; +} + +static inline size_t spc_free(size_t head, size_t tail, size_t size) +{ + /* Keep 1 byte unused or we can't tell full from empty */ + return (size - spc_used(head, tail, size) - 1); +} + +static inline size_t head_to_end(size_t head, size_t size) +{ + return size - head; +} + +#define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) + +/* + * We can't queue a command until we have space available on the cmd ring *and* space + * space avail on the data ring. + * + * Called with ring lock held. + */ +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_needed, size_t data_needed) +{ + struct tcmu_mailbox *mb = udev->mb_addr; + size_t space; + u32 cmd_head; + + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); + if (space < cmd_needed) { + pr_debug("no cmd space: %u %u %u\n", cmd_head, + udev->cmdr_last_cleaned, udev->cmdr_size); + return false; + } + + space = spc_free(udev->data_head, udev->data_tail, udev->data_size); + if (space < data_needed) { + pr_debug("no data space: %zu %zu %zu\n", udev->data_head, + udev->data_tail, udev->data_size); + return false; + } + + return true; +} + +static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t base_command_size, command_size; + size_t cmdr_space_needed; + struct tcmu_mailbox *mb; + size_t pad_size; + struct tcmu_cmd_entry *entry; + int i; + struct scatterlist *sg; + struct iovec *iov; + int iov_cnt = 0; + uint32_t cmd_head; + uint64_t cdb_off; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + return -EINVAL; + + /* + * Must be a certain minimum size for response sense info, but + * also may be larger if the iov array is large. + * + * iovs = sgl_nents+1, for end-of-ring case, plus another 1 + * b/c size == offsetof one-past-element. + */ + base_command_size = max(offsetof(struct tcmu_cmd_entry, + req.iov[se_cmd->t_data_nents + 2]), + sizeof(struct tcmu_cmd_entry)); + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + spin_lock_irq(&udev->cmdr_lock); + + mb = udev->mb_addr; + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + if ((command_size > (udev->cmdr_size / 2)) + || tcmu_cmd->data_length > (udev->data_size - 1)) + pr_warn("TCMU: Request of size %zu/%zu may be too big for %u/%zu " + "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, + udev->cmdr_size, udev->data_size); + + /* + * Cmd end-of-ring space is too small so we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= command_size) + pad_size = 0; + else + pad_size = head_to_end(cmd_head, udev->cmdr_size); + cmdr_space_needed = command_size + pad_size; + + while (!is_ring_space_avail(udev, cmdr_space_needed, tcmu_cmd->data_length)) { + int ret; + DEFINE_WAIT(__wait); + + prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); + + pr_debug("sleeping for ring space\n"); + spin_unlock_irq(&udev->cmdr_lock); + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); + return -ETIMEDOUT; + } + + spin_lock_irq(&udev->cmdr_lock); + + /* We dropped cmdr_lock, cmd_head is stale */ + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + } + + if (pad_size) { + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr, pad_size); + + UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + WARN_ON(cmd_head != 0); + } + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr, command_size); + entry->cmd_id = tcmu_cmd->cmd_id; + + /* + * Fix up iovecs, and handle if allocation in data ring wrapped. + */ + iov = &entry->req.iov[0]; + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_head, udev->data_size)); + void *from = kmap_atomic(sg_page(sg)) + sg->offset; + void *to = (void *) mb + udev->data_off + udev->data_head; + + if (tcmu_cmd->se_cmd->data_direction == DMA_TO_DEVICE) { + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + /* Even iov_base is relative to mb_addr */ + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + + /* Uh oh, we wrapped the buffer. Must split sg across 2 iovs. */ + if (sg->length != copy_bytes) { + from += copy_bytes; + copy_bytes = sg->length - copy_bytes; + + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + + if (se_cmd->data_direction == DMA_TO_DEVICE) { + to = (void *) mb + udev->data_off + udev->data_head; + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + } + + kunmap_atomic(from); + } + entry->req.iov_cnt = iov_cnt; + + /* All offsets relative to mb_addr, not start of entry! */ + cdb_off = CMDR_OFF + cmd_head + base_command_size; + memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb)); + entry->req.cdb_off = cdb_off; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + spin_unlock_irq(&udev->cmdr_lock); + + /* TODO: only if FLUSH and FUA? */ + uio_event_notify(&udev->uio_info); + + mod_timer(&udev->timeout, + round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + + return 0; +} + +static int tcmu_queue_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int ret; + + tcmu_cmd = tcmu_alloc_cmd(se_cmd); + if (!tcmu_cmd) + return -ENOMEM; + + ret = tcmu_queue_cmd_ring(tcmu_cmd); + if (ret < 0) { + pr_err("TCMU: Could not queue command\n"); + spin_lock_irq(&udev->commands_lock); + idr_remove(&udev->commands, tcmu_cmd->cmd_id); + spin_unlock_irq(&udev->commands_lock); + + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + } + + return ret; +} + +static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +{ + struct se_cmd *se_cmd = cmd->se_cmd; + struct tcmu_dev *udev = cmd->tcmu_dev; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + /* cmd has been completed already from timeout, just reclaim data + ring space */ + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + return; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, + se_cmd->scsi_sense_length); + + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } + else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + struct scatterlist *sg; + int i; + + /* It'd be easier to look at entry's iovec again, but UAM */ + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes; + void *to; + void *from; + + copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_tail, udev->data_size)); + + to = kmap_atomic(sg_page(sg)) + sg->offset; + WARN_ON(sg->length + sg->offset > PAGE_SIZE); + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + + /* Uh oh, wrapped the data buffer for this sg's data */ + if (sg->length != copy_bytes) { + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + WARN_ON(udev->data_tail); + to += copy_bytes; + copy_bytes = sg->length - copy_bytes; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + } + + kunmap_atomic(to); + } + + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } else { + pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); + } + + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); +} + +static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +{ + struct tcmu_mailbox *mb; + LIST_HEAD(cpl_cmds); + unsigned long flags; + int handled = 0; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { + pr_err("ring broken, not handling completions\n"); + return 0; + } + + spin_lock_irqsave(&udev->cmdr_lock, flags); + + mb = udev->mb_addr; + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) { + + struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; + struct tcmu_cmd *cmd; + + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + continue; + } + WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + + spin_lock(&udev->commands_lock); + cmd = idr_find(&udev->commands, entry->cmd_id); + if (cmd) + idr_remove(&udev->commands, cmd->cmd_id); + spin_unlock(&udev->commands_lock); + + if (!cmd) { + pr_err("cmd_id not found, ring is broken\n"); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + break; + } + + tcmu_handle_completion(cmd, entry); + + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + + handled++; + } + + if (mb->cmd_tail == mb->cmd_head) + del_timer(&udev->timeout); /* no more pending cmds */ + + spin_unlock_irqrestore(&udev->cmdr_lock, flags); + + wake_up(&udev->wait_cmdr); + + return handled; +} + +static int tcmu_check_expired_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + + if (!time_after(cmd->deadline, jiffies)) + return 0; + + set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); + + return 0; +} + +static void tcmu_device_timedout(unsigned long data) +{ + struct tcmu_dev *udev = (struct tcmu_dev *)data; + unsigned long flags; + int handled; + + handled = tcmu_handle_completions(udev); + + pr_warn("%d completions handled from timeout\n", handled); + + spin_lock_irqsave(&udev->commands_lock, flags); + idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); + spin_unlock_irqrestore(&udev->commands_lock, flags); + + /* + * We don't need to wakeup threads on wait_cmdr since they have their + * own timeout. + */ +} + +static int tcmu_attach_hba(struct se_hba *hba, u32 host_id) +{ + struct tcmu_hba *tcmu_hba; + + tcmu_hba = kzalloc(sizeof(struct tcmu_hba), GFP_KERNEL); + if (!tcmu_hba) + return -ENOMEM; + + tcmu_hba->host_id = host_id; + hba->hba_ptr = tcmu_hba; + + return 0; +} + +static void tcmu_detach_hba(struct se_hba *hba) +{ + kfree(hba->hba_ptr); + hba->hba_ptr = NULL; +} + +static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) +{ + struct tcmu_dev *udev; + + udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL); + if (!udev) + return NULL; + + udev->name = kstrdup(name, GFP_KERNEL); + if (!udev->name) { + kfree(udev); + return NULL; + } + + udev->hba = hba; + + init_waitqueue_head(&udev->wait_cmdr); + spin_lock_init(&udev->cmdr_lock); + + idr_init(&udev->commands); + spin_lock_init(&udev->commands_lock); + + setup_timer(&udev->timeout, tcmu_device_timedout, + (unsigned long)udev); + + udev->pass_level = TCMU_PASS_ALL; + + return &udev->se_dev; +} + +static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + + tcmu_handle_completions(tcmu_dev); + + return 0; +} + +/* + * mmap code from uio.c. Copied here because we want to hook mmap() + * and this stuff must come along. + */ +static int tcmu_find_mem_index(struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (info->mem[vma->vm_pgoff].size == 0) + return -1; + return (int)vma->vm_pgoff; + } + return -1; +} + +static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + struct page *page; + unsigned long offset; + void *addr; + + int mi = tcmu_find_mem_index(vma); + if (mi < 0) + return VM_FAULT_SIGBUS; + + /* + * We need to subtract mi because userspace uses offset = N*PAGE_SIZE + * to use mem[N]. + */ + offset = (vmf->pgoff - mi) << PAGE_SHIFT; + + addr = (void *)(unsigned long)info->mem[mi].addr + offset; + if (info->mem[mi].memtype == UIO_MEM_LOGICAL) + page = virt_to_page(addr); + else + page = vmalloc_to_page(addr); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct tcmu_vm_ops = { + .fault = tcmu_vma_fault, +}; + +static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &tcmu_vm_ops; + + vma->vm_private_data = udev; + + /* Ensure the mmap is exactly the right size */ + if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT)) + return -EINVAL; + + return 0; +} + +static int tcmu_open(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + /* O_EXCL not supported for char devs, so fake it? */ + if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) + return -EBUSY; + + pr_debug("open\n"); + + return 0; +} + +static int tcmu_release(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); + + pr_debug("close\n"); + + return 0; +} + +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +{ + struct sk_buff *skb; + void *msg_header; + int ret; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + + ret = genlmsg_end(skb, msg_header); + if (ret < 0) { + nlmsg_free(skb); + return ret; + } + + ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, + TCMU_MCGRP_CONFIG, GFP_KERNEL); + + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + + return ret; +} + +static int tcmu_configure_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + struct tcmu_hba *hba = udev->hba->hba_ptr; + struct uio_info *info; + struct tcmu_mailbox *mb; + size_t size; + size_t used; + int ret = 0; + char *str; + + info = &udev->uio_info; + + size = snprintf(NULL, 0, "tcm-user/%u/%s/%s", hba->host_id, udev->name, + udev->dev_config); + size += 1; /* for \0 */ + str = kmalloc(size, GFP_KERNEL); + if (!str) + return -ENOMEM; + + used = snprintf(str, size, "tcm-user/%u/%s", hba->host_id, udev->name); + + if (udev->dev_config[0]) + snprintf(str + used, size - used, "/%s", udev->dev_config); + + info->name = str; + + udev->mb_addr = vzalloc(TCMU_RING_SIZE); + if (!udev->mb_addr) { + ret = -ENOMEM; + goto err_vzalloc; + } + + /* mailbox fits in first part of CMDR space */ + udev->cmdr_size = CMDR_SIZE - CMDR_OFF; + udev->data_off = CMDR_SIZE; + udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + + mb = udev->mb_addr; + mb->version = 1; + mb->cmdr_off = CMDR_OFF; + mb->cmdr_size = udev->cmdr_size; + + WARN_ON(!PAGE_ALIGNED(udev->data_off)); + WARN_ON(udev->data_size % PAGE_SIZE); + + info->version = "1"; + + info->mem[0].name = "tcm-user command & data buffer"; + info->mem[0].addr = (phys_addr_t) udev->mb_addr; + info->mem[0].size = TCMU_RING_SIZE; + info->mem[0].memtype = UIO_MEM_VIRTUAL; + + info->irqcontrol = tcmu_irqcontrol; + info->irq = UIO_IRQ_CUSTOM; + + info->mmap = tcmu_mmap; + info->open = tcmu_open; + info->release = tcmu_release; + + ret = uio_register_device(tcmu_root_device, info); + if (ret) + goto err_register; + + /* Other attributes can be configured in userspace */ + dev->dev_attrib.hw_block_size = 512; + dev->dev_attrib.hw_max_sectors = 128; + dev->dev_attrib.hw_queue_depth = 128; + + ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + if (ret) + goto err_netlink; + + return 0; + +err_netlink: + uio_unregister_device(&udev->uio_info); +err_register: + vfree(udev->mb_addr); +err_vzalloc: + kfree(info->name); + + return ret; +} + +static int tcmu_check_pending_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + return -EINVAL; +} + +static void tcmu_free_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + int i; + + del_timer_sync(&udev->timeout); + + vfree(udev->mb_addr); + + /* Upper layer should drain all requests before calling this */ + spin_lock_irq(&udev->commands_lock); + i = idr_for_each(&udev->commands, tcmu_check_pending_cmd, NULL); + idr_destroy(&udev->commands); + spin_unlock_irq(&udev->commands_lock); + WARN_ON(i); + + /* Device was configured */ + if (udev->uio_info.uio_dev) { + tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + + uio_unregister_device(&udev->uio_info); + kfree(udev->uio_info.name); + kfree(udev->name); + } + + kfree(udev); +} + +enum { + Opt_dev_config, Opt_dev_size, Opt_err, Opt_pass_level, +}; + +static match_table_t tokens = { + {Opt_dev_config, "dev_config=%s"}, + {Opt_dev_size, "dev_size=%u"}, + {Opt_pass_level, "pass_level=%u"}, + {Opt_err, NULL} +}; + +static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, + const char *page, ssize_t count) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + char *orig, *ptr, *opts, *arg_p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0, token; + int arg; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + orig = opts; + + while ((ptr = strsep(&opts, ",\n")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, tokens, args); + switch (token) { + case Opt_dev_config: + if (match_strlcpy(udev->dev_config, &args[0], + TCMU_CONFIG_LEN) == 0) { + ret = -EINVAL; + break; + } + pr_debug("TCMU: Referencing Path: %s\n", udev->dev_config); + break; + case Opt_dev_size: + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtoul(arg_p, 0, (unsigned long *) &udev->dev_size); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtoul() failed for dev_size=\n"); + break; + case Opt_pass_level: + match_int(args, &arg); + if (arg >= TCMU_PASS_INVALID) { + pr_warn("TCMU: Invalid pass_level: %d\n", arg); + break; + } + + pr_debug("TCMU: Setting pass_level to %d\n", arg); + udev->pass_level = arg; + break; + default: + break; + } + } + + kfree(orig); + return (!ret) ? count : ret; +} + +static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + ssize_t bl = 0; + + bl = sprintf(b + bl, "Config: %s ", + udev->dev_config[0] ? udev->dev_config : "NULL"); + bl += sprintf(b + bl, "Size: %zu PassLevel: %u\n", + udev->dev_size, udev->pass_level); + + return bl; +} + +static sector_t tcmu_get_blocks(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + + return div_u64(udev->dev_size - dev->dev_attrib.block_size, + dev->dev_attrib.block_size); +} + +static sense_reason_t +tcmu_execute_rw(struct se_cmd *se_cmd, struct scatterlist *sgl, u32 sgl_nents, + enum dma_data_direction data_direction) +{ + int ret; + + ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static sense_reason_t +tcmu_pass_op(struct se_cmd *se_cmd) +{ + int ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static struct sbc_ops tcmu_sbc_ops = { + .execute_rw = tcmu_execute_rw, + .execute_sync_cache = tcmu_pass_op, + .execute_write_same = tcmu_pass_op, + .execute_write_same_unmap = tcmu_pass_op, + .execute_unmap = tcmu_pass_op, +}; + +static sense_reason_t +tcmu_parse_cdb(struct se_cmd *cmd) +{ + unsigned char *cdb = cmd->t_task_cdb; + struct tcmu_dev *udev = TCMU_DEV(cmd->se_dev); + sense_reason_t ret; + + switch (udev->pass_level) { + case TCMU_PASS_ALL: + /* We're just like pscsi, then */ + /* + * For REPORT LUNS we always need to emulate the response, for everything + * else, pass it up. + */ + switch (cdb[0]) { + case REPORT_LUNS: + cmd->execute_cmd = spc_emulate_report_luns; + break; + case READ_6: + case READ_10: + case READ_12: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case WRITE_VERIFY: + cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB; + /* FALLTHROUGH */ + default: + cmd->execute_cmd = tcmu_pass_op; + } + ret = TCM_NO_SENSE; + break; + case TCMU_PASS_IO: + ret = sbc_parse_cdb(cmd, &tcmu_sbc_ops); + break; + default: + pr_err("Unknown tcm-user pass level %d\n", udev->pass_level); + ret = TCM_CHECK_CONDITION_ABORT_CMD; + } + + return ret; +} + +static struct se_subsystem_api tcmu_template = { + .name = "user", + .inquiry_prod = "USER", + .inquiry_rev = TCMU_VERSION, + .owner = THIS_MODULE, + .transport_type = TRANSPORT_PLUGIN_VHBA_PDEV, + .attach_hba = tcmu_attach_hba, + .detach_hba = tcmu_detach_hba, + .alloc_device = tcmu_alloc_device, + .configure_device = tcmu_configure_device, + .free_device = tcmu_free_device, + .parse_cdb = tcmu_parse_cdb, + .set_configfs_dev_params = tcmu_set_configfs_dev_params, + .show_configfs_dev_params = tcmu_show_configfs_dev_params, + .get_device_type = sbc_get_device_type, + .get_blocks = tcmu_get_blocks, +}; + +static int __init tcmu_module_init(void) +{ + int ret; + + BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); + + tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache", + sizeof(struct tcmu_cmd), + __alignof__(struct tcmu_cmd), + 0, NULL); + if (!tcmu_cmd_cache) + return -ENOMEM; + + tcmu_root_device = root_device_register("tcm_user"); + if (IS_ERR(tcmu_root_device)) { + ret = PTR_ERR(tcmu_root_device); + goto out_free_cache; + } + + ret = genl_register_family(&tcmu_genl_family); + if (ret < 0) { + goto out_unreg_device; + } + + ret = transport_subsystem_register(&tcmu_template); + if (ret) + goto out_unreg_genl; + + return 0; + +out_unreg_genl: + genl_unregister_family(&tcmu_genl_family); +out_unreg_device: + root_device_unregister(tcmu_root_device); +out_free_cache: + kmem_cache_destroy(tcmu_cmd_cache); + + return ret; +} + +static void __exit tcmu_module_exit(void) +{ + transport_subsystem_release(&tcmu_template); + genl_unregister_family(&tcmu_genl_family); + root_device_unregister(tcmu_root_device); + kmem_cache_destroy(tcmu_cmd_cache); +} + +MODULE_DESCRIPTION("TCM USER subsystem plugin"); +MODULE_AUTHOR("Shaohua Li "); +MODULE_AUTHOR("Andy Grover "); +MODULE_LICENSE("GPL"); + +module_init(tcmu_module_init); +module_exit(tcmu_module_exit); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index be88166349a1..6ebd0d1faf2e 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -371,6 +371,7 @@ header-y += swab.h header-y += synclink.h header-y += sysctl.h header-y += sysinfo.h +header-y += target_core_user.h header-y += taskstats.h header-y += tcp.h header-y += tcp_metrics.h diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h @@ -0,0 +1,142 @@ +#ifndef __TARGET_CORE_USER_H +#define __TARGET_CORE_USER_H + +/* This header will be used by application too */ + +#include +#include + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define TCMU_VERSION "1.0" + +/* + * Ring Design + * ----------- + * + * The mmaped area is divided into three parts: + * 1) The mailbox (struct tcmu_mailbox, below) + * 2) The command ring + * 3) Everything beyond the command ring (data) + * + * The mailbox tells userspace the offset of the command ring from the + * start of the shared memory region, and how big the command ring is. + * + * The kernel passes SCSI commands to userspace by putting a struct + * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking + * userspace via uio's interrupt mechanism. + * + * tcmu_cmd_entry contains a header. If the header type is PAD, + * userspace should skip hdr->length bytes (mod cmdr_size) to find the + * next cmd_entry. + * + * Otherwise, the entry will contain offsets into the mmaped area that + * contain the cdb and data buffers -- the latter accessible via the + * iov array. iov addresses are also offsets into the shared area. + * + * When userspace is completed handling the command, set + * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, + * and also set mailbox->cmd_tail equal to the old cmd_tail plus + * hdr->length, mod cmdr_size. If cmd_tail doesn't equal cmd_head, it + * should process the next packet the same way, and so on. + */ + +#define TCMU_MAILBOX_VERSION 1 +#define ALIGN_SIZE 64 /* Should be enough for most CPUs */ + +struct tcmu_mailbox { + __u16 version; + __u16 flags; + __u32 cmdr_off; + __u32 cmdr_size; + + __u32 cmd_head; + + /* Updated by user. On its own cacheline */ + __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); + +} __packed; + +enum tcmu_opcode { + TCMU_OP_PAD = 0, + TCMU_OP_CMD, +}; + +/* + * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode. + */ +struct tcmu_cmd_entry_hdr { + __u32 len_op; +} __packed; + +#define TCMU_OP_MASK 0x7 + +static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +{ + hdr->len_op &= ~TCMU_OP_MASK; + hdr->len_op |= (op & TCMU_OP_MASK); +} + +static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & ~TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +{ + hdr->len_op &= TCMU_OP_MASK; + hdr->len_op |= len; +} + +/* Currently the same as SCSI_SENSE_BUFFERSIZE */ +#define TCMU_SENSE_BUFFERSIZE 96 + +struct tcmu_cmd_entry { + struct tcmu_cmd_entry_hdr hdr; + + uint16_t cmd_id; + uint16_t __pad1; + + union { + struct { + uint64_t cdb_off; + uint64_t iov_cnt; + struct iovec iov[0]; + } req; + struct { + uint8_t scsi_status; + uint8_t __pad1; + uint16_t __pad2; + uint32_t __pad3; + char sense_buffer[TCMU_SENSE_BUFFERSIZE]; + } rsp; + }; + +} __packed; + +#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) + +enum tcmu_genl_cmd { + TCMU_CMD_UNSPEC, + TCMU_CMD_ADDED_DEVICE, + TCMU_CMD_REMOVED_DEVICE, + __TCMU_CMD_MAX, +}; +#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) + +enum tcmu_genl_attr { + TCMU_ATTR_UNSPEC, + TCMU_ATTR_DEVICE, + TCMU_ATTR_MINOR, + __TCMU_ATTR_MAX, +}; +#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) + +#endif -- cgit From 92404e609a2dffc55a9a22540ed48b6f0edc9c59 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 4 Oct 2014 01:06:08 +0000 Subject: target: Add force_pr_aptpl device attribute This patch adds a force_pr_aptpl device attribute used to force SPC-3 PR Activate Persistence across Target Power Loss (APTPL) operation. This makes PR metadata write-out occur during state change regardless if new PERSISTENT_RESERVE_OUT CDBs have their APTPL feature bit set. This is useful during H/A failover in active/passive setups where all PR state is being re-created on a different node, driven by configfs backend device + export layout and pre-loaded $DEV/pr/res_aptpl_metadata. Cc: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_configfs.c | 4 ++++ drivers/target/target_core_device.c | 18 ++++++++++++++++++ drivers/target/target_core_internal.h | 1 + drivers/target/target_core_pr.c | 9 +++++++++ include/target/target_core_base.h | 3 +++ 5 files changed, 35 insertions(+) (limited to 'include') diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 291dc711fbc3..b30fc8f53bd8 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -665,6 +665,9 @@ SE_DEV_ATTR(is_nonrot, S_IRUGO | S_IWUSR); DEF_DEV_ATTRIB(emulate_rest_reord); SE_DEV_ATTR(emulate_rest_reord, S_IRUGO | S_IWUSR); +DEF_DEV_ATTRIB(force_pr_aptpl); +SE_DEV_ATTR(force_pr_aptpl, S_IRUGO | S_IWUSR); + DEF_DEV_ATTRIB_RO(hw_block_size); SE_DEV_ATTR_RO(hw_block_size); @@ -719,6 +722,7 @@ static struct configfs_attribute *target_core_dev_attrib_attrs[] = { &target_core_dev_attrib_hw_pi_prot_type.attr, &target_core_dev_attrib_pi_prot_format.attr, &target_core_dev_attrib_enforce_pr_isids.attr, + &target_core_dev_attrib_force_pr_aptpl.attr, &target_core_dev_attrib_is_nonrot.attr, &target_core_dev_attrib_emulate_rest_reord.attr, &target_core_dev_attrib_hw_block_size.attr, diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index d18dd8b532a8..c45f9e907e44 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -1018,6 +1018,23 @@ int se_dev_set_enforce_pr_isids(struct se_device *dev, int flag) return 0; } +int se_dev_set_force_pr_aptpl(struct se_device *dev, int flag) +{ + if ((flag != 0) && (flag != 1)) { + printk(KERN_ERR "Illegal value %d\n", flag); + return -EINVAL; + } + if (dev->export_count) { + pr_err("dev[%p]: Unable to set force_pr_aptpl while" + " export_count is %d\n", dev, dev->export_count); + return -EINVAL; + } + + dev->dev_attrib.force_pr_aptpl = flag; + pr_debug("dev[%p]: SE Device force_pr_aptpl: %d\n", dev, flag); + return 0; +} + int se_dev_set_is_nonrot(struct se_device *dev, int flag) { if ((flag != 0) && (flag != 1)) { @@ -1544,6 +1561,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) dev->dev_attrib.emulate_3pc = DA_EMULATE_3PC; dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE0_PROT; dev->dev_attrib.enforce_pr_isids = DA_ENFORCE_PR_ISIDS; + dev->dev_attrib.force_pr_aptpl = DA_FORCE_PR_APTPL; dev->dev_attrib.is_nonrot = DA_IS_NONROT; dev->dev_attrib.emulate_rest_reord = DA_EMULATE_REST_REORD; dev->dev_attrib.max_unmap_lba_count = DA_MAX_UNMAP_LBA_COUNT; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 42ef4ab70585..e31f42f369ff 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -38,6 +38,7 @@ int se_dev_set_emulate_3pc(struct se_device *, int); int se_dev_set_pi_prot_type(struct se_device *, int); int se_dev_set_pi_prot_format(struct se_device *, int); int se_dev_set_enforce_pr_isids(struct se_device *, int); +int se_dev_set_force_pr_aptpl(struct se_device *, int); int se_dev_set_is_nonrot(struct se_device *, int); int se_dev_set_emulate_rest_reord(struct se_device *dev, int); int se_dev_set_queue_depth(struct se_device *, u32); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index a06edb59b67f..8c60a1a1ae8d 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -895,6 +895,7 @@ static int __core_scsi3_check_aptpl_registration( spin_lock(&pr_tmpl->aptpl_reg_lock); list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->aptpl_reg_list, pr_reg_aptpl_list) { + if (!strcmp(pr_reg->pr_iport, i_port) && (pr_reg->pr_res_mapped_lun == deve->mapped_lun) && !(strcmp(pr_reg->pr_tport, t_port)) && @@ -3470,6 +3471,7 @@ static unsigned long long core_scsi3_extract_reservation_key(unsigned char *cdb) sense_reason_t target_scsi3_emulate_pr_out(struct se_cmd *cmd) { + struct se_device *dev = cmd->se_dev; unsigned char *cdb = &cmd->t_task_cdb[0]; unsigned char *buf; u64 res_key, sa_res_key; @@ -3534,6 +3536,13 @@ target_scsi3_emulate_pr_out(struct se_cmd *cmd) aptpl = (buf[17] & 0x01); unreg = (buf[17] & 0x02); } + /* + * If the backend device has been configured to force APTPL metadata + * write-out, go ahead and propigate aptpl=1 down now. + */ + if (dev->dev_attrib.force_pr_aptpl) + aptpl = 1; + transport_kunmap_data_sg(cmd); buf = NULL; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index b106240d8385..23c518a0340c 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -108,6 +108,8 @@ #define DA_EMULATE_ALUA 0 /* Enforce SCSI Initiator Port TransportID with 'ISID' for PR */ #define DA_ENFORCE_PR_ISIDS 1 +/* Force SPC-3 PR Activate Persistence across Target Power Loss */ +#define DA_FORCE_PR_APTPL 0 #define DA_STATUS_MAX_SECTORS_MIN 16 #define DA_STATUS_MAX_SECTORS_MAX 8192 /* By default don't report non-rotating (solid state) medium */ @@ -680,6 +682,7 @@ struct se_dev_attrib { enum target_prot_type pi_prot_type; enum target_prot_type hw_pi_prot_type; int enforce_pr_isids; + int force_pr_aptpl; int is_nonrot; int emulate_rest_reord; u32 hw_block_size; -- cgit From f2fc42b6ac31f4d808da7a9da460dd433a71e976 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 12 Jun 2014 22:30:34 +0530 Subject: mailbox: rename pl320-ipc specific mailbox.h The patch 30058677 "ARM / highbank: add support for pl320 IPC" added a pl320 IPC specific header file as a generic mailbox.h. This file has been renamed appropriately to allow the introduction of the generic mailbox API framework. Acked-by: Mark Langsdorf Cc: Rafael J. Wysocki Signed-off-by: Suman Anna Reviewed-by: Mark Brown Acked-by: Arnd Bergmann --- arch/arm/mach-highbank/highbank.c | 2 +- drivers/cpufreq/highbank-cpufreq.c | 2 +- drivers/mailbox/pl320-ipc.c | 2 +- include/linux/mailbox.h | 17 ----------------- include/linux/pl320-ipc.h | 17 +++++++++++++++++ 5 files changed, 20 insertions(+), 20 deletions(-) delete mode 100644 include/linux/mailbox.h create mode 100644 include/linux/pl320-ipc.h (limited to 'include') diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 8c35ae4ff176..07a09570175d 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c index bf8902a0866d..b464f29d8d54 100644 --- a/drivers/cpufreq/highbank-cpufreq.c +++ b/drivers/cpufreq/highbank-cpufreq.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define HB_CPUFREQ_CHANGE_NOTE 0x80000001 diff --git a/drivers/mailbox/pl320-ipc.c b/drivers/mailbox/pl320-ipc.c index d873cbae2fbb..f3755e0aa935 100644 --- a/drivers/mailbox/pl320-ipc.c +++ b/drivers/mailbox/pl320-ipc.c @@ -26,7 +26,7 @@ #include #include -#include +#include #define IPCMxSOURCE(m) ((m) * 0x40) #define IPCMxDSET(m) (((m) * 0x40) + 0x004) diff --git a/include/linux/mailbox.h b/include/linux/mailbox.h deleted file mode 100644 index 5161f63ec1c8..000000000000 --- a/include/linux/mailbox.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -int pl320_ipc_transmit(u32 *data); -int pl320_ipc_register_notifier(struct notifier_block *nb); -int pl320_ipc_unregister_notifier(struct notifier_block *nb); diff --git a/include/linux/pl320-ipc.h b/include/linux/pl320-ipc.h new file mode 100644 index 000000000000..5161f63ec1c8 --- /dev/null +++ b/include/linux/pl320-ipc.h @@ -0,0 +1,17 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +int pl320_ipc_transmit(u32 *data); +int pl320_ipc_register_notifier(struct notifier_block *nb); +int pl320_ipc_unregister_notifier(struct notifier_block *nb); -- cgit From 2b6d83e2b8b7de82331a6a1dcd64b51020a6031c Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Thu, 12 Jun 2014 22:31:19 +0530 Subject: mailbox: Introduce framework for mailbox Introduce common framework for client/protocol drivers and controller drivers of Inter-Processor-Communication (IPC). Client driver developers should have a look at include/linux/mailbox_client.h to understand the part of the API exposed to client drivers. Similarly controller driver developers should have a look at include/linux/mailbox_controller.h Reviewed-by: Mark Brown Signed-off-by: Jassi Brar --- MAINTAINERS | 8 + drivers/mailbox/Makefile | 4 + drivers/mailbox/mailbox.c | 465 +++++++++++++++++++++++++++++++++++++ include/linux/mailbox_client.h | 46 ++++ include/linux/mailbox_controller.h | 133 +++++++++++ 5 files changed, 656 insertions(+) create mode 100644 drivers/mailbox/mailbox.c create mode 100644 include/linux/mailbox_client.h create mode 100644 include/linux/mailbox_controller.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a12edf2624e5..c5f7c854b6d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5740,6 +5740,14 @@ S: Maintained F: drivers/net/macvlan.c F: include/linux/if_macvlan.h +MAILBOX API +M: Jassi Brar +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/ +F: include/linux/mailbox_client.h +F: include/linux/mailbox_controller.h + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk W: http://www.kernel.org/doc/man-pages diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 6d184dbcaca8..94ed7cefb14d 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -1,3 +1,7 @@ +# Generic MAILBOX API + +obj-$(CONFIG_MAILBOX) += mailbox.o + obj-$(CONFIG_PL320_MBOX) += pl320-ipc.o obj-$(CONFIG_OMAP2PLUS_MBOX) += omap-mailbox.o diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c new file mode 100644 index 000000000000..afcb430508ec --- /dev/null +++ b/drivers/mailbox/mailbox.c @@ -0,0 +1,465 @@ +/* + * Mailbox: Common code for Mailbox controllers and users + * + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK recevied by Client ticks the TX */ + +static LIST_HEAD(mbox_cons); +static DEFINE_MUTEX(con_mutex); + +static int add_to_rbuf(struct mbox_chan *chan, void *mssg) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + /* See if there is any space left */ + if (chan->msg_count == MBOX_TX_QUEUE_LEN) { + spin_unlock_irqrestore(&chan->lock, flags); + return -ENOBUFS; + } + + idx = chan->msg_free; + chan->msg_data[idx] = mssg; + chan->msg_count++; + + if (idx == MBOX_TX_QUEUE_LEN - 1) + chan->msg_free = 0; + else + chan->msg_free++; + + spin_unlock_irqrestore(&chan->lock, flags); + + return idx; +} + +static void msg_submit(struct mbox_chan *chan) +{ + unsigned count, idx; + unsigned long flags; + void *data; + int err; + + spin_lock_irqsave(&chan->lock, flags); + + if (!chan->msg_count || chan->active_req) + goto exit; + + count = chan->msg_count; + idx = chan->msg_free; + if (idx >= count) + idx -= count; + else + idx += MBOX_TX_QUEUE_LEN - count; + + data = chan->msg_data[idx]; + + /* Try to submit a message to the MBOX controller */ + err = chan->mbox->ops->send_data(chan, data); + if (!err) { + chan->active_req = data; + chan->msg_count--; + } +exit: + spin_unlock_irqrestore(&chan->lock, flags); +} + +static void tx_tick(struct mbox_chan *chan, int r) +{ + unsigned long flags; + void *mssg; + + spin_lock_irqsave(&chan->lock, flags); + mssg = chan->active_req; + chan->active_req = NULL; + spin_unlock_irqrestore(&chan->lock, flags); + + /* Submit next message */ + msg_submit(chan); + + /* Notify the client */ + if (mssg && chan->cl->tx_done) + chan->cl->tx_done(chan->cl, mssg, r); + + if (chan->cl->tx_block) + complete(&chan->tx_complete); +} + +static void poll_txdone(unsigned long data) +{ + struct mbox_controller *mbox = (struct mbox_controller *)data; + bool txdone, resched = false; + int i; + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + if (chan->active_req && chan->cl) { + resched = true; + txdone = chan->mbox->ops->last_tx_done(chan); + if (txdone) + tx_tick(chan, 0); + } + } + + if (resched) + mod_timer(&mbox->poll, jiffies + + msecs_to_jiffies(mbox->txpoll_period)); +} + +/** + * mbox_chan_received_data - A way for controller driver to push data + * received from remote to the upper layer. + * @chan: Pointer to the mailbox channel on which RX happened. + * @mssg: Client specific message typecasted as void * + * + * After startup and before shutdown any data received on the chan + * is passed on to the API via atomic mbox_chan_received_data(). + * The controller should ACK the RX only after this call returns. + */ +void mbox_chan_received_data(struct mbox_chan *chan, void *mssg) +{ + /* No buffering the received data */ + if (chan->cl->rx_callback) + chan->cl->rx_callback(chan->cl, mssg); +} +EXPORT_SYMBOL_GPL(mbox_chan_received_data); + +/** + * mbox_chan_txdone - A way for controller driver to notify the + * framework that the last TX has completed. + * @chan: Pointer to the mailbox chan on which TX happened. + * @r: Status of last TX - OK or ERROR + * + * The controller that has IRQ for TX ACK calls this atomic API + * to tick the TX state machine. It works only if txdone_irq + * is set by the controller. + */ +void mbox_chan_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_IRQ))) { + dev_err(chan->mbox->dev, + "Controller can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_chan_txdone); + +/** + * mbox_client_txdone - The way for a client to run the TX state machine. + * @chan: Mailbox channel assigned to this client. + * @r: Success status of last transmission. + * + * The client/protocol had received some 'ACK' packet and it notifies + * the API that the last packet was sent successfully. This only works + * if the controller can't sense TX-Done. + */ +void mbox_client_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_ACK))) { + dev_err(chan->mbox->dev, "Client can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_client_txdone); + +/** + * mbox_client_peek_data - A way for client driver to pull data + * received from remote by the controller. + * @chan: Mailbox channel assigned to this client. + * + * A poke to controller driver for any received data. + * The data is actually passed onto client via the + * mbox_chan_received_data() + * The call can be made from atomic context, so the controller's + * implementation of peek_data() must not sleep. + * + * Return: True, if controller has, and is going to push after this, + * some data. + * False, if controller doesn't have any data to be read. + */ +bool mbox_client_peek_data(struct mbox_chan *chan) +{ + if (chan->mbox->ops->peek_data) + return chan->mbox->ops->peek_data(chan); + + return false; +} +EXPORT_SYMBOL_GPL(mbox_client_peek_data); + +/** + * mbox_send_message - For client to submit a message to be + * sent to the remote. + * @chan: Mailbox channel assigned to this client. + * @mssg: Client specific message typecasted. + * + * For client to submit data to the controller destined for a remote + * processor. If the client had set 'tx_block', the call will return + * either when the remote receives the data or when 'tx_tout' millisecs + * run out. + * In non-blocking mode, the requests are buffered by the API and a + * non-negative token is returned for each queued request. If the request + * is not queued, a negative token is returned. Upon failure or successful + * TX, the API calls 'tx_done' from atomic context, from which the client + * could submit yet another request. + * The pointer to message should be preserved until it is sent + * over the chan, i.e, tx_done() is made. + * This function could be called from atomic context as it simply + * queues the data and returns a token against the request. + * + * Return: Non-negative integer for successful submission (non-blocking mode) + * or transmission over chan (blocking mode). + * Negative value denotes failure. + */ +int mbox_send_message(struct mbox_chan *chan, void *mssg) +{ + int t; + + if (!chan || !chan->cl) + return -EINVAL; + + t = add_to_rbuf(chan, mssg); + if (t < 0) { + dev_err(chan->mbox->dev, "Try increasing MBOX_TX_QUEUE_LEN\n"); + return t; + } + + msg_submit(chan); + + if (chan->txdone_method == TXDONE_BY_POLL) + poll_txdone((unsigned long)chan->mbox); + + if (chan->cl->tx_block && chan->active_req) { + unsigned long wait; + int ret; + + if (!chan->cl->tx_tout) /* wait forever */ + wait = msecs_to_jiffies(3600000); + else + wait = msecs_to_jiffies(chan->cl->tx_tout); + + ret = wait_for_completion_timeout(&chan->tx_complete, wait); + if (ret == 0) { + t = -EIO; + tx_tick(chan, -EIO); + } + } + + return t; +} +EXPORT_SYMBOL_GPL(mbox_send_message); + +/** + * mbox_request_channel - Request a mailbox channel. + * @cl: Identity of the client requesting the channel. + * @index: Index of mailbox specifier in 'mboxes' property. + * + * The Client specifies its requirements and capabilities while asking for + * a mailbox channel. It can't be called from atomic context. + * The channel is exclusively allocated and can't be used by another + * client before the owner calls mbox_free_channel. + * After assignment, any packet received on this channel will be + * handed over to the client via the 'rx_callback'. + * The framework holds reference to the client, so the mbox_client + * structure shouldn't be modified until the mbox_free_channel returns. + * + * Return: Pointer to the channel assigned to the client if successful. + * ERR_PTR for request failure. + */ +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) +{ + struct device *dev = cl->dev; + struct mbox_controller *mbox; + struct of_phandle_args spec; + struct mbox_chan *chan; + unsigned long flags; + int ret; + + if (!dev || !dev->of_node) { + pr_debug("%s: No owner device node\n", __func__); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&con_mutex); + + if (of_parse_phandle_with_args(dev->of_node, "mboxes", + "#mbox-cells", index, &spec)) { + dev_dbg(dev, "%s: can't parse \"mboxes\" property\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-ENODEV); + } + + chan = NULL; + list_for_each_entry(mbox, &mbox_cons, node) + if (mbox->dev->of_node == spec.np) { + chan = mbox->of_xlate(mbox, &spec); + break; + } + + of_node_put(spec.np); + + if (!chan || chan->cl || !try_module_get(mbox->dev->driver->owner)) { + dev_dbg(dev, "%s: mailbox not free\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-EBUSY); + } + + spin_lock_irqsave(&chan->lock, flags); + chan->msg_free = 0; + chan->msg_count = 0; + chan->active_req = NULL; + chan->cl = cl; + init_completion(&chan->tx_complete); + + if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) + chan->txdone_method |= TXDONE_BY_ACK; + + spin_unlock_irqrestore(&chan->lock, flags); + + ret = chan->mbox->ops->startup(chan); + if (ret) { + dev_err(dev, "Unable to startup the chan (%d)\n", ret); + mbox_free_channel(chan); + chan = ERR_PTR(ret); + } + + mutex_unlock(&con_mutex); + return chan; +} +EXPORT_SYMBOL_GPL(mbox_request_channel); + +/** + * mbox_free_channel - The client relinquishes control of a mailbox + * channel by this call. + * @chan: The mailbox channel to be freed. + */ +void mbox_free_channel(struct mbox_chan *chan) +{ + unsigned long flags; + + if (!chan || !chan->cl) + return; + + chan->mbox->ops->shutdown(chan); + + /* The queued TX requests are simply aborted, no callbacks are made */ + spin_lock_irqsave(&chan->lock, flags); + chan->cl = NULL; + chan->active_req = NULL; + if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + chan->txdone_method = TXDONE_BY_POLL; + + module_put(chan->mbox->dev->driver->owner); + spin_unlock_irqrestore(&chan->lock, flags); +} +EXPORT_SYMBOL_GPL(mbox_free_channel); + +static struct mbox_chan * +of_mbox_index_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *sp) +{ + int ind = sp->args[0]; + + if (ind >= mbox->num_chans) + return NULL; + + return &mbox->chans[ind]; +} + +/** + * mbox_controller_register - Register the mailbox controller + * @mbox: Pointer to the mailbox controller. + * + * The controller driver registers its communication channels + */ +int mbox_controller_register(struct mbox_controller *mbox) +{ + int i, txdone; + + /* Sanity check */ + if (!mbox || !mbox->dev || !mbox->ops || !mbox->num_chans) + return -EINVAL; + + if (mbox->txdone_irq) + txdone = TXDONE_BY_IRQ; + else if (mbox->txdone_poll) + txdone = TXDONE_BY_POLL; + else /* It has to be ACK then */ + txdone = TXDONE_BY_ACK; + + if (txdone == TXDONE_BY_POLL) { + mbox->poll.function = &poll_txdone; + mbox->poll.data = (unsigned long)mbox; + init_timer(&mbox->poll); + } + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + chan->cl = NULL; + chan->mbox = mbox; + chan->txdone_method = txdone; + spin_lock_init(&chan->lock); + } + + if (!mbox->of_xlate) + mbox->of_xlate = of_mbox_index_xlate; + + mutex_lock(&con_mutex); + list_add_tail(&mbox->node, &mbox_cons); + mutex_unlock(&con_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mbox_controller_register); + +/** + * mbox_controller_unregister - Unregister the mailbox controller + * @mbox: Pointer to the mailbox controller. + */ +void mbox_controller_unregister(struct mbox_controller *mbox) +{ + int i; + + if (!mbox) + return; + + mutex_lock(&con_mutex); + + list_del(&mbox->node); + + for (i = 0; i < mbox->num_chans; i++) + mbox_free_channel(&mbox->chans[i]); + + if (mbox->txdone_poll) + del_timer_sync(&mbox->poll); + + mutex_unlock(&con_mutex); +} +EXPORT_SYMBOL_GPL(mbox_controller_unregister); diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h new file mode 100644 index 000000000000..307d9cab2026 --- /dev/null +++ b/include/linux/mailbox_client.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CLIENT_H +#define __MAILBOX_CLIENT_H + +#include +#include + +struct mbox_chan; + +/** + * struct mbox_client - User of a mailbox + * @dev: The client device + * @tx_block: If the mbox_send_message should block until data is + * transmitted. + * @tx_tout: Max block period in ms before TX is assumed failure + * @knows_txdone: If the client could run the TX state machine. Usually + * if the client receives some ACK packet for transmission. + * Unused if the controller already has TX_Done/RTR IRQ. + * @rx_callback: Atomic callback to provide client the data received + * @tx_done: Atomic callback to tell client of data transmission + */ +struct mbox_client { + struct device *dev; + bool tx_block; + unsigned long tx_tout; + bool knows_txdone; + + void (*rx_callback)(struct mbox_client *cl, void *mssg); + void (*tx_done)(struct mbox_client *cl, void *mssg, int r); +}; + +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); +int mbox_send_message(struct mbox_chan *chan, void *mssg); +void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ +bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ + +#endif /* __MAILBOX_CLIENT_H */ diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h new file mode 100644 index 000000000000..d4cf96f07cfc --- /dev/null +++ b/include/linux/mailbox_controller.h @@ -0,0 +1,133 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CONTROLLER_H +#define __MAILBOX_CONTROLLER_H + +#include +#include +#include +#include +#include + +struct mbox_chan; + +/** + * struct mbox_chan_ops - methods to control mailbox channels + * @send_data: The API asks the MBOX controller driver, in atomic + * context try to transmit a message on the bus. Returns 0 if + * data is accepted for transmission, -EBUSY while rejecting + * if the remote hasn't yet read the last data sent. Actual + * transmission of data is reported by the controller via + * mbox_chan_txdone (if it has some TX ACK irq). It must not + * sleep. + * @startup: Called when a client requests the chan. The controller + * could ask clients for additional parameters of communication + * to be provided via client's chan_data. This call may + * block. After this call the Controller must forward any + * data received on the chan by calling mbox_chan_received_data. + * The controller may do stuff that need to sleep. + * @shutdown: Called when a client relinquishes control of a chan. + * This call may block too. The controller must not forward + * any received data anymore. + * The controller may do stuff that need to sleep. + * @last_tx_done: If the controller sets 'txdone_poll', the API calls + * this to poll status of last TX. The controller must + * give priority to IRQ method over polling and never + * set both txdone_poll and txdone_irq. Only in polling + * mode 'send_data' is expected to return -EBUSY. + * The controller may do stuff that need to sleep/block. + * Used only if txdone_poll:=true && txdone_irq:=false + * @peek_data: Atomic check for any received data. Return true if controller + * has some data to push to the client. False otherwise. + */ +struct mbox_chan_ops { + int (*send_data)(struct mbox_chan *chan, void *data); + int (*startup)(struct mbox_chan *chan); + void (*shutdown)(struct mbox_chan *chan); + bool (*last_tx_done)(struct mbox_chan *chan); + bool (*peek_data)(struct mbox_chan *chan); +}; + +/** + * struct mbox_controller - Controller of a class of communication channels + * @dev: Device backing this controller + * @ops: Operators that work on each communication chan + * @chans: Array of channels + * @num_chans: Number of channels in the 'chans' array. + * @txdone_irq: Indicates if the controller can report to API when + * the last transmitted data was read by the remote. + * Eg, if it has some TX ACK irq. + * @txdone_poll: If the controller can read but not report the TX + * done. Ex, some register shows the TX status but + * no interrupt rises. Ignored if 'txdone_irq' is set. + * @txpoll_period: If 'txdone_poll' is in effect, the API polls for + * last TX's status after these many millisecs + * @of_xlate: Controller driver specific mapping of channel via DT + * @poll: API private. Used to poll for TXDONE on all channels. + * @node: API private. To hook into list of controllers. + */ +struct mbox_controller { + struct device *dev; + struct mbox_chan_ops *ops; + struct mbox_chan *chans; + int num_chans; + bool txdone_irq; + bool txdone_poll; + unsigned txpoll_period; + struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, + const struct of_phandle_args *sp); + /* Internal to API */ + struct timer_list poll; + struct list_head node; +}; + +/* + * The length of circular buffer for queuing messages from a client. + * 'msg_count' tracks the number of buffered messages while 'msg_free' + * is the index where the next message would be buffered. + * We shouldn't need it too big because every transfer is interrupt + * triggered and if we have lots of data to transfer, the interrupt + * latencies are going to be the bottleneck, not the buffer length. + * Besides, mbox_send_message could be called from atomic context and + * the client could also queue another message from the notifier 'tx_done' + * of the last transfer done. + * REVISIT: If too many platforms see the "Try increasing MBOX_TX_QUEUE_LEN" + * print, it needs to be taken from config option or somesuch. + */ +#define MBOX_TX_QUEUE_LEN 20 + +/** + * struct mbox_chan - s/w representation of a communication chan + * @mbox: Pointer to the parent/provider of this channel + * @txdone_method: Way to detect TXDone chosen by the API + * @cl: Pointer to the current owner of this channel + * @tx_complete: Transmission completion + * @active_req: Currently active request hook + * @msg_count: No. of mssg currently queued + * @msg_free: Index of next available mssg slot + * @msg_data: Hook for data packet + * @lock: Serialise access to the channel + * @con_priv: Hook for controller driver to attach private data + */ +struct mbox_chan { + struct mbox_controller *mbox; + unsigned txdone_method; + struct mbox_client *cl; + struct completion tx_complete; + void *active_req; + unsigned msg_count, msg_free; + void *msg_data[MBOX_TX_QUEUE_LEN]; + spinlock_t lock; /* Serialise access to the channel */ + void *con_priv; +}; + +int mbox_controller_register(struct mbox_controller *mbox); /* can sleep */ +void mbox_controller_unregister(struct mbox_controller *mbox); /* can sleep */ +void mbox_chan_received_data(struct mbox_chan *chan, void *data); /* atomic */ +void mbox_chan_txdone(struct mbox_chan *chan, int r); /* atomic */ + +#endif /* __MAILBOX_CONTROLLER_H */ -- cgit From 083bf668cb70e47b84db64856606e94beac87f01 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Mar 2014 14:06:25 +0800 Subject: ACPI: make acpi_create_platform_device() an external API Signed-off-by: Zhang Rui --- drivers/acpi/acpi_platform.c | 1 + drivers/acpi/internal.h | 7 ------- include/linux/acpi.h | 1 + 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 2bf9082f7523..a3c89a1bcf54 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -113,3 +113,4 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) kfree(resources); return pdev; } +EXPORT_SYMBOL_GPL(acpi_create_platform_device); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index de47f9f746c9..f221d1eb594a 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -168,13 +168,6 @@ static inline int suspend_nvs_save(void) { return 0; } static inline void suspend_nvs_restore(void) {} #endif -/*-------------------------------------------------------------------------- - Platform bus support - -------------------------------------------------------------------------- */ -struct platform_device; - -struct platform_device *acpi_create_platform_device(struct acpi_device *adev); - /*-------------------------------------------------------------------------- Video -------------------------------------------------------------------------- */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 807cbc46d73e..2c24c2c1be45 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -432,6 +432,7 @@ static inline bool acpi_driver_match_device(struct device *dev, int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); +struct platform_device *acpi_create_platform_device(struct acpi_device *); #define ACPI_PTR(_ptr) (_ptr) #else /* !CONFIG_ACPI */ -- cgit From 2bb3a2bf9939f3361e25045f4ef7b136b864c3b8 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 19 Nov 2013 15:43:52 +0800 Subject: ACPI / fan: use acpi_device_xxx_power instead of acpi_bus equivelant When we have the acpi_device pointer, there is no need to pass the device's handle to the acpi_bus_xxx_power functions to get/set/update the device's power state, instead, use the acpi_device_xxx_power functions directly. To make this happen for fan module, export acpi_device_update_power. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/device_pm.c | 1 + drivers/acpi/fan.c | 10 +++++----- drivers/acpi/internal.h | 2 -- include/acpi/acpi_bus.h | 1 + 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 67075f800e34..91775475e367 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -343,6 +343,7 @@ int acpi_device_update_power(struct acpi_device *device, int *state_p) return 0; } +EXPORT_SYMBOL_GPL(acpi_device_update_power); int acpi_bus_update_power(acpi_handle handle, int *state_p) { diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index df861bbc73cc..fff9696bea25 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -92,7 +92,7 @@ static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long if (!device) return -EINVAL; - result = acpi_bus_update_power(device->handle, &acpi_state); + result = acpi_device_update_power(device, &acpi_state); if (result) return result; @@ -110,7 +110,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) if (!device || (state != 0 && state != 1)) return -EINVAL; - result = acpi_bus_set_power(device->handle, + result = acpi_device_set_power(device, state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); return result; @@ -134,7 +134,7 @@ static int acpi_fan_add(struct acpi_device *device) strcpy(acpi_device_name(device), "Fan"); strcpy(acpi_device_class(device), ACPI_FAN_CLASS); - result = acpi_bus_update_power(device->handle, NULL); + result = acpi_device_update_power(device, NULL); if (result) { printk(KERN_ERR PREFIX "Setting initial power state\n"); goto end; @@ -186,7 +186,7 @@ static int acpi_fan_remove(struct acpi_device *device) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { - acpi_bus_set_power(to_acpi_device(dev)->handle, ACPI_STATE_D0); + acpi_device_set_power(to_acpi_device(dev), ACPI_STATE_D0); return AE_OK; } @@ -195,7 +195,7 @@ static int acpi_fan_resume(struct device *dev) { int result; - result = acpi_bus_update_power(to_acpi_device(dev)->handle, NULL); + result = acpi_device_update_power(to_acpi_device(dev), NULL); if (result) printk(KERN_ERR PREFIX "Error updating fan power state\n"); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index f221d1eb594a..447f6d679b29 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -104,8 +104,6 @@ int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); -int acpi_device_update_power(struct acpi_device *device, int *state_p); - int acpi_wakeup_device_init(void); #ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index bcfd808b1098..6ca32812f3da 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -435,6 +435,7 @@ int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); int acpi_bus_update_power(acpi_handle handle, int *state_p); +int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); #ifdef CONFIG_PM -- cgit From 7b83fd9d91a411158f72d36958103c708c3b5a86 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 25 Mar 2014 10:40:09 +0800 Subject: Thermal: move the KELVIN_TO_MILLICELSIUS macro to thermal.h This macro can be used by other component so move it to a common header, but in a slightly different way: define two macros, one macro with an offset and the other doesn't. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/thermal.c | 18 +++++++++--------- include/linux/thermal.h | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 112817e963e0..d24fa1964eb8 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -528,7 +528,6 @@ static void acpi_thermal_check(void *data) } /* sys I/F for generic thermal sysfs support */ -#define KELVIN_TO_MILLICELSIUS(t, off) (((t) - (off)) * 100) static int thermal_get_temp(struct thermal_zone_device *thermal, unsigned long *temp) @@ -543,7 +542,8 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, if (result) return result; - *temp = KELVIN_TO_MILLICELSIUS(tz->temperature, tz->kelvin_offset); + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(tz->temperature, + tz->kelvin_offset); return 0; } @@ -647,7 +647,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.critical.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -657,7 +657,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.hot.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.hot.temperature, tz->kelvin_offset); return 0; @@ -667,7 +667,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.passive.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.passive.temperature, tz->kelvin_offset); return 0; @@ -678,7 +678,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.active[i].temperature, tz->kelvin_offset); return 0; @@ -694,7 +694,7 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, struct acpi_thermal *tz = thermal->devdata; if (tz->trips.critical.flags.valid) { - *temperature = KELVIN_TO_MILLICELSIUS( + *temperature = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -714,8 +714,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal, if (type == THERMAL_TRIP_ACTIVE) { unsigned long trip_temp; - unsigned long temp = KELVIN_TO_MILLICELSIUS(tz->temperature, - tz->kelvin_offset); + unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( + tz->temperature, tz->kelvin_offset); if (thermal_get_trip_temp(thermal, trip, &trip_temp)) return -EINVAL; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0305cde21a74..79ce6b94884a 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,6 +44,8 @@ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) +#define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) +#define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit From 77e337c6e23e3b9d22e09ffec202a80f755a54c2 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 3 Sep 2014 15:13:02 +0800 Subject: Thermal: introduce INT3402 thermal driver ACPI INT3402 device object could report temperature for the memory module. To expose such information to user space, a thermal zone device is registered for it so that the thermal sysfs interface can expose such information for userspace to use. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/int3402_thermal.c | 242 ++++++++++++++++++++++ include/linux/thermal.h | 2 + 3 files changed, 245 insertions(+) create mode 100644 drivers/thermal/int340x_thermal/int3402_thermal.c (limited to 'include') diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile index e10a53bcefe7..67c98fd24200 100644 --- a/drivers/thermal/int340x_thermal/Makefile +++ b/drivers/thermal/int340x_thermal/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c new file mode 100644 index 000000000000..a5d08c14ba24 --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3402_thermal.c @@ -0,0 +1,242 @@ +/* + * INT3402 thermal driver for memory temperature reporting + * + * Copyright (C) 2014, Intel Corporation + * Authors: Aaron Lu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include + +#define ACPI_ACTIVE_COOLING_MAX_NR 10 + +struct active_trip { + unsigned long temp; + int id; + bool valid; +}; + +struct int3402_thermal_data { + unsigned long *aux_trips; + int aux_trip_nr; + unsigned long psv_temp; + int psv_trip_id; + unsigned long crt_temp; + int crt_trip_id; + unsigned long hot_temp; + int hot_trip_id; + struct active_trip act_trips[ACPI_ACTIVE_COOLING_MAX_NR]; + acpi_handle *handle; +}; + +static int int3402_thermal_get_zone_temp(struct thermal_zone_device *zone, + unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(d->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -ENODEV; + + /* _TMP returns the temperature in tenths of degrees Kelvin */ + *temp = DECI_KELVIN_TO_MILLICELSIUS(tmp); + + return 0; +} + +static int int3402_thermal_get_trip_temp(struct thermal_zone_device *zone, + int trip, unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *temp = d->aux_trips[trip]; + else if (trip == d->crt_trip_id) + *temp = d->crt_temp; + else if (trip == d->psv_trip_id) + *temp = d->psv_temp; + else if (trip == d->hot_trip_id) + *temp = d->hot_temp; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *temp = d->act_trips[i].temp; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_get_trip_type(struct thermal_zone_device *zone, + int trip, enum thermal_trip_type *type) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *type = THERMAL_TRIP_PASSIVE; + else if (trip == d->crt_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else if (trip == d->hot_trip_id) + *type = THERMAL_TRIP_HOT; + else if (trip == d->psv_trip_id) + *type = THERMAL_TRIP_PASSIVE; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *type = THERMAL_TRIP_ACTIVE; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_set_trip_temp(struct thermal_zone_device *zone, int trip, + unsigned long temp) +{ + struct int3402_thermal_data *d = zone->devdata; + acpi_status status; + char name[10]; + + snprintf(name, sizeof(name), "PAT%d", trip); + status = acpi_execute_simple_method(d->handle, name, + MILLICELSIUS_TO_DECI_KELVIN(temp)); + if (ACPI_FAILURE(status)) + return -EIO; + + d->aux_trips[trip] = temp; + return 0; +} + +static struct thermal_zone_device_ops int3402_thermal_zone_ops = { + .get_temp = int3402_thermal_get_zone_temp, + .get_trip_temp = int3402_thermal_get_trip_temp, + .get_trip_type = int3402_thermal_get_trip_type, + .set_trip_temp = int3402_thermal_set_trip_temp, +}; + +static struct thermal_zone_params int3402_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static int int3402_thermal_get_temp(acpi_handle handle, char *name, + unsigned long *temp) +{ + unsigned long long r; + acpi_status status; + + status = acpi_evaluate_integer(handle, name, NULL, &r); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLICELSIUS(r); + return 0; +} + +static int int3402_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3402_thermal_data *d; + struct thermal_zone_device *zone; + acpi_status status; + unsigned long long trip_cnt; + int trip_mask = 0, i; + + if (!acpi_has_method(adev->handle, "_TMP")) + return -ENODEV; + + d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + status = acpi_evaluate_integer(adev->handle, "PATC", NULL, &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + else { + d->aux_trips = devm_kzalloc(&pdev->dev, + sizeof(*d->aux_trips) * trip_cnt, GFP_KERNEL); + if (!d->aux_trips) + return -ENOMEM; + trip_mask = trip_cnt - 1; + d->handle = adev->handle; + d->aux_trip_nr = trip_cnt; + } + + d->crt_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_CRT", &d->crt_temp)) + d->crt_trip_id = trip_cnt++; + d->hot_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_HOT", &d->hot_temp)) + d->hot_trip_id = trip_cnt++; + d->psv_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_PSV", &d->psv_temp)) + d->psv_trip_id = trip_cnt++; + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + char name[5] = { '_', 'A', 'C', '0' + i, '\0' }; + if (int3402_thermal_get_temp(adev->handle, name, + &d->act_trips[i].temp)) + break; + d->act_trips[i].id = trip_cnt++; + d->act_trips[i].valid = true; + } + + zone = thermal_zone_device_register(acpi_device_bid(adev), trip_cnt, + trip_mask, d, + &int3402_thermal_zone_ops, + &int3402_thermal_params, + 0, 0); + if (IS_ERR(zone)) + return PTR_ERR(zone); + platform_set_drvdata(pdev, zone); + + return 0; +} + +static int int3402_thermal_remove(struct platform_device *pdev) +{ + struct thermal_zone_device *zone = platform_get_drvdata(pdev); + + thermal_zone_device_unregister(zone); + return 0; +} + +static const struct acpi_device_id int3402_thermal_match[] = { + {"INT3402", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3402_thermal_match); + +static struct platform_driver int3402_thermal_driver = { + .probe = int3402_thermal_probe, + .remove = int3402_thermal_remove, + .driver = { + .name = "int3402 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3402_thermal_match, + }, +}; + +module_platform_driver(int3402_thermal_driver); + +MODULE_DESCRIPTION("INT3402 Thermal driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 79ce6b94884a..ef90838b36a0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -46,6 +46,8 @@ #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) #define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) #define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) +#define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) +#define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit From 174e964ec224c3c591b83a6b5f0984d905d3678f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 9 Oct 2014 12:43:27 -0700 Subject: regulator: Include err.h from consumer.h to fix build failure sh:sh2007_defconfig fails to build with the following error: In file included from include/linux/regulator/machine.h:18:0, from arch/sh/boards/board-sh2007.c:10: include/linux/regulator/consumer.h: In function 'regulator_get_optional': include/linux/regulator/consumer.h:271:2: error: implicit declaration of function 'ERR_PTR' include/linux/err.h: At top level: include/linux/err.h:23:35: error: conflicting types for 'ERR_PTR' include/linux/regulator/consumer.h:271:9: note: previous implicit declaration of 'ERR_PTR' was here Since consumer.h uses ERR_PTR, it should include err.h. Signed-off-by: Guenter Roeck Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index d347c805f923..f540b1496e2f 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -35,6 +35,8 @@ #ifndef __LINUX_REGULATOR_CONSUMER_H_ #define __LINUX_REGULATOR_CONSUMER_H_ +#include + struct device; struct notifier_block; struct regmap; -- cgit From 7210e4e38f945dfa173c4a4e59ad827c9ecad541 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Oct 2014 19:50:22 +0200 Subject: netfilter: nf_tables: restrict nat/masq expressions to nat chain type This adds the missing validation code to avoid the use of nat/masq from non-nat chains. The validation assumes two possible configuration scenarios: 1) Use of nat from base chain that is not of nat type. Reject this configuration from the nft_*_init() path of the expression. 2) Use of nat from non-base chain. In this case, we have to wait until the non-base chain is referenced by at least one base chain via jump/goto. This is resolved from the nft_*_validate() path which is called from nf_tables_check_loops(). The user gets an -EOPNOTSUPP in both cases. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/net/netfilter/nft_masq.h | 3 +++ net/ipv4/netfilter/nft_masq_ipv4.c | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 1 + net/netfilter/nf_tables_api.c | 14 ++++++++++++++ net/netfilter/nft_masq.c | 12 ++++++++++++ net/netfilter/nft_nat.c | 12 ++++++++++++ 7 files changed, 46 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3d7292392fac..845c596bf594 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -530,6 +530,9 @@ enum nft_chain_type { NFT_CHAIN_T_MAX }; +int nft_chain_validate_dependency(const struct nft_chain *chain, + enum nft_chain_type type); + struct nft_stats { u64 bytes; u64 pkts; diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h index c72729f954f4..e2a518b60e19 100644 --- a/include/net/netfilter/nft_masq.h +++ b/include/net/netfilter/nft_masq.h @@ -13,4 +13,7 @@ int nft_masq_init(const struct nft_ctx *ctx, int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + #endif /* _NFT_MASQ_H_ */ diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 1c636d6b5b50..c1023c445920 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .eval = nft_masq_ipv4_eval, .init = nft_masq_init, .dump = nft_masq_dump, + .validate = nft_masq_validate, }; static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 556262f40761..8a7ac685076d 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = { .eval = nft_masq_ipv6_eval, .init = nft_masq_init, .dump = nft_masq_dump, + .validate = nft_masq_validate, }; static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 556a0dfa4abc..65eb2a1160d5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3744,6 +3744,20 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .abort = nf_tables_abort, }; +int nft_chain_validate_dependency(const struct nft_chain *chain, + enum nft_chain_type type) +{ + const struct nft_base_chain *basechain; + + if (chain->flags & NFT_BASE_CHAIN) { + basechain = nft_base_chain(chain); + if (basechain->type->type != type) + return -EOPNOTSUPP; + } + return 0; +} +EXPORT_SYMBOL_GPL(nft_chain_validate_dependency); + /* * Loop detection - walk through the ruleset beginning at the destination chain * of a new jump until either the source chain is reached (loop) or all diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 6637bab00567..d1ffd5eb3a9b 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -26,6 +26,11 @@ int nft_masq_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_masq *priv = nft_expr_priv(expr); + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; if (tb[NFTA_MASQ_FLAGS] == NULL) return 0; @@ -55,5 +60,12 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_masq_dump); +int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} +EXPORT_SYMBOL_GPL(nft_masq_validate); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 799550b476fb..0f0af6e86fb8 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -95,6 +95,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, u32 family; int err; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + if (tb[NFTA_NAT_TYPE] == NULL) return -EINVAL; @@ -205,6 +209,13 @@ nla_put_failure: return -1; } +static int nft_nat_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} + static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, @@ -212,6 +223,7 @@ static const struct nft_expr_ops nft_nat_ops = { .eval = nft_nat_eval, .init = nft_nat_init, .dump = nft_nat_dump, + .validate = nft_nat_validate, }; static struct nft_expr_type nft_nat_type __read_mostly = { -- cgit From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 26 Aug 2014 23:16:35 -0400 Subject: random: add and use memzero_explicit() for clearing data zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) memset() calls which clear out sensitive data in extract_{buf,entropy, entropy_user}() in random driver are being optimized away by gcc. Add a helper memzero_explicit() (similarly as explicit_bzero() variants) that can be used in such cases where a variable with sensitive data is being cleared out in the end. Other use cases might also be in crypto code. [ I have put this into lib/string.c though, as it's always built-in and doesn't need any dependencies then. ] Fixes kernel bugzilla: 82041 Reported-by: zatimend@hotmail.co.uk Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Cc: Alexey Dobriyan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- drivers/char/random.c | 8 ++++---- include/linux/string.h | 5 +++-- lib/string.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index c18d41db83d8..8c86a95203a0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __mix_pool_bytes(r, hash.w, sizeof(hash.w)); spin_unlock_irqrestore(&r->lock, flags); - memset(workspace, 0, sizeof(workspace)); + memzero_explicit(workspace, sizeof(workspace)); /* * In case the hash function has some recognizable output @@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) hash.w[2] ^= rol32(hash.w[2], 16); memcpy(out, &hash, EXTRACT_SIZE); - memset(&hash, 0, sizeof(hash)); + memzero_explicit(&hash, sizeof(hash)); } /* @@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } @@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..3b42b3732da6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available); + const void *from, size_t available); /** * strstarts - does @str start with @prefix? @@ -144,7 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } -extern size_t memweight(const void *ptr, size_t bytes); +size_t memweight(const void *ptr, size_t bytes); +void memzero_explicit(void *s, size_t count); /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index 992bf30af759..3a3120452a1d 100644 --- a/lib/string.c +++ b/lib/string.c @@ -604,6 +604,22 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + OPTIMIZER_HIDE_VAR(s); +} +EXPORT_SYMBOL(memzero_explicit); + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- cgit From 70f3ce0510afdad7cbaf27ab7ab961377205c782 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 29 Sep 2014 11:47:54 +0200 Subject: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers currently call spi_nor_match_id() and then spi_nor_scan(). This adds a dependency on struct spi_device_id which we want to avoid. Make spi_nor_scan() do it for them. Signed-off-by: Ben Hutchings Signed-off-by: Rafał Miłecki Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 4 +--- drivers/mtd/spi-nor/fsl-quadspi.c | 7 +------ drivers/mtd/spi-nor/spi-nor.c | 13 +++++++++---- include/linux/mtd/spi-nor.h | 20 +++----------------- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 822209d10689..bd5e4c6edfd4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -193,7 +193,6 @@ static int m25p_probe(struct spi_device *spi) { struct mtd_part_parser_data ppdata; struct flash_platform_data *data; - const struct spi_device_id *id = NULL; struct m25p *flash; struct spi_nor *nor; enum read_mode mode = SPI_NOR_NORMAL; @@ -241,8 +240,7 @@ static int m25p_probe(struct spi_device *spi) else flash_name = spi->modalias; - id = spi_nor_match_id(flash_name); - ret = spi_nor_scan(nor, id, mode); + ret = spi_nor_scan(nor, flash_name, mode); if (ret) return ret; diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 8d659a2888d5..d5269a26c839 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -881,7 +881,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* iterate the subnodes. */ for_each_available_child_of_node(dev->of_node, np) { - const struct spi_device_id *id; char modalias[40]; /* skip the holes */ @@ -909,10 +908,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) if (of_modalias_node(np, modalias, sizeof(modalias)) < 0) goto map_failed; - id = spi_nor_match_id(modalias); - if (!id) - goto map_failed; - ret = of_property_read_u32(np, "spi-max-frequency", &q->clk_rate); if (ret < 0) @@ -921,7 +916,7 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* set the chip address for READID */ fsl_qspi_set_base_addr(q, nor); - ret = spi_nor_scan(nor, id, SPI_NOR_QUAD); + ret = spi_nor_scan(nor, modalias, SPI_NOR_QUAD); if (ret) goto map_failed; diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index ae16aa2f6885..5c8e39977bc5 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -28,6 +28,8 @@ #define JEDEC_MFR(_jedec_id) ((_jedec_id) >> 16) +static const struct spi_device_id *spi_nor_match_id(const char *name); + /* * Read the status register, returning its value in the location * Return the status register value. @@ -911,9 +913,9 @@ static int spi_nor_check(struct spi_nor *nor) return 0; } -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode) +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode) { + const struct spi_device_id *id = NULL; struct flash_info *info; struct device *dev = nor->dev; struct mtd_info *mtd = nor->mtd; @@ -925,6 +927,10 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, if (ret) return ret; + id = spi_nor_match_id(name); + if (!id) + return -ENOENT; + info = (void *)id->driver_data; if (info->jedec_id) { @@ -1113,7 +1119,7 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, } EXPORT_SYMBOL_GPL(spi_nor_scan); -const struct spi_device_id *spi_nor_match_id(char *name) +static const struct spi_device_id *spi_nor_match_id(const char *name) { const struct spi_device_id *id = spi_nor_ids; @@ -1124,7 +1130,6 @@ const struct spi_device_id *spi_nor_match_id(char *name) } return NULL; } -EXPORT_SYMBOL_GPL(spi_nor_match_id); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huang Shijie "); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 9e6294f32ba8..a5a7a086748d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -187,32 +187,18 @@ struct spi_nor { /** * spi_nor_scan() - scan the SPI NOR * @nor: the spi_nor structure - * @id: the spi_device_id provided by the driver + * @name: the chip type name * @mode: the read mode supported by the driver * * The drivers can use this fuction to scan the SPI NOR. * In the scanning, it will try to get all the necessary information to * fill the mtd_info{} and the spi_nor{}. * - * The board may assigns a spi_device_id with @id which be used to compared with - * the spi_device_id detected by the scanning. + * The chip type name can be provided through the @name parameter. * * Return: 0 for success, others for failure. */ -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode); +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); extern const struct spi_device_id spi_nor_ids[]; -/** - * spi_nor_match_id() - find the spi_device_id by the name - * @name: the name of the spi_device_id - * - * The drivers use this function to find the spi_device_id - * specified by the @name. - * - * Return: returns the right spi_device_id pointer on success, - * and returns NULL on failure. - */ -const struct spi_device_id *spi_nor_match_id(char *name); - #endif -- cgit From aa281ac631008b9c18c405c8880007789f659c7d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 19 Oct 2014 19:38:58 +0300 Subject: Boaz Harrosh - Fix broken email address I no longer have access to the Panasas email. So change to an email that can always reach me. Signed-off-by: Boaz Harrosh --- drivers/scsi/osd/Kbuild | 2 +- drivers/scsi/osd/Kconfig | 2 +- drivers/scsi/osd/osd_debug.h | 2 +- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osd/osd_uld.c | 4 ++-- fs/exofs/Kbuild | 2 +- fs/exofs/common.h | 2 +- fs/exofs/dir.c | 2 +- fs/exofs/exofs.h | 2 +- fs/exofs/file.c | 2 +- fs/exofs/inode.c | 2 +- fs/exofs/namei.c | 2 +- fs/exofs/ore.c | 4 ++-- fs/exofs/ore_raid.c | 2 +- fs/exofs/ore_raid.h | 2 +- fs/exofs/super.c | 2 +- fs/exofs/symlink.c | 2 +- fs/exofs/sys.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/objlayout/objlayout.c | 2 +- fs/nfs/objlayout/objlayout.h | 2 +- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 2 +- include/linux/pnfs_osd_xdr.h | 2 +- include/scsi/osd_initiator.h | 2 +- include/scsi/osd_ore.h | 2 +- include/scsi/osd_protocol.h | 4 ++-- include/scsi/osd_sec.h | 2 +- include/scsi/osd_sense.h | 2 +- include/scsi/osd_types.h | 2 +- 29 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/scsi/osd/Kbuild b/drivers/scsi/osd/Kbuild index 5fd73d77c3af..58cecd45b0f5 100644 --- a/drivers/scsi/osd/Kbuild +++ b/drivers/scsi/osd/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index a0703514eb0f..347cc5e33749 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_debug.h b/drivers/scsi/osd/osd_debug.h index 579e491f11df..26341261bb5c 100644 --- a/drivers/scsi/osd/osd_debug.h +++ b/drivers/scsi/osd/osd_debug.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 5f4cbf0c4759..52e243b77000 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -7,7 +7,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -57,7 +57,7 @@ enum { OSD_REQ_RETRIES = 1 }; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd initiator library libosd.ko"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c index e1d9a4c4c4b3..92cdd4b06526 100644 --- a/drivers/scsi/osd/osd_uld.c +++ b/drivers/scsi/osd/osd_uld.c @@ -10,7 +10,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -74,7 +74,7 @@ static const char osd_name[] = "osd"; static const char *osd_version_string = "open-osd 0.2.1"; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd Upper-Layer-Driver osd.ko"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(SCSI_OSD_MAJOR); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -4,7 +4,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * @@ -29,7 +29,7 @@ #include "ore_raid.h" -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -1,6 +1,6 @@ /* * Copyright (C) from 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2012 * Sachin Bhamare - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ae05278b3761..5cbd8cf9a5a5 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 697a16d11fac..ad988428162e 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fd13f1d2f136..3472e8390ba5 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -6,7 +6,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h index fe25876c1a5d..17d7d0d20eca 100644 --- a/include/linux/pnfs_osd_xdr.h +++ b/include/linux/pnfs_osd_xdr.h @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index b2e85fdd2ae0..a09cca829082 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 6ca3265a4dca..7a8d2cd30328 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * Public Declarations of the ORE API * diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h index a2594afe05c7..e0ca835e7bf7 100644 --- a/include/scsi/osd_protocol.h +++ b/include/scsi/osd_protocol.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -496,7 +496,7 @@ struct osd_timestamp { */ struct osd_key_identifier { - u8 id[7]; /* if you know why 7 please email bharrosh@panasas.com */ + u8 id[7]; /* if you know why 7 please email ooo@electrozaur.com */ } __packed; /* for osd_capability.format */ diff --git a/include/scsi/osd_sec.h b/include/scsi/osd_sec.h index f96151c9c9e8..7abeb0f0db30 100644 --- a/include/scsi/osd_sec.h +++ b/include/scsi/osd_sec.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_sense.h b/include/scsi/osd_sense.h index 91db543a5502..d52aa93a0b2d 100644 --- a/include/scsi/osd_sense.h +++ b/include/scsi/osd_sense.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_types.h b/include/scsi/osd_types.h index bd0be7ed4bcf..48e8a165e136 100644 --- a/include/scsi/osd_types.h +++ b/include/scsi/osd_types.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify -- cgit From 4846e3784585173f48e267b76f968bcb4a12d3b2 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 9 Sep 2014 22:18:31 +0200 Subject: watchdog: simplify definitions of WATCHDOG_NOWAYOUT(_INIT_STATUS)? Signed-off-by: Uwe Kleine-K=C3=B6nig Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 2a3038ee17a3..395b70e0eccf 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -97,13 +97,8 @@ struct watchdog_device { #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ }; -#ifdef CONFIG_WATCHDOG_NOWAYOUT -#define WATCHDOG_NOWAYOUT 1 -#define WATCHDOG_NOWAYOUT_INIT_STATUS (1 << WDOG_NO_WAY_OUT) -#else -#define WATCHDOG_NOWAYOUT 0 -#define WATCHDOG_NOWAYOUT_INIT_STATUS 0 -#endif +#define WATCHDOG_NOWAYOUT IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT) +#define WATCHDOG_NOWAYOUT_INIT_STATUS (WATCHDOG_NOWAYOUT << WDOG_NO_WAY_OUT) /* Use the following function to check whether or not the watchdog is active */ static inline bool watchdog_active(struct watchdog_device *wdd) -- cgit From f974008f07a62171a9dede08250c9a35c2b2b986 Mon Sep 17 00:00:00 2001 From: Olivier Gay Date: Sat, 18 Oct 2014 01:53:39 +0200 Subject: HID: add keyboard input assist hid usages Add keyboard input assist controls usages from approved hid usage table request HUTTR42: http://www.usb.org/developers/hidpage/HUTRR42c.pdf Signed-off-by: Olivier Gay Acked-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-debug.c | 6 ++++++ drivers/hid/hid-input.c | 7 +++++++ include/uapi/linux/input.h | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 84c3cb15ccdd..8bf61d295ffd 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -946,6 +946,12 @@ static const char *keys[KEY_MAX + 1] = { [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", + [KEY_KBDINPUTASSIST_PREV] = "KbdInputAssistPrev", + [KEY_KBDINPUTASSIST_NEXT] = "KbdInputAssistNext", + [KEY_KBDINPUTASSIST_PREVGROUP] = "KbdInputAssistPrevGroup", + [KEY_KBDINPUTASSIST_NEXTGROUP] = "KbdInputAssistNextGroup", + [KEY_KBDINPUTASSIST_ACCEPT] = "KbdInputAssistAccept", + [KEY_KBDINPUTASSIST_CANCEL] = "KbdInputAssistCancel", }; static const char *relatives[REL_MAX + 1] = { diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 2df7fddbd119..56c6c30f2c7d 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -862,6 +862,13 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x28b: map_key_clear(KEY_FORWARDMAIL); break; case 0x28c: map_key_clear(KEY_SEND); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; + case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; + case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; + case 0x2ca: map_key_clear(KEY_KBDINPUTASSIST_NEXTGROUP); break; + case 0x2cb: map_key_clear(KEY_KBDINPUTASSIST_ACCEPT); break; + case 0x2cc: map_key_clear(KEY_KBDINPUTASSIST_CANCEL); break; + default: goto ignore; } break; diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 1874ebe9ac1e..a1d7e931ab72 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -739,6 +739,13 @@ struct input_keymap_entry { #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ +#define KEY_KBDINPUTASSIST_PREV 0x260 +#define KEY_KBDINPUTASSIST_NEXT 0x261 +#define KEY_KBDINPUTASSIST_PREVGROUP 0x262 +#define KEY_KBDINPUTASSIST_NEXTGROUP 0x263 +#define KEY_KBDINPUTASSIST_ACCEPT 0x264 +#define KEY_KBDINPUTASSIST_CANCEL 0x265 + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 -- cgit From 5f8b35b6330db14d15fb385cc7b2ccca53dc323e Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:39:24 +0800 Subject: ACPICA: Add string for _DDN method name. The _DDN method will be used internally. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/acnames.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index f97804bdf1ff..7461327e14e4 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -52,6 +52,7 @@ #define METHOD_NAME__CBA "_CBA" #define METHOD_NAME__CID "_CID" #define METHOD_NAME__CRS "_CRS" +#define METHOD_NAME__DDN "_DDN" #define METHOD_NAME__HID "_HID" #define METHOD_NAME__INI "_INI" #define METHOD_NAME__PLD "_PLD" -- cgit From a08f813e58169a8edd01e13d73f60d8561f3ecea Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:57 +0800 Subject: ACPICA: Events: Reduce source code difference for the ACPI_EVENT_FLAG_HANDLE support. This patch is a partial linuxized result of the following ACPICA commit: ACPICA commit: a73b66c6aa1846d055bb6390d9c9b9902f7d804d Subject: Add "has handler" flag to event/gpe status interfaces. This change adds a new flag, ACPI_EVENT_FLAGS_HAS_HANDLER to the acpi_get_event_status and acpi_get_gpe_status external interfaces. It is set if the event/gpe currently has a handler associated with it. This commit back ports ACPI_EVENT_FLAG_HANDLE from Linux upstream to ACPICA, the flag along with its support code currently can only be found in the Linux upstream and is used by the ACPI sysfs GPE interfaces and the ACPI bus scanning support. Link: https://github.com/acpica/acpica/commit/a73b66c6 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfevnt.c | 40 ++++++++++++++++++++++++++-------------- drivers/acpi/acpica/evxfgpe.c | 3 --- drivers/acpi/acpica/hwgpe.c | 7 +++++++ include/acpi/actypes.h | 4 ++-- 4 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index e286640ad4ff..a8a077c97187 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -324,8 +324,9 @@ ACPI_EXPORT_SYMBOL(acpi_clear_event) ******************************************************************************/ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) { - acpi_status status = AE_OK; - u32 value; + acpi_status status; + acpi_event_status local_event_status = 0; + u32 in_byte; ACPI_FUNCTION_TRACE(acpi_get_event_status); @@ -339,29 +340,40 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) return_ACPI_STATUS(AE_BAD_PARAMETER); } - /* Get the status of the requested fixed event */ + /* Fixed event currently can be dispatched? */ + + if (acpi_gbl_fixed_event_handlers[event].handler) { + local_event_status |= ACPI_EVENT_FLAG_HANDLE; + } + + /* Fixed event currently enabled? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, &value); - if (ACPI_FAILURE(status)) + enable_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - *event_status = value; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_ENABLED; + } + + /* Fixed event currently active? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - status_register_id, &value); - if (ACPI_FAILURE(status)) + status_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - if (value) - *event_status |= ACPI_EVENT_FLAG_SET; - - if (acpi_gbl_fixed_event_handlers[event].handler) - *event_status |= ACPI_EVENT_FLAG_HANDLE; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_SET; + } - return_ACPI_STATUS(status); + (*event_status) = local_event_status; + return_ACPI_STATUS(AE_OK); } ACPI_EXPORT_SYMBOL(acpi_get_event_status) diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 2529d3c1c6b7..e889a5304abd 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -523,9 +523,6 @@ acpi_get_gpe_status(acpi_handle gpe_device, status = acpi_hw_get_gpe_status(gpe_event_info, event_status); - if (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) - *event_status |= ACPI_EVENT_FLAG_HANDLE; - unlock_and_exit: acpi_os_release_lock(acpi_gbl_gpe_lock, flags); return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index aa884d42361c..0302bc36287b 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -216,6 +216,13 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, return (AE_BAD_PARAMETER); } + /* GPE currently handled? */ + + if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) != + ACPI_GPE_DISPATCH_NONE) { + local_event_status |= ACPI_EVENT_FLAG_HANDLE; + } + /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index ac03ec81d342..857830d8989b 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -721,7 +721,7 @@ typedef u32 acpi_event_type; * | | | +--- Enabled for wake? * | | +----- Set? * | +------- Has a handler? - * +----------- + * +------------- */ typedef u32 acpi_event_status; @@ -729,7 +729,7 @@ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 -#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 +#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 /* Actions for acpi_set_gpe, acpi_gpe_wakeup, acpi_hw_low_set_gpe */ -- cgit From 2f8572344e65296d13c1a771cacfea60916d91dc Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:40:05 +0800 Subject: ACPICA: Events: Reduce source code difference for the ACPI_EVENT_FLAG_HANDLE renaming. This patch is partial linuxized result of the following ACPICA commit: ACPICA commit: a73b66c6aa1846d055bb6390d9c9b9902f7d804d Subject: Add "has handler" flag to event/gpe status interfaces. This change adds a new flag, ACPI_EVENT_FLAGS_HAS_HANDLER to the acpi_get_event_status and acpi_get_gpe_status external interfaces. It is set if the event/gpe currently has a handler associated with it. This patch contains the code to rename ACPI_EVENT_FLAG_HANDLE to ACPI_EVENT_FLAG_HAS_HANDLER, and the corresponding updates of its usages. Link: https://github.com/acpica/acpica/commit/a73b66c6 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfevnt.c | 2 +- drivers/acpi/acpica/hwgpe.c | 2 +- drivers/acpi/scan.c | 2 +- drivers/acpi/sysfs.c | 4 ++-- include/acpi/actypes.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index a8a077c97187..bb8cbf5961bf 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -343,7 +343,7 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) /* Fixed event currently can be dispatched? */ if (acpi_gbl_fixed_event_handlers[event].handler) { - local_event_status |= ACPI_EVENT_FLAG_HANDLE; + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; } /* Fixed event currently enabled? */ diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 0302bc36287b..48ac7b7b59cd 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -220,7 +220,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) != ACPI_GPE_DISPATCH_NONE) { - local_event_status |= ACPI_EVENT_FLAG_HANDLE; + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; } /* Get the info block for the entire GPE register */ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ae44d8654c82..f1d96e7519cb 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1470,7 +1470,7 @@ static void acpi_wakeup_gpe_init(struct acpi_device *device) if (ACPI_FAILURE(status)) return; - wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HANDLE); + wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HAS_HANDLER); } static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 38cb9782d4b8..13e577c80201 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -537,7 +537,7 @@ static ssize_t counter_show(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) size += sprintf(buf + size, " invalid"); else if (status & ACPI_EVENT_FLAG_ENABLED) size += sprintf(buf + size, " enabled"); @@ -581,7 +581,7 @@ static ssize_t counter_set(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) { + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) { printk(KERN_WARNING PREFIX "Can not change Invalid GPE/Fixed Event status\n"); return -EINVAL; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 857830d8989b..7000e66f768e 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -729,7 +729,7 @@ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 -#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 +#define ACPI_EVENT_FLAG_HAS_HANDLER (acpi_event_status) 0x08 /* Actions for acpi_set_gpe, acpi_gpe_wakeup, acpi_hw_low_set_gpe */ -- cgit From 9fc3d1d09cf7f81d5775712dc64c3db4862ee59d Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:40:28 +0800 Subject: ACPICA: Update version to 20140926. Version 20140926. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9fc1d71c82bc..ab2acf629a64 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20140828 +#define ACPI_CA_VERSION 0x20140926 #include #include -- cgit From 51315cdfa0521fff3059cec5fb8ffecc7f37cba7 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:27 +0200 Subject: cpufreq: allow driver-specific data This commit extends the cpufreq_driver structure with an additional 'void *driver_data' field that can be filled by the ->probe() function of a cpufreq driver to pass additional custom information to the driver itself. A new function called cpufreq_get_driver_data() is added to allow a cpufreq driver to retrieve those driver data, since they are typically needed from a cpufreq_policy->init() callback, which does not have access to the cpufreq_driver structure. This function call is similar to the existing cpufreq_get_current_driver() function call. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 15 +++++++++++++++ include/linux/cpufreq.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 24bf76fba141..058d6e084a6d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1731,6 +1731,21 @@ const char *cpufreq_get_current_driver(void) } EXPORT_SYMBOL_GPL(cpufreq_get_current_driver); +/** + * cpufreq_get_driver_data - return current driver data + * + * Return the private data of the currently loaded cpufreq + * driver, or NULL if no cpufreq driver is loaded. + */ +void *cpufreq_get_driver_data(void) +{ + if (cpufreq_driver) + return cpufreq_driver->driver_data; + + return NULL; +} +EXPORT_SYMBOL_GPL(cpufreq_get_driver_data); + /********************************************************************* * NOTIFIER LISTS INTERFACE * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 138336b6bb04..503b085b7832 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -219,6 +219,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; u8 flags; + void *driver_data; /* needed by all drivers */ int (*init) (struct cpufreq_policy *policy); @@ -312,6 +313,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); const char *cpufreq_get_current_driver(void); +void *cpufreq_get_driver_data(void); static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) -- cgit From 34e5a5273d6aa0ee8836bd5d6111b135ffae6931 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:28 +0200 Subject: cpufreq: cpufreq-dt: extend with platform_data This commit extends the cpufreq-dt driver to take a platform_data structure. This structure is for now used to tell the cpufreq-dt driver the layout of the clocks on the platform, i.e whether all CPUs share the same clock or whether each CPU has a separate clock. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 17 +++++++++++++++-- include/linux/cpufreq-dt.h | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 include/linux/cpufreq-dt.h (limited to 'include') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 6bbb8b913446..52facdaf531e 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,7 @@ try_again: static int cpufreq_init(struct cpufreq_policy *policy) { + struct cpufreq_dt_platform_data *pd; struct cpufreq_frequency_table *freq_table; struct thermal_cooling_device *cdev; struct device_node *np; @@ -265,9 +267,18 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->driver_data = priv; policy->clk = cpu_clk; - ret = cpufreq_generic_init(policy, freq_table, transition_latency); - if (ret) + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__, + ret); goto out_cooling_unregister; + } + + policy->cpuinfo.transition_latency = transition_latency; + + pd = cpufreq_get_driver_data(); + if (pd && !pd->independent_clocks) + cpumask_setall(policy->cpus); of_node_put(np); @@ -335,6 +346,8 @@ static int dt_cpufreq_probe(struct platform_device *pdev) if (!IS_ERR(cpu_reg)) regulator_put(cpu_reg); + dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); + ret = cpufreq_register_driver(&dt_cpufreq_driver); if (ret) dev_err(cpu_dev, "failed register driver: %d\n", ret); diff --git a/include/linux/cpufreq-dt.h b/include/linux/cpufreq-dt.h new file mode 100644 index 000000000000..0414009e2c30 --- /dev/null +++ b/include/linux/cpufreq-dt.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Marvell + * Thomas Petazzoni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __CPUFREQ_DT_H__ +#define __CPUFREQ_DT_H__ + +struct cpufreq_dt_platform_data { + /* + * True when each CPU has its own clock to control its + * frequency, false when all CPUs are controlled by a single + * clock. + */ + bool independent_clocks; +}; + +#endif /* __CPUFREQ_DT_H__ */ -- cgit From a5b7616c55e188fe3d6ef686bef402d4703ecb62 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 30 Sep 2014 03:14:55 +0100 Subject: mtd: m25p80,spi-nor: Fix module aliases for m25p80 m25p80's device ID table is now spi_nor_ids, defined in spi-nor. The MODULE_DEVICE_TABLE() macro doesn't work with extern definitions, but its use was also removed at the same time. Now if m25p80 is built as a module it doesn't get the necessary aliases to be loaded automatically. A clean solution to this will involve defining the list of device IDs in spi-nor.h and removing struct spi_device_id from the spi-nor API, but this is quite a large change. As a quick fix suitable for stable, copy the device IDs back into m25p80. Fixes: 03e296f613af ("mtd: m25p80: use the SPI nor framework") Cc: # 3.16.x: 32f1b7c8352f: mtd: move support for struct flash_platform_data into m25p80 Cc: # 3.16.x: 90e55b3812a1: mtd: m25p80: get rid of spi_get_device_id Cc: # 3.16.x: 70f3ce0510af: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id Cc: # 3.16.x Signed-off-by: Ben Hutchings Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 52 ++++++++++++++++++++++++++++++++++++++++++- drivers/mtd/spi-nor/spi-nor.c | 3 +-- include/linux/mtd/spi-nor.h | 1 - 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index bd5e4c6edfd4..ed827cf894e4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -261,12 +261,62 @@ static int m25p_remove(struct spi_device *spi) } +/* + * XXX This needs to be kept in sync with spi_nor_ids. We can't share + * it with spi-nor, because if this is built as a module then modpost + * won't be able to read it and add appropriate aliases. + */ +static const struct spi_device_id m25p_ids[] = { + {"at25fs010"}, {"at25fs040"}, {"at25df041a"}, {"at25df321a"}, + {"at25df641"}, {"at26f004"}, {"at26df081a"}, {"at26df161a"}, + {"at26df321"}, {"at45db081d"}, + {"en25f32"}, {"en25p32"}, {"en25q32b"}, {"en25p64"}, + {"en25q64"}, {"en25qh128"}, {"en25qh256"}, + {"f25l32pa"}, + {"mr25h256"}, {"mr25h10"}, + {"gd25q32"}, {"gd25q64"}, + {"160s33b"}, {"320s33b"}, {"640s33b"}, + {"mx25l2005a"}, {"mx25l4005a"}, {"mx25l8005"}, {"mx25l1606e"}, + {"mx25l3205d"}, {"mx25l3255e"}, {"mx25l6405d"}, {"mx25l12805d"}, + {"mx25l12855e"},{"mx25l25635e"},{"mx25l25655e"},{"mx66l51235l"}, + {"mx66l1g55g"}, + {"n25q064"}, {"n25q128a11"}, {"n25q128a13"}, {"n25q256a"}, + {"n25q512a"}, {"n25q512ax3"}, {"n25q00"}, + {"pm25lv512"}, {"pm25lv010"}, {"pm25lq032"}, + {"s25sl032p"}, {"s25sl064p"}, {"s25fl256s0"}, {"s25fl256s1"}, + {"s25fl512s"}, {"s70fl01gs"}, {"s25sl12800"}, {"s25sl12801"}, + {"s25fl129p0"}, {"s25fl129p1"}, {"s25sl004a"}, {"s25sl008a"}, + {"s25sl016a"}, {"s25sl032a"}, {"s25sl064a"}, {"s25fl008k"}, + {"s25fl016k"}, {"s25fl064k"}, + {"sst25vf040b"},{"sst25vf080b"},{"sst25vf016b"},{"sst25vf032b"}, + {"sst25vf064c"},{"sst25wf512"}, {"sst25wf010"}, {"sst25wf020"}, + {"sst25wf040"}, + {"m25p05"}, {"m25p10"}, {"m25p20"}, {"m25p40"}, + {"m25p80"}, {"m25p16"}, {"m25p32"}, {"m25p64"}, + {"m25p128"}, {"n25q032"}, + {"m25p05-nonjedec"}, {"m25p10-nonjedec"}, {"m25p20-nonjedec"}, + {"m25p40-nonjedec"}, {"m25p80-nonjedec"}, {"m25p16-nonjedec"}, + {"m25p32-nonjedec"}, {"m25p64-nonjedec"}, {"m25p128-nonjedec"}, + {"m45pe10"}, {"m45pe80"}, {"m45pe16"}, + {"m25pe20"}, {"m25pe80"}, {"m25pe16"}, + {"m25px16"}, {"m25px32"}, {"m25px32-s0"}, {"m25px32-s1"}, + {"m25px64"}, + {"w25x10"}, {"w25x20"}, {"w25x40"}, {"w25x80"}, + {"w25x16"}, {"w25x32"}, {"w25q32"}, {"w25q32dw"}, + {"w25x64"}, {"w25q64"}, {"w25q128"}, {"w25q80"}, + {"w25q80bl"}, {"w25q128"}, {"w25q256"}, {"cat25c11"}, + {"cat25c03"}, {"cat25c09"}, {"cat25c17"}, {"cat25128"}, + { }, +}; +MODULE_DEVICE_TABLE(spi, m25p_ids); + + static struct spi_driver m25p80_driver = { .driver = { .name = "m25p80", .owner = THIS_MODULE, }, - .id_table = spi_nor_ids, + .id_table = m25p_ids, .probe = m25p_probe, .remove = m25p_remove, diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 5c8e39977bc5..c51ee52386a7 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -475,7 +475,7 @@ struct flash_info { * more nor chips. This current list focusses on newer chips, which * have been converging on command sets which including JEDEC ID. */ -const struct spi_device_id spi_nor_ids[] = { +static const struct spi_device_id spi_nor_ids[] = { /* Atmel -- some are (confusingly) marketed as "DataFlash" */ { "at25fs010", INFO(0x1f6601, 0, 32 * 1024, 4, SECT_4K) }, { "at25fs040", INFO(0x1f6604, 0, 64 * 1024, 8, SECT_4K) }, @@ -639,7 +639,6 @@ const struct spi_device_id spi_nor_ids[] = { { "cat25128", CAT25_INFO(2048, 8, 64, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, { }, }; -EXPORT_SYMBOL_GPL(spi_nor_ids); static const struct spi_device_id *spi_nor_read_id(struct spi_nor *nor) { diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index a5a7a086748d..046a0a2e4c4e 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -199,6 +199,5 @@ struct spi_nor { * Return: 0 for success, others for failure. */ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); -extern const struct spi_device_id spi_nor_ids[]; #endif -- cgit From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: OOM, PM: OOM killed task shouldn't escape PM suspend PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But OOM killer is allowed to kill an already frozen task in order to handle OOM situtation. In order to protect from late wake ups OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes. Reduce the race window by checking all tasks after OOM killer has been disabled. This is still not race free completely unfortunately because oom_killer_disable cannot stop an already ongoing OOM killer so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavy weight for this highly unlikely case. Introduce and check oom_kills counter which gets incremented early when the allocator enters __alloc_pages_may_oom path and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window since allocator checked oom_killer_disabled which is set by PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal. Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit From 9e8beeb79ded25c5c1986f80fb8a7f6815345d5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:58:48 -0600 Subject: audit: Remove "weak" from audit_classify_compat_syscall() declaration There's only one audit_classify_compat_syscall() definition, so it doesn't need to be weak. Remove the "weak" attribute from the audit_classify_compat_syscall() declaration. Signed-off-by: Bjorn Helgaas Acked-by: Richard Guy Briggs CC: AKASHI Takahiro --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 36dffeccebdb..e58fe7df8b9c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -90,7 +90,7 @@ extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; -extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall); +extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ -- cgit From 96a2adbc6f501996418da9f7afe39bf0e4d006a9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:09 -0600 Subject: clocksource: Remove "weak" from clocksource_default_clock() declaration kernel/time/jiffies.c provides a default clocksource_default_clock() definition explicitly marked "weak". arch/s390 provides its own definition intended to override the default, but the "weak" attribute on the declaration applied to the s390 definition as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the clocksource_default_clock() declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: f1b82746c1e9 ("clocksource: Cleanup clocksource selection") Signed-off-by: Bjorn Helgaas Acked-by: John Stultz Acked-by: Ingo Molnar CC: Daniel Lezcano CC: Martin Schwidefsky --- include/linux/clocksource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 653f0e2b6ca9..abcafaa20b86 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -287,7 +287,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -extern struct clocksource * __init __weak clocksource_default_clock(void); +extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -- cgit From 5ab03ac5aaa1f032e071f1b3dc433b7839359c03 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:41 -0600 Subject: vmcore: Remove "weak" from function declarations For the following functions: elfcorehdr_alloc() elfcorehdr_free() elfcorehdr_read() elfcorehdr_read_notes() remap_oldmem_pfn_range() fs/proc/vmcore.c provides default definitions explicitly marked "weak". arch/s390 provides its own definitions intended to override the default ones, but the "weak" attribute on the declarations applied to the s390 definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: be8a8d069e50 ("vmcore: introduce ELF header in new memory feature") Fixes: 9cb218131de1 ("vmcore: introduce remap_oldmem_pfn_range()") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton Acked-by: Vivek Goyal CC: Michael Holzheu --- include/linux/crash_dump.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 72ab536ad3de..3849fce7ecfe 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,14 +14,13 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; -extern int __weak elfcorehdr_alloc(unsigned long long *addr, - unsigned long long *size); -extern void __weak elfcorehdr_free(unsigned long long addr); -extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); -extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); -extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot); +extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); +extern void elfcorehdr_free(unsigned long long addr); +extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- cgit From 107bcc6d566cb40184068d888637f9aefe6252dd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:25 -0600 Subject: kgdb: Remove "weak" from kgdb_arch_pc() declaration kernel/debug/debug_core.c provides a default kgdb_arch_pc() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 688b744d8bc8 ("kgdb: fix signedness mixmatches, add statics, add declaration to header") Tested-by: Vineet Gupta # for ARC build Signed-off-by: Bjorn Helgaas Reviewed-by: Harvey Harrison --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6b06d378f3df..e465bb15912d 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -283,7 +283,7 @@ struct kgdb_io { extern struct kgdb_arch arch_kgdb_ops; -extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); +extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); -- cgit From e0a8400c6923a163265d52798cdd4c33f3f8ab5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:47 -0600 Subject: memory-hotplug: Remove "weak" from memory_block_size_bytes() declaration drivers/base/memory.c provides a default memory_block_size_bytes() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 41f107266b19 ("drivers: base: Add prototype declaration to the header file") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton CC: Rashika Kheria CC: Nathan Fontenot CC: Anton Blanchard CC: Heiko Carstens CC: Yinghai Lu --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index bb7384e3c3d8..8b8d8d12348e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -35,7 +35,7 @@ struct memory_block { }; int arch_get_memory_phys_device(unsigned long start_pfn); -unsigned long __weak memory_block_size_bytes(void); +unsigned long memory_block_size_bytes(void); /* These states are exposed to userspace as text strings in sysfs */ #define MEM_ONLINE (1<<0) /* exposed to userspace */ -- cgit From 271a9c35158910496f6fc3a635c2ed85df6be3d9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:01:03 -0600 Subject: uprobes: Remove "weak" from function declarations For the following interfaces: set_swbp() set_orig_insn() is_swbp_insn() is_trap_insn() uprobe_get_swbp_addr() arch_uprobe_ignore() arch_uprobe_copy_ixol() kernel/events/uprobes.c provides default definitions explicitly marked "weak". Some architectures provide their own definitions intended to override the defaults, but the "weak" attribute on the declarations applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Signed-off-by: Bjorn Helgaas Acked-by: Ingo Molnar Acked-by: Srikar Dronamraju CC: Victor Kamensky CC: Oleg Nesterov CC: David A. Long CC: Ananth N Mavinakayanahalli --- include/linux/uprobes.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f844c6b03ee..60beb5dc7977 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -98,11 +98,11 @@ struct uprobes_state { struct xol_area *xol_area; }; -extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); -extern bool __weak is_trap_insn(uprobe_opcode_t *insn); -extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern bool is_swbp_insn(uprobe_opcode_t *insn); +extern bool is_trap_insn(uprobe_opcode_t *insn); +extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); @@ -128,8 +128,8 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); -extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, +extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { -- cgit From 4aa7c6346be395bdf776f82bbb2e3e2bc60bdd2b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: add i_op->dentry_open() Add a new inode operation i_op->dentry_open(). This is for stacked filesystems that want to return a struct file from a different filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 7 +++++++ fs/namei.c | 9 ++++++--- fs/open.c | 23 +++++++++++++++++++++-- include/linux/fs.h | 4 ++++ 5 files changed, 40 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 94d93b1f8b53..b30753cbf431 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fceff7c00a3c..20bf204426ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -696,6 +697,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/fs/namei.c b/fs/namei.c index 43927d14db67..75306b3c9526 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3064,9 +3064,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..de92c13b58be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index a957d4366c24..5cf7f6759679 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1528,6 +1528,9 @@ struct inode_operations { umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); + + /* WARNING: probably going away soon, do not use! */ + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2040,6 +2043,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit From 1c118596a7682912106c80007102ce0184c77780 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export do_splice_direct() to modules Export do_splice_direct() to modules. Needed by overlay filesystem. Signed-off-by: Miklos Szeredi --- fs/internal.h | 6 ------ fs/splice.c | 1 + include/linux/fs.h | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/internal.h b/fs/internal.h index 9477f8f6aefc..0f0626a6997c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -138,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, */ extern int rw_verify_area(int, struct file *, const loff_t *, size_t); -/* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - /* * pipe.c */ diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5cf7f6759679..10ed65b2c31d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2456,6 +2456,9 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -- cgit From bd5d08569cc379f8366663a61558a9ce17c2e460 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export __inode_permission() to modules We need to be able to check inode permissions (but not filesystem implied permissions) for stackable filesystems. Expose this interface for overlayfs. Signed-off-by: Miklos Szeredi --- fs/internal.h | 1 - fs/namei.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/internal.h b/fs/internal.h index 0f0626a6997c..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -47,7 +47,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/fs/namei.c b/fs/namei.c index 75306b3c9526..d944f6db9b07 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions diff --git a/include/linux/fs.h b/include/linux/fs.h index 10ed65b2c31d..5419df70a835 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) -- cgit From c771d683a62e5d36bc46036f5c07f4f5bb7dda61 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi --- fs/namespace.c | 27 +++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330d..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 9262e4bf0cc3..c2c561dc0114 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, -- cgit From cbdf35bcb833bfd00f0925d7a9a33a21f41ea582 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: export check_sticky() It's already duplicated in btrfs and about to be used in overlayfs too. Move the sticky bit check to an inline helper and call the out-of-line helper only in the unlikly case of the sticky bit being set. Signed-off-by: Miklos Szeredi --- fs/btrfs/ioctl.c | 20 +------------------- fs/namei.c | 9 ++------- include/linux/fs.h | 9 +++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d2b76e29d3b..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,23 +765,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { diff --git a/fs/namei.c b/fs/namei.c index d944f6db9b07..77fd536106cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2384,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check diff --git a/include/linux/fs.h b/include/linux/fs.h index 5419df70a835..55cc0a319baa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2259,6 +2259,7 @@ extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { @@ -2745,6 +2746,14 @@ static inline int is_sxid(umode_t mode) return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } +static inline int check_sticky(struct inode *dir, struct inode *inode) +{ + if (!(dir->i_mode & S_ISVTX)) + return 0; + + return __check_sticky(dir, inode); +} + static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) -- cgit From 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: add whiteout support Whiteout isn't actually a new file type, but is represented as a char device (Linus's idea) with 0/0 device number. This has several advantages compared to introducing a new whiteout file type: - no userspace API changes (e.g. trivial to make backups of upper layer filesystem, without losing whiteouts) - no fs image format changes (you can boot an old kernel/fsck without whiteout support and things won't break) - implementation is trivial Signed-off-by: Miklos Szeredi --- fs/namei.c | 14 ++++++++++++++ include/linux/fs.h | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 77fd536106cb..d20191c0ebf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,6 +4346,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/include/linux/fs.h b/include/linux/fs.h index 55cc0a319baa..69118b3cb917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,6 +222,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +/* + * Whiteout is represented by a char device. The following constants define the + * mode and device number to use. + */ +#define WHITEOUT_MODE 0 +#define WHITEOUT_DEV 0 + /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. @@ -1398,6 +1405,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. @@ -1628,6 +1636,9 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ + (inode)->i_rdev == WHITEOUT_DEV) + /* * Inode state bits. Protected by inode->i_lock * -- cgit From 0d7a855526dd672e114aff2ac22b60fc6f155b08 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: vfs: add RENAME_WHITEOUT This adds a new RENAME_WHITEOUT flag. This flag makes rename() create a whiteout of source. The whiteout creation is atomic relative to the rename. Signed-off-by: Miklos Szeredi --- fs/namei.c | 8 ++++++-- include/uapi/linux/fs.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index d20191c0ebf5..42df664e95e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4209,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ca1a11bb4443..3735fa0a6784 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -37,6 +37,7 @@ #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ struct fstrim_range { __u64 start; -- cgit From 69c433ed2ecd2d3264efd7afec4439524b319121 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:39 +0200 Subject: fs: limit filesystem stacking depth Add a simple read-only counter to super_block that indicates how deep this is in the stack of filesystems. Previously ecryptfs was the only stackable filesystem and it explicitly disallowed multiple layers of itself. Overlayfs, however, can be stacked recursively and also may be stacked on top of ecryptfs or vice versa. To limit the kernel stack usage we must limit the depth of the filesystem stack. Initially the limit is set to 2. Signed-off-by: Miklos Szeredi --- fs/ecryptfs/main.c | 7 +++++++ fs/overlayfs/super.c | 9 +++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 27 insertions(+) (limited to 'include') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7dcc24e84417..08b704cebfc4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -677,6 +677,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } ufs->lower_namelen = statfs.f_namelen; + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 69118b3cb917..4e41a4a331bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -261,6 +261,12 @@ struct iattr { */ #include +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1273,6 +1279,11 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit From 607ec6a5abb637642e5b8199828f39ceae83cfb7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 24 Oct 2014 08:57:01 -0200 Subject: Revert "[media] v4l2-dv-timings: fix a sparse warning" Sparse got a fix for that. Also, it is suspected that reverting this patch might cause compilation breakages on userspace. So, revert it. This reverts commit 5c2cacc1028917168b0f7650008dceaa6f7e3fe2. Requested-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-dv-timings.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 6a0764c89fcb..6c8f159e416e 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -21,8 +21,17 @@ #ifndef _V4L2_DV_TIMINGS_H #define _V4L2_DV_TIMINGS_H +#if __GNUC__ < 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ < 6)) +/* Sadly gcc versions older than 4.6 have a bug in how they initialize + anonymous unions where they require additional curly brackets. + This violates the C1x standard. This workaround adds the curly brackets + if needed. */ #define V4L2_INIT_BT_TIMINGS(_width, args...) \ { .bt = { _width , ## args } } +#else +#define V4L2_INIT_BT_TIMINGS(_width, args...) \ + .bt = { _width , ## args } +#endif /* CEA-861-E timings (i.e. standard HDTV timings) */ -- cgit From 571ee1b6859869a09ed718d390aac2b9414646a2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 9 Oct 2014 18:30:08 +0800 Subject: kvm: vfio: fix unregister kvm_device_ops of vfio After commit 80ce163 (KVM: VFIO: register kvm_device_ops dynamically), kvm_device_ops of vfio can be registered dynamically. Commit 3c3c29fd (kvm-vfio: do not use module_init) move the dynamic register invoked by kvm_init in order to fix broke unloading of the kvm module. However, kvm_device_ops of vfio is unregistered after rmmod kvm-intel module which lead to device type collision detection warning after kvm-intel module reinsmod. WARNING: CPU: 1 PID: 10358 at /root/cathy/kvm/arch/x86/kvm/../../../virt/kvm/kvm_main.c:3289 kvm_init+0x234/0x282 [kvm]() Modules linked in: kvm_intel(O+) kvm(O) nfsv3 nfs_acl auth_rpcgss oid_registry nfsv4 dns_resolver nfs fscache lockd sunrpc pci_stub bridge stp llc autofs4 8021q cpufreq_ondemand ipv6 joydev microcode pcspkr igb i2c_algo_bit ehci_pci ehci_hcd e1000e i2c_i801 ixgbe ptp pps_core hwmon mdio tpm_tis tpm ipmi_si ipmi_msghandler acpi_cpufreq isci libsas scsi_transport_sas button dm_mirror dm_region_hash dm_log dm_mod [last unloaded: kvm_intel] CPU: 1 PID: 10358 Comm: insmod Tainted: G W O 3.17.0-rc1 #2 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 0000000000000cd9 ffff880ff08cfd18 ffffffff814a61d9 0000000000000cd9 0000000000000000 ffff880ff08cfd58 ffffffff810417b7 ffff880ff08cfd48 ffffffffa045bcac ffffffffa049c420 0000000000000040 00000000000000ff Call Trace: [] dump_stack+0x49/0x60 [] warn_slowpath_common+0x7c/0x96 [] ? kvm_init+0x234/0x282 [kvm] [] warn_slowpath_null+0x15/0x17 [] kvm_init+0x234/0x282 [kvm] [] vmx_init+0x1bf/0x42a [kvm_intel] [] ? vmx_check_processor_compat+0x64/0x64 [kvm_intel] [] do_one_initcall+0xe3/0x170 [] ? __vunmap+0xad/0xb8 [] do_init_module+0x2b/0x174 [] load_module+0x43e/0x569 [] ? do_init_module+0x174/0x174 [] ? copy_module_from_user+0x39/0x82 [] ? module_sect_show+0x20/0x20 [] SyS_init_module+0x54/0x81 [] system_call_fastpath+0x16/0x1b ---[ end trace 0626f4a3ddea56f3 ]--- The bug can be reproduced by: rmmod kvm_intel.ko insmod kvm_intel.ko without rmmod/insmod kvm.ko This patch fixes the bug by unregistering kvm_device_ops of vfio when the kvm-intel module is removed. Reported-by: Liu Rongrong Fixes: 3c3c29fd0d7cddc32862c350d0700ce69953e3bd Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 7 +++++++ virt/kvm/vfio.c | 5 +++++ virt/kvm/vfio.h | 4 ++++ 4 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 28be31f49250..ea53b04993f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1080,6 +1080,7 @@ void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); +void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 384eaa7b02fa..25ffac9e947d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2354,6 +2354,12 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) return 0; } +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -3328,5 +3334,6 @@ void kvm_exit(void) kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 281e7cf2b8e5..620e37f741b8 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -283,3 +283,8 @@ int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } + +void kvm_vfio_ops_exit(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); +} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h index 92eac75d6b62..ab88c7dc0514 100644 --- a/virt/kvm/vfio.h +++ b/virt/kvm/vfio.h @@ -3,11 +3,15 @@ #ifdef CONFIG_KVM_VFIO int kvm_vfio_ops_init(void); +void kvm_vfio_ops_exit(void); #else static inline int kvm_vfio_ops_init(void) { return 0; } +static inline void kvm_vfio_ops_exit(void) +{ +} #endif #endif -- cgit From a1fc198046181304d28a018dba048b718f2d7ce4 Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Tue, 14 Oct 2014 20:41:49 +0300 Subject: ARM: i.MX6: Fix "emi" clock name typo Fix a typo error, the "emi" names refer to the eim clocks. The change fixes typo in EIM and EIM_SLOW pre-output dividers and selectors clock names. Notably EIM_SLOW clock itself is named correctly. Signed-off-by: Steve Longerbeam [vladimir_zapolskiy@mentor.com: ported to v3.17] Signed-off-by: Vladimir Zapolskiy Cc: Sascha Hauer Signed-off-by: Shawn Guo --- arch/arm/mach-imx/clk-imx6q.c | 14 +++++++------- include/dt-bindings/clock/imx6qdl-clock.h | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-imx/clk-imx6q.c b/arch/arm/mach-imx/clk-imx6q.c index 1412daf4a714..4e79da7c5e30 100644 --- a/arch/arm/mach-imx/clk-imx6q.c +++ b/arch/arm/mach-imx/clk-imx6q.c @@ -50,8 +50,8 @@ static const char *pcie_axi_sels[] = { "axi", "ahb", }; static const char *ssi_sels[] = { "pll3_pfd2_508m", "pll3_pfd3_454m", "pll4_audio_div", }; static const char *usdhc_sels[] = { "pll2_pfd2_396m", "pll2_pfd0_352m", }; static const char *enfc_sels[] = { "pll2_pfd0_352m", "pll2_bus", "pll3_usb_otg", "pll2_pfd2_396m", }; -static const char *emi_sels[] = { "pll2_pfd2_396m", "pll3_usb_otg", "axi", "pll2_pfd0_352m", }; -static const char *emi_slow_sels[] = { "axi", "pll3_usb_otg", "pll2_pfd2_396m", "pll2_pfd0_352m", }; +static const char *eim_sels[] = { "pll2_pfd2_396m", "pll3_usb_otg", "axi", "pll2_pfd0_352m", }; +static const char *eim_slow_sels[] = { "axi", "pll3_usb_otg", "pll2_pfd2_396m", "pll2_pfd0_352m", }; static const char *vdo_axi_sels[] = { "axi", "ahb", }; static const char *vpu_axi_sels[] = { "axi", "pll2_pfd2_396m", "pll2_pfd0_352m", }; static const char *cko1_sels[] = { "pll3_usb_otg", "pll2_bus", "pll1_sys", "pll5_video_div", @@ -302,8 +302,8 @@ static void __init imx6q_clocks_init(struct device_node *ccm_node) clk[IMX6QDL_CLK_USDHC3_SEL] = imx_clk_fixup_mux("usdhc3_sel", base + 0x1c, 18, 1, usdhc_sels, ARRAY_SIZE(usdhc_sels), imx_cscmr1_fixup); clk[IMX6QDL_CLK_USDHC4_SEL] = imx_clk_fixup_mux("usdhc4_sel", base + 0x1c, 19, 1, usdhc_sels, ARRAY_SIZE(usdhc_sels), imx_cscmr1_fixup); clk[IMX6QDL_CLK_ENFC_SEL] = imx_clk_mux("enfc_sel", base + 0x2c, 16, 2, enfc_sels, ARRAY_SIZE(enfc_sels)); - clk[IMX6QDL_CLK_EMI_SEL] = imx_clk_fixup_mux("emi_sel", base + 0x1c, 27, 2, emi_sels, ARRAY_SIZE(emi_sels), imx_cscmr1_fixup); - clk[IMX6QDL_CLK_EMI_SLOW_SEL] = imx_clk_fixup_mux("emi_slow_sel", base + 0x1c, 29, 2, emi_slow_sels, ARRAY_SIZE(emi_slow_sels), imx_cscmr1_fixup); + clk[IMX6QDL_CLK_EIM_SEL] = imx_clk_fixup_mux("eim_sel", base + 0x1c, 27, 2, eim_sels, ARRAY_SIZE(eim_sels), imx_cscmr1_fixup); + clk[IMX6QDL_CLK_EIM_SLOW_SEL] = imx_clk_fixup_mux("eim_slow_sel", base + 0x1c, 29, 2, eim_slow_sels, ARRAY_SIZE(eim_slow_sels), imx_cscmr1_fixup); clk[IMX6QDL_CLK_VDO_AXI_SEL] = imx_clk_mux("vdo_axi_sel", base + 0x18, 11, 1, vdo_axi_sels, ARRAY_SIZE(vdo_axi_sels)); clk[IMX6QDL_CLK_VPU_AXI_SEL] = imx_clk_mux("vpu_axi_sel", base + 0x18, 14, 2, vpu_axi_sels, ARRAY_SIZE(vpu_axi_sels)); clk[IMX6QDL_CLK_CKO1_SEL] = imx_clk_mux("cko1_sel", base + 0x60, 0, 4, cko1_sels, ARRAY_SIZE(cko1_sels)); @@ -354,8 +354,8 @@ static void __init imx6q_clocks_init(struct device_node *ccm_node) clk[IMX6QDL_CLK_USDHC4_PODF] = imx_clk_divider("usdhc4_podf", "usdhc4_sel", base + 0x24, 22, 3); clk[IMX6QDL_CLK_ENFC_PRED] = imx_clk_divider("enfc_pred", "enfc_sel", base + 0x2c, 18, 3); clk[IMX6QDL_CLK_ENFC_PODF] = imx_clk_divider("enfc_podf", "enfc_pred", base + 0x2c, 21, 6); - clk[IMX6QDL_CLK_EMI_PODF] = imx_clk_fixup_divider("emi_podf", "emi_sel", base + 0x1c, 20, 3, imx_cscmr1_fixup); - clk[IMX6QDL_CLK_EMI_SLOW_PODF] = imx_clk_fixup_divider("emi_slow_podf", "emi_slow_sel", base + 0x1c, 23, 3, imx_cscmr1_fixup); + clk[IMX6QDL_CLK_EIM_PODF] = imx_clk_fixup_divider("eim_podf", "eim_sel", base + 0x1c, 20, 3, imx_cscmr1_fixup); + clk[IMX6QDL_CLK_EIM_SLOW_PODF] = imx_clk_fixup_divider("eim_slow_podf", "eim_slow_sel", base + 0x1c, 23, 3, imx_cscmr1_fixup); clk[IMX6QDL_CLK_VPU_AXI_PODF] = imx_clk_divider("vpu_axi_podf", "vpu_axi_sel", base + 0x24, 25, 3); clk[IMX6QDL_CLK_CKO1_PODF] = imx_clk_divider("cko1_podf", "cko1_sel", base + 0x60, 4, 3); clk[IMX6QDL_CLK_CKO2_PODF] = imx_clk_divider("cko2_podf", "cko2_sel", base + 0x60, 21, 3); @@ -456,7 +456,7 @@ static void __init imx6q_clocks_init(struct device_node *ccm_node) clk[IMX6QDL_CLK_USDHC2] = imx_clk_gate2("usdhc2", "usdhc2_podf", base + 0x80, 4); clk[IMX6QDL_CLK_USDHC3] = imx_clk_gate2("usdhc3", "usdhc3_podf", base + 0x80, 6); clk[IMX6QDL_CLK_USDHC4] = imx_clk_gate2("usdhc4", "usdhc4_podf", base + 0x80, 8); - clk[IMX6QDL_CLK_EIM_SLOW] = imx_clk_gate2("eim_slow", "emi_slow_podf", base + 0x80, 10); + clk[IMX6QDL_CLK_EIM_SLOW] = imx_clk_gate2("eim_slow", "eim_slow_podf", base + 0x80, 10); clk[IMX6QDL_CLK_VDO_AXI] = imx_clk_gate2("vdo_axi", "vdo_axi_sel", base + 0x80, 12); clk[IMX6QDL_CLK_VPU_AXI] = imx_clk_gate2("vpu_axi", "vpu_axi_podf", base + 0x80, 14); clk[IMX6QDL_CLK_CKO1] = imx_clk_gate("cko1", "cko1_podf", base + 0x60, 7); diff --git a/include/dt-bindings/clock/imx6qdl-clock.h b/include/dt-bindings/clock/imx6qdl-clock.h index ddaef8620b2c..b690cdba163b 100644 --- a/include/dt-bindings/clock/imx6qdl-clock.h +++ b/include/dt-bindings/clock/imx6qdl-clock.h @@ -62,8 +62,8 @@ #define IMX6QDL_CLK_USDHC3_SEL 50 #define IMX6QDL_CLK_USDHC4_SEL 51 #define IMX6QDL_CLK_ENFC_SEL 52 -#define IMX6QDL_CLK_EMI_SEL 53 -#define IMX6QDL_CLK_EMI_SLOW_SEL 54 +#define IMX6QDL_CLK_EIM_SEL 53 +#define IMX6QDL_CLK_EIM_SLOW_SEL 54 #define IMX6QDL_CLK_VDO_AXI_SEL 55 #define IMX6QDL_CLK_VPU_AXI_SEL 56 #define IMX6QDL_CLK_CKO1_SEL 57 @@ -106,8 +106,8 @@ #define IMX6QDL_CLK_USDHC4_PODF 94 #define IMX6QDL_CLK_ENFC_PRED 95 #define IMX6QDL_CLK_ENFC_PODF 96 -#define IMX6QDL_CLK_EMI_PODF 97 -#define IMX6QDL_CLK_EMI_SLOW_PODF 98 +#define IMX6QDL_CLK_EIM_PODF 97 +#define IMX6QDL_CLK_EIM_SLOW_PODF 98 #define IMX6QDL_CLK_VPU_AXI_PODF 99 #define IMX6QDL_CLK_CKO1_PODF 100 #define IMX6QDL_CLK_AXI 101 -- cgit From dda02fd6278d9e995850b3c1dba484f17cbe4de4 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 24 Oct 2014 17:47:57 +0800 Subject: mm, cma: make parameters order consistent in func declaration and definition In the current code, the base and size parameters order is not consistent in functions declaration and definition. If someone calls these functions according to the declaration parameters order in cma.h, he will run into some bug and it's hard to find the reason. This patch makes the parameters order consistent in functions declaration and definition. Signed-off-by: Weijie Yang Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- include/linux/cma.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 0430ed05d3b9..a93438beb33c 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -18,12 +18,12 @@ struct cma; extern phys_addr_t cma_get_base(struct cma *cma); extern unsigned long cma_get_size(struct cma *cma); -extern int __init cma_declare_contiguous(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, +extern int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); -extern int cma_init_reserved_mem(phys_addr_t size, - phys_addr_t base, int order_per_bit, +extern int cma_init_reserved_mem(phys_addr_t base, + phys_addr_t size, int order_per_bit, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); extern bool cma_release(struct cma *cma, struct page *pages, int count); -- cgit From e999dbc254044e8d2a5818d92d205f65bae28f37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Oct 2014 17:13:57 +0200 Subject: Revert "block: all blk-mq requests are tagged" This reverts commit fb3ccb5da71273e7f0d50b50bc879e50cedd60e7. SCSI-2/SPI actually needs the tagged/untagged flag in the request to work properly. Revert this patch and add a follow on to set it in the right place. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Jens Axboe Reported-by: Meelis Roos Tested-by: Meelis Roos Cc: stable@vger.kernel.org --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..51d0dc2259cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1136,8 +1136,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) /* * tag stuff */ -#define blk_rq_tagged(rq) \ - ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED)) +#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -- cgit From b1dd2aac4cc0892b82ec60232ed37e3b0af776cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Oct 2014 17:13:58 +0200 Subject: scsi: set REQ_QUEUE for the blk-mq case To generate the right SPI tag messages we need to properly set QUEUE_FLAG_QUEUED in the request_queue and mirror it to the request. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Jens Axboe Reported-by: Meelis Roos Tested-by: Meelis Roos Cc: stable@vger.kernel.org --- drivers/scsi/scsi_lib.c | 5 +++++ include/scsi/scsi_tcq.h | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9eff8a375132..50a6e1ac8d9c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1893,6 +1893,11 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, blk_mq_start_request(req); } + if (blk_queue_tagged(q)) + req->cmd_flags |= REQ_QUEUED; + else + req->cmd_flags &= ~REQ_QUEUED; + scsi_init_cmd_errh(cmd); cmd->scsi_done = scsi_mq_done; diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index e64583560701..56ed843969ca 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -67,8 +67,9 @@ static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth) if (!sdev->tagged_supported) return; - if (!shost_use_blk_mq(sdev->host) && - !blk_queue_tagged(sdev->request_queue)) + if (shost_use_blk_mq(sdev->host)) + queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, sdev->request_queue); + else if (!blk_queue_tagged(sdev->request_queue)) blk_queue_init_tags(sdev->request_queue, depth, sdev->host->bqt); @@ -81,8 +82,7 @@ static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth) **/ static inline void scsi_deactivate_tcq(struct scsi_device *sdev, int depth) { - if (!shost_use_blk_mq(sdev->host) && - blk_queue_tagged(sdev->request_queue)) + if (blk_queue_tagged(sdev->request_queue)) blk_queue_free_tags(sdev->request_queue); scsi_adjust_queue_depth(sdev, 0, depth); } -- cgit From fcd964dda5ece2fa77f78f843bc3455348787282 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 7 Oct 2014 17:29:07 +0800 Subject: sched: Update comments for CLONE_NEWNS Signed-off-by: Chen Hanxiao Acked-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1412674147-8941-1-git-send-email-chenhanxiao@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 34f9d7387d13..b932be9f5c5b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -13,7 +13,7 @@ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ -- cgit From b438b1ab35507bbccc28d13f0b8286ffcf24019d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Oct 2014 22:16:36 -0700 Subject: perf: Fix typos in sample code in the perf_event.h header struct perf_event_mmap_page has members called "index" and "cap_user_rdpmc". Spell them correctly in the examples. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/320ba26391a8123cc16e5f02d24d34bd404332fd.1412313343.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9269de254874..9d845404d875 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -364,7 +364,7 @@ struct perf_event_mmap_page { /* * Bits needed to read the hw events in user-space. * - * u32 seq, time_mult, time_shift, idx, width; + * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; * u64 cyc, time_offset; * s64 pmc = 0; @@ -383,11 +383,11 @@ struct perf_event_mmap_page { * time_shift = pc->time_shift; * } * - * idx = pc->index; + * index = pc->index; * count = pc->offset; - * if (pc->cap_usr_rdpmc && idx) { + * if (pc->cap_user_rdpmc && index) { * width = pc->pmc_width; - * pmc = rdpmc(idx - 1); + * pmc = rdpmc(index - 1); * } * * barrier(); @@ -415,7 +415,7 @@ struct perf_event_mmap_page { }; /* - * If cap_usr_rdpmc this field provides the bit-width of the value + * If cap_user_rdpmc this field provides the bit-width of the value * read using the rdpmc() or equivalent instruction. This can be used * to sign extend the result like: * @@ -439,10 +439,10 @@ struct perf_event_mmap_page { * * Where time_offset,time_mult,time_shift and cyc are read in the * seqcount loop described above. This delta can then be added to - * enabled and possible running (if idx), improving the scaling: + * enabled and possible running (if index), improving the scaling: * * enabled += delta; - * if (idx) + * if (index) * running += delta; * * quot = count / running; -- cgit From 5631b8fba640a4ab2f8a954f63a603fa34eda96b Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sat, 25 Oct 2014 15:09:42 -0700 Subject: compiler/gcc4+: Remove inaccurate comment about 'asm goto' miscompiles The bug referenced by the comment in this commit was not completely fixed in GCC 4.8.2, as I mentioned in a thread back in February: https://lkml.org/lkml/2014/2/12/797 The conclusion at that time was to make the quirk unconditional until the bug could be found and fixed in GCC. Unfortunately, when I submitted the patch (commit a9f18034) I left a comment in that claimed the bug was fixed in GCC 4.8.2+. This comment is inaccurate, and should be removed. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar Cc: Jakub Jelinek Cc: Richard Henderson Cc: Linus Torvalds Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1414274982-14040-1-git-send-email-steven@uplinklabs.net Cc: Ingo Molnar --- include/linux/compiler-gcc4.h | 1 - include/linux/compiler-gcc5.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 2507fd2a1eb4..d1a558239b1a 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -71,7 +71,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h index cdd1cc202d51..c8c565952548 100644 --- a/include/linux/compiler-gcc5.h +++ b/include/linux/compiler-gcc5.h @@ -53,7 +53,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ -- cgit From 8c3e434769b1707fd2d24de5a2eb25fedc634c4a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 26 Oct 2014 15:18:42 -0400 Subject: drm/radeon: remove invalid pci id 0x4c6e is a secondary device id so should not be used by the driver. Noticed-by: Mark Kettenis Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- include/drm/drm_pciids.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index e973540cd15b..2dd405c9be78 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -74,7 +74,6 @@ {0x1002, 0x4C64, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV250|RADEON_IS_MOBILITY}, \ {0x1002, 0x4C66, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV250|RADEON_IS_MOBILITY}, \ {0x1002, 0x4C67, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV250|RADEON_IS_MOBILITY}, \ - {0x1002, 0x4C6E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV280|RADEON_IS_MOBILITY}, \ {0x1002, 0x4E44, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R300}, \ {0x1002, 0x4E45, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R300}, \ {0x1002, 0x4E46, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R300}, \ -- cgit From d7e29933969e5ca7c112ce1368a07911f4485dc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 09:15:54 -0700 Subject: rcu: Make rcu_barrier() understand about missing rcuo kthreads Commit 35ce7f29a44a (rcu: Create rcuo kthreads only for onlined CPUs) avoids creating rcuo kthreads for CPUs that never come online. This fixes a bug in many instances of firmware: Instead of lying about their age, these systems instead lie about the number of CPUs that they have. Before commit 35ce7f29a44a, this could result in huge numbers of useless rcuo kthreads being created. It appears that experience indicates that I should have told the people suffering from this problem to fix their broken firmware, but I instead produced what turned out to be a partial fix. The missing piece supplied by this commit makes sure that rcu_barrier() knows not to post callbacks for no-CBs CPUs that have not yet come online, because otherwise rcu_barrier() will hang on systems having firmware that lies about the number of CPUs. It is tempting to simply have rcu_barrier() refuse to post a callback on any no-CBs CPU that does not have an rcuo kthread. This unfortunately does not work because rcu_barrier() is required to wait for all pending callbacks. It is therefore required to wait even for those callbacks that cannot possibly be invoked. Even if doing so hangs the system. Given that posting a callback to a no-CBs CPU that does not yet have an rcuo kthread can hang rcu_barrier(), It is tempting to report an error in this case. Unfortunately, this will result in false positives at boot time, when it is perfectly legal to post callbacks to the boot CPU before the scheduler has started, in other words, before it is legal to invoke rcu_barrier(). So this commit instead has rcu_barrier() avoid posting callbacks to CPUs having neither rcuo kthread nor pending callbacks, and has it complain bitterly if it finds CPUs having no rcuo kthread but some pending callbacks. And when rcu_barrier() does find CPUs having no rcuo kthread but pending callbacks, as noted earlier, it has no choice but to hang indefinitely. Reported-by: Yanko Kaneti Reported-by: Jay Vosburgh Reported-by: Meelis Roos Reported-by: Eric B Munson Signed-off-by: Paul E. McKenney Tested-by: Eric B Munson Tested-by: Jay Vosburgh Tested-by: Yanko Kaneti Tested-by: Kevin Fenzi Tested-by: Meelis Roos --- include/trace/events/rcu.h | 18 +++++++++--------- kernel/rcu/tree.c | 15 ++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9b56f37148cf..e335e7d8c6c2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -660,18 +660,18 @@ TRACE_EVENT(rcu_torture_read, /* * Tracepoint for _rcu_barrier() execution. The string "s" describes * the _rcu_barrier phase: - * "Begin": rcu_barrier_callback() started. - * "Check": rcu_barrier_callback() checking for piggybacking. - * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. - * "Inc1": rcu_barrier_callback() piggyback check counter incremented. - * "Offline": rcu_barrier_callback() found offline CPU - * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. - * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. - * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "Begin": _rcu_barrier() started. + * "Check": _rcu_barrier() checking for piggybacking. + * "EarlyExit": _rcu_barrier() piggybacked, thus early exit. + * "Inc1": _rcu_barrier() piggyback check counter incremented. + * "OfflineNoCB": _rcu_barrier() found callback on never-online CPU + * "OnlineNoCB": _rcu_barrier() found online no-CBs CPU. + * "OnlineQ": _rcu_barrier() found online CPU with callbacks. + * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. - * "Inc2": rcu_barrier_callback() piggyback check counter incremented. + * "Inc2": _rcu_barrier() piggyback check counter incremented. * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument * is the count of remaining callbacks, and "done" is the piggybacking count. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e47223095..9815447d22e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d03764652d91..bbdc45d8d74f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd4599344..c1d7f27bd38f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2049,6 +2049,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } } +/* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + /* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the @@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit From ebcf34f3d4be11f994340aff629f3c17171a4f65 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 26 Oct 2014 19:14:06 -0700 Subject: skbuff.h: fix kernel-doc warning for headers_end Fix kernel-doc warning in by making both headers_start and headers_end private fields. Warning(..//include/linux/skbuff.h:654): No description found for parameter 'headers_end[0]' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a59d9343c25b..5884f95ff0e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -557,7 +557,9 @@ struct sk_buff { /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ + /* private: */ __u32 headers_start[0]; + /* public: */ /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -642,7 +644,9 @@ struct sk_buff { __u16 network_header; __u16 mac_header; + /* private: */ __u32 headers_end[0]; + /* public: */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; -- cgit From 1efed2d06c703489342ab6af2951683e07509c99 Mon Sep 17 00:00:00 2001 From: Olivier Blin Date: Fri, 24 Oct 2014 19:43:00 +0200 Subject: usbnet: add a callback for set_rx_mode To delegate promiscuous mode and multicast filtering to the subdriver. Signed-off-by: Olivier Blin Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 20 ++++++++++++++++++++ include/linux/usb/usbnet.h | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 20615bbd693b..3a6770a65d78 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1052,6 +1052,21 @@ static void __handle_link_change(struct usbnet *dev) clear_bit(EVENT_LINK_CHANGE, &dev->flags); } +static void usbnet_set_rx_mode(struct net_device *net) +{ + struct usbnet *dev = netdev_priv(net); + + usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); +} + +static void __handle_set_rx_mode(struct usbnet *dev) +{ + if (dev->driver_info->set_rx_mode) + (dev->driver_info->set_rx_mode)(dev); + + clear_bit(EVENT_SET_RX_MODE, &dev->flags); +} + /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, @@ -1157,6 +1172,10 @@ skip_reset: if (test_bit (EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); + if (test_bit (EVENT_SET_RX_MODE, &dev->flags)) + __handle_set_rx_mode(dev); + + if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } @@ -1525,6 +1544,7 @@ static const struct net_device_ops usbnet_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 26088feb6608..d9a4905e01d0 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -78,6 +78,7 @@ struct usbnet { # define EVENT_NO_RUNTIME_PM 9 # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 +# define EVENT_SET_RX_MODE 12 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -159,6 +160,9 @@ struct driver_info { /* called by minidriver when receiving indication */ void (*indication)(struct usbnet *dev, void *ind, int indlen); + /* rx mode change (device changes address list filtering) */ + void (*set_rx_mode)(struct usbnet *dev); + /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ int out; /* tx endpoint */ -- cgit From 54ef6df3f3f1353d99c80c437259d317b2cd1cbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 21:11:27 -0700 Subject: rcu: Provide counterpart to rcu_dereference() for non-RCU situations Although rcu_dereference() and friends can be used in situations where object lifetimes are being managed by something other than RCU, the resulting sparse and lockdep-RCU noise can be annoying. This commit therefore supplies a lockless_dereference(), which provides the protection for dereferences without the RCU-related debugging noise. Reported-by: Al Viro Signed-off-by: Paul E. McKenney Signed-off-by: Al Viro --- include/linux/rcupdate.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a4a819ffb2d1..53ff1a752d7e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -616,6 +616,21 @@ static inline void rcu_preempt_sleep_check(void) */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) +/** + * lockless_dereference() - safely load a pointer for later dereference + * @p: The pointer to load + * + * Similar to rcu_dereference(), but for situations where the pointed-to + * object's lifetime is managed by something other than RCU. That + * "something other" might be reference counting or simple immortality. + */ +#define lockless_dereference(p) \ +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) + /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to -- cgit From d1b72cc6d8cb766c802fdc70a5edc2f0ba8a2b57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Oct 2014 15:42:01 +0100 Subject: overlayfs: fix lockdep misannotation In an overlay directory that shadows an empty lower directory, say /mnt/a/empty102, do: touch /mnt/a/empty102/x unlink /mnt/a/empty102/x rmdir /mnt/a/empty102 It's actually harmless, but needs another level of nesting between I_MUTEX_CHILD and I_MUTEX_NORMAL. Signed-off-by: Miklos Szeredi Tested-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/fs.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 42df664e95e5..922f27068c4c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2497,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3fbf0d306e12..401f0840f5cc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -571,7 +571,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4e41a4a331bb..01036262095f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,11 +639,13 @@ static inline int inode_unhashed(struct inode *inode) * 2: child/target * 3: xattr * 4: second non-directory - * The last is for certain operations (such as rename) which lock two + * 5: second parent (when locking independent directories in rename) + * + * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is - * parent -> child -> normal -> xattr -> second non-directory + * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { @@ -651,7 +653,8 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, - I_MUTEX_NONDIR2 + I_MUTEX_NONDIR2, + I_MUTEX_PARENT2, }; void lock_two_nondirectories(struct inode *, struct inode*); -- cgit From cb1a5ab6ece7a37da4ac98ee26b0475b7c3ea79e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 28 Oct 2014 20:27:43 -0600 Subject: block: Fix merge logic when CONFIG_BLK_DEV_INTEGRITY is not defined Commit 4eaf99beadce switched to returning bool and as a result reversed the logic of the integrity merge checks. However, the empty stubs used when the block integrity code is compiled out were still returning 0. Make these stubs return "true". Signed-off-by: Martin K. Petersen Reported-by: Michael L. Semon Tested-by: Michael L. Semon Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..6cbee8395f60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1583,13 +1583,13 @@ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { - return 0; + return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { - return 0; + return true; } static inline bool blk_integrity_is_initialized(struct gendisk *g) { -- cgit From 47f29df7db78ee4fcdb104cf36918d987ddd0278 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 29 Oct 2014 14:50:29 -0700 Subject: drivers: of: add return value to of_reserved_mem_device_init() Driver calling of_reserved_mem_device_init() might be interested if the initialization has been successful or not, so add support for returning error code. This fixes a build warining caused by commit 7bfa5ab6fa1b ("drivers: dma-coherent: add initialization from device tree"), which has been merged without this change and without fixing function return value. Fixes: 7bfa5ab6fa1b1 ("drivers: dma-coherent: add initialization from device tree") Signed-off-by: Marek Szyprowski Acked-by: Arnd Bergmann Cc: Michal Nazarewicz Cc: Grant Likely Cc: Laura Abbott Cc: Josh Cartwright Cc: Joonsoo Kim Cc: Kyungmin Park Cc: Russell King Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 3 ++- drivers/of/of_reserved_mem.c | 14 +++++++++----- include/linux/of_reserved_mem.h | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 473ff4892401..950fff9ce453 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -223,9 +223,10 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #undef pr_fmt #define pr_fmt(fmt) fmt -static void rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { dev_set_cma_area(dev, rmem->priv); + return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 59fb12e84e6b..dc566b38645f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -243,23 +243,27 @@ static inline struct reserved_mem *__find_rmem(struct device_node *node) * This function assign memory region pointed by "memory-region" device tree * property to the given device. */ -void of_reserved_mem_device_init(struct device *dev) +int of_reserved_mem_device_init(struct device *dev) { struct reserved_mem *rmem; struct device_node *np; + int ret; np = of_parse_phandle(dev->of_node, "memory-region", 0); if (!np) - return; + return -ENODEV; rmem = __find_rmem(np); of_node_put(np); if (!rmem || !rmem->ops || !rmem->ops->device_init) - return; + return -EINVAL; + + ret = rmem->ops->device_init(rmem, dev); + if (ret == 0) + dev_info(dev, "assigned reserved memory node %s\n", rmem->name); - rmem->ops->device_init(rmem, dev); - dev_info(dev, "assigned reserved memory node %s\n", rmem->name); + return ret; } /** diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 5b5efae09135..ad2f67054372 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -16,7 +16,7 @@ struct reserved_mem { }; struct reserved_mem_ops { - void (*device_init)(struct reserved_mem *rmem, + int (*device_init)(struct reserved_mem *rmem, struct device *dev); void (*device_release)(struct reserved_mem *rmem, struct device *dev); @@ -28,14 +28,17 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) #ifdef CONFIG_OF_RESERVED_MEM -void of_reserved_mem_device_init(struct device *dev); +int of_reserved_mem_device_init(struct device *dev); void of_reserved_mem_device_release(struct device *dev); void fdt_init_reserved_mem(void); void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size); #else -static inline void of_reserved_mem_device_init(struct device *dev) { } +static inline int of_reserved_mem_device_init(struct device *dev) +{ + return -ENOSYS; +} static inline void of_reserved_mem_device_release(struct device *pdev) { } static inline void fdt_init_reserved_mem(void) { } -- cgit From 6d50e60cd2edb5a57154db5a6f64eef5aa59b751 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Oct 2014 14:50:31 -0700 Subject: mm, thp: fix collapsing of hugepages on madvise If an anonymous mapping is not allowed to fault thp memory and then madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never collapse this memory into thp memory. This occurs because the madvise(2) handler for thp, hugepage_madvise(), clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags until the final action of madvise_behavior(). This causes the khugepaged_enter_vma_merge() to be a no-op in hugepage_madvise() when the vma had previously had VM_NOHUGEPAGE set. Fix this by passing the correct vma flags to the khugepaged mm slot handler. There's no chance khugepaged can run on this vma until after madvise_behavior() returns since we hold mm->mmap_sem. It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags in hugepage_advise(), but I didn't want to introduce special case behavior into madvise_behavior(). I think it's best to just let it always set vma->vm_flags itself. Signed-off-by: David Rientjes Reported-by: Suleiman Souhlal Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 17 ++++++++++------- mm/huge_memory.c | 11 ++++++----- mm/mmap.c | 8 ++++---- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 6b394f0b5148..eeb307985715 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -6,7 +6,8 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -35,13 +36,13 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) && - !(vma->vm_flags & VM_NOHUGEPAGE)) + (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && + !(vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; @@ -54,11 +55,13 @@ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 780d12c000e9..de984159cf0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 7f855206e7fb..87e82b38453c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } -- cgit From 3a3c02ecf7f2852f122d6d16fb9b3d9cb0c6f201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:46 -0700 Subject: mm: page-writeback: inline account_page_dirtied() into single caller A follow-up patch would have changed the call signature. To save the trouble, just fold it instead. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page-writeback.c | 23 ++++------------------- 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27eb1bfbe704..b46461116cd2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1235,7 +1235,6 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff24c9d83112..ff6a5b07211e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2115,23 +2115,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); -/* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); + if (!ret) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; -- cgit From d7365e783edb858279be1d03f61bc8d5d3383d90 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:48 -0700 Subject: mm: memcontrol: fix missed end-writeback page accounting Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page migration to uncharge the old page right away. The page is locked, unmapped, truncated, and off the LRU, but it could race with writeback ending, which then doesn't unaccount the page properly: test_clear_page_writeback() migration wait_on_page_writeback() TestClearPageWriteback() mem_cgroup_migrate() clear PCG_USED mem_cgroup_update_page_stat() if (PageCgroupUsed(pc)) decrease memcg pages under writeback release pc->mem_cgroup->move_lock The per-page statistics interface is heavily optimized to avoid a function call and a lookup_page_cgroup() in the file unmap fast path, which means it doesn't verify whether a page is still charged before clearing PageWriteback() and it has to do it in the stat update later. Rework it so that it looks up the page's memcg once at the beginning of the transaction and then uses it throughout. The charge will be verified before clearing PageWriteback() and migration can't uncharge the page as long as that is still set. The RCU lock will protect the memcg past uncharge. As far as losing the optimization goes, the following test results are from a microbenchmark that maps, faults, and unmaps a 4GB sparse file three times in a nested fashion, so that there are two negative passes that don't account but still go through the new transaction overhead. There is no actual difference: old: 33.195102545 seconds time elapsed ( +- 0.01% ) new: 33.199231369 seconds time elapsed ( +- 0.03% ) The time spent in page_remove_rmap()'s callees still adds up to the same, but the time spent in the function itself seems reduced: # Children Self Command Shared Object Symbol old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 58 ++++++++----------------- mm/memcontrol.c | 105 ++++++++++++++++++++++++--------------------- mm/page-writeback.c | 22 +++++----- mm/rmap.c | 20 ++++----- 4 files changed, 96 insertions(+), 109 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 19df5d857411..6b75640ef5ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void) return false; } -void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, - unsigned long *flags); - -extern atomic_t memcg_moving; - -static inline void mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - rcu_read_lock(); - *locked = false; - if (atomic_read(&memcg_moving)) - __mem_cgroup_begin_update_page_stat(page, locked, flags); -} - -void __mem_cgroup_end_update_page_stat(struct page *page, - unsigned long *flags); -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - if (*locked) - __mem_cgroup_end_update_page_stat(page, flags); - rcu_read_unlock(); -} - -void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, - int val); - -static inline void mem_cgroup_inc_page_stat(struct page *page, +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, + unsigned long *flags); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags); +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val); + +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mem_cgroup_update_page_stat(memcg, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mem_cgroup_update_page_stat(memcg, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, @@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_begin_update_page_stat(struct page *page, +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, unsigned long *flags) { + return NULL; } -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, + bool locked, unsigned long flags) { } @@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_inc_page_stat(struct page *page, +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23976fd885fd..d6ac0e33e150 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -2204,41 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * Notes: Race condition + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Charging occurs during page instantiation, while the page is - * unmapped and locked in page migration, or while the page table is - * locked in THP migration. No race is possible. + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Uncharge happens to pages with zero references, no race possible. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Charge moving between groups is protected by checking mm->moving - * account and taking the move_lock in the slowpath. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. */ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2246,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff6a5b07211e..19ceae87522d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 116a5053415b..f574046f77d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct mem_cgroup *uninitialized_var(memcg); bool anon = PageAnon(page); - bool locked; unsigned long flags; + bool locked; /* * The anon case has no mem_cgroup page_stat to update; but may @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page) * we hold the lock against page_stat move: so avoid it on anon. */ if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page) -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; out: if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /* -- cgit From 39bb5e62867de82b269b07df900165029b928359 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Oct 2014 10:32:34 -0700 Subject: net: skb_fclone_busy() needs to detect orphaned skb Some drivers are unable to perform TX completions in a bound time. They instead call skb_orphan() Problem is skb_fclone_busy() has to detect this case, otherwise we block TCP retransmits and can freeze unlucky tcp sessions on mostly idle hosts. Signed-off-by: Eric Dumazet Fixes: 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++++++-- net/ipv4/tcp_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5884f95ff0e9..6c8b6f604e76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -799,15 +799,19 @@ struct sk_buff_fclones { * @skb: buffer * * Returns true is skb is a fast clone, and its clone is not freed. + * Some drivers call skb_orphan() in their ndo_start_xmit(), + * so we also check that this didnt happen. */ -static inline bool skb_fclone_busy(const struct sk_buff *skb) +static inline bool skb_fclone_busy(const struct sock *sk, + const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && - fclones->skb2.fclone == SKB_FCLONE_CLONE; + fclones->skb2.fclone == SKB_FCLONE_CLONE && + fclones->skb2.sk == sk; } static inline struct sk_buff *alloc_skb_fclone(unsigned int size, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3af21296d967..a3d453b94747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4c4e457e7888..88bf289abdc9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1962,7 +1962,7 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } -- cgit From 5188cd44c55db3e92cd9e77a40b5baa7ed4340f7 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 30 Oct 2014 18:27:17 +0000 Subject: drivers/net, ipv6: Select IPv6 fragment idents for virtio UFO packets UFO is now disabled on all drivers that work with virtio net headers, but userland may try to send UFO/IPv6 packets anyway. Instead of sending with ID=0, we should select identifiers on their behalf (as we used to). Signed-off-by: Ben Hutchings Fixes: 916e4cf46d02 ("ipv6: reuse ip6_frag_id from ip6_ufo_append_data") Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 3 +++ drivers/net/tun.c | 6 +++++- include/net/ipv6.h | 2 ++ net/ipv6/output_core.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 2aeaa61ece09..6f226de655a4 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -572,6 +573,8 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb, pr_warn_once("macvtap: %s: using disabled UFO feature; please fix this program\n", current->comm); gso_type = SKB_GSO_UDP; + if (skb->protocol == htons(ETH_P_IPV6)) + ipv6_proxy_select_ident(skb); break; default: return -EINVAL; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 280d3d2a9792..7302398f0b1f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1139,6 +1140,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, break; } + skb_reset_network_header(skb); + if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { pr_debug("GSO!\n"); switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -1159,6 +1162,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, current->comm); } skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + if (skb->protocol == htons(ETH_P_IPV6)) + ipv6_proxy_select_ident(skb); break; } default: @@ -1189,7 +1194,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } - skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); rxhash = skb_get_hash(skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 97f472012438..4292929392b0 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -671,6 +671,8 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } +void ipv6_proxy_select_ident(struct sk_buff *skb); + int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index fc24c390af05..97f41a3e68d9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -3,11 +3,45 @@ * not configured or static. These functions are needed by GSO/GRO implementation. */ #include +#include #include #include #include #include +/* This function exists only for tap drivers that must support broken + * clients requesting UFO without specifying an IPv6 fragment ID. + * + * This is similar to ipv6_select_ident() but we use an independent hash + * seed to limit information leakage. + * + * The network header must be set before calling this. + */ +void ipv6_proxy_select_ident(struct sk_buff *skb) +{ + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 hash, id; + + addrs = skb_header_pointer(skb, + skb_network_offset(skb) + + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) + return; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + hash = __ipv6_addr_jhash(&addrs[1], ip6_proxy_idents_hashrnd); + hash = __ipv6_addr_jhash(&addrs[0], hash); + + id = ip_idents_reserve(hash, 1); + skb_shinfo(skb)->ip6_frag_id = htonl(id); +} +EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); -- cgit From b2de525f095708b2adbadaec3f1e4017a23d1e09 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 29 Sep 2014 10:21:10 -0400 Subject: Return short read or 0 at end of a raw device, not EIO Author: David Jeffery Changes to the basic direct I/O code have broken the raw driver when reading to the end of a raw device. Instead of returning a short read for a read that extends partially beyond the device's end or 0 when at the end of the device, these reads now return EIO. The raw driver needs the same end of device handling as was added for normal block devices. Using blkdev_read_iter, which has the needed size checks, prevents the EIO conditions at the end of the device. Signed-off-by: David Jeffery Signed-off-by: Al Viro --- drivers/char/raw.c | 2 +- fs/block_dev.c | 3 ++- include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 0102dc788608..a24891b97547 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = new_sync_read, - .read_iter = generic_file_read_iter, + .read_iter = blkdev_read_iter, .write = new_sync_write, .write_iter = blkdev_write_iter, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index cc9d4114cda0..1d9c9f3754f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL_GPL(blkdev_write_iter); -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = file->f_mapping->host; @@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_truncate(to, size); return generic_file_read_iter(iocb, to); } +EXPORT_SYMBOL_GPL(blkdev_read_iter); /* * Try to release a page associated with block device when the system diff --git a/include/linux/fs.h b/include/linux/fs.h index 01036262095f..9ab779e8a63c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2469,6 +2469,7 @@ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); /* fs/block_dev.c */ +extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); -- cgit From 052b9498eea532deb5de75277a53f6e0623215dc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 25 Oct 2014 18:24:57 +0200 Subject: netfilter: nf_reject_ipv4: split nf_send_reset() in smaller functions That can be reused by the reject bridge expression to build the reject packet. The new functions are: * nf_reject_ip_tcphdr_get(): to sanitize and to obtain the TCP header. * nf_reject_iphdr_put(): to build the IPv4 header. * nf_reject_ip_tcphdr_put(): to build the TCP header. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 10 ++++ net/ipv4/netfilter/nf_reject_ipv4.c | 88 ++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index e8427193c777..03e928a55229 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -1,6 +1,8 @@ #ifndef _IPV4_NF_REJECT_H #define _IPV4_NF_REJECT_H +#include +#include #include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) @@ -10,4 +12,12 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code) void nf_send_reset(struct sk_buff *oldskb, int hook); +const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); +struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int ttl); +void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); + #endif /* _IPV4_NF_REJECT_H */ diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 92b303dbd5fc..1baaa83dfe5c 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,43 +12,39 @@ #include #include #include +#include -/* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; /* IP header checks: fragment. */ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; + return NULL; oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); + sizeof(struct tcphdr), _oth); if (oth == NULL) - return; + return NULL; /* No RST for RST. */ if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; + return NULL; /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); + return NULL; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; + return oth; +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); - skb_reserve(nskb, LL_MAX_HEADER); +struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int ttl) +{ + struct iphdr *niph, *oiph = ip_hdr(oldskb); skb_reset_network_header(nskb); niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); @@ -57,10 +53,23 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) niph->tos = 0; niph->id = 0; niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; + niph->protocol = protocol; niph->check = 0; niph->saddr = oiph->daddr; niph->daddr = oiph->saddr; + niph->ttl = ttl; + + nskb->protocol = htons(ETH_P_IP); + + return niph; +} +EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); + +void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) +{ + struct iphdr *niph = ip_hdr(nskb); + struct tcphdr *tcph; skb_reset_transport_header(nskb); tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); @@ -69,9 +78,9 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) tcph->dest = oth->source; tcph->doff = sizeof(struct tcphdr) / 4; - if (oth->ack) + if (oth->ack) { tcph->seq = oth->ack_seq; - else { + } else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - ip_hdrlen(oldskb) - (oth->doff << 2)); @@ -84,16 +93,43 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _oth; + + oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); + if (!oth) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); - nskb->protocol = htons(ETH_P_IP); + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + ip4_dst_hoplimit(skb_dst(nskb))); + nf_reject_ip_tcphdr_put(nskb, oldskb, oth); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) goto free_nskb; - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; -- cgit From 8bfcdf6671b1c8006c52c3eaf9fd1b5dfcf41c3d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 26 Oct 2014 12:35:54 +0100 Subject: netfilter: nf_reject_ipv6: split nf_send_reset6() in smaller functions That can be reused by the reject bridge expression to build the reject packet. The new functions are: * nf_reject_ip6_tcphdr_get(): to sanitize and to obtain the TCP header. * nf_reject_ip6hdr_put(): to build the IPv6 header. * nf_reject_ip6_tcphdr_put(): to build the TCP header. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_reject.h | 10 ++ net/ipv6/netfilter/nf_reject_ipv6.c | 175 ++++++++++++++++++++------------- 2 files changed, 119 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 48e18810a9be..23216d48abf9 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -15,4 +15,14 @@ nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code, void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook); +const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); +struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int hoplimit); +void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); + #endif /* _IPV6_NF_REJECT_H */ diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 20d9defc6c59..015eb8a80766 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,116 +12,102 @@ #include #include #include +#include -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { - struct sk_buff *nskb; - struct tcphdr otcph, *tcph; - unsigned int otcplen, hh_len; - int tcphoff, needs_ack; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; -#define DEFAULT_TOS_VALUE 0x0U - const __u8 tclass = DEFAULT_TOS_VALUE; - struct dst_entry *dst = NULL; u8 proto; __be16 frag_off; - struct flowi6 fl6; - - if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || - (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { - pr_debug("addr is not unicast.\n"); - return; - } + int tcphoff; proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), + &proto, &frag_off); if ((tcphoff < 0) || (tcphoff > oldskb->len)) { pr_debug("Cannot get TCP header.\n"); - return; + return NULL; } - otcplen = oldskb->len - tcphoff; + *otcplen = oldskb->len - tcphoff; /* IP header checks: fragment, too short. */ - if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { - pr_debug("proto(%d) != IPPROTO_TCP, " - "or too short. otcplen = %d\n", - proto, otcplen); - return; + if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n", + proto, *otcplen); + return NULL; } - if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) - BUG(); + otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr), + otcph); + if (otcph == NULL) + return NULL; /* No RST for RST. */ - if (otcph.rst) { + if (otcph->rst) { pr_debug("RST is set\n"); - return; + return NULL; } /* Check checksum. */ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { pr_debug("TCP checksum is invalid\n"); - return; - } - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = oip6h->daddr; - fl6.daddr = oip6h->saddr; - fl6.fl6_sport = otcph.dest; - fl6.fl6_dport = otcph.source; - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { - dst_release(dst); - return; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - return; - - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, - GFP_ATOMIC); - - if (!nskb) { - net_dbg_ratelimited("cannot alloc skb\n"); - dst_release(dst); - return; + return NULL; } - skb_dst_set(nskb, dst); + return otcph; +} +EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); - skb_reserve(nskb, hh_len + dst->header_len); +struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int hoplimit) +{ + struct ipv6hdr *ip6h; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); ip6_flow_hdr(ip6h, tclass, 0); - ip6h->hop_limit = ip6_dst_hoplimit(dst); - ip6h->nexthdr = IPPROTO_TCP; + ip6h->hop_limit = hoplimit; + ip6h->nexthdr = protocol; ip6h->saddr = oip6h->daddr; ip6h->daddr = oip6h->saddr; + nskb->protocol = htons(ETH_P_IPV6); + + return ip6h; +} +EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); + +void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) +{ + struct tcphdr *tcph; + int needs_ack; + skb_reset_transport_header(nskb); tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; - tcph->source = otcph.dest; - tcph->dest = otcph.source; + tcph->source = oth->dest; + tcph->dest = oth->source; - if (otcph.ack) { + if (oth->ack) { needs_ack = 0; - tcph->seq = otcph.ack_seq; + tcph->seq = oth->ack_seq; tcph->ack_seq = 0; } else { needs_ack = 1; - tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin - + otcplen - (otcph.doff<<2)); + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + otcplen - (oth->doff<<2)); tcph->seq = 0; } @@ -139,6 +125,63 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) sizeof(struct tcphdr), IPPROTO_TCP, csum_partial(tcph, sizeof(struct tcphdr), 0)); +} +EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); + +void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr _otcph; + const struct tcphdr *otcph; + unsigned int otcplen, hh_len; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; + struct dst_entry *dst = NULL; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook); + if (!otcph) + return; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; + fl6.fl6_sport = otcph->dest; + fl6.fl6_dport = otcph->source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + net_dbg_ratelimited("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, + ip6_dst_hoplimit(dst)); + nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); -- cgit From c72c553249bb73705f594e292a8f8750027fbcbe Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 27 Oct 2014 17:40:44 +0100 Subject: ARM: imx: clk-vf610: define PLL's clock tree So far, the required PLL's (PLL1/PLL2/PLL5) have been initialized by boot loader and the kernel code defined fixed rates according to those default configurations. Beginning with the USB PLL7 the code started to initialize the PLL's itself (using imx_clk_pllv3). However, since commit dc4805c2e78ba5a22ea1632f3e3e4ee601a1743b (ARM: imx: remove ENABLE and BYPASS bits from clk-pllv3 driver) imx_clk_pllv3 no longer takes care of the ENABLE and BYPASS bits, hence the USB PLL were not configured correctly anymore. This patch not only fixes those USB PLL's, but also makes use of the imx_clk_pllv3 for all PLL's and alignes the code with the PLL support of the i.MX6 series. Signed-off-by: Stefan Agner Signed-off-by: Shawn Guo --- arch/arm/mach-imx/clk-vf610.c | 134 ++++++++++++++++++++++---------- include/dt-bindings/clock/vf610-clock.h | 39 ++++++++-- 2 files changed, 123 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-imx/clk-vf610.c b/arch/arm/mach-imx/clk-vf610.c index a17818475050..409637254594 100644 --- a/arch/arm/mach-imx/clk-vf610.c +++ b/arch/arm/mach-imx/clk-vf610.c @@ -58,8 +58,14 @@ #define PFD_PLL1_BASE (anatop_base + 0x2b0) #define PFD_PLL2_BASE (anatop_base + 0x100) #define PFD_PLL3_BASE (anatop_base + 0xf0) +#define PLL1_CTRL (anatop_base + 0x270) +#define PLL2_CTRL (anatop_base + 0x30) #define PLL3_CTRL (anatop_base + 0x10) +#define PLL4_CTRL (anatop_base + 0x70) +#define PLL5_CTRL (anatop_base + 0xe0) +#define PLL6_CTRL (anatop_base + 0xa0) #define PLL7_CTRL (anatop_base + 0x20) +#define ANA_MISC1 (anatop_base + 0x160) static void __iomem *anatop_base; static void __iomem *ccm_base; @@ -67,25 +73,34 @@ static void __iomem *ccm_base; /* sources for multiplexer clocks, this is used multiple times */ static const char *fast_sels[] = { "firc", "fxosc", }; static const char *slow_sels[] = { "sirc_32k", "sxosc", }; -static const char *pll1_sels[] = { "pll1_main", "pll1_pfd1", "pll1_pfd2", "pll1_pfd3", "pll1_pfd4", }; -static const char *pll2_sels[] = { "pll2_main", "pll2_pfd1", "pll2_pfd2", "pll2_pfd3", "pll2_pfd4", }; -static const char *sys_sels[] = { "fast_clk_sel", "slow_clk_sel", "pll2_pfd_sel", "pll2_main", "pll1_pfd_sel", "pll3_main", }; +static const char *pll1_sels[] = { "pll1_sys", "pll1_pfd1", "pll1_pfd2", "pll1_pfd3", "pll1_pfd4", }; +static const char *pll2_sels[] = { "pll2_bus", "pll2_pfd1", "pll2_pfd2", "pll2_pfd3", "pll2_pfd4", }; +static const char *pll_bypass_src_sels[] = { "fast_clk_sel", "lvds1_in", }; +static const char *pll1_bypass_sels[] = { "pll1", "pll1_bypass_src", }; +static const char *pll2_bypass_sels[] = { "pll2", "pll2_bypass_src", }; +static const char *pll3_bypass_sels[] = { "pll3", "pll3_bypass_src", }; +static const char *pll4_bypass_sels[] = { "pll4", "pll4_bypass_src", }; +static const char *pll5_bypass_sels[] = { "pll5", "pll5_bypass_src", }; +static const char *pll6_bypass_sels[] = { "pll6", "pll6_bypass_src", }; +static const char *pll7_bypass_sels[] = { "pll7", "pll7_bypass_src", }; +static const char *sys_sels[] = { "fast_clk_sel", "slow_clk_sel", "pll2_pfd_sel", "pll2_bus", "pll1_pfd_sel", "pll3_usb_otg", }; static const char *ddr_sels[] = { "pll2_pfd2", "sys_sel", }; static const char *rmii_sels[] = { "enet_ext", "audio_ext", "enet_50m", "enet_25m", }; static const char *enet_ts_sels[] = { "enet_ext", "fxosc", "audio_ext", "usb", "enet_ts", "enet_25m", "enet_50m", }; -static const char *esai_sels[] = { "audio_ext", "mlb", "spdif_rx", "pll4_main_div", }; -static const char *sai_sels[] = { "audio_ext", "mlb", "spdif_rx", "pll4_main_div", }; +static const char *esai_sels[] = { "audio_ext", "mlb", "spdif_rx", "pll4_audio_div", }; +static const char *sai_sels[] = { "audio_ext", "mlb", "spdif_rx", "pll4_audio_div", }; static const char *nfc_sels[] = { "platform_bus", "pll1_pfd1", "pll3_pfd1", "pll3_pfd3", }; -static const char *qspi_sels[] = { "pll3_main", "pll3_pfd4", "pll2_pfd4", "pll1_pfd4", }; -static const char *esdhc_sels[] = { "pll3_main", "pll3_pfd3", "pll1_pfd3", "platform_bus", }; -static const char *dcu_sels[] = { "pll1_pfd2", "pll3_main", }; +static const char *qspi_sels[] = { "pll3_usb_otg", "pll3_pfd4", "pll2_pfd4", "pll1_pfd4", }; +static const char *esdhc_sels[] = { "pll3_usb_otg", "pll3_pfd3", "pll1_pfd3", "platform_bus", }; +static const char *dcu_sels[] = { "pll1_pfd2", "pll3_usb_otg", }; static const char *gpu_sels[] = { "pll2_pfd2", "pll3_pfd2", }; -static const char *vadc_sels[] = { "pll6_main_div", "pll3_main_div", "pll3_main", }; +static const char *vadc_sels[] = { "pll6_video_div", "pll3_usb_otg_div", "pll3_usb_otg", }; /* FTM counter clock source, not module clock */ static const char *ftm_ext_sels[] = {"sirc_128k", "sxosc", "fxosc_half", "audio_ext", }; static const char *ftm_fix_sels[] = { "sxosc", "ipg_bus", }; -static struct clk_div_table pll4_main_div_table[] = { + +static struct clk_div_table pll4_audio_div_table[] = { { .val = 0, .div = 1 }, { .val = 1, .div = 2 }, { .val = 2, .div = 6 }, @@ -120,6 +135,9 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) clk[VF610_CLK_AUDIO_EXT] = imx_obtain_fixed_clock("audio_ext", 0); clk[VF610_CLK_ENET_EXT] = imx_obtain_fixed_clock("enet_ext", 0); + /* Clock source from external clock via LVDs PAD */ + clk[VF610_CLK_ANACLK1] = imx_obtain_fixed_clock("anaclk1", 0); + clk[VF610_CLK_FXOSC_HALF] = imx_clk_fixed_factor("fxosc_half", "fxosc", 1, 2); np = of_find_compatible_node(NULL, NULL, "fsl,vf610-anatop"); @@ -133,31 +151,63 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) clk[VF610_CLK_SLOW_CLK_SEL] = imx_clk_mux("slow_clk_sel", CCM_CCSR, 4, 1, slow_sels, ARRAY_SIZE(slow_sels)); clk[VF610_CLK_FASK_CLK_SEL] = imx_clk_mux("fast_clk_sel", CCM_CCSR, 5, 1, fast_sels, ARRAY_SIZE(fast_sels)); - clk[VF610_CLK_PLL1_MAIN] = imx_clk_fixed_factor("pll1_main", "fast_clk_sel", 22, 1); - clk[VF610_CLK_PLL1_PFD1] = imx_clk_pfd("pll1_pfd1", "pll1_main", PFD_PLL1_BASE, 0); - clk[VF610_CLK_PLL1_PFD2] = imx_clk_pfd("pll1_pfd2", "pll1_main", PFD_PLL1_BASE, 1); - clk[VF610_CLK_PLL1_PFD3] = imx_clk_pfd("pll1_pfd3", "pll1_main", PFD_PLL1_BASE, 2); - clk[VF610_CLK_PLL1_PFD4] = imx_clk_pfd("pll1_pfd4", "pll1_main", PFD_PLL1_BASE, 3); - - clk[VF610_CLK_PLL2_MAIN] = imx_clk_fixed_factor("pll2_main", "fast_clk_sel", 22, 1); - clk[VF610_CLK_PLL2_PFD1] = imx_clk_pfd("pll2_pfd1", "pll2_main", PFD_PLL2_BASE, 0); - clk[VF610_CLK_PLL2_PFD2] = imx_clk_pfd("pll2_pfd2", "pll2_main", PFD_PLL2_BASE, 1); - clk[VF610_CLK_PLL2_PFD3] = imx_clk_pfd("pll2_pfd3", "pll2_main", PFD_PLL2_BASE, 2); - clk[VF610_CLK_PLL2_PFD4] = imx_clk_pfd("pll2_pfd4", "pll2_main", PFD_PLL2_BASE, 3); - - clk[VF610_CLK_PLL3_MAIN] = imx_clk_fixed_factor("pll3_main", "fast_clk_sel", 20, 1); - clk[VF610_CLK_PLL3_PFD1] = imx_clk_pfd("pll3_pfd1", "pll3_main", PFD_PLL3_BASE, 0); - clk[VF610_CLK_PLL3_PFD2] = imx_clk_pfd("pll3_pfd2", "pll3_main", PFD_PLL3_BASE, 1); - clk[VF610_CLK_PLL3_PFD3] = imx_clk_pfd("pll3_pfd3", "pll3_main", PFD_PLL3_BASE, 2); - clk[VF610_CLK_PLL3_PFD4] = imx_clk_pfd("pll3_pfd4", "pll3_main", PFD_PLL3_BASE, 3); - - clk[VF610_CLK_PLL4_MAIN] = imx_clk_fixed_factor("pll4_main", "fast_clk_sel", 25, 1); - /* Enet pll: fixed 50Mhz */ - clk[VF610_CLK_PLL5_MAIN] = imx_clk_fixed_factor("pll5_main", "fast_clk_sel", 125, 6); - /* pll6: default 960Mhz */ - clk[VF610_CLK_PLL6_MAIN] = imx_clk_fixed_factor("pll6_main", "fast_clk_sel", 40, 1); - /* pll7: USB1 PLL at 480MHz */ - clk[VF610_CLK_PLL7_MAIN] = imx_clk_pllv3(IMX_PLLV3_USB, "pll7_main", "fast_clk_sel", PLL7_CTRL, 0x2); + clk[VF610_CLK_PLL1_BYPASS_SRC] = imx_clk_mux("pll1_bypass_src", PLL1_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL2_BYPASS_SRC] = imx_clk_mux("pll2_bypass_src", PLL2_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL3_BYPASS_SRC] = imx_clk_mux("pll3_bypass_src", PLL3_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL4_BYPASS_SRC] = imx_clk_mux("pll4_bypass_src", PLL4_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL5_BYPASS_SRC] = imx_clk_mux("pll5_bypass_src", PLL5_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL6_BYPASS_SRC] = imx_clk_mux("pll6_bypass_src", PLL6_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + clk[VF610_CLK_PLL7_BYPASS_SRC] = imx_clk_mux("pll7_bypass_src", PLL7_CTRL, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); + + clk[VF610_CLK_PLL1] = imx_clk_pllv3(IMX_PLLV3_GENERIC, "pll1", "pll1_bypass_src", PLL1_CTRL, 0x1); + clk[VF610_CLK_PLL2] = imx_clk_pllv3(IMX_PLLV3_GENERIC, "pll2", "pll2_bypass_src", PLL2_CTRL, 0x1); + clk[VF610_CLK_PLL3] = imx_clk_pllv3(IMX_PLLV3_USB, "pll3", "pll3_bypass_src", PLL3_CTRL, 0x1); + clk[VF610_CLK_PLL4] = imx_clk_pllv3(IMX_PLLV3_AV, "pll4", "pll4_bypass_src", PLL4_CTRL, 0x7f); + clk[VF610_CLK_PLL5] = imx_clk_pllv3(IMX_PLLV3_ENET, "pll5", "pll5_bypass_src", PLL5_CTRL, 0x3); + clk[VF610_CLK_PLL6] = imx_clk_pllv3(IMX_PLLV3_AV, "pll6", "pll6_bypass_src", PLL6_CTRL, 0x7f); + clk[VF610_CLK_PLL7] = imx_clk_pllv3(IMX_PLLV3_USB, "pll7", "pll7_bypass_src", PLL7_CTRL, 0x1); + + clk[VF610_PLL1_BYPASS] = imx_clk_mux_flags("pll1_bypass", PLL1_CTRL, 16, 1, pll1_bypass_sels, ARRAY_SIZE(pll1_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL2_BYPASS] = imx_clk_mux_flags("pll2_bypass", PLL2_CTRL, 16, 1, pll2_bypass_sels, ARRAY_SIZE(pll2_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL3_BYPASS] = imx_clk_mux_flags("pll3_bypass", PLL3_CTRL, 16, 1, pll3_bypass_sels, ARRAY_SIZE(pll3_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL4_BYPASS] = imx_clk_mux_flags("pll4_bypass", PLL4_CTRL, 16, 1, pll4_bypass_sels, ARRAY_SIZE(pll4_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL5_BYPASS] = imx_clk_mux_flags("pll5_bypass", PLL5_CTRL, 16, 1, pll5_bypass_sels, ARRAY_SIZE(pll5_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL6_BYPASS] = imx_clk_mux_flags("pll6_bypass", PLL6_CTRL, 16, 1, pll6_bypass_sels, ARRAY_SIZE(pll6_bypass_sels), CLK_SET_RATE_PARENT); + clk[VF610_PLL7_BYPASS] = imx_clk_mux_flags("pll7_bypass", PLL7_CTRL, 16, 1, pll7_bypass_sels, ARRAY_SIZE(pll7_bypass_sels), CLK_SET_RATE_PARENT); + + /* Do not bypass PLLs initially */ + clk_set_parent(clk[VF610_PLL1_BYPASS], clk[VF610_CLK_PLL1]); + clk_set_parent(clk[VF610_PLL2_BYPASS], clk[VF610_CLK_PLL2]); + clk_set_parent(clk[VF610_PLL3_BYPASS], clk[VF610_CLK_PLL3]); + clk_set_parent(clk[VF610_PLL4_BYPASS], clk[VF610_CLK_PLL4]); + clk_set_parent(clk[VF610_PLL5_BYPASS], clk[VF610_CLK_PLL5]); + clk_set_parent(clk[VF610_PLL6_BYPASS], clk[VF610_CLK_PLL6]); + clk_set_parent(clk[VF610_PLL7_BYPASS], clk[VF610_CLK_PLL7]); + + clk[VF610_CLK_PLL1_SYS] = imx_clk_gate("pll1_sys", "pll1_bypass", PLL1_CTRL, 13); + clk[VF610_CLK_PLL2_BUS] = imx_clk_gate("pll2_bus", "pll2_bypass", PLL2_CTRL, 13); + clk[VF610_CLK_PLL3_USB_OTG] = imx_clk_gate("pll3_usb_otg", "pll3_bypass", PLL3_CTRL, 13); + clk[VF610_CLK_PLL4_AUDIO] = imx_clk_gate("pll4_audio", "pll4_bypass", PLL4_CTRL, 13); + clk[VF610_CLK_PLL5_ENET] = imx_clk_gate("pll5_enet", "pll5_bypass", PLL5_CTRL, 13); + clk[VF610_CLK_PLL6_VIDEO] = imx_clk_gate("pll6_video", "pll6_bypass", PLL6_CTRL, 13); + clk[VF610_CLK_PLL7_USB_HOST] = imx_clk_gate("pll7_usb_host", "pll7_bypass", PLL7_CTRL, 13); + + clk[VF610_CLK_LVDS1_IN] = imx_clk_gate_exclusive("lvds1_in", "anaclk1", ANA_MISC1, 12, BIT(10)); + + clk[VF610_CLK_PLL1_PFD1] = imx_clk_pfd("pll1_pfd1", "pll1_sys", PFD_PLL1_BASE, 0); + clk[VF610_CLK_PLL1_PFD2] = imx_clk_pfd("pll1_pfd2", "pll1_sys", PFD_PLL1_BASE, 1); + clk[VF610_CLK_PLL1_PFD3] = imx_clk_pfd("pll1_pfd3", "pll1_sys", PFD_PLL1_BASE, 2); + clk[VF610_CLK_PLL1_PFD4] = imx_clk_pfd("pll1_pfd4", "pll1_sys", PFD_PLL1_BASE, 3); + + clk[VF610_CLK_PLL2_PFD1] = imx_clk_pfd("pll2_pfd1", "pll2_bus", PFD_PLL2_BASE, 0); + clk[VF610_CLK_PLL2_PFD2] = imx_clk_pfd("pll2_pfd2", "pll2_bus", PFD_PLL2_BASE, 1); + clk[VF610_CLK_PLL2_PFD3] = imx_clk_pfd("pll2_pfd3", "pll2_bus", PFD_PLL2_BASE, 2); + clk[VF610_CLK_PLL2_PFD4] = imx_clk_pfd("pll2_pfd4", "pll2_bus", PFD_PLL2_BASE, 3); + + clk[VF610_CLK_PLL3_PFD1] = imx_clk_pfd("pll3_pfd1", "pll3_usb_otg", PFD_PLL3_BASE, 0); + clk[VF610_CLK_PLL3_PFD2] = imx_clk_pfd("pll3_pfd2", "pll3_usb_otg", PFD_PLL3_BASE, 1); + clk[VF610_CLK_PLL3_PFD3] = imx_clk_pfd("pll3_pfd3", "pll3_usb_otg", PFD_PLL3_BASE, 2); + clk[VF610_CLK_PLL3_PFD4] = imx_clk_pfd("pll3_pfd4", "pll3_usb_otg", PFD_PLL3_BASE, 3); clk[VF610_CLK_PLL1_PFD_SEL] = imx_clk_mux("pll1_pfd_sel", CCM_CCSR, 16, 3, pll1_sels, 5); clk[VF610_CLK_PLL2_PFD_SEL] = imx_clk_mux("pll2_pfd_sel", CCM_CCSR, 19, 3, pll2_sels, 5); @@ -167,12 +217,12 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) clk[VF610_CLK_PLATFORM_BUS] = imx_clk_divider("platform_bus", "sys_bus", CCM_CACRR, 3, 3); clk[VF610_CLK_IPG_BUS] = imx_clk_divider("ipg_bus", "platform_bus", CCM_CACRR, 11, 2); - clk[VF610_CLK_PLL3_MAIN_DIV] = imx_clk_divider("pll3_main_div", "pll3_main", CCM_CACRR, 20, 1); - clk[VF610_CLK_PLL4_MAIN_DIV] = clk_register_divider_table(NULL, "pll4_main_div", "pll4_main", 0, CCM_CACRR, 6, 3, 0, pll4_main_div_table, &imx_ccm_lock); - clk[VF610_CLK_PLL6_MAIN_DIV] = imx_clk_divider("pll6_main_div", "pll6_main", CCM_CACRR, 21, 1); + clk[VF610_CLK_PLL3_MAIN_DIV] = imx_clk_divider("pll3_usb_otg_div", "pll3_usb_otg", CCM_CACRR, 20, 1); + clk[VF610_CLK_PLL4_MAIN_DIV] = clk_register_divider_table(NULL, "pll4_audio_div", "pll4_audio", 0, CCM_CACRR, 6, 3, 0, pll4_audio_div_table, &imx_ccm_lock); + clk[VF610_CLK_PLL6_MAIN_DIV] = imx_clk_divider("pll6_video_div", "pll6_video", CCM_CACRR, 21, 1); - clk[VF610_CLK_USBPHY0] = imx_clk_gate("usbphy0", "pll3_main", PLL3_CTRL, 6); - clk[VF610_CLK_USBPHY1] = imx_clk_gate("usbphy1", "pll7_main", PLL7_CTRL, 6); + clk[VF610_CLK_USBPHY0] = imx_clk_gate("usbphy0", "pll3_usb_otg", PLL3_CTRL, 6); + clk[VF610_CLK_USBPHY1] = imx_clk_gate("usbphy1", "pll7_usb_host", PLL7_CTRL, 6); clk[VF610_CLK_USBC0] = imx_clk_gate2("usbc0", "ipg_bus", CCM_CCGR1, CCM_CCGRx_CGn(4)); clk[VF610_CLK_USBC1] = imx_clk_gate2("usbc1", "ipg_bus", CCM_CCGR7, CCM_CCGRx_CGn(4)); @@ -191,8 +241,8 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) clk[VF610_CLK_QSPI1_X1_DIV] = imx_clk_divider("qspi1_x1", "qspi1_x2", CCM_CSCDR3, 11, 1); clk[VF610_CLK_QSPI1] = imx_clk_gate2("qspi1", "qspi1_x1", CCM_CCGR8, CCM_CCGRx_CGn(4)); - clk[VF610_CLK_ENET_50M] = imx_clk_fixed_factor("enet_50m", "pll5_main", 1, 10); - clk[VF610_CLK_ENET_25M] = imx_clk_fixed_factor("enet_25m", "pll5_main", 1, 20); + clk[VF610_CLK_ENET_50M] = imx_clk_fixed_factor("enet_50m", "pll5_enet", 1, 10); + clk[VF610_CLK_ENET_25M] = imx_clk_fixed_factor("enet_25m", "pll5_enet", 1, 20); clk[VF610_CLK_ENET_SEL] = imx_clk_mux("enet_sel", CCM_CSCMR2, 4, 2, rmii_sels, 4); clk[VF610_CLK_ENET_TS_SEL] = imx_clk_mux("enet_ts_sel", CCM_CSCMR2, 0, 3, enet_ts_sels, 7); clk[VF610_CLK_ENET] = imx_clk_gate("enet", "enet_sel", CCM_CSCDR1, 24); diff --git a/include/dt-bindings/clock/vf610-clock.h b/include/dt-bindings/clock/vf610-clock.h index d6b56b21539b..801c0ac50c47 100644 --- a/include/dt-bindings/clock/vf610-clock.h +++ b/include/dt-bindings/clock/vf610-clock.h @@ -21,24 +21,24 @@ #define VF610_CLK_FASK_CLK_SEL 8 #define VF610_CLK_AUDIO_EXT 9 #define VF610_CLK_ENET_EXT 10 -#define VF610_CLK_PLL1_MAIN 11 +#define VF610_CLK_PLL1_SYS 11 #define VF610_CLK_PLL1_PFD1 12 #define VF610_CLK_PLL1_PFD2 13 #define VF610_CLK_PLL1_PFD3 14 #define VF610_CLK_PLL1_PFD4 15 -#define VF610_CLK_PLL2_MAIN 16 +#define VF610_CLK_PLL2_BUS 16 #define VF610_CLK_PLL2_PFD1 17 #define VF610_CLK_PLL2_PFD2 18 #define VF610_CLK_PLL2_PFD3 19 #define VF610_CLK_PLL2_PFD4 20 -#define VF610_CLK_PLL3_MAIN 21 +#define VF610_CLK_PLL3_USB_OTG 21 #define VF610_CLK_PLL3_PFD1 22 #define VF610_CLK_PLL3_PFD2 23 #define VF610_CLK_PLL3_PFD3 24 #define VF610_CLK_PLL3_PFD4 25 -#define VF610_CLK_PLL4_MAIN 26 -#define VF610_CLK_PLL5_MAIN 27 -#define VF610_CLK_PLL6_MAIN 28 +#define VF610_CLK_PLL4_AUDIO 26 +#define VF610_CLK_PLL5_ENET 27 +#define VF610_CLK_PLL6_VIDEO 28 #define VF610_CLK_PLL3_MAIN_DIV 29 #define VF610_CLK_PLL4_MAIN_DIV 30 #define VF610_CLK_PLL6_MAIN_DIV 31 @@ -166,9 +166,32 @@ #define VF610_CLK_DMAMUX3 153 #define VF610_CLK_FLEXCAN0_EN 154 #define VF610_CLK_FLEXCAN1_EN 155 -#define VF610_CLK_PLL7_MAIN 156 +#define VF610_CLK_PLL7_USB_HOST 156 #define VF610_CLK_USBPHY0 157 #define VF610_CLK_USBPHY1 158 -#define VF610_CLK_END 159 +#define VF610_CLK_LVDS1_IN 159 +#define VF610_CLK_ANACLK1 160 +#define VF610_CLK_PLL1_BYPASS_SRC 161 +#define VF610_CLK_PLL2_BYPASS_SRC 162 +#define VF610_CLK_PLL3_BYPASS_SRC 163 +#define VF610_CLK_PLL4_BYPASS_SRC 164 +#define VF610_CLK_PLL5_BYPASS_SRC 165 +#define VF610_CLK_PLL6_BYPASS_SRC 166 +#define VF610_CLK_PLL7_BYPASS_SRC 167 +#define VF610_CLK_PLL1 168 +#define VF610_CLK_PLL2 169 +#define VF610_CLK_PLL3 170 +#define VF610_CLK_PLL4 171 +#define VF610_CLK_PLL5 172 +#define VF610_CLK_PLL6 173 +#define VF610_CLK_PLL7 174 +#define VF610_PLL1_BYPASS 175 +#define VF610_PLL2_BYPASS 176 +#define VF610_PLL3_BYPASS 177 +#define VF610_PLL4_BYPASS 178 +#define VF610_PLL5_BYPASS 179 +#define VF610_PLL6_BYPASS 180 +#define VF610_PLL7_BYPASS 181 +#define VF610_CLK_END 182 #endif /* __DT_BINDINGS_CLOCK_VF610_H */ -- cgit From a87fa1d81a9fb5e9adca9820e16008c40ad09f33 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 3 Nov 2014 15:15:35 +0000 Subject: of: Fix overflow bug in string property parsing functions The string property read helpers will run off the end of the buffer if it is handed a malformed string property. Rework the parsers to make sure that doesn't happen. At the same time add new test cases to make sure the functions behave themselves. The original implementations of of_property_read_string_index() and of_property_count_strings() both open-coded the same block of parsing code, each with it's own subtly different bugs. The fix here merges functions into a single helper and makes the original functions static inline wrappers around the helper. One non-bugfix aspect of this patch is the addition of a new wrapper, of_property_read_string_array(). The new wrapper is needed by the device_properties feature that Rafael is working on and planning to merge for v3.19. The implementation is identical both with and without the new static inline wrapper, so it just got left in to reduce the churn on the header file. Signed-off-by: Grant Likely Cc: Rafael J. Wysocki Cc: Mika Westerberg Cc: Rob Herring Cc: Arnd Bergmann Cc: Darren Hart Cc: # v3.3+: Drop selftest hunks that don't apply --- drivers/of/base.c | 88 ++++++++--------------------- drivers/of/selftest.c | 66 ++++++++++++++++++++-- drivers/of/testcase-data/tests-phandle.dtsi | 2 + include/linux/of.h | 84 ++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index 2305dc0382bc..3823edf2d012 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1279,52 +1279,6 @@ int of_property_read_string(struct device_node *np, const char *propname, } EXPORT_SYMBOL_GPL(of_property_read_string); -/** - * of_property_read_string_index - Find and read a string from a multiple - * strings property. - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @index: index of the string in the list of strings - * @out_string: pointer to null terminated return string, modified only if - * return value is 0. - * - * Search for a property in a device tree node and retrieve a null - * terminated string value (pointer to data, not a copy) in the list of strings - * contained in that property. - * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if - * property does not have a value, and -EILSEQ if the string is not - * null-terminated within the length of the property data. - * - * The out_string pointer is modified only if a valid string can be decoded. - */ -int of_property_read_string_index(struct device_node *np, const char *propname, - int index, const char **output) -{ - struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; - - if (!prop) - return -EINVAL; - if (!prop->value) - return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - - p = prop->value; - - for (i = 0; total < prop->length; total += l, p += l) { - l = strlen(p) + 1; - if (i++ == index) { - *output = p; - return 0; - } - } - return -ENODATA; -} -EXPORT_SYMBOL_GPL(of_property_read_string_index); - /** * of_property_match_string() - Find string in a list and return index * @np: pointer to node containing string list property @@ -1351,7 +1305,7 @@ int of_property_match_string(struct device_node *np, const char *propname, end = p + prop->length; for (i = 0; p < end; i++, p += l) { - l = strlen(p) + 1; + l = strnlen(p, end - p) + 1; if (p + l > end) return -EILSEQ; pr_debug("comparing %s with %s\n", string, p); @@ -1363,39 +1317,41 @@ int of_property_match_string(struct device_node *np, const char *propname, EXPORT_SYMBOL_GPL(of_property_match_string); /** - * of_property_count_strings - Find and return the number of strings from a - * multiple strings property. + * of_property_read_string_util() - Utility helper for parsing string properties * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * @skip: Number of strings to skip over at beginning of list. * - * Search for a property in a device tree node and retrieve the number of null - * terminated string contain in it. Returns the number of strings on - * success, -EINVAL if the property does not exist, -ENODATA if property - * does not have a value, and -EILSEQ if the string is not null-terminated - * within the length of the property data. + * Don't call this function directly. It is a utility helper for the + * of_property_read_string*() family of functions. */ -int of_property_count_strings(struct device_node *np, const char *propname) +int of_property_read_string_helper(struct device_node *np, const char *propname, + const char **out_strs, size_t sz, int skip) { struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; + int l = 0, i = 0; + const char *p, *end; if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - p = prop->value; + end = p + prop->length; - for (i = 0; total < prop->length; total += l, p += l, i++) - l = strlen(p) + 1; - - return i; + for (i = 0; p < end && (!out_strs || i < skip + sz); i++, p += l) { + l = strnlen(p, end - p) + 1; + if (p + l > end) + return -EILSEQ; + if (out_strs && i >= skip) + *out_strs++ = p; + } + i -= skip; + return i <= 0 ? -ENODATA : i; } -EXPORT_SYMBOL_GPL(of_property_count_strings); +EXPORT_SYMBOL_GPL(of_property_read_string_helper); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { diff --git a/drivers/of/selftest.c b/drivers/of/selftest.c index 78001270a598..11b873c54a77 100644 --- a/drivers/of/selftest.c +++ b/drivers/of/selftest.c @@ -339,8 +339,9 @@ static void __init of_selftest_parse_phandle_with_args(void) selftest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); } -static void __init of_selftest_property_match_string(void) +static void __init of_selftest_property_string(void) { + const char *strings[4]; struct device_node *np; int rc; @@ -357,13 +358,66 @@ static void __init of_selftest_property_match_string(void) rc = of_property_match_string(np, "phandle-list-names", "third"); selftest(rc == 2, "third expected:0 got:%i\n", rc); rc = of_property_match_string(np, "phandle-list-names", "fourth"); - selftest(rc == -ENODATA, "unmatched string; rc=%i", rc); + selftest(rc == -ENODATA, "unmatched string; rc=%i\n", rc); rc = of_property_match_string(np, "missing-property", "blah"); - selftest(rc == -EINVAL, "missing property; rc=%i", rc); + selftest(rc == -EINVAL, "missing property; rc=%i\n", rc); rc = of_property_match_string(np, "empty-property", "blah"); - selftest(rc == -ENODATA, "empty property; rc=%i", rc); + selftest(rc == -ENODATA, "empty property; rc=%i\n", rc); rc = of_property_match_string(np, "unterminated-string", "blah"); - selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + + /* of_property_count_strings() tests */ + rc = of_property_count_strings(np, "string-property"); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "phandle-list-names"); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string"); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string-list"); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + + /* of_property_read_string_index() tests */ + rc = of_property_read_string_index(np, "string-property", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "foobar"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "string-property", 1, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 1, strings); + selftest(rc == 0 && !strcmp(strings[0], "second"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 2, strings); + selftest(rc == 0 && !strcmp(strings[0], "third"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "phandle-list-names", 3, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string", 0, strings); + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "unterminated-string-list", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string-list", 2, strings); /* should fail */ + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[1] = NULL; + + /* of_property_read_string_array() tests */ + rc = of_property_read_string_array(np, "string-property", strings, 4); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "phandle-list-names", strings, 4); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "unterminated-string", strings, 4); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + /* -- An incorrectly formed string should cause a failure */ + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 4); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + /* -- parsing the correctly formed strings should still work: */ + strings[2] = NULL; + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 2); + selftest(rc == 2 && strings[2] == NULL, "of_property_read_string_array() failure; rc=%i\n", rc); + strings[1] = NULL; + rc = of_property_read_string_array(np, "phandle-list-names", strings, 1); + selftest(rc == 1 && strings[1] == NULL, "Overwrote end of string array; rc=%i, str='%s'\n", rc, strings[1]); } #define propcmp(p1, p2) (((p1)->length == (p2)->length) && \ @@ -881,7 +935,7 @@ static int __init of_selftest(void) of_selftest_find_node_by_name(); of_selftest_dynamic(); of_selftest_parse_phandle_with_args(); - of_selftest_property_match_string(); + of_selftest_property_string(); of_selftest_property_copy(); of_selftest_changeset(); of_selftest_parse_interrupts(); diff --git a/drivers/of/testcase-data/tests-phandle.dtsi b/drivers/of/testcase-data/tests-phandle.dtsi index ce0fe083d406..5b1527e8a7fb 100644 --- a/drivers/of/testcase-data/tests-phandle.dtsi +++ b/drivers/of/testcase-data/tests-phandle.dtsi @@ -39,7 +39,9 @@ phandle-list-bad-args = <&provider2 1 0>, <&provider3 0>; empty-property; + string-property = "foobar"; unterminated-string = [40 41 42 43]; + unterminated-string-list = "first", "second", [40 41 42 43]; }; }; }; diff --git a/include/linux/of.h b/include/linux/of.h index 6545e7aec7bb..29f0adc5f3e4 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -267,14 +267,12 @@ extern int of_property_read_u64(const struct device_node *np, extern int of_property_read_string(struct device_node *np, const char *propname, const char **out_string); -extern int of_property_read_string_index(struct device_node *np, - const char *propname, - int index, const char **output); extern int of_property_match_string(struct device_node *np, const char *propname, const char *string); -extern int of_property_count_strings(struct device_node *np, - const char *propname); +extern int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index); extern int of_device_is_compatible(const struct device_node *device, const char *); extern int of_device_is_available(const struct device_node *device); @@ -486,15 +484,9 @@ static inline int of_property_read_string(struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string_index(struct device_node *np, - const char *propname, int index, - const char **out_string) -{ - return -ENOSYS; -} - -static inline int of_property_count_strings(struct device_node *np, - const char *propname) +static inline int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) { return -ENOSYS; } @@ -667,6 +659,70 @@ static inline int of_property_count_u64_elems(const struct device_node *np, return of_property_count_elems_of_size(np, propname, sizeof(u64)); } +/** + * of_property_read_string_array() - Read an array of strings from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * + * Search for a property in a device tree node and retrieve a list of + * terminated string values (pointer to data, not a copy) in that property. + * + * If @out_strs is NULL, the number of strings in the property is returned. + */ +static inline int of_property_read_string_array(struct device_node *np, + const char *propname, const char **out_strs, + size_t sz) +{ + return of_property_read_string_helper(np, propname, out_strs, sz, 0); +} + +/** + * of_property_count_strings() - Find and return the number of strings from a + * multiple strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * + * Search for a property in a device tree node and retrieve the number of null + * terminated string contain in it. Returns the number of strings on + * success, -EINVAL if the property does not exist, -ENODATA if property + * does not have a value, and -EILSEQ if the string is not null-terminated + * within the length of the property data. + */ +static inline int of_property_count_strings(struct device_node *np, + const char *propname) +{ + return of_property_read_string_helper(np, propname, NULL, 0, 0); +} + +/** + * of_property_read_string_index() - Find and read a string from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @index: index of the string in the list of strings + * @out_string: pointer to null terminated return string, modified only if + * return value is 0. + * + * Search for a property in a device tree node and retrieve a null + * terminated string value (pointer to data, not a copy) in the list of strings + * contained in that property. + * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if + * property does not have a value, and -EILSEQ if the string is not + * null-terminated within the length of the property data. + * + * The out_string pointer is modified only if a valid string can be decoded. + */ +static inline int of_property_read_string_index(struct device_node *np, + const char *propname, + int index, const char **output) +{ + int rc = of_property_read_string_helper(np, propname, output, 1, index); + return rc < 0 ? rc : 0; +} + /** * of_property_read_bool - Findfrom a property * @np: device node from which the property value is to be read. -- cgit From 32f638fc11db0526c706454d9ab4339d55ac89f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Oct 2014 10:17:25 -0600 Subject: PCI: Don't oops on virtual buses in acpi_pci_get_bridge_handle() acpi_pci_get_bridge_handle() returns the ACPI handle for the bridge device (either a host bridge or a PCI-to-PCI bridge) leading to a PCI bus. But SR-IOV virtual functions can be on a virtual bus with no bridge leading to it. Return a NULL acpi_handle in this case instead of trying to dereference the NULL pointer to the bridge. This fixes a NULL pointer dereference oops in pci_get_hp_params() when adding SR-IOV VF devices on virtual buses. [bhelgaas: changelog, add comment in code] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Link: https://bugzilla.kernel.org/show_bug.cgi?id=87591 Reported-by: Chao Zhou Reported-by: Joerg Roedel Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- include/linux/pci-acpi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 64dacb7288a6..24c7728ca681 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -41,8 +41,13 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) if (pci_is_root_bus(pbus)) dev = pbus->bridge; - else + else { + /* If pbus is a virtual bus, there is no bridge to it */ + if (!pbus->self) + return NULL; + dev = &pbus->self->dev; + } return ACPI_HANDLE(dev); } -- cgit