From 8ce953aa39e2bfd66036a27abdf761c2cb93f02c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 Feb 2014 09:46:18 +0100 Subject: drbd: silence -Wmissing-prototypes warnings Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- lib/lru_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 4a83ecd03650..6111cd19762d 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -169,7 +169,7 @@ out_fail: return NULL; } -void lc_free_by_index(struct lru_cache *lc, unsigned i) +static void lc_free_by_index(struct lru_cache *lc, unsigned i) { void *p = lc->lc_element[i]; WARN_ON(!p); -- cgit From 54e6fc38e888a54b016e1e04e1eceea78ddf7ace Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 May 2014 13:39:35 +0200 Subject: drbd: debugfs: add per volume oldest_requests Show oldest requests * pending master bio completion and, * if different, local disk bio completion. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_debugfs.c | 153 ++++++++++++++++++++++++++++++++++++-- lib/lru_cache.c | 21 +++--- 2 files changed, 160 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 230d9e1fc85c..45b52056f88d 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -434,12 +434,10 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo goto out; /* serialize with d_delete() */ mutex_lock(&parent->d_inode->i_mutex); - if (!debugfs_positive(file->f_dentry)) - goto out_unlock; /* Make sure the object is still alive */ - if (kref_get_unless_zero(kref)) + if (debugfs_positive(file->f_dentry) + && kref_get_unless_zero(kref)) ret = 0; -out_unlock: mutex_unlock(&parent->d_inode->i_mutex); if (!ret) { ret = single_open(file, show, data); @@ -592,6 +590,53 @@ static const struct file_operations connection_callback_history_fops = { .release = callback_history_release, }; +static int connection_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + spin_lock_irq(&connection->resource->req_lock); + r1 = connection->req_next; + if (r1) + seq_print_minor_vnr_req(m, r1, now); + r2 = connection->req_ack_pending; + if (r2 && r2 != r1) { + r1 = r2; + seq_print_minor_vnr_req(m, r1, now); + } + r2 = connection->req_not_net_done; + if (r2 && r2 != r1) + seq_print_minor_vnr_req(m, r2, now); + spin_unlock_irq(&connection->resource->req_lock); + return 0; +} + +static int connection_oldest_requests_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, connection_oldest_requests_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int connection_oldest_requests_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_oldest_requests_fops = { + .owner = THIS_MODULE, + .open = connection_oldest_requests_open, + .read = seq_read, + .llseek = seq_lseek, + .release = connection_oldest_requests_release, +}; + void 
drbd_debugfs_connection_add(struct drbd_connection *connection) { struct dentry *conns_dir = connection->resource->debugfs_res_connections; @@ -627,6 +672,89 @@ void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) drbd_debugfs_remove(&connection->debugfs_conn); } +static void resync_dump_detail(struct seq_file *m, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(m, "%5d %s %s %s\n", bme->rs_left, + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", + test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" + ); +} + +static int device_resync_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->resync); + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); + put_ldev(device); + } + return 0; +} + +static int device_act_log_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->act_log); + lc_seq_dump_details(m, device->act_log, "", NULL); + put_ldev(device); + } + return 0; +} + +static int device_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_resource *resource = device->resource; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + int i; + + seq_puts(m, RQ_HDR); + spin_lock_irq(&resource->req_lock); + /* WRITE, then READ */ + for (i = 1; i >= 0; --i) { + r1 = list_first_entry_or_null(&device->pending_master_completion[i], + struct drbd_request, req_pending_master_completion); + r2 = list_first_entry_or_null(&device->pending_completion[i], + struct drbd_request, req_pending_local); + if (r1) + seq_print_one_request(m, r1, now); + if (r2 && r2 != r1) + seq_print_one_request(m, r2, now); + } + spin_unlock_irq(&resource->req_lock); + return 0; +} + +#define drbd_debugfs_device_attr(name) \ +static int device_ ## name ## _open(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + return drbd_single_open(file, device_ ## name ## _show, device, \ + &device->kref, drbd_destroy_device); \ +} \ +static int device_ ## name ## _release(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + kref_put(&device->kref, drbd_destroy_device); \ + return single_release(inode, file); \ +} \ +static const struct file_operations device_ ## name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = device_ ## name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = device_ ## name ## _release, \ +}; + +drbd_debugfs_device_attr(oldest_requests) +drbd_debugfs_device_attr(act_log_extents) +drbd_debugfs_device_attr(resync_extents) + void drbd_debugfs_device_add(struct drbd_device *device) { struct dentry *vols_dir = device->resource->debugfs_res_volumes; @@ -650,10 +778,25 @@ void drbd_debugfs_device_add(struct drbd_device *device) if (!slink_name) goto fail; dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + kfree(slink_name); + slink_name = NULL; if (IS_ERR_OR_NULL(dentry)) goto fail; device->debugfs_minor = dentry; - kfree(slink_name); + +#define DCF(name) do { \ + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + device->debugfs_vol, device, \ + &device_ ## name ## _fops); \ + if 
(IS_ERR_OR_NULL(dentry)) \ + goto fail; \ + device->debugfs_vol_ ## name = dentry; \ + } while (0) + + DCF(oldest_requests); + DCF(act_log_extents); + DCF(resync_extents); + return; fail: drbd_debugfs_device_cleanup(device); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 6111cd19762d..852c81e3ba9a 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index) * lc_dump - Dump a complete LRU cache to seq in textual form. * @lc: the lru cache to operate on * @seq: the &struct seq_file pointer to seq_printf into - * @utext: user supplied "heading" or other info + * @utext: user supplied additional "heading" or other info * @detail: function pointer the user may provide to dump further details - * of the object the lc_element is embedded in. + * of the object the lc_element is embedded in. May be NULL. + * Note: a leading space ' ' and trailing newline '\n' is implied. */ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)) @@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext struct lc_element *e; int i; - seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext); + seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext); for (i = 0; i < nr_elements; i++) { e = lc_element_by_index(lc, i); - if (e->lc_number == LC_FREE) { - seq_printf(seq, "\t%2d: FREE\n", i); - } else { - seq_printf(seq, "\t%2d: %4u %4u ", i, - e->lc_number, e->refcnt); + if (e->lc_number != e->lc_new_number) + seq_printf(seq, "\t%5d: %6d %8d %6d ", + i, e->lc_number, e->lc_new_number, e->refcnt); + else + seq_printf(seq, "\t%5d: %6d %-8s %6d ", + i, e->lc_number, "-\"-", e->refcnt); + if (detail) detail(seq, e); - } + seq_putc(seq, '\n'); } } -- cgit From 866ced950bcd54820c3e571229356adc2b2dd72e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jul 2014 20:50:18 +0200 Subject: kbuild: Support split debug info v4 This is an alternative approach to lower the overhead of debug info (as we discussed a few days ago) gcc 4.7+ and newer binutils have a new "split debug info" debug info model where the debug info is only written once into central ".dwo" files. This avoids having to copy it around multiple times, from the object files to the final executable. It lowers the disk space requirements. In addition it defaults to compressed debug data. More details here: http://gcc.gnu.org/wiki/DebugFission This patch adds a new option to enable it. It has to be an option, because it'll undoubtedly break everyone's debuginfo packaging scheme. gdb/objdump/etc. all still work, if you have new enough versions. I don't see big compile wins (maybe a second or two faster or so), but the object dirs with debuginfo get significantly smaller. My standard kernel config (slightly bigger than defconfig) shrinks from 2.9G disk space to 1.1G objdir (with non reduced debuginfo). I presume if you are IO limited the compile time difference will be larger. Only problem I've seen so far is that it doesn't play well with older versions of ccache (apparently fixed, see https://bugzilla.samba.org/show_bug.cgi?id=10005) v2: various fixes from Dirk Gouders. Improve commit message slightly. v3: Fix clean rules and improve Kconfig slightly v4: Fix merge error in last version (Sam Ravnborg) Clarify description that it mainly helps disk size. 
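[Editor's note: as a quick way to see the effect outside the kernel build, one can compile a trivial translation unit both ways. A minimal sketch; the file name is hypothetical and the invocations follow the GCC DebugFission wiki linked above:]

```c
/*
 * split_dwarf_demo.c -- hypothetical test file, not part of the patch.
 *
 * Build it both ways and compare the results:
 *
 *   gcc -g -c split_dwarf_demo.c             # DWARF embedded in the .o
 *   gcc -gsplit-dwarf -c split_dwarf_demo.c  # DWARF in split_dwarf_demo.dwo
 *
 * With -gsplit-dwarf the object file keeps only a small skeleton, so the
 * final link no longer copies the bulk of the debug data around; a recent
 * gdb loads the .dwo via the path recorded at compile time.
 */
static int square(int x)
{
	return x * x;
}

int main(void)
{
	return square(6) == 36 ? 0 : 1;
}
```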
Cc: Dirk Gouders Signed-off-by: Andi Kleen Acked-by: Sam Ravnborg Signed-off-by: Michal Marek --- .gitignore | 1 + Makefile | 5 +++++ lib/Kconfig.debug | 15 +++++++++++++++ 3 files changed, 21 insertions(+) (limited to 'lib') diff --git a/.gitignore b/.gitignore index f4c0b091dcf4..e213b27f3921 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ *.gcno modules.builtin Module.symvers +*.dwo # # Top-level generic files diff --git a/Makefile b/Makefile index 89fe23a0825c..e5a269465f99 100644 --- a/Makefile +++ b/Makefile @@ -684,7 +684,11 @@ endif endif ifdef CONFIG_DEBUG_INFO +ifdef CONFIG_DEBUG_INFO_SPLIT +KBUILD_CFLAGS += $(call cc-option, -gsplit-dwarf, -g) +else KBUILD_CFLAGS += -g +endif KBUILD_AFLAGS += -Wa,-gdwarf-2 endif @@ -1372,6 +1376,7 @@ clean: $(clean-dirs) @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '*.ko.*' \ + -o -name '*.dwo' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7cfcc1b8e101..910355da9d14 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -143,6 +143,21 @@ config DEBUG_INFO_REDUCED DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions. +config DEBUG_INFO_SPLIT + bool "Produce split debuginfo in .dwo files" + depends on DEBUG_INFO + help + Generate debug info into separate .dwo files. This significantly + reduces the build directory size for builds with DEBUG_INFO, + because it stores the information only once on disk in .dwo + files instead of multiple times in object files and executables. + In addition the debug information is also compressed. + + Requires recent gcc (4.7+) and recent gdb/binutils. + Any tool that packages or reads debug information would need + to know about the .dwo files and include them. + Incompatible with older versions of ccache. + config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" default y -- cgit From bfaf2dd3509bc73bf4a4cea0e72472755ed860e2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jul 2014 20:50:19 +0200 Subject: Kbuild: Add an option to enable dwarf4 v2 I found that a lot of unresolvable variables when using gdb on the kernel become resolvable when dwarf4 is enabled. So add a Kconfig flag to enable it. It definitely increases the debug information size, but on the other hand this isn't so bad when debug fission (split debug info) is used. v2: Use cc-option Signed-off-by: Andi Kleen Acked-by: Sam Ravnborg Signed-off-by: Michal Marek --- Makefile | 3 +++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'lib') diff --git a/Makefile b/Makefile index e5a269465f99..2fd21a58f5cf 100644 --- a/Makefile +++ b/Makefile @@ -691,6 +691,9 @@ KBUILD_CFLAGS += -g endif KBUILD_AFLAGS += -Wa,-gdwarf-2 endif +ifdef CONFIG_DEBUG_INFO_DWARF4 +KBUILD_CFLAGS += $(call cc-option, -gdwarf-4,) +endif ifdef CONFIG_DEBUG_INFO_REDUCED KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 910355da9d14..d70f203ce710 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -158,6 +158,15 @@ config DEBUG_INFO_SPLIT to know about the .dwo files and include them. Incompatible with older versions of ccache. +config DEBUG_INFO_DWARF4 + bool "Generate dwarf4 debuginfo" + depends on DEBUG_INFO + help + Generate dwarf4 debug info.
This requires recent versions + of gcc and gdb. It makes the debug information larger. + But it significantly improves the success of resolving + variables in gdb on optimized code. + config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" default y -- cgit From 42a9dc0b3d0f749375c767c7d5cab56e89160576 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:01 -0700 Subject: printk: rename DEFAULT_MESSAGE_LOGLEVEL Commit a8fe19ebfbfd ("kernel/printk: use symbolic defines for console loglevels") makes consistent use of symbolic values for printk() log levels. The naming scheme used is different from the one used for DEFAULT_MESSAGE_LOGLEVEL though. Change that symbol name to be MESSAGE_LOGLEVEL_DEFAULT for consistency. And because the value of that symbol comes from a similarly-named config option, rename CONFIG_DEFAULT_MESSAGE_LOGLEVEL as well. Signed-off-by: Alex Elder Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 +- kernel/printk/printk.c | 2 +- lib/Kconfig.debug | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/printk.h b/include/linux/printk.h index 319ff7e53efb..0990997a5304 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,7 +31,7 @@ static inline const char *printk_skip_level(const char *buffer) } /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL +#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ec3bfb0b1f62..770ed4821ba9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -56,7 +56,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cfe7df8f62cc..cb45f59685e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,7 +15,7 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt -config DEFAULT_MESSAGE_LOGLEVEL +config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" -- cgit From 0f9859ca92c9182bcb8f18c55cae1a04627cbb59 Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:18 -0700 Subject: klist: use same naming scheme as hlist for klist_add_after() The name was modified from hlist_add_after() to hlist_add_behind() when adjusting the order of arguments to match the one with klist_add_after(). This is necessary to break old code when it would use it the wrong way. Make klist follow this naming scheme for consistency. Signed-off-by: Ken Helias Cc: "Paul E. 
McKenney" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/klist.h | 2 +- lib/klist.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/klist.h b/include/linux/klist.h index a370ce57cf1d..61e5b723ae73 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -44,7 +44,7 @@ struct klist_node { extern void klist_add_tail(struct klist_node *n, struct klist *k); extern void klist_add_head(struct klist_node *n, struct klist *k); -extern void klist_add_after(struct klist_node *n, struct klist_node *pos); +extern void klist_add_behind(struct klist_node *n, struct klist_node *pos); extern void klist_add_before(struct klist_node *n, struct klist_node *pos); extern void klist_del(struct klist_node *n); diff --git a/lib/klist.c b/lib/klist.c index 358a368a2947..89b485a2a58d 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -140,11 +140,11 @@ void klist_add_tail(struct klist_node *n, struct klist *k) EXPORT_SYMBOL_GPL(klist_add_tail); /** - * klist_add_after - Init a klist_node and add it after an existing node + * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ -void klist_add_after(struct klist_node *n, struct klist_node *pos) +void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); @@ -153,7 +153,7 @@ void klist_add_after(struct klist_node *n, struct klist_node *pos) list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } -EXPORT_SYMBOL_GPL(klist_add_after); +EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node -- cgit From 62e7ca5280fd8cbf523970757e13f0324ce0daa0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:09:21 -0700 Subject: zlib: clean up some dead code Cleanup unused `if 0'-ed functions, which have been dead since 2006 (commits 87c2ce3b9305 ("lib/zlib*: cleanups") by Adrian Bunk and 4f3865fb57a0 ("zlib_inflate: Upgrade library code to a recent version") by Richard Purdie): - zlib_deflateSetDictionary - zlib_deflateParams - zlib_deflateCopy - zlib_inflateSync - zlib_syncsearch - zlib_inflateSetDictionary - zlib_inflatePrime Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zlib.h | 118 ------------------------------------- lib/zlib_deflate/deflate.c | 143 --------------------------------------------- lib/zlib_inflate/inflate.c | 132 ----------------------------------------- 3 files changed, 393 deletions(-) (limited to 'lib') diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 9c5a6b4de0a3..197abb2a54c5 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -493,64 +493,6 @@ extern int deflateInit2 (z_streamp strm, method). msg is set to null if there is no error message. deflateInit2 does not perform any compression: this will be done by deflate(). */ - -#if 0 -extern int zlib_deflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -#endif -/* - Initializes the compression dictionary from the given byte sequence - without producing any compressed output. This function must be called - immediately after deflateInit, deflateInit2 or deflateReset, before any - call of deflate. The compressor and decompressor must use exactly the same - dictionary (see inflateSetDictionary). 
- - The dictionary should consist of strings (byte sequences) that are likely - to be encountered later in the data to be compressed, with the most commonly - used strings preferably put towards the end of the dictionary. Using a - dictionary is most useful when the data to be compressed is short and can be - predicted with good accuracy; the data can then be compressed better than - with the default empty dictionary. - - Depending on the size of the compression data structures selected by - deflateInit or deflateInit2, a part of the dictionary may in effect be - discarded, for example if the dictionary is larger than the window size in - deflate or deflate2. Thus the strings most likely to be useful should be - put at the end of the dictionary, not at the front. - - Upon return of this function, strm->adler is set to the Adler32 value - of the dictionary; the decompressor may later use this value to determine - which dictionary has been used by the compressor. (The Adler32 value - applies to the whole dictionary even if only a subset of the dictionary is - actually used by the compressor.) - - deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent (for example if deflate has already been called for this stream - or if the compression method is bsort). deflateSetDictionary does not - perform any compression: this will be done by deflate(). -*/ - -#if 0 -extern int zlib_deflateCopy (z_streamp dest, z_streamp source); -#endif - -/* - Sets the destination stream as a complete copy of the source stream. - - This function can be useful when several compression strategies will be - tried, for example when there are several ways of pre-processing the input - data with a filter. The streams that will be discarded should then be freed - by calling deflateEnd. Note that deflateCopy duplicates the internal - compression state which can be quite large, so this strategy is slow and - can consume lots of memory. - - deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not - enough memory, Z_STREAM_ERROR if the source stream state was inconsistent - (such as zalloc being NULL). msg is left unchanged in both source and - destination. -*/ extern int zlib_deflateReset (z_streamp strm); /* @@ -568,27 +510,6 @@ static inline unsigned long deflateBound(unsigned long s) return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11; } -#if 0 -extern int zlib_deflateParams (z_streamp strm, int level, int strategy); -#endif -/* - Dynamically update the compression level and compression strategy. The - interpretation of level and strategy is as in deflateInit2. This can be - used to switch between compression and straight copy of the input data, or - to switch to a different kind of input data requiring a different - strategy. If the compression level is changed, the input available so far - is compressed with the old level (and may be flushed); the new level will - take effect only at the next call of deflate(). - - Before the call of deflateParams, the stream state must be set as for - a call of deflate(), since the currently available input may have to - be compressed and flushed. In particular, strm->avail_out must be non-zero. - - deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source - stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR - if strm->avail_out was zero. 
-*/ - /* extern int inflateInit2 (z_streamp strm, int windowBits); @@ -631,45 +552,6 @@ extern int inflateInit2 (z_streamp strm, int windowBits); and avail_out are unchanged.) */ -extern int zlib_inflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -/* - Initializes the decompression dictionary from the given uncompressed byte - sequence. This function must be called immediately after a call of inflate, - if that call returned Z_NEED_DICT. The dictionary chosen by the compressor - can be determined from the adler32 value returned by that call of inflate. - The compressor and decompressor must use exactly the same dictionary (see - deflateSetDictionary). For raw inflate, this function can be called - immediately after inflateInit2() or inflateReset() and before any call of - inflate() to set the dictionary. The application must insure that the - dictionary that was used for compression is provided. - - inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the - expected one (incorrect adler32 value). inflateSetDictionary does not - perform any decompression: this will be done by subsequent calls of - inflate(). -*/ - -#if 0 -extern int zlib_inflateSync (z_streamp strm); -#endif -/* - Skips invalid compressed data until a full flush point (see above the - description of deflate with Z_FULL_FLUSH) can be found, or until all - available input is skipped. No output is provided. - - inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR - if no more input was provided, Z_DATA_ERROR if no flush point has been found, - or Z_STREAM_ERROR if the stream structure was inconsistent. In the success - case, the application may save the current current value of total_in which - indicates where valid compressed data was found. In the error case, the - application may repeatedly call inflateSync, providing more input each time, - until success or end of the input data. -*/ - extern int zlib_inflateReset (z_streamp strm); /* This function is equivalent to inflateEnd followed by inflateInit, diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index d63381e8e333..d20ef458f137 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -249,52 +249,6 @@ int zlib_deflateInit2( return zlib_deflateReset(strm); } -/* ========================================================================= */ -#if 0 -int zlib_deflateSetDictionary( - z_streamp strm, - const Byte *dictionary, - uInt dictLength -) -{ - deflate_state *s; - uInt length = dictLength; - uInt n; - IPos hash_head = 0; - - if (strm == NULL || strm->state == NULL || dictionary == NULL) - return Z_STREAM_ERROR; - - s = (deflate_state *) strm->state; - if (s->status != INIT_STATE) return Z_STREAM_ERROR; - - strm->adler = zlib_adler32(strm->adler, dictionary, dictLength); - - if (length < MIN_MATCH) return Z_OK; - if (length > MAX_DIST(s)) { - length = MAX_DIST(s); -#ifndef USE_DICT_HEAD - dictionary += dictLength - length; /* use the tail of the dictionary */ -#endif - } - memcpy((char *)s->window, dictionary, length); - s->strstart = length; - s->block_start = (long)length; - - /* Insert all strings in the hash table (except for the last two bytes). - * s->lookahead stays null, so s->ins_h will be recomputed at the next - * call of fill_window. 
- */ - s->ins_h = s->window[0]; - UPDATE_HASH(s, s->ins_h, s->window[1]); - for (n = 0; n <= length - MIN_MATCH; n++) { - INSERT_STRING(s, n, hash_head); - } - if (hash_head) hash_head = 0; /* to make compiler happy */ - return Z_OK; -} -#endif /* 0 */ - /* ========================================================================= */ int zlib_deflateReset( z_streamp strm @@ -326,45 +280,6 @@ int zlib_deflateReset( return Z_OK; } -/* ========================================================================= */ -#if 0 -int zlib_deflateParams( - z_streamp strm, - int level, - int strategy -) -{ - deflate_state *s; - compress_func func; - int err = Z_OK; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - s = (deflate_state *) strm->state; - - if (level == Z_DEFAULT_COMPRESSION) { - level = 6; - } - if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { - return Z_STREAM_ERROR; - } - func = configuration_table[s->level].func; - - if (func != configuration_table[level].func && strm->total_in != 0) { - /* Flush the last buffer: */ - err = zlib_deflate(strm, Z_PARTIAL_FLUSH); - } - if (s->level != level) { - s->level = level; - s->max_lazy_match = configuration_table[level].max_lazy; - s->good_match = configuration_table[level].good_length; - s->nice_match = configuration_table[level].nice_length; - s->max_chain_length = configuration_table[level].max_chain; - } - s->strategy = strategy; - return err; -} -#endif /* 0 */ - /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in @@ -568,64 +483,6 @@ int zlib_deflateEnd( return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } -/* ========================================================================= - * Copy the source state to the destination state. - */ -#if 0 -int zlib_deflateCopy ( - z_streamp dest, - z_streamp source -) -{ -#ifdef MAXSEG_64K - return Z_STREAM_ERROR; -#else - deflate_state *ds; - deflate_state *ss; - ush *overlay; - deflate_workspace *mem; - - - if (source == NULL || dest == NULL || source->state == NULL) { - return Z_STREAM_ERROR; - } - - ss = (deflate_state *) source->state; - - *dest = *source; - - mem = (deflate_workspace *) dest->workspace; - - ds = &(mem->deflate_memory); - - dest->state = (struct internal_state *) ds; - *ds = *ss; - ds->strm = dest; - - ds->window = (Byte *) mem->window_memory; - ds->prev = (Pos *) mem->prev_memory; - ds->head = (Pos *) mem->head_memory; - overlay = (ush *) mem->overlay_memory; - ds->pending_buf = (uch *) overlay; - - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); - memcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); - memcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); - - ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); - ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); - ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; - - ds->l_desc.dyn_tree = ds->dyn_ltree; - ds->d_desc.dyn_tree = ds->dyn_dtree; - ds->bl_desc.dyn_tree = ds->bl_tree; - - return Z_OK; -#endif -} -#endif /* 0 */ - /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. 
All deflate() input goes through diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index f5ce87b0800e..58a733b10387 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -45,21 +45,6 @@ int zlib_inflateReset(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflatePrime(z_streamp strm, int bits, int value) -{ - struct inflate_state *state; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; - value &= (1L << bits) - 1; - state->hold += value << state->bits; - state->bits += bits; - return Z_OK; -} -#endif - int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; @@ -761,123 +746,6 @@ int zlib_inflateEnd(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary, - uInt dictLength) -{ - struct inflate_state *state; - unsigned long id; - - /* check state */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->wrap != 0 && state->mode != DICT) - return Z_STREAM_ERROR; - - /* check for correct dictionary id */ - if (state->mode == DICT) { - id = zlib_adler32(0L, NULL, 0); - id = zlib_adler32(id, dictionary, dictLength); - if (id != state->check) - return Z_DATA_ERROR; - } - - /* copy dictionary to window */ - zlib_updatewindow(strm, strm->avail_out); - - if (dictLength > state->wsize) { - memcpy(state->window, dictionary + dictLength - state->wsize, - state->wsize); - state->whave = state->wsize; - } - else { - memcpy(state->window + state->wsize - dictLength, dictionary, - dictLength); - state->whave = dictLength; - } - state->havedict = 1; - return Z_OK; -} -#endif - -#if 0 -/* - Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found - or when out of input. When called, *have is the number of pattern bytes - found in order so far, in 0..3. On return *have is updated to the new - state. If on return *have equals four, then the pattern was found and the - return value is how many bytes were read including the last byte of the - pattern. If *have is less than four, then the pattern has not been found - yet and the return value is len. In the latter case, zlib_syncsearch() can be - called again with more data and the *have state. *have is initialized to - zero for the first call. - */ -static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf, - unsigned len) -{ - unsigned got; - unsigned next; - - got = *have; - next = 0; - while (next < len && got < 4) { - if ((int)(buf[next]) == (got < 2 ? 
0 : 0xff)) - got++; - else if (buf[next]) - got = 0; - else - got = 4 - got; - next++; - } - *have = got; - return next; -} -#endif - -#if 0 -int zlib_inflateSync(z_streamp strm) -{ - unsigned len; /* number of bytes to look at or looked at */ - unsigned long in, out; /* temporary to save total_in and total_out */ - unsigned char buf[4]; /* to restore bit buffer to byte string */ - struct inflate_state *state; - - /* check parameters */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; - - /* if first time, start search in bit buffer */ - if (state->mode != SYNC) { - state->mode = SYNC; - state->hold <<= state->bits & 7; - state->bits -= state->bits & 7; - len = 0; - while (state->bits >= 8) { - buf[len++] = (unsigned char)(state->hold); - state->hold >>= 8; - state->bits -= 8; - } - state->have = 0; - zlib_syncsearch(&(state->have), buf, len); - } - - /* search available input */ - len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in); - strm->avail_in -= len; - strm->next_in += len; - strm->total_in += len; - - /* return no joy or set up to restart inflate() on a new block */ - if (state->have != 4) return Z_DATA_ERROR; - in = strm->total_in; out = strm->total_out; - zlib_inflateReset(strm); - strm->total_in = in; strm->total_out = out; - state->mode = TYPE; - return Z_OK; -} -#endif - /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; -- cgit From b01250856b25f4417c51aa33afc451fbf7da1484 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 6 Aug 2014 16:09:23 -0700 Subject: lib: add lib/glob.c This is a helper function from drivers/ata/libata-core.c, where it is used to blacklist particular device models. It's being moved to lib/ so other drivers may use it for the same purpose. This implementation is non-recursive, so it is safe for the kernel stack. [akpm@linux-foundation.org: fix sparse warning] Signed-off-by: George Spelvin Cc: Randy Dunlap Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/glob.h | 9 ++++ lib/Kconfig | 19 ++++++++ lib/Makefile | 2 + lib/glob.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 include/linux/glob.h create mode 100644 lib/glob.c (limited to 'lib') diff --git a/include/linux/glob.h b/include/linux/glob.h new file mode 100644 index 000000000000..861d8347d08e --- /dev/null +++ b/include/linux/glob.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#include <linux/types.h> /* For bool */ +#include <linux/compiler.h> /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); + +#endif /* _LINUX_GLOB_H */ diff --git a/lib/Kconfig b/lib/Kconfig index a8a775730c09..41bfeec72e40 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -396,6 +396,25 @@ config CPU_RMAP config DQL bool +config GLOB + bool +# This actually supports modular compilation, but the module overhead +# is ridiculous for the amount of code involved. Until an out-of-tree +# driver asks for it, we'll just link it directly it into the kernel +# when required. Since we're ignoring out-of-tree users, there's also +# no need bother prompting for a manual decision: +# prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching.
It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of tree driver which tells you that it + depends on this. + # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/Makefile b/lib/Makefile index 8427df95dade..d6b4bc496408 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -137,6 +137,8 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o +obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o diff --git a/lib/glob.c b/lib/glob.c new file mode 100644 index 000000000000..0ba3ea86b546 --- /dev/null +++ b/lib/glob.c @@ -0,0 +1,123 @@ +#include +#include + +/* + * The only reason this code can be compiled as a module is because the + * ATA code that depends on it can be as well. In practice, they're + * both usually compiled in and the module overhead goes away. + */ +MODULE_DESCRIPTION("glob(7) matching"); +MODULE_LICENSE("Dual MIT/GPL"); + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. 
+ */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); -- cgit From 5f9be8248d9802730185f98f4db41285dc00283c Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 6 Aug 2014 16:09:25 -0700 Subject: lib/glob.c: add CONFIG_GLOB_SELFTEST This was useful during development, and is retained for future regression testing. GCC appears to have no way to place string literals in a particular section; adding __initconst to a char pointer leaves the string itself in the default string section, where it will not be thrown away after module load. Thus all string constants are kept in explicitly declared and named arrays. Sorry this makes printk a bit harder to read. At least the tests are more compact. Signed-off-by: George Spelvin Cc: Randy Dunlap Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 14 ++++++ lib/glob.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index 41bfeec72e40..df872659ddd3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -415,6 +415,20 @@ config GLOB are compiling an out-of tree driver which tells you that it depends on this. +config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. 
+ # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/glob.c b/lib/glob.c index 0ba3ea86b546..500fc80d23e1 100644 --- a/lib/glob.c +++ b/lib/glob.c @@ -121,3 +121,167 @@ backtrack: } } EXPORT_SYMBOL(glob_match); + + +#ifdef CONFIG_GLOB_SELFTEST + +#include +#include + +/* Boot with "glob.verbose=1" to show successful tests, too */ +static bool verbose = false; +module_param(verbose, bool, 0); + +struct glob_test { + char const *pat, *str; + bool expected; +}; + +static bool __pure __init test(char const *pat, char const *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* Can't get string literals into a particular section, so... */ + static char const msg_error[] __initconst = + KERN_ERR "glob: \"%s\" vs. \"%s\": %s *** ERROR ***\n"; + static char const msg_ok[] __initconst = + KERN_DEBUG "glob: \"%s\" vs. \"%s\": %s OK\n"; + static char const mismatch[] __initconst = "mismatch"; + char const *message; + + if (!success) + message = msg_error; + else if (verbose) + message = msg_ok; + else + return success; + + printk(message, pat, str, mismatch + 3*match); + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static char const glob_tests[] __initconst = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" 
"*ab*cd*\0" "abcabcabcabcefg\0"; + +static int __init glob_init(void) +{ + unsigned successes = 0; + unsigned n = 0; + char const *p = glob_tests; + static char const message[] __initconst = + KERN_INFO "glob: %u self-tests passed, %u failed\n"; + + /* + * Tests are jammed together in a string. The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const *pat = p; + + p += strlen(p) + 1; + successes += test(pat, p, expected); + p += strlen(p) + 1; + n++; + } + + n -= successes; + printk(message, successes, n); + + /* What's the errno for "kernel bug detected"? Guess... */ + return n ? -ECANCELED : 0; +} + +/* We need a dummy exit function to allow unload */ +static void __exit glob_fini(void) { } + +module_init(glob_init); +module_exit(glob_fini); + +#endif /* CONFIG_GLOB_SELFTEST */ -- cgit From e004f3c7780de32fa822f292ebadd985bcadb1e0 Mon Sep 17 00:00:00 2001 From: Gui Hecheng Date: Wed, 6 Aug 2014 16:09:29 -0700 Subject: lib/cmdline.c: add size unit t/p/e to memparse For modern filesystems such as btrfs, t/p/e size level operations are common. add size unit t/p/e parsing to memparse Signed-off-by: Gui Hecheng Acked-by: David Rientjes Reviewed-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/cmdline.c b/lib/cmdline.c index d4932f745e92..76a712e6e20e 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -121,11 +121,7 @@ EXPORT_SYMBOL(get_options); * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with %K (for kilobytes, or 1024 bytes), - * %M (for megabytes, or 1048576 bytes), or %G (for gigabytes, or - * 1073741824). If the number is suffixed with K, M, or G, then - * the return value is the number multiplied by one kilobyte, one - * megabyte, or one gigabyte, respectively. + * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) @@ -135,6 +131,15 @@ unsigned long long memparse(const char *ptr, char **retptr) unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; case 'G': case 'g': ret <<= 10; -- cgit From 142cda5dbcb0dc3738c079f591290d6384261c73 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 6 Aug 2014 16:09:31 -0700 Subject: lib/string_helpers.c: constify static arrays Complement commit 68aecfb97978 ("lib/string_helpers.c: make arrays static") by making the arrays const -- not only pointing to const strings. 
This moves them out of the data section to the r/o data section: text data bss dec hex filename 1150 176 0 1326 52e lib/string_helpers.old.o 1326 0 0 1326 52e lib/string_helpers.new.o Signed-off-by: Mathias Krause Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string_helpers.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ed5c1454dd62..29033f319aea 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -25,12 +25,15 @@ int string_get_size(u64 size, const enum string_size_units units, char *buf, int len) { - static const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", - "EB", "ZB", "YB", NULL}; - static const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB", NULL }; - static const char **units_str[] = { - [STRING_UNITS_10] = units_10, + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", + NULL + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { -- cgit From 129965a916a8b659d7bb3b59877a3e8259bed5e3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:09:33 -0700 Subject: lib/test-kstrtox.c: use ARRAY_SIZE instead of sizeof/sizeof[0] Use kernel.h definition. Signed-off-by: Fabian Frederick Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test-kstrtox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c index bea3f3fa3f02..4137bca5f8e8 100644 --- a/lib/test-kstrtox.c +++ b/lib/test-kstrtox.c @@ -3,7 +3,7 @@ #include #define for_each_test(i, test) \ - for (i = 0; i < sizeof(test) / sizeof(test[0]); i++) + for (i = 0; i < ARRAY_SIZE(test); i++) struct test_fail { const char *str; -- cgit From 27d555d101c820ac4b1962680bd0192993c6e4e0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:38 -0700 Subject: lib: list_sort_test(): return -ENOMEM when allocation fails Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index 1183fa70a44d..291412ade89a 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -207,7 +207,7 @@ static int __init cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count = 1, err = -EINVAL; + int i, count = 1, err = -ENOMEM; struct debug_el *el; struct list_head *cur, *tmp; LIST_HEAD(head); @@ -239,6 +239,7 @@ static int __init list_sort_test(void) list_sort(NULL, &head, cmp); + err = -EINVAL; for (cur = head.next; cur->next != &head; cur = cur->next) { struct debug_el *el1; int cmp_result; -- cgit From 9d418dcc6d15539a9567b2ad7fe7473648989f44 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:40 -0700 Subject: lib: list_sort_test(): add extra corruption check Add a check to make sure that the prev pointer of the list head points to the last element on the list. 
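[Editor's note: the invariant this check enforces is simple to state in code. A minimal sketch of a checker for a circular doubly-linked list; the helper name is illustrative, not part of the patch:]

```c
#include <linux/types.h>
#include <linux/list.h>

/*
 * Return true if every ->next/->prev pair on a circular list is
 * consistent. Starting the walk at @head also covers the case the
 * patch adds: when cur->next == head, the check below has just
 * verified that head->prev points at the last element.
 */
static bool list_backlinks_ok(const struct list_head *head)
{
	const struct list_head *cur = head;

	for (;;) {
		if (cur->next->prev != cur)
			return false;
		if (cur->next == head)
			return true;	/* head->prev == last verified */
		cur = cur->next;
	}
}
```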
Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index 291412ade89a..fbdbc867b252 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -272,6 +272,11 @@ static int __init list_sort_test(void) } count++; } + if (head.prev != cur) { + printk(KERN_ERR "list_sort_test: error: list is corrupted\n"); + goto exit; + } + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort_test: error: bad list length %d", -- cgit From 694123031d12458a343492528fa40113e5ec843e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:42 -0700 Subject: lib: list_sort_test(): simplify and harden cleanup There is no reason to maintain the list structure while freeing the debug elements. Aside from the redundant pointer manipulations, it is also inefficient from a locality-of-reference viewpoint, since they are visited in a random order (wrt. the order they were allocated). Furthermore, if we jumped to exit: after detecting list corruption, it is actually dangerous. So just free the elements in the order they were allocated, using the backing array elts. Allocate that using kcalloc(), so that if allocation of one of the debug element fails, we just end up calling kfree(NULL) for the trailing elements. Minor details: Use sizeof(*elts) instead of sizeof(void *), and return err immediately when allocation of elts fails, to avoid introducing another label just before the final return statement. Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index fbdbc867b252..a34c78c30d56 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -209,16 +209,16 @@ static int __init list_sort_test(void) { int i, count = 1, err = -ENOMEM; struct debug_el *el; - struct list_head *cur, *tmp; + struct list_head *cur; LIST_HEAD(head); printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); - elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { printk(KERN_ERR "list_sort_test: error: cannot allocate " "memory\n"); - goto exit; + return err; } for (i = 0; i < TEST_LIST_LEN; i++) { @@ -286,11 +286,9 @@ static int __init list_sort_test(void) err = 0; exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); kfree(elts); - list_for_each_safe(cur, tmp, &head) { - list_del(cur); - kfree(container_of(cur, struct debug_el, list)); - } return err; } module_init(list_sort_test); -- cgit From 61b3d6c48f059bb054b0019088736dab6c2ac0ec Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:44 -0700 Subject: lib: list_sort.c: Limit number of unused cmp callbacks The helper merge_and_restore_back_links() makes sure to call the caller's cmp function during the final ->prev pointer fixup, so that the cmp function may call cond_resched(). However, if the cmp function does not call cond_resched() at all, this is entirely redundant. If it does, doing at least two function calls for every two pointer assignments is a bit excessive. This patch limits the calls to once for every 256 iterations. 
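[Editor's note: the wrap-around trick deserves spelling out: `count` is a `u8`, so `++count` becomes zero exactly once every 256 increments, and the common case is a single test. A hedged re-statement of the loop follows; it is illustrative, not the patched kernel code, and where it calls cond_resched() directly, list_sort() proper invokes the caller's cmp() so that the *client* may cond_resched():]

```c
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/types.h>

/*
 * Restore ->prev links over a merged run of nodes whose forward chain
 * is already correct and NULL-terminated (as list_sort()'s internal
 * merge produces), yielding at most once per 256 nodes: two pointer
 * writes per node, one callback per 256 nodes.
 */
static void restore_backlinks(struct list_head *head, struct list_head *first)
{
	struct list_head *tail = head;
	struct list_head *cur = first;
	u8 count = 0;

	while (cur) {
		if (unlikely(!(++count)))	/* true every 256th pass */
			cond_resched();
		cur->prev = tail;
		tail = cur;
		cur = cur->next;
	}
	tail->next = head;
	head->prev = tail;
}
```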
Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index a34c78c30d56..6b9fdaf1d32e 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -47,6 +47,7 @@ static void merge_and_restore_back_links(void *priv, struct list_head *a, struct list_head *b) { struct list_head *tail = head; + u8 count = 0; while (a && b) { /* if equal, take 'a' -- important for sort stability */ @@ -70,7 +71,8 @@ static void merge_and_restore_back_links(void *priv, * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - (*cmp)(priv, tail->next, tail->next); + if (unlikely(!(++count))) + (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; -- cgit From d0da23b0debcef135c866cc8117d197fb40a6079 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:09:46 -0700 Subject: lib/list_sort.c: convert to pr_foo Cc: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/list_sort.c b/lib/list_sort.c index 6b9fdaf1d32e..12bcba1c8612 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,3 +1,6 @@ + +#define pr_fmt(fmt) "list_sort_test: " fmt + #include #include #include @@ -125,9 +128,7 @@ void list_sort(void *priv, struct list_head *head, } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list passed to" - " list_sort() too long for" - " efficiency\n"); + printk_once(KERN_DEBUG "list too long for efficiency\n"); lev--; } max_lev = lev; @@ -170,27 +171,25 @@ static struct debug_el **elts __initdata; static int __init check(struct debug_el *ela, struct debug_el *elb) { if (ela->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - ela->serial); + pr_err("error: incorrect serial %d\n", ela->serial); return -EINVAL; } if (elb->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - elb->serial); + pr_err("error: incorrect serial %d\n", elb->serial); return -EINVAL; } if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - printk(KERN_ERR "list_sort_test: error: phantom element\n"); + pr_err("error: phantom element\n"); return -EINVAL; } if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); return -EINVAL; } if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); return -EINVAL; } return 0; @@ -214,20 +213,18 @@ static int __init list_sort_test(void) struct list_head *cur; LIST_HEAD(head); - printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + pr_debug("start testing list_sort()\n"); elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { - printk(KERN_ERR "list_sort_test: error: cannot allocate " - "memory\n"); + pr_err("error: cannot allocate memory\n"); return 
err; } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "list_sort_test: error: cannot " - "allocate memory\n"); + pr_err("error: cannot allocate memory\n"); goto exit; } /* force some equivalencies */ @@ -247,42 +244,38 @@ static int __init list_sort_test(void) int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is " - "corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort_test: error: list is not " - "sorted\n"); + pr_err("error: list is not sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort_test: error: order of " - "equivalent elements not preserved\n"); + pr_err("error: order of equivalent elements not " + "preserved\n"); goto exit; } if (check(el, el1)) { - printk(KERN_ERR "list_sort_test: error: element check " - "failed\n"); + pr_err("error: element check failed\n"); goto exit; } count++; } if (head.prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: bad list length %d", - count); + pr_err("error: bad list length %d", count); goto exit; } -- cgit From 0679cc483669d08153d158273455398a389ee9ca Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:49 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_empty unsigned Many functions in lib/bitmap.c start with an expression such as lim = bits/BITS_PER_LONG. Since bits has type (signed) int, and since gcc cannot know that it is in fact non-negative, it generates worse code than it could. These patches, mostly consisting of changing various parameters to unsigned, give a slight overall code reduction:

add/remove: 1/1 grow/shrink: 8/16 up/down: 251/-414 (-163)
function                                 old    new  delta
tick_device_uses_broadcast               335    425    +90
__irq_alloc_descs                        498    554    +56
__bitmap_andnot                           73    115    +42
__bitmap_and                              70    101    +31
bitmap_weight                              -     11    +11
copy_hugetlb_page_range                  752    762    +10
follow_hugetlb_page                      846    854     +8
hugetlb_init                            1415   1417     +2
hugetlb_nrpages_setup                    130    131     +1
hugetlb_add_hstate                       377    376     -1
bitmap_allocate_region                    82     80     -2
select_task_rq_fair                     2202   2191    -11
hweight_long                              66     55    -11
__reg_op                                 230    219    -11
dm_stats_message                        2849   2833    -16
bitmap_parselist                          92     74    -18
__bitmap_weight                          115     97    -18
__bitmap_subset                          153    129    -24
__bitmap_full                            128    104    -24
__bitmap_empty                           120     96    -24
bitmap_set                               179    149    -30
bitmap_clear                             185    155    -30
__bitmap_equal                           136    105    -31
__bitmap_intersects                      148    108    -40
__bitmap_complement                      109     67    -42
tick_device_setup_broadcast_func.isra     81      -    -81

[The increases in __bitmap_and{,not} are due to bug fixes 17/18,18/18. No idea why bitmap_weight suddenly appears.] While 163 bytes treewide is insignificant, I believe the bitmap functions are often called with locks held, so saving even a few cycles might be worth it. While making these changes, I found a few other things that might be worth including. 16,17,18 are actual bug fixes. The rest shouldn't change the behaviour of any of the functions, provided no-one passed negative nbits values. If something should come up, it should be fairly bisectable.
A few issues I thought about, but didn't know what to do with:

* Many of the functions misbehave if nbits is compile-time 0; the out-of-line functions generally handle 0 correctly. bitmap_fill() is particularly bad, whether the 0 is known at compile time or not. It would probably be nice to add detection of at least compile-time 0 and handle that appropriately.

* I didn't change __bitmap_shift_{left,right} to use unsigned because I want to fully understand why the algorithm works before making that change. However, AFAICT, they behave correctly for all (positive) shift amounts. This is not the case for the small_const_nbits versions. If for example nbits = n = BITS_PER_LONG, the shift operators turn into no-ops (at least on x86), so one gets *dst = *src, whereas one would expect to get *dst = 0. That difference in behaviour is somewhat annoying.

This patch (of 18): The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics.
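The codegen claim is easiest to see on the recurring lim = bits/BITS_PER_LONG pattern; a hedged sketch (generic C, exact instruction counts depend on compiler and target):

    #define BITS_PER_LONG 64

    /* Signed division by a power of two must round toward zero, so the
     * compiler has to emit a sign fix-up before the shift. */
    int lim_signed(int bits)
    {
            return bits / BITS_PER_LONG;
    }

    /* Unsigned division by a power of two is a single logical shift. */
    unsigned int lim_unsigned(unsigned int bits)
    {
            return bits / BITS_PER_LONG;
    }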
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3d3fd6b2f157..bc7e520d3f78 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -89,7 +89,7 @@ */ extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); -extern int __bitmap_full(const unsigned long *bitmap, int bits); +extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, @@ -265,7 +265,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) return __bitmap_empty(src, nbits); } -static inline int bitmap_full(const unsigned long *src, int nbits) +static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index 378911001442..9859f38660f9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -55,9 +55,9 @@ int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_empty); -int __bitmap_full(const unsigned long *bitmap, int bits) +int __bitmap_full(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (~bitmap[k]) return 0; -- cgit From 5e068069319a9fb02fb14337c2cedeae5f16d812 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:53 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_equal unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index bc7e520d3f78..1e0f46c91125 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -91,7 +91,7 @@ extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits); extern void __bitmap_shift_right(unsigned long *dst, diff --git a/lib/bitmap.c b/lib/bitmap.c index 9859f38660f9..d6bb955e71cb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -71,9 +71,9 @@ int __bitmap_full(const unsigned long *bitmap, unsigned int bits) EXPORT_SYMBOL(__bitmap_full); int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return 0; -- cgit From 3d6684f4e6a46f3a8263f5681e093bccbb767a1c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:55 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_complement unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 6 +++--- lib/bitmap.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1e0f46c91125..21fb52ffe444 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -93,7 +93,7 @@ extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - int bits); + unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, @@ -222,7 +222,7 @@ static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); @@ -231,7 +231,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr } static inline int bitmap_equal(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! 
((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index d6bb955e71cb..0f2f845702eb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -86,9 +86,9 @@ int __bitmap_equal(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_equal); -void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) +void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) dst[k] = ~src[k]; -- cgit From 65b4ee62c9cd10640f0054f47fd84c7920e8c118 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:57 -0700 Subject: lib: bitmap: remove unnecessary mask from bitmap_complement Since the extra bits are "don't care", there is no reason to mask the last word to the used bits when complementing. This shaves off yet a few bytes. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 21fb52ffe444..f42d72d5fe82 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -225,7 +225,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); + *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index 0f2f845702eb..4387e3c092fd 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -93,7 +93,7 @@ void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned dst[k] = ~src[k]; if (bits % BITS_PER_LONG) - dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); + dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); -- cgit From 2f9305eb31097fdd3dc86daca65d8097d1fcf2ff Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:59 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_{and,or,xor,andnot} unsigned This change is only for consistency with the changes to the other bitmap_* functions; it doesn't change the size of the generated code: inside BITS_TO_LONGS there is a sizeof(long), which causes bits to be interpreted as unsigned anyway. 
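That conversion can be demonstrated in isolation; a small standalone sketch (assuming an LP64 target; this is illustrative code, not the kernel macro itself):

    #include <stdio.h>

    int main(void)
    {
            int bits = -64;

            /* plain signed arithmetic: (-64 + 63) / 64 == 0 */
            printf("%d\n", (bits + 63) / 64);

            /* with 8 * sizeof(long) as the divisor, the size_t operand
             * converts 'bits' to unsigned before the division, so the
             * result is a huge value (about 2^58), not 0 */
            printf("%zu\n", (bits + 8 * sizeof(long) - 1) / (8 * sizeof(long)));
            return 0;
    }

This is the same usual-arithmetic-conversion that BITS_TO_LONGS already triggers, which is why this particular patch cannot shrink the generated code any further.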
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 16 ++++++++-------- lib/bitmap.c | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index f42d72d5fe82..7048782fe5b9 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -99,13 +99,13 @@ extern void __bitmap_shift_right(unsigned long *dst, extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_subset(const unsigned long *bitmap1, @@ -188,7 +188,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2) != 0; @@ -196,7 +196,7 @@ static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -205,7 +205,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; @@ -214,7 +214,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, } static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2)) != 0; diff --git a/lib/bitmap.c b/lib/bitmap.c index 4387e3c092fd..03207373cc5a 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -182,10 +182,10 @@ void __bitmap_shift_left(unsigned long *dst, EXPORT_SYMBOL(__bitmap_shift_left); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); unsigned long result = 0; for (k = 0; k < nr; k++) @@ -195,10 +195,10 @@ int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int 
k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; @@ -206,10 +206,10 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; @@ -217,10 +217,10 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_xor); int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); unsigned long result = 0; for (k = 0; k < nr; k++) -- cgit From 6dfe9799c2a03d225316a3e959b0447f3f50303e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:01 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_intersects unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7048782fe5b9..2f3f3a4d5996 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -107,7 +107,7 @@ extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); @@ -240,7 +240,7 @@ static inline int bitmap_equal(const unsigned long *src1, } static inline int bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; diff --git a/lib/bitmap.c b/lib/bitmap.c index 03207373cc5a..e85daa90b237 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -230,9 +230,9 @@ int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_andnot); int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return 1; -- cgit From 5be20213e855550de2b32fde6fc116f74bab86a6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:03 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_subset unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. 
Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2f3f3a4d5996..87e88f79def1 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -109,7 +109,7 @@ extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); extern void bitmap_set(unsigned long *map, int i, int len); @@ -249,7 +249,7 @@ static inline int bitmap_intersects(const unsigned long *src1, } static inline int bitmap_subset(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index e85daa90b237..c9bff5379795 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -245,9 +245,9 @@ int __bitmap_intersects(const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_intersects); int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return 0; -- cgit From 877d9f3b63ac2e5dbc51cbcdff156433f03b3a32 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:05 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_weight unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. I didn't change the return type, since that might change the semantics of some expression containing a call to bitmap_weight(). Certainly an int is capable of holding the result. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 87e88f79def1..64b0ebe9f9a8 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -110,7 +110,7 @@ extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_weight(const unsigned long *bitmap, int bits); +extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); extern void bitmap_set(unsigned long *map, int i, int len); extern void bitmap_clear(unsigned long *map, int start, int nr); @@ -273,7 +273,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits) return __bitmap_full(src, nbits); } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index c9bff5379795..f69435c23f9c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -259,9 +259,10 @@ int __bitmap_subset(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_subset); -int __bitmap_weight(const unsigned long *bitmap, int bits) +int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - int k, w = 0, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; + int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); -- cgit From fb5ac54263ef3fcb5c469a61e0ab6b06e45e2307 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:07 -0700 Subject: lib: bitmap: make the start index of bitmap_set unsigned The compiler can generate slightly smaller and simpler code when it knows that "start" is non-negative. Also, use the names "start" and "len" for the two parameters in both header file and implementation, instead of the previous mix. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 64b0ebe9f9a8..ad2c67d3583e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -112,7 +112,7 @@ extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void bitmap_set(unsigned long *map, int i, int len); +extern void bitmap_set(unsigned long *map, unsigned int start, int len); extern void bitmap_clear(unsigned long *map, int start, int nr); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, diff --git a/lib/bitmap.c b/lib/bitmap.c index f69435c23f9c..2a3a92fc3355 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -274,21 +274,21 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, int start, int nr) +void bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_set >= 0) { + while (len - bits_to_set >= 0) { *p |= mask_to_set; - nr -= bits_to_set; + len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } - if (nr) { + if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } -- cgit From 154f5e38f30f262025c8c2e825376f6eb51e8bcb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:10 -0700 Subject: lib: bitmap: make the start index of bitmap_clear unsigned The compiler can generate slightly smaller and simpler code when it knows that "start" is non-negative. Also, use the names "start" and "len" for the two parameters for consistency with bitmap_set. 
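For orientation, a short usage sketch of the (start, len) convention that bitmap_set() and bitmap_clear() now share (hypothetical kernel-context snippet; the calls match the signatures above):

    #include <linux/bitmap.h>

    static DECLARE_BITMAP(map, 64);

    static void example(void)
    {
            bitmap_zero(map, 64);
            bitmap_set(map, 5, 8);          /* sets bits 5..12 */
            bitmap_clear(map, 8, 2);        /* clears bits 8..9 again */
            /* bitmap_weight(map, 64) would now return 6 */
    }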
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ad2c67d3583e..83c1c7d25073 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -113,7 +113,7 @@ extern int __bitmap_subset(const unsigned long *bitmap1, extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); extern void bitmap_set(unsigned long *map, unsigned int start, int len); -extern void bitmap_clear(unsigned long *map, int start, int nr); +extern void bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, diff --git a/lib/bitmap.c b/lib/bitmap.c index 2a3a92fc3355..5d2540396300 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -295,21 +295,21 @@ void bitmap_set(unsigned long *map, unsigned int start, int len) } EXPORT_SYMBOL(bitmap_set); -void bitmap_clear(unsigned long *map, int start, int nr) +void bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_clear >= 0) { + while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; - nr -= bits_to_clear; + len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } - if (nr) { + if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } -- cgit From bc5be1828065681300f3727ab55fac0d7ef37af3 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:12 -0700 Subject: lib: bitmap: simplify bitmap_parselist We want len to be the index of the first '\n', or the length of the string if there is no newline. This is a good example of the usefulness of strchrnul(). Use that instead, thus eliminating a branch and a call to strlen(). Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/bitmap.c b/lib/bitmap.c index 5d2540396300..d4b3a6d4b2d7 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -665,13 +665,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) { - char *nl = strchr(bp, '\n'); - int len; - - if (nl) - len = nl - bp; - else - len = strlen(bp); + char *nl = strchrnul(bp, '\n'); + int len = nl - bp; return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); } -- cgit From a855174878f06d197ddfde5cbe1e69a1d4a2df27 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:14 -0700 Subject: lib: bitmap: fix typo in kerneldoc for bitmap_pos_to_ord A few lines above, it was stated that positions for non-set bits are mapped to -1, which is obviously also what the code does. 
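Since bitmap_pos_to_ord() is static to lib/bitmap.c, the mapping is easiest to see written out; the values below are taken from the kerneldoc's own example, expressed as hypothetical calls:

    unsigned long buf = 0xf0;       /* only bits 4..7 set */

    bitmap_pos_to_ord(&buf, 4, 64); /* returns 0: bit 4 is the 0th set bit */
    bitmap_pos_to_ord(&buf, 7, 64); /* returns 3: bit 7 is the 3rd set bit */
    bitmap_pos_to_ord(&buf, 3, 64); /* returns -1: bit 3 is not set */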
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/bitmap.c b/lib/bitmap.c index d4b3a6d4b2d7..2714df9f5cdb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -712,7 +712,7 @@ EXPORT_SYMBOL(bitmap_parselist_user); * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, - * and other @pos values will get mapped to 0. When @pos value 7 + * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * -- cgit From 9279d3286e10736766edcaf815ae10e00856e448 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:16 -0700 Subject: lib: bitmap: change parameter of bitmap_*_region to unsigned Changing the pos parameter of __reg_op to unsigned allows the compiler to generate slightly smaller and simpler code. Also update its callers bitmap_*_region to receive and pass unsigned int. The return types of bitmap_find_free_region and bitmap_allocate_region are still int to allow a negative error code to be returned. An int is certainly capable of representing any realistic return value. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 6 +++--- lib/bitmap.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 83c1c7d25073..210037833356 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -140,9 +140,9 @@ extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); -extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); +extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); diff --git a/lib/bitmap.c b/lib/bitmap.c index 2714df9f5cdb..c2f3807b3601 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1042,7 +1042,7 @@ enum { REG_OP_RELEASE, /* clear all bits in region */ }; -static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op) +static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op) { int nbits_reg; /* number of bits in region */ int index; /* index first long of region in bitmap */ @@ -1108,11 +1108,11 @@ done: * Return the bit offset in bitmap of the allocated region, * or -errno on failure. 
*/ -int bitmap_find_free_region(unsigned long *bitmap, int bits, int order) +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { - int pos, end; /* scans bitmap by regions of size order */ + unsigned int pos, end; /* scans bitmap by regions of size order */ - for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { + for (pos = 0 ; (end = pos + (1U << order)) <= bits; pos = end) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) continue; __reg_op(bitmap, pos, order, REG_OP_ALLOC); @@ -1133,7 +1133,7 @@ EXPORT_SYMBOL(bitmap_find_free_region); * * No return value. */ -void bitmap_release_region(unsigned long *bitmap, int pos, int order) +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { __reg_op(bitmap, pos, order, REG_OP_RELEASE); } @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(bitmap_release_region); * Return 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ -int bitmap_allocate_region(unsigned long *bitmap, int pos, int order) +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; -- cgit From 2ac521d3325a24f01c1f1e86c74537b831d282c5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:18 -0700 Subject: lib: bitmap: micro-optimize bitmap_allocate_region __reg_op(..., REG_OP_ALLOC) always returns 0, so we might as well use that and save an instruction. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/bitmap.c b/lib/bitmap.c index c2f3807b3601..faaf7206d4cf 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1154,8 +1154,7 @@ int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; - __reg_op(bitmap, pos, order, REG_OP_ALLOC); - return 0; + return __reg_op(bitmap, pos, order, REG_OP_ALLOC); } EXPORT_SYMBOL(bitmap_allocate_region); -- cgit From 7e5f97d1927f41affa21aa5b321865ceab1994ce Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:22 -0700 Subject: lib: bitmap: add missing mask in bitmap_and Apparently, bitmap_and is supposed to return whether the new bitmap is empty. But it didn't take potential garbage bits in the last word into account. 
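A standalone model of the failure mode (userspace sketch, not the kernel code; the mask value matches BITMAP_LAST_WORD_MASK(4)):

    #include <stdio.h>

    int main(void)
    {
            /* nbits = 4: only bits 0..3 are meaningful, bit 4 is garbage */
            unsigned long a = 0x10, b = 0x10;
            unsigned long mask = (1UL << 4) - 1;

            printf("unmasked: %d\n", (a & b) != 0);         /* 1 -- claims non-empty */
            printf("masked:   %d\n", (a & b & mask) != 0);  /* 0 -- correct */
            return 0;
    }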
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 75df61d9ecfb..3399a9ecd991 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -191,7 +191,7 @@ static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & *src2) != 0; + return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index faaf7206d4cf..ce2ec80bf431 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -185,11 +185,14 @@ int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; - unsigned int nr = BITS_TO_LONGS(bits); + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); -- cgit From 74e765319084bd2940a9612ada961f0f7385936c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:24 -0700 Subject: lib: bitmap: add missing mask in bitmap_andnot Apparently, bitmap_andnot is supposed to return whether the new bitmap is empty. But it didn't take potential garbage bits in the last word into account. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3399a9ecd991..e1c8d080c427 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -217,7 +217,7 @@ static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & ~(*src2)) != 0; + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index ce2ec80bf431..1e031f2c9aba 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -223,11 +223,14 @@ int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; - unsigned int nr = BITS_TO_LONGS(bits); + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); -- cgit From 93b7aca35dd7bf0c3ba7ea0542b556bcfdb28e76 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 8 Aug 2014 14:22:07 -0700 Subject: lib/idr.c: fix out-of-bounds pointer dereference I'm working on address sanitizer project for kernel. Recently we started experiments with stack instrumentation, to detect out-of-bounds read/write bugs on stack. 
Just after booting I've hit an out-of-bounds read on the stack in idr_for_each (and in __idr_remove_all as well):

    struct idr_layer **paa = &pa[0];

    while (id >= 0 && id <= max) {
            ...
            while (n < fls(id)) {
                    n += IDR_BITS;
                    p = *--paa;     <--- here we are reading the pa[-1] value
            }
    }

Despite the fact that we exit the loop right after this dereference and never use p, such behaviour is undefined and should be avoided. Fix this by moving the pointer dereference to the beginning of the loop, right before we use it. Signed-off-by: Andrey Ryabinin Reviewed-by: Lai Jiangshan Cc: Tejun Heo Cc: Alexey Preobrazhensky Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 39158abebad1..50be3fa9b657 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -590,26 +590,27 @@ static void __idr_remove_all(struct idr *idp) struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = idp->top; + *paa = idp->top; RCU_INIT_POINTER(idp->top, NULL); max = idr_max(idp->layers); id = 0; while (id >= 0 && id <= max) { + p = *paa; while (n > IDR_BITS && p) { n -= IDR_BITS; - *paa++ = p; p = p->ary[(id >> n) & IDR_MASK]; + *++paa = p; } bt_mask = id; id += 1 << n; /* Get the highest bit that the above add changed from 0->1. */ while (n < fls(id ^ bt_mask)) { - if (p) - free_layer(idp, p); + if (*paa) + free_layer(idp, *paa); n += IDR_BITS; - p = *--paa; + --paa; } } idp->layers = 0; @@ -692,15 +693,16 @@ int idr_for_each(struct idr *idp, struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = rcu_dereference_raw(idp->top); + *paa = rcu_dereference_raw(idp->top); max = idr_max(idp->layers); id = 0; while (id >= 0 && id <= max) { + p = *paa; while (n > 0 && p) { n -= IDR_BITS; - *paa++ = p; p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); + *++paa = p; } if (p) { @@ -712,7 +714,7 @@ int idr_for_each(struct idr *idp, id += 1 << n; while (n < fls(id)) { n += IDR_BITS; - p = *--paa; + --paa; } } @@ -740,17 +742,18 @@ void *idr_get_next(struct idr *idp, int *nextidp) int n, max; /* find first ent */ - p = rcu_dereference_raw(idp->top); + p = *paa = rcu_dereference_raw(idp->top); if (!p) return NULL; n = (p->layer + 1) * IDR_BITS; max = idr_max(p->layer + 1); while (id >= 0 && id <= max) { + p = *paa; while (n > 0 && p) { n -= IDR_BITS; - *paa++ = p; p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); + *++paa = p; } if (p) { @@ -768,7 +771,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) id = round_up(id + 1, 1 << n); while (n < fls(id)) { n += IDR_BITS; - p = *--paa; + --paa; } } return NULL; -- cgit From 1b9c53e849aa65776d4f611d99aa09f856518dad Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 8 Aug 2014 14:22:14 -0700 Subject: lib/rbtree.c: fix typo in comment of __rb_insert() In case 1, it passes down the BLACK color from G to p and u, and maintains the color of n. By doing so, it maintains the black height of the sub-tree. The comment, however, marks the color of n as BLACK. This is a typo and is not consistent with the code. This patch fixes the typo in the comment.
Signed-off-by: Wei Yang Acked-by: Michel Lespinasse Cc: Xiao Guangrong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index 65f4effd117f..c16c81a3d430 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -101,7 +101,7 @@ __rb_insert(struct rb_node *node, struct rb_root *root, * / \ / \ * p u --> P U * / / - * n N + * n n * * However, since g's parent might be red, and * 4) does not allow this, we need to recurse -- cgit From 89b3ac63013e64621369f619fe732b629879c671 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Fri, 8 Aug 2014 14:22:44 -0700 Subject: kfifo: use BUG_ON Use BUG_ON(x) rather than if(x) BUG(); The semantic patch that fixes this problem is as follows:

    //
    @@ identifier x; @@
    -if (!x) BUG();
    +BUG_ON(!x);
    //

Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kfifo.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/kfifo.c b/lib/kfifo.c index d79b9d222065..90ba1eb1df06 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -561,8 +561,7 @@ EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (!nents) - BUG(); + BUG_ON(!nents); len = __kfifo_max_r(len, recsize); @@ -585,8 +584,7 @@ EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (!nents) - BUG(); + BUG_ON(!nents); len = __kfifo_max_r(len, recsize); -- cgit From 4d4b866aee039d609c0b40e7e5b27204607ce614 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 Aug 2014 14:23:10 -0700 Subject: initrd: fix lz4 decompress with initrd While testing initrd (>2G) support, I found that decompress/lz4 does not work with initrd at all. decompress_* should support:

1. inbuf[]/outbuf[] for kernel preboot.
2. inbuf[]/flush() for initramfs.
3. fill()/flush() for initrd.

unlz4 does not handle case 3: the input len is passed as 0, so it fails on the first try. Fix that by adding one extra if (fill) check, and bailing out on EOF from fill().
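To sketch what case 3 looks like from the decompressor's side, here is a toy fill() callback over an in-memory image (a hypothetical helper using the int-based signature this commit works with; a later patch in this series widens it to long). It returns the number of bytes delivered, which is 0 at EOF and may be short on the final chunk -- exactly the cases unlz4() now checks:

    #include <string.h>

    /* image[] begins with the LZ4 legacy magic 0x184C2102 in
     * little-endian byte order, matching ARCHIVE_MAGICNUMBER above */
    static const unsigned char image[] = { 0x02, 0x21, 0x4c, 0x18 /* , ... */ };
    static unsigned int image_pos;

    static int toy_fill(void *buf, unsigned int len)
    {
            unsigned int left = sizeof(image) - image_pos;

            if (len > left)
                    len = left;     /* short read near the end of the image */
            memcpy(buf, image + image_pos, len);
            image_pos += len;
            return len;     /* 0 once the image is drained, i.e. EOF */
    }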
Signed-off-by: Yinghai Lu Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress_unlz4.c | 65 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c index 7d1e83caf8ad..3ad7f3954dfd 100644 --- a/lib/decompress_unlz4.c +++ b/lib/decompress_unlz4.c @@ -83,13 +83,20 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, if (posp) *posp = 0; - if (fill) - fill(inp, 4); + if (fill) { + size = fill(inp, 4); + if (size < 4) { + error("data corrupted"); + goto exit_2; + } + } chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; + if (!fill) { + inp += 4; + size -= 4; + } } else { error("invalid header"); goto exit_2; @@ -100,29 +107,44 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, for (;;) { - if (fill) - fill(inp, 4); + if (fill) { + size = fill(inp, 4); + if (size == 0) + break; + if (size < 4) { + error("data corrupted"); + goto exit_2; + } + } chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; + if (!fill) { + inp += 4; + size -= 4; + } if (posp) *posp += 4; continue; } - inp += 4; - size -= 4; + if (posp) *posp += 4; - if (fill) { + if (!fill) { + inp += 4; + size -= 4; + } else { if (chunksize > lz4_compressbound(uncomp_chunksize)) { error("chunk length is longer than allocated"); goto exit_2; } - fill(inp, chunksize); + size = fill(inp, chunksize); + if (size < chunksize) { + error("data corrupted"); + goto exit_2; + } } #ifdef PREBOOT if (out_len >= uncomp_chunksize) { @@ -149,18 +171,17 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, if (posp) *posp += chunksize; - size -= chunksize; + if (!fill) { + size -= chunksize; - if (size == 0) - break; - else if (size < 0) { - error("data corrupted"); - goto exit_2; + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + inp += chunksize; } - - inp += chunksize; - if (fill) - inp = inp_start; } ret = 0; -- cgit From d97b07c54f34e88352ebe676beb798c8f59ac588 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 Aug 2014 14:23:14 -0700 Subject: initramfs: support initramfs that is bigger than 2GiB Now, with a 64bit bzImage and kexec tools, we support ramdisks bigger than 2G, as we can put them above 4G. I found that a compressed initramfs image could not be decompressed properly. It turns out that the image length is an int during decompress detection, and it becomes < 0 when the length is more than 2G. Furthermore, during decompression, len is used as an int for the inbuf count, which is a problem too. Change len to long; that should be OK, as long is 32 bits on 32-bit platforms. Tested with the following compressed initramfs images as root with kexec: gzip, bzip2, xz, lzma, lzop, lz4.

run time for populate_rootfs():

          size  name            Nehalem-EX  Westmere-EX  Ivybridge-EX
    9034400256  root_img :         26s          24s          30s
    3561095057  root_img.lz4 :     28s          27s          27s
    3459554629  root_img.lzo :     29s          29s          28s
    3219399480  root_img.gz :      64s          62s          49s
    2251594592  root_img.xz :     262s         260s         183s
    2226366598  root_img.lzma:    386s         376s         277s
    2901482513  root_img.bz2 :    635s         599s

Signed-off-by: Yinghai Lu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rashika Kheria Cc: Josh Triplett Cc: Kyungsik Lee Cc: P J P Cc: Al Viro Cc: Tetsuo Handa Cc: "Daniel M.
Weeks" Cc: Alexandre Courbot Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- crypto/zlib.c | 8 ++++---- fs/isofs/compress.c | 4 ++-- fs/jffs2/compr_zlib.c | 7 ++++--- include/linux/decompress/bunzip2.h | 8 ++++---- include/linux/decompress/generic.h | 10 +++++----- include/linux/decompress/inflate.h | 8 ++++---- include/linux/decompress/unlz4.h | 8 ++++---- include/linux/decompress/unlzma.h | 8 ++++---- include/linux/decompress/unlzo.h | 8 ++++---- include/linux/decompress/unxz.h | 8 ++++---- include/linux/zlib.h | 4 ++-- init/do_mounts_rd.c | 10 +++++----- init/initramfs.c | 22 +++++++++++----------- lib/decompress.c | 2 +- lib/decompress_bunzip2.c | 26 +++++++++++++------------- lib/decompress_inflate.c | 12 ++++++------ lib/decompress_unlz4.c | 18 +++++++++--------- lib/decompress_unlzma.c | 28 ++++++++++++++-------------- lib/decompress_unlzo.c | 12 ++++++------ lib/decompress_unxz.c | 10 +++++----- 20 files changed, 111 insertions(+), 110 deletions(-) (limited to 'lib') diff --git a/crypto/zlib.c b/crypto/zlib.c index 06b62e5cdcc7..c9ee681d57fd 100644 --- a/crypto/zlib.c +++ b/crypto/zlib.c @@ -168,7 +168,7 @@ static int zlib_compress_update(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -198,7 +198,7 @@ static int zlib_compress_final(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -283,7 +283,7 @@ static int zlib_decompress_update(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -331,7 +331,7 @@ static int zlib_decompress_final(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 592e5115a561..f311bf084015 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, "zisofs: zisofs_inflate returned" " %d, inode = %lu," " page idx = %d, bh idx = %d," - " avail_in = %d," - " avail_out = %d\n", + " avail_in = %ld," + " avail_out = %ld\n", zerr, inode->i_ino, curpage, curbh, stream.avail_in, stream.avail_out); diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 0b9a1e44e833..5698dae5d92d 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in, while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); - def_strm.avail_in = 
min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); - jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in = min_t(unsigned long, + (*sourcelen-def_strm.total_in), def_strm.avail_out); + jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n", def_strm.avail_in, def_strm.avail_out); ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); - jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n", def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out); if (ret != Z_OK) { diff --git a/include/linux/decompress/bunzip2.h b/include/linux/decompress/bunzip2.h index 115272137a9c..4d683df898e6 100644 --- a/include/linux/decompress/bunzip2.h +++ b/include/linux/decompress/bunzip2.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_BUNZIP2_H #define DECOMPRESS_BUNZIP2_H -int bunzip2(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int bunzip2(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h index 0c7111a55a1a..1fcfd64b5076 100644 --- a/include/linux/decompress/generic.h +++ b/include/linux/decompress/generic.h @@ -1,11 +1,11 @@ #ifndef DECOMPRESS_GENERIC_H #define DECOMPRESS_GENERIC_H -typedef int (*decompress_fn) (unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +typedef int (*decompress_fn) (unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *posp, + long *posp, void(*error)(char *x)); /* inbuf - input buffer @@ -33,7 +33,7 @@ typedef int (*decompress_fn) (unsigned char *inbuf, int len, /* Utility routine to detect the decompression method */ -decompress_fn decompress_method(const unsigned char *inbuf, int len, +decompress_fn decompress_method(const unsigned char *inbuf, long len, const char **name); #endif diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h index 1d0aedef9822..e4f411fdbd24 100644 --- a/include/linux/decompress/inflate.h +++ b/include/linux/decompress/inflate.h @@ -1,10 +1,10 @@ #ifndef LINUX_DECOMPRESS_INFLATE_H #define LINUX_DECOMPRESS_INFLATE_H -int gunzip(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int gunzip(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error_fn)(char *x)); #endif diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h index d5b68bf3ec92..3273c2f36496 100644 --- a/include/linux/decompress/unlz4.h +++ b/include/linux/decompress/unlz4.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_UNLZ4_H #define DECOMPRESS_UNLZ4_H -int unlz4(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlz4(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/unlzma.h b/include/linux/decompress/unlzma.h index 7796538f1bf4..8a891a193840 
100644 --- a/include/linux/decompress/unlzma.h +++ b/include/linux/decompress/unlzma.h @@ -1,11 +1,11 @@ #ifndef DECOMPRESS_UNLZMA_H #define DECOMPRESS_UNLZMA_H -int unlzma(unsigned char *, int, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlzma(unsigned char *, long, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ); diff --git a/include/linux/decompress/unlzo.h b/include/linux/decompress/unlzo.h index 987229752519..af18f95d6570 100644 --- a/include/linux/decompress/unlzo.h +++ b/include/linux/decompress/unlzo.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_UNLZO_H #define DECOMPRESS_UNLZO_H -int unlzo(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlzo(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h index 41728fc6c8a1..f764e2a7201e 100644 --- a/include/linux/decompress/unxz.h +++ b/include/linux/decompress/unxz.h @@ -10,10 +10,10 @@ #ifndef DECOMPRESS_UNXZ_H #define DECOMPRESS_UNXZ_H -int unxz(unsigned char *in, int in_size, - int (*fill)(void *dest, unsigned int size), - int (*flush)(void *src, unsigned int size), - unsigned char *out, int *in_used, +int unxz(unsigned char *in, long in_size, + long (*fill)(void *dest, unsigned long size), + long (*flush)(void *src, unsigned long size), + unsigned char *out, long *in_used, void (*error)(char *x)); #endif diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 197abb2a54c5..92dbbd3f6c75 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -83,11 +83,11 @@ struct internal_state; typedef struct z_stream_s { const Byte *next_in; /* next input byte */ - uInt avail_in; /* number of bytes available at next_in */ + uLong avail_in; /* number of bytes available at next_in */ uLong total_in; /* total nb of input bytes read so far */ Byte *next_out; /* next output byte should be put there */ - uInt avail_out; /* remaining free space at next_out */ + uLong avail_out; /* remaining free space at next_out */ uLong total_out; /* total nb of bytes output so far */ char *msg; /* last error message, NULL if no error */ diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index a8227022e3a0..e5d059e8aa11 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -311,9 +311,9 @@ static int exit_code; static int decompress_error; static int crd_infd, crd_outfd; -static int __init compr_fill(void *buf, unsigned int len) +static long __init compr_fill(void *buf, unsigned long len) { - int r = sys_read(crd_infd, buf, len); + long r = sys_read(crd_infd, buf, len); if (r < 0) printk(KERN_ERR "RAMDISK: error while reading compressed data"); else if (r == 0) @@ -321,13 +321,13 @@ static int __init compr_fill(void *buf, unsigned int len) return r; } -static int __init compr_flush(void *window, unsigned int outcnt) +static long __init compr_flush(void *window, unsigned long outcnt) { - int written = sys_write(crd_outfd, window, outcnt); + long written = sys_write(crd_outfd, window, outcnt); if (written != outcnt) { if (decompress_error == 0) printk(KERN_ERR - "RAMDISK: incomplete write (%d != %d)\n", + "RAMDISK: incomplete write (%ld != %ld)\n", written, outcnt); decompress_error = 1; return -1; diff --git a/init/initramfs.c b/init/initramfs.c index 
4f276b6a167b..a7566031242e 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -197,7 +197,7 @@ static __initdata enum state { } state, next_state; static __initdata char *victim; -static __initdata unsigned count; +static unsigned long count __initdata; static __initdata loff_t this_header, next_header; static inline void __init eat(unsigned n) @@ -209,7 +209,7 @@ static inline void __init eat(unsigned n) static __initdata char *vcollected; static __initdata char *collected; -static __initdata int remains; +static long remains __initdata; static __initdata char *collect; static void __init read_into(char *buf, unsigned size, enum state next) @@ -236,7 +236,7 @@ static int __init do_start(void) static int __init do_collect(void) { - unsigned n = remains; + unsigned long n = remains; if (count < n) n = count; memcpy(collect, victim, n); @@ -407,7 +407,7 @@ static __initdata int (*actions[])(void) = { [Reset] = do_reset, }; -static int __init write_buffer(char *buf, unsigned len) +static long __init write_buffer(char *buf, unsigned long len) { count = len; victim = buf; @@ -417,11 +417,11 @@ static int __init write_buffer(char *buf, unsigned len) return len - count; } -static int __init flush_buffer(void *bufv, unsigned len) +static long __init flush_buffer(void *bufv, unsigned long len) { char *buf = (char *) bufv; - int written; - int origLen = len; + long written; + long origLen = len; if (message) return -1; while ((written = write_buffer(buf, len)) < len && !message) { @@ -440,13 +440,13 @@ static int __init flush_buffer(void *bufv, unsigned len) return origLen; } -static unsigned my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ #include -static char * __init unpack_to_rootfs(char *buf, unsigned len) +static char * __init unpack_to_rootfs(char *buf, unsigned long len) { - int written, res; + long written; decompress_fn decompress; const char *compress_name; static __initdata char msg_buf[64]; @@ -480,7 +480,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len) decompress = decompress_method(buf, len, &compress_name); pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { - res = decompress(buf, len, NULL, flush_buffer, NULL, + int res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); if (res) error("decompressor failed"); diff --git a/lib/decompress.c b/lib/decompress.c index 86069d74c062..37f3c786348f 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -54,7 +54,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0, 0}, NULL, NULL } }; -decompress_fn __init decompress_method(const unsigned char *inbuf, int len, +decompress_fn __init decompress_method(const unsigned char *inbuf, long len, const char **name) { const struct compress_format *cf; diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 31c5f7675fbf..8290e0bef7ea 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -92,8 +92,8 @@ struct bunzip_data { /* State for interrupting output loop */ int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent; /* I/O tracking data (file handles, buffers, positions, etc.) 
*/ - int (*fill)(void*, unsigned int); - int inbufCount, inbufPos /*, outbufPos*/; + long (*fill)(void*, unsigned long); + long inbufCount, inbufPos /*, outbufPos*/; unsigned char *inbuf /*,*outbuf*/; unsigned int inbufBitCount, inbufBits; /* The CRC values stored in the block header and calculated from the @@ -617,7 +617,7 @@ decode_next_byte: goto decode_next_byte; } -static int INIT nofill(void *buf, unsigned int len) +static long INIT nofill(void *buf, unsigned long len) { return -1; } @@ -625,8 +625,8 @@ static int INIT nofill(void *buf, unsigned int len) /* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are ignored, and data is read from file handle into temporary buffer. */ -static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, - int (*fill)(void*, unsigned int)) +static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, long len, + long (*fill)(void*, unsigned long)) { struct bunzip_data *bd; unsigned int i, j, c; @@ -675,11 +675,11 @@ static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, /* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data, not end of file.) */ -STATIC int INIT bunzip2(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT bunzip2(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *pos, + long *pos, void(*error)(char *x)) { struct bunzip_data *bd; @@ -743,11 +743,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *pos, + long *pos, void(*error)(char *x)) { return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error); diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 0edfd742a154..d4c7891635ec 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -27,17 +27,17 @@ #define GZIP_IOBUF_SIZE (16*1024) -static int INIT nofill(void *buffer, unsigned int len) +static long INIT nofill(void *buffer, unsigned long len) { return -1; } /* Included from initramfs et al code */ -STATIC int INIT gunzip(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT gunzip(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *out_buf, - int *pos, + long *pos, void(*error)(char *x)) { u8 *zbuf; struct z_stream_s *strm; @@ -142,7 +142,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, /* Write any data generated */ if (flush && strm->next_out > out_buf) { - int l = strm->next_out - out_buf; + long l = strm->next_out - out_buf; if (l != flush(out_buf, l)) { rc = -1; error("write error"); diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c index 3ad7f3954dfd..40f66ebe57b7 100644 --- a/lib/decompress_unlz4.c +++ b/lib/decompress_unlz4.c @@ -31,10 +31,10 @@ #define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) #define ARCHIVE_MAGICNUMBER 0x184C2102 -STATIC inline int INIT unlz4(u8 *input, int in_len, - int (*fill) (void *, unsigned int), - int (*flush) (void *, unsigned int), - u8 *output, int *posp, +STATIC inline int INIT 
unlz4(u8 *input, long in_len, + long (*fill)(void *, unsigned long), + long (*flush)(void *, unsigned long), + u8 *output, long *posp, void (*error) (char *x)) { int ret = -1; @@ -43,7 +43,7 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, u8 *inp; u8 *inp_start; u8 *outp; - int size = in_len; + long size = in_len; #ifdef PREBOOT size_t out_len = get_unaligned_le32(input + in_len); #endif @@ -196,11 +196,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 32adb73a9038..0be83af62b88 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -65,11 +65,11 @@ static long long INIT read_int(unsigned char *ptr, int size) #define LZMA_IOBUF_SIZE 0x10000 struct rc { - int (*fill)(void*, unsigned int); + long (*fill)(void*, unsigned long); uint8_t *ptr; uint8_t *buffer; uint8_t *buffer_end; - int buffer_size; + long buffer_size; uint32_t code; uint32_t range; uint32_t bound; @@ -82,7 +82,7 @@ struct rc { #define RC_MODEL_TOTAL_BITS 11 -static int INIT nofill(void *buffer, unsigned int len) +static long INIT nofill(void *buffer, unsigned long len) { return -1; } @@ -99,8 +99,8 @@ static void INIT rc_read(struct rc *rc) /* Called once */ static inline void INIT rc_init(struct rc *rc, - int (*fill)(void*, unsigned int), - char *buffer, int buffer_size) + long (*fill)(void*, unsigned long), + char *buffer, long buffer_size) { if (fill) rc->fill = fill; @@ -280,7 +280,7 @@ struct writer { size_t buffer_pos; int bufsize; size_t global_pos; - int(*flush)(void*, unsigned int); + long (*flush)(void*, unsigned long); struct lzma_header *header; }; @@ -534,11 +534,11 @@ static inline int INIT process_bit1(struct writer *wr, struct rc *rc, -STATIC inline int INIT unlzma(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC inline int INIT unlzma(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { @@ -667,11 +667,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index 960183d4258f..b94a31bdd87d 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -51,7 +51,7 @@ static const unsigned char lzop_magic[] = { #define HEADER_SIZE_MIN (9 + 7 + 4 + 8 + 1 + 4) #define HEADER_SIZE_MAX (9 + 7 + 1 + 8 + 8 + 4 + 1 + 255 + 4) -STATIC inline int INIT parse_header(u8 *input, int *skip, int in_len) +STATIC inline long INIT parse_header(u8 *input, long *skip, long in_len) { int l; u8 *parse = input; @@ -108,14 +108,14 @@ STATIC inline int INIT parse_header(u8 *input, int *skip, int in_len) return 1; } -STATIC inline int INIT unlzo(u8 *input, int in_len, - int (*fill) (void *, unsigned int), - int (*flush) (void *, unsigned int), - u8 *output, 
int *posp, +STATIC int INIT unlzo(u8 *input, long in_len, + long (*fill)(void *, unsigned long), + long (*flush)(void *, unsigned long), + u8 *output, long *posp, void (*error) (char *x)) { u8 r = 0; - int skip = 0; + long skip = 0; u32 src_len, dst_len; size_t tmp; u8 *in_buf, *in_buf_save, *out_buf; diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 9f34eb56854d..b07a78340e9d 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -248,10 +248,10 @@ void *memmove(void *dest, const void *src, size_t size) * both input and output buffers are available as a single chunk, i.e. when * fill() and flush() won't be used. */ -STATIC int INIT unxz(unsigned char *in, int in_size, - int (*fill)(void *dest, unsigned int size), - int (*flush)(void *src, unsigned int size), - unsigned char *out, int *in_used, +STATIC int INIT unxz(unsigned char *in, long in_size, + long (*fill)(void *dest, unsigned long size), + long (*flush)(void *src, unsigned long size), + unsigned char *out, long *in_used, void (*error)(char *x)) { struct xz_buf b; @@ -329,7 +329,7 @@ STATIC int INIT unxz(unsigned char *in, int in_size, * returned by xz_dec_run(), but probably * it's not too bad. */ - if (flush(b.out, b.out_pos) != (int)b.out_pos) + if (flush(b.out, b.out_pos) != (long)b.out_pos) ret = XZ_BUF_ERROR; b.out_pos = 0; -- cgit From 308c09f17da4adc53935115dbeb5bce4f067d8f9 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 8 Aug 2014 14:23:25 -0700 Subject: lib/scatterlist: make ARCH_HAS_SG_CHAIN an actual Kconfig Rather than have architectures #define ARCH_HAS_SG_CHAIN in an architecture specific scatterlist.h, make it a proper Kconfig option and use that instead. At the same time, remove the header files that are now mostly useless and just include asm-generic/scatterlist.h. [sfr@canb.auug.org.au: powerpc files now need asm/dma.h] Signed-off-by: Laura Abbott Acked-by: Thomas Gleixner [x86] Acked-by: Benjamin Herrenschmidt [powerpc] Acked-by: Heiko Carstens Cc: Russell King Cc: Tony Luck Cc: Fenghua Yu Cc: Paul Mackerras Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: "James E.J.
Bottomley" Cc: Martin Schwidefsky Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm/include/asm/scatterlist.h | 12 ------------ arch/arm64/Kconfig | 1 + arch/ia64/Kconfig | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/ia64/include/asm/scatterlist.h | 7 ------- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/scatterlist.h | 17 ----------------- arch/powerpc/mm/dma-noncoherent.c | 1 + arch/powerpc/platforms/44x/warp.c | 1 + arch/powerpc/platforms/52xx/efika.c | 1 + arch/powerpc/platforms/amigaone/setup.c | 1 + arch/s390/Kconfig | 1 + arch/s390/include/asm/Kbuild | 1 + arch/s390/include/asm/scatterlist.h | 3 --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/sparc/include/asm/scatterlist.h | 8 -------- arch/um/include/asm/Kbuild | 1 + arch/x86/Kconfig | 1 + arch/x86/include/asm/Kbuild | 3 ++- arch/x86/include/asm/scatterlist.h | 8 -------- include/linux/scatterlist.h | 2 +- include/scsi/scsi.h | 2 +- lib/Kconfig | 7 +++++++ lib/scatterlist.c | 4 ++-- 28 files changed, 30 insertions(+), 60 deletions(-) delete mode 100644 arch/arm/include/asm/scatterlist.h delete mode 100644 arch/ia64/include/asm/scatterlist.h delete mode 100644 arch/powerpc/include/asm/scatterlist.h delete mode 100644 arch/s390/include/asm/scatterlist.h delete mode 100644 arch/sparc/include/asm/scatterlist.h delete mode 100644 arch/x86/include/asm/scatterlist.h (limited to 'lib') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d31c500653a2..8e9dbcbcf5af 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -83,6 +83,7 @@ config ARM . config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool config NEED_SG_DMA_LENGTH diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index f5a357601983..70cd84eb7fda 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += poll.h generic-y += preempt.h generic-y += resource.h generic-y += rwsem.h +generic-y += scatterlist.h generic-y += sections.h generic-y += segment.h generic-y += sembuf.h diff --git a/arch/arm/include/asm/scatterlist.h b/arch/arm/include/asm/scatterlist.h deleted file mode 100644 index cefdb8f898a1..000000000000 --- a/arch/arm/include/asm/scatterlist.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASMARM_SCATTERLIST_H -#define _ASMARM_SCATTERLIST_H - -#ifdef CONFIG_ARM_HAS_SG_CHAIN -#define ARCH_HAS_SG_CHAIN -#endif - -#include -#include -#include - -#endif /* _ASMARM_SCATTERLIST_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b0f9c9db9590..fd4e81a4e1ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,6 +1,7 @@ config ARM64 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 44a6915ab13d..c84c88bbbbd7 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -28,6 +28,7 @@ config IA64 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SG_CHAIN select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 0da4aa2602ae..e8317d2d6c8d 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -5,5 +5,6 @@ generic-y += hash.h generic-y += 
kvm_para.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/ia64/include/asm/scatterlist.h b/arch/ia64/include/asm/scatterlist.h deleted file mode 100644 index 08fd93bff1db..000000000000 --- a/arch/ia64/include/asm/scatterlist.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_SCATTERLIST_H -#define _ASM_IA64_SCATTERLIST_H - -#include -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_IA64_SCATTERLIST_H */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 80b94b0add1f..4bc7b62fb4b6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -111,6 +111,7 @@ config PPC select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK + select ARCH_HAS_SG_CHAIN select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 3fb1bc432f4f..7f23f162ce9c 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -4,5 +4,6 @@ generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += rwsem.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/powerpc/include/asm/scatterlist.h b/arch/powerpc/include/asm/scatterlist.h deleted file mode 100644 index de1f620bd5c9..000000000000 --- a/arch/powerpc/include/asm/scatterlist.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _ASM_POWERPC_SCATTERLIST_H -#define _ASM_POWERPC_SCATTERLIST_H -/* - * Copyright (C) 2001 PPC64 Team, IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include -#include - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_POWERPC_SCATTERLIST_H */ diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 7b6c10750179..d85e86aac7fb 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -33,6 +33,7 @@ #include #include +#include #include "mmu_decl.h" diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index 534574a97ec9..3a104284b338 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -25,6 +25,7 @@ #include #include #include +#include static __initdata struct of_device_id warp_of_bus[] = { diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 6e19b0ad5d26..3feffde9128d 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 03aabc0e16ac..2fe12046279e 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -24,6 +24,7 @@ #include #include #include +#include extern void __flush_disable_L1(void); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8ca60f8d5683..05c78bb5f570 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -145,6 +145,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select ARCH_HAS_SG_CHAIN config SCHED_OMIT_FRAME_POINTER def_bool y diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 57892a8a9055..b3fea0722ff1 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -4,4 +4,5 @@ generic-y += clkdev.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/s390/include/asm/scatterlist.h b/arch/s390/include/asm/scatterlist.h deleted file mode 100644 index 6d45ef6c12a7..000000000000 --- a/arch/s390/include/asm/scatterlist.h +++ /dev/null @@ -1,3 +0,0 @@ -#include - -#define ARCH_HAS_SG_CHAIN diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 4692c90936f1..a537816613f9 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND + select ARCH_HAS_SG_CHAIN config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a45821818003..cdd1b447bb6c 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += mcs_spinlock.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h deleted file mode 100644 index 92bb638313f8..000000000000 --- a/arch/sparc/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _SPARC_SCATTERLIST_H -#define _SPARC_SCATTERLIST_H - -#include - -#define ARCH_HAS_SG_CHAIN - -#endif /* !(_SPARC_SCATTERLIST_H) */ diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index a5e4b6068213..7bd64aa2e94a 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -21,6 +21,7 @@ generic-y += param.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h +generic-y += 
scatterlist.h generic-y += sections.h generic-y += switch_to.h generic-y += topology.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bf2405053af5..c915cc6e40be 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -96,6 +96,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 3ca9762e1649..3bf000fab0ae 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,6 +5,7 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h -generic-y += early_ioremap.h generic-y += cputime.h +generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += scatterlist.h diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h deleted file mode 100644 index 4240878b9d76..000000000000 --- a/arch/x86/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H - -#include - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index f4ec8bbcb372..ed8f9e70df9b 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -136,7 +136,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN BUG(); #endif diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index e6df23cae7be..261e708010da 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -31,7 +31,7 @@ enum scsi_timeouts { * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. 
*/ -#ifdef ARCH_HAS_SG_CHAIN +#ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS diff --git a/lib/Kconfig b/lib/Kconfig index df872659ddd3..a5ce0c7f6c30 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -508,4 +508,11 @@ config UCS2_STRING source "lib/fonts/Kconfig" +# +# sg chaining option +# + +config ARCH_HAS_SG_CHAIN + def_bool n + endmenu diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b4415fceb7e7..9cdf62f8accd 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -73,7 +73,7 @@ EXPORT_SYMBOL(sg_nents); **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN struct scatterlist *ret = &sgl[nents - 1]; #else struct scatterlist *sg, *ret = NULL; @@ -255,7 +255,7 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, if (nents == 0) return -EINVAL; -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif -- cgit From 214e0aed639ef40987bf6159fad303171a6de31e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:55 -0700 Subject: locking/Documentation: Move locking related docs into Documentation/locking/ Specifically: Documentation/locking/lockdep-design.txt Documentation/locking/lockstat.txt Documentation/locking/mutex-design.txt Documentation/locking/rt-mutex-design.txt Documentation/locking/rt-mutex.txt Documentation/locking/spinlocks.txt Documentation/locking/ww-mutex-design.txt Signed-off-by: Davidlohr Bueso Acked-by: Randy Dunlap Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Cc: aswin@hp.com Cc: Alexei Starovoitov Cc: Al Viro Cc: Andrew Morton Cc: Chris Mason Cc: Dan Streetman Cc: David Airlie Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Low Cc: Josef Bacik Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Masanari Iida Cc: Paul E. 
McKenney Cc: Randy Dunlap Cc: Tim Chen Cc: Vineet Gupta Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1406752916-3341-6-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- Documentation/00-INDEX | 2 + Documentation/DocBook/kernel-locking.tmpl | 2 +- Documentation/lockdep-design.txt | 286 ----------- Documentation/locking/lockdep-design.txt | 286 +++++++++++ Documentation/locking/lockstat.txt | 178 +++++++ Documentation/locking/mutex-design.txt | 157 ++++++ Documentation/locking/rt-mutex-design.txt | 781 ++++++++++++++++++++++++++++++ Documentation/locking/rt-mutex.txt | 79 +++ Documentation/locking/spinlocks.txt | 167 +++++++ Documentation/locking/ww-mutex-design.txt | 344 +++++++++++++ Documentation/lockstat.txt | 178 ------- Documentation/mutex-design.txt | 157 ------ Documentation/rt-mutex-design.txt | 781 ------------------------------ Documentation/rt-mutex.txt | 79 --- Documentation/spinlocks.txt | 167 ------- Documentation/ww-mutex-design.txt | 344 ------------- MAINTAINERS | 4 +- drivers/gpu/drm/drm_modeset_lock.c | 2 +- include/linux/lockdep.h | 2 +- include/linux/mutex.h | 2 +- include/linux/rwsem.h | 2 +- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- lib/Kconfig.debug | 4 +- 24 files changed, 2005 insertions(+), 2003 deletions(-) delete mode 100644 Documentation/lockdep-design.txt create mode 100644 Documentation/locking/lockdep-design.txt create mode 100644 Documentation/locking/lockstat.txt create mode 100644 Documentation/locking/mutex-design.txt create mode 100644 Documentation/locking/rt-mutex-design.txt create mode 100644 Documentation/locking/rt-mutex.txt create mode 100644 Documentation/locking/spinlocks.txt create mode 100644 Documentation/locking/ww-mutex-design.txt delete mode 100644 Documentation/lockstat.txt delete mode 100644 Documentation/mutex-design.txt delete mode 100644 Documentation/rt-mutex-design.txt delete mode 100644 Documentation/rt-mutex.txt delete mode 100644 Documentation/spinlocks.txt delete mode 100644 Documentation/ww-mutex-design.txt (limited to 'lib') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 27e67a98b7be..1750fcef1ab4 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -287,6 +287,8 @@ local_ops.txt - semantics and behavior of local atomic operations. lockdep-design.txt - documentation on the runtime locking correctness validator. +locking/ + - directory with info about kernel locking primitives lockstat.txt - info on collecting statistics on locks (and contention). lockup-watchdogs.txt diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index e584ee12a1e7..7c9cc4846cb6 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1972,7 +1972,7 @@ machines due to caching. - Documentation/spinlocks.txt: + Documentation/locking/spinlocks.txt: Linus Torvalds' spinlocking tutorial in the kernel sources. diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt deleted file mode 100644 index 5dbc99c04f6e..000000000000 --- a/Documentation/lockdep-design.txt +++ /dev/null @@ -1,286 +0,0 @@ -Runtime locking correctness validator -===================================== - -started by Ingo Molnar -additions by Arjan van de Ven - -Lock-class ----------- - -The basic object the validator operates upon is a 'class' of locks. 
- -A class of locks is a group of locks that are logically the same with -respect to locking rules, even if the locks may have multiple (possibly -tens of thousands of) instantiations. For example a lock in the inode -struct is one class, while each inode has its own instantiation of that -lock class. - -The validator tracks the 'state' of lock-classes, and it tracks -dependencies between different lock-classes. The validator maintains a -rolling proof that the state and the dependencies are correct. - -Unlike an lock instantiation, the lock-class itself never goes away: when -a lock-class is used for the first time after bootup it gets registered, -and all subsequent uses of that lock-class will be attached to this -lock-class. - -State ------ - -The validator tracks lock-class usage history into 4n + 1 separate state bits: - -- 'ever held in STATE context' -- 'ever held as readlock in STATE context' -- 'ever held with STATE enabled' -- 'ever held as readlock with STATE enabled' - -Where STATE can be either one of (kernel/lockdep_states.h) - - hardirq - - softirq - - reclaim_fs - -- 'ever used' [ == !unused ] - -When locking rules are violated, these state bits are presented in the -locking error messages, inside curlies. A contrived example: - - modprobe/2287 is trying to acquire lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - but task is already holding lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - -The bit position indicates STATE, STATE-read, for each of the states listed -above, and the character displayed in each indicates: - - '.' acquired while irqs disabled and not in irq context - '-' acquired in irq context - '+' acquired with irqs enabled - '?' acquired in irq context with irqs enabled. - -Unused mutexes cannot be part of the cause of an error. - - -Single-lock state rules: ------------------------- - -A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The -following states are exclusive, and only one of them is allowed to be -set for any lock-class: - - and - and - -The validator detects and reports lock usage that violate these -single-lock state rules. - -Multi-lock dependency rules: ----------------------------- - -The same lock-class must not be acquired twice, because this could lead -to lock recursion deadlocks. - -Furthermore, two locks may not be taken in different order: - - -> - -> - -because this could lead to lock inversion deadlocks. (The validator -finds such dependencies in arbitrary complexity, i.e. there can be any -other locking sequence between the acquire-lock operations, the -validator will still track all dependencies between locks.) - -Furthermore, the following usage based lock dependencies are not allowed -between any two lock-classes: - - -> - -> - -The first rule comes from the fact the a hardirq-safe lock could be -taken by a hardirq context, interrupting a hardirq-unsafe lock - and -thus could result in a lock inversion deadlock. Likewise, a softirq-safe -lock could be taken by an softirq context, interrupting a softirq-unsafe -lock. - -The above rules are enforced for any locking sequence that occurs in the -kernel: when acquiring a new lock, the validator checks whether there is -any rule violation between the new lock and any of the held locks. - -When a lock-class changes its state, the following aspects of the above -dependency rules are enforced: - -- if a new hardirq-safe lock is discovered, we check whether it - took any hardirq-unsafe lock in the past. 
- -- if a new softirq-safe lock is discovered, we check whether it took - any softirq-unsafe lock in the past. - -- if a new hardirq-unsafe lock is discovered, we check whether any - hardirq-safe lock took it in the past. - -- if a new softirq-unsafe lock is discovered, we check whether any - softirq-safe lock took it in the past. - -(Again, we do these checks too on the basis that an interrupt context -could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which -could lead to a lock inversion deadlock - even if that lock scenario did -not trigger in practice yet.) - -Exception: Nested data dependencies leading to nested locking -------------------------------------------------------------- - -There are a few cases where the Linux kernel acquires more than one -instance of the same lock-class. Such cases typically happen when there -is some sort of hierarchy within objects of the same type. In these -cases there is an inherent "natural" ordering between the two objects -(defined by the properties of the hierarchy), and the kernel grabs the -locks in this fixed order on each of the objects. - -An example of such an object hierarchy that results in "nested locking" -is that of a "whole disk" block-dev object and a "partition" block-dev -object; the partition is "part of" the whole device and as long as one -always takes the whole disk lock as a higher lock than the partition -lock, the lock ordering is fully correct. The validator does not -automatically detect this natural ordering, as the locking rule behind -the ordering is not static. - -In order to teach the validator about this correct usage model, new -versions of the various locking primitives were added that allow you to -specify a "nesting level". An example call, for the block device mutex, -looks like this: - -enum bdev_bd_mutex_lock_class -{ - BD_MUTEX_NORMAL, - BD_MUTEX_WHOLE, - BD_MUTEX_PARTITION -}; - - mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); - -In this case the locking is done on a bdev object that is known to be a -partition. - -The validator treats a lock that is taken in such a nested fashion as a -separate (sub)class for the purposes of validation. - -Note: When changing code to use the _nested() primitives, be careful and -check really thoroughly that the hierarchy is correctly mapped; otherwise -you can get false positives or false negatives. - -Proof of 100% correctness: --------------------------- - -The validator achieves perfect, mathematical 'closure' (proof of locking -correctness) in the sense that for every simple, standalone single-task -locking sequence that occurred at least once during the lifetime of the -kernel, the validator proves it with a 100% certainty that no -combination and timing of these locking sequences can cause any class of -lock related deadlock. [*] - -I.e. complex multi-CPU and multi-task locking scenarios do not have to -occur in practice to prove a deadlock: only the simple 'component' -locking chains have to occur at least once (anytime, in any -task/context) for the validator to be able to prove correctness. (For -example, complex deadlocks that would normally need more than 3 CPUs and -a very unlikely constellation of tasks, irq-contexts and timings to -occur, can be detected on a plain, lightly loaded single-CPU system as -well!) 
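A minimal sketch of such 'component' chains (the lock names here are invented, not taken from the kernel source): two call paths that each execute once, in any task and at any time, already give the validator the A -> B and B -> A dependencies it needs to report the inversion:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(lock_a);
	static DEFINE_MUTEX(lock_b);

	static void path_one(void)
	{
		mutex_lock(&lock_a);
		mutex_lock(&lock_b);	/* records the chain A -> B */
		mutex_unlock(&lock_b);
		mutex_unlock(&lock_a);
	}

	static void path_two(void)
	{
		mutex_lock(&lock_b);
		mutex_lock(&lock_a);	/* records B -> A: inversion reported */
		mutex_unlock(&lock_a);
		mutex_unlock(&lock_b);
	}
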
- -This radically decreases the complexity of locking related QA of the -kernel: what has to be done during QA is to trigger as many "simple" -single-task locking dependencies in the kernel as possible, at least -once, to prove locking correctness - instead of having to trigger every -possible combination of locking interaction between CPUs, combined with -every possible hardirq and softirq nesting scenario (which is impossible -to do in practice). - -[*] assuming that the validator itself is 100% correct, and no other - part of the system corrupts the state of the validator in any way. - We also assume that all NMI/SMM paths [which could interrupt - even hardirq-disabled codepaths] are correct and do not interfere - with the validator. We also assume that the 64-bit 'chain hash' - value is unique for every lock-chain in the system. Also, lock - recursion must not be higher than 20. - -Performance: ------------- - -The above rules require _massive_ amounts of runtime checking. If we did -that for every lock taken and for every irqs-enable event, it would -render the system practically unusably slow. The complexity of checking -is O(N^2), so even with just a few hundred lock-classes we'd have to do -tens of thousands of checks for every event. - -This problem is solved by checking any given 'locking scenario' (unique -sequence of locks taken after each other) only once. A simple stack of -held locks is maintained, and a lightweight 64-bit hash value is -calculated, which hash is unique for every lock chain. The hash value, -when the chain is validated for the first time, is then put into a hash -table, which hash-table can be checked in a lockfree manner. If the -locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. - -Troubleshooting: ----------------- - -The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. -Exceeding this number will trigger the following lockdep warning: - - (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - -By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical -desktop systems have less than 1,000 lock classes, so this warning -normally results from lock-class leakage or failure to properly -initialize locks. These two problems are illustrated below: - -1. Repeated module loading and unloading while running the validator - will result in lock-class leakage. The issue here is that each - load of the module will create a new set of lock classes for - that module's locks, but module unloading does not remove old - classes (see below discussion of reuse of lock classes for why). - Therefore, if that module is loaded and unloaded repeatedly, - the number of lock classes will eventually reach the maximum. - -2. Using structures such as arrays that have large numbers of - locks that are not explicitly initialized. For example, - a hash table with 8192 buckets where each bucket has its own - spinlock_t will consume 8192 lock classes -unless- each spinlock - is explicitly initialized at runtime, for example, using the - run-time spin_lock_init() as opposed to compile-time initializers - such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize - the per-bucket spinlocks would guarantee lock-class overflow. - In contrast, a loop that called spin_lock_init() on each lock - would place all 8192 locks into a single lock class. - - The moral of this story is that you should always explicitly - initialize your locks. 
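A short sketch of case 2 above (the table structure is invented for illustration): initializing every bucket lock from one run-time loop keys all of them to a single call site, and therefore to a single lock class:

	#include <linux/kernel.h>
	#include <linux/spinlock.h>

	static struct {
		spinlock_t lock;
	} table[8192];

	static void __init table_init(void)
	{
		int i;

		/* One spin_lock_init() call site: one class for all 8192 locks. */
		for (i = 0; i < ARRAY_SIZE(table); i++)
			spin_lock_init(&table[i].lock);
	}
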
- -One might argue that the validator should be modified to allow -lock classes to be reused. However, if you are tempted to make this -argument, first review the code and think through the changes that would -be required, keeping in mind that the lock classes to be removed are -likely to be linked into the lock-dependency graph. This turns out to -be harder to do than to say. - -Of course, if you do run out of lock classes, the next thing to do is -to find the offending lock classes. First, the following command gives -you the number of lock classes currently in use along with the maximum: - - grep "lock-classes" /proc/lockdep_stats - -This command produces the following output on a modest system: - - lock-classes: 748 [max: 8191] - -If the number allocated (748 above) increases continually over time, -then there is likely a leak. The following command can be used to -identify the leaking lock classes: - - grep "BD" /proc/lockdep - -Run the command and save the output, then compare against the output from -a later run of this command to identify the leakers. This same output -can also help you find situations where runtime lock initialization has -been omitted. diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt new file mode 100644 index 000000000000..5dbc99c04f6e --- /dev/null +++ b/Documentation/locking/lockdep-design.txt @@ -0,0 +1,286 @@ +Runtime locking correctness validator +===================================== + +started by Ingo Molnar +additions by Arjan van de Ven + +Lock-class +---------- + +The basic object the validator operates upon is a 'class' of locks. + +A class of locks is a group of locks that are logically the same with +respect to locking rules, even if the locks may have multiple (possibly +tens of thousands of) instantiations. For example a lock in the inode +struct is one class, while each inode has its own instantiation of that +lock class. + +The validator tracks the 'state' of lock-classes, and it tracks +dependencies between different lock-classes. The validator maintains a +rolling proof that the state and the dependencies are correct. + +Unlike an lock instantiation, the lock-class itself never goes away: when +a lock-class is used for the first time after bootup it gets registered, +and all subsequent uses of that lock-class will be attached to this +lock-class. + +State +----- + +The validator tracks lock-class usage history into 4n + 1 separate state bits: + +- 'ever held in STATE context' +- 'ever held as readlock in STATE context' +- 'ever held with STATE enabled' +- 'ever held as readlock with STATE enabled' + +Where STATE can be either one of (kernel/lockdep_states.h) + - hardirq + - softirq + - reclaim_fs + +- 'ever used' [ == !unused ] + +When locking rules are violated, these state bits are presented in the +locking error messages, inside curlies. A contrived example: + + modprobe/2287 is trying to acquire lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + but task is already holding lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + +The bit position indicates STATE, STATE-read, for each of the states listed +above, and the character displayed in each indicates: + + '.' acquired while irqs disabled and not in irq context + '-' acquired in irq context + '+' acquired with irqs enabled + '?' acquired in irq context with irqs enabled. + +Unused mutexes cannot be part of the cause of an error. 
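To ground the class-versus-instance distinction from the 'Lock-class' section above, a minimal sketch (the names are invented): each object carries its own spinlock instance, yet the single spin_lock_init() call site gives them all one lock-class:

	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct item {
		spinlock_t lock;	/* one instantiation per item... */
		int data;
	};

	static struct item *item_alloc(void)
	{
		struct item *it = kzalloc(sizeof(*it), GFP_KERNEL);

		if (it)
			spin_lock_init(&it->lock);	/* ...but one shared class */
		return it;
	}
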
+ + +Single-lock state rules: +------------------------ + +A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The +following states are exclusive, and only one of them is allowed to be +set for any lock-class: + + <hardirq-safe> and <hardirq-unsafe> + <softirq-safe> and <softirq-unsafe> + +The validator detects and reports lock usage that violates these +single-lock state rules. + +Multi-lock dependency rules: +---------------------------- + +The same lock-class must not be acquired twice, because this could lead +to lock recursion deadlocks. + +Furthermore, two locks may not be taken in different order: + + <L1> -> <L2> + <L2> -> <L1> + +because this could lead to lock inversion deadlocks. (The validator +finds such dependencies in arbitrary complexity, i.e. there can be any +other locking sequence between the acquire-lock operations, the +validator will still track all dependencies between locks.) + +Furthermore, the following usage-based lock dependencies are not allowed +between any two lock-classes: + + <hardirq-safe> -> <hardirq-unsafe> + <softirq-safe> -> <softirq-unsafe> + +The first rule comes from the fact that a hardirq-safe lock could be +taken by a hardirq context, interrupting a hardirq-unsafe lock - and +thus could result in a lock inversion deadlock. Likewise, a softirq-safe +lock could be taken by a softirq context, interrupting a softirq-unsafe +lock. + +The above rules are enforced for any locking sequence that occurs in the +kernel: when acquiring a new lock, the validator checks whether there is +any rule violation between the new lock and any of the held locks. + +When a lock-class changes its state, the following aspects of the above +dependency rules are enforced: + +- if a new hardirq-safe lock is discovered, we check whether it + took any hardirq-unsafe lock in the past. + +- if a new softirq-safe lock is discovered, we check whether it took + any softirq-unsafe lock in the past. + +- if a new hardirq-unsafe lock is discovered, we check whether any + hardirq-safe lock took it in the past. + +- if a new softirq-unsafe lock is discovered, we check whether any + softirq-safe lock took it in the past. + +(Again, we do these checks too on the basis that an interrupt context +could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which +could lead to a lock inversion deadlock - even if that lock scenario did +not trigger in practice yet.) + +Exception: Nested data dependencies leading to nested locking +------------------------------------------------------------- + +There are a few cases where the Linux kernel acquires more than one +instance of the same lock-class. Such cases typically happen when there +is some sort of hierarchy within objects of the same type. In these +cases there is an inherent "natural" ordering between the two objects +(defined by the properties of the hierarchy), and the kernel grabs the +locks in this fixed order on each of the objects. + +An example of such an object hierarchy that results in "nested locking" +is that of a "whole disk" block-dev object and a "partition" block-dev +object; the partition is "part of" the whole device and as long as one +always takes the whole disk lock as a higher lock than the partition +lock, the lock ordering is fully correct. The validator does not +automatically detect this natural ordering, as the locking rule behind +the ordering is not static. + +In order to teach the validator about this correct usage model, new +versions of the various locking primitives were added that allow you to +specify a "nesting level".
An example call, for the block device mutex, +looks like this: + +enum bdev_bd_mutex_lock_class +{ + BD_MUTEX_NORMAL, + BD_MUTEX_WHOLE, + BD_MUTEX_PARTITION +}; + + mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); + +In this case the locking is done on a bdev object that is known to be a +partition. + +The validator treats a lock that is taken in such a nested fashion as a +separate (sub)class for the purposes of validation. + +Note: When changing code to use the _nested() primitives, be careful and +check really thoroughly that the hierarchy is correctly mapped; otherwise +you can get false positives or false negatives. + +Proof of 100% correctness: +-------------------------- + +The validator achieves perfect, mathematical 'closure' (proof of locking +correctness) in the sense that for every simple, standalone single-task +locking sequence that occurred at least once during the lifetime of the +kernel, the validator proves it with a 100% certainty that no +combination and timing of these locking sequences can cause any class of +lock related deadlock. [*] + +I.e. complex multi-CPU and multi-task locking scenarios do not have to +occur in practice to prove a deadlock: only the simple 'component' +locking chains have to occur at least once (anytime, in any +task/context) for the validator to be able to prove correctness. (For +example, complex deadlocks that would normally need more than 3 CPUs and +a very unlikely constellation of tasks, irq-contexts and timings to +occur, can be detected on a plain, lightly loaded single-CPU system as +well!) + +This radically decreases the complexity of locking related QA of the +kernel: what has to be done during QA is to trigger as many "simple" +single-task locking dependencies in the kernel as possible, at least +once, to prove locking correctness - instead of having to trigger every +possible combination of locking interaction between CPUs, combined with +every possible hardirq and softirq nesting scenario (which is impossible +to do in practice). + +[*] assuming that the validator itself is 100% correct, and no other + part of the system corrupts the state of the validator in any way. + We also assume that all NMI/SMM paths [which could interrupt + even hardirq-disabled codepaths] are correct and do not interfere + with the validator. We also assume that the 64-bit 'chain hash' + value is unique for every lock-chain in the system. Also, lock + recursion must not be higher than 20. + +Performance: +------------ + +The above rules require _massive_ amounts of runtime checking. If we did +that for every lock taken and for every irqs-enable event, it would +render the system practically unusably slow. The complexity of checking +is O(N^2), so even with just a few hundred lock-classes we'd have to do +tens of thousands of checks for every event. + +This problem is solved by checking any given 'locking scenario' (unique +sequence of locks taken after each other) only once. A simple stack of +held locks is maintained, and a lightweight 64-bit hash value is +calculated, which hash is unique for every lock chain. The hash value, +when the chain is validated for the first time, is then put into a hash +table, which hash-table can be checked in a lockfree manner. If the +locking chain occurs again later on, the hash table tells us that we +dont have to validate the chain again. + +Troubleshooting: +---------------- + +The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. 
+Exceeding this number will trigger the following lockdep warning: + + (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + +By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical +desktop systems have less than 1,000 lock classes, so this warning +normally results from lock-class leakage or failure to properly +initialize locks. These two problems are illustrated below: + +1. Repeated module loading and unloading while running the validator + will result in lock-class leakage. The issue here is that each + load of the module will create a new set of lock classes for + that module's locks, but module unloading does not remove old + classes (see below discussion of reuse of lock classes for why). + Therefore, if that module is loaded and unloaded repeatedly, + the number of lock classes will eventually reach the maximum. + +2. Using structures such as arrays that have large numbers of + locks that are not explicitly initialized. For example, + a hash table with 8192 buckets where each bucket has its own + spinlock_t will consume 8192 lock classes -unless- each spinlock + is explicitly initialized at runtime, for example, using the + run-time spin_lock_init() as opposed to compile-time initializers + such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize + the per-bucket spinlocks would guarantee lock-class overflow. + In contrast, a loop that called spin_lock_init() on each lock + would place all 8192 locks into a single lock class. + + The moral of this story is that you should always explicitly + initialize your locks. + +One might argue that the validator should be modified to allow +lock classes to be reused. However, if you are tempted to make this +argument, first review the code and think through the changes that would +be required, keeping in mind that the lock classes to be removed are +likely to be linked into the lock-dependency graph. This turns out to +be harder to do than to say. + +Of course, if you do run out of lock classes, the next thing to do is +to find the offending lock classes. First, the following command gives +you the number of lock classes currently in use along with the maximum: + + grep "lock-classes" /proc/lockdep_stats + +This command produces the following output on a modest system: + + lock-classes: 748 [max: 8191] + +If the number allocated (748 above) increases continually over time, +then there is likely a leak. The following command can be used to +identify the leaking lock classes: + + grep "BD" /proc/lockdep + +Run the command and save the output, then compare against the output from +a later run of this command to identify the leakers. This same output +can also help you find situations where runtime lock initialization has +been omitted. diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt new file mode 100644 index 000000000000..7428773a1e69 --- /dev/null +++ b/Documentation/locking/lockstat.txt @@ -0,0 +1,178 @@ + +LOCK STATISTICS + +- WHAT + +As the name suggests, it provides statistics on locks. + +- WHY + +Because things like lock contention can severely impact performance. + +- HOW + +Lockdep already has hooks in the lock functions and maps lock instances to +lock classes. We build on that (see Documentation/locking/lockdep-design.txt). +The graph below shows the relation between the lock functions and the various +hooks therein. + + __acquire + | + lock _____ + | \ + | __contended + | | + | <wait> + | _______/ + |/ + | + __acquired + | + . + <hold> + .
+ | + __release + | + unlock + +lock, unlock - the regular lock functions +__* - the hooks +<> - states + +With these hooks we provide the following statistics: + + con-bounces - number of lock contention that involved x-cpu data + contentions - number of lock acquisitions that had to wait + wait time min - shortest (non-0) time we ever had to wait for a lock + max - longest time we ever had to wait for a lock + total - total time we spend waiting on this lock + avg - average time spent waiting on this lock + acq-bounces - number of lock acquisitions that involved x-cpu data + acquisitions - number of times we took the lock + hold time min - shortest (non-0) time we ever held the lock + max - longest time we ever held the lock + total - total time this lock was held + avg - average time this lock was held + +These numbers are gathered per lock class, per read/write state (when +applicable). + +It also tracks 4 contention points per class. A contention point is a call site +that had to wait on lock acquisition. + + - CONFIGURATION + +Lock statistics are enabled via CONFIG_LOCK_STAT. + + - USAGE + +Enable collection of statistics: + +# echo 1 >/proc/sys/kernel/lock_stat + +Disable collection of statistics: + +# echo 0 >/proc/sys/kernel/lock_stat + +Look at the current lock statistics: + +( line numbers not part of actual output, done for clarity in the explanation + below ) + +# less /proc/lock_stat + +01 lock_stat version 0.4 +02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg +04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +05 +06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 +07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 +08 --------------- +09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 +19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 +11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 +12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 +13 --------------- +14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 +15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 +16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 +17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 +18 +19............................................................................................................................................................................................................................. 
+20 +21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 +22 --------------- +23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 +24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 +25 unix_table_lock 15 [] unix_find_other+0x117/0x230 +26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 +27 --------------- +28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 +29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 +30 unix_table_lock 20 [] unix_find_other+0x117/0x230 +31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 + + +This excerpt shows the first two lock class statistics. Line 01 shows the +output version - each time the format changes this will be updated. Line 02-04 +show the header with column descriptions. Lines 05-18 and 20-31 show the actual +statistics. These statistics come in two parts; the actual stats separated by a +short separator (line 08, 13) from the contention points. + +The first lock (05-18) is a read/write lock, and shows two lines above the +short separator. The contention points don't match the column descriptors, +they have two: contentions and [] symbol. The second set of contention +points are the points we're contending with. + +The integer part of the time values is in us. + +Dealing with nested locks, subclasses may appear: + +32........................................................................................................................................................................................................................... +33 +34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 +35 --------- +36 &rq->lock 645 [] task_rq_lock+0x43/0x75 +37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a +38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a +39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb +40 --------- +41 &rq->lock 77 [] task_rq_lock+0x43/0x75 +42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a +43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 +44 &rq->lock 893 [] schedule+0x157/0x7b8 +45 +46........................................................................................................................................................................................................................... +47 +48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 +49 ----------- +50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 +51 ----------- +52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 +53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 +54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 +55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a + +Line 48 shows statistics for the second subclass (/1) of &rq->lock class +(subclass starts from 0), since in this case, as line 50 suggests, +double_rq_lock actually acquires a nested lock of two spinlocks. 
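A rough sketch of how such a /1 subclass arises (the helper below is invented; double_rq_lock() itself lives in the scheduler): the second lock of the same class is taken with an explicit nesting annotation so lockdep files it under subclass 1:

	#include <linux/kernel.h>
	#include <linux/spinlock.h>

	static void lock_two(spinlock_t *l1, spinlock_t *l2)
	{
		if (l2 < l1)	/* fixed address order avoids a real AB-BA */
			swap(l1, l2);
		spin_lock(l1);
		/* Same lock-class again: annotate as subclass 1 for lockdep. */
		spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
	}
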
+
+View the top contending locks:
+
+# grep : /proc/lock_stat | head
+	clockevents_lock:                  2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
+	tick_broadcast_lock:                346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
+	&mapping->i_mmap_mutex:             203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
+	&rq->lock:                          135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
+	&(&zone->lru_lock)->rlock:           93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
+	tasklist_lock-W:                     40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
+	tasklist_lock-R:                     21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
+	rcu_node_1:                          47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
+	&(&dentry->d_lockref.lock)->rlock:   39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
+	rcu_node_0:                          29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
+
+Clear the statistics:
+
+# echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
new file mode 100644
index 000000000000..ee231ed09ec6
--- /dev/null
+++ b/Documentation/locking/mutex-design.txt
@@ -0,0 +1,157 @@
+Generic Mutex Subsystem
+
+started by Ingo Molnar
+updated by Davidlohr Bueso
+
+What are mutexes?
+-----------------
+
+In the Linux kernel, mutexes refer to a particular locking primitive
+that enforces serialization on shared memory systems, and not only to
+the generic term referring to 'mutual exclusion' found in academia
+or similar theoretical textbooks. Mutexes are sleeping locks which
+behave similarly to binary semaphores, and were introduced in 2006[1]
+as an alternative to these. This new data structure provided a number
+of advantages, including simpler interfaces, and at that time smaller
+code (see Disadvantages).
+
+[1] http://lwn.net/Articles/164802/
+
+Implementation
+--------------
+
+Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
+and implemented in kernel/locking/mutex.c. These locks use a three-state
+atomic counter (->count) to represent the different possible
+transitions that can occur during the lifetime of a lock:
+
+	  1: unlocked
+	  0: locked, no waiters
+   negative: locked, with potential waiters
+
+In its most basic form it also includes a wait-queue and a spinlock
+that serializes access to it. CONFIG_SMP systems can also include
+a pointer to the lock task owner (->owner) as well as a spinner MCS
+lock (->osq), both described below in (ii).
+
+When acquiring a mutex, there are three possible paths that can be
+taken, depending on the state of the lock:
+
+(i) fastpath: tries to atomically acquire the lock by decrementing the
+    counter. If it was already taken by another task it goes to the next
+    possible path. This logic is architecture specific. On x86-64, the
+    locking fastpath is 2 instructions:
+
+    0000000000000e10 <mutex_lock>:
+    e21:   f0 ff 0b                lock decl (%rbx)
+    e24:   79 08                   jns    e2e <mutex_lock+0x1e>
+
+   the unlocking fastpath is equally tight:
+
+    0000000000000bc0 <mutex_unlock>:
+    bc8:   f0 ff 07                lock incl (%rdi)
+    bcb:   7f 0a                   jg     bd7 <mutex_unlock+0x17>
+
+
+(ii) midpath: aka optimistic spinning, tries to spin for acquisition
+     while the lock owner is running and there are no other tasks ready
+     to run that have higher priority (need_resched).
+     The rationale is
+     that if the lock owner is running, it is likely to release the lock
+     soon. The mutex spinners are queued up using MCS lock so that only
+     one spinner can compete for the mutex.
+
+     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
+     with the desirable properties of being fair and with each cpu trying
+     to acquire the lock spinning on a local variable. It avoids expensive
+     cacheline bouncing that common test-and-set spinlock implementations
+     incur. An MCS-like lock is specially tailored for optimistic spinning
+     in sleeping lock implementations. An important feature of the customized
+     MCS lock is that it has the extra property that spinners are able to exit
+     the MCS spinlock queue when they need to reschedule. This further helps
+     avoid situations where MCS spinners that need to reschedule would continue
+     waiting to spin on the mutex owner, only to go directly to slowpath upon
+     obtaining the MCS lock.
+
+
+(iii) slowpath: last resort, if the lock is still unable to be acquired,
+      the task is added to the wait-queue and sleeps until woken up by the
+      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
+
+While formally kernel mutexes are sleepable locks, it is path (ii) that
+makes them more practically a hybrid type. By simply not interrupting a
+task and busy-waiting for a few cycles instead of immediately sleeping,
+the performance of this lock has been seen to significantly improve a
+number of workloads. Note that this technique is also used for rw-semaphores.
+
+Semantics
+---------
+
+The mutex subsystem checks and enforces the following rules:
+
+    - Only one task can hold the mutex at a time.
+    - Only the owner can unlock the mutex.
+    - Multiple unlocks are not permitted.
+    - Recursive locking/unlocking is not permitted.
+    - A mutex must only be initialized via the API (see below).
+    - A task may not exit with a mutex held.
+    - Memory areas where held locks reside must not be freed.
+    - Held mutexes must not be reinitialized.
+    - Mutexes may not be used in hardware or software interrupt
+      contexts such as tasklets and timers.
+
+These semantics are fully enforced when CONFIG_DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other
+features that make lock debugging easier and faster:
+
+    - Uses symbolic names of mutexes, whenever they are printed
+      in debug output.
+    - Point-of-acquire tracking, symbolic lookup of function names,
+      list of all locks held in the system, printout of them.
+    - Owner tracking.
+    - Detects self-recursing locks and prints out all relevant info.
+    - Detects multi-task circular deadlocks and prints out all affected
+      locks and tasks (and only those tasks).
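+
+As a minimal sketch of usage that respects these rules (the names here are
+hypothetical, for illustration only):
+
+static DEFINE_MUTEX(hypo_lock);	/* initialized via the API */
+static int hypo_counter;
+
+int hypo_increment(void)
+{
+	int ret;
+
+	/* may sleep: never call this from interrupt context */
+	mutex_lock(&hypo_lock);
+	ret = ++hypo_counter;
+	mutex_unlock(&hypo_lock);	/* unlocked by the owner, exactly once */
+
+	return ret;
+}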
+
+
+Interfaces
+----------
+Statically define the mutex:
+   DEFINE_MUTEX(name);
+
+Dynamically initialize the mutex:
+   mutex_init(mutex);
+
+Acquire the mutex, uninterruptible:
+   void mutex_lock(struct mutex *lock);
+   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+   int  mutex_trylock(struct mutex *lock);
+
+Acquire the mutex, interruptible:
+   int mutex_lock_interruptible_nested(struct mutex *lock,
+				       unsigned int subclass);
+   int mutex_lock_interruptible(struct mutex *lock);
+
+Acquire the mutex, interruptible, if dec to 0:
+   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+
+Unlock the mutex:
+   void mutex_unlock(struct mutex *lock);
+
+Test if the mutex is taken:
+   int mutex_is_locked(struct mutex *lock);
+
+Disadvantages
+-------------
+
+Unlike its original design and purpose, 'struct mutex' is larger than
+most locks in the kernel. E.g. on x86-64 it is 40 bytes, almost twice
+as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the
+'struct rw_semaphore' variant. Larger structure sizes mean more CPU
+cache and memory footprint.
+
+When to use mutexes
+-------------------
+
+Unless the strict semantics of mutexes are unsuitable and/or the critical
+region prevents the lock from being shared, always prefer them to any other
+locking primitive.
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
new file mode 100644
index 000000000000..8666070d3189
--- /dev/null
+++ b/Documentation/locking/rt-mutex-design.txt
@@ -0,0 +1,781 @@
+#
+# Copyright (c) 2006 Steven Rostedt
+# Licensed under the GNU Free Documentation License, Version 1.2
+#
+
+RT-mutex implementation design
+------------------------------
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/rt-mutex.txt. This document does explain problems that would
+happen without this code, but only as background for understanding what the
+code actually does.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as the reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run. This happens for several reasons, and
+most of the time it can't be helped. Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource. This is a priority inversion. What we want to prevent
+is something called unbounded priority inversion. That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A, which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock. This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem.
+
+   grab lock L1 (owned by C)
+     |
+A ---+
+        C preempted by B
+          |
+C    +----+
+
+B         +-------->
+                B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document. Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process. To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A. So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A. As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks. This is described in more detail
+           later in this document.
+
+mutex    - In this document, from now on, the locks that implement PI will be
+           called mutexes, to differentiate them from the spin locks that are
+           used inside the PI code.
+
+lock     - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm. These locks disable preemption for UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevent multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock - Same as lock above.
+
+waiter   - A waiter is a struct that is stored on the stack of a blocked
+           process. Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable). This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on. It also has the plist node structures to
+           place the task in the waiter_list of a mutex as well as the
+           pi_list of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters  - A list of processes that are blocked on a mutex.
+
+top waiter - The highest priority process waiting on a specific mutex.
+
+top pi waiter - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note: task and process are used interchangeably in this document, mostly to
+      differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place. Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+ +Example: + + Process: A, B, C, D, E + Mutexes: L1, L2, L3, L4 + + A owns: L1 + B blocked on L1 + B owns L2 + C blocked on L2 + C owns L3 + D blocked on L3 + D owns L4 + E blocked on L4 + +The chain would be: + + E->L4->D->L3->C->L2->B->L1->A + +To show where two chains merge, we could add another process F and +another mutex L5 where B owns L5 and F is blocked on mutex L5. + +The chain for F would be: + + F->L5->B->L1->A + +Since a process may own more than one mutex, but never be blocked on more than +one, the chains merge. + +Here we show both chains: + + E->L4->D->L3->C->L2-+ + | + +->B->L1->A + | + F->L5-+ + +For PI to work, the processes at the right end of these chains (or we may +also call it the Top of the chain) must be equal to or higher in priority +than the processes to the left or below in the chain. + +Also since a mutex may have more than one process blocked on it, we can +have multiple chains merge at mutexes. If we add another process G that is +blocked on mutex L2: + + G->L2->B->L1->A + +And once again, to show how this can grow I will show the merging chains +again. + + E->L4->D->L3->C-+ + +->L2-+ + | | + G-+ +->B->L1->A + | + F->L5-+ + + +Plist +----- + +Before I go further and talk about how the PI chain is stored through lists +on both mutexes and processes, I'll explain the plist. This is similar to +the struct list_head functionality that is already in the kernel. +The implementation of plist is out of scope for this document, but it is +very important to understand what it does. + +There are a few differences between plist and list, the most important one +being that plist is a priority sorted linked list. This means that the +priorities of the plist are sorted, such that it takes O(1) to retrieve the +highest priority item in the list. Obviously this is useful to store processes +based on their priorities. + +Another difference, which is important for implementation, is that, unlike +list, the head of the list is a different element than the nodes of a list. +So the head of the list is declared as struct plist_head and nodes that will +be added to the list are declared as struct plist_node. + + +Mutex Waiter List +----------------- + +Every mutex keeps track of all the waiters that are blocked on itself. The mutex +has a plist to store these waiters by priority. This list is protected by +a spin lock that is located in the struct of the mutex. This lock is called +wait_lock. Since the modification of the waiter list is never done in +interrupt context, the wait_lock can be taken without disabling interrupts. + + +Task PI List +------------ + +To keep track of the PI chains, each process has its own PI list. This is +a list of all top waiters of the mutexes that are owned by the process. +Note that this list only holds the top waiters and not all waiters that are +blocked on mutexes owned by the process. + +The top of the task's PI list is always the highest priority task that +is waiting on a mutex that is owned by the task. So if the task has +inherited a priority, it will always be the priority of the task that is +at the top of this list. + +This list is stored in the task structure of a process as a plist called +pi_list. This list is protected by a spin lock also in the task structure, +called pi_lock. This lock may also be taken in interrupt context, so when +locking the pi_lock, interrupts must be disabled. + + +Depth of the PI Chain +--------------------- + +The maximum depth of the PI chain is not dynamic, and could actually be +defined. 
+But it is very complex to figure out, since it depends on all
+the nesting of mutexes. Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way.
+
+void func1(void)
+{
+	mutex_lock(L1);
+
+	/* do anything */
+
+	mutex_unlock(L1);
+}
+
+void func2(void)
+{
+	mutex_lock(L1);
+	mutex_lock(L2);
+
+	/* do something */
+
+	mutex_unlock(L2);
+	mutex_unlock(L1);
+}
+
+void func3(void)
+{
+	mutex_lock(L2);
+	mutex_lock(L3);
+
+	/* do something else */
+
+	mutex_unlock(L3);
+	mutex_unlock(L2);
+}
+
+void func4(void)
+{
+	mutex_lock(L3);
+
+	/* do something again */
+
+	mutex_unlock(L3);
+}
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last. With D being preempted
+in func4 in the "do something again" area, we have the following locking:
+
+D owns L3
+        C blocked on L3
+        C owns L2
+                B blocked on L2
+                B owns L1
+                        A blocked on L1
+
+And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two. So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a
+DoS-type application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data. So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain. More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex. If the
+mutex is not owned, this owner is set to NULL. Since all architectures
+have the task structure on at least a four-byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the two
+least significant bits to be used as flags. This part is also described
+in Documentation/rt-mutex.txt, but will also be briefly described here.
+
+Bit 0 is used as the "Pending Owner" flag. This is described later.
+Bit 1 is used as the "Has Waiters" flag. This is also described later
+  in more detail, but is set whenever there are waiters on a mutex.
+
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange). This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically:
+
+unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+{
+	unsigned long T = *A;
+	if (*A == *B) {
+		*A = *C;
+	}
+	return T;
+}
+#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be. You know it succeeded if the
+return value (the old value of *A) is equal to *B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.
+But if CMPXCHG is supported, then this
+helps enormously to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize
+the system for architectures that support it. This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places where a
+process must adjust its priority. With the help of a process's pi_list,
+it is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio,
+__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock
+to already be taken), rt_mutex_getprio, and rt_mutex_setprio.
+
+rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio.
+
+rt_mutex_getprio returns the priority that the task should have. Either the
+task's own normal priority, or if a process of a higher priority is waiting on
+a mutex owned by the task, then that higher priority should be returned.
+Since the pi_list of a task holds a priority-ordered list of all the top
+waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
+to compare the top pi waiter to its own normal priority, and return the higher
+priority back.
+
+(Note: if looking at the code, you will notice that the lower prio number is
+ returned. This is because the prio field in the task structure is an inverse
+ order of the actual priority. So a "prio" of 5 is of higher priority than a
+ "prio" of 10.)
+
+__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
+result does not equal the task's current priority, then rt_mutex_setprio
+is called to adjust the priority of the task to the new priority.
+Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the
+actual change in priority.
+
+It is interesting to note that __rt_mutex_adjust_prio can either increase
+or decrease the priority of the task. In the case that a higher priority
+process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio
+would increase/boost the task's priority. But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task. That is because the pi_list
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best. It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, and a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting).
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.
+That also means
+that the state of the owner and lock can change while inside this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it. This means that the task is set to the priority that it
+should be at, but the plist nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_lists and wait_lists that the task is blocked on. This function
+solves all that.
+
+A loop is entered, where, on the first iteration, task is the owner to be
+checked for PI changes that was passed in by parameter. The pi_lock of this
+task is taken to prevent any more changes to the pi_list of the task. This
+also prevents new tasks from completing the blocking on a mutex that is
+owned by this task.
+
+If the task is not blocked on a mutex then the loop is exited. We are at
+the top of the PI chain.
+
+A check is now done to see if the original waiter (the process that is blocked
+on the current mutex) is the top pi waiter of the task. That is, is this
+waiter at the top of the task's pi_list? If it is not, it either means that
+there is another process higher in priority that is blocked on one of the
+mutexes that the task owns, or that the waiter has just woken up via a signal
+or timeout and has left the PI chain. In either case, the loop is exited, since
+we don't need to do any more changes to the priority of the current task, or any
+task that owns a mutex that this current task is waiting on. A priority chain
+walk is only needed when a task gains a new top pi waiter.
+
+The next check sees if the task's waiter plist node has a priority equal to
+the priority the task is currently set at. If they are equal, then we are done
+with the loop. Remember that the function started with the priority of the
+task adjusted, but the plist nodes that hold the task in other processes'
+pi_lists have not been adjusted.
+
+Next, we look at the mutex that the task is blocked on. The mutex's wait_lock
+is taken. This is done by a spin_trylock, because the locking order of the
+pi_lock and wait_lock goes in the opposite direction. If we fail to grab the
+lock, the pi_lock is released, and we restart the loop.
+
+Now that we have both the pi_lock of the task as well as the wait_lock of
+the mutex the task is blocked on, we update the task's waiter's plist node
+that is located on the mutex's wait_list.
+
+Now we release the pi_lock of the task.
+
+Next the owner of the mutex has its pi_lock taken, so we can update the
+task's entry in the owner's pi_list. If the task is the highest priority
+process on the mutex's wait_list, then we remove the previous top waiter
+from the owner's pi_list, and replace it with the task.
+
+Note: It is possible that the task was the current top waiter on the mutex,
+      in which case the task is not yet on the pi_list of the waiter. This
+      is OK, since plist_del does nothing if the plist node is not on any
+      list.
+
+If the task was not the top waiter of the mutex, but it was before we
+did the priority updates, that means we are deboosting/lowering the
+task. In this case, the task is removed from the pi_list of the owner,
+and the new top waiter is added.
+
+Lastly, we unlock both the pi_lock of the task, as well as the mutex's
+wait_lock, and continue the loop again. On the next iteration of the
+loop, the previous owner of the mutex will be the task that will be
+processed.
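+
+Compressed into hedged pseudo-C, the loop looks roughly like the sketch
+below. The helper names (task_top_pi_waiter, waiter_priority, requeue_waiter,
+fixup_owner_pi_list) are made up for illustration; they are not the actual
+rtmutex.c functions:
+
+	for (;;) {
+		spin_lock(&task->pi_lock);
+		waiter = task->pi_blocked_on;
+		if (!waiter)
+			break;			/* top of the PI chain */
+		if (waiter != task_top_pi_waiter(task))
+			break;			/* not the top pi waiter: nothing to do */
+		if (waiter_priority(waiter) == task->prio)
+			break;			/* plist nodes already up to date */
+
+		lock = waiter->lock;
+		if (!spin_trylock(&lock->wait_lock)) {
+			/* opposite locking order: drop pi_lock and retry */
+			spin_unlock(&task->pi_lock);
+			continue;
+		}
+		requeue_waiter(lock, waiter);	/* re-sort on the mutex wait_list */
+		spin_unlock(&task->pi_lock);
+
+		owner = rt_mutex_owner(lock);
+		spin_lock(&owner->pi_lock);
+		fixup_owner_pi_list(owner, lock, waiter);
+		__rt_mutex_adjust_prio(owner);
+		spin_unlock(&owner->pi_lock);
+		spin_unlock(&lock->wait_lock);
+
+		task = owner;			/* walk one step up the chain */
+	}
+	spin_unlock(&task->pi_lock);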
+
+Note: One might think that the owner of this mutex might have changed
+      since we only just grabbed the mutex's wait_lock. And one could be
+      right. The important thing to remember is that the owner could not
+      have become the task that is being processed in the PI chain, since
+      we have taken that task's pi_lock at the beginning of the loop.
+      So as long as there is an owner of this mutex that is not the same
+      process as the task being worked on, we are OK.
+
+      Looking closely at the code, one might be confused. The check for the
+      end of the PI chain is when the task isn't blocked on anything or the
+      task's waiter structure "task" element is NULL. This check is
+      protected only by the task's pi_lock. But the code to unlock the mutex
+      sets the task's waiter structure "task" element to NULL with only
+      the protection of the mutex's wait_lock, which was not taken yet.
+      Isn't this a race condition if the task becomes the new owner?
+
+      The answer is No! The trick is the spin_trylock of the mutex's
+      wait_lock. If we fail that lock, we release the pi_lock of the
+      task and continue the loop, doing the end of PI chain check again.
+
+      In the code to release the lock, the wait_lock of the mutex is held
+      the entire time, and it is not let go when we grab the pi_lock of the
+      new owner of the mutex. So if the switch of a new owner were to happen
+      after the check for end of the PI chain and the grabbing of the
+      wait_lock, the unlocking code would spin on the new owner's pi_lock
+      but never give up the wait_lock. So the PI chain loop is guaranteed to
+      fail the spin_trylock on the wait_lock, release the pi_lock, and
+      try again.
+
+      If you don't quite understand the above, that's OK. You don't have to,
+      unless you really want to make a proof out of it ;)
+
+
+Pending Owners and Lock stealing
+--------------------------------
+
+One of the flags in the owner field of the mutex structure is "Pending Owner".
+What this means is that an owner was chosen by the process releasing the
+mutex, but that owner has yet to wake up and actually take the mutex.
+
+Why is this important? Why can't we just give the mutex to another process
+and be done with it?
+
+The PI code is to help with real-time processes, and to let the highest
+priority process run as long as possible with little latencies and delays.
+If a high priority process owns a mutex that a lower priority process is
+blocked on, when the mutex is released it would be given to the lower priority
+process. What if the higher priority process wants to take that mutex again?
+The high priority process would fail to take the mutex that it just gave up,
+and it would need to boost the lower priority process and then wait out the
+full latency of that critical section (since the low priority process just
+entered it).
+
+There's no reason a high priority process that gives up a mutex should be
+penalized if it tries to take that mutex again. If the new owner of the
+mutex has not woken up yet, there's no reason that the higher priority process
+could not take that mutex away.
+
+To solve this, we introduced Pending Ownership and Lock Stealing. When a
+new process is given a mutex that it was blocked on, it is only given
+pending ownership. This means that it's the new owner, unless a higher
+priority process comes in and tries to grab that mutex. If a higher priority
+process does come along and wants that mutex, we let the higher priority
+process "steal" the mutex from the pending owner (only if it is still pending)
+and continue with the mutex.
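+
+To make the "Pending Owner" and "Has Waiters" bits concrete, here is a hedged
+sketch of how the two low bits can be packed into the owner pointer. The
+macro and function names below are illustrative and may not match rtmutex.c
+exactly:
+
+#define RT_MUTEX_OWNER_PENDING	1UL	/* bit 0: owner is only pending */
+#define RT_MUTEX_HAS_WAITERS	2UL	/* bit 1: mutex has waiters */
+#define RT_MUTEX_OWNER_MASKALL	3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+	/* strip the flag bits to recover the task pointer */
+	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}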
+
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex. This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails). Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, whether it is owned or has a pending
+owner, we take the slow path (rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack. This is because the waiter structure is only needed for the
+scope of this function. The waiter structure holds the nodes to store
+the task on the wait_list of the mutex, and if need be, the pi_list of
+the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex. This is where an architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path. The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. Yes, this could really
+be false, because if the mutex has no owner, there are no waiters and
+the current task also won't have any waiters. But we don't have the lock
+yet, so we assume we are going to be a waiter. The reason for this is to
+play nice for those architectures that do have CMPXCHG. By setting this flag
+now, the owner of the mutex can't release the mutex without going into the
+slow unlock path, and it would then need to grab the wait_lock, which this
+code currently holds. So setting the "Has Waiters" flag forces the owner
+to synchronize with this code.
+
+Now that we know that we can't have any races with the owner releasing the
+mutex, we check to see if we can take the ownership. This is done if the
+mutex doesn't have an owner, or if we can steal the mutex from a pending
+owner. Let's look at the situations we have here.
+
+  1) Has owner that is pending
+  ----------------------------
+
+  The mutex has an owner, but it hasn't woken up and the mutex flag
+  "Pending Owner" is set. The first check is to see if the owner isn't the
+  current task. This is because this function is also used for the pending
+  owner to grab the mutex. When a pending owner wakes up, it checks to see
+  if it can take the mutex, and this is done if the owner is already set to
+  itself. If so, we succeed and leave the function, clearing the "Pending
+  Owner" bit.
+
+  If the pending owner is not current, we check to see if the current task's
+  priority is higher than the pending owner's. If not, we fail the function
+  and return.
+
+  There's also something special about a pending owner. That is, a pending
+  owner is never blocked on a mutex. So there is no PI chain to worry about.
+  It also means that if the mutex doesn't have any waiters, there's no
+  accounting needed to update the pending owner's pi_list, since we only
+  worry about processes blocked on the current mutex.
+
+  If there are waiters on this mutex, and we just stole the ownership, we need
+  to take the top waiter, remove it from the pi_list of the pending owner, and
+  add it to the current pi_list. Note that at this moment, the pending owner
+  is no longer on the list of waiters.
+  This is fine, since the pending owner
+  would add itself back when it realizes that it had the ownership stolen
+  from itself. When the pending owner tries to grab the mutex, it will fail
+  in try_to_take_rt_mutex if the owner field points to another process.
+
+  2) No owner
+  -----------
+
+  If there is no owner (or we successfully stole the lock), we set the owner
+  of the mutex to current, and set the flag of "Has Waiters" if the current
+  mutex actually has waiters, or we clear the flag if it doesn't. See, it was
+  OK that we set that flag early, since now it is cleared.
+
+  3) Failed to grab ownership
+  ---------------------------
+
+  The most interesting case is when we fail to take ownership. This means that
+  there exists an owner, or there's a pending owner with equal or higher
+  priority than the current task.
+
+We'll continue on the failed case.
+
+If the mutex has a timeout, we set up a timer to go off to break us out
+of this mutex if we failed to get it after a specified amount of time.
+
+Now we enter a loop that will continue to try to take ownership of the mutex, or
+fail from a timeout or signal.
+
+Once again we try to take the mutex. This will usually fail the first time
+in the loop, since it had just failed to get the mutex. But the second time
+in the loop, this would likely succeed, since the task would likely be
+the pending owner.
+
+If the task state is TASK_INTERRUPTIBLE, a check for signals and timeout is
+done here.
+
+The waiter structure has a "task" field that points to the task that is blocked
+on the mutex. This field can be NULL the first time it goes through the loop
+or if the task is a pending owner and had its mutex stolen. If the "task"
+field is NULL then we need to set up the accounting for it.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process. The "task" field is set to the process, and the "lock" field
+to the mutex. The plist nodes are initialized to the process's current
+priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the wait_list. If the current process is the highest
+priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_list of the owner,
+and add the current process to that list. Since the pi_list of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_list changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up for a few reasons:
+  1) we were given pending ownership of the mutex.
+  2) we received a signal and were TASK_INTERRUPTIBLE.
+  3) we had a timeout and were TASK_INTERRUPTIBLE.
+
+In any of these cases, we continue the loop and once again try to grab the
+ownership of the mutex. If we succeed, we exit the loop. On a signal or
+timeout we also exit the loop; but if the mutex was stolen from us, we simply
+add ourselves back on the lists and go back to sleep.
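+
+Putting the loop just described into hedged pseudo-C (a sketch only; the
+helper names and error handling are simplified relative to rtmutex.c):
+
+	for (;;) {
+		if (try_to_take_rt_mutex(lock))
+			break;			/* got it, maybe by stealing */
+
+		if (state == TASK_INTERRUPTIBLE &&
+		    (signal_pending(current) || timed_out)) {
+			ret = -EINTR;		/* or -ETIMEDOUT */
+			break;
+		}
+
+		if (!waiter.task)
+			task_blocks_on_rt_mutex(lock, &waiter);	/* (re)do accounting */
+
+		if (waiter.task)
+			schedule();		/* still blocked: sleep */
+	}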
+
+Note: For various reasons, because of timeout and signals, the steal mutex
+      algorithm needs to be careful. This is because the current process is
+      still on the wait_list. And because of dynamic changing of priorities,
+      especially on SCHED_OTHER tasks, the current process can be the
+      highest priority task on the wait_list.
+
+Failed to get mutex on Timeout or Signal
+----------------------------------------
+
+If a timeout or signal occurred, the waiter's "task" field would not be
+NULL and the task needs to be taken off the wait_list of the mutex and perhaps
+the pi_list of the owner. If this process was a high priority process, then
+rt_mutex_adjust_prio_chain needs to be executed again on the owner, but this
+time it will be lowering the priorities.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG. Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex. If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex. This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not. On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not. On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too. If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up and give that waiter
+pending ownership.
+
+In the wake up code, the pi_lock of the current owner is taken. The top
+waiter of the lock is found and removed from the wait_list of the mutex
+as well as the pi_list of the current owner. The task field of the new
+pending owner's waiter structure is set to NULL, and the owner field of the
+mutex is set to the new owner with the "Pending Owner" bit set, as well
+as the "Has Waiters" bit if there still are other processes blocked on the
+mutex.
+
+The pi_lock of the previous owner is released, and the new pending owner's
+pi_lock is taken. Remember that this is the trick to prevent the race
+condition in rt_mutex_adjust_prio_chain from adding itself as a waiter
+on the mutex.
+
+We now clear the "pi_blocked_on" field of the new pending owner, and if
+the mutex still has waiters pending, we add the new top waiter to the pi_list
+of the pending owner.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
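+
+To recap the fast/slow split above as a hedged sketch (the slow function name
+is illustrative; rt_mutex_cmpxchg is the macro described earlier):
+
+static inline void rt_mutex_fastunlock(struct rt_mutex *lock)
+{
+	/* owner field == current means no flag bits set, hence no waiters */
+	if (rt_mutex_cmpxchg(lock, current, NULL))
+		return;			/* fast path: done */
+
+	rt_mutex_slowunlock(lock);	/* takes wait_lock, picks a pending owner */
+}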
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt
+
+
+Credits
+-------
+
+Author: Steven Rostedt
+
+Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
new file mode 100644
index 000000000000..243393d882ee
--- /dev/null
+++ b/Documentation/locking/rt-mutex.txt
@@ -0,0 +1,79 @@
+RT-mutex subsystem with PI support
+----------------------------------
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+a high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter list is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters list. This list too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. [The
+priority enqueueing is handled by "plists", see include/linux/plist.h
+for more details.]
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+rt_mutex->owner holds the task_struct pointer of the owner. Bits 0 and 1
+are used to keep track of the "owner is pending" and "rtmutex has
+waiters" state.
+
+ owner		bit1	bit0
+ NULL		0	0	mutex is free (fast acquire possible)
+ NULL		0	1	invalid state
+ NULL		1	0	Transitional state*
+ NULL		1	1	invalid state
+ taskpointer	0	0	mutex is held (fast release possible)
+ taskpointer	0	1	task is pending owner
+ taskpointer	1	0	mutex is held and has waiters
+ taskpointer	1	1	task is pending owner and mutex has waiters
+
+Pending-ownership handling is a performance optimization:
+pending-ownership is assigned to the first (highest priority) waiter of
+the mutex, when the mutex is released. The thread is woken up and once
+it starts executing it can acquire the mutex. Until the mutex is taken
+by it (bit 0 is cleared) a competing higher priority thread can "steal"
+the mutex which puts the woken up thread back on the waiters list.
+
+The pending-ownership optimization is especially important for the
+uninterrupted workflow of high-prio tasks which repeatedly
+take/release locks that have lower-prio waiters. Without this
+optimization the higher-prio thread would ping-pong to the lower-prio
+task [because at unlock time we always assign a new owner].
+
+(*) The "mutex has waiters" bit gets set to take the lock. If the lock
+doesn't already have an owner, this bit is quickly cleared if there are
+no waiters. So this is a transitional state to synchronize with looking
+at the owner field of the mutex and the mutex owner releasing the lock.
diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt
new file mode 100644
index 000000000000..ff35e40bdf5b
--- /dev/null
+++ b/Documentation/locking/spinlocks.txt
@@ -0,0 +1,167 @@
+Lesson 1: Spin locks
+
+The most basic primitive for locking is the spinlock.
+
+static DEFINE_SPINLOCK(xxx_lock);
+
+	unsigned long flags;
+
+	spin_lock_irqsave(&xxx_lock, flags);
+	... critical section here ..
+	spin_unlock_irqrestore(&xxx_lock, flags);
+
+The above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock. This works well even on UP, so the code does _not_ need to
+worry about UP vs SMP issues: the spinlocks work correctly under both.
+
+   NOTE! Implications of spin_locks for memory are further described in:
+
+     Documentation/memory-barriers.txt
+       (5) LOCK operations.
+       (6) UNLOCK operations.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you _know_ need to be split up: avoid it at all cost if you
+aren't sure).
+
+This is really the only hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures _everywhere_ they are used. The spinlocks are most
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+   NOTE! The spin-lock is safe only when you _also_ use the lock itself
+   to do locking across CPU's, which implies that EVERYTHING that
+   touches a shared variable has to agree about the spinlock they want
+   to use.
+
+----
+
+Lesson 2: reader-writer spinlocks.
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock.
+
+   NOTE! reader-writer locks require more atomic memory operations than
+   simple spinlocks. Unless the reader critical section is long, you
+   are better off just using spinlocks.
+
+The routines look the same as above:
+
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+	unsigned long flags;
+
+	read_lock_irqsave(&xxx_lock, flags);
+	.. critical section that only reads the info ...
+	read_unlock_irqrestore(&xxx_lock, flags);
+
+	write_lock_irqsave(&xxx_lock, flags);
+	.. read and write exclusive access to the info ...
+	write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself. The read lock allows many concurrent readers. Anything that
+_changes_ the list will have to get the write lock.
+
+   NOTE! RCU is better for list traversal, but requires careful
+   attention to design detail (see Documentation/RCU/listRCU.txt).
+
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning.
+
+   NOTE! We are working hard to remove reader-writer spinlocks in most
+   cases, so please don't add a new one without consensus. (Instead, see
+   Documentation/RCU/rcu.txt for complete information.)
+
+----
+
+Lesson 3: spinlocks revisited.
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly _because_ they are safe they are also fairly slow. They are slower
+than they'd need to be, because they do have to disable interrupts
+(which is just a single instruction on x86, but it's an expensive one -
+and on other architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions:
+
+	spin_lock(&lock);
+	...
+	spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", i.e. no interrupts involved.
+
+The reason you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks:
+
+	spin_lock(&lock);
+	...
+		<- interrupt comes in:
+			spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+release the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts).
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up.
+So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+		Linus
+
+----
+
+Reference information:
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate:
+
+   spinlock_t xxx_lock;
+   rwlock_t xxx_rw_lock;
+
+   static int __init xxx_init(void)
+   {
+	spin_lock_init(&xxx_lock);
+	rwlock_init(&xxx_rw_lock);
+	...
+   }
+
+   module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
new file mode 100644
index 000000000000..8a112dc304c3
--- /dev/null
+++ b/Documentation/locking/ww-mutex-design.txt
@@ -0,0 +1,344 @@
+Wait/Wound Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPUs do operations that commonly involve many buffers. Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on. And with
+PRIME / dmabuf, they can even be shared across devices. So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready. If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in an execbuf/batch in
+the same order in all contexts. That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes, which results in the potential for deadlock. The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple. For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter. In case of deadlock while locking all the buffers
+associated with an execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature this deadlock handling approach is called wait/wound:
+The older task waits until it can acquire the contended lock. The younger task
+needs to back off and drop all the locks it is currently holding, i.e. the
+younger task is wounded.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important that a
+task trying to acquire locks doesn't grab a new reservation id, but keeps the
+one it acquired when starting the lock acquisition. This ticket is stored in
+the acquire context. Furthermore the acquire context keeps track of debugging
+state to catch w/w mutex interface abuse.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context.
+
+Furthermore there are three different classes of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the wounded task
+  after having dropped all already acquired locks. These functions have the
+  _slow postfix.
+
+  From a simple semantics point-of-view the _slow functions are not strictly
+  required, since simply calling the normal ww_mutex_lock functions on the
+  contending lock (after having dropped all other already acquired locks) will
+  work correctly. After all if no other ww mutex has been acquired yet there's
+  no deadlock potential and hence the ww_mutex_lock call will block and not
+  prematurely return -EDEADLK. The advantage of the _slow functions is in
+  interface safety:
+  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+    has a void return type. Note that since ww mutex code needs loops/retries
+    anyway the __must_check doesn't result in spurious warnings, even though the
+    very first lock operation can never fail.
+  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+    ww mutexes have been released (preventing deadlocks) and makes sure that we
+    block on the contending lock (preventing spinning through the -EDEADLK
+    slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+  context.
+
+  Again this is not strictly required. But often you only want to acquire a
+  single lock in which case it's pointless to set up an acquire context (and so
+  better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2:
+
+static DEFINE_WW_CLASS(ww_class);
+
+struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+};
+
+struct obj_entry {
+	struct list_head head;
+	struct obj *obj;
+};
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
+
+int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+	int ret;
+
+	ww_acquire_init(ctx, &ww_class);
+
+retry:
+	list_for_each_entry (entry, list, head) {
+		if (entry->obj == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+err:
+	list_for_each_entry_continue_reverse (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
*/ + ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); + res_obj = contended_entry->obj; + goto retry; + } + ww_acquire_fini(ctx); + + return ret; +} + +Method 2, using a list in execbuf->buffers that can be reordered. This has the +same duplicate-entry detection semantics using -EALREADY as method 1 above, but the +list-reordering allows for a bit more idiomatic code. + +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry, *entry2; + int ret; + + ww_acquire_init(ctx, &ww_class); + + list_for_each_entry (entry, list, head) { + ret = ww_mutex_lock(&entry->obj->lock, ctx); + if (ret < 0) { + entry2 = entry; + + list_for_each_entry_continue_reverse (entry2, list, head) + ww_mutex_unlock(&entry2->obj->lock); + + if (ret != -EDEADLK) { + ww_acquire_fini(ctx); + return ret; + } + + /* we lost out in a seqno race, lock and retry.. */ + ww_mutex_lock_slow(&entry->obj->lock, ctx); + + /* + * Move buf to head of the list, this will point + * buf->next to the first unlocked entry, + * restarting the for loop. + */ + list_del(&entry->head); + list_add(&entry->head, list); + } + } + + ww_acquire_done(ctx); + return 0; +} + +Unlocking works the same way for both methods #1 and #2: + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry; + + list_for_each_entry (entry, list, head) + ww_mutex_unlock(&entry->obj->lock); + + ww_acquire_fini(ctx); +} + +Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, +e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, +and edges can only be changed when holding the locks of all involved nodes. w/w +mutexes are a natural fit for such a case for two reasons: +- They can handle lock-acquisition in any order, which allows us to start walking + a graph from a starting point and then iteratively discovering new edges and + locking down the nodes those edges connect to. +- Due to the -EALREADY return code signalling that a given object is already + held there's no need for additional book-keeping to break cycles in the graph + or keep track of which locks are already held (when using more than one node + as a starting point). + +Note that this approach differs in two important ways from the above methods: +- Since the list of objects is dynamically constructed (and might very well be + different when retrying due to hitting the -EDEADLK wound condition) there's + no need to keep any object on a persistent list when it's not locked. We can + therefore move the list_head into the object itself. +- On the other hand the dynamic object list construction also means that the -EALREADY return + code can't be propagated. + +Note also that methods #1 or #2 can be combined with method #3, e.g. to first lock a +list of starting nodes (passed in from userspace) using one of the above +methods, and then lock any additional objects affected by the operations using +method #3 below. The backoff/retry procedure will be a bit more involved, since +when the dynamic locking step hits -EDEADLK we also need to unlock all the +objects acquired with the fixed list. But the w/w mutex debug checks will catch +any interface misuse for these cases. + +Also, method 3 can't fail the lock acquisition step since it doesn't return +-EALREADY. Of course this would be different when using the _interruptible +variants, but that's outside of the scope of these examples here.
+ +struct obj { + struct ww_mutex ww_mutex; + struct list_head locked_list; +}; + +static DEFINE_WW_CLASS(ww_class); + +void __unlock_objs(struct list_head *list) +{ + struct obj *entry, *temp; + + list_for_each_entry_safe (entry, temp, list, locked_list) { + /* need to do that before unlocking, since only the current lock holder is + allowed to use object */ + list_del(&entry->locked_list); + ww_mutex_unlock(&entry->ww_mutex); + } +} + +void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj *obj; + int ret; + + ww_acquire_init(ctx, &ww_class); + +retry: + /* re-init loop start state */ + loop { + /* magic code which walks over a graph and decides which objects + * to lock */ + + ret = ww_mutex_lock(&obj->ww_mutex, ctx); + if (ret == -EALREADY) { + /* we have that one already, get to the next object */ + continue; + } + if (ret == -EDEADLK) { + __unlock_objs(list); + + ww_mutex_lock_slow(&obj->ww_mutex, ctx); + list_add(&obj->locked_list, list); + goto retry; + } + + /* locked a new object, add it to the list */ + list_add_tail(&obj->locked_list, list); + } + + ww_acquire_done(ctx); +} + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + __unlock_objs(list); + ww_acquire_fini(ctx); +} + +Method 4: Only lock one single object. In that case deadlock detection and +prevention is obviously overkill, since with grabbing just one lock you can't +produce a deadlock within just one class. To simplify this case the w/w mutex +api can be used with a NULL context; a minimal sketch follows at the end of +this file. + +Implementation Details +---------------------- + +Design: + ww_mutex currently encapsulates a struct mutex, which means no extra overhead for + normal mutex locks, which are far more common. As such there is only a small + increase in code size if wait/wound mutexes are not used. + + In general, not much contention is expected. The locks are typically used to + serialize access to resources for devices. The only way to make wakeups + smarter would be at the cost of adding a field to struct mutex_waiter. This + would add overhead to all cases where normal mutexes are used, and + ww_mutexes are generally less performance sensitive. + +Lockdep: + Special care has been taken to warn for as many cases of api abuse + as possible. Some common api abuses will be caught with + CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. + + Some of the errors which will be warned about: + - Forgetting to call ww_acquire_fini or ww_acquire_init. + - Attempting to lock more mutexes after ww_acquire_done. + - Attempting to lock the wrong mutex after -EDEADLK and + unlocking all mutexes. + - Attempting to lock the right mutex after -EDEADLK, + before unlocking all mutexes. + + - Calling ww_mutex_lock_slow before -EDEADLK was returned. + + - Unlocking mutexes with the wrong unlock function. + - Calling one of the ww_acquire_* twice on the same context. + - Using a different ww_class for the mutex than for the ww_acquire_ctx. + - Normal lockdep errors that can result in deadlocks. + + Some of the lockdep errors that can result in deadlocks: + - Calling ww_acquire_init to initialize a second ww_acquire_ctx before + having called ww_acquire_fini on the first. + - 'normal' deadlocks that can occur. + +FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic +implemented.
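To make method 4 concrete, here is a minimal sketch (the obj type and frob_one() are illustrative assumptions, not part of the patch; obj->lock is assumed to have been initialized with ww_mutex_init(&obj->lock, &ww_class)):

static DEFINE_WW_CLASS(ww_class);

struct obj {
	struct ww_mutex lock;
	/* obj data */
};

static void frob_one(struct obj *obj)
{
	/*
	 * Single-lock case: pass a NULL context instead of a
	 * ww_acquire_ctx. No ticket is taken, and the call can
	 * return neither -EDEADLK nor -EALREADY.
	 */
	int ret = ww_mutex_lock(&obj->lock, NULL);

	WARN_ON(ret);	/* cannot fail on the NULL-context path */
	/* ... exclusive access to obj ... */
	ww_mutex_unlock(&obj->lock);
}

This behaves exactly like a plain mutex lock/unlock pair, just expressed through the w/w mutex API.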
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt deleted file mode 100644 index 72d010689751..000000000000 --- a/Documentation/lockstat.txt +++ /dev/null @@ -1,178 +0,0 @@ - -LOCK STATISTICS - -- WHAT - -As the name suggests, it provides statistics on locks. - -- WHY - -Because things like lock contention can severely impact performance. - -- HOW - -Lockdep already has hooks in the lock functions and maps lock instances to -lock classes. We build on that (see Documentation/lockdep-design.txt). -The graph below shows the relation between the lock functions and the various -hooks therein. - - __acquire - | - lock _____ - | \ - | __contended - | | - | <wait> - | _______/ - |/ - | - __acquired - | - . - <hold> - . - | - __release - | - unlock - -lock, unlock - the regular lock functions -__* - the hooks -<> - states - -With these hooks we provide the following statistics: - - con-bounces - number of lock contentions that involved x-cpu data - contentions - number of lock acquisitions that had to wait - wait time min - shortest (non-0) time we ever had to wait for a lock - max - longest time we ever had to wait for a lock - total - total time we spent waiting on this lock - avg - average time spent waiting on this lock - acq-bounces - number of lock acquisitions that involved x-cpu data - acquisitions - number of times we took the lock - hold time min - shortest (non-0) time we ever held the lock - max - longest time we ever held the lock - total - total time this lock was held - avg - average time this lock was held - -These numbers are gathered per lock class, per read/write state (when -applicable). - -It also tracks 4 contention points per class. A contention point is a call site -that had to wait on lock acquisition. - - - CONFIGURATION - -Lock statistics are enabled via CONFIG_LOCK_STAT.
- - - USAGE - -Enable collection of statistics: - -# echo 1 >/proc/sys/kernel/lock_stat - -Disable collection of statistics: - -# echo 0 >/proc/sys/kernel/lock_stat - -Look at the current lock statistics: - -( line numbers not part of actual output, done for clarity in the explanation - below ) - -# less /proc/lock_stat - -01 lock_stat version 0.4 -02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg -04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -05 -06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 -07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 -08 --------------- -09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 -10 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 -11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 -12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 -13 --------------- -14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 -15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 -16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 -17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 -18 -19............................................................................................................................................................................................................................. -20 -21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 -22 --------------- -23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 -24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 -25 unix_table_lock 15 [] unix_find_other+0x117/0x230 -26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 -27 --------------- -28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 -29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 -30 unix_table_lock 20 [] unix_find_other+0x117/0x230 -31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 - - -This excerpt shows the first two lock class statistics. Line 01 shows the -output version - each time the format changes this will be updated. Lines 02-04 -show the header with column descriptions. Lines 05-18 and 20-31 show the actual -statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 13) from the contention points. - -The first lock (05-18) is a read/write lock, and shows two lines above the -short separator. The contention points don't match the column descriptors, -they have two: contentions and [<IP>] symbol. The second set of contention -points are the points we're contending with. - -The integer part of the time values is in us. - -Dealing with nested locks, subclasses may appear: - -32...........................................................................................................................................................................................................................
-33 -34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 -35 --------- -36 &rq->lock 645 [] task_rq_lock+0x43/0x75 -37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a -38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a -39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb -40 --------- -41 &rq->lock 77 [] task_rq_lock+0x43/0x75 -42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a -43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 -44 &rq->lock 893 [] schedule+0x157/0x7b8 -45 -46........................................................................................................................................................................................................................... -47 -48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 -49 ----------- -50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 -51 ----------- -52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 -53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 -54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 -55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a - -Line 48 shows statistics for the second subclass (/1) of &rq->lock class -(subclass starts from 0), since in this case, as line 50 suggests, -double_rq_lock actually acquires a nested lock of two spinlocks. - -View the top contending locks: - -# grep : /proc/lock_stat | head - clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71 - tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59 - &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59 - &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69 - &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93 - tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72 - tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89 - rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89 - &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27 - rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76 - -Clear the statistics: - -# echo 0 > /proc/lock_stat diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt deleted file mode 100644 index ee231ed09ec6..000000000000 --- a/Documentation/mutex-design.txt +++ /dev/null @@ -1,157 +0,0 @@ -Generic Mutex Subsystem - -started by Ingo Molnar -updated by Davidlohr Bueso - -What are mutexes? ------------------ - -In the Linux kernel, mutexes refer to a particular locking primitive -that enforces serialization on shared memory systems, and not only to -the generic term referring to 'mutual exclusion' found in academia -or similar theoretical text books. Mutexes are sleeping locks which -behave similarly to binary semaphores, and were introduced in 2006[1] -as an alternative to these. This new data structure provided a number -of advantages, including simpler interfaces, and at that time smaller -code (see Disadvantages). 
- -[1] http://lwn.net/Articles/164802/ - -Implementation -------------- - -Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h -and implemented in kernel/locking/mutex.c. These locks use a three -state atomic counter (->count) to represent the different possible -transitions that can occur during the lifetime of a lock: - - 1: unlocked - 0: locked, no waiters - negative: locked, with potential waiters - -In its most basic form it also includes a wait-queue and a spinlock -that serializes access to it. CONFIG_SMP systems can also include -a pointer to the lock task owner (->owner) as well as a spinner MCS -lock (->osq), both described below in (ii). - -When acquiring a mutex, there are three possible paths that can be -taken, depending on the state of the lock: - -(i) fastpath: tries to atomically acquire the lock by decrementing the - counter. If it was already taken by another task it goes to the next - possible path. This logic is architecture specific. On x86-64, the - locking fastpath is 2 instructions: - - 0000000000000e10 <mutex_lock>: - e21: f0 ff 0b lock decl (%rbx) - e24: 79 08 jns e2e <mutex_lock+0x1e> - - the unlocking fastpath is equally tight: - - 0000000000000bc0 <mutex_unlock>: - bc8: f0 ff 07 lock incl (%rdi) - bcb: 7f 0a jg bd7 <mutex_unlock+0x17> - - -(ii) midpath: aka optimistic spinning, tries to spin for acquisition - while the lock owner is running and there are no other tasks ready - to run that have higher priority (need_resched). The rationale is - that if the lock owner is running, it is likely to release the lock - soon. The mutex spinners are queued up using MCS lock so that only - one spinner can compete for the mutex. - - The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock - with the desirable properties of being fair and with each cpu trying - to acquire the lock spinning on a local variable. It avoids expensive - cacheline bouncing that common test-and-set spinlock implementations - incur. An MCS-like lock is specially tailored for optimistic spinning - for sleeping lock implementation. An important feature of the customized - MCS lock is that it has the extra property that spinners are able to exit - the MCS spinlock queue when they need to reschedule. This further helps - avoid situations where MCS spinners that need to reschedule would continue - waiting to spin on mutex owner, only to go directly to slowpath upon - obtaining the MCS lock. - - -(iii) slowpath: last resort, if the lock is still unable to be acquired, - the task is added to the wait-queue and sleeps until woken up by the - unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. - -While formally kernel mutexes are sleepable locks, it is path (ii) that -makes them more practically a hybrid type. By simply not interrupting a -task and busy-waiting for a few cycles instead of immediately sleeping, -the performance of this lock has been seen to significantly improve a -number of workloads. Note that this technique is also used for rw-semaphores. - -Semantics --------- - -The mutex subsystem checks and enforces the following rules: - - - Only one task can hold the mutex at a time. - - Only the owner can unlock the mutex. - - Multiple unlocks are not permitted. - - Recursive locking/unlocking is not permitted. - - A mutex must only be initialized via the API (see below). - - A task may not exit with a mutex held. - - Memory areas where held locks reside must not be freed. - - Held mutexes must not be reinitialized.
- - Mutexes may not be used in hardware or software interrupt - contexts such as tasklets and timers. - -These semantics are fully enforced when CONFIG_DEBUG_MUTEXES is enabled. -In addition, the mutex debugging code also implements a number of other -features that make lock debugging easier and faster: - - - Uses symbolic names of mutexes, whenever they are printed - in debug output. - - Point-of-acquire tracking, symbolic lookup of function names, - list of all locks held in the system, printout of them. - - Owner tracking. - - Detects self-recursing locks and prints out all relevant info. - - Detects multi-task circular deadlocks and prints out all affected - locks and tasks (and only those tasks). - - -Interfaces ----------- -Statically define the mutex: - DEFINE_MUTEX(name); - -Dynamically initialize the mutex: - mutex_init(mutex); - -Acquire the mutex, uninterruptible: - void mutex_lock(struct mutex *lock); - void mutex_lock_nested(struct mutex *lock, unsigned int subclass); - int mutex_trylock(struct mutex *lock); - -Acquire the mutex, interruptible: - int mutex_lock_interruptible_nested(struct mutex *lock, - unsigned int subclass); - int mutex_lock_interruptible(struct mutex *lock); - -Acquire the mutex, interruptible, if dec to 0: - int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); - -Unlock the mutex: - void mutex_unlock(struct mutex *lock); - -Test if the mutex is taken: - int mutex_is_locked(struct mutex *lock); - -Disadvantages -------------- - -Unlike its original design and purpose, 'struct mutex' is larger than -most locks in the kernel. E.g.: on x86-64 it is 40 bytes, almost twice -as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the -'struct rw_semaphore' variant. Larger structure sizes mean more CPU -cache and memory footprint. - -When to use mutexes -------------------- - -Unless the strict semantics of mutexes are unsuitable and/or the critical -region prevents the lock from being shared, always prefer them to any other -locking primitive. diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt deleted file mode 100644 index 8666070d3189..000000000000 --- a/Documentation/rt-mutex-design.txt +++ /dev/null @@ -1,781 +0,0 @@ -# -# Copyright (c) 2006 Steven Rostedt -# Licensed under the GNU Free Documentation License, Version 1.2 -# - -RT-mutex implementation design ------------------------------- - -This document tries to describe the design of the rtmutex.c implementation. -It doesn't describe the reasons why rtmutex.c exists. For that please see -Documentation/rt-mutex.txt. This document does explain problems that can -happen without this code, but only to build up the concepts needed to -understand what the code actually does. - -The goal of this document is to help others understand the priority -inheritance (PI) algorithm that is used, as well as reasons for the -decisions that were made to implement PI in the manner that was done. - - -Unbounded Priority Inversion ----------------------------- - -Priority inversion is when a lower priority process executes while a higher -priority process wants to run. This happens for several reasons, and -most of the time it can't be helped. Anytime a high priority process wants -to use a resource that a lower priority process has (a mutex for example), -the high priority process must wait until the lower priority process is done -with the resource. This is a priority inversion. What we want to prevent -is something called unbounded priority inversion.
That is when the high -priority process is prevented from running by a lower priority process for -an undetermined amount of time. - -The classic example of unbounded priority inversion is where you have three -processes, let's call them processes A, B, and C, where A is the highest -priority process, C is the lowest, and B is in between. A tries to grab a lock -that C owns and must wait and lets C run to release the lock. But in the -meantime, B executes, and since B is of a higher priority than C, it preempts C, -but by doing so, it is in fact preempting A which is a higher priority process. -Now there's no way of knowing how long A will be sleeping waiting for C -to release the lock, because for all we know, B is a CPU hog and will -never give C a chance to release the lock. This is called unbounded priority -inversion. - -Here's a little ASCII art to show the problem. - - grab lock L1 (owned by C) - | -A ---+ - C preempted by B - | -C +----+ - -B +--------> - B now keeps A from running. - - -Priority Inheritance (PI) -------------------------- - -There are several ways to solve this issue, but other ways are out of scope -for this document. Here we only discuss PI. - -PI is where a process inherits the priority of another process if the other -process blocks on a lock owned by the current process. To make this easier -to understand, let's use the previous example, with processes A, B, and C again. - -This time, when A blocks on the lock owned by C, C would inherit the priority -of A. So now if B becomes runnable, it would not preempt C, since C now has -the high priority of A. As soon as C releases the lock, it loses its -inherited priority, and A then can continue with the resource that C had. - -Terminology ------------ - -Here I explain some terminology that is used in this document to help describe -the design that is used to implement PI. - -PI chain - The PI chain is an ordered series of locks and processes that cause - processes to inherit priorities from a previous process that is - blocked on one of its locks. This is described in more detail - later in this document. - -mutex - In this document, to differentiate from locks that implement - PI and spin locks that are used in the PI code, from now on - the PI locks will be called a mutex. - -lock - In this document from now on, I will use the term lock when - referring to spin locks that are used to protect parts of the PI - algorithm. These locks disable preemption for UP (when - CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from - entering critical sections simultaneously. - -spin lock - Same as lock above. - -waiter - A waiter is a struct that is stored on the stack of a blocked - process. Since the scope of the waiter is within the code for - a process being blocked on the mutex, it is fine to allocate - the waiter on the process's stack (local variable). This - structure holds a pointer to the task, as well as the mutex that - the task is blocked on. It also has the plist node structures to - place the task in the waiter_list of a mutex as well as the - pi_list of a mutex owner task (described below). - - waiter is sometimes used in reference to the task that is waiting - on a mutex. This is the same as waiter->task. - -waiters - A list of processes that are blocked on a mutex. - -top waiter - The highest priority process waiting on a specific mutex. - -top pi waiter - The highest priority process waiting on one of the mutexes - that a specific process owns. 
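To tie the terminology above together, here is a hedged sketch of the waiter structure just described (field names follow this document's wording; the kernel's actual definition may differ in detail):

struct rt_mutex_waiter {
	struct plist_node list_entry;		/* node in the mutex's waiter list */
	struct plist_node pi_list_entry;	/* node in the owner's pi_list */
	struct task_struct *task;		/* the task blocked on the mutex */
	struct rt_mutex *lock;			/* the mutex the task blocks on */
};

Because the waiter lives on the blocked process's stack, no allocation is needed; it is queued on the mutex's waiter list and, when it is the top waiter, on the owner's pi_list.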
- -Note: task and process are used interchangeably in this document, mostly to - differentiate between two processes that are being described together. - - -PI chain --------- - -The PI chain is a list of processes and mutexes that may cause priority -inheritance to take place. Multiple chains may converge, but a chain -would never diverge, since a process can't be blocked on more than one -mutex at a time. - -Example: - - Process: A, B, C, D, E - Mutexes: L1, L2, L3, L4 - - A owns: L1 - B blocked on L1 - B owns L2 - C blocked on L2 - C owns L3 - D blocked on L3 - D owns L4 - E blocked on L4 - -The chain would be: - - E->L4->D->L3->C->L2->B->L1->A - -To show where two chains merge, we could add another process F and -another mutex L5 where B owns L5 and F is blocked on mutex L5. - -The chain for F would be: - - F->L5->B->L1->A - -Since a process may own more than one mutex, but never be blocked on more than -one, the chains merge. - -Here we show both chains: - - E->L4->D->L3->C->L2-+ - | - +->B->L1->A - | - F->L5-+ - -For PI to work, the processes at the right end of these chains (or we may -also call it the Top of the chain) must be equal to or higher in priority -than the processes to the left or below in the chain. - -Also since a mutex may have more than one process blocked on it, we can -have multiple chains merge at mutexes. If we add another process G that is -blocked on mutex L2: - - G->L2->B->L1->A - -And once again, to show how this can grow I will show the merging chains -again. - - E->L4->D->L3->C-+ - +->L2-+ - | | - G-+ +->B->L1->A - | - F->L5-+ - - -Plist ------ - -Before I go further and talk about how the PI chain is stored through lists -on both mutexes and processes, I'll explain the plist. This is similar to -the struct list_head functionality that is already in the kernel. -The implementation of plist is out of scope for this document, but it is -very important to understand what it does. - -There are a few differences between plist and list, the most important one -being that plist is a priority sorted linked list. This means that the -priorities of the plist are sorted, such that it takes O(1) to retrieve the -highest priority item in the list. Obviously this is useful to store processes -based on their priorities. - -Another difference, which is important for implementation, is that, unlike -list, the head of the list is a different element than the nodes of a list. -So the head of the list is declared as struct plist_head and nodes that will -be added to the list are declared as struct plist_node. - - -Mutex Waiter List ------------------ - -Every mutex keeps track of all the waiters that are blocked on itself. The mutex -has a plist to store these waiters by priority. This list is protected by -a spin lock that is located in the struct of the mutex. This lock is called -wait_lock. Since the modification of the waiter list is never done in -interrupt context, the wait_lock can be taken without disabling interrupts. - - -Task PI List ------------- - -To keep track of the PI chains, each process has its own PI list. This is -a list of all top waiters of the mutexes that are owned by the process. -Note that this list only holds the top waiters and not all waiters that are -blocked on mutexes owned by the process. - -The top of the task's PI list is always the highest priority task that -is waiting on a mutex that is owned by the task. So if the task has -inherited a priority, it will always be the priority of the task that is -at the top of this list. 
- -This list is stored in the task structure of a process as a plist called -pi_list. This list is protected by a spin lock also in the task structure, -called pi_lock. This lock may also be taken in interrupt context, so when -locking the pi_lock, interrupts must be disabled. - - -Depth of the PI Chain ---------------------- - -The maximum depth of the PI chain is not dynamic, and could actually be -defined. But is very complex to figure it out, since it depends on all -the nesting of mutexes. Let's look at the example where we have 3 mutexes, -L1, L2, and L3, and four separate functions func1, func2, func3 and func4. -The following shows a locking order of L1->L2->L3, but may not actually -be directly nested that way. - -void func1(void) -{ - mutex_lock(L1); - - /* do anything */ - - mutex_unlock(L1); -} - -void func2(void) -{ - mutex_lock(L1); - mutex_lock(L2); - - /* do something */ - - mutex_unlock(L2); - mutex_unlock(L1); -} - -void func3(void) -{ - mutex_lock(L2); - mutex_lock(L3); - - /* do something else */ - - mutex_unlock(L3); - mutex_unlock(L2); -} - -void func4(void) -{ - mutex_lock(L3); - - /* do something again */ - - mutex_unlock(L3); -} - -Now we add 4 processes that run each of these functions separately. -Processes A, B, C, and D which run functions func1, func2, func3 and func4 -respectively, and such that D runs first and A last. With D being preempted -in func4 in the "do something again" area, we have a locking that follows: - -D owns L3 - C blocked on L3 - C owns L2 - B blocked on L2 - B owns L1 - A blocked on L1 - -And thus we have the chain A->L1->B->L2->C->L3->D. - -This gives us a PI depth of 4 (four processes), but looking at any of the -functions individually, it seems as though they only have at most a locking -depth of two. So, although the locking depth is defined at compile time, -it still is very difficult to find the possibilities of that depth. - -Now since mutexes can be defined by user-land applications, we don't want a DOS -type of application that nests large amounts of mutexes to create a large -PI chain, and have the code holding spin locks while looking at a large -amount of data. So to prevent this, the implementation not only implements -a maximum lock depth, but also only holds at most two different locks at a -time, as it walks the PI chain. More about this below. - - -Mutex owner and flags ---------------------- - -The mutex structure contains a pointer to the owner of the mutex. If the -mutex is not owned, this owner is set to NULL. Since all architectures -have the task structure on at least a four byte alignment (and if this is -not true, the rtmutex.c code will be broken!), this allows for the two -least significant bits to be used as flags. This part is also described -in Documentation/rt-mutex.txt, but will also be briefly described here. - -Bit 0 is used as the "Pending Owner" flag. This is described later. -Bit 1 is used as the "Has Waiters" flags. This is also described later - in more detail, but is set whenever there are waiters on a mutex. - - -cmpxchg Tricks --------------- - -Some architectures implement an atomic cmpxchg (Compare and Exchange). This -is used (when applicable) to keep the fast path of grabbing and releasing -mutexes short. 
- -cmpxchg is basically the following function performed atomically: - -unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) -{ - unsigned long T = *A; - if (*A == *B) { - *A = *C; - } - return T; -} -#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) - -This is really nice to have, since it allows you to only update a variable -if the variable is what you expect it to be. You know if it succeeded if -the return value (the old value of A) is equal to B. - -The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If -the architecture does not support CMPXCHG, then this macro is simply set -to fail every time. But if CMPXCHG is supported, then this will -help out extremely to keep the fast path short. - -The use of rt_mutex_cmpxchg with the flags in the owner field help optimize -the system for architectures that support it. This will also be explained -later in this document. - - -Priority adjustments --------------------- - -The implementation of the PI code in rtmutex.c has several places that a -process must adjust its priority. With the help of the pi_list of a -process this is rather easy to know what needs to be adjusted. - -The functions implementing the task adjustments are rt_mutex_adjust_prio, -__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock -to already be taken), rt_mutex_getprio, and rt_mutex_setprio. - -rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. - -rt_mutex_getprio returns the priority that the task should have. Either the -task's own normal priority, or if a process of a higher priority is waiting on -a mutex owned by the task, then that higher priority should be returned. -Since the pi_list of a task holds an order by priority list of all the top -waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs -to compare the top pi waiter to its own normal priority, and return the higher -priority back. - -(Note: if looking at the code, you will notice that the lower number of - prio is returned. This is because the prio field in the task structure - is an inverse order of the actual priority. So a "prio" of 5 is - of higher priority than a "prio" of 10.) - -__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the -result does not equal the task's current priority, then rt_mutex_setprio -is called to adjust the priority of the task to the new priority. -Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the -actual change in priority. - -It is interesting to note that __rt_mutex_adjust_prio can either increase -or decrease the priority of the task. In the case that a higher priority -process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio -would increase/boost the task's priority. But if a higher priority task -were for some reason to leave the mutex (timeout or signal), this same function -would decrease/unboost the priority of the task. That is because the pi_list -always contains the highest priority task that is waiting on a mutex owned -by the task, so we only need to compare the priority of that top pi waiter -to the normal priority of the given task. - - -High level overview of the PI chain walk ----------------------------------------- - -The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. - -The implementation has gone through several iterations, and has ended up -with what we believe is the best. 
It walks the PI chain by only grabbing -at most two locks at a time, and is very efficient. - -The rt_mutex_adjust_prio_chain can be used either to boost or lower process -priorities. - -rt_mutex_adjust_prio_chain is called with a task to be checked for PI -(de)boosting (the owner of a mutex that a process is blocking on), a flag to -check for deadlocking, the mutex that the task owns, and a pointer to a waiter -that is the process's waiter struct that is blocked on the mutex (although this -parameter may be NULL for deboosting). - -For this explanation, I will not mention deadlock detection. This explanation -will try to stay at a high level. - -When this function is called, there are no locks held. That also means -that the state of the owner and lock can change when entered into this function. - -Before this function is called, the task has already had rt_mutex_adjust_prio -performed on it. This means that the task is set to the priority that it -should be at, but the plist nodes of the task's waiter have not been updated -with the new priorities, and that this task may not be in the proper locations -in the pi_lists and wait_lists that the task is blocked on. This function -solves all that. - -A loop is entered, where task is the owner to be checked for PI changes that -was passed by parameter (for the first iteration). The pi_lock of this task is -taken to prevent any more changes to the pi_list of the task. This also -prevents new tasks from completing the blocking on a mutex that is owned by this -task. - -If the task is not blocked on a mutex then the loop is exited. We are at -the top of the PI chain. - -A check is now done to see if the original waiter (the process that is blocked -on the current mutex) is the top pi waiter of the task. That is, is this -waiter on the top of the task's pi_list. If it is not, it either means that -there is another process higher in priority that is blocked on one of the -mutexes that the task owns, or that the waiter has just woken up via a signal -or timeout and has left the PI chain. In either case, the loop is exited, since -we don't need to do any more changes to the priority of the current task, or any -task that owns a mutex that this current task is waiting on. A priority chain -walk is only needed when a new top pi waiter is made to a task. - -The next check sees if the task's waiter plist node has the priority equal to -the priority the task is set at. If they are equal, then we are done with -the loop. Remember that the function started with the priority of the -task adjusted, but the plist nodes that hold the task in other processes -pi_lists have not been adjusted. - -Next, we look at the mutex that the task is blocked on. The mutex's wait_lock -is taken. This is done by a spin_trylock, because the locking order of the -pi_lock and wait_lock goes in the opposite direction. If we fail to grab the -lock, the pi_lock is released, and we restart the loop. - -Now that we have both the pi_lock of the task as well as the wait_lock of -the mutex the task is blocked on, we update the task's waiter's plist node -that is located on the mutex's wait_list. - -Now we release the pi_lock of the task. - -Next the owner of the mutex has its pi_lock taken, so we can update the -task's entry in the owner's pi_list. If the task is the highest priority -process on the mutex's wait_list, then we remove the previous top waiter -from the owner's pi_list, and replace it with the task. 
- -Note: It is possible that the task was the current top waiter on the mutex, - in which case the task is not yet on the pi_list of the waiter. This - is OK, since plist_del does nothing if the plist node is not on any - list. - -If the task was not the top waiter of the mutex, but it was before we -did the priority updates, that means we are deboosting/lowering the -task. In this case, the task is removed from the pi_list of the owner, -and the new top waiter is added. - -Lastly, we unlock both the pi_lock of the task, as well as the mutex's -wait_lock, and continue the loop again. On the next iteration of the -loop, the previous owner of the mutex will be the task that will be -processed. - -Note: One might think that the owner of this mutex might have changed - since we just grab the mutex's wait_lock. And one could be right. - The important thing to remember is that the owner could not have - become the task that is being processed in the PI chain, since - we have taken that task's pi_lock at the beginning of the loop. - So as long as there is an owner of this mutex that is not the same - process as the tasked being worked on, we are OK. - - Looking closely at the code, one might be confused. The check for the - end of the PI chain is when the task isn't blocked on anything or the - task's waiter structure "task" element is NULL. This check is - protected only by the task's pi_lock. But the code to unlock the mutex - sets the task's waiter structure "task" element to NULL with only - the protection of the mutex's wait_lock, which was not taken yet. - Isn't this a race condition if the task becomes the new owner? - - The answer is No! The trick is the spin_trylock of the mutex's - wait_lock. If we fail that lock, we release the pi_lock of the - task and continue the loop, doing the end of PI chain check again. - - In the code to release the lock, the wait_lock of the mutex is held - the entire time, and it is not let go when we grab the pi_lock of the - new owner of the mutex. So if the switch of a new owner were to happen - after the check for end of the PI chain and the grabbing of the - wait_lock, the unlocking code would spin on the new owner's pi_lock - but never give up the wait_lock. So the PI chain loop is guaranteed to - fail the spin_trylock on the wait_lock, release the pi_lock, and - try again. - - If you don't quite understand the above, that's OK. You don't have to, - unless you really want to make a proof out of it ;) - - -Pending Owners and Lock stealing --------------------------------- - -One of the flags in the owner field of the mutex structure is "Pending Owner". -What this means is that an owner was chosen by the process releasing the -mutex, but that owner has yet to wake up and actually take the mutex. - -Why is this important? Why can't we just give the mutex to another process -and be done with it? - -The PI code is to help with real-time processes, and to let the highest -priority process run as long as possible with little latencies and delays. -If a high priority process owns a mutex that a lower priority process is -blocked on, when the mutex is released it would be given to the lower priority -process. What if the higher priority process wants to take that mutex again. -The high priority process would fail to take that mutex that it just gave up -and it would need to boost the lower priority process to run with full -latency of that critical section (since the low priority process just entered -it). 
- -There's no reason a high priority process that gives up a mutex should be -penalized if it tries to take that mutex again. If the new owner of the -mutex has not woken up yet, there's no reason that the higher priority process -could not take that mutex away. - -To solve this, we introduced Pending Ownership and Lock Stealing. When a -new process is given a mutex that it was blocked on, it is only given -pending ownership. This means that it's the new owner, unless a higher -priority process comes in and tries to grab that mutex. If a higher priority -process does come along and wants that mutex, we let the higher priority -process "steal" the mutex from the pending owner (only if it is still pending) -and continue with the mutex. - - -Taking of a mutex (The walk through) ------------------------------------- - -OK, now let's take a look at the detailed walk through of what happens when -taking a mutex. - -The first thing that is tried is the fast taking of the mutex. This is -done when we have CMPXCHG enabled (otherwise the fast taking automatically -fails). Only when the owner field of the mutex is NULL can the lock be -taken with the CMPXCHG and nothing else needs to be done. - -If there is contention on the lock, whether it is owned or pending owner -we go about the slow path (rt_mutex_slowlock). - -The slow path function is where the task's waiter structure is created on -the stack. This is because the waiter structure is only needed for the -scope of this function. The waiter structure holds the nodes to store -the task on the wait_list of the mutex, and if need be, the pi_list of -the owner. - -The wait_lock of the mutex is taken since the slow path of unlocking the -mutex also takes this lock. - -We then call try_to_take_rt_mutex. This is where the architecture that -does not implement CMPXCHG would always grab the lock (if there's no -contention). - -try_to_take_rt_mutex is used every time the task tries to grab a mutex in the -slow path. The first thing that is done here is an atomic setting of -the "Has Waiters" flag of the mutex's owner field. Yes, this could really -be false, because if the mutex has no owner, there are no waiters and -the current task also won't have any waiters. But we don't have the lock -yet, so we assume we are going to be a waiter. The reason for this is to -play nice for those architectures that do have CMPXCHG. By setting this flag -now, the owner of the mutex can't release the mutex without going into the -slow unlock path, and it would then need to grab the wait_lock, which this -code currently holds. So setting the "Has Waiters" flag forces the owner -to synchronize with this code. - -Now that we know that we can't have any races with the owner releasing the -mutex, we check to see if we can take the ownership. This is done if the -mutex doesn't have a owner, or if we can steal the mutex from a pending -owner. Let's look at the situations we have here. - - 1) Has owner that is pending - ---------------------------- - - The mutex has a owner, but it hasn't woken up and the mutex flag - "Pending Owner" is set. The first check is to see if the owner isn't the - current task. This is because this function is also used for the pending - owner to grab the mutex. When a pending owner wakes up, it checks to see - if it can take the mutex, and this is done if the owner is already set to - itself. If so, we succeed and leave the function, clearing the "Pending - Owner" bit. 
- - If the pending owner is not current, we check to see if the current priority is - higher than the pending owner. If not, we fail the function and return. - - There's also something special about a pending owner. That is a pending owner - is never blocked on a mutex. So there is no PI chain to worry about. It also - means that if the mutex doesn't have any waiters, there's no accounting needed - to update the pending owner's pi_list, since we only worry about processes - blocked on the current mutex. - - If there are waiters on this mutex, and we just stole the ownership, we need - to take the top waiter, remove it from the pi_list of the pending owner, and - add it to the current pi_list. Note that at this moment, the pending owner - is no longer on the list of waiters. This is fine, since the pending owner - would add itself back when it realizes that it had the ownership stolen - from itself. When the pending owner tries to grab the mutex, it will fail - in try_to_take_rt_mutex if the owner field points to another process. - - 2) No owner - ----------- - - If there is no owner (or we successfully stole the lock), we set the owner - of the mutex to current, and set the flag of "Has Waiters" if the current - mutex actually has waiters, or we clear the flag if it doesn't. See, it was - OK that we set that flag early, since now it is cleared. - - 3) Failed to grab ownership - --------------------------- - - The most interesting case is when we fail to take ownership. This means that - there exists an owner, or there's a pending owner with equal or higher - priority than the current task. - -We'll continue on the failed case. - -If the mutex has a timeout, we set up a timer to go off to break us out -of this mutex if we failed to get it after a specified amount of time. - -Now we enter a loop that will continue to try to take ownership of the mutex, or -fail from a timeout or signal. - -Once again we try to take the mutex. This will usually fail the first time -in the loop, since it had just failed to get the mutex. But the second time -in the loop, this would likely succeed, since the task would likely be -the pending owner. - -If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done -here. - -The waiter structure has a "task" field that points to the task that is blocked -on the mutex. This field can be NULL the first time it goes through the loop -or if the task is a pending owner and had its mutex stolen. If the "task" -field is NULL then we need to set up the accounting for it. - -Task blocks on mutex --------------------- - -The accounting of a mutex and process is done with the waiter structure of -the process. The "task" field is set to the process, and the "lock" field -to the mutex. The plist nodes are initialized to the processes current -priority. - -Since the wait_lock was taken at the entry of the slow lock, we can safely -add the waiter to the wait_list. If the current process is the highest -priority process currently waiting on this mutex, then we remove the -previous top waiter process (if it exists) from the pi_list of the owner, -and add the current process to that list. Since the pi_list of the owner -has changed, we call rt_mutex_adjust_prio on the owner to see if the owner -should adjust its priority accordingly. - -If the owner is also blocked on a lock, and had its pi_list changed -(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead -and run rt_mutex_adjust_prio_chain on the owner, as described earlier. 
- -Now all locks are released, and if the current process is still blocked on a -mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). - -Waking up in the loop ---------------------- - -The schedule can then wake up for a few reasons. - 1) we were given pending ownership of the mutex. - 2) we received a signal and was TASK_INTERRUPTIBLE - 3) we had a timeout and was TASK_INTERRUPTIBLE - -In any of these cases, we continue the loop and once again try to grab the -ownership of the mutex. If we succeed, we exit the loop, otherwise we continue -and on signal and timeout, will exit the loop, or if we had the mutex stolen -we just simply add ourselves back on the lists and go back to sleep. - -Note: For various reasons, because of timeout and signals, the steal mutex - algorithm needs to be careful. This is because the current process is - still on the wait_list. And because of dynamic changing of priorities, - especially on SCHED_OTHER tasks, the current process can be the - highest priority task on the wait_list. - -Failed to get mutex on Timeout or Signal ----------------------------------------- - -If a timeout or signal occurred, the waiter's "task" field would not be -NULL and the task needs to be taken off the wait_list of the mutex and perhaps -pi_list of the owner. If this process was a high priority process, then -the rt_mutex_adjust_prio_chain needs to be executed again on the owner, -but this time it will be lowering the priorities. - - -Unlocking the Mutex -------------------- - -The unlocking of a mutex also has a fast path for those architectures with -CMPXCHG. Since the taking of a mutex on contention always sets the -"Has Waiters" flag of the mutex's owner, we use this to know if we need to -take the slow path when unlocking the mutex. If the mutex doesn't have any -waiters, the owner field of the mutex would equal the current process and -the mutex can be unlocked by just replacing the owner field with NULL. - -If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), -the slow unlock path is taken. - -The first thing done in the slow unlock path is to take the wait_lock of the -mutex. This synchronizes the locking and unlocking of the mutex. - -A check is made to see if the mutex has waiters or not. On architectures that -do not have CMPXCHG, this is the location that the owner of the mutex will -determine if a waiter needs to be awoken or not. On architectures that -do have CMPXCHG, that check is done in the fast path, but it is still needed -in the slow path too. If a waiter of a mutex woke up because of a signal -or timeout between the time the owner failed the fast path CMPXCHG check and -the grabbing of the wait_lock, the mutex may not have any waiters, thus the -owner still needs to make this check. If there are no waiters then the mutex -owner field is set to NULL, the wait_lock is released and nothing more is -needed. - -If there are waiters, then we need to wake one up and give that waiter -pending ownership. - -On the wake up code, the pi_lock of the current owner is taken. The top -waiter of the lock is found and removed from the wait_list of the mutex -as well as the pi_list of the current owner. The task field of the new -pending owner's waiter structure is set to NULL, and the owner field of the -mutex is set to the new owner with the "Pending Owner" bit set, as well -as the "Has Waiters" bit if there still are other processes blocked on the -mutex. 
- -The pi_lock of the previous owner is released, and the new pending owner's -pi_lock is taken. Remember that this is the trick to prevent the race -condition in rt_mutex_adjust_prio_chain from adding itself as a waiter -on the mutex. - -We now clear the "pi_blocked_on" field of the new pending owner, and if -the mutex still has waiters pending, we add the new top waiter to the pi_list -of the pending owner. - -Finally we unlock the pi_lock of the pending owner and wake it up. - - -Contact -------- - -For updates on this document, please email Steven Rostedt - - -Credits -------- - -Author: Steven Rostedt - -Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap - -Updates -------- - -This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt deleted file mode 100644 index 243393d882ee..000000000000 --- a/Documentation/rt-mutex.txt +++ /dev/null @@ -1,79 +0,0 @@ -RT-mutex subsystem with PI support ----------------------------------- - -RT-mutexes with priority inheritance are used to support PI-futexes, -which enable pthread_mutex_t priority inheritance attributes -(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details -about PI-futexes.] - -This technology was developed in the -rt tree and streamlined for -pthread_mutex support. - -Basic principles: ------------------ - -RT-mutexes extend the semantics of simple mutexes by the priority -inheritance protocol. - -A low priority owner of a rt-mutex inherits the priority of a higher -priority waiter until the rt-mutex is released. If the temporarily -boosted owner blocks on a rt-mutex itself it propagates the priority -boosting to the owner of the other rt_mutex it gets blocked on. The -priority boosting is immediately removed once the rt_mutex has been -unlocked. - -This approach allows us to shorten the block of high-prio tasks on -mutexes which protect shared resources. Priority inheritance is not a -magic bullet for poorly designed applications, but it allows -well-designed applications to use userspace locks in critical parts of -an high priority thread, without losing determinism. - -The enqueueing of the waiters into the rtmutex waiter list is done in -priority order. For same priorities FIFO order is chosen. For each -rtmutex, only the top priority waiter is enqueued into the owner's -priority waiters list. This list too queues in priority order. Whenever -the top priority waiter of a task changes (for example it timed out or -got a signal), the priority of the owner task is readjusted. [The -priority enqueueing is handled by "plists", see include/linux/plist.h -for more details.] - -RT-mutexes are optimized for fastpath operations and have no internal -locking overhead when locking an uncontended mutex or unlocking a mutex -without waiters. The optimized fastpath operations require cmpxchg -support. [If that is not available then the rt-mutex internal spinlock -is used] - -The state of the rt-mutex is tracked via the owner field of the rt-mutex -structure: - -rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 -are used to keep track of the "owner is pending" and "rtmutex has -waiters" state. 
- - owner bit1 bit0 - NULL 0 0 mutex is free (fast acquire possible) - NULL 0 1 invalid state - NULL 1 0 Transitional state* - NULL 1 1 invalid state - taskpointer 0 0 mutex is held (fast release possible) - taskpointer 0 1 task is pending owner - taskpointer 1 0 mutex is held and has waiters - taskpointer 1 1 task is pending owner and mutex has waiters - -Pending-ownership handling is a performance optimization: -pending-ownership is assigned to the first (highest priority) waiter of -the mutex, when the mutex is released. The thread is woken up and once -it starts executing it can acquire the mutex. Until the mutex is taken -by it (bit 0 is cleared) a competing higher priority thread can "steal" -the mutex which puts the woken up thread back on the waiters list. - -The pending-ownership optimization is especially important for the -uninterrupted workflow of high-prio tasks which repeatedly -takes/releases locks that have lower-prio waiters. Without this -optimization the higher-prio thread would ping-pong to the lower-prio -task [because at unlock time we always assign a new owner]. - -(*) The "mutex has waiters" bit gets set to take the lock. If the lock -doesn't already have an owner, this bit is quickly cleared if there are -no waiters. So this is a transitional state to synchronize with looking -at the owner field of the mutex and the mutex owner releasing the lock. diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt deleted file mode 100644 index 97eaf5727178..000000000000 --- a/Documentation/spinlocks.txt +++ /dev/null @@ -1,167 +0,0 @@ -Lesson 1: Spin locks - -The most basic primitive for locking is spinlock. - -static DEFINE_SPINLOCK(xxx_lock); - - unsigned long flags; - - spin_lock_irqsave(&xxx_lock, flags); - ... critical section here .. - spin_unlock_irqrestore(&xxx_lock, flags); - -The above is always safe. It will disable interrupts _locally_, but the -spinlock itself will guarantee the global lock, so it will guarantee that -there is only one thread-of-control within the region(s) protected by that -lock. This works well even under UP also, so the code does _not_ need to -worry about UP vs SMP issues: the spinlocks work correctly under both. - - NOTE! Implications of spin_locks for memory are further described in: - - Documentation/memory-barriers.txt - (5) LOCK operations. - (6) UNLOCK operations. - -The above is usually pretty simple (you usually need and want only one -spinlock for most things - using more than one spinlock can make things a -lot more complex and even slower and is usually worth it only for -sequences that you _know_ need to be split up: avoid it at all cost if you -aren't sure). - -This is really the only really hard part about spinlocks: once you start -using spinlocks they tend to expand to areas you might not have noticed -before, because you have to make sure the spinlocks correctly protect the -shared data structures _everywhere_ they are used. The spinlocks are most -easily added to places that are completely independent of other code (for -example, internal driver data structures that nobody else ever touches). - - NOTE! The spin-lock is safe only when you _also_ use the lock itself - to do locking across CPU's, which implies that EVERYTHING that - touches a shared variable has to agree about the spinlock they want - to use. - ----- - -Lesson 2: reader-writer spinlocks. 
- -If your data accesses have a very natural pattern where you usually tend -to mostly read from the shared variables, the reader-writer locks -(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple -readers to be in the same critical region at once, but if somebody wants -to change the variables it has to get an exclusive write lock. - - NOTE! reader-writer locks require more atomic memory operations than - simple spinlocks. Unless the reader critical section is long, you - are better off just using spinlocks. - -The routines look the same as above: - - rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); - - unsigned long flags; - - read_lock_irqsave(&xxx_lock, flags); - .. critical section that only reads the info ... - read_unlock_irqrestore(&xxx_lock, flags); - - write_lock_irqsave(&xxx_lock, flags); - .. read and write exclusive access to the info ... - write_unlock_irqrestore(&xxx_lock, flags); - -The above kind of lock may be useful for complex data structures like -linked lists, especially searching for entries without changing the list -itself. The read lock allows many concurrent readers. Anything that -_changes_ the list will have to get the write lock. - - NOTE! RCU is better for list traversal, but requires careful - attention to design detail (see Documentation/RCU/listRCU.txt). - -Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ -time need to do any changes (even if you don't do it every time), you have -to get the write-lock at the very beginning. - - NOTE! We are working hard to remove reader-writer spinlocks in most - cases, so please don't add a new one without consensus. (Instead, see - Documentation/RCU/rcu.txt for complete information.) - ----- - -Lesson 3: spinlocks revisited. - -The single spin-lock primitives above are by no means the only ones. They -are the most safe ones, and the ones that work under all circumstances, -but partly _because_ they are safe they are also fairly slow. They are slower -than they'd need to be, because they do have to disable interrupts -(which is just a single instruction on a x86, but it's an expensive one - -and on other architectures it can be worse). - -If you have a case where you have to protect a data structure across -several CPU's and you want to use spinlocks you can potentially use -cheaper versions of the spinlocks. IFF you know that the spinlocks are -never used in interrupt handlers, you can use the non-irq versions: - - spin_lock(&lock); - ... - spin_unlock(&lock); - -(and the equivalent read-write versions too, of course). The spinlock will -guarantee the same kind of exclusive access, and it will be much faster. -This is useful if you know that the data in question is only ever -manipulated from a "process context", ie no interrupts involved. - -The reasons you mustn't use these versions if you have interrupts that -play with the spinlock is that you can get deadlocks: - - spin_lock(&lock); - ... - <- interrupt comes in: - spin_lock(&lock); - -where an interrupt tries to lock an already locked variable. This is ok if -the other interrupt happens on another CPU, but it is _not_ ok if the -interrupt happens on the same CPU that already holds the lock, because the -lock will obviously never be released (because the interrupt is waiting -for the lock, and the lock-holder is interrupted by the interrupt and will -not continue until the interrupt has been processed). 
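A hedged sketch of the resulting division of labour (illustrative only, not
taken from any particular driver): data shared with an interrupt handler is
updated with the irq-saving variant in process context, while the handler
itself can use the plain variant, because local interrupts are already
disabled there:

  static DEFINE_SPINLOCK(dev_lock);
  static unsigned long dev_events;

  /* process context: must block local interrupts around the lock */
  void record_event(void)
  {
          unsigned long flags;

          spin_lock_irqsave(&dev_lock, flags);
          dev_events++;
          spin_unlock_irqrestore(&dev_lock, flags);
  }

  /* interrupt handler: local interrupts are off, plain lock is enough */
  void record_event_irq(void)
  {
          spin_lock(&dev_lock);
          dev_events++;
          spin_unlock(&dev_lock);
  }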
- -(This is also the reason why the irq-versions of the spinlocks only need -to disable the _local_ interrupts - it's ok to use spinlocks in interrupts -on other CPU's, because an interrupt on another CPU doesn't interrupt the -CPU that holds the lock, so the lock-holder can continue and eventually -releases the lock). - -Note that you can be clever with read-write locks and interrupts. For -example, if you know that the interrupt only ever gets a read-lock, then -you can use a non-irq version of read locks everywhere - because they -don't block on each other (and thus there is no dead-lock wrt interrupts. -But when you do the write-lock, you have to use the irq-safe version. - -For an example of being clever with rw-locks, see the "waitqueue_lock" -handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from -within an interrupt, they only read the queue in order to know whom to -wake up. So read-locks are safe (which is good: they are very common -indeed), while write-locks need to protect themselves against interrupts. - - Linus - ----- - -Reference information: - -For dynamic initialization, use spin_lock_init() or rwlock_init() as -appropriate: - - spinlock_t xxx_lock; - rwlock_t xxx_rw_lock; - - static int __init xxx_init(void) - { - spin_lock_init(&xxx_lock); - rwlock_init(&xxx_rw_lock); - ... - } - - module_init(xxx_init); - -For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or -__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt deleted file mode 100644 index 8a112dc304c3..000000000000 --- a/Documentation/ww-mutex-design.txt +++ /dev/null @@ -1,344 +0,0 @@ -Wait/Wound Deadlock-Proof Mutex Design -====================================== - -Please read mutex-design.txt first, as it applies to wait/wound mutexes too. - -Motivation for WW-Mutexes -------------------------- - -GPU's do operations that commonly involve many buffers. Those buffers -can be shared across contexts/processes, exist in different memory -domains (for example VRAM vs system memory), and so on. And with -PRIME / dmabuf, they can even be shared across devices. So there are -a handful of situations where the driver needs to wait for buffers to -become ready. If you think about this in terms of waiting on a buffer -mutex for it to become available, this presents a problem because -there is no way to guarantee that buffers appear in a execbuf/batch in -the same order in all contexts. That is directly under control of -userspace, and a result of the sequence of GL calls that an application -makes. Which results in the potential for deadlock. The problem gets -more complex when you consider that the kernel may need to migrate the -buffer(s) into VRAM before the GPU operates on the buffer(s), which -may in turn require evicting some other buffers (and you don't want to -evict other buffers which are already queued up to the GPU), but for a -simplified understanding of the problem you can ignore this. - -The algorithm that the TTM graphics subsystem came up with for dealing with -this problem is quite simple. For each group of buffers (execbuf) that need -to be locked, the caller would be assigned a unique reservation id/ticket, -from a global counter. In case of deadlock while locking all the buffers -associated with a execbuf, the one with the lowest reservation ticket (i.e. -the oldest task) wins, and the one with the higher reservation id (i.e. 
the -younger task) unlocks all of the buffers that it has already locked, and then -tries again. - -In the RDBMS literature this deadlock handling approach is called wait/wound: -The older task waits until it can acquire the contended lock. The younger task -needs to back off and drop all the locks it is currently holding, i.e. the -younger task is wounded. - -Concepts -------- - -Compared to normal mutexes two additional concepts/objects show up in the lock -interface for w/w mutexes: - -Acquire context: To ensure eventual forward progress it is important that a task -trying to acquire locks doesn't grab a new reservation id, but keeps the one it -acquired when starting the lock acquisition. This ticket is stored in the -acquire context. Furthermore the acquire context keeps track of debugging state -to catch w/w mutex interface abuse. - -W/w class: In contrast to normal mutexes the lock class needs to be explicit for -w/w mutexes, since it is required to initialize the acquire context. - -Furthermore there are three different classes of w/w lock acquire functions: - -* Normal lock acquisition with a context, using ww_mutex_lock. - -* Slowpath lock acquisition on the contending lock, used by the wounded task - after having dropped all already acquired locks. These functions have the - _slow postfix. - - From a simple semantics point-of-view the _slow functions are not strictly - required, since simply calling the normal ww_mutex_lock functions on the - contending lock (after having dropped all other already acquired locks) will - work correctly. After all if no other ww mutex has been acquired yet there's - no deadlock potential and hence the ww_mutex_lock call will block and not - prematurely return -EDEADLK. The advantage of the _slow functions is in - interface safety: - - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow - has a void return type. Note that since ww mutex code needs loops/retries - anyway the __must_check doesn't result in spurious warnings, even though the - very first lock operation can never fail. - - When full debugging is enabled ww_mutex_lock_slow checks that all acquired - ww mutexes have been released (preventing deadlocks) and makes sure that we - block on the contending lock (preventing spinning through the -EDEADLK - slowpath until the contended lock can be acquired). - -* Functions to only acquire a single w/w mutex, which results in the exact same - semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL - context. - - Again this is not strictly required. But often you only want to acquire a - single lock in which case it's pointless to set up an acquire context (and so - better to avoid grabbing a deadlock avoidance ticket). - -Of course, all the usual variants for handling wake-ups due to signals are also -provided. - -Usage ----- - -Three different ways to acquire locks within the same w/w class. Common -definitions for methods #1 and #2: - -static DEFINE_WW_CLASS(ww_class); - -struct obj { - struct ww_mutex lock; - /* obj data */ -}; - -struct obj_entry { - struct list_head head; - struct obj *obj; -}; - -Method 1, using a list in execbuf->buffers that's not allowed to be reordered. -This is useful if a list of required objects is already tracked somewhere. -Furthermore the lock helper can propagate the -EALREADY return code back to -the caller as a signal that an object is twice on the list.
This is useful if -the list is constructed from userspace input and the ABI requires userspace to -not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl). - -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *res_obj = NULL; - struct obj_entry *contended_entry = NULL; - struct obj_entry *entry; - - ww_acquire_init(ctx, &ww_class); - -retry: - list_for_each_entry (entry, list, head) { - if (entry->obj == res_obj) { - res_obj = NULL; - continue; - } - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - contended_entry = entry; - goto err; - } - } - - ww_acquire_done(ctx); - return 0; - -err: - list_for_each_entry_continue_reverse (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - if (res_obj) - ww_mutex_unlock(&res_obj->lock); - - if (ret == -EDEADLK) { - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); - res_obj = contended_entry->obj; - goto retry; - } - ww_acquire_fini(ctx); - - return ret; -} - -Method 2, using a list in execbuf->buffers that can be reordered. Same semantics -of duplicate entry detection using -EALREADY as method 1 above. But the -list-reordering allows for a bit more idiomatic code. - -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry, *entry2; - - ww_acquire_init(ctx, &ww_class); - - list_for_each_entry (entry, list, head) { - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - entry2 = entry; - - list_for_each_entry_continue_reverse (entry2, list, head) - ww_mutex_unlock(&entry2->obj->lock); - - if (ret != -EDEADLK) { - ww_acquire_fini(ctx); - return ret; - } - - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&entry->obj->lock, ctx); - - /* - * Move buf to head of the list, this will point - * buf->next to the first unlocked entry, - * restarting the for loop. - */ - list_del(&entry->head); - list_add(&entry->head, list); - } - } - - ww_acquire_done(ctx); - return 0; -} - -Unlocking works the same way for both methods #1 and #2: - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry; - - list_for_each_entry (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - ww_acquire_fini(ctx); -} - -Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, -e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, -and edges can only be changed when holding the locks of all involved nodes. w/w -mutexes are a natural fit for such a case for two reasons: -- They can handle lock-acquisition in any order which allows us to start walking - a graph from a starting point and then iteratively discovering new edges and - locking down the nodes those edges connect to. -- Due to the -EALREADY return code signalling that a given objects is already - held there's no need for additional book-keeping to break cycles in the graph - or keep track off which looks are already held (when using more than one node - as a starting point). - -Note that this approach differs in two important ways from the above methods: -- Since the list of objects is dynamically constructed (and might very well be - different when retrying due to hitting the -EDEADLK wound condition) there's - no need to keep any object on a persistent list when it's not locked. We can - therefore move the list_head into the object itself. 
-- On the other hand the dynamic object list construction also means that the -EALREADY return - code can't be propagated. - -Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a -list of starting nodes (passed in from userspace) using one of the above -methods. And then lock any additional objects affected by the operations using -method #3 below. The backoff/retry procedure will be a bit more involved, since -when the dynamic locking step hits -EDEADLK we also need to unlock all the -objects acquired with the fixed list. But the w/w mutex debug checks will catch -any interface misuse for these cases. - -Also, method 3 can't fail the lock acquisition step since it doesn't return --EALREADY. Of course this would be different when using the _interruptible -variants, but that's outside of the scope of these examples here. - -struct obj { - struct ww_mutex ww_mutex; - struct list_head locked_list; -}; - -static DEFINE_WW_CLASS(ww_class); - -void __unlock_objs(struct list_head *list) -{ - struct obj *entry, *temp; - - list_for_each_entry_safe (entry, temp, list, locked_list) { - /* need to do that before unlocking, since only the current lock holder is - allowed to use object */ - list_del(&entry->locked_list); - ww_mutex_unlock(entry->ww_mutex) - } -} - -void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *obj; - - ww_acquire_init(ctx, &ww_class); - -retry: - /* re-init loop start state */ - loop { - /* magic code which walks over a graph and decides which objects - * to lock */ - - ret = ww_mutex_lock(obj->ww_mutex, ctx); - if (ret == -EALREADY) { - /* we have that one already, get to the next object */ - continue; - } - if (ret == -EDEADLK) { - __unlock_objs(list); - - ww_mutex_lock_slow(obj, ctx); - list_add(&entry->locked_list, list); - goto retry; - } - - /* locked a new object, add it to the list */ - list_add_tail(&entry->locked_list, list); - } - - ww_acquire_done(ctx); - return 0; -} - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - __unlock_objs(list); - ww_acquire_fini(ctx); -} - -Method 4: Only lock one single objects. In that case deadlock detection and -prevention is obviously overkill, since with grabbing just one lock you can't -produce a deadlock within just one class. To simplify this case the w/w mutex -api can be used with a NULL context. - -Implementation Details ----------------------- - -Design: - ww_mutex currently encapsulates a struct mutex, this means no extra overhead for - normal mutex locks, which are far more common. As such there is only a small - increase in code size if wait/wound mutexes are not used. - - In general, not much contention is expected. The locks are typically used to - serialize access to resources for devices. The only way to make wakeups - smarter would be at the cost of adding a field to struct mutex_waiter. This - would add overhead to all cases where normal mutexes are used, and - ww_mutexes are generally less performance sensitive. - -Lockdep: - Special care has been taken to warn for as many cases of api abuse - as possible. Some common api abuses will be caught with - CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. - - Some of the errors which will be warned about: - - Forgetting to call ww_acquire_fini or ww_acquire_init. - - Attempting to lock more mutexes after ww_acquire_done. - - Attempting to lock the wrong mutex after -EDEADLK and - unlocking all mutexes. 
- - Attempting to lock the right mutex after -EDEADLK, - before unlocking all mutexes. - - - Calling ww_mutex_lock_slow before -EDEADLK was returned. - - - Unlocking mutexes with the wrong unlock function. - - Calling one of the ww_acquire_* twice on the same context. - - Using a different ww_class for the mutex than for the ww_acquire_ctx. - - Normal lockdep errors that can result in deadlocks. - - Some of the lockdep errors that can result in deadlocks: - - Calling ww_acquire_init to initialize a second ww_acquire_ctx before - having called ww_acquire_fini on the first. - - 'normal' deadlocks that can occur. - -FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic -implemented. diff --git a/MAINTAINERS b/MAINTAINERS index 1acc624ecfd7..aac481fcbf5c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5523,8 +5523,8 @@ M: Ingo Molnar L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking S: Maintained -F: Documentation/lockdep*.txt -F: Documentation/lockstat.txt +F: Documentation/locking/lockdep*.txt +F: Documentation/locking/lockstat.txt F: include/linux/lockdep.h F: kernel/locking/ diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index 0dc57d5ecd10..3a02e5e3e9f3 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -35,7 +35,7 @@ * of extra utility/tracking out of our acquire-ctx. This is provided * by drm_modeset_lock / drm_modeset_acquire_ctx. * - * For basic principles of ww_mutex, see: Documentation/ww-mutex-design.txt + * For basic principles of ww_mutex, see: Documentation/locking/ww-mutex-design.txt * * The basic usage pattern is to: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 008388f920d7..f388481201cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -4,7 +4,7 @@ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * - * see Documentation/lockdep-design.txt for more details. + * see Documentation/locking/lockdep-design.txt for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H diff --git a/include/linux/mutex.h b/include/linux/mutex.h index e4c29418f407..cc31498fc526 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -133,7 +133,7 @@ static inline int mutex_is_locked(struct mutex *lock) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 035d3c57fc8a..8f498cdde280 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -149,7 +149,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. - * See Documentation/lockdep-design.txt for more details.) + * See Documentation/locking/lockdep-design.txt for more details.) 
*/ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0d8b6ed93874..dadbf88c22c4 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -15,7 +15,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,7 +8,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.txt for details. */ #include #include diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 901096d31c66..9b94a063e26c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -924,7 +924,7 @@ config PROVE_LOCKING the proof of observed correctness is also maintained for an arbitrary combination of these separate locking variants. - For more details, see Documentation/lockdep-design.txt. + For more details, see Documentation/locking/lockdep-design.txt. config LOCKDEP bool @@ -945,7 +945,7 @@ config LOCK_STAT help This feature enables tracking lock contention points - For more details, see Documentation/lockstat.txt + For more details, see Documentation/locking/lockstat.txt This also enables lock events required by "perf lock", subcommand of perf. -- cgit From ae17ea0ec7d8fa64fbb773a52b2df5ba4766bcb8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 13:22:02 -0400 Subject: locking/selftest: Support queued rwlock The queued rwlock does not support the use of recursive read-lock in the process context. With changes in the lockdep code to check and disallow recursive read-lock, it is also necessary for the locking selftest to be updated to change the process context recursive read locking results from SUCCESS to FAILURE for rwlock. 
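An illustrative interleaving (a sketch, not a trace from a real system)
shows why a fair, queued rwlock cannot allow the recursion:

  CPU 0                           CPU 1
  read_lock(&lock);
                                  write_lock(&lock);  /* queues, waits */
  read_lock(&lock);               /* the queued rwlock is fair, so the
                                     second read attempt waits behind the
                                     queued writer, which itself waits on
                                     CPU 0's first read hold: deadlock */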
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra Cc: Maarten Lankhorst Cc: Rik van Riel Cc: Scott J Norton Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407345722-61615-3-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- lib/locking-selftest.c | 56 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 872a15a2a637..62af709b2083 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -267,19 +267,46 @@ GENERATE_TESTCASE(AA_rsem) #undef E /* - * Special-case for read-locking, they are - * allowed to recurse on the same lock class: + * Special-case for read-locking, they are not allowed to + * recurse on the same lock class except under interrupt context: */ static void rlock_AA1(void) { RL(X1); - RL(X1); // this one should NOT fail + RL(X1); // this one should fail } static void rlock_AA1B(void) { RL(X1); - RL(X2); // this one should NOT fail + RL(X2); // this one should fail +} + +static void rlock_AHA1(void) +{ + RL(X1); + HARDIRQ_ENTER(); + RL(X1); // this one should NOT fail + HARDIRQ_EXIT(); +} + +static void rlock_AHA1B(void) +{ + RL(X1); + HARDIRQ_ENTER(); + RL(X2); // this one should NOT fail + HARDIRQ_EXIT(); +} + +static void rlock_ASAHA1(void) +{ + RL(X1); + SOFTIRQ_ENTER(); + RL(X1); // this one should NOT fail + HARDIRQ_ENTER(); + RL(X1); // this one should NOT fail + HARDIRQ_EXIT(); + SOFTIRQ_EXIT(); } static void rsem_AA1(void) @@ -1069,7 +1096,7 @@ static inline void print_testname(const char *testname) print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1830,14 +1857,14 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); printk(" |"); - dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); + dotest(rlock_AA1, FAILURE, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); printk("\n"); print_testname("recursive read-lock #2"); printk(" |"); - dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK); + dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); printk("\n"); @@ -1856,6 +1883,21 @@ void locking_selftest(void) dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); printk("\n"); + print_testname("recursive rlock with interrupt"); + printk(" |"); + dotest(rlock_AHA1, SUCCESS, LOCKTYPE_RWLOCK); + printk("\n"); + + print_testname("recursive rlock with interrupt #2"); + printk(" |"); + dotest(rlock_AHA1B, SUCCESS, LOCKTYPE_RWLOCK); + printk("\n"); + + print_testname("recursive rlock with interrupt #3"); + printk(" |"); + dotest(rlock_ASAHA1, SUCCESS, LOCKTYPE_RWLOCK); + printk("\n"); + printk(" --------------------------------------------------------------------------\n"); /* -- cgit From 560cb12a4080a48b84da8b96878cafbd193c4d64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2014 16:12:30 +0200 Subject: locking,arch: Rewrite generic atomic support Rewrite generic atomic support to only require cmpxchg(), generate all other primitives from that. 
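Unrolled for the add case, the construction is a cmpxchg() retry loop; a
hedged sketch of what the macro below expands to (not the literal
preprocessor output):

  static inline int atomic_add_return(int i, atomic_t *v)
  {
          int c, old;

          c = v->counter;
          while ((old = cmpxchg(&v->counter, c, c + i)) != c)
                  c = old;

          return c + i;
  }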
Furthermore reduce the endless repetition for all these primitives to a few CPP macros. This way we get more for less lines. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140508135852.940119622@infradead.org Cc: Arnd Bergmann Cc: David Howells Cc: Paul E. McKenney Cc: David S. Miller Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/atomic.h | 192 ++++++++++++++++++++--------------------- include/asm-generic/atomic64.h | 20 ++++- lib/atomic64.c | 83 ++++++++---------- 3 files changed, 148 insertions(+), 147 deletions(-) (limited to 'lib') diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 9c79e7603459..56d4d36e1531 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -18,14 +18,100 @@ #include #include +/* + * atomic_$op() - $op integer to atomic variable + * @i: integer value to $op + * @v: pointer to the atomic variable + * + * Atomically $ops @i to @v. Does not strictly guarantee a memory-barrier, use + * smp_mb__{before,after}_atomic(). + */ + +/* + * atomic_$op_return() - $op interer to atomic variable and returns the result + * @i: integer value to $op + * @v: pointer to the atomic variable + * + * Atomically $ops @i to @v. Does imply a full memory barrier. + */ + #ifdef CONFIG_SMP -/* Force people to define core atomics */ -# if !defined(atomic_add_return) || !defined(atomic_sub_return) || \ - !defined(atomic_clear_mask) || !defined(atomic_set_mask) -# error "SMP requires a little arch-specific magic" -# endif + +/* we can build all atomic primitives from cmpxchg */ + +#define ATOMIC_OP(op, c_op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + int c, old; \ + \ + c = v->counter; \ + while ((old = cmpxchg(&v->counter, c, c c_op i)) != c) \ + c = old; \ +} + +#define ATOMIC_OP_RETURN(op, c_op) \ +static inline int atomic_##op##_return(int i, atomic_t *v) \ +{ \ + int c, old; \ + \ + c = v->counter; \ + while ((old = cmpxchg(&v->counter, c, c c_op i)) != c) \ + c = old; \ + \ + return c c_op i; \ +} + +#else + +#include + +#define ATOMIC_OP(op, c_op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned long flags; \ + \ + raw_local_irq_save(flags); \ + v->counter = v->counter c_op i; \ + raw_local_irq_restore(flags); \ +} + +#define ATOMIC_OP_RETURN(op, c_op) \ +static inline int atomic_##op##_return(int i, atomic_t *v) \ +{ \ + unsigned long flags; \ + int ret; \ + \ + raw_local_irq_save(flags); \ + ret = (v->counter = v->counter c_op i); \ + raw_local_irq_restore(flags); \ + \ + return ret; \ +} + +#endif /* CONFIG_SMP */ + +#ifndef atomic_add_return +ATOMIC_OP_RETURN(add, +) +#endif + +#ifndef atomic_sub_return +ATOMIC_OP_RETURN(sub, -) +#endif + +#ifndef atomic_clear_mask +ATOMIC_OP(and, &) +#define atomic_clear_mask(i, v) atomic_and(~(i), (v)) #endif +#ifndef atomic_set_mask +#define CONFIG_ARCH_HAS_ATOMIC_OR +ATOMIC_OP(or, |) +#define atomic_set_mask(i, v) atomic_or((i), (v)) +#endif + +#undef ATOMIC_OP_RETURN +#undef ATOMIC_OP + /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. 
@@ -33,8 +119,6 @@ #define ATOMIC_INIT(i) { (i) } -#ifdef __KERNEL__ - /** * atomic_read - read atomic variable * @v: pointer of type atomic_t @@ -56,52 +140,6 @@ #include -/** - * atomic_add_return - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns the result - */ -#ifndef atomic_add_return -static inline int atomic_add_return(int i, atomic_t *v) -{ - unsigned long flags; - int temp; - - raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */ - temp = v->counter; - temp += i; - v->counter = temp; - raw_local_irq_restore(flags); - - return temp; -} -#endif - -/** - * atomic_sub_return - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns the result - */ -#ifndef atomic_sub_return -static inline int atomic_sub_return(int i, atomic_t *v) -{ - unsigned long flags; - int temp; - - raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */ - temp = v->counter; - temp -= i; - v->counter = temp; - raw_local_irq_restore(flags); - - return temp; -} -#endif - static inline int atomic_add_negative(int i, atomic_t *v) { return atomic_add_return(i, v) < 0; @@ -139,49 +177,11 @@ static inline void atomic_dec(atomic_t *v) static inline int __atomic_add_unless(atomic_t *v, int a, int u) { - int c, old; - c = atomic_read(v); - while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c) - c = old; - return c; -} - -/** - * atomic_clear_mask - Atomically clear bits in atomic variable - * @mask: Mask of the bits to be cleared - * @v: pointer of type atomic_t - * - * Atomically clears the bits set in @mask from @v - */ -#ifndef atomic_clear_mask -static inline void atomic_clear_mask(unsigned long mask, atomic_t *v) -{ - unsigned long flags; - - mask = ~mask; - raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */ - v->counter &= mask; - raw_local_irq_restore(flags); + int c, old; + c = atomic_read(v); + while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c) + c = old; + return c; } -#endif - -/** - * atomic_set_mask - Atomically set bits in atomic variable - * @mask: Mask of the bits to be set - * @v: pointer of type atomic_t - * - * Atomically sets the bits set in @mask in @v - */ -#ifndef atomic_set_mask -static inline void atomic_set_mask(unsigned int mask, atomic_t *v) -{ - unsigned long flags; - - raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */ - v->counter |= mask; - raw_local_irq_restore(flags); -} -#endif -#endif /* __KERNEL__ */ #endif /* __ASM_GENERIC_ATOMIC_H */ diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index b18ce4f9ee3d..30ad9c86cebb 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -20,10 +20,22 @@ typedef struct { extern long long atomic64_read(const atomic64_t *v); extern void atomic64_set(atomic64_t *v, long long i); -extern void atomic64_add(long long a, atomic64_t *v); -extern long long atomic64_add_return(long long a, atomic64_t *v); -extern void atomic64_sub(long long a, atomic64_t *v); -extern long long atomic64_sub_return(long long a, atomic64_t *v); + +#define ATOMIC64_OP(op) \ +extern void atomic64_##op(long long a, atomic64_t *v); + +#define ATOMIC64_OP_RETURN(op) \ +extern long long atomic64_##op##_return(long long a, atomic64_t *v); + +#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op) + +ATOMIC64_OPS(add) +ATOMIC64_OPS(sub) + +#undef ATOMIC64_OPS 
+#undef ATOMIC64_OP_RETURN +#undef ATOMIC64_OP + extern long long atomic64_dec_if_positive(atomic64_t *v); extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n); extern long long atomic64_xchg(atomic64_t *v, long long new); diff --git a/lib/atomic64.c b/lib/atomic64.c index 08a4f068e61e..1298c05ef528 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -70,53 +70,42 @@ void atomic64_set(atomic64_t *v, long long i) } EXPORT_SYMBOL(atomic64_set); -void atomic64_add(long long a, atomic64_t *v) -{ - unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); - - raw_spin_lock_irqsave(lock, flags); - v->counter += a; - raw_spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(atomic64_add); - -long long atomic64_add_return(long long a, atomic64_t *v) -{ - unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); - long long val; - - raw_spin_lock_irqsave(lock, flags); - val = v->counter += a; - raw_spin_unlock_irqrestore(lock, flags); - return val; -} -EXPORT_SYMBOL(atomic64_add_return); - -void atomic64_sub(long long a, atomic64_t *v) -{ - unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); - - raw_spin_lock_irqsave(lock, flags); - v->counter -= a; - raw_spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(atomic64_sub); - -long long atomic64_sub_return(long long a, atomic64_t *v) -{ - unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); - long long val; - - raw_spin_lock_irqsave(lock, flags); - val = v->counter -= a; - raw_spin_unlock_irqrestore(lock, flags); - return val; -} -EXPORT_SYMBOL(atomic64_sub_return); +#define ATOMIC64_OP(op, c_op) \ +void atomic64_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + \ + raw_spin_lock_irqsave(lock, flags); \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ +} \ +EXPORT_SYMBOL(atomic64_##op); + +#define ATOMIC64_OP_RETURN(op, c_op) \ +long long atomic64_##op##_return(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = (v->counter c_op a); \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} \ +EXPORT_SYMBOL(atomic64_##op##_return); + +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) + +ATOMIC64_OPS(add, +=) +ATOMIC64_OPS(sub, -=) + +#undef ATOMIC64_OPS +#undef ATOMIC64_OP_RETURN +#undef ATOMIC64_OP long long atomic64_dec_if_positive(atomic64_t *v) { -- cgit From 5300fdcb7b7e97d83033bc7196582705524d35ea Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 13 Aug 2014 16:38:29 +0200 Subject: rhashtable: RCU annotations for next pointers Properly annotate next pointers as access is RCU protected in the lookup path. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 4 ++-- lib/rhashtable.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 9cda293c867d..8c6048e77f29 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -21,7 +21,7 @@ #include struct rhash_head { - struct rhash_head *next; + struct rhash_head __rcu *next; }; #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) @@ -97,7 +97,7 @@ u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node, gfp_t); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node, gfp_t); void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head **pprev, gfp_t flags); + struct rhash_head __rcu **pprev, gfp_t flags); bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e6940cf16628..338dd7aa5e13 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); * deletion when combined with walking or lookup. */ void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head **pprev, gfp_t flags) + struct rhash_head __rcu **pprev, gfp_t flags) { struct bucket_table *tbl = rht_dereference(ht->tbl, ht); -- cgit From c91eee56dc4f8c3d9ae834bacb835596d47a709e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 13 Aug 2014 16:38:30 +0200 Subject: rhashtable: unexport and make rht_obj() static No need to export rht_obj(), all inner to outer object translations occur internally. It was intended to be used with rht_for_each() which now primarily serves as the iterator for rhashtable_remove_pprev() to effectively flush and free the full table. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 1 - lib/rhashtable.c | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'lib') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8c6048e77f29..af967c4c7591 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -117,7 +117,6 @@ void rhashtable_destroy(const struct rhashtable *ht); #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) -/* Internal, use rht_obj() instead */ #define rht_entry(ptr, type, member) container_of(ptr, type, member) #define rht_entry_safe(ptr, type, member) \ ({ \ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 338dd7aa5e13..a2c78810ebc1 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -38,16 +38,10 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht) EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); #endif -/** - * rht_obj - cast hash head to outer object - * @ht: hash table - * @he: hashed node - */ -void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) +static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (void *) he - ht->p.head_offset; } -EXPORT_SYMBOL_GPL(rht_obj); static u32 __hashfn(const struct rhashtable *ht, const void *key, u32 len, u32 hsize) -- cgit From a98406e22c12e514bac28fec0a49dc793edaf3a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Aug 2014 17:03:28 +0200 Subject: random32: improvements to prandom_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch addresses a couple of minor items, mostly addressing prandom_bytes(): 1) prandom_bytes{,_state}() should use size_t for length arguments, 2) We can use put_unaligned() when filling the array instead of open coding it [ perhaps some archs will further benefit from their own arch specific implementation when GCC cannot make up for it ], 3) Fix a typo, 4) Better use unsigned int as type for getting the arch seed, 5) Make use of prandom_u32_max() for timer slack. Regarding the change to put_unaligned(), callers of prandom_bytes() which internally invoke prandom_bytes_state(), don't bother as they expect the array to be filled randomly and don't have any control of the internal state whatsoever (that's also why we have periodic reseeding there, etc), so they really don't care. Now for the direct callers of prandom_bytes_state(), which are solely located in test cases for MTD devices, that is, drivers/mtd/tests/{oobtest.c,pagetest.c,subpagetest.c}: These tests basically fill a test write-vector through prandom_bytes_state() with an a-priori defined seed each time and write that to a MTD device. Later on, they set up a read-vector and read back those blocks from the device. So in the verification phase, the write-vector is being re-setup [ so same seed and prandom_bytes_state() called ], and then memcmp()'ed against the read-vector to check if the data is the same. Akinobu, Lothar and I also tested this patch and it runs through the 3 relevant MTD test cases w/o any errors on the nandsim device (simulator for MTD devs) for x86_64, ppc64, ARM (i.MX28, i.MX53 and i.MX6): # modprobe nandsim first_id_byte=0x20 second_id_byte=0xac \ third_id_byte=0x00 fourth_id_byte=0x15 # modprobe mtd_oobtest dev=0 # modprobe mtd_pagetest dev=0 # modprobe mtd_subpagetest dev=0 We also don't have any users depending directly on a particular result of the PRNG (except the PRNG self-test itself), and that's just fine as it e.g.
allowed us easily to do things like upgrading from taus88 to taus113. Signed-off-by: Daniel Borkmann Tested-by: Akinobu Mita Tested-by: Lothar Waßmann Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/random.h | 4 ++-- lib/random32.c | 39 ++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'lib') diff --git a/include/linux/random.h b/include/linux/random.h index 57fbbffd77a0..b05856e16b75 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,7 @@ unsigned int get_random_int(void); unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); u32 prandom_u32(void); -void prandom_bytes(void *buf, int nbytes); +void prandom_bytes(void *buf, size_t nbytes); void prandom_seed(u32 seed); void prandom_reseed_late(void); @@ -35,7 +35,7 @@ struct rnd_state { }; u32 prandom_u32_state(struct rnd_state *state); -void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); /** * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) diff --git a/lib/random32.c b/lib/random32.c index c9b6bf3afe0c..0bee183fa18f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_RANDOM32_SELFTEST static void __init prandom_state_selftest(void); @@ -96,27 +97,23 @@ EXPORT_SYMBOL(prandom_u32); * This is used for pseudo-randomness with no outside seeding. * For more random results, use prandom_bytes(). */ -void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes) +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { - unsigned char *p = buf; - int i; - - for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) { - u32 random = prandom_u32_state(state); - int j; + u8 *ptr = buf; - for (j = 0; j < sizeof(u32); j++) { - p[i + j] = random; - random >>= BITS_PER_BYTE; - } + while (bytes >= sizeof(u32)) { + put_unaligned(prandom_u32_state(state), (u32 *) ptr); + ptr += sizeof(u32); + bytes -= sizeof(u32); } - if (i < bytes) { - u32 random = prandom_u32_state(state); - for (; i < bytes; i++) { - p[i] = random; - random >>= BITS_PER_BYTE; - } + if (bytes > 0) { + u32 rem = prandom_u32_state(state); + do { + *ptr++ = (u8) rem; + bytes--; + rem >>= BITS_PER_BYTE; + } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); @@ -126,7 +123,7 @@ EXPORT_SYMBOL(prandom_bytes_state); * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes */ -void prandom_bytes(void *buf, int bytes) +void prandom_bytes(void *buf, size_t bytes) { struct rnd_state *state = &get_cpu_var(net_rand_state); @@ -137,7 +134,7 @@ EXPORT_SYMBOL(prandom_bytes); static void prandom_warmup(struct rnd_state *state) { - /* Calling RNG ten times to satify recurrence condition */ + /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); @@ -152,7 +149,7 @@ static void prandom_warmup(struct rnd_state *state) static u32 __extract_hwseed(void) { - u32 val = 0; + unsigned int val = 0; (void)(arch_get_random_seed_int(&val) || arch_get_random_int(&val)); @@ -228,7 +225,7 @@ static void __prandom_timer(unsigned long dontcare) prandom_seed(entropy); /* reseed every ~60 seconds, in [40 .. 
80) interval with slack */ - expires = 40 + (prandom_u32() % 40); + expires = 40 + prandom_u32_max(40); seed_timer.expires = jiffies + msecs_to_jiffies(expires * MSEC_PER_SEC); add_timer(&seed_timer); -- cgit From 72b603ee8cfc6be587f301568d79ce38e7ed735d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Aug 2014 12:27:02 -0700 Subject: bpf: x86: add missing 'shift by register' instructions to x64 eBPF JIT 'shift by register' operations are supported by eBPF interpreter, but were accidently left out of x64 JIT compiler. Fix it and add a testcase. Reported-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 42 ++++++++++++++++++++++++++++++++++++++++++ lib/test_bpf.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) (limited to 'lib') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5c8cb8043c5a..b08a98c59530 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -515,6 +515,48 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_LSH | BPF_X: + case BPF_ALU64 | BPF_RSH | BPF_X: + case BPF_ALU64 | BPF_ARSH | BPF_X: + + /* check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); + dst_reg = AUX_REG; + } + + if (src_reg != BPF_REG_4) { /* common case */ + EMIT1(0x51); /* push rcx */ + + /* mov rcx, src_reg */ + EMIT_mov(BPF_REG_4, src_reg); + } + + /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */ + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_LSH: b3 = 0xE0; break; + case BPF_RSH: b3 = 0xE8; break; + case BPF_ARSH: b3 = 0xF8; break; + } + EMIT2(0xD3, add_1reg(b3, dst_reg)); + + if (src_reg != BPF_REG_4) + EMIT1(0x59); /* pop rcx */ + + if (insn->dst_reg == BPF_REG_4) + /* mov dst_reg, r11 */ + EMIT_mov(insn->dst_reg, AUX_REG); + break; + case BPF_ALU | BPF_END | BPF_FROM_BE: switch (imm32) { case 16: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 89e0345733bd..8c66c6aace04 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1341,6 +1341,44 @@ static struct bpf_test tests[] = { { }, { { 0, -1 } } }, + { + "INT: shifts by register", + .u.insns_int = { + BPF_MOV64_IMM(R0, -1234), + BPF_MOV64_IMM(R1, 1), + BPF_ALU32_REG(BPF_RSH, R0, R1), + BPF_JMP_IMM(BPF_JEQ, R0, 0x7ffffd97, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(R2, 1), + BPF_ALU64_REG(BPF_LSH, R0, R2), + BPF_MOV32_IMM(R4, -1234), + BPF_JMP_REG(BPF_JEQ, R0, R4, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_AND, R4, 63), + BPF_ALU64_REG(BPF_LSH, R0, R4), /* R0 <= 46 */ + BPF_MOV64_IMM(R3, 47), + BPF_ALU64_REG(BPF_ARSH, R0, R3), + BPF_JMP_IMM(BPF_JEQ, R0, -617, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(R2, 1), + BPF_ALU64_REG(BPF_LSH, R4, R2), /* R4 = 46 << 1 */ + BPF_JMP_IMM(BPF_JEQ, R4, 92, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(R4, 4), + BPF_ALU64_REG(BPF_LSH, R4, R4), /* R4 = 4 << 4 */ + BPF_JMP_IMM(BPF_JEQ, R4, 64, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(R4, 5), + BPF_ALU32_REG(BPF_LSH, R4, R4), /* R4 = 5 << 5 */ + BPF_JMP_IMM(BPF_JEQ, R4, 160, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(R0, -1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + 
{ { 0, -1 } } + }, { "INT: DIV + ABS", .u.insns_int = { -- cgit From 4d6923733f158e7f8f0695b43c30c22a59ec0a34 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 27 Aug 2014 11:19:26 -0400 Subject: ww-mutex: clarify help text for DEBUG_WW_MUTEX_SLOWPATH We really don't want distro's enabling this in their kernels. Try and make that more clear. Signed-off-by: Rob Clark Acked-by: Maarten Lankhorst Signed-off-by: Dave Airlie --- lib/Kconfig.debug | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 07c28323f88f..1b233fc67466 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -892,6 +892,10 @@ config DEBUG_WW_MUTEX_SLOWPATH the full mutex checks enabled with (CONFIG_PROVE_LOCKING) this will test all possible w/w mutex interface abuse with the exception of simply not acquiring all the required locks. + Note that this feature can introduce significant overhead, so + it really should not be enabled in a production or distro kernel, + even a debug kernel. If you are a driver writer, enable it. If + you are a distro, do not. config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" -- cgit From 0c38e1fe0fced6aa06dbf444f7203dd7f325e369 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 29 Aug 2014 15:18:35 -0700 Subject: lib: turn CONFIG_STACKTRACE into an actual option. I was puzzled why /proc/$$/stack had disappeared, until I figured out I had disabled the last debug option that did a 'select STACKTRACE'. This patch makes the option show up at config time, so it can be enabled without enabling any of the more heavyweight debug options. Signed-off-by: Dave Jones Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1b233fc67466..a28590083622 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1036,8 +1036,13 @@ config TRACE_IRQFLAGS either tracing or lock debugging. config STACKTRACE - bool + bool "Stack backtrace support" depends on STACKTRACE_SUPPORT + help + This option causes the kernel to create a /proc/pid/stack for + every process, showing its current stack trace. + It is also used by various kernel debugging features that require + stack trace generation. config DEBUG_KOBJECT bool "kobject debugging" -- cgit From 27419604f51a97d497853f14142c1059d46eb597 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Sep 2014 13:52:20 +0100 Subject: KEYS: Fix use-after-free in assoc_array_gc() An edit script should be considered inaccessible by a function once it has called assoc_array_apply_edit() or assoc_array_cancel_edit(). However, assoc_array_gc() is accessing the edit script just after the gc_complete: label. 
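Sketched, the unsafe and fixed orderings look like this (using the
function's existing 'array' argument, exactly as the patch below does):

  /* before: the edit script may already be freed once applied */
  assoc_array_apply_edit(edit);
  edit->array->nr_leaves_on_tree = nr_leaves_on_tree;

  /* after: only touch state reachable without the edit script */
  assoc_array_apply_edit(edit);
  array->nr_leaves_on_tree = nr_leaves_on_tree;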
Reported-by: Andreea-Cristina Bernat Signed-off-by: David Howells Reviewed-by: Andreea-Cristina Bernat cc: shemming@brocade.com cc: paulmck@linux.vnet.ibm.com Cc: stable@vger.kernel.org Signed-off-by: James Morris --- lib/assoc_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/assoc_array.c b/lib/assoc_array.c index c0b1007011e1..ae146f0734eb 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -1735,7 +1735,7 @@ ascend_old_tree: gc_complete: edit->set[0].to = new_root; assoc_array_apply_edit(edit); - edit->array->nr_leaves_on_tree = nr_leaves_on_tree; + array->nr_leaves_on_tree = nr_leaves_on_tree; return 0; enomem: -- cgit From ae82ddcf8e8239bdd06e6830d450cf9e785b8024 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 2 Sep 2014 00:26:05 +0200 Subject: rhashtable: fix lockdep splat in rhashtable_destroy() No need for rht_dereference() from rhashtable_destroy() since the existing callers don't hold the mutex when invoking this function from: 1) Netlink, this is called in case of memory allocation errors in the initialization path, no nl_sk_hash_lock is held. 2) Netfilter, this is called from the rcu callback, no nfnl_lock is held either. I think it's reasonable to assume that the caller has to make sure that no hash resizing may happen before releasing the bucket array. Therefore, the caller should be responsible for releasing this in a safe way, document this to make people aware of it. This resolves a rcu lockdep splat in nft_hash: =============================== [ INFO: suspicious RCU usage. ] 3.16.0+ #178 Not tainted ------------------------------- lib/rhashtable.c:596 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by ksoftirqd/2/18: #0: (rcu_callback){......}, at: [] rcu_process_callbacks+0x27e/0x4c7 stack backtrace: CPU: 2 PID: 18 Comm: ksoftirqd/2 Not tainted 3.16.0+ #178 Hardware name: LENOVO 23259H1/23259H1, BIOS G2ET32WW (1.12 ) 05/30/2012 0000000000000001 ffff88011706bb68 ffffffff8143debc 0000000000000000 ffff880117062610 ffff88011706bb98 ffffffff81077515 ffff8800ca041a50 0000000000000004 ffff8800ca386480 ffff8800ca041a00 ffff88011706bbb8 Call Trace: [] dump_stack+0x4e/0x68 [] lockdep_rcu_suspicious+0xfa/0x103 [] rhashtable_destroy+0x46/0x52 [] nft_hash_destroy+0x73/0x82 [nft_hash] Signed-off-by: Pablo Neira Ayuso Acked-by: Thomas Graf --- lib/rhashtable.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a2c78810ebc1..fc0dd8ee5c35 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -589,13 +589,13 @@ EXPORT_SYMBOL_GPL(rhashtable_init); * rhashtable_destroy - destroy hash table * @ht: the hash table to destroy * - * Frees the bucket array. + * Frees the bucket array. This function is not rcu safe, therefore the caller + * has to make sure that no resizing may happen by unpublishing the hashtable + * and waiting for the quiescent cycle before releasing the bucket array. 
*/ void rhashtable_destroy(const struct rhashtable *ht) { - const struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - - bucket_table_free(tbl); + bucket_table_free(ht->tbl); } EXPORT_SYMBOL_GPL(rhashtable_destroy); -- cgit From 940001762ac514810e305aab356983829e5fa82a Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 3 Sep 2014 09:22:36 +0800 Subject: lib/rhashtable: allow user to set the minimum shifts of shrinking Although the rhashtable library allows the user to specify a quite big size for their hash table, the table may be shrunk to a very small size - HASH_MIN_SIZE(4) - the first time an object is removed from the table. Subsequently, even if the total number of objects stored in the table stays well below the user's initial setting for a long time, the hash table size is still dynamically adjusted by rhashtable_shrink() or rhashtable_expand() each time an object is inserted or removed from the table. However, as synchronize_rcu() has to be called whenever the table is shrunk or expanded by the two functions, the user should be permitted to set the minimum table size, by configuring the minimum number of shifts according to their specific requirements, avoiding these expensive shrink and expand operations and their synchronize_rcu() calls. Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 ++ lib/rhashtable.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 36826c0166c5..fb298e9d6d3a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -44,6 +44,7 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding + * @min_shift: Minimum number of shifts while shrinking * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand @@ -57,6 +58,7 @@ struct rhashtable_params { size_t head_offset; u32 hash_rnd; size_t max_shift; + size_t min_shift; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a2c78810ebc1..8dfec3f26d4c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -298,7 +298,7 @@ int rhashtable_shrink(struct rhashtable *ht, gfp_t flags) ASSERT_RHT_MUTEX(ht); - if (tbl->size <= HASH_MIN_SIZE) + if (ht->shift <= ht->p.min_shift) return 0; ntbl = bucket_table_alloc(tbl->size / 2, flags); @@ -506,9 +506,10 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); -static size_t rounded_hashtable_size(unsigned int nelem) +static size_t rounded_hashtable_size(struct rhashtable_params *params) { - return max(roundup_pow_of_two(nelem * 4 / 3), HASH_MIN_SIZE); + return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + 1UL << params->min_shift); } /** @@ -566,8 +567,11 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) (!params->key_len && !params->obj_hashfn)) return -EINVAL; + params->min_shift = max_t(size_t, params->min_shift, + ilog2(HASH_MIN_SIZE)); + if (params->nelem_hint) - size = rounded_hashtable_size(params->nelem_hint); + size = rounded_hashtable_size(params); tbl = bucket_table_alloc(size, GFP_KERNEL); if (tbl == NULL) -- cgit From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17
From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and its exposure to user space on the way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to-be-interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason, e.g. caused by an attacker who got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can set up the image on pages made immutable in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup, bpf_prog cannot be altered during its lifetime. This prevents any modifications to the entire bpf_prog structure (incl. function/JIT image pointer). Every eBPF program (including migrated classic BPF programs) has to call bpf_prog_select_runtime() to select either the interpreter or a JIT image as a last setup step, and they are all freed via bpf_prog_free(), including non-JIT ones. Therefore, we can easily integrate this into the eBPF lifetime, plus since we directly allocate a bpf_prog, we have no performance penalty. Tested with seccomp and the test_bpf testsuite in JIT/non-JIT mode and by manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- arch/arm/net/bpf_jit_32.c | 3 +- arch/mips/net/bpf_jit.c | 3 +- arch/powerpc/net/bpf_jit_comp.c | 3 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 3 +- arch/x86/net/bpf_jit_comp.c | 18 ++++------ include/linux/filter.h | 49 ++++++++++++++++++++++--- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/seccomp.c | 7 ++-- lib/test_bpf.c | 2 +- net/core/filter.c | 6 ++-- 11 files changed, 144 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a37b989a2f91..a76623bcf722 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,5 +930,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 05a56619ece2..cfa83cf2447d 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1427,5 +1427,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3afa6f4c1957..40c53ff59124 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -697,5 +697,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 61e45b7c04d7..f2833c5b218a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -887,5 +887,5 @@ void bpf_jit_free(struct bpf_prog *fp) module_free(NULL, header); free_filter: - kfree(fp); + bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 1f76c22a6a75..f7a736b645e8 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,5 +812,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b08a98c59530..39ccfbb4a723 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -972,23 +972,17 @@ out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); module_free(NULL, header); - kfree(fp); -} -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. 
@@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void __bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..b54bb2c2e494 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,6 +22,7 @@ */ #include #include +#include #include /* Registers */ @@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_work_struct *ws; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); + if (ws == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->work = ws; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, 
fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->work from fp_old around in the new + * reallocated structure. + */ + fp_old->work = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->work); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_work_struct *ws; + + ws = container_of(work, struct bpf_work_struct, work); + bpf_jit_free(ws->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_work_struct *ws = fp->work; + + INIT_WORK(&ws->work, bpf_prog_free_deferred); + ws->prog = fp; + schedule_work(&ws->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..84922befea84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8c66c6aace04..9a67456ba29a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1836,7 +1836,7 @@ static struct bpf_prog *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(flen), 0); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..37f8eb06fdee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. 
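(To summarize the allocator API this patch introduces, a minimal sketch of the resulting program lifetime - error paths trimmed, and the insns/len inputs assumed to exist; this is not a complete caller:

	struct bpf_prog *fp;

	/* page-aligned, vmalloc'ed image plus the deferred-free handle */
	fp = bpf_prog_alloc(bpf_prog_size(len), 0);
	if (!fp)
		return -ENOMEM;

	fp->len = len;
	memcpy(fp->insnsi, insns, len * sizeof(struct bpf_insn));

	/* choose interpreter or JIT, then mark the whole image read-only */
	bpf_prog_select_runtime(fp);

	/* ... program runs; nothing may write the image anymore ... */

	/* deferred via workqueue: image is set writable again, then freed */
	bpf_prog_free(fp);

The remaining hunks below convert the existing call sites to exactly this pattern.)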
@@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; -- cgit From ebd8fef304f99da84d4a52ad056f6137ac9652d4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: make percpu_counters_lock irq-safe percpu_counter is scheduled to grow @gfp support to allow atomic initialization. This patch makes percpu_counters_lock irq-safe so that it can be safely used from atomic contexts. Signed-off-by: Tejun Heo --- lib/percpu_counter.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7dd33577b905..3fde78275cd1 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -115,6 +115,8 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { + unsigned long flags __maybe_unused; + raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; @@ -126,9 +128,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_add(&fbc->list, &percpu_counters); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } @@ -136,15 +138,17 @@ EXPORT_SYMBOL(__percpu_counter_init); void percpu_counter_destroy(struct percpu_counter *fbc) { + unsigned long flags __maybe_unused; + if (!fbc->counters) return; debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_del(&fbc->list); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc->counters); fbc->counters = NULL; @@ -173,7 +177,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; cpu = (unsigned long)hcpu; - spin_lock(&percpu_counters_lock); + spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; unsigned long flags; @@ -184,7 +188,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, *pcount = 0; raw_spin_unlock_irqrestore(&fbc->lock, flags); } - spin_unlock(&percpu_counters_lock); + spin_unlock_irq(&percpu_counters_lock); #endif return NOTIFY_OK; } -- cgit From 908c7f1949cb7cc6e92ba8f18f2998e87e265b8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: add @gfp to percpu_counter_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_counter_init() so that !GFP_KERNEL allocation masks can be used with percpu_counters too. We could have left percpu_counter_init() alone and added percpu_counter_init_gfp(); however, the number of users isn't that high and introducing _gfp variants to all percpu data structures would be quite ugly, so let's just do the conversion. This is the one with the most users. 
Other percpu data structures are a lot easier to convert. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Jan Kara Acked-by: "David S. Miller" Cc: x86@kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andrew Morton --- arch/x86/kvm/mmu.c | 2 +- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-tree.c | 2 +- fs/ext2/super.c | 6 +++--- fs/ext3/super.c | 6 +++--- fs/ext4/super.c | 14 +++++++++----- fs/file_table.c | 2 +- fs/quota/dquot.c | 2 +- fs/super.c | 3 ++- include/linux/percpu_counter.h | 10 ++++++---- include/net/dst_ops.h | 2 +- include/net/inet_frag.h | 2 +- lib/flex_proportions.c | 4 ++-- lib/percpu_counter.c | 4 ++-- lib/proportions.c | 6 +++--- mm/backing-dev.c | 2 +- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_memcontrol.c | 2 +- net/sctp/protocol.c | 2 +- 23 files changed, 49 insertions(+), 42 deletions(-) (limited to 'lib') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 931467881da7..5bd53f206f4f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4534,7 +4534,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto nomem; register_shrinker(&mmu_shrinker); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..61dae01788d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1180,7 +1180,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2193,13 +2193,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f362f9..94ec71eda86b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3493,7 +3493,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b88edc05c230..170dc41e8bf4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), 
GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5461e3..eba021b88440 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad154b9..e25ca8fdde7d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3891,7 +3891,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -4105,17 +4106,20 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); + ext4_count_dirs(sb), GFP_KERNEL); if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd31512a..0bab12b20460 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f2d0eee9d1f1..8b663b2d9562 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2725,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/super.c b/fs/super.c index b9a214d2fe98..1b836107acee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) 
goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d5dd4657c8d6..50e50095c8d1 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SMP @@ -26,14 +27,14 @@ struct percpu_counter { extern int percpu_counter_batch; -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key); -#define percpu_counter_init(fbc, value) \ +#define percpu_counter_init(fbc, value, gfp) \ ({ \ static struct lock_class_key __key; \ \ - __percpu_counter_init(fbc, value, &__key); \ + __percpu_counter_init(fbc, value, gfp, &__key); \ }) void percpu_counter_destroy(struct percpu_counter *fbc); @@ -89,7 +90,8 @@ struct percpu_counter { s64 count; }; -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, + gfp_t gfp) { fbc->count = amount; return 0; diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 2f26dfb8450e..1f99a1de0e4f 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val) static inline int dst_entries_init(struct dst_ops *dst) { - return percpu_counter_init(&dst->pcpuc_entries, 0); + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 65a8855e99fe..8d1765577acc 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) static inline void init_frag_mem_limit(struct netns_frags *nf) { - percpu_counter_init(&nf->mem, 0); + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); } static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index ebf3bac460b0..b9d026bfcf38 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -40,7 +40,7 @@ int fprop_global_init(struct fprop_global *p) p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... 
*/ - err = percpu_counter_init(&p->events, 1); + err = percpu_counter_init(&p->events, 1, GFP_KERNEL); if (err) return err; seqcount_init(&p->sequence); @@ -172,7 +172,7 @@ int fprop_local_init_percpu(struct fprop_local_percpu *pl) { int err; - err = percpu_counter_init(&pl->events, 0); + err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); if (err) return err; pl->period = 0; diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 3fde78275cd1..48144cdae819 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -112,7 +112,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) } EXPORT_SYMBOL(__percpu_counter_sum); -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { unsigned long flags __maybe_unused; @@ -120,7 +120,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; - fbc->counters = alloc_percpu(s32); + fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; diff --git a/lib/proportions.c b/lib/proportions.c index 05df84801b56..ca95f8d54384 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0); + err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0); + err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -193,7 +193,7 @@ int prop_local_init_percpu(struct prop_local_percpu *pl) raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0); + return percpu_counter_init(&pl->events, 0, GFP_KERNEL); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..f19a818be2d3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..d7ec93e25fa1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3196,7 +3196,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb225007c..d4bc55d3f107 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2993,7 +2993,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; diff --git a/net/dccp/proto.c 
b/net/dccp/proto.c index de2c1e719305..e421eddf67b4 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1115,7 +1115,7 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0); + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; rc = -ENOBUFS; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba2..d59c2604c247 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3188,8 +3188,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 3af522622fad..1d191357bf88 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) res_parent = &parent_cg->memory_allocated; res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834f4b95..f00a85a3fddd 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1341,7 +1341,7 @@ static __init int sctp_init(void) if (!sctp_chunk_cachep) goto err_chunk_cachep; - status = percpu_counter_init(&sctp_sockets_allocated, 0); + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; -- cgit From 20ae00792c6f1f18fc4fc5965445a145df92827e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: proportions: add @gfp to init functions Percpu allocator now supports allocation mask. Add @gfp to [flex_]proportions init functions so that !GFP_KERNEL allocation masks can be used with them too. This patch doesn't make any functional difference. 
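As a minimal, hypothetical illustration of what the new argument buys (obj->completions is made up for the example): an initializer running in a context that must not sleep - say, under a spinlock or from an RCU callback - can now pass a non-sleeping mask instead of being stuck with the implicit GFP_KERNEL:

	/* atomic context: GFP_KERNEL could sleep and is not allowed here */
	err = fprop_local_init_percpu(&obj->completions, GFP_NOWAIT);
	if (err)
		return err;

Existing callers, as the conversions elsewhere in this series show, simply pass GFP_KERNEL and keep their old behaviour.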
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Peter Zijlstra --- include/linux/flex_proportions.h | 5 +++-- include/linux/proportions.h | 5 +++-- lib/flex_proportions.c | 8 ++++---- lib/proportions.c | 10 +++++----- mm/backing-dev.c | 2 +- mm/page-writeback.c | 2 +- 6 files changed, 17 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 4ebc49fae391..0d348e011a6e 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * When maximum proportion of some event type is specified, this is the @@ -32,7 +33,7 @@ struct fprop_global { seqcount_t sequence; }; -int fprop_global_init(struct fprop_global *p); +int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); @@ -79,7 +80,7 @@ struct fprop_local_percpu { raw_spinlock_t lock; /* Protect period and numerator */ }; -int fprop_local_init_percpu(struct fprop_local_percpu *pl); +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 26a8a4ed9b07..00e8e8fa7358 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -12,6 +12,7 @@ #include #include #include +#include struct prop_global { /* @@ -40,7 +41,7 @@ struct prop_descriptor { struct mutex mutex; /* serialize the prop_global switch */ }; -int prop_descriptor_init(struct prop_descriptor *pd, int shift); +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); void prop_change_shift(struct prop_descriptor *pd, int new_shift); /* @@ -61,7 +62,7 @@ struct prop_local_percpu { raw_spinlock_t lock; /* protect the snapshot state */ }; -int prop_local_init_percpu(struct prop_local_percpu *pl); +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); void prop_local_destroy_percpu(struct prop_local_percpu *pl); void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index b9d026bfcf38..8f25652f40d4 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -34,13 +34,13 @@ */ #include -int fprop_global_init(struct fprop_global *p) +int fprop_global_init(struct fprop_global *p, gfp_t gfp) { int err; p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... 
*/ - err = percpu_counter_init(&p->events, 1, GFP_KERNEL); + err = percpu_counter_init(&p->events, 1, gfp); if (err) return err; seqcount_init(&p->sequence); @@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p, */ #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int fprop_local_init_percpu(struct fprop_local_percpu *pl) +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) { int err; - err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); + err = percpu_counter_init(&pl->events, 0, gfp); if (err) return err; pl->period = 0; diff --git a/lib/proportions.c b/lib/proportions.c index ca95f8d54384..6f724298f67a 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -73,7 +73,7 @@ #include #include -int prop_descriptor_init(struct prop_descriptor *pd, int shift) +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) { int err; @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int prop_local_init_percpu(struct prop_local_percpu *pl) +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) { raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0, GFP_KERNEL); + return percpu_counter_init(&pl->events, 0, gfp); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f19a818be2d3..64ec49d1772b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..508599403721 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** -- cgit From a34375ef9e65340a138fc0be287de5c940d260fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: percpu-refcount: add @gfp to percpu_ref_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_ref_init() so that !GFP_KERNEL allocation masks can be used with percpu_refs too. This patch doesn't make any functional difference. v2: blk-mq conversion was missing. Updated. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Benjamin LaHaise Cc: Li Zefan Cc: Nicholas A. 
Bellinger Cc: Jens Axboe --- block/blk-mq.c | 3 ++- drivers/target/target_core_tpg.c | 3 ++- fs/aio.c | 4 ++-- include/linux/percpu-refcount.h | 3 ++- kernel/cgroup.c | 6 +++--- lib/percpu-refcount.c | 6 ++++-- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..702df07b980d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1776,7 +1776,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, + GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae61222f..4ab6da338585 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,8 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index bd7ec2cc2674..93fbcc0f5696 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -666,10 +666,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf237cd8f..ee8325122dbd 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -49,6 +49,7 @@ #include #include #include +#include struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); @@ -66,7 +67,7 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release); + percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..589b4d89a0a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1628,7 +1628,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out; @@ -4487,7 +4487,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); if (err) goto err_free_css; @@ -4555,7 +4555,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a3342e960..ff9903264a91 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -40,6 +40,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * 
percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; * analagous to atomic_set(ref, 1). @@ -47,11 +48,12 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ -int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, + gfp_t gfp) { atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; -- cgit From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:17 -0700 Subject: net: filter: add "load 64-bit immediate" eBPF instruction Add the BPF_LD_IMM64 instruction to load a 64-bit immediate value into a register. All previous instructions were 8-byte; this is the first 16-byte instruction. Two consecutive 'struct bpf_insn' blocks are interpreted as a single instruction: insn[0].code = BPF_LD | BPF_DW | BPF_IMM insn[0].dst_reg = destination register insn[0].imm = lower 32-bit insn[1].code = 0 insn[1].imm = upper 32-bit All unused fields must be zero. Classic BPF has a similar instruction, BPF_LD | BPF_W | BPF_IMM, which loads a 32-bit immediate value into a register. x64 JITs it as a single 'movabsq %rax, imm64'; arm64 may JIT it as a sequence of four 'movk x0, #imm16, lsl #shift' insns. Note that old eBPF programs are binary compatible with the new interpreter. This helps eBPF programs load a 64-bit constant into a register with one instruction instead of using two registers and 4 instructions: BPF_MOV32_IMM(R1, imm32) BPF_ALU64_IMM(BPF_LSH, R1, 32) BPF_MOV32_IMM(R2, imm32) BPF_ALU64_REG(BPF_OR, R1, R2) User space generated programs will use this instruction to load constants only. To tell the kernel that user space needs a pointer, a _pseudo_ variant of this instruction may be added later, which will use extra bits of encoding to indicate what type of pointer user space is asking the kernel to provide. For example, the 'off' or 'src_reg' fields can be used for such a purpose: src_reg = 1 could mean that user space is asking the kernel to validate and load an in-kernel map pointer; src_reg = 2 could mean that user space needs a read-only data section pointer; src_reg = 3 could mean that user space needs a pointer to per-cpu local data. All such future pseudo instructions will not carry the actual pointer as part of the instruction, but rather will be treated as a request to the kernel to provide one. The kernel will verify the request for a pointer, then drop the _pseudo_ marking and store the actual internal pointer inside the instruction, so the end result is that the interpreter and JITs never see pseudo BPF_LD_IMM64 insns and only operate on the generic BPF_LD_IMM64 that loads a 64-bit immediate into a register. User space never operates on direct pointers, and the verifier can easily distinguish a request for a pointer from other instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- Documentation/networking/filter.txt | 8 +++++++- arch/x86/net/bpf_jit_comp.c | 17 +++++++++++++++++ include/linux/filter.h | 18 ++++++++++++++++++ kernel/bpf/core.c | 5 +++++ lib/test_bpf.c | 21 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index c48a9704bda8..81916ab5d96f 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -951,7 +951,7 @@ Size modifier is one of ... Mode modifier is one of: - BPF_IMM 0x00 /* classic BPF only, reserved in eBPF */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 BPF_IND 0x40 BPF_MEM 0x60 @@ -995,6 +995,12 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + Testing ------- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 39ccfbb4a723..06f8c17f5484 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -393,6 +393,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: diff --git a/include/linux/filter.h b/include/linux/filter.h index c78994593355..bf323da77950 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -166,6 +166,24 @@ enum { .off = 0, \ .imm = IMM }) +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b54bb2c2e494..2c2bfaacce66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -301,6 +302,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 
32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9a67456ba29a..413890815d3e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1735,6 +1735,27 @@ static struct bpf_test tests[] = { { }, { { 1, 0 } }, }, + { + "load 64-bit immediate", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x567800001234L), + BPF_MOV64_REG(R2, R1), + BPF_MOV64_REG(R3, R2), + BPF_ALU64_IMM(BPF_RSH, R2, 32), + BPF_ALU64_IMM(BPF_LSH, R3, 32), + BPF_ALU64_IMM(BPF_RSH, R3, 32), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_JMP_IMM(BPF_JEQ, R2, 0x5678, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, R3, 0x1234, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } } + }, }; static struct net_device dev; -- cgit From 95389b08d93d5c06ec63ab49bd732b0069b7c35e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 10 Sep 2014 22:22:00 +0100 Subject: KEYS: Fix termination condition in assoc array garbage collection This fixes CVE-2014-3631. It is possible for an associative array to end up with a shortcut node at the root of the tree if there are more than fan-out leaves in the tree, but they all crowd into the same slot in the lowest level (ie. they all have the same first nibble of their index keys). When assoc_array_gc() returns back up the tree after scanning some leaves, it can fall off of the root and crash because it assumes that the back pointer from a shortcut (after label ascend_old_tree) must point to a normal node - which isn't true of a shortcut node at the root. Should we find we're ascending rootwards over a shortcut, we should check to see if the backpointer is zero - and if it is, we have completed the scan. This particular bug cannot occur if the root node is not a shortcut - ie. if you have fewer than 17 keys in a keyring or if you have at least two keys that sit into separate slots (eg. a keyring and a non keyring). This can be reproduced by: ring=`keyctl newring bar @s` for ((i=1; i<=18; i++)); do last_key=`keyctl newring foo$i $ring`; done keyctl timeout $last_key 2 Doing this: echo 3 >/proc/sys/kernel/keys/gc_delay first will speed things up. 
If we do fall off of the top of the tree, we get the following oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] assoc_array_gc+0x2f7/0x540 PGD dae15067 PUD cfc24067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: xt_nat xt_mark nf_conntrack_netbios_ns nf_conntrack_broadcast ip6t_rpfilter ip6t_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_ni CPU: 0 PID: 26011 Comm: kworker/0:1 Not tainted 3.14.9-200.fc20.x86_64 #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: events key_garbage_collector task: ffff8800918bd580 ti: ffff8800aac14000 task.ti: ffff8800aac14000 RIP: 0010:[] [] assoc_array_gc+0x2f7/0x540 RSP: 0018:ffff8800aac15d40 EFLAGS: 00010206 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8800aaecacc0 RDX: ffff8800daecf440 RSI: 0000000000000001 RDI: ffff8800aadc2bc0 RBP: ffff8800aac15da8 R08: 0000000000000001 R09: 0000000000000003 R10: ffffffff8136ccc7 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000070 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000018 CR3: 00000000db10d000 CR4: 00000000000006f0 Stack: ffff8800aac15d50 0000000000000011 ffff8800aac15db8 ffffffff812e2a70 ffff880091a00600 0000000000000000 ffff8800aadc2bc3 00000000cd42c987 ffff88003702df20 ffff88003702dfa0 0000000053b65c09 ffff8800aac15fd8 Call Trace: [] ? keyring_detect_cycle_iterator+0x30/0x30 [] keyring_gc+0x75/0x80 [] key_garbage_collector+0x154/0x3c0 [] process_one_work+0x176/0x430 [] worker_thread+0x11b/0x3a0 [] ? rescuer_thread+0x3b0/0x3b0 [] kthread+0xd8/0xf0 [] ? insert_kthread_work+0x40/0x40 [] ret_from_fork+0x7c/0xb0 [] ? insert_kthread_work+0x40/0x40 Code: 08 4c 8b 22 0f 84 bf 00 00 00 41 83 c7 01 49 83 e4 fc 41 83 ff 0f 4c 89 65 c0 0f 8f 5a fe ff ff 48 8b 45 c0 4d 63 cf 49 83 c1 02 <4e> 8b 34 c8 4d 85 f6 0f 84 be 00 00 00 41 f6 c6 01 0f 84 92 RIP [] assoc_array_gc+0x2f7/0x540 RSP CR2: 0000000000000018 ---[ end trace 1129028a088c0cbd ]--- Signed-off-by: David Howells Acked-by: Don Zickus Signed-off-by: James Morris --- lib/assoc_array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/assoc_array.c b/lib/assoc_array.c index ae146f0734eb..2404d03e251a 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -1723,11 +1723,13 @@ ascend_old_tree: shortcut = assoc_array_ptr_to_shortcut(ptr); slot = shortcut->parent_slot; cursor = shortcut->back_pointer; + if (!cursor) + goto gc_complete; } else { slot = node->parent_slot; cursor = ptr; } - BUG_ON(!ptr); + BUG_ON(!cursor); node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; -- cgit From 72d931046030beb2d13dad6d205be0e228618432 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Sep 2014 11:14:53 -0700 Subject: Make ARCH_HAS_FAST_MULTIPLIER a real config variable It used to be an ad-hoc hack defined by the x86 version of that enabled a couple of library routines to know whether an integer multiply is faster than repeated shifts and additions. This just makes it use the real Kconfig system instead, and makes x86 (which was the only architecture that did this) select the option. NOTE! Even for x86, this really is kind of wrong. If we cared, we would probably not enable this for builds optimized for netburst (P4), where shifts-and-adds are generally faster than multiplies. 
This patch does *not* change that kind of logic, though, it is purely a syntactic change with no code changes. This was triggered by the fact that we have other places that really want to know "do I want to expand multiples by constants by hand or not", particularly the hash generation code. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/bitops.h | 2 -- lib/Kconfig | 3 +++ lib/hweight.c | 4 ++-- lib/string.c | 4 ++-- 5 files changed, 8 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 778178f4c7d1..36327438caf0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 def_bool y select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_FAST_MULTIPLIER select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index afcd35d331de..cfe3b954d5e4 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -497,8 +497,6 @@ static __always_inline int fls64(__u64 x) #include -#define ARCH_HAS_FAST_MULTIPLIER 1 - #include #include diff --git a/lib/Kconfig b/lib/Kconfig index a5ce0c7f6c30..54cf309a92a5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,6 +51,9 @@ config PERCPU_RWSEM config ARCH_USE_CMPXCHG_LOCKREF bool +config ARCH_HAS_FAST_MULTIPLIER + bool + config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/lib/hweight.c b/lib/hweight.c index b7d81ba143d1..9a5c1f221558 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -11,7 +11,7 @@ unsigned int __sw_hweight32(unsigned int w) { -#ifdef ARCH_HAS_FAST_MULTIPLIER +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x55555555; w = (w & 0x33333333) + ((w >> 2) & 0x33333333); w = (w + (w >> 4)) & 0x0f0f0f0f; @@ -49,7 +49,7 @@ unsigned long __sw_hweight64(__u64 w) return __sw_hweight32((unsigned int)(w >> 32)) + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 -#ifdef ARCH_HAS_FAST_MULTIPLIER +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul); w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful; diff --git a/lib/string.c b/lib/string.c index 992bf30af759..f3c6ff596414 100644 --- a/lib/string.c +++ b/lib/string.c @@ -807,9 +807,9 @@ void *memchr_inv(const void *start, int c, size_t bytes) return check_bytes8(start, value, bytes); value64 = value; -#if defined(ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 +#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 value64 *= 0x0101010101010101; -#elif defined(ARCH_HAS_FAST_MULTIPLIER) +#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) value64 *= 0x01010101; value64 |= value64 << 32; #else -- cgit From 53d91c5ce0cb8945b55e8bb54e551cabc51eb28d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Sep 2014 17:36:01 +0100 Subject: Provide a binary to hex conversion function Provide a function to convert a buffer of binary data into an unterminated ascii hex string representation of that data. 
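A quick usage sketch (hypothetical caller; note the output is not NUL-terminated, and the return value points just past the last digit written):

	u8 digest[4] = { 0xde, 0xad, 0xbe, 0xef };
	char buf[2 * sizeof(digest) + 1];
	char *p;

	p = bin2hex(buf, digest, sizeof(digest));
	*p = '\0';		/* terminate only if a C string is needed */
	/* buf now holds "deadbeef" */
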
Signed-off-by: David Howells Acked-by: Vivek Goyal --- include/linux/kernel.h | 1 + lib/hexdump.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'lib') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c52907a6d8b..89a0b8e5a952 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -500,6 +500,7 @@ static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); +extern char *bin2hex(char *dst, const void *src, size_t count); int mac_pton(const char *s, u8 *mac); diff --git a/lib/hexdump.c b/lib/hexdump.c index 8499c810909a..270773b91923 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -58,6 +58,22 @@ int hex2bin(u8 *dst, const char *src, size_t count) } EXPORT_SYMBOL(hex2bin); +/** + * bin2hex - convert binary data to an ascii hexadecimal string + * @dst: ascii hexadecimal result + * @src: binary data + * @count: binary data length + */ +char *bin2hex(char *dst, const void *src, size_t count) +{ + const unsigned char *_src = src; + + while (count--) + dst = hex_byte_pack(dst, *_src++); + return dst; +} +EXPORT_SYMBOL(bin2hex); + /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump -- cgit From 0d9e26329b0c9263d4d9e0422d80a0e73268c52f Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:19 +0100 Subject: sched: Add default-disabled option to BUG() when stack end location is overwritten Currently in the event of a stack overrun a call to schedule() does not check for this type of corruption. This corruption is often silent and can go unnoticed. However once the corrupted region is examined at a later stage, the outcome is undefined and often results in a sporadic page fault which cannot be handled. This patch checks for a stack overrun and takes appropriate action since the damage is already done, there is no point in continuing. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: rostedt@goodmis.org Cc: hannes@cmpxchg.org Cc: Alexei Starovoitov Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Dan Streetman Cc: Davidlohr Bueso Cc: David S. Miller Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1410527779-8133-4-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ lib/Kconfig.debug | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'lib') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b1ddebed54a..61ee2b327a27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2693,6 +2693,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. Otherwise whine diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a28590083622..e58163d69db1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -824,6 +824,18 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. 
+config SCHED_STACK_END_CHECK + bool "Detect stack corruption on calls to schedule()" + depends on DEBUG_KERNEL + default n + help + This option checks for a stack overrun on calls to schedule(). + If the stack end location is found to be over written always panic as + the content of the corrupted region can no longer be trusted. + This is to ensure no erroneous behaviour occurs which could result in + data corruption or a sporadic crash at a later stage once the region + is examined. The runtime overhead introduced is minimal. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS -- cgit From b3f2512ecdb3561ffa44737f370fdb78e1febf6b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 17 Sep 2014 21:07:19 +0200 Subject: lib: rhashtable: remove second linux/log2.h inclusion linux/log2.h was included twice. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- lib/rhashtable.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a2c78810ebc1..7b36e4d40ed7 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -23,7 +23,6 @@ #include #include #include -#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4UL -- cgit From 4843c3320c3d23ab4ecf520f5eaf485aff8c7252 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 20 Sep 2014 01:27:24 -0400 Subject: percpu-refcount: improve WARN messages percpu_ref's WARN messages can be a lot more helpful by indicating who's the culprit. Make them report the release function that the offending percpu-refcount is associated with. This should make it a lot easier to track down the reported invalid refcnting operations. Signed-off-by: Tejun Heo Cc: Kent Overstreet --- lib/percpu-refcount.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index ff9903264a91..70d28c91f35a 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -145,8 +145,9 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); - WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", - atomic_read(&ref->count)); + WARN_ONCE(atomic_read(&ref->count) <= 0, + "percpu ref (%pf) <= 0 (%i) after killed", + ref->release, atomic_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) @@ -178,7 +179,8 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once!\n"); + "percpu_ref_kill() called more than once on %pf!", + ref->release); ref->pcpu_count_ptr |= PCPU_REF_DEAD; ref->confirm_kill = confirm_kill; -- cgit From e625305b390790717cf2cccf61efb81299647028 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 20 Sep 2014 01:27:25 -0400 Subject: percpu-refcount: make percpu_ref based on longs instead of ints percpu_ref is currently based on ints and the number of refs it can cover is (1 << 31). This makes it impossible to use a percpu_ref to count memory objects or pages on 64bit machines as it may overflow. This forces those users to somehow aggregate the references before contributing to the percpu_ref which is often cumbersome and sometimes challenging to get the same level of performance as using the percpu_ref directly. 
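[Editorial illustration to make the limit concrete, assuming 4 KiB pages (an assumption, not taken from the patch): an atomic_t leaves roughly 2^31 usable references once the shutdown bias is reserved, and

	2^31 refs * 4 KiB/page = 2^43 bytes = 8 TiB

so counting pages one-to-one can already overflow on hosts with more memory than that, while an atomic_long_t on 64-bit raises the ceiling to about 2^63.]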
While using ints for the percpu counters makes them pack tighter on 64bit machines, the possible gain from using ints instead of longs is extremely small compared to the overall gain from per-cpu operation. This patch makes percpu_ref based on longs so that it can be used to directly count memory objects or pages. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Johannes Weiner --- include/linux/percpu-refcount.h | 24 ++++++++++++------------ lib/percpu-refcount.c | 37 +++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index ee8325122dbd..5df6784bd9d2 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -55,7 +55,7 @@ struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); struct percpu_ref { - atomic_t count; + atomic_long_t count; /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. @@ -97,7 +97,7 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) * branches as it can't assume that @ref->pcpu_count is not NULL. */ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, - unsigned __percpu **pcpu_countp) + unsigned long __percpu **pcpu_countp) { unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); @@ -107,7 +107,7 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) return false; - *pcpu_countp = (unsigned __percpu *)pcpu_ptr; + *pcpu_countp = (unsigned long __percpu *)pcpu_ptr; return true; } @@ -119,14 +119,14 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, */ static inline void percpu_ref_get(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; rcu_read_lock_sched(); if (__pcpu_ref_alive(ref, &pcpu_count)) this_cpu_inc(*pcpu_count); else - atomic_inc(&ref->count); + atomic_long_inc(&ref->count); rcu_read_unlock_sched(); } @@ -142,7 +142,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; int ret = false; rcu_read_lock_sched(); @@ -151,7 +151,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) this_cpu_inc(*pcpu_count); ret = true; } else { - ret = atomic_inc_not_zero(&ref->count); + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); @@ -175,7 +175,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; int ret = false; rcu_read_lock_sched(); @@ -199,13 +199,13 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) */ static inline void percpu_ref_put(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; rcu_read_lock_sched(); if (__pcpu_ref_alive(ref, &pcpu_count)) this_cpu_dec(*pcpu_count); - else if (unlikely(atomic_dec_and_test(&ref->count))) + else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); rcu_read_unlock_sched(); @@ -219,11 +219,11 @@ static inline void percpu_ref_put(struct percpu_ref *ref) */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; if (__pcpu_ref_alive(ref, &pcpu_count)) return false; 
- return !atomic_read(&ref->count); + return !atomic_long_read(&ref->count); } #endif diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 70d28c91f35a..559ee0b20318 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -25,15 +25,15 @@ * works. * * Converting to non percpu mode is done with some RCUish stuff in - * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t - * can't hit 0 before we've added up all the percpu refs. + * percpu_ref_kill. Additionally, we need a bias value so that the + * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ -#define PCPU_COUNT_BIAS (1U << 31) +#define PCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) -static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) +static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) { - return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + return (unsigned long __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); } /** @@ -43,7 +43,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analagous to atomic_set(ref, 1). + * analagous to atomic_long_set(ref, 1). * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). @@ -51,9 +51,9 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned long, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; @@ -75,13 +75,13 @@ EXPORT_SYMBOL_GPL(percpu_ref_init); */ void percpu_ref_reinit(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); int cpu; BUG_ON(!pcpu_count); WARN_ON(!percpu_ref_is_zero(ref)); - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); /* * Restore per-cpu operation. 
smp_store_release() is paired with @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); */ void percpu_ref_exit(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); if (pcpu_count) { free_percpu(pcpu_count); @@ -121,14 +121,15 @@ EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); - unsigned count = 0; + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(pcpu_count, cpu); - pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); + pr_debug("global %ld pcpu %ld", + atomic_long_read(&ref->count), (long)count); /* * It's crucial that we sum the percpu counters _before_ adding the sum @@ -143,11 +144,11 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * time is equivalent and saves us atomic operations: */ - atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); + atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); - WARN_ONCE(atomic_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%i) after killed", - ref->release, atomic_read(&ref->count)); + WARN_ONCE(atomic_long_read(&ref->count) <= 0, + "percpu ref (%pf) <= 0 (%ld) after killed", + ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) -- cgit From 25ee7327d04bc3ff41a7a5ac42d74226f8d60ac6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Sep 2014 13:53:51 -0700 Subject: net: bpf: fix compiler warnings in test_bpf old gcc 4.2 used by avr32 architecture produces warnings: lib/test_bpf.c:1741: warning: integer constant is too large for 'long' type lib/test_bpf.c:1741: warning: integer constant is too large for 'long' type lib/test_bpf.c: In function '__run_one': lib/test_bpf.c:1897: warning: 'ret' may be used uninitialized in this function silence these warnings. Fixes: 02ab695bb37e ("net: filter: add "load 64-bit immediate" eBPF instruction") Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- lib/test_bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 413890815d3e..23e070bcf72d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1738,7 +1738,7 @@ static struct bpf_test tests[] = { { "load 64-bit immediate", .u.insns_int = { - BPF_LD_IMM64(R1, 0x567800001234L), + BPF_LD_IMM64(R1, 0x567800001234LL), BPF_MOV64_REG(R2, R1), BPF_MOV64_REG(R3, R2), BPF_ALU64_IMM(BPF_RSH, R2, 32), @@ -1894,7 +1894,7 @@ static int __run_one(const struct bpf_prog *fp, const void *data, int runs, u64 *duration) { u64 start, finish; - int ret, i; + int ret = 0, i; start = ktime_to_us(ktime_get()); -- cgit From 0a30288da1aec914e158c2d7a3482a85f632750f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2014 15:24:32 -0400 Subject: blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe blk-mq uses percpu_ref for its usage counter which tracks the number of in-flight commands and used to synchronously drain the queue on freeze. percpu_ref shutdown takes measureable wallclock time as it involves a sched RCU grace period. This means that draining a blk-mq takes measureable wallclock time. 
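[Editorial sketch: condensed, the freeze path in question is the pattern below, as blk_mq_freeze_queue() appears in the diffs further down (not new code):

	percpu_ref_kill(&q->mq_usage_counter);	/* queues a sched-RCU callback */
	blk_mq_run_queues(q, false);
	/* cannot complete before a full sched-RCU grace period elapses
	 * and the kill callback has collected the percpu counts */
	wait_event(q->mq_freeze_wq,
		   percpu_ref_is_zero(&q->mq_usage_counter));

Each synchronous probe-and-teardown of a request_queue pays that grace-period latency once, which is why back-to-back probing of many non-existent LUNs adds up to seconds.]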
One would think that this shouldn't matter as queue shutdown should be a rare event which takes place asynchronously w.r.t. userland. Unfortunately, SCSI probing involves synchronously setting up and then tearing down a lot of request_queues back-to-back for non-existent LUNs. This means that SCSI probing may take more than ten seconds when scsi-mq is used. This will be properly fixed by implementing a mechanism to keep q->mq_usage_counter in atomic mode till genhd registration; however, that involves rather big updates to percpu_ref which is difficult to apply late in the devel cycle (v3.17-rc6 at the moment). As a stop-gap measure till the proper fix can be implemented in the next cycle, this patch introduces __percpu_ref_kill_expedited() and makes blk_mq_freeze_queue() use it. This is heavy-handed but should work for testing the experimental SCSI blk-mq implementation. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count") Cc: Kent Overstreet Cc: Jens Axboe Tested-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/block/blk-mq.c b/block/blk-mq.c index c88e6089746d..df8e1e09dd17 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,7 +119,16 @@ void blk_mq_freeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (freeze) { - percpu_ref_kill(&q->mq_usage_counter); + /* + * XXX: Temporary kludge to work around SCSI blk-mq stall. + * SCSI synchronously creates and destroys many queues + * back-to-back during probe leading to lengthy stalls. + * This will be fixed by keeping ->mq_usage_counter in + * atomic mode until genhd registration, but, for now, + * let's work around using expedited synchronization. + */ + __percpu_ref_kill_expedited(&q->mq_usage_counter); + blk_mq_run_queues(q, false); } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf237cd8f..ef5894ca8e50 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -71,6 +71,7 @@ void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void __percpu_ref_kill_expedited(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a3342e960..a89cf09a8268 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -184,3 +184,19 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/* + * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by + * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 + * devel cycle. Do not use anywhere else. 
+ */ +void __percpu_ref_kill_expedited(struct percpu_ref *ref) +{ + WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, + "percpu_ref_kill() called more than once on %pf!", + ref->release); + + ref->pcpu_count_ptr |= PCPU_REF_DEAD; + synchronize_sched_expedited(); + percpu_ref_kill_rcu(&ref->rcu); +} -- cgit From 9eca80461a45177e456219a9cd944c27675d6512 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:07:33 -0400 Subject: Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe" This reverts commit 0a30288da1aec914e158c2d7a3482a85f632750f, which was a temporary fix for SCSI blk-mq stall issue. The following patches will fix the issue properly by introducing atomic mode to percpu_ref. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig --- block/blk-mq.c | 11 +---------- include/linux/percpu-refcount.h | 1 - lib/percpu-refcount.c | 16 ---------------- 3 files changed, 1 insertion(+), 27 deletions(-) (limited to 'lib') diff --git a/block/blk-mq.c b/block/blk-mq.c index 255d79c14dc1..44a78ae3f899 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (freeze) { - /* - * XXX: Temporary kludge to work around SCSI blk-mq stall. - * SCSI synchronously creates and destroys many queues - * back-to-back during probe leading to lengthy stalls. - * This will be fixed by keeping ->mq_usage_counter in - * atomic mode until genhd registration, but, for now, - * let's work around using expedited synchronization. - */ - __percpu_ref_kill_expedited(&q->mq_usage_counter); - + percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_queues(q, false); } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 11b38ceca7e2..5df6784bd9d2 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -72,7 +72,6 @@ void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); -void __percpu_ref_kill_expedited(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index c6c31e2829b1..559ee0b20318 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -189,19 +189,3 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); - -/* - * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by - * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 - * devel cycle. Do not use anywhere else. - */ -void __percpu_ref_kill_expedited(struct percpu_ref *ref) -{ - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once on %pf!", - ref->release); - - ref->pcpu_count_ptr |= PCPU_REF_DEAD; - synchronize_sched_expedited(); - percpu_ref_kill_rcu(&ref->rcu); -} -- cgit From a2237370194484ee6aeeff04b617e4b14d178966 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: relocate percpu_ref_reinit() percpu_ref is gonna go through restructuring. Move percpu_ref_reinit() after percpu_ref_kill_and_confirm(). This will make later changes easier to follow and result in cleaner organization. 
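[For context, the lifecycle this ordering mirrors; an editorial sketch assembled from the API as it stands in these patches, with my_release a hypothetical callback and error handling omitted:

	percpu_ref_init(&ref, my_release, GFP_KERNEL);
	/* ... normal get/put operation in percpu mode ... */
	percpu_ref_kill(&ref);		/* drop the initial ref, collapse to atomic */
	/* ... wait until the refcount has reached zero ... */
	percpu_ref_reinit(&ref);	/* optionally: back to percpu mode, count 1 */
	/* ... or, once zero and no longer needed ... */
	percpu_ref_exit(&ref);

Placing reinit() after kill_and_confirm() in the source makes the file read in the same order the operations occur.]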
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet --- include/linux/percpu-refcount.h | 2 +- lib/percpu-refcount.c | 70 ++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 36 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 5df6784bd9d2..f015f139d491 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -68,10 +68,10 @@ struct percpu_ref { int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp); -void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 559ee0b20318..070dab5e7d77 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -62,41 +62,6 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, } EXPORT_SYMBOL_GPL(percpu_ref_init); -/** - * percpu_ref_reinit - re-initialize a percpu refcount - * @ref: perpcu_ref to re-initialize - * - * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully, killed - * and reached 0 but not exited. - * - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while - * this function is in progress. - */ -void percpu_ref_reinit(struct percpu_ref *ref) -{ - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); - int cpu; - - BUG_ON(!pcpu_count); - WARN_ON(!percpu_ref_is_zero(ref)); - - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); - - /* - * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following PCPU_REF_DEAD clearing. - */ - for_each_possible_cpu(cpu) - *per_cpu_ptr(pcpu_count, cpu) = 0; - - smp_store_release(&ref->pcpu_count_ptr, - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); -} -EXPORT_SYMBOL_GPL(percpu_ref_reinit); - /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit @@ -189,3 +154,38 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/** + * percpu_ref_reinit - re-initialize a percpu refcount + * @ref: perpcu_ref to re-initialize + * + * Re-initialize @ref so that it's in the same state as when it finished + * percpu_ref_init(). @ref must have been initialized successfully, killed + * and reached 0 but not exited. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. + */ +void percpu_ref_reinit(struct percpu_ref *ref) +{ + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + int cpu; + + BUG_ON(!pcpu_count); + WARN_ON(!percpu_ref_is_zero(ref)); + + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + + /* + * Restore per-cpu operation. smp_store_release() is paired with + * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees + * that the zeroing is visible to all percpu accesses which can see + * the following PCPU_REF_DEAD clearing. 
+ */ + for_each_possible_cpu(cpu) + *per_cpu_ptr(pcpu_count, cpu) = 0; + + smp_store_release(&ref->pcpu_count_ptr, + ref->pcpu_count_ptr & ~PCPU_REF_DEAD); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 6251f9976af7656b6970a8820153f356430f5de2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: minor code and comment updates * Some comments became stale. Updated. * percpu_ref_tryget() unnecessarily initializes @ret. Removed. * A blank line removed from percpu_ref_kill_rcu(). * Explicit function name in a WARN format string replaced with __func__. * WARN_ON() in percpu_ref_reinit() converted to WARN_ON_ONCE(). Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet --- include/linux/percpu-refcount.h | 25 ++++++++++++++++--------- lib/percpu-refcount.c | 14 ++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index f015f139d491..d44b027f74fd 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -115,8 +115,10 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * - * Analagous to atomic_inc(). - */ + * Analagous to atomic_long_inc(). + * + * This function is safe to call as long as @ref is between init and exit. + */ static inline void percpu_ref_get(struct percpu_ref *ref) { unsigned long __percpu *pcpu_count; @@ -138,12 +140,12 @@ static inline void percpu_ref_get(struct percpu_ref *ref) * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { unsigned long __percpu *pcpu_count; - int ret = false; + int ret; rcu_read_lock_sched(); @@ -166,12 +168,13 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * - * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget - * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be - * used. After the confirm_kill callback is invoked, it's guaranteed that - * no new reference will be given out by percpu_ref_tryget(). + * Completion of percpu_ref_kill() in itself doesn't guarantee that this + * function will fail. For such guarantee, percpu_ref_kill_and_confirm() + * should be used. After the confirm_kill callback is invoked, it's + * guaranteed that no new reference will be given out by + * percpu_ref_tryget_live(). * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { @@ -196,6 +199,8 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) + * + * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { @@ -216,6 +221,8 @@ static inline void percpu_ref_put(struct percpu_ref *ref) * @ref: percpu_ref to test * * Returns %true if @ref reached zero. 
+ * + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 070dab5e7d77..8ef3f5c20df6 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -108,7 +108,6 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ - atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, @@ -120,8 +119,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) ref->confirm_kill(ref); /* - * Now we're in single atomic_t mode with a consistent refcount, so it's - * safe to drop our initial ref: + * Now we're in single atomic_long_t mode with a consistent + * refcount, so it's safe to drop our initial ref: */ percpu_ref_put(ref); } @@ -134,8 +133,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs - all further - * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() - * for more details. + * invocations of percpu_ref_tryget_live() will fail. See + * percpu_ref_tryget_live() for more details. * * Due to the way percpu_ref is implemented, @confirm_kill will be called * after at least one full RCU grace period has passed but this is an @@ -145,8 +144,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once on %pf!", - ref->release); + "%s called more than once on %pf!", __func__, ref->release); ref->pcpu_count_ptr |= PCPU_REF_DEAD; ref->confirm_kill = confirm_kill; @@ -172,7 +170,7 @@ void percpu_ref_reinit(struct percpu_ref *ref) int cpu; BUG_ON(!pcpu_count); - WARN_ON(!percpu_ref_is_zero(ref)); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); -- cgit From eecc16ba9a49b05dd847a317af166a6728eb56ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: replace pcpu_ prefix with percpu_ percpu_ref uses pcpu_ prefix for internal stuff and percpu_ for externally visible ones. This is the same convention used in the percpu allocator implementation. It works fine there but percpu_ref doesn't have too much internal-only stuff and scattered usages of pcpu_ prefix are confusing than helpful. This patch replaces all pcpu_ prefixes with percpu_. This is pure rename and there's no functional change. Note that PCPU_REF_DEAD is renamed to __PERCPU_REF_DEAD to signify that the flag is internal. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet --- include/linux/percpu-refcount.h | 46 ++++++++++++++++----------------- lib/percpu-refcount.c | 56 +++++++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 50 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index d44b027f74fd..3d463a39e0f7 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -13,7 +13,7 @@ * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see - * percpu_ref_kill()/PCPU_COUNT_BIAS. 
+ * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() @@ -60,7 +60,7 @@ struct percpu_ref { * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ - unsigned long pcpu_count_ptr; + unsigned long percpu_count_ptr; percpu_ref_func_t *release; percpu_ref_func_t *confirm_kill; struct rcu_head rcu; @@ -88,26 +88,26 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) return percpu_ref_kill_and_confirm(ref, NULL); } -#define PCPU_REF_DEAD 1 +#define __PERCPU_REF_DEAD 1 /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional - * branches as it can't assume that @ref->pcpu_count is not NULL. + * branches as it can't assume that @ref->percpu_count is not NULL. */ -static inline bool __pcpu_ref_alive(struct percpu_ref *ref, - unsigned long __percpu **pcpu_countp) +static inline bool __percpu_ref_alive(struct percpu_ref *ref, + unsigned long __percpu **percpu_countp) { - unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); + unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_DEAD)) return false; - *pcpu_countp = (unsigned long __percpu *)pcpu_ptr; + *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } @@ -121,12 +121,12 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, */ static inline void percpu_ref_get(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) + this_cpu_inc(*percpu_count); else atomic_long_inc(&ref->count); @@ -144,13 +144,13 @@ static inline void percpu_ref_get(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; int ret; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; } else { ret = atomic_long_inc_not_zero(&ref->count); @@ -178,13 +178,13 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; int ret = false; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; } @@ -204,12 +204,12 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) */ static inline void percpu_ref_put(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_dec(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) + this_cpu_dec(*percpu_count); else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); @@ -226,9 +226,9 @@ static inline void 
percpu_ref_put(struct percpu_ref *ref) */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; - if (__pcpu_ref_alive(ref, &pcpu_count)) + if (__percpu_ref_alive(ref, &percpu_count)) return false; return !atomic_long_read(&ref->count); } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8ef3f5c20df6..5aea6b7356c7 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -11,8 +11,8 @@ * percpu counters will all sum to the correct value * * (More precisely: because moduler arithmatic is commutative the sum of all the - * pcpu_count vars will be equal to what it would have been if all the gets and - * puts were done to a single integer, even if some of the percpu integers + * percpu_count vars will be equal to what it would have been if all the gets + * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect @@ -29,11 +29,12 @@ * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ -#define PCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) -static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) +static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { - return (unsigned long __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + return (unsigned long __percpu *) + (ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); } /** @@ -51,10 +52,11 @@ static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned long, gfp); - if (!ref->pcpu_count_ptr) + ref->percpu_count_ptr = + (unsigned long)alloc_percpu_gfp(unsigned long, gfp); + if (!ref->percpu_count_ptr) return -ENOMEM; ref->release = release; @@ -74,11 +76,11 @@ EXPORT_SYMBOL_GPL(percpu_ref_init); */ void percpu_ref_exit(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); - if (pcpu_count) { - free_percpu(pcpu_count); - ref->pcpu_count_ptr = PCPU_REF_DEAD; + if (percpu_count) { + free_percpu(percpu_count); + ref->percpu_count_ptr = __PERCPU_REF_DEAD; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -86,14 +88,14 @@ EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) - count += *per_cpu_ptr(pcpu_count, cpu); + count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %ld pcpu %ld", + pr_debug("global %ld percpu %ld", atomic_long_read(&ref->count), (long)count); /* @@ -108,7 +110,7 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * reaching 0 before we add the percpu counts. 
But doing it at the same * time is equivalent and saves us atomic operations: */ - atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); + atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, "percpu ref (%pf) <= 0 (%ld) after killed", @@ -143,10 +145,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); - ref->pcpu_count_ptr |= PCPU_REF_DEAD; + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; ref->confirm_kill = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); @@ -166,24 +168,24 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; - BUG_ON(!pcpu_count); + BUG_ON(!percpu_count); WARN_ON_ONCE(!percpu_ref_is_zero(ref)); - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); /* * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following PCPU_REF_DEAD clearing. + * smp_read_barrier_depends() in __percpu_ref_alive() and + * guarantees that the zeroing is visible to all percpu accesses + * which can see the following __PERCPU_REF_DEAD clearing. */ for_each_possible_cpu(cpu) - *per_cpu_ptr(pcpu_count, cpu) = 0; + *per_cpu_ptr(percpu_count, cpu) = 0; - smp_store_release(&ref->pcpu_count_ptr, - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + smp_store_release(&ref->percpu_count_ptr, + ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 9e804d1f58da1eca079f796347c1cf1d1df564e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch percpu_ref will be restructured so that percpu/atomic mode switching and reference killing are dedoupled. In preparation, do the following renames. * percpu_ref->confirm_kill -> percpu_ref->confirm_switch * __PERCPU_REF_DEAD -> __PERCPU_REF_ATOMIC * __percpu_ref_alive() -> __ref_is_percpu() This patch is pure rename and doesn't introduce any functional changes. 
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet --- include/linux/percpu-refcount.h | 25 ++++++++++++++----------- lib/percpu-refcount.c | 22 +++++++++++----------- 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3d463a39e0f7..910e5f72055d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -54,6 +54,11 @@ struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); +/* flags set in the lower bits of percpu_ref->percpu_count_ptr */ +enum { + __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ +}; + struct percpu_ref { atomic_long_t count; /* @@ -62,7 +67,7 @@ struct percpu_ref { */ unsigned long percpu_count_ptr; percpu_ref_func_t *release; - percpu_ref_func_t *confirm_kill; + percpu_ref_func_t *confirm_switch; struct rcu_head rcu; }; @@ -88,23 +93,21 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) return percpu_ref_kill_and_confirm(ref, NULL); } -#define __PERCPU_REF_DEAD 1 - /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. */ -static inline bool __percpu_ref_alive(struct percpu_ref *ref, - unsigned long __percpu **percpu_countp) +static inline bool __ref_is_percpu(struct percpu_ref *ref, + unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; @@ -125,7 +128,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) this_cpu_inc(*percpu_count); else atomic_long_inc(&ref->count); @@ -149,7 +152,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) { + if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; } else { @@ -183,7 +186,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) { + if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; } @@ -208,7 +211,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) this_cpu_dec(*percpu_count); else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); @@ -228,7 +231,7 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) return false; return !atomic_long_read(&ref->count); } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 5aea6b7356c7..7aef590c1ef8 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -34,7 +34,7 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) - (ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); + (ref->percpu_count_ptr & 
~__PERCPU_REF_ATOMIC); } /** @@ -80,7 +80,7 @@ void percpu_ref_exit(struct percpu_ref *ref) if (percpu_count) { free_percpu(percpu_count); - ref->percpu_count_ptr = __PERCPU_REF_DEAD; + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -117,8 +117,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ - if (ref->confirm_kill) - ref->confirm_kill(ref); + if (ref->confirm_switch) + ref->confirm_switch(ref); /* * Now we're in single atomic_long_t mode with a consistent @@ -145,11 +145,11 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC, "%s called more than once on %pf!", __func__, ref->release); - ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - ref->confirm_kill = confirm_kill; + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + ref->confirm_switch = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } @@ -178,14 +178,14 @@ void percpu_ref_reinit(struct percpu_ref *ref) /* * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __percpu_ref_alive() and - * guarantees that the zeroing is visible to all percpu accesses - * which can see the following __PERCPU_REF_DEAD clearing. + * smp_read_barrier_depends() in __ref_is_percpu() and guarantees + * that the zeroing is visible to all percpu accesses which can see + * the following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 27344a9017cdaff82a167827da3001a0918afdc3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: add PCPU_REF_DEAD percpu_ref will be restructured so that percpu/atomic mode switching and reference killing are dedoupled. In preparation, add PCPU_REF_DEAD and PCPU_REF_ATOMIC_DEAD which is OR of ATOMIC and DEAD. For now, ATOMIC and DEAD are changed together and all PCPU_REF_ATOMIC uses are converted to PCPU_REF_ATOMIC_DEAD without causing any behavior changes. percpu_ref_init() now specifies an explicit alignment when allocating the percpu counters so that the pointer has enough unused low bits to accomodate the flags. Note that one flag was fine as min alignment for percpu memory is 2 bytes but two flags are already too many for the natural alignment of unsigned longs on archs like cris and m68k. v2: The original patch had BUILD_BUG_ON() which triggers if unsigned long's alignment isn't enough to accomodate the flags, which triggered on cris and m64k. percpu_ref_init() updated to specify the required alignment explicitly. Reported by Fengguang. 
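[Editorial sketch of the constraint; __PERCPU_REF_FLAG_MASK is a name made up here for illustration, the patch masks with ~__PERCPU_REF_ATOMIC_DEAD directly. With __PERCPU_REF_FLAG_BITS == 2, the percpu allocation must be aligned to at least 1 << 2 = 4 bytes so both flags fit in otherwise-zero low bits of the pointer:

	#define __PERCPU_REF_FLAG_MASK	((1UL << __PERCPU_REF_FLAG_BITS) - 1)

	/* pack: valid only because the low two bits of the pointer are zero */
	ref->percpu_count_ptr = (unsigned long)percpu_ptr | __PERCPU_REF_ATOMIC;

	/* unpack: clear the flag bits to recover the real percpu pointer */
	percpu_count = (unsigned long __percpu *)
			(ref->percpu_count_ptr & ~__PERCPU_REF_FLAG_MASK);

This is why percpu_ref_init() now requests max(1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)) explicitly rather than relying on the natural alignment of unsigned long.]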
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: kbuild test robot --- include/linux/percpu-refcount.h | 6 +++++- lib/percpu-refcount.c | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 910e5f72055d..bd9483d390b4 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -57,6 +57,10 @@ typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ + __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ + __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, + + __PERCPU_REF_FLAG_BITS = 2, }; struct percpu_ref { @@ -107,7 +111,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7aef590c1ef8..e2ff19f970cf 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -34,7 +34,7 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) - (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** @@ -52,10 +52,13 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { + size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, + __alignof__(unsigned long)); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); - ref->percpu_count_ptr = - (unsigned long)alloc_percpu_gfp(unsigned long, gfp); + ref->percpu_count_ptr = (unsigned long) + __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; @@ -80,7 +83,7 @@ void percpu_ref_exit(struct percpu_ref *ref) if (percpu_count) { free_percpu(percpu_count); - ref->percpu_count_ptr = __PERCPU_REF_ATOMIC; + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -145,10 +148,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC_DEAD, "%s called more than once on %pf!", __func__, ref->release); - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC_DEAD; ref->confirm_switch = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); @@ -180,12 +183,12 @@ void percpu_ref_reinit(struct percpu_ref *ref) * Restore per-cpu operation. smp_store_release() is paired with * smp_read_barrier_depends() in __ref_is_percpu() and guarantees * that the zeroing is visible to all percpu accesses which can see - * the following __PERCPU_REF_ATOMIC clearing. + * the following __PERCPU_REF_ATOMIC_DEAD clearing. 
*/ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 490c79a65708873228cf114cf00e32c204e4e907 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: decouple switching to atomic mode and killing percpu_ref has treated the dropping of the base reference and switching to atomic mode as an integral operation; however, there's nothing inherent tying the two together. The use cases for percpu_ref have been expanding continuously. While the current init/kill/reinit/exit model can cover a lot, the coupling of kill/reinit with atomic/percpu mode switching is turning out to be too restrictive for use cases where many percpu_refs are created and destroyed back-to-back with only some of them reaching extended operation. The coupling also makes implementing always-atomic debug mode difficult. This patch separates out atomic mode switching into percpu_ref_switch_to_atomic() and reimplements percpu_ref_kill_and_confirm() on top of it. * The handling of __PERCPU_REF_ATOMIC and __PERCPU_REF_DEAD is now differentiated. Among get/put operations, percpu_ref_tryget_live() is the only one which cares about DEAD. * percpu_ref_switch_to_atomic() can be called multiple times on the same ref. This means that multiple @confirm_switch may get queued up which we can't do reliably without extra memory area. This is handled by making the later invocation synchronously wait for the completion of the previous one. This isn't particularly desirable but such synchronous waits shouldn't happen in most cases. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner --- include/linux/percpu-refcount.h | 8 ++- lib/percpu-refcount.c | 141 +++++++++++++++++++++++++++++++--------- 2 files changed, 116 insertions(+), 33 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index bd9483d390b4..d1252e1335e8 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -78,9 +78,11 @@ struct percpu_ref { int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch); +void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); -void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref @@ -111,7 +113,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; @@ -193,6 +195,8 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; + } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index e2ff19f970cf..6e0d14366c5d 100644 --- a/lib/percpu-refcount.c +++ 
b/lib/percpu-refcount.c @@ -1,6 +1,8 @@ #define pr_fmt(fmt) "%s: " fmt "\n", __func__ #include +#include +#include #include /* @@ -31,6 +33,8 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); + static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) @@ -88,7 +92,19 @@ void percpu_ref_exit(struct percpu_ref *ref) } EXPORT_SYMBOL_GPL(percpu_ref_exit); -static void percpu_ref_kill_rcu(struct rcu_head *rcu) +static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) +{ + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); + + ref->confirm_switch(ref); + ref->confirm_switch = NULL; + wake_up_all(&percpu_ref_switch_waitq); + + /* drop ref from percpu_ref_switch_to_atomic() */ + percpu_ref_put(ref); +} + +static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); unsigned long __percpu *percpu_count = percpu_count_ptr(ref); @@ -116,47 +132,79 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%ld) after killed", + "percpu ref (%pf) <= 0 (%ld) after switching to atomic", ref->release, atomic_long_read(&ref->count)); - /* @ref is viewed as dead on all CPUs, send out kill confirmation */ - if (ref->confirm_switch) - ref->confirm_switch(ref); + /* @ref is viewed as dead on all CPUs, send out switch confirmation */ + percpu_ref_call_confirm_rcu(rcu); +} - /* - * Now we're in single atomic_long_t mode with a consistent - * refcount, so it's safe to drop our initial ref: - */ - percpu_ref_put(ref); +static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) +{ +} + +static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that + * switching is in progress. Use noop one if unspecified. + */ + WARN_ON_ONCE(ref->confirm_switch); + ref->confirm_switch = + confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); + } else if (confirm_switch) { + /* + * Somebody already set ATOMIC. Switching may still be in + * progress. @confirm_switch must be invoked after the + * switching is complete and a full sched RCU grace period + * has passed. Wait synchronously for the previous + * switching and schedule @confirm_switch invocation. + */ + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + ref->confirm_switch = confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + } } /** - * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation - * @ref: percpu_ref to kill - * @confirm_kill: optional confirmation callback + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback * - * Equivalent to percpu_ref_kill() but also schedules kill confirmation if - * @confirm_kill is not NULL. 
@confirm_kill, which may not block, will be - * called after @ref is seen as dead from all CPUs - all further - * invocations of percpu_ref_tryget_live() will fail. See - * percpu_ref_tryget_live() for more details. + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. * - * Due to the way percpu_ref is implemented, @confirm_kill will be called - * after at least one full RCU grace period has passed but this is an - * implementation detail and callers must not depend on it. + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode. In such cases, @confirm_switch + * will be invoked after the switching is complete. + * + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. */ -void percpu_ref_kill_and_confirm(struct percpu_ref *ref, - percpu_ref_func_t *confirm_kill) +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC_DEAD, - "%s called more than once on %pf!", __func__, ref->release); - - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC_DEAD; - ref->confirm_switch = confirm_kill; - - call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); + __percpu_ref_switch_to_atomic(ref, confirm_switch); } -EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_reinit - re-initialize a percpu refcount @@ -192,3 +240,34 @@ void percpu_ref_reinit(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); + +/** + * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation + * @ref: percpu_ref to kill + * @confirm_kill: optional confirmation callback + * + * Equivalent to percpu_ref_kill() but also schedules kill confirmation if + * @confirm_kill is not NULL. @confirm_kill, which may not block, will be + * called after @ref is seen as dead from all CPUs at which point all + * further invocations of percpu_ref_tryget_live() will fail. See + * percpu_ref_tryget_live() for details. + * + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode by percpu_ref_switch_atomic(). + * + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. 
+ */ +void percpu_ref_kill_and_confirm(struct percpu_ref *ref, + percpu_ref_func_t *confirm_kill) +{ + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, + "%s called more than once on %pf!", __func__, ref->release); + + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + __percpu_ref_switch_to_atomic(ref, confirm_kill); + percpu_ref_put(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); -- cgit From f47ad45784611297b699f3dffb6c7222b76afe64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: decouple switching to percpu mode and reinit percpu_ref has treated the dropping of the base reference and switching to atomic mode as an integral operation; however, there's nothing inherent tying the two together. The use cases for percpu_ref have been expanding continuously. While the current init/kill/reinit/exit model can cover a lot, the coupling of kill/reinit with atomic/percpu mode switching is turning out to be too restrictive for use cases where many percpu_refs are created and destroyed back-to-back with only some of them reaching extended operation. The coupling also makes implementing always-atomic debug mode difficult. This patch separates out percpu mode switching into percpu_ref_switch_to_percpu() and reimplements percpu_ref_reinit() on top of it. * DEAD still requires ATOMIC. A dead ref can't be switched to percpu mode w/o going through reinit. v2: __percpu_ref_switch_to_percpu() was missing static. Fixed. Reported by Fengguang aka kbuild test robot. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner Cc: kbuild test robot --- include/linux/percpu-refcount.h | 3 +- lib/percpu-refcount.c | 73 ++++++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index d1252e1335e8..cd7e20f0fe47 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -80,9 +80,10 @@ int __must_check percpu_ref_init(struct percpu_ref *ref, void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); -void percpu_ref_reinit(struct percpu_ref *ref); +void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 6e0d14366c5d..5a6d43baccc5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -206,40 +206,54 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, __percpu_ref_switch_to_atomic(ref, confirm_switch); } -/** - * percpu_ref_reinit - re-initialize a percpu refcount - * @ref: perpcu_ref to re-initialize - * - * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully, killed - * and reached 0 but not exited. - * - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while - * this function is in progress. 
- */ -void percpu_ref_reinit(struct percpu_ref *ref) +static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); - atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) + return; + + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* * Restore per-cpu operation. smp_store_release() is paired with * smp_read_barrier_depends() in __ref_is_percpu() and guarantees * that the zeroing is visible to all percpu accesses which can see - * the following __PERCPU_REF_ATOMIC_DEAD clearing. + * the following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); +} + +/** + * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode + * @ref: percpu_ref to switch to percpu mode + * + * There's no reason to use this function for the usual reference counting. + * To re-use an expired ref, use percpu_ref_reinit(). + * + * Switch @ref to percpu mode. This function may be invoked concurrently + * with all the get/put operations and can safely be mixed with kill and + * reinit operations. + * + * This function normally doesn't block and can be called from any context + * but it may block if @ref is in the process of switching to atomic mode + * by percpu_ref_switch_atomic(). + */ +void percpu_ref_switch_to_percpu(struct percpu_ref *ref) +{ + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ + if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_percpu(ref); } -EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation @@ -253,8 +267,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode by percpu_ref_switch_atomic(). + * but it may block if @confirm_kill is specified and @ref is in the + * process of switching to atomic mode by percpu_ref_switch_atomic(). * * Due to the way percpu_ref is implemented, @confirm_switch will be called * after at least one full sched RCU grace period has passed but this is an @@ -271,3 +285,24 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_put(ref); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/** + * percpu_ref_reinit - re-initialize a percpu refcount + * @ref: perpcu_ref to re-initialize + * + * Re-initialize @ref so that it's in the same state as when it finished + * percpu_ref_init(). @ref must have been initialized successfully and + * reached 0 but not exited. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. 
+ */ +void percpu_ref_reinit(struct percpu_ref *ref) +{ + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + + ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; + percpu_ref_get(ref); + __percpu_ref_switch_to_percpu(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 2aad2a86f6685c10360ec8a5a55eb9ab7059cb72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: percpu_ref: add PERCPU_REF_INIT_* flags With the recent addition of percpu_ref_reinit(), percpu_ref now can be used as a persistent switch which can be turned on and off repeatedly where turning off maps to killing the ref and waiting for it to drain; however, there currently isn't a way to initialize a percpu_ref in its off (killed and drained) state, which can be inconvenient for certain persistent switch use cases. Similarly, percpu_ref_switch_to_atomic/percpu() allow dynamic selection of operation mode; however, currently a newly initialized percpu_ref is always in percpu mode making it impossible to avoid the latency overhead of switching to atomic mode. This patch adds @flags to percpu_ref_init() and implements the following flags. * PERCPU_REF_INIT_ATOMIC : start ref in atomic mode * PERCPU_REF_INIT_DEAD : start ref killed and drained These flags should be able to serve the above two use cases. v2: target_core_tpg.c conversion was missing. Fixed. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner --- block/blk-mq.c | 2 +- drivers/target/target_core_tpg.c | 2 +- fs/aio.c | 4 ++-- include/linux/percpu-refcount.h | 18 +++++++++++++++++- kernel/cgroup.c | 7 ++++--- lib/percpu-refcount.c | 23 ++++++++++++++++++----- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/block/blk-mq.c b/block/blk-mq.c index 44a78ae3f899..d85fe01c44ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1796,7 +1796,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) goto err_hctxs; if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - GFP_KERNEL)) + 0, GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 4ab6da338585..be783f717f19 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,7 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index 8d217ed04e6e..84a751005f5b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index cd7e20f0fe47..b0293f268cd2 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -63,6 +63,21 @@ enum { __PERCPU_REF_FLAG_BITS = 2, }; +/* @flags for percpu_ref_init() */ +enum { + /* + * Start w/ ref == 1 in atomic mode. 
Can be switched to percpu + * operation using percpu_ref_switch_to_percpu(). + */ + PERCPU_REF_INIT_ATOMIC = 1 << 0, + + /* + * Start dead w/ ref == 0 in atomic mode. Must be revived with + * percpu_ref_reinit() before use. Implies INIT_ATOMIC. + */ + PERCPU_REF_INIT_DEAD = 1 << 1, +}; + struct percpu_ref { atomic_long_t count; /* @@ -76,7 +91,8 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release, gfp_t gfp); + percpu_ref_func_t *release, unsigned int flags, + gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a99d504294de..753df01a9831 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1634,7 +1634,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -4510,7 +4511,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4583,7 +4584,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 5a6d43baccc5..ed280fb1e5b5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -45,27 +45,40 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * - * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analogous to atomic_long_set(ref, 1). + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a + * refcount of 1; analogous to atomic_long_set(ref, 1). See the + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). 
 */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, - gfp_t gfp) + unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); - - atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); + unsigned long start_count = 0; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + else + start_count += PERCPU_COUNT_BIAS; + + if (flags & PERCPU_REF_INIT_DEAD) + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + else + start_count++; + + atomic_long_set(&ref->count, start_count); + ref->release = release; return 0; } -- cgit From 1cae13e75b7a7848c03138636d4eb8d8a5054dd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky Currently, a percpu_ref which is initialized with PERCPU_REF_INIT_ATOMIC or switched to atomic mode via switch_to_atomic() automatically reverts to percpu mode on the first percpu_ref_reinit(). This makes the atomic mode difficult to use for cases where a percpu_ref is used as a persistent on/off switch which may be cycled multiple times. This patch makes such atomic state sticky so that it survives through kill/reinit cycles. After this patch, atomic state is cleared only by an explicit percpu_ref_switch_to_percpu() call. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner --- include/linux/percpu-refcount.h | 5 ++++- lib/percpu-refcount.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b0293f268cd2..00c01fc6d88c 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -67,7 +67,9 @@ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu - * operation using percpu_ref_switch_to_percpu(). + * operation using percpu_ref_switch_to_percpu(). If initialized + * with this flag, the ref will stay in atomic mode until + * percpu_ref_switch_to_percpu() is invoked on it. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, @@ -87,6 +89,7 @@ struct percpu_ref { unsigned long percpu_count_ptr; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; + bool force_atomic:1; struct rcu_head rcu; }; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index ed280fb1e5b5..6111bcb28376 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -67,6 +67,8 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, if (!ref->percpu_count_ptr) return -ENOMEM; + ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; else @@ -202,7 +204,8 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit - * operations. + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. 
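
(As an aside, a minimal sketch of the persistent on/off switch that the INIT flags and this sticky atomic mode are aimed at; 'gate' and gate_release() are illustrative assumptions, not code from these patches:

	struct gate {
		struct percpu_ref ref;
	};

	static void gate_release(struct percpu_ref *ref)
	{
		/* final reference dropped after a kill */
	}

	/* start off: atomic, killed and drained, ref == 0 */
	ret = percpu_ref_init(&gate->ref, gate_release,
			      PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD,
			      GFP_KERNEL);
	if (ret)
		return ret;

	percpu_ref_reinit(&gate->ref);	/* on: ref == 1, stays atomic */
	percpu_ref_kill(&gate->ref);	/* off: gate_release() once drained */
	percpu_ref_reinit(&gate->ref);	/* on again (after drain), still atomic */

Only an explicit percpu_ref_switch_to_percpu() clears the sticky atomic state.)
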
* * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is already in @@ -216,6 +219,7 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { + ref->force_atomic = true; __percpu_ref_switch_to_atomic(ref, confirm_switch); } @@ -255,7 +259,10 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and - * reinit operations. + * reinit operations. This function reverses the sticky atomic state set + * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is + * dying or dead, the actual switching takes place on the following + * percpu_ref_reinit(). * * This function normally doesn't block and can be called from any context * but it may block if @ref is in the process of switching to atomic mode @@ -263,6 +270,8 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + ref->force_atomic = false; + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) __percpu_ref_switch_to_percpu(ref); @@ -304,8 +313,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully and - * reached 0 but not exited. + * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been + * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. @@ -316,6 +325,7 @@ void percpu_ref_reinit(struct percpu_ref *ref) ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - __percpu_ref_switch_to_percpu(ref); + if (!ref->force_atomic) + __percpu_ref_switch_to_percpu(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit From 6f3aabd183984120acb59f5d57a45cfa7455be9a Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 25 Sep 2014 16:05:25 -0700 Subject: genalloc: fix device node resource counter Decrement the np_pool device_node refcount, which was incremented on the preceding of_parse_phandle() call. Signed-off-by: Vladimir Zapolskiy Cc: Will Deacon Cc: Olof Johansson Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/genalloc.c b/lib/genalloc.c index bdb9a456bcbb..38d2db82228c 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -588,6 +588,7 @@ struct gen_pool *of_get_named_gen_pool(struct device_node *np, if (!np_pool) return NULL; pdev = of_find_device_by_node(np_pool); + of_node_put(np_pool); if (!pdev) return NULL; return dev_get_gen_pool(&pdev->dev); -- cgit From 3c731eba48e1b0650decfc91a839b80f0e05ce8f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:07 -0700 Subject: bpf: mini eBPF library, test stubs and verifier testsuite 1. 
the library includes a trivial set of BPF syscall wrappers: int bpf_create_map(int key_size, int value_size, int max_entries); int bpf_update_elem(int fd, void *key, void *value); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); int bpf_get_next_key(int fd, void *key, void *next_key); int bpf_prog_load(enum bpf_prog_type prog_type, const struct sock_filter_int *insns, int insn_len, const char *license); bpf_prog_load() stores verifier log into global bpf_log_buf[] array and BPF_*() macros to build instructions 2. test stubs configure eBPF infra with 'unspec' map and program types. These are fake types used by user space testsuite only. 3. verifier tests valid and invalid programs and expects predefined error log messages from kernel. 40 tests so far. $ sudo ./test_verifier #0 add+sub+mul OK #1 unreachable OK #2 unreachable2 OK #3 out of range jump OK #4 out of range jump2 OK #5 test1 ld_imm64 OK ... Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 4 + kernel/bpf/test_stub.c | 116 ++++++++++ lib/Kconfig.debug | 3 +- samples/bpf/Makefile | 12 + samples/bpf/libbpf.c | 94 ++++++++ samples/bpf/libbpf.h | 172 ++++++++++++++ samples/bpf/test_verifier.c | 548 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 948 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/test_stub.c create mode 100644 samples/bpf/Makefile create mode 100644 samples/bpf/libbpf.c create mode 100644 samples/bpf/libbpf.h create mode 100644 samples/bpf/test_verifier.c (limited to 'lib') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3c726b0995b7..45427239f375 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1,5 @@ obj-y := core.o syscall.o verifier.o + +ifdef CONFIG_TEST_BPF +obj-y += test_stub.o +endif diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c new file mode 100644 index 000000000000..fcaddff4003e --- /dev/null +++ b/kernel/bpf/test_stub.c @@ -0,0 +1,116 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include + +/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC + * to be used by user space verifier testsuite + */ +struct bpf_context { + u64 arg1; + u64 arg2; +}; + +static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +static struct bpf_func_proto test_funcs[] = { + [BPF_FUNC_unspec] = { + .func = test_func, + .gpl_only = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + }, +}; + +static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) +{ + if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) + return NULL; + return &test_funcs[func_id]; +} + +static const struct bpf_context_access { + int size; + enum bpf_access_type type; +} test_ctx_access[] = { + [offsetof(struct bpf_context, arg1)] = { + FIELD_SIZEOF(struct bpf_context, arg1), + BPF_READ + }, + [offsetof(struct bpf_context, arg2)] = { + FIELD_SIZEOF(struct bpf_context, arg2), + BPF_READ + }, +}; + +static bool test_is_valid_access(int off, int size, enum bpf_access_type type) +{ + const struct bpf_context_access *access; + + if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) + return false; + + access = &test_ctx_access[off]; + if (access->size == size && (access->type & type)) + return true; + + return false; +} + +static struct bpf_verifier_ops test_ops = { + .get_func_proto = test_func_proto, + .is_valid_access = test_is_valid_access, +}; + +static struct bpf_prog_type_list tl_prog = { + .ops = &test_ops, + .type = BPF_PROG_TYPE_UNSPEC, +}; + +static struct bpf_map *test_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + map = kzalloc(sizeof(*map), GFP_USER); + if (!map) + return ERR_PTR(-ENOMEM); + + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + return map; +} + +static void test_map_free(struct bpf_map *map) +{ + kfree(map); +} + +static struct bpf_map_ops test_map_ops = { + .map_alloc = test_map_alloc, + .map_free = test_map_free, +}; + +static struct bpf_map_type_list tl_map = { + .ops = &test_map_ops, + .type = BPF_MAP_TYPE_UNSPEC, +}; + +static int __init register_test_ops(void) +{ + bpf_register_map_type(&tl_map); + bpf_register_prog_type(&tl_prog); + return 0; +} +late_initcall(register_test_ops); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a28590083622..3ac43f34437b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1672,7 +1672,8 @@ config TEST_BPF against the BPF interpreter or BPF JIT compiler depending on the current setting. This is in particular useful for BPF JIT compiler development, but also to run regression tests against changes in - the interpreter code. + the interpreter code. It also enables test stubs for eBPF maps and + verifier used by user space verifier testsuite. If unsure, say N. diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile new file mode 100644 index 000000000000..634391797856 --- /dev/null +++ b/samples/bpf/Makefile @@ -0,0 +1,12 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. 
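
(To make the syscall wrappers and instruction macros above concrete, a user-space sketch in the spirit of test_verifier; the file name and error handling are illustrative, not part of this patch, and it assumes a kernel built with the CONFIG_TEST_BPF stubs:

	/* sketch.c - push a trivial program through the verifier */
	#include <stdio.h>
	#include <linux/bpf.h>	/* enum bpf_prog_type, struct bpf_insn, BPF_REG_0 */
	#include "libbpf.h"

	int main(void)
	{
		struct bpf_insn prog[] = {
			BPF_MOV64_IMM(BPF_REG_0, 0),	/* r0 = 0 */
			BPF_EXIT_INSN(),		/* return r0 */
		};
		int fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
				       sizeof(prog), "GPL");

		if (fd < 0) {
			printf("verifier rejected prog:\n%s", bpf_log_buf);
			return 1;
		}
		return 0;
	}

On rejection the verifier's reasoning lands in bpf_log_buf, which is what the testsuite matches its expected error strings against.)
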
+obj- := dummy.o + +# List of programs to build +hostprogs-y := test_verifier + +test_verifier-objs := test_verifier.o libbpf.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) + +HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c new file mode 100644 index 000000000000..ff6504420738 --- /dev/null +++ b/samples/bpf/libbpf.c @@ -0,0 +1,94 @@ +/* eBPF mini library */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries) +{ + union bpf_attr attr = { + .map_type = map_type, + .key_size = key_size, + .value_size = value_size, + .max_entries = max_entries + }; + + return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + +int bpf_update_elem(int fd, void *key, void *value) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + }; + + return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + +int bpf_lookup_elem(int fd, void *key, void *value) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + }; + + return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + +int bpf_delete_elem(int fd, void *key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + }; + + return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} + +int bpf_get_next_key(int fd, void *key, void *next_key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .next_key = ptr_to_u64(next_key), + }; + + return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + +#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) + +char bpf_log_buf[LOG_BUF_SIZE]; + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int prog_len, + const char *license) +{ + union bpf_attr attr = { + .prog_type = prog_type, + .insns = ptr_to_u64((void *) insns), + .insn_cnt = prog_len / sizeof(struct bpf_insn), + .license = ptr_to_u64((void *) license), + .log_buf = ptr_to_u64(bpf_log_buf), + .log_size = LOG_BUF_SIZE, + .log_level = 1, + }; + + bpf_log_buf[0] = 0; + + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); +} diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h new file mode 100644 index 000000000000..8a31babeca5d --- /dev/null +++ b/samples/bpf/libbpf.h @@ -0,0 +1,172 @@ +/* eBPF mini library */ +#ifndef __LIBBPF_H +#define __LIBBPF_H + +struct bpf_insn; + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries); +int bpf_update_elem(int fd, void *key, void *value); +int bpf_lookup_elem(int fd, void *key, void *value); +int bpf_delete_elem(int fd, void *key); +int bpf_get_next_key(int fd, void *key, void *next_key); + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int insn_len, + const char *license); + +#define LOG_BUF_SIZE 8192 +extern char bpf_log_buf[LOG_BUF_SIZE]; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + 
.src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c new file mode 100644 index 000000000000..d10992e2740e --- /dev/null +++ b/samples/bpf/test_verifier.c @@ -0,0 +1,548 @@ +/* + * Testsuite for eBPF verifier + * + * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * + * This program is free 
software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" + +#define MAX_INSNS 512 +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +struct bpf_test { + const char *descr; + struct bpf_insn insns[MAX_INSNS]; + int fixup[32]; + const char *errstr; + enum { + ACCEPT, + REJECT + } result; +}; + +static struct bpf_test tests[] = { + { + "add+sub+mul", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2), + BPF_MOV64_IMM(BPF_REG_2, 3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "unreachable", + .insns = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "unreachable2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "out of range jump", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "out of range jump2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -2), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "test1 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test2 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test3 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test4 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test5 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "no bpf_exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "loop (back-edge)", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "loop2 (back-edge)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JA, 0, 0, -4), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "conditional loop", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3), + 
BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "read uninitialized register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "read invalid register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "program doesn't init R0 before exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + }, + { + "stack out of bounds", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack", + .result = REJECT, + }, + { + "invalid call insn1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid call insn2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid function call", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567), + BPF_EXIT_INSN(), + }, + .errstr = "invalid func 1234567", + .result = REJECT, + }, + { + "uninitialized stack1", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_EXIT_INSN(), + }, + .fixup = {2}, + .errstr = "invalid indirect read from stack", + .result = REJECT, + }, + { + "uninitialized stack2", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid read from stack", + .result = REJECT, + }, + { + "check valid spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* fill it back into R2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), + + /* should be able to access R0 = *(R2 + 8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check corrupted spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* mess up with R1 pointer on stack */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23), + + /* fill back into R0 should fail */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + + BPF_EXIT_INSN(), + }, + .errstr = "corrupted spill", + .result = REJECT, + }, + { + "invalid src register in STX", + .insns = { + BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "invalid dst register in STX", + .insns = { + BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid dst register in ST", + .insns = { + BPF_ST_MEM(BPF_B, 14, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid src register in LDX", + .insns = { + BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R12 is invalid", + .result = REJECT, + }, + { + "invalid dst register in LDX", + .insns = { + BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R11 is invalid", + .result = REJECT, + }, + { + "junk insn", + .insns = { + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM", + .result = REJECT, 
+ }, + { + "junk insn2", + .insns = { + BPF_RAW_INSN(1, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_LDX uses reserved fields", + .result = REJECT, + }, + { + "junk insn3", + .insns = { + BPF_RAW_INSN(-1, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_ALU opcode f0", + .result = REJECT, + }, + { + "junk insn4", + .insns = { + BPF_RAW_INSN(-1, -1, -1, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_ALU opcode f0", + .result = REJECT, + }, + { + "junk insn5", + .insns = { + BPF_RAW_INSN(0x7f, -1, -1, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_ALU uses reserved fields", + .result = REJECT, + }, + { + "misaligned read from stack", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "invalid map_fd for function call", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_EXIT_INSN(), + }, + .errstr = "fd 0 is not pointing to valid bpf_map", + .result = REJECT, + }, + { + "don't check return value before access", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "R0 invalid mem access 'map_value_or_null'", + .result = REJECT, + }, + { + "access memory with incorrect alignment", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "sometimes access memory with incorrect alignment", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "R0 invalid mem access", + .result = REJECT, + }, +}; + +static int probe_filter_length(struct bpf_insn *fp) +{ + int len = 0; + + for (len = MAX_INSNS - 1; len > 0; --len) + if (fp[len].code != 0 || fp[len].imm != 0) + break; + + return len + 1; +} + +static int create_map(void) +{ + long long key, value = 0; + int map_fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + } + + return map_fd; +} + +static int test(void) +{ + int prog_fd, i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct bpf_insn *prog = tests[i].insns; + int prog_len = probe_filter_length(prog); + int *fixup = tests[i].fixup; + int map_fd = -1; + + if (*fixup) { + map_fd = create_map(); + + do { + prog[*fixup].imm = map_fd; + fixup++; + } while (*fixup); + } + printf("#%d %s ", i, tests[i].descr); + + prog_fd = 
bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog, + prog_len * sizeof(struct bpf_insn), + "GPL"); + + if (tests[i].result == ACCEPT) { + if (prog_fd < 0) { + printf("FAIL\nfailed to load prog '%s'\n", + strerror(errno)); + printf("%s", bpf_log_buf); + goto fail; + } + } else { + if (prog_fd >= 0) { + printf("FAIL\nunexpected success to load\n"); + printf("%s", bpf_log_buf); + goto fail; + } + if (strstr(bpf_log_buf, tests[i].errstr) == 0) { + printf("FAIL\nunexpected error message: %s", + bpf_log_buf); + goto fail; + } + } + + printf("OK\n"); +fail: + if (map_fd >= 0) + close(map_fd); + close(prog_fd); + + } + + return 0; +} + +int main(void) +{ + return test(); +} -- cgit From af958a38a60c7ca3d8a39c918c1baa2ff7b6b233 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 27 Sep 2014 12:31:36 +0200 Subject: Revert "lzo: properly check for overruns" This reverts commit 206a81c ("lzo: properly check for overruns"). As analysed by Willem Pinckaers, this fix is still incomplete on certain rare corner cases, and it is easier to restart from the original code. Reported-by: Willem Pinckaers Cc: "Don A. Bailey" Cc: stable Signed-off-by: Willy Tarreau Signed-off-by: Greg Kroah-Hartman --- lib/lzo/lzo1x_decompress_safe.c | 62 ++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) (limited to 'lib') diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c index 8563081e8da3..569985d522d5 100644 --- a/lib/lzo/lzo1x_decompress_safe.c +++ b/lib/lzo/lzo1x_decompress_safe.c @@ -19,31 +19,11 @@ #include #include "lzodefs.h" -#define HAVE_IP(t, x) \ - (((size_t)(ip_end - ip) >= (size_t)(t + x)) && \ - (((t + x) >= t) && ((t + x) >= x))) - -#define HAVE_OP(t, x) \ - (((size_t)(op_end - op) >= (size_t)(t + x)) && \ - (((t + x) >= t) && ((t + x) >= x))) - -#define NEED_IP(t, x) \ - do { \ - if (!HAVE_IP(t, x)) \ - goto input_overrun; \ - } while (0) - -#define NEED_OP(t, x) \ - do { \ - if (!HAVE_OP(t, x)) \ - goto output_overrun; \ - } while (0) - -#define TEST_LB(m_pos) \ - do { \ - if ((m_pos) < out) \ - goto lookbehind_overrun; \ - } while (0) +#define HAVE_IP(x) ((size_t)(ip_end - ip) >= (size_t)(x)) +#define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x)) +#define NEED_IP(x) if (!HAVE_IP(x)) goto input_overrun +#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun +#define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len) @@ -78,14 +58,14 @@ int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, while (unlikely(*ip == 0)) { t += 255; ip++; - NEED_IP(1, 0); + NEED_IP(1); } t += 15 + *ip++; } t += 3; copy_literal_run: #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - if (likely(HAVE_IP(t, 15) && HAVE_OP(t, 15))) { + if (likely(HAVE_IP(t + 15) && HAVE_OP(t + 15))) { const unsigned char *ie = ip + t; unsigned char *oe = op + t; do { @@ -101,8 +81,8 @@ copy_literal_run: } else #endif { - NEED_OP(t, 0); - NEED_IP(t, 3); + NEED_OP(t); + NEED_IP(t + 3); do { *op++ = *ip++; } while (--t > 0); @@ -115,7 +95,7 @@ copy_literal_run: m_pos -= t >> 2; m_pos -= *ip++ << 2; TEST_LB(m_pos); - NEED_OP(2, 0); + NEED_OP(2); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; @@ -139,10 +119,10 @@ copy_literal_run: while (unlikely(*ip == 0)) { t += 255; ip++; - NEED_IP(1, 0); + NEED_IP(1); } t += 31 + *ip++; - NEED_IP(2, 0); + NEED_IP(2); } m_pos = op - 1; next = get_unaligned_le16(ip); @@ -157,10 +137,10 @@ copy_literal_run: while (unlikely(*ip == 0)) { 
t += 255; ip++; - NEED_IP(1, 0); + NEED_IP(1); } t += 7 + *ip++; - NEED_IP(2, 0); + NEED_IP(2); } next = get_unaligned_le16(ip); ip += 2; @@ -174,7 +154,7 @@ copy_literal_run: #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) if (op - m_pos >= 8) { unsigned char *oe = op + t; - if (likely(HAVE_OP(t, 15))) { + if (likely(HAVE_OP(t + 15))) { do { COPY8(op, m_pos); op += 8; @@ -184,7 +164,7 @@ copy_literal_run: m_pos += 8; } while (op < oe); op = oe; - if (HAVE_IP(6, 0)) { + if (HAVE_IP(6)) { state = next; COPY4(op, ip); op += next; @@ -192,7 +172,7 @@ copy_literal_run: continue; } } else { - NEED_OP(t, 0); + NEED_OP(t); do { *op++ = *m_pos++; } while (op < oe); @@ -201,7 +181,7 @@ copy_literal_run: #endif { unsigned char *oe = op + t; - NEED_OP(t, 0); + NEED_OP(t); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; @@ -214,15 +194,15 @@ match_next: state = next; t = next; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - if (likely(HAVE_IP(6, 0) && HAVE_OP(4, 0))) { + if (likely(HAVE_IP(6) && HAVE_OP(4))) { COPY4(op, ip); op += t; ip += t; } else #endif { - NEED_IP(t, 3); - NEED_OP(t, 0); + NEED_IP(t + 3); + NEED_OP(t); while (t > 0) { *op++ = *ip++; t--; -- cgit From 72cf90124e87d975d0b2114d930808c58b4c05e4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 27 Sep 2014 12:31:37 +0200 Subject: lzo: check for length overrun in variable length encoding. This fix ensures that we never meet an integer overflow while adding 255 while parsing a variable length encoding. It works differently from commit 206a81c ("lzo: properly check for overruns") because instead of ensuring that we don't overrun the input, which is tricky to guarantee due to many assumptions in the code, it simply checks that the cumulated number of 255 read cannot overflow by bounding this number. The MAX_255_COUNT is the maximum number of times we can add 255 to a base count without overflowing an integer. The multiply will overflow when multiplying 255 by more than MAXINT/255. The sum will overflow earlier depending on the base count. Since the base count is taken from a u8 and a few bits, it is safe to assume that it will always be lower than or equal to 2*255, thus we can always prevent any overflow by accepting two less 255 steps. This patch also reduces the CPU overhead and actually increases performance by 1.1% compared to the initial code, while the previous fix costs 3.1% (measured on x86_64). The fix needs to be backported to all currently supported stable kernels. Reported-by: Willem Pinckaers Cc: "Don A. Bailey" Cc: stable Signed-off-by: Willy Tarreau Signed-off-by: Greg Kroah-Hartman --- lib/lzo/lzo1x_decompress_safe.c | 43 +++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c index 569985d522d5..a1c387f6afba 100644 --- a/lib/lzo/lzo1x_decompress_safe.c +++ b/lib/lzo/lzo1x_decompress_safe.c @@ -25,6 +25,16 @@ #define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun #define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun +/* This MAX_255_COUNT is the maximum number of times we can add 255 to a base + * count without overflowing an integer. The multiply will overflow when + * multiplying 255 by more than MAXINT/255. The sum will overflow earlier + * depending on the base count. 
Since the base count is taken from a u8 + * and a few bits, it is safe to assume that it will always be lower than + * or equal to 2*255, thus we can always prevent any overflow by accepting + * two less 255 steps. See Documentation/lzo.txt for more information. + */ +#define MAX_255_COUNT ((((size_t)~0) / 255) - 2) + int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len) { @@ -55,12 +65,19 @@ int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, if (t < 16) { if (likely(state == 0)) { if (unlikely(t == 0)) { + size_t offset; + const unsigned char *ip_last = ip; + while (unlikely(*ip == 0)) { - t += 255; ip++; NEED_IP(1); } - t += 15 + *ip++; + offset = ip - ip_last; + if (unlikely(offset > MAX_255_COUNT)) + return LZO_E_ERROR; + + offset = (offset << 8) - offset; + t += offset + 15 + *ip++; } t += 3; copy_literal_run: @@ -116,12 +133,19 @@ copy_literal_run: } else if (t >= 32) { t = (t & 31) + (3 - 1); if (unlikely(t == 2)) { + size_t offset; + const unsigned char *ip_last = ip; + while (unlikely(*ip == 0)) { - t += 255; ip++; NEED_IP(1); } - t += 31 + *ip++; + offset = ip - ip_last; + if (unlikely(offset > MAX_255_COUNT)) + return LZO_E_ERROR; + + offset = (offset << 8) - offset; + t += offset + 31 + *ip++; NEED_IP(2); } m_pos = op - 1; @@ -134,12 +158,19 @@ copy_literal_run: m_pos -= (t & 8) << 11; t = (t & 7) + (3 - 1); if (unlikely(t == 2)) { + size_t offset; + const unsigned char *ip_last = ip; + while (unlikely(*ip == 0)) { - t += 255; ip++; NEED_IP(1); } - t += 7 + *ip++; + offset = ip - ip_last; + if (unlikely(offset > MAX_255_COUNT)) + return LZO_E_ERROR; + + offset = (offset << 8) - offset; + t += offset + 7 + *ip++; NEED_IP(2); } next = get_unaligned_le16(ip); -- cgit From 8acd91e8620836a56ff62028ed28ba629f2881a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2014 15:26:00 +0200 Subject: locking/lockdep: Revert qrwlock recursive stuff Commit f0bab73cb539 ("locking/lockdep: Restrict the use of recursive read_lock() with qrwlock") changed lockdep to try and conform to the qrwlock semantics which differ from the traditional rwlock semantics. In particular qrwlock is fair outside of interrupt context, but in interrupt context readers will ignore all fairness. The problem modeling this is that read and write side have different lock state (interrupts) semantics but we only have a single representation of these. Therefore lockdep will get confused, thinking the lock can cause interrupt lock inversions. So revert it for now; the old rwlock semantics were already imperfectly modeled and the qrwlock extra won't fit either. If we want to properly fix this, I think we need to resurrect the work Gautham did a few years ago that split the read and write state of locks: http://lwn.net/Articles/332801/ FWIW the locking selftest that would've failed (and was reported by Borislav earlier) is something like: RL(X1); /* IRQ-ON */ LOCK(A); UNLOCK(A); RU(X1); IRQ_ENTER(); RL(X1); /* IN-IRQ */ RU(X1); IRQ_EXIT(); At which point it would report that because A is an IRQ-unsafe lock we can suffer the following inversion: CPU0 CPU1 lock(A) lock(X1) lock(A) lock(X1) And this is 'wrong' because X1 can recurse (assuming the above locks are in fact read-locks) but lockdep doesn't know about this. Signed-off-by: Peter Zijlstra (Intel) Cc: Waiman Long Cc: ego@linux.vnet.ibm.com Cc: bp@alien8.de Cc: Linus Torvalds Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20140930132600.GA7444@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 10 +-------- kernel/locking/lockdep.c | 6 ------ lib/locking-selftest.c | 56 ++++++------------------------------------------ 3 files changed, 8 insertions(+), 64 deletions(-) (limited to 'lib') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b5a84b62fb84..f388481201cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -478,24 +478,16 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -/* - * Read states in the 2-bit held_lock:read field: - * 0: Exclusive lock - * 1: Shareable lock, cannot be recursively called - * 2: Shareable lock, can be recursively called - * 3: Shareable lock, cannot be recursively called except in interrupt context - */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) -#define lock_acquire_shared_irecursive(l, s, t, n, i) lock_acquire(l, s, t, 3, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, n, i) lock_release(l, n, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) -#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_irecursive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define rwlock_release(l, n, i) lock_release(l, n, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 420ba685c4e5..88d0d4420ad2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3597,12 +3597,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - /* - * An interrupt recursive read in interrupt context can be considered - * to be the same as a recursive read from checking perspective. 
- */ - if ((read == 3) && in_interrupt()) - read = 2; current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 62af709b2083..872a15a2a637 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -267,46 +267,19 @@ GENERATE_TESTCASE(AA_rsem) #undef E /* - * Special-case for read-locking, they are not allowed to - * recurse on the same lock class except under interrupt context: + * Special-case for read-locking, they are + * allowed to recurse on the same lock class: */ static void rlock_AA1(void) { RL(X1); - RL(X1); // this one should fail + RL(X1); // this one should NOT fail } static void rlock_AA1B(void) { RL(X1); - RL(X2); // this one should fail -} - -static void rlock_AHA1(void) -{ - RL(X1); - HARDIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_EXIT(); -} - -static void rlock_AHA1B(void) -{ - RL(X1); - HARDIRQ_ENTER(); - RL(X2); // this one should NOT fail - HARDIRQ_EXIT(); -} - -static void rlock_ASAHA1(void) -{ - RL(X1); - SOFTIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_EXIT(); - SOFTIRQ_EXIT(); + RL(X2); // this one should NOT fail } static void rsem_AA1(void) @@ -1096,7 +1069,7 @@ static inline void print_testname(const char *testname) print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1857,14 +1830,14 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); printk(" |"); - dotest(rlock_AA1, FAILURE, LOCKTYPE_RWLOCK); + dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); printk("\n"); print_testname("recursive read-lock #2"); printk(" |"); - dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK); + dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); printk("\n"); @@ -1883,21 +1856,6 @@ void locking_selftest(void) dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); printk("\n"); - print_testname("recursive rlock with interrupt"); - printk(" |"); - dotest(rlock_AHA1, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - - print_testname("recursive rlock with interrupt #2"); - printk(" |"); - dotest(rlock_AHA1B, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - - print_testname("recursive rlock with interrupt #3"); - printk(" |"); - dotest(rlock_ASAHA1, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - printk(" --------------------------------------------------------------------------\n"); /* -- cgit From 6ccc72b87b83ece31c2a75bbe07f440b0378f7a9 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:27 +0800 Subject: lib: Add a generic cmdline parse function parse_option_str There should be a generic function to parse params like a=b,c Adding parse_option_str in lib/cmdline.c which will return true if there's specified option set in the params. 
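
For example (illustrative calls, not part of the patch):

	parse_option_str("old_map,debug", "old_map");	/* true */
	parse_option_str("old_map,debug", "debug");	/* true */
	parse_option_str("old_map,debug", "old");	/* false: not a full comma-separated token */
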
Also updated efi=old_map parsing code to use the new function Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 22 ++-------------------- include/linux/kernel.h | 1 + lib/cmdline.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c90d3cd2728c..c73a7df5d37f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -932,26 +932,8 @@ u64 efi_mem_attributes(unsigned long phys_addr) static int __init parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - while (*str) { - if (!strncmp(str, "old_map", 7)) { - set_bit(EFI_OLD_MEMMAP, &efi.flags); - str += strlen("old_map"); - } - - /* - * Skip any options we don't understand. Presumably - * they apply to the EFI boot stub. - */ - while (*str && *str != ',') - str++; - - /* If we hit a delimiter, skip it */ - if (*str == ',') - str++; - } + if (parse_option_str(str, "old_map")) + set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 95624bed87ef..f66427ef0628 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -407,6 +407,7 @@ int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 76a712e6e20e..8f13cf73c2ec 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -160,3 +160,32 @@ unsigned long long memparse(const char *ptr, char **retptr) return ret; } EXPORT_SYMBOL(memparse); + +/** + * parse_option_str - Parse a string and check an option is set or not + * @str: String to be parsed + * @option: option name + * + * This function parses a string containing a comma-separated list of + * strings like a=b,c. + * + * Return true if there's such option in the string, or return false. + */ +bool parse_option_str(const char *str, const char *option) +{ + while (*str) { + if (!strncmp(str, option, strlen(option))) { + str += strlen(option); + if (!*str || *str == ',') + return true; + } + + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return false; +} -- cgit From 906d201530f2c52aeb4eee31895c71cdccf1e9a0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 24 Sep 2014 11:17:56 -0700 Subject: dynamic_debug: change __dynamic__dbg return types to void The return value is not used by callers of these functions so change the functions to return void. 
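For context, a minimal sketch of a typical call site (hypothetical driver code, not from this patch): with CONFIG_DYNAMIC_DEBUG enabled, pr_debug()/dev_dbg() expand to calls of these functions, and the call is always a plain statement whose result is never assigned or tested:

#include <linux/device.h>
#include <linux/printk.h>

/* Hypothetical caller: both debug calls are statements, so any
 * return value from __dynamic_pr_debug()/__dynamic_dev_dbg() is dead. */
static void example_probe_message(struct device *dev)
{
	pr_debug("probing\n");			/* -> __dynamic_pr_debug(...) */
	dev_dbg(dev, "resources mapped\n");	/* -> __dynamic_dev_dbg(...) */
}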
Signed-off-by: Joe Perches Acked-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 12 +++++------ lib/dynamic_debug.c | 50 +++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 36 deletions(-) (limited to 'lib') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 2fe93b26b42f..4f1bbc68cd1b 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -42,7 +42,7 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, #if defined(CONFIG_DYNAMIC_DEBUG) extern int ddebug_remove_module(const char *mod_name); extern __printf(2, 3) -int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); +void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); extern int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *modname); @@ -50,15 +50,15 @@ extern int ddebug_dyndbg_module_param_cb(char *param, char *val, struct device; extern __printf(3, 4) -int __dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, - const char *fmt, ...); +void __dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, + const char *fmt, ...); struct net_device; extern __printf(3, 4) -int __dynamic_netdev_dbg(struct _ddebug *descriptor, - const struct net_device *dev, - const char *fmt, ...); +void __dynamic_netdev_dbg(struct _ddebug *descriptor, + const struct net_device *dev, + const char *fmt, ...); #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __aligned(8) \ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index c9afbe2c445a..31fe79e31ab8 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -537,10 +537,9 @@ static char *dynamic_emit_prefix(const struct _ddebug *desc, char *buf) return buf; } -int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...) +void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...) { va_list args; - int res; struct va_format vaf; char buf[PREFIX_SIZE]; @@ -552,21 +551,17 @@ int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - res = printk(KERN_DEBUG "%s%pV", - dynamic_emit_prefix(descriptor, buf), &vaf); + printk(KERN_DEBUG "%s%pV", dynamic_emit_prefix(descriptor, buf), &vaf); va_end(args); - - return res; } EXPORT_SYMBOL(__dynamic_pr_debug); -int __dynamic_dev_dbg(struct _ddebug *descriptor, +void __dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, const char *fmt, ...) { struct va_format vaf; va_list args; - int res; BUG_ON(!descriptor); BUG_ON(!fmt); @@ -577,30 +572,27 @@ int __dynamic_dev_dbg(struct _ddebug *descriptor, vaf.va = &args; if (!dev) { - res = printk(KERN_DEBUG "(NULL device *): %pV", &vaf); + printk(KERN_DEBUG "(NULL device *): %pV", &vaf); } else { char buf[PREFIX_SIZE]; - res = dev_printk_emit(7, dev, "%s%s %s: %pV", - dynamic_emit_prefix(descriptor, buf), - dev_driver_string(dev), dev_name(dev), - &vaf); + dev_printk_emit(7, dev, "%s%s %s: %pV", + dynamic_emit_prefix(descriptor, buf), + dev_driver_string(dev), dev_name(dev), + &vaf); } va_end(args); - - return res; } EXPORT_SYMBOL(__dynamic_dev_dbg); #ifdef CONFIG_NET -int __dynamic_netdev_dbg(struct _ddebug *descriptor, - const struct net_device *dev, const char *fmt, ...) +void __dynamic_netdev_dbg(struct _ddebug *descriptor, + const struct net_device *dev, const char *fmt, ...) 
{ struct va_format vaf; va_list args; - int res; BUG_ON(!descriptor); BUG_ON(!fmt); @@ -613,23 +605,21 @@ int __dynamic_netdev_dbg(struct _ddebug *descriptor, if (dev && dev->dev.parent) { char buf[PREFIX_SIZE]; - res = dev_printk_emit(7, dev->dev.parent, - "%s%s %s %s%s: %pV", - dynamic_emit_prefix(descriptor, buf), - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - &vaf); + dev_printk_emit(7, dev->dev.parent, + "%s%s %s %s%s: %pV", + dynamic_emit_prefix(descriptor, buf), + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + &vaf); } else if (dev) { - res = printk(KERN_DEBUG "%s%s: %pV", netdev_name(dev), - netdev_reg_state(dev), &vaf); + printk(KERN_DEBUG "%s%s: %pV", netdev_name(dev), + netdev_reg_state(dev), &vaf); } else { - res = printk(KERN_DEBUG "(NULL net_device): %pV", &vaf); + printk(KERN_DEBUG "(NULL net_device): %pV", &vaf); } va_end(args); - - return res; } EXPORT_SYMBOL(__dynamic_netdev_dbg); -- cgit From 33ac9dba859b07d40e9ec826057d20c857fdede5 Mon Sep 17 00:00:00 2001 From: Maarten ter Huurne Date: Tue, 9 Sep 2014 13:46:28 +0200 Subject: fonts: Add 6x10 font This font is suitable for framebuffer consoles on devices with a 320x240 screen, to get a reasonable number of characters (53x24) that are still at a readable size. The font is derived from the existing 6x11 font, but gets 3 extra lines without sacrificing readability. I also redesigned some glyphs so they are more distinct and better fill the available space. Signed-off-by: Maarten ter Huurne Signed-off-by: Tomi Valkeinen --- include/linux/font.h | 4 +- lib/fonts/Kconfig | 9 + lib/fonts/Makefile | 1 + lib/fonts/font_6x10.c | 3086 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/fonts/fonts.c | 4 + 5 files changed, 3103 insertions(+), 1 deletion(-) create mode 100644 lib/fonts/font_6x10.c (limited to 'lib') diff --git a/include/linux/font.h b/include/linux/font.h index 40a24ab41b36..d6821769dd1e 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -31,6 +31,7 @@ struct font_desc { #define SUN12x22_IDX 7 #define ACORN8x8_IDX 8 #define MINI4x6_IDX 9 +#define FONT6x10_IDX 10 extern const struct font_desc font_vga_8x8, font_vga_8x16, @@ -41,7 +42,8 @@ extern const struct font_desc font_vga_8x8, font_sun_8x16, font_sun_12x22, font_acorn_8x8, - font_mini_4x6; + font_mini_4x6, + font_6x10; /* Find a font with a specific name */ diff --git a/lib/fonts/Kconfig b/lib/fonts/Kconfig index 34fd931b54b5..e77dfe00de36 100644 --- a/lib/fonts/Kconfig +++ b/lib/fonts/Kconfig @@ -79,6 +79,14 @@ config FONT_MINI_4x6 bool "Mini 4x6 font" depends on !SPARC && FONTS +config FONT_6x10 + bool "Medium-size 6x10 font" + depends on !SPARC && FONTS + help + Medium-size console font. Suitable for framebuffer consoles on + embedded devices with a 320x240 screen, to get a reasonable number + of characters (53x24) that are still at a readable size.
+ config FONT_SUN8x16 bool "Sparc console 8x16 font" depends on FRAMEBUFFER_CONSOLE && (!SPARC && FONTS || SPARC) @@ -109,6 +117,7 @@ config FONT_AUTOSELECT depends on !FONT_PEARL_8x8 depends on !FONT_ACORN_8x8 depends on !FONT_MINI_4x6 + depends on !FONT_6x10 depends on !FONT_SUN8x16 depends on !FONT_SUN12x22 depends on !FONT_10x18 diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile index 2761560f3f15..e04d010cfbf5 100644 --- a/lib/fonts/Makefile +++ b/lib/fonts/Makefile @@ -12,6 +12,7 @@ font-objs-$(CONFIG_FONT_10x18) += font_10x18.o font-objs-$(CONFIG_FONT_PEARL_8x8) += font_pearl_8x8.o font-objs-$(CONFIG_FONT_ACORN_8x8) += font_acorn_8x8.o font-objs-$(CONFIG_FONT_MINI_4x6) += font_mini_4x6.o +font-objs-$(CONFIG_FONT_6x10) += font_6x10.o font-objs += $(font-objs-y) diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c new file mode 100644 index 000000000000..b20620904d31 --- /dev/null +++ b/lib/fonts/font_6x10.c @@ -0,0 +1,3086 @@ +#include + +static const unsigned char fontdata_6x10[] = { + + /* 0 0x00 '^@' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 1 0x01 '^A' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x84, /* 10000100 */ + 0xCC, /* 11001100 */ + 0x84, /* 10000100 */ + 0xCC, /* 11001100 */ + 0xB4, /* 10110100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 2 0x02 '^B' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0xFC, /* 11111100 */ + 0xB4, /* 10110100 */ + 0xFC, /* 11111100 */ + 0xB4, /* 10110100 */ + 0xCC, /* 11001100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 3 0x03 '^C' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x7C, /* 01111100 */ + 0x7C, /* 01111100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 4 0x04 '^D' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x7C, /* 01111100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 5 0x05 '^E' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x6C, /* 01101100 */ + 0x6C, /* 01101100 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 6 0x06 '^F' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x7C, /* 01111100 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 7 0x07 '^G' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x78, /* 01111000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 8 0x08 '^H' */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xCC, /* 11001100 */ + 0x84, /* 10000100 */ + 0xCC, /* 11001100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + + /* 9 0x09 '^I' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x48, /* 01001000 */ + 0x84, /* 10000100 */ + 0x48, /* 01001000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 10 0x0A '^J' 
*/ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xCC, /* 11001100 */ + 0xB4, /* 10110100 */ + 0x78, /* 01111000 */ + 0xB4, /* 10110100 */ + 0xCC, /* 11001100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + + /* 11 0x0B '^K' */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x14, /* 00010100 */ + 0x20, /* 00100000 */ + 0x78, /* 01111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 12 0x0C '^L' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 13 0x0D '^M' */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x14, /* 00010100 */ + 0x14, /* 00010100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x70, /* 01110000 */ + 0x60, /* 01100000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 14 0x0E '^N' */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x24, /* 00100100 */ + 0x3C, /* 00111100 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x6C, /* 01101100 */ + 0x6C, /* 01101100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 15 0x0F '^O' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x6C, /* 01101100 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 16 0x10 '^P' */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x60, /* 01100000 */ + 0x70, /* 01110000 */ + 0x78, /* 01111000 */ + 0x70, /* 01110000 */ + 0x60, /* 01100000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 17 0x11 '^Q' */ + 0x00, /* 00000000 */ + 0x04, /* 00000100 */ + 0x0C, /* 00001100 */ + 0x1C, /* 00011100 */ + 0x3C, /* 00111100 */ + 0x1C, /* 00011100 */ + 0x0C, /* 00001100 */ + 0x04, /* 00000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 18 0x12 '^R' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x10, /* 00010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 19 0x13 '^S' */ + 0x00, /* 00000000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x00, /* 00000000 */ + 0x48, /* 01001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 20 0x14 '^T' */ + 0x3C, /* 00111100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x3C, /* 00111100 */ + 0x14, /* 00010100 */ + 0x14, /* 00010100 */ + 0x14, /* 00010100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 21 0x15 '^U' */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x20, /* 00100000 */ + 0x50, /* 01010000 */ + 0x48, /* 01001000 */ + 0x24, /* 00100100 */ + 0x14, /* 00010100 */ + 0x08, /* 00001000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + + /* 22 0x16 '^V' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xF8, /* 11111000 */ + 0xF8, /* 11111000 */ + 0xF8, /* 11111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 23 0x17 '^W' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x10, /* 00010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + + /* 24 0x18 '^X' */ + 0x00, 
/* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 25 0x19 '^Y' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 26 0x1A '^Z' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x7C, /* 01111100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 27 0x1B '^[' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x7C, /* 01111100 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 28 0x1C '^\' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 29 0x1D '^]' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x48, /* 01001000 */ + 0x84, /* 10000100 */ + 0xFC, /* 11111100 */ + 0x84, /* 10000100 */ + 0x48, /* 01001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 30 0x1E '^^' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x7C, /* 01111100 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 31 0x1F '^_' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x7C, /* 01111100 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 32 0x20 ' ' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 33 0x21 '!' 
*/ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 34 0x22 '"' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 35 0x23 '#' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x7C, /* 01111100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x7C, /* 01111100 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 36 0x24 '$' */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x50, /* 01010000 */ + 0x38, /* 00111000 */ + 0x14, /* 00010100 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + + /* 37 0x25 '%' */ + 0x00, /* 00000000 */ + 0x64, /* 01100100 */ + 0x64, /* 01100100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x4C, /* 01001100 */ + 0x4C, /* 01001100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 38 0x26 '&' */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x48, /* 01001000 */ + 0x50, /* 01010000 */ + 0x20, /* 00100000 */ + 0x54, /* 01010100 */ + 0x48, /* 01001000 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 39 0x27 ''' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 40 0x28 '(' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + + /* 41 0x29 ')' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x00, /* 00000000 */ + + /* 42 0x2A '*' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 43 0x2B '+' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 44 0x2C ',' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + + /* 45 0x2D '-' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 46 0x2E '.' 
*/ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x18, /* 00011000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 47 0x2F '/' */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + + /* 48 0x30 '0' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x54, /* 01010100 */ + 0x64, /* 01100100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 49 0x31 '1' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x30, /* 00110000 */ + 0x50, /* 01010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 50 0x32 '2' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 51 0x33 '3' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x04, /* 00000100 */ + 0x18, /* 00011000 */ + 0x04, /* 00000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 52 0x34 '4' */ + 0x00, /* 00000000 */ + 0x08, /* 00001000 */ + 0x18, /* 00011000 */ + 0x28, /* 00101000 */ + 0x48, /* 01001000 */ + 0x7C, /* 01111100 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 53 0x35 '5' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 54 0x36 '6' */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x20, /* 00100000 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 55 0x37 '7' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 56 0x38 '8' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 57 0x39 '9' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 58 0x3A ':' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x18, /* 00011000 */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x18, /* 00011000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 59 0x3B ';' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + + /* 60 0x3C '<' */ + 0x00, /* 00000000 */ 
+ 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x04, /* 00000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 61 0x3D '=' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 62 0x3E '>' */ + 0x00, /* 00000000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 63 0x3F '?' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 64 0x40 '@' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x5C, /* 01011100 */ + 0x54, /* 01010100 */ + 0x5C, /* 01011100 */ + 0x40, /* 01000000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 65 0x41 'A' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 66 0x42 'B' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x38, /* 00111000 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 67 0x43 'C' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 68 0x44 'D' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 69 0x45 'E' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 70 0x46 'F' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 71 0x47 'G' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x5C, /* 01011100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 72 0x48 'H' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 73 0x49 'I' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 74 0x4A 'J' */ + 0x00, /* 00000000 */ + 0x1C, /* 00011100 */ + 
0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 75 0x4B 'K' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x48, /* 01001000 */ + 0x50, /* 01010000 */ + 0x60, /* 01100000 */ + 0x50, /* 01010000 */ + 0x48, /* 01001000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 76 0x4C 'L' */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 77 0x4D 'M' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x6C, /* 01101100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 78 0x4E 'N' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x64, /* 01100100 */ + 0x54, /* 01010100 */ + 0x4C, /* 01001100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 79 0x4F 'O' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 80 0x50 'P' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x78, /* 01111000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 81 0x51 'Q' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x54, /* 01010100 */ + 0x48, /* 01001000 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 82 0x52 'R' */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x78, /* 01111000 */ + 0x50, /* 01010000 */ + 0x48, /* 01001000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 83 0x53 'S' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 84 0x54 'T' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 85 0x55 'U' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 86 0x56 'V' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 87 0x57 'W' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x6C, /* 01101100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 88 0x58 'X' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 
0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 89 0x59 'Y' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 90 0x5A 'Z' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x40, /* 01000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 91 0x5B '[' */ + 0x18, /* 00011000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x18, /* 00011000 */ + 0x00, /* 00000000 */ + + /* 92 0x5C '\' */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + + /* 93 0x5D ']' */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + + /* 94 0x5E '^' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 95 0x5F '_' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + + /* 96 0x60 '`' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 97 0x61 'a' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 98 0x62 'b' */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x58, /* 01011000 */ + 0x64, /* 01100100 */ + 0x44, /* 01000100 */ + 0x64, /* 01100100 */ + 0x58, /* 01011000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 99 0x63 'c' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 100 0x64 'd' */ + 0x00, /* 00000000 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + 0x34, /* 00110100 */ + 0x4C, /* 01001100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 101 0x65 'e' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 102 0x66 'f' */ + 0x00, /* 00000000 */ + 0x0C, /* 00001100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 
0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 103 0x67 'g' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x34, /* 00110100 */ + 0x4C, /* 01001100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x04, /* 00000100 */ + 0x38, /* 00111000 */ + + /* 104 0x68 'h' */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 105 0x69 'i' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 106 0x6A 'j' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x60, /* 01100000 */ + + /* 107 0x6B 'k' */ + 0x00, /* 00000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x48, /* 01001000 */ + 0x50, /* 01010000 */ + 0x70, /* 01110000 */ + 0x48, /* 01001000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 108 0x6C 'l' */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 109 0x6D 'm' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x68, /* 01101000 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 110 0x6E 'n' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x58, /* 01011000 */ + 0x64, /* 01100100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 111 0x6F 'o' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 112 0x70 'p' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x58, /* 01011000 */ + 0x64, /* 01100100 */ + 0x44, /* 01000100 */ + 0x64, /* 01100100 */ + 0x58, /* 01011000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + + /* 113 0x71 'q' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x34, /* 00110100 */ + 0x4C, /* 01001100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + + /* 114 0x72 'r' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x58, /* 01011000 */ + 0x64, /* 01100100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 115 0x73 's' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x40, /* 01000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 116 0x74 't' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x10, /* 
00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x0C, /* 00001100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 117 0x75 'u' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 118 0x76 'v' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 119 0x77 'w' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 120 0x78 'x' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 121 0x79 'y' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x04, /* 00000100 */ + 0x38, /* 00111000 */ + + /* 122 0x7A 'z' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 123 0x7B '{' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + + /* 124 0x7C '|' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + + /* 125 0x7D '}' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x00, /* 00000000 */ + + /* 126 0x7E '~' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x20, /* 00100000 */ + 0x54, /* 01010100 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 127 0x7F '' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 128 0x80 '\200' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + + /* 129 0x81 '\201' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 130 0x82 '\202' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 
01000100 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 131 0x83 '\203' */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 132 0x84 '\204' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 133 0x85 '\205' */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 134 0x86 '\206' */ + 0x18, /* 00011000 */ + 0x24, /* 00100100 */ + 0x18, /* 00011000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 135 0x87 '\207' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + + /* 136 0x88 '\210' */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 137 0x89 '\211' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 138 0x8A '\212' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 139 0x8B '\213' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 140 0x8C '\214' */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 141 0x8D '\215' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 142 0x8E '\216' */ + 0x44, /* 01000100 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 143 0x8F '\217' */ + 0x30, /* 00110000 */ + 0x48, /* 01001000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 144 0x90 '\220' */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 
0x40, /* 01000000 */ + 0x78, /* 01111000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 145 0x91 '\221' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x78, /* 01111000 */ + 0x14, /* 00010100 */ + 0x7C, /* 01111100 */ + 0x50, /* 01010000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 146 0x92 '\222' */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x50, /* 01010000 */ + 0x50, /* 01010000 */ + 0x78, /* 01111000 */ + 0x50, /* 01010000 */ + 0x50, /* 01010000 */ + 0x5C, /* 01011100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 147 0x93 '\223' */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 148 0x94 '\224' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 149 0x95 '\225' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 150 0x96 '\226' */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 151 0x97 '\227' */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 152 0x98 '\230' */ + 0x00, /* 00000000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x04, /* 00000100 */ + 0x38, /* 00111000 */ + + /* 153 0x99 '\231' */ + 0x84, /* 10000100 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 154 0x9A '\232' */ + 0x88, /* 10001000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 155 0x9B '\233' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x50, /* 01010000 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + + /* 156 0x9C '\234' */ + 0x30, /* 00110000 */ + 0x48, /* 01001000 */ + 0x40, /* 01000000 */ + 0x70, /* 01110000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x78, /* 01111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 157 0x9D '\235' */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 158 0x9E '\236' */ + 0x00, /* 00000000 */ + 0x70, /* 
01110000 */ + 0x48, /* 01001000 */ + 0x70, /* 01110000 */ + 0x48, /* 01001000 */ + 0x5C, /* 01011100 */ + 0x48, /* 01001000 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 159 0x9F '\237' */ + 0x00, /* 00000000 */ + 0x0C, /* 00001100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x60, /* 01100000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 160 0xA0 '\240' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 161 0xA1 '\241' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x30, /* 00110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 162 0xA2 '\242' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 163 0xA3 '\243' */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x4C, /* 01001100 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 164 0xA4 '\244' */ + 0x34, /* 00110100 */ + 0x58, /* 01011000 */ + 0x00, /* 00000000 */ + 0x58, /* 01011000 */ + 0x64, /* 01100100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 165 0xA5 '\245' */ + 0x58, /* 01011000 */ + 0x44, /* 01000100 */ + 0x64, /* 01100100 */ + 0x54, /* 01010100 */ + 0x4C, /* 01001100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 166 0xA6 '\246' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x04, /* 00000100 */ + 0x3C, /* 00111100 */ + 0x44, /* 01000100 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 167 0xA7 '\247' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 168 0xA8 '\250' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x40, /* 01000000 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 169 0xA9 '\251' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 170 0xAA '\252' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x04, /* 00000100 */ + 0x04, /* 00000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 171 0xAB '\253' */ + 0x20, /* 00100000 */ + 0x60, /* 01100000 */ + 0x24, /* 00100100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x44, /* 01000100 */ + 0x08, /* 00001000 */ + 0x1C, /* 00011100 */ + 0x00, /* 00000000 */ + + /* 172 0xAC '\254' */ + 
0x20, /* 00100000 */ + 0x60, /* 01100000 */ + 0x24, /* 00100100 */ + 0x28, /* 00101000 */ + 0x10, /* 00010000 */ + 0x28, /* 00101000 */ + 0x58, /* 01011000 */ + 0x3C, /* 00111100 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + + /* 173 0xAD '\255' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 174 0xAE '\256' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x24, /* 00100100 */ + 0x48, /* 01001000 */ + 0x90, /* 10010000 */ + 0x48, /* 01001000 */ + 0x24, /* 00100100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 175 0xAF '\257' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x90, /* 10010000 */ + 0x48, /* 01001000 */ + 0x24, /* 00100100 */ + 0x48, /* 01001000 */ + 0x90, /* 10010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 176 0xB0 '\260' */ + 0x10, /* 00010000 */ + 0x44, /* 01000100 */ + 0x10, /* 00010000 */ + 0x44, /* 01000100 */ + 0x10, /* 00010000 */ + 0x44, /* 01000100 */ + 0x10, /* 00010000 */ + 0x44, /* 01000100 */ + 0x10, /* 00010000 */ + 0x44, /* 01000100 */ + + /* 177 0xB1 '\261' */ + 0xA8, /* 10101000 */ + 0x54, /* 01010100 */ + 0xA8, /* 10101000 */ + 0x54, /* 01010100 */ + 0xA8, /* 10101000 */ + 0x54, /* 01010100 */ + 0xA8, /* 10101000 */ + 0x54, /* 01010100 */ + 0xA8, /* 10101000 */ + 0x54, /* 01010100 */ + + /* 178 0xB2 '\262' */ + 0xDC, /* 11011100 */ + 0x74, /* 01110100 */ + 0xDC, /* 11011100 */ + 0x74, /* 01110100 */ + 0xDC, /* 11011100 */ + 0x74, /* 01110100 */ + 0xDC, /* 11011100 */ + 0x74, /* 01110100 */ + 0xDC, /* 11011100 */ + 0x74, /* 01110100 */ + + /* 179 0xB3 '\263' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 180 0xB4 '\264' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 181 0xB5 '\265' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 182 0xB6 '\266' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xE8, /* 11101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 183 0xB7 '\267' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xF8, /* 11111000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 184 0xB8 '\270' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 185 0xB9 '\271' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xE8, /* 11101000 */ + 0x08, /* 00001000 */ + 0xE8, /* 11101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 
*/ + + /* 186 0xBA '\272' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 187 0xBB '\273' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xF8, /* 11111000 */ + 0x08, /* 00001000 */ + 0xE8, /* 11101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 188 0xBC '\274' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xE8, /* 11101000 */ + 0x08, /* 00001000 */ + 0xF8, /* 11111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 189 0xBD '\275' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xF8, /* 11111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 190 0xBE '\276' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 191 0xBF '\277' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xF0, /* 11110000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 192 0xC0 '\300' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 193 0xC1 '\301' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 194 0xC2 '\302' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 195 0xC3 '\303' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 196 0xC4 '\304' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 197 0xC5 '\305' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xFC, /* 11111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 198 0xC6 '\306' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 199 0xC7 '\307' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x2C, /* 00101100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 
00101000 */ + 0x28, /* 00101000 */ + + /* 200 0xC8 '\310' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x2C, /* 00101100 */ + 0x20, /* 00100000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 201 0xC9 '\311' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x20, /* 00100000 */ + 0x2C, /* 00101100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 202 0xCA '\312' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xEC, /* 11101100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 203 0xCB '\313' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xEC, /* 11101100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 204 0xCC '\314' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x2C, /* 00101100 */ + 0x20, /* 00100000 */ + 0x2C, /* 00101100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 205 0xCD '\315' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 206 0xCE '\316' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xEC, /* 11101100 */ + 0x00, /* 00000000 */ + 0xEC, /* 11101100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 207 0xCF '\317' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 208 0xD0 '\320' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 209 0xD1 '\321' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 210 0xD2 '\322' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 211 0xD3 '\323' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 212 0xD4 '\324' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 213 0xD5 '\325' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 
0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 214 0xD6 '\326' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 215 0xD7 '\327' */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0xFC, /* 11111100 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + + /* 216 0xD8 '\330' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xFC, /* 11111100 */ + 0x10, /* 00010000 */ + 0xFC, /* 11111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 217 0xD9 '\331' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0xF0, /* 11110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 218 0xDA '\332' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x1C, /* 00011100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 219 0xDB '\333' */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + + /* 220 0xDC '\334' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + + /* 221 0xDD '\335' */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + 0xE0, /* 11100000 */ + + /* 222 0xDE '\336' */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + 0x1C, /* 00011100 */ + + /* 223 0xDF '\337' */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 224 0xE0 '\340' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x34, /* 00110100 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x34, /* 00110100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 225 0xE1 '\341' */ + 0x18, /* 00011000 */ + 0x24, /* 00100100 */ + 0x44, /* 01000100 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x58, /* 01011000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + + /* 226 0xE2 '\342' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 227 0xE3 '\343' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x28, /* 00101000 */ + 0x28, /* 
00101000 */ + 0x28, /* 00101000 */ + 0x28, /* 00101000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 228 0xE4 '\344' */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x24, /* 00100100 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x24, /* 00100100 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 229 0xE5 '\345' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 230 0xE6 '\346' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x74, /* 01110100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + + /* 231 0xE7 '\347' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x0C, /* 00001100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 232 0xE8 '\350' */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 233 0xE9 '\351' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x7C, /* 01111100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 234 0xEA '\352' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x28, /* 00101000 */ + 0x6C, /* 01101100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 235 0xEB '\353' */ + 0x00, /* 00000000 */ + 0x18, /* 00011000 */ + 0x20, /* 00100000 */ + 0x18, /* 00011000 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x24, /* 00100100 */ + 0x18, /* 00011000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 236 0xEC '\354' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 237 0xED '\355' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x04, /* 00000100 */ + 0x38, /* 00111000 */ + 0x54, /* 01010100 */ + 0x54, /* 01010100 */ + 0x38, /* 00111000 */ + 0x40, /* 01000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 238 0xEE '\356' */ + 0x00, /* 00000000 */ + 0x3C, /* 00111100 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x38, /* 00111000 */ + 0x40, /* 01000000 */ + 0x40, /* 01000000 */ + 0x3C, /* 00111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 239 0xEF '\357' */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x44, /* 01000100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 240 0xF0 '\360' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0xFC, /* 11111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 241 0xF1 '\361' */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x7C, /* 01111100 */ + 
0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 242 0xF2 '\362' */ + 0x00, /* 00000000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 243 0xF3 '\363' */ + 0x00, /* 00000000 */ + 0x08, /* 00001000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x10, /* 00010000 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 244 0xF4 '\364' */ + 0x00, /* 00000000 */ + 0x0C, /* 00001100 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + + /* 245 0xF5 '\365' */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x10, /* 00010000 */ + 0x60, /* 01100000 */ + 0x00, /* 00000000 */ + + /* 246 0xF6 '\366' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x7C, /* 01111100 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 247 0xF7 '\367' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x20, /* 00100000 */ + 0x54, /* 01010100 */ + 0x08, /* 00001000 */ + 0x20, /* 00100000 */ + 0x54, /* 01010100 */ + 0x08, /* 00001000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 248 0xF8 '\370' */ + 0x30, /* 00110000 */ + 0x48, /* 01001000 */ + 0x48, /* 01001000 */ + 0x30, /* 00110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 249 0xF9 '\371' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x38, /* 00111000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 250 0xFA '\372' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x10, /* 00010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 251 0xFB '\373' */ + 0x00, /* 00000000 */ + 0x04, /* 00000100 */ + 0x08, /* 00001000 */ + 0x08, /* 00001000 */ + 0x50, /* 01010000 */ + 0x50, /* 01010000 */ + 0x20, /* 00100000 */ + 0x20, /* 00100000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 252 0xFC '\374' */ + 0x60, /* 01100000 */ + 0x50, /* 01010000 */ + 0x50, /* 01010000 */ + 0x50, /* 01010000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 253 0xFD '\375' */ + 0x60, /* 01100000 */ + 0x10, /* 00010000 */ + 0x20, /* 00100000 */ + 0x70, /* 01110000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 254 0xFE '\376' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x38, /* 00111000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + + /* 255 0xFF '\377' */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 
00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + 0x00, /* 00000000 */ + +}; + +const struct font_desc font_6x10 = { + .idx = FONT6x10_IDX, + .name = "6x10", + .width = 6, + .height = 10, + .data = fontdata_6x10, + .pref = 0, +}; diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index f947189efe6d..823376ca0a8b 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -63,6 +63,10 @@ static const struct font_desc *fonts[] = { #undef NO_FONTS &font_mini_4x6, #endif +#ifdef CONFIG_FONT_6x10 +#undef NO_FONTS + &font_6x10, +#endif }; #define num_fonts ARRAY_SIZE(fonts) -- cgit From 505e3be6c082489a32a88e042f930d047b6415bc Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 9 Oct 2014 15:26:35 -0700 Subject: lib/genalloc.c: add power aligned algorithm One of the more common algorithms used for allocation is to align the start address of the allocation to the order of size requested. Add this as an algorithm option for genalloc. Signed-off-by: Laura Abbott Acked-by: Will Deacon Acked-by: Olof Johansson Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: David Riley Cc: Ritesh Harjain Cc: Russell King Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++++ lib/genalloc.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 1c2fdaa2ffc3..3cd0934d62ba 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -110,6 +110,10 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); +extern unsigned long gen_pool_first_fit_order_align(unsigned long *map, + unsigned long size, unsigned long start, unsigned int nr, + void *data); + extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); diff --git a/lib/genalloc.c b/lib/genalloc.c index 38d2db82228c..166f17b9f169 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -480,6 +480,26 @@ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, } EXPORT_SYMBOL(gen_pool_first_fit); +/** + * gen_pool_first_fit_order_align - find the first available region + * of memory matching the size requirement. The region will be aligned + * to the order of the size specified. + * @map: The address to base the search on + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @data: additional data - unused + */ +unsigned long gen_pool_first_fit_order_align(unsigned long *map, + unsigned long size, unsigned long start, + unsigned int nr, void *data) +{ + unsigned long align_mask = roundup_pow_of_two(nr) - 1; + + return bitmap_find_next_zero_area(map, size, start, nr, align_mask); +} +EXPORT_SYMBOL(gen_pool_first_fit_order_align); + /** * gen_pool_best_fit - find the best fitting region of memory * macthing the size requirement (no alignment constraint) -- cgit From 9efb3a421d55d30b65fb0dbee05108d15c6c55f7 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 9 Oct 2014 15:26:38 -0700 Subject: lib/genalloc.c: add genpool range check function After allocating an address from a particular genpool, there is no good way to verify if that address actually belongs to a genpool. 
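As a rough usage sketch of how the power-aligned algorithm from the previous patch and the range-check helper introduced below fit together (the function name, the pool setup and the sizes are illustrative, not part of either patch):

	#include <linux/bug.h>
	#include <linux/genalloc.h>

	static int carveout_alloc_example(struct gen_pool *pool)
	{
		unsigned long vaddr;

		/* Align each allocation to the order of its size; a
		 * 4096-byte request comes back 4096-byte aligned,
		 * assuming the chunk base itself is suitably aligned. */
		gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);

		vaddr = gen_pool_alloc(pool, 4096);
		if (!vaddr)
			return -ENOMEM;

		/* The new helper confirms the whole range lies in the pool. */
		WARN_ON(!addr_in_gen_pool(pool, vaddr, 4096));

		gen_pool_free(pool, vaddr, 4096);
		return 0;
	}
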
Introduce addr_in_gen_pool which will return if an address plus size falls completely within the genpool range. Signed-off-by: Laura Abbott Acked-by: Will Deacon Reviewed-by: Olof Johansson Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: David Riley Cc: Ritesh Harjain Cc: Russell King Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 3 +++ lib/genalloc.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 3cd0934d62ba..1ccaab44abcc 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -121,6 +121,9 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid); extern struct gen_pool *dev_get_gen_pool(struct device *dev); +bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, + size_t size); + #ifdef CONFIG_OF extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, const char *propname, int index); diff --git a/lib/genalloc.c b/lib/genalloc.c index 166f17b9f169..cce4dd68c40d 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -402,6 +402,35 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, } EXPORT_SYMBOL(gen_pool_for_each_chunk); +/** + * addr_in_gen_pool - checks if an address falls within the range of a pool + * @pool: the generic memory pool + * @start: start address + * @size: size of the region + * + * Check if the range of addresses falls within the specified pool. Returns + * true if the entire range is contained in the pool and false otherwise. + */ +bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, + size_t size) +{ + bool found = false; + unsigned long end = start + size; + struct gen_pool_chunk *chunk; + + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) { + if (start >= chunk->start_addr && start <= chunk->end_addr) { + if (end <= chunk->end_addr) { + found = true; + break; + } + } + } + rcu_read_unlock(); + return found; +} + /** * gen_pool_avail - get available free space of the pool * @pool: pool to get available free space -- cgit From 4bad78c550025346de3fc5820e366c7b525e1889 Mon Sep 17 00:00:00 2001 From: Rob Jones Date: Mon, 13 Oct 2014 15:51:32 -0700 Subject: lib/dynamic_debug.c: use seq_open_private() instead of seq_open() Using seq_open_private() removes boilerplate code from ddebug_proc_open(). The resultant code is shorter and easier to follow. This patch does not change any functionality. 
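The same conversion applies to any seq_file user that carries per-open iterator state. A minimal sketch of the resulting pattern (the foo_* names are placeholders and the seq_operations are elided; this is not code from the patch):

	#include <linux/seq_file.h>

	struct foo_iter { loff_t pos; };
	static const struct seq_operations foo_seqops; /* start/next/stop/show elided */

	static int foo_open(struct inode *inode, struct file *file)
	{
		/* Allocates a zeroed struct foo_iter and stores it in
		 * ((struct seq_file *)file->private_data)->private. */
		return seq_open_private(file, &foo_seqops,
					sizeof(struct foo_iter));
	}

	static const struct file_operations foo_fops = {
		.open    = foo_open,
		.read    = seq_read,
		.llseek  = seq_lseek,
		.release = seq_release_private, /* frees what open allocated */
	};

The allocation made by seq_open_private() must be paired with seq_release_private(), which frees the private buffer before releasing the seq_file.
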
Signed-off-by: Rob Jones Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 31fe79e31ab8..dfba05521748 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -819,22 +819,9 @@ static const struct seq_operations ddebug_proc_seqops = { */ static int ddebug_proc_open(struct inode *inode, struct file *file) { - struct ddebug_iter *iter; - int err; - vpr_info("called\n"); - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (iter == NULL) - return -ENOMEM; - - err = seq_open(file, &ddebug_proc_seqops); - if (err) { - kfree(iter); - return err; - } - ((struct seq_file *)file->private_data)->private = iter; - return 0; + return seq_open_private(file, &ddebug_proc_seqops, + sizeof(struct ddebug_iter)); } static const struct file_operations ddebug_proc_fops = { -- cgit From fec22908323dc56ce38b835f5a67cce30fc7b6fc Mon Sep 17 00:00:00 2001 From: Raphael Silva Date: Mon, 13 Oct 2014 15:51:34 -0700 Subject: lib/textsearch.c: remove textsearch_put reference from comments There is no textsearch_put(). Remove it from the comments to avoid misunderstanding. A configuration obtained from textsearch_prepare() does not need to be released with textsearch_put(). Signed-off-by: Raphael Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/textsearch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/textsearch.c b/lib/textsearch.c index 0c7e9ab2d88f..0b79908dfe89 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -249,9 +249,7 @@ EXPORT_SYMBOL(textsearch_find_continuous); * @flags: search flags * * Looks up the search algorithm module and creates a new textsearch - * configuration for the specified pattern. Upon completion all - * necessary refcnts are held and the configuration must be put back - * using textsearch_put() after usage. + * configuration for the specified pattern. * * Note: The format of the pattern may not be compatible between * the various search algorithms. -- cgit From 6de8ab68bc30da75116209d818c75497bdaed09d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 13 Oct 2014 15:51:36 -0700 Subject: lib: remove prio_heap The prio_heap code is unused since commit 889ed9ceaa97 ("cgroup: remove css_scan_tasks()"). It should be compiled out to shrink the binary kernel size, which can be done by introducing CONFIG_PRIO_HEAP or by removing the code. We can simply recover the code from git when needed, so it would be better to remove it IMO. Signed-off-by: Lai Jiangshan Acked-by: Tejun Heo Cc: "David S.
Miller" Cc: Ingo Molnar Acked-by: Peter Zijlstra Cc: Kees Cook Cc: Francesco Fusco Cc: Greg Thelen Cc: Chris Wilson Cc: Randy Dunlap Cc: George Spelvin Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prio_heap.h | 58 --------------------------------------- lib/Makefile | 2 +- lib/prio_heap.c | 70 ----------------------------------------------- 3 files changed, 1 insertion(+), 129 deletions(-) delete mode 100644 include/linux/prio_heap.h delete mode 100644 lib/prio_heap.c (limited to 'lib') diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h deleted file mode 100644 index 08094350f26a..000000000000 --- a/include/linux/prio_heap.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _LINUX_PRIO_HEAP_H -#define _LINUX_PRIO_HEAP_H - -/* - * Simple insertion-only static-sized priority heap containing - * pointers, based on CLR, chapter 7 - */ - -#include - -/** - * struct ptr_heap - simple static-sized priority heap - * @ptrs - pointer to data area - * @max - max number of elements that can be stored in @ptrs - * @size - current number of valid elements in @ptrs (in the range 0..@size-1 - * @gt: comparison operator, which should implement "greater than" - */ -struct ptr_heap { - void **ptrs; - int max; - int size; - int (*gt)(void *, void *); -}; - -/** - * heap_init - initialize an empty heap with a given memory size - * @heap: the heap structure to be initialized - * @size: amount of memory to use in bytes - * @gfp_mask: mask to pass to kmalloc() - * @gt: comparison operator, which should implement "greater than" - */ -extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, - int (*gt)(void *, void *)); - -/** - * heap_free - release a heap's storage - * @heap: the heap structure whose data should be released - */ -void heap_free(struct ptr_heap *heap); - -/** - * heap_insert - insert a value into the heap and return any overflowed value - * @heap: the heap to be operated on - * @p: the pointer to be inserted - * - * Attempts to insert the given value into the priority heap. If the - * heap is full prior to the insertion, then the resulting heap will - * consist of the smallest @max elements of the original heap and the - * new element; the greatest element will be removed from the heap and - * returned. Note that the returned element will be the new element - * (i.e. no change to the heap) if the new element is greater than all - * elements currently in the heap. 
- */ -extern void *heap_insert(struct ptr_heap *heap, void *p); - - - -#endif /* _LINUX_PRIO_HEAP_H */ diff --git a/lib/Makefile b/lib/Makefile index d6b4bc496408..eb95e0fdcca5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ + proportions.o flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o diff --git a/lib/prio_heap.c b/lib/prio_heap.c deleted file mode 100644 index a7af6f85eca8..000000000000 --- a/lib/prio_heap.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Simple insertion-only static-sized priority heap containing - * pointers, based on CLR, chapter 7 - */ - -#include -#include - -int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, - int (*gt)(void *, void *)) -{ - heap->ptrs = kmalloc(size, gfp_mask); - if (!heap->ptrs) - return -ENOMEM; - heap->size = 0; - heap->max = size / sizeof(void *); - heap->gt = gt; - return 0; -} - -void heap_free(struct ptr_heap *heap) -{ - kfree(heap->ptrs); -} - -void *heap_insert(struct ptr_heap *heap, void *p) -{ - void *res; - void **ptrs = heap->ptrs; - int pos; - - if (heap->size < heap->max) { - /* Heap insertion */ - pos = heap->size++; - while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) { - ptrs[pos] = ptrs[(pos-1)/2]; - pos = (pos-1)/2; - } - ptrs[pos] = p; - return NULL; - } - - /* The heap is full, so something will have to be dropped */ - - /* If the new pointer is greater than the current max, drop it */ - if (heap->gt(p, ptrs[0])) - return p; - - /* Replace the current max and heapify */ - res = ptrs[0]; - ptrs[0] = p; - pos = 0; - - while (1) { - int left = 2 * pos + 1; - int right = 2 * pos + 2; - int largest = pos; - if (left < heap->size && heap->gt(ptrs[left], p)) - largest = left; - if (right < heap->size && heap->gt(ptrs[right], ptrs[largest])) - largest = right; - if (largest == pos) - break; - /* Push p down the heap one level and bump one up */ - ptrs[pos] = ptrs[largest]; - ptrs[largest] = p; - pos = largest; - } - return res; -} -- cgit From 8a6f0b47dad5f8653f2f6ca6360f9f97b8113571 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Mon, 13 Oct 2014 15:51:38 -0700 Subject: lib: rename TEST_MODULE to TEST_LKM The "_MODULE" suffix is reserved for tristates compiled as loadable kernel modules (LKM). The "TEST_MODULE" feature thereby violates this convention. The feature is used to compile the lib/test_module.c kernel module. Sadly this convention is not made explicit, but the Kconfig code documents it. The following code (./scripts/kconfig/confdata.c) is used to generate the autoconf.h header file during the build process. When a feature is selected as a kernel module ('m'), it is suffixed with "_MODULE" to indicate it. switch (*value) { case 'n': break; case 'm': suffix = "_MODULE"; /* fall through */ This causes problems for static code analysis, which assumes a consistent use of the "_MODULE" suffix. This patch renames the feature and its reference in a Makefile to "TEST_LKM", which still expresses the test of a LKM. 
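To make the reserved suffix concrete, this is the standard kconfig behaviour the rename aligns with (nothing below is added by the patch). A tristate symbol set to 'm' in .config:

	CONFIG_TEST_LKM=m

is emitted into include/generated/autoconf.h with the generated suffix:

	#define CONFIG_TEST_LKM_MODULE 1

while a built-in ('y') selection would be emitted as plain "#define CONFIG_TEST_LKM 1" (for this particular symbol only 'm' or 'n' can occur, since it depends on m). After the rename, static analysis can safely assume that any "_MODULE" suffix is machine-generated.
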
Signed-off-by: Valentin Rothberg Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- lib/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e7ad58c5fbeb..4e35a5d767ed 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1648,7 +1648,7 @@ config DMA_API_DEBUG If unsure, say N. -config TEST_MODULE +config TEST_LKM tristate "Test module loading with 'hello world' module" default n depends on m diff --git a/lib/Makefile b/lib/Makefile index eb95e0fdcca5..7512dc978f18 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -31,7 +31,7 @@ obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o -obj-$(CONFIG_TEST_MODULE) += test_module.o +obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o -- cgit From cd514e727b18ff4d189b8e268db13729a4175091 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Oct 2014 15:54:25 -0700 Subject: lib/string.c: remove duplicated function lib/string.c contains two functions, strnicmp and strncasecmp, which do roughly the same thing, namely compare two strings case-insensitively up to a given bound. They have slightly different implementations, but the only important difference is that strncasecmp doesn't handle len==0 appropriately; it effectively becomes strcasecmp in that case. strnicmp correctly says that two strings are always equal in their first 0 characters. strncasecmp is the POSIX name for this functionality. So rename the non-broken function to the standard name. To minimize the impact on the rest of the kernel (and since both are exported to modules), make strnicmp a wrapper for strncasecmp. Signed-off-by: Rasmus Villemoes Cc: Grant Likely Cc: Andi Kleen Cc: Dan Carpenter Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/string.c b/lib/string.c index f3c6ff596414..3181e267a033 100644 --- a/lib/string.c +++ b/lib/string.c @@ -27,14 +27,14 @@ #include #include -#ifndef __HAVE_ARCH_STRNICMP +#ifndef __HAVE_ARCH_STRNCASECMP /** - * strnicmp - Case insensitive, length-limited string comparison + * strncasecmp - Case insensitive, length-limited string comparison * @s1: One string * @s2: The other string * @len: the maximum number of characters to compare */ -int strnicmp(const char *s1, const char *s2, size_t len) +int strncasecmp(const char *s1, const char *s2, size_t len) { /* Yes, Virginia, it had better be unsigned */ unsigned char c1, c2; @@ -56,6 +56,13 @@ int strnicmp(const char *s1, const char *s2, size_t len) } while (--len); return (int)c1 - (int)c2; } +EXPORT_SYMBOL(strncasecmp); +#endif +#ifndef __HAVE_ARCH_STRNICMP +int strnicmp(const char *s1, const char *s2, size_t len) +{ + return strncasecmp(s1, s2, len); +} EXPORT_SYMBOL(strnicmp); #endif @@ -73,20 +80,6 @@ int strcasecmp(const char *s1, const char *s2) EXPORT_SYMBOL(strcasecmp); #endif -#ifndef __HAVE_ARCH_STRNCASECMP -int strncasecmp(const char *s1, const char *s2, size_t n) -{ - int c1, c2; - - do { - c1 = tolower(*s1++); - c2 = tolower(*s2++); - } while ((--n > 0) && c1 == c2 && c1 != 0); - return c1 - c2; -} -EXPORT_SYMBOL(strncasecmp); -#endif - #ifndef __HAVE_ARCH_STRCPY /** * strcpy - Copy a %NUL terminated string -- cgit From b0bfb63118612e3614cf77b115c00f895a42c96a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Oct 2014 15:54:27 -0700 Subject: lib: string: Make all calls to strnicmp into calls to strncasecmp The previous patch made strnicmp into a wrapper for strncasecmp. This patch makes all in-tree users of strnicmp call strncasecmp directly, while still making sure that the strnicmp symbol can be used by out-of-tree modules. It should be considered a temporary hack until all in-tree callers have been converted. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 +- lib/string.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..e6edfe51575a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -41,7 +41,7 @@ extern int strcmp(const char *,const char *); extern int strncmp(const char *,const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRNICMP -extern int strnicmp(const char *, const char *, __kernel_size_t); +#define strnicmp strncasecmp #endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); diff --git a/lib/string.c b/lib/string.c index 3181e267a033..2fc20aa06f84 100644 --- a/lib/string.c +++ b/lib/string.c @@ -59,6 +59,7 @@ int strncasecmp(const char *s1, const char *s2, size_t len) EXPORT_SYMBOL(strncasecmp); #endif #ifndef __HAVE_ARCH_STRNICMP +#undef strnicmp int strnicmp(const char *s1, const char *s2, size_t len) { return strncasecmp(s1, s2, len); -- cgit From d295634e965ecacdb44c6760b3ca4eae08812715 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 13 Oct 2014 15:55:11 -0700 Subject: lib / string_helpers: move documentation to c-file The introduced function string_escape_mem() is a kind of opposite to string_unescape. 
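As a sketch of that duality, using the helpers as they stand once this series is applied (buffer size and flag choice are illustrative, error handling elided):

	#include <linux/string_helpers.h>

	char buf[64], *p = buf;
	int len;

	len = string_escape_str("a\nb", &p, sizeof(buf),
				ESCAPE_SPACE, NULL);
	/* buf now holds the four characters a \ n b; the output is
	 * not NUL-terminated, so terminate it by hand. */
	buf[len] = '\0';

	string_unescape_any_inplace(buf);	/* round-trips back to "a\nb" */
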
We have several users of such functionality each of them created custom implementation. The series contains clean up of test suite, adding new call, and switching few users to use it via %*pE specifier. Test suite covers all of existing and most of potential use cases. This patch (of 11): The documentation of API belongs to c-file. This patch moves it accordingly. There is no functional change. Signed-off-by: Andy Shevchenko Cc: "John W . Linville" Cc: Johannes Berg Cc: Greg Kroah-Hartman Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string_helpers.h | 34 ---------------------------------- lib/string_helpers.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 34 deletions(-) (limited to 'lib') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 3eeee9672a4a..5a30f2a86239 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -20,40 +20,6 @@ int string_get_size(u64 size, enum string_size_units units, #define UNESCAPE_ANY \ (UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL) -/** - * string_unescape - unquote characters in the given string - * @src: source buffer (escaped) - * @dst: destination buffer (unescaped) - * @size: size of the destination buffer (0 to unlimit) - * @flags: combination of the flags (bitwise OR): - * %UNESCAPE_SPACE: - * '\f' - form feed - * '\n' - new line - * '\r' - carriage return - * '\t' - horizontal tab - * '\v' - vertical tab - * %UNESCAPE_OCTAL: - * '\NNN' - byte with octal value NNN (1 to 3 digits) - * %UNESCAPE_HEX: - * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) - * %UNESCAPE_SPECIAL: - * '\"' - double quote - * '\\' - backslash - * '\a' - alert (BEL) - * '\e' - escape - * %UNESCAPE_ANY: - * all previous together - * - * Returns amount of characters processed to the destination buffer excluding - * trailing '\0'. - * - * Because the size of the output will be the same as or less than the size of - * the input, the transformation may be performed in place. - * - * Caller must provide valid source and destination pointers. Be aware that - * destination buffer will always be NULL-terminated. Source string must be - * NULL-terminated as well. - */ int string_unescape(char *src, char *dst, size_t size, unsigned int flags); static inline int string_unescape_inplace(char *buf, unsigned int flags) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 29033f319aea..74ec60469640 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -168,6 +168,44 @@ static bool unescape_special(char **src, char **dst) return true; } +/** + * string_unescape - unquote characters in the given string + * @src: source buffer (escaped) + * @dst: destination buffer (unescaped) + * @size: size of the destination buffer (0 to unlimit) + * @flags: combination of the flags (bitwise OR): + * %UNESCAPE_SPACE: + * '\f' - form feed + * '\n' - new line + * '\r' - carriage return + * '\t' - horizontal tab + * '\v' - vertical tab + * %UNESCAPE_OCTAL: + * '\NNN' - byte with octal value NNN (1 to 3 digits) + * %UNESCAPE_HEX: + * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) + * %UNESCAPE_SPECIAL: + * '\"' - double quote + * '\\' - backslash + * '\a' - alert (BEL) + * '\e' - escape + * %UNESCAPE_ANY: + * all previous together + * + * Description: + * The function unquotes characters in the given string. 
+ * + * Because the size of the output will be the same as or less than the size of + * the input, the transformation may be performed in place. + * + * Caller must provide valid source and destination pointers. Be aware that + * destination buffer will always be NULL-terminated. Source string must be + * NULL-terminated as well. + * + * Return: + * The amount of the characters processed to the destination buffer excluding + * trailing '\0' is returned. + */ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) { char *out = dst; -- cgit From 45ff337a54c154680edf0c538e5c9eb4a2f862cc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 13 Oct 2014 15:55:14 -0700 Subject: lib / string_helpers: refactoring the test suite This patch prepares test suite for a following update. It introduces test_string_check_buf() helper which checks the result and dumps an error. Signed-off-by: Andy Shevchenko Cc: "John W . Linville" Cc: Johannes Berg Cc: Greg Kroah-Hartman Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test-string_helpers.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 6ac48de04c0e..ac44c9245dcf 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -10,6 +10,26 @@ #include #include +static __init bool test_string_check_buf(const char *name, unsigned int flags, + char *in, size_t p, + char *out_real, size_t q_real, + char *out_test, size_t q_test) +{ + if (q_real == q_test && !memcmp(out_test, out_real, q_test)) + return true; + + pr_warn("Test '%s' failed: flags = %u\n", name, flags); + + print_hex_dump(KERN_WARNING, "Input: ", DUMP_PREFIX_NONE, 16, 1, + in, p, true); + print_hex_dump(KERN_WARNING, "Expected: ", DUMP_PREFIX_NONE, 16, 1, + out_test, q_test, true); + print_hex_dump(KERN_WARNING, "Got: ", DUMP_PREFIX_NONE, 16, 1, + out_real, q_real, true); + + return false; +} + struct test_string { const char *in; const char *out; @@ -39,7 +59,8 @@ static const struct test_string strings[] __initconst = { }, }; -static void __init test_string_unescape(unsigned int flags, bool inplace) +static void __init test_string_unescape(const char *name, unsigned int flags, + bool inplace) { char in[256]; char out_test[256]; @@ -77,15 +98,8 @@ static void __init test_string_unescape(unsigned int flags, bool inplace) q_real = string_unescape(in, out_real, q_real, flags); } - if (q_real != q_test || memcmp(out_test, out_real, q_test)) { - pr_warn("Test failed: flags = %u\n", flags); - print_hex_dump(KERN_WARNING, "Input: ", - DUMP_PREFIX_NONE, 16, 1, in, p - 1, true); - print_hex_dump(KERN_WARNING, "Expected: ", - DUMP_PREFIX_NONE, 16, 1, out_test, q_test, true); - print_hex_dump(KERN_WARNING, "Got: ", - DUMP_PREFIX_NONE, 16, 1, out_real, q_real, true); - } + test_string_check_buf(name, flags, in, p - 1, out_real, q_real, + out_test, q_test); } static int __init test_string_helpers_init(void) @@ -94,8 +108,9 @@ static int __init test_string_helpers_init(void) pr_info("Running tests...\n"); for (i = 0; i < UNESCAPE_ANY + 1; i++) - test_string_unescape(i, false); - test_string_unescape(get_random_int() % (UNESCAPE_ANY + 1), true); + test_string_unescape("unescape", i, false); + test_string_unescape("unescape inplace", + get_random_int() % (UNESCAPE_ANY + 1), true); return -EINVAL; } -- cgit From c8250381c8272a9828fdd353171727b154fbd296 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: 
Mon, 13 Oct 2014 15:55:16 -0700 Subject: lib / string_helpers: introduce string_escape_mem() This is almost the opposite function to string_unescape(). Nevertheless it handles \0 and could be used for any byte buffer. The documentation is supplied together with the function prototype. The test cases covers most of the scenarios and would be expanded later on. [akpm@linux-foundation.org: avoid 1k stack consumption] Signed-off-by: Andy Shevchenko Cc: "John W . Linville" Cc: Johannes Berg Cc: Greg Kroah-Hartman Cc: Joe Perches Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string_helpers.h | 31 +++++ lib/string_helpers.c | 274 +++++++++++++++++++++++++++++++++++++++++ lib/test-string_helpers.c | 240 +++++++++++++++++++++++++++++++++++- 3 files changed, 541 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 5a30f2a86239..6eb567ac56bc 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -37,4 +37,35 @@ static inline int string_unescape_any_inplace(char *buf) return string_unescape_any(buf, buf, 0); } +#define ESCAPE_SPACE 0x01 +#define ESCAPE_SPECIAL 0x02 +#define ESCAPE_NULL 0x04 +#define ESCAPE_OCTAL 0x08 +#define ESCAPE_ANY \ + (ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_SPECIAL | ESCAPE_NULL) +#define ESCAPE_NP 0x10 +#define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP) +#define ESCAPE_HEX 0x20 + +int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, + unsigned int flags, const char *esc); + +static inline int string_escape_mem_any_np(const char *src, size_t isz, + char **dst, size_t osz, const char *esc) +{ + return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc); +} + +static inline int string_escape_str(const char *src, char **dst, size_t sz, + unsigned int flags, const char *esc) +{ + return string_escape_mem(src, strlen(src), dst, sz, flags, esc); +} + +static inline int string_escape_str_any_np(const char *src, char **dst, + size_t sz, const char *esc) +{ + return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc); +} + #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 74ec60469640..58b78ba57439 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include /** @@ -240,3 +242,275 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) return out - dst; } EXPORT_SYMBOL(string_unescape); + +static int escape_passthrough(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + + if (*osz < 1) + return -ENOMEM; + + *out++ = c; + + *dst = out; + *osz -= 1; + + return 1; +} + +static int escape_space(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + unsigned char to; + + if (*osz < 2) + return -ENOMEM; + + switch (c) { + case '\n': + to = 'n'; + break; + case '\r': + to = 'r'; + break; + case '\t': + to = 't'; + break; + case '\v': + to = 'v'; + break; + case '\f': + to = 'f'; + break; + default: + return 0; + } + + *out++ = '\\'; + *out++ = to; + + *dst = out; + *osz -= 2; + + return 1; +} + +static int escape_special(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + unsigned char to; + + if (*osz < 2) + return -ENOMEM; + + switch (c) { + case '\\': + to = '\\'; + break; + case '\a': + to = 'a'; + break; + case '\e': + to = 'e'; + break; + default: + return 0; + } + + *out++ = '\\'; + *out++ = to; + + *dst = out; + *osz -= 2; + + return 1; +} + +static int 
escape_null(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + + if (*osz < 2) + return -ENOMEM; + + if (c) + return 0; + + *out++ = '\\'; + *out++ = '0'; + + *dst = out; + *osz -= 2; + + return 1; +} + +static int escape_octal(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + + if (*osz < 4) + return -ENOMEM; + + *out++ = '\\'; + *out++ = ((c >> 6) & 0x07) + '0'; + *out++ = ((c >> 3) & 0x07) + '0'; + *out++ = ((c >> 0) & 0x07) + '0'; + + *dst = out; + *osz -= 4; + + return 1; +} + +static int escape_hex(unsigned char c, char **dst, size_t *osz) +{ + char *out = *dst; + + if (*osz < 4) + return -ENOMEM; + + *out++ = '\\'; + *out++ = 'x'; + *out++ = hex_asc_hi(c); + *out++ = hex_asc_lo(c); + + *dst = out; + *osz -= 4; + + return 1; +} + +/** + * string_escape_mem - quote characters in the given memory buffer + * @src: source buffer (unescaped) + * @isz: source buffer size + * @dst: destination buffer (escaped) + * @osz: destination buffer size + * @flags: combination of the flags (bitwise OR): + * %ESCAPE_SPACE: + * '\f' - form feed + * '\n' - new line + * '\r' - carriage return + * '\t' - horizontal tab + * '\v' - vertical tab + * %ESCAPE_SPECIAL: + * '\\' - backslash + * '\a' - alert (BEL) + * '\e' - escape + * %ESCAPE_NULL: + * '\0' - null + * %ESCAPE_OCTAL: + * '\NNN' - byte with octal value NNN (3 digits) + * %ESCAPE_ANY: + * all previous together + * %ESCAPE_NP: + * escape only non-printable characters (checked by isprint) + * %ESCAPE_ANY_NP: + * all previous together + * %ESCAPE_HEX: + * '\xHH' - byte with hexadecimal value HH (2 digits) + * @esc: NULL-terminated string of characters any of which, if found in + * the source, has to be escaped + * + * Description: + * The process of escaping byte buffer includes several parts. They are applied + * in the following sequence. + * 1. The character is matched to the printable class, if asked, and in + * case of match it passes through to the output. + * 2. The character is not matched to the one from @esc string and thus + * must go as is to the output. + * 3. The character is checked if it falls into the class given by @flags. + * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any + * character. Note that they actually can't go together, otherwise + * %ESCAPE_HEX will be ignored. + * + * Caller must provide valid source and destination pointers. Be aware that + * destination buffer will not be NULL-terminated, thus caller have to append + * it if needs. + * + * Return: + * The amount of the characters processed to the destination buffer, or + * %-ENOMEM if the size of buffer is not enough to put an escaped character is + * returned. + * + * Even in the case of error @dst pointer will be updated to point to the byte + * after the last processed character. + */ +int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, + unsigned int flags, const char *esc) +{ + char *out = *dst, *p = out; + bool is_dict = esc && *esc; + int ret = 0; + + while (isz--) { + unsigned char c = *src++; + + /* + * Apply rules in the following sequence: + * - the character is printable, when @flags has + * %ESCAPE_NP bit set + * - the @esc string is supplied and does not contain a + * character under question + * - the character doesn't fall into a class of symbols + * defined by given @flags + * In these cases we just pass through a character to the + * output buffer. 
+ */ + if ((flags & ESCAPE_NP && isprint(c)) || + (is_dict && !strchr(esc, c))) { + /* do nothing */ + } else { + if (flags & ESCAPE_SPACE) { + ret = escape_space(c, &p, &osz); + if (ret < 0) + break; + if (ret > 0) + continue; + } + + if (flags & ESCAPE_SPECIAL) { + ret = escape_special(c, &p, &osz); + if (ret < 0) + break; + if (ret > 0) + continue; + } + + if (flags & ESCAPE_NULL) { + ret = escape_null(c, &p, &osz); + if (ret < 0) + break; + if (ret > 0) + continue; + } + + /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ + if (flags & ESCAPE_OCTAL) { + ret = escape_octal(c, &p, &osz); + if (ret < 0) + break; + continue; + } + if (flags & ESCAPE_HEX) { + ret = escape_hex(c, &p, &osz); + if (ret < 0) + break; + continue; + } + } + + ret = escape_passthrough(c, &p, &osz); + if (ret < 0) + break; + } + + *dst = p; + + if (ret < 0) + return ret; + + return p - out; +} +EXPORT_SYMBOL(string_escape_mem); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index ac44c9245dcf..ab0d30e1e18f 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -62,10 +63,14 @@ static const struct test_string strings[] __initconst = { static void __init test_string_unescape(const char *name, unsigned int flags, bool inplace) { - char in[256]; - char out_test[256]; - char out_real[256]; - int i, p = 0, q_test = 0, q_real = sizeof(out_real); + int q_real = 256; + char *in = kmalloc(q_real, GFP_KERNEL); + char *out_test = kmalloc(q_real, GFP_KERNEL); + char *out_real = kmalloc(q_real, GFP_KERNEL); + int i, p = 0, q_test = 0; + + if (!in || !out_test || !out_real) + goto out; for (i = 0; i < ARRAY_SIZE(strings); i++) { const char *s = strings[i].in; @@ -100,6 +105,223 @@ static void __init test_string_unescape(const char *name, unsigned int flags, test_string_check_buf(name, flags, in, p - 1, out_real, q_real, out_test, q_test); +out: + kfree(out_real); + kfree(out_test); + kfree(in); +} + +struct test_string_1 { + const char *out; + unsigned int flags; +}; + +#define TEST_STRING_2_MAX_S1 32 +struct test_string_2 { + const char *in; + struct test_string_1 s1[TEST_STRING_2_MAX_S1]; +}; + +#define TEST_STRING_2_DICT_0 NULL +static const struct test_string_2 escape0[] __initconst = {{ + .in = "\f\\ \n\r\t\v", + .s1 = {{ + .out = "\\f\\ \\n\\r\\t\\v", + .flags = ESCAPE_SPACE, + },{ + .out = "\\f\\134\\040\\n\\r\\t\\v", + .flags = ESCAPE_SPACE | ESCAPE_OCTAL, + },{ + .out = "\\f\\x5c\\x20\\n\\r\\t\\v", + .flags = ESCAPE_SPACE | ESCAPE_HEX, + },{ + /* terminator */ + }}, +},{ + .in = "\\h\\\"\a\e\\", + .s1 = {{ + .out = "\\\\h\\\\\"\\a\\e\\\\", + .flags = ESCAPE_SPECIAL, + },{ + .out = "\\\\\\150\\\\\\042\\a\\e\\\\", + .flags = ESCAPE_SPECIAL | ESCAPE_OCTAL, + },{ + .out = "\\\\\\x68\\\\\\x22\\a\\e\\\\", + .flags = ESCAPE_SPECIAL | ESCAPE_HEX, + },{ + /* terminator */ + }}, +},{ + .in = "\eb \\C\007\"\x90\r]", + .s1 = {{ + .out = "\eb \\C\007\"\x90\\r]", + .flags = ESCAPE_SPACE, + },{ + .out = "\\eb \\\\C\\a\"\x90\r]", + .flags = ESCAPE_SPECIAL, + },{ + .out = "\\eb \\\\C\\a\"\x90\\r]", + .flags = ESCAPE_SPACE | ESCAPE_SPECIAL, + },{ + .out = "\\033\\142\\040\\134\\103\\007\\042\\220\\015\\135", + .flags = ESCAPE_OCTAL, + },{ + .out = "\\033\\142\\040\\134\\103\\007\\042\\220\\r\\135", + .flags = ESCAPE_SPACE | ESCAPE_OCTAL, + },{ + .out = "\\e\\142\\040\\\\\\103\\a\\042\\220\\015\\135", + .flags = ESCAPE_SPECIAL | ESCAPE_OCTAL, + },{ + .out = "\\e\\142\\040\\\\\\103\\a\\042\\220\\r\\135", + .flags = ESCAPE_SPACE | 
ESCAPE_SPECIAL | ESCAPE_OCTAL, + },{ + .out = "\eb \\C\007\"\x90\r]", + .flags = ESCAPE_NP, + },{ + .out = "\eb \\C\007\"\x90\\r]", + .flags = ESCAPE_SPACE | ESCAPE_NP, + },{ + .out = "\\eb \\C\\a\"\x90\r]", + .flags = ESCAPE_SPECIAL | ESCAPE_NP, + },{ + .out = "\\eb \\C\\a\"\x90\\r]", + .flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_NP, + },{ + .out = "\\033b \\C\\007\"\\220\\015]", + .flags = ESCAPE_OCTAL | ESCAPE_NP, + },{ + .out = "\\033b \\C\\007\"\\220\\r]", + .flags = ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_NP, + },{ + .out = "\\eb \\C\\a\"\\220\\r]", + .flags = ESCAPE_SPECIAL | ESCAPE_SPACE | ESCAPE_OCTAL | + ESCAPE_NP, + },{ + .out = "\\x1bb \\C\\x07\"\\x90\\x0d]", + .flags = ESCAPE_NP | ESCAPE_HEX, + },{ + /* terminator */ + }}, +},{ + /* terminator */ +}}; + +#define TEST_STRING_2_DICT_1 "b\\ \t\r" +static const struct test_string_2 escape1[] __initconst = {{ + .in = "\f\\ \n\r\t\v", + .s1 = {{ + .out = "\f\\134\\040\n\\015\\011\v", + .flags = ESCAPE_OCTAL, + },{ + .out = "\f\\x5c\\x20\n\\x0d\\x09\v", + .flags = ESCAPE_HEX, + },{ + /* terminator */ + }}, +},{ + .in = "\\h\\\"\a\e\\", + .s1 = {{ + .out = "\\134h\\134\"\a\e\\134", + .flags = ESCAPE_OCTAL, + },{ + /* terminator */ + }}, +},{ + .in = "\eb \\C\007\"\x90\r]", + .s1 = {{ + .out = "\e\\142\\040\\134C\007\"\x90\\015]", + .flags = ESCAPE_OCTAL, + },{ + /* terminator */ + }}, +},{ + /* terminator */ +}}; + +static __init const char *test_string_find_match(const struct test_string_2 *s2, + unsigned int flags) +{ + const struct test_string_1 *s1 = s2->s1; + unsigned int i; + + if (!flags) + return s2->in; + + /* Test cases are NULL-aware */ + flags &= ~ESCAPE_NULL; + + /* ESCAPE_OCTAL has a higher priority */ + if (flags & ESCAPE_OCTAL) + flags &= ~ESCAPE_HEX; + + for (i = 0; i < TEST_STRING_2_MAX_S1 && s1->out; i++, s1++) + if (s1->flags == flags) + return s1->out; + return NULL; +} + +static __init void test_string_escape(const char *name, + const struct test_string_2 *s2, + unsigned int flags, const char *esc) +{ + int q_real = 512; + char *out_test = kmalloc(q_real, GFP_KERNEL); + char *out_real = kmalloc(q_real, GFP_KERNEL); + char *in = kmalloc(256, GFP_KERNEL); + char *buf = out_real; + int p = 0, q_test = 0; + + if (!out_test || !out_real || !in) + goto out; + + for (; s2->in; s2++) { + const char *out; + int len; + + /* NULL injection */ + if (flags & ESCAPE_NULL) { + in[p++] = '\0'; + out_test[q_test++] = '\\'; + out_test[q_test++] = '0'; + } + + /* Don't try strings that have no output */ + out = test_string_find_match(s2, flags); + if (!out) + continue; + + /* Copy string to in buffer */ + len = strlen(s2->in); + memcpy(&in[p], s2->in, len); + p += len; + + /* Copy expected result for given flags */ + len = strlen(out); + memcpy(&out_test[q_test], out, len); + q_test += len; + } + + q_real = string_escape_mem(in, p, &buf, q_real, flags, esc); + + test_string_check_buf(name, flags, in, p, out_real, q_real, out_test, + q_test); +out: + kfree(in); + kfree(out_real); + kfree(out_test); +} + +static __init void test_string_escape_nomem(void) +{ + char *in = "\eb \\C\007\"\x90\r]"; + char out[64], *buf = out; + int rc = -ENOMEM, ret; + + ret = string_escape_str_any_np(in, &buf, strlen(in), NULL); + if (ret == rc) + return; + + pr_err("Test 'escape nomem' failed: got %d instead of %d\n", ret, rc); } static int __init test_string_helpers_init(void) @@ -112,6 +334,16 @@ static int __init test_string_helpers_init(void) test_string_unescape("unescape inplace", get_random_int() % (UNESCAPE_ANY + 1), true); + /* Without 
From 71dca95d5cf5ece6c1bee8e625e23c16025952c7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 13 Oct 2014 15:55:18 -0700
Subject: lib/vsprintf: add %*pE[achnops] format specifier

This allows the user to print a given buffer as an escaped string. The
rules are applied according to an optional mix of flags provided by
additional format letters.

For example, if the given buffer is:

	1b 62 20 5c 43 07 22 90 0d 5d

The result strings would be:

	%*pE	"\eb \C\a"\220\r]"
	%*pEhp	"\x1bb \C\x07"\x90\x0d]"
	%*pEa	"\e\142\040\\\103\a\042\220\r\135"

Please read Documentation/printk-formats.txt and the lib/string_helpers.c
kernel documentation for further information.

[akpm@linux-foundation.org: tidy up comment layout, per Joe]
Signed-off-by: Andy Shevchenko
Suggested-by: Joe Perches
Cc: "John W. Linville"
Cc: Johannes Berg
Cc: Greg Kroah-Hartman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/printk-formats.txt | 32 ++++++++++++++++++
 lib/vsprintf.c                   | 71 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

(limited to 'lib')

diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index b4498218c474..5a615c14f75d 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -70,6 +70,38 @@ DMA addresses types dma_addr_t:
 	For printing a dma_addr_t type which can vary based on build options,
 	regardless of the width of the CPU data path. Passed by reference.
 
+Raw buffer as an escaped string:
+
+	%*pE[achnops]
+
+	For printing a raw buffer as an escaped string. For the following
+	buffer
+
+		1b 62 20 5c 43 07 22 90 0d 5d
+
+	a few examples show how the conversion is done (result strings shown
+	without surrounding quotes):
+
+		%*pE	"\eb \C\a"\220\r]"
+		%*pEhp	"\x1bb \C\x07"\x90\x0d]"
+		%*pEa	"\e\142\040\\\103\a\042\220\r\135"
+
+	The conversion rules are applied according to an optional combination
+	of flags (see string_escape_mem() kernel documentation for the
+	details):
+		a - ESCAPE_ANY
+		c - ESCAPE_SPECIAL
+		h - ESCAPE_HEX
+		n - ESCAPE_NULL
+		o - ESCAPE_OCTAL
+		p - ESCAPE_NP
+		s - ESCAPE_SPACE
+	By default ESCAPE_ANY_NP is used.
+
+	ESCAPE_ANY_NP is the sane choice for many cases, in particular for
+	printing SSIDs.
+
+	If the field width is omitted, only one byte is escaped.
+
 Raw buffer as a hex string:
 	%*ph	00 01 02  ...  3f
 	%*phC	00:01:02: ... :3f
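Taken together with the table above, a minimal usage sketch (hypothetical call site; the buffer is the documentation's own example, and the field width passes the buffer length):

	static const u8 ssid[] = {
		0x1b, 0x62, 0x20, 0x5c, 0x43, 0x07, 0x22, 0x90, 0x0d, 0x5d,
	};

	/* Default flags (ESCAPE_ANY_NP) -> \eb \C\a"\220\r] */
	pr_info("SSID: %*pE\n", (int)sizeof(ssid), ssid);

	/* Hex for non-printables (hp)  -> \x1bb \C\x07"\x90\x0d] */
	pr_info("SSID: %*pEhp\n", (int)sizeof(ssid), ssid);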
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ba3cd0a35640..ec337f64f52d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/sections.h>	/* for dereference_function_descriptor() */
+#include <linux/string_helpers.h>
 #include "kstrtox.h"
 
 /**
@@ -1100,6 +1101,62 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
 	return string(buf, end, ip4_addr, spec);
 }
 
+static noinline_for_stack
+char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
+		     const char *fmt)
+{
+	bool found = true;
+	int count = 1;
+	unsigned int flags = 0;
+	int len;
+
+	if (spec.field_width == 0)
+		return buf;				/* nothing to print */
+
+	if (ZERO_OR_NULL_PTR(addr))
+		return string(buf, end, NULL, spec);	/* NULL pointer */
+
+	do {
+		switch (fmt[count++]) {
+		case 'a':
+			flags |= ESCAPE_ANY;
+			break;
+		case 'c':
+			flags |= ESCAPE_SPECIAL;
+			break;
+		case 'h':
+			flags |= ESCAPE_HEX;
+			break;
+		case 'n':
+			flags |= ESCAPE_NULL;
+			break;
+		case 'o':
+			flags |= ESCAPE_OCTAL;
+			break;
+		case 'p':
+			flags |= ESCAPE_NP;
+			break;
+		case 's':
+			flags |= ESCAPE_SPACE;
+			break;
+		default:
+			found = false;
+			break;
+		}
+	} while (found);
+
+	if (!flags)
+		flags = ESCAPE_ANY_NP;
+
+	len = spec.field_width < 0 ? 1 : spec.field_width;
+
+	/* Ignore the error. We print as many characters as we can */
+	string_escape_mem(addr, len, &buf, end - buf, flags, NULL);
+
+	return buf;
+}
+
 static noinline_for_stack
 char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec,
 		  const char *fmt)
@@ -1221,6 +1278,17 @@ int kptr_restrict __read_mostly;
  * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
  * - 'I[6S]c' for IPv6 addresses printed as specified by
  *       http://tools.ietf.org/html/rfc5952
+ * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
+ *                of the following flags (see string_escape_mem() for the
+ *                details):
+ *                  a - ESCAPE_ANY
+ *                  c - ESCAPE_SPECIAL
+ *                  h - ESCAPE_HEX
+ *                  n - ESCAPE_NULL
+ *                  o - ESCAPE_OCTAL
+ *                  p - ESCAPE_NP
+ *                  s - ESCAPE_SPACE
+ *                By default ESCAPE_ANY_NP is used.
  * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
  *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
  *       Options for %pU are:
@@ -1321,6 +1389,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		}}
 		}
 		break;
+	case 'E':
+		return escaped_string(buf, end, ptr, spec, fmt);
 	case 'U':
 		return uuid_string(buf, end, ptr, spec, fmt);
 	case 'V':
@@ -1633,6 +1703,7 @@ qualifier:
 * %piS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address
 * %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper
 *	case.
+ * %*pE[achnops] print an escaped buffer
 * %*ph[CDN] a variable-length hex string with a separator (supports up to 64
 *	   bytes of the input)
 * %n is ignored
-- 
cgit
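Two boundary conditions in escaped_string() above are worth spelling out; a hedged sketch of a hypothetical call site:

	u8 buf[10];	/* assume buf holds the bytes to dump */

	printk("%*pE\n", (int)sizeof(buf), buf); /* escape all 10 bytes */
	printk("%pE\n", buf);			 /* no width: len defaults to 1 */
	printk("%*pE\n", 0, buf);		 /* zero width: prints nothing */

Note also that the return value of string_escape_mem() is deliberately ignored, so an escaped string that does not fit between buf and end is silently truncated by the usual vsnprintf() rules rather than reported as an error.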
From b395f75eabb3844c99244928293796ff42feaa3d Mon Sep 17 00:00:00 2001
From: Anton Blanchard
Date: Mon, 13 Oct 2014 23:03:16 +1100
Subject: lib/raid6: Add log level to printks

Signed-off-by: Anton Blanchard
Signed-off-by: NeilBrown
---
 lib/raid6/algos.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib')

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f0b1aa3586d1..7d0e5cd7b570 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -121,9 +121,9 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 		raid6_2data_recov = best->data2;
 		raid6_datap_recov = best->datap;
 
-		printk("raid6: using %s recovery algorithm\n", best->name);
+		pr_info("raid6: using %s recovery algorithm\n", best->name);
 	} else
-		printk("raid6: Yikes! No recovery algorithm found!\n");
+		pr_err("raid6: Yikes! No recovery algorithm found!\n");
 
 	return best;
 }
@@ -157,18 +157,18 @@ static inline const struct raid6_calls *raid6_choose_gen(
 				bestperf = perf;
 				best = *algo;
 			}
-			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
+			pr_info("raid6: %-8s %5ld MB/s\n", (*algo)->name,
 			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
 		}
 	}
 
 	if (best) {
-		printk("raid6: using algorithm %s (%ld MB/s)\n",
+		pr_info("raid6: using algorithm %s (%ld MB/s)\n",
 		       best->name,
 		       (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
 		raid6_call = *best;
 	} else
-		printk("raid6: Yikes! No algorithm found!\n");
+		pr_err("raid6: Yikes! No algorithm found!\n");
 
 	return best;
 }
@@ -194,7 +194,7 @@ int __init raid6_select_algo(void)
 
 	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
 	if (!syndromes) {
-		printk("raid6: Yikes! No memory available.\n");
+		pr_err("raid6: Yikes! No memory available.\n");
 		return -ENOMEM;
 	}
-- 
cgit
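A bare printk() with no KERN_ prefix is logged at the kernel's default loglevel; the pr_*() helpers make the severity explicit. They are thin wrappers around printk(), roughly as defined in include/linux/printk.h:

#define pr_fmt(fmt) fmt	/* a file may override this to prepend a prefix */

#define pr_info(fmt, ...) \
	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...) \
	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)

A file-local pr_fmt() override (e.g. "raid6: " fmt) could also absorb the repeated "raid6: " prefix from the strings, though this patch deliberately keeps the messages byte-for-byte identical.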
Miller" --- lib/libcrc32c.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index b3131f5cf8a2..6a08ce7d6adc 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -41,20 +41,18 @@ static struct crypto_shash *tfm; u32 crc32c(u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; - } desc; + SHASH_DESC_ON_STACK(shash, tfm); + u32 *ctx = (u32 *)shash_desc_ctx(shash); int err; - desc.shash.tfm = tfm; - desc.shash.flags = 0; - *(u32 *)desc.ctx = crc; + shash->tfm = tfm; + shash->flags = 0; + *ctx = crc; - err = crypto_shash_update(&desc.shash, address, length); + err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *(u32 *)desc.ctx; + return *ctx; } EXPORT_SYMBOL(crc32c); -- cgit From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 26 Aug 2014 23:16:35 -0400 Subject: random: add and use memzero_explicit() for clearing data zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) memset() calls which clear out sensitive data in extract_{buf,entropy, entropy_user}() in random driver are being optimized away by gcc. Add a helper memzero_explicit() (similarly as explicit_bzero() variants) that can be used in such cases where a variable with sensitive data is being cleared out in the end. Other use cases might also be in crypto code. [ I have put this into lib/string.c though, as it's always built-in and doesn't need any dependencies then. ] Fixes kernel bugzilla: 82041 Reported-by: zatimend@hotmail.co.uk Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Cc: Alexey Dobriyan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- drivers/char/random.c | 8 ++++---- include/linux/string.h | 5 +++-- lib/string.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index c18d41db83d8..8c86a95203a0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __mix_pool_bytes(r, hash.w, sizeof(hash.w)); spin_unlock_irqrestore(&r->lock, flags); - memset(workspace, 0, sizeof(workspace)); + memzero_explicit(workspace, sizeof(workspace)); /* * In case the hash function has some recognizable output @@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) hash.w[2] ^= rol32(hash.w[2], 16); memcpy(out, &hash, EXTRACT_SIZE); - memset(&hash, 0, sizeof(hash)); + memzero_explicit(&hash, sizeof(hash)); } /* @@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } @@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..3b42b3732da6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) 
From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 26 Aug 2014 23:16:35 -0400
Subject: random: add and use memzero_explicit() for clearing data

zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) the
memset() calls which clear out sensitive data in extract_{buf,entropy,
entropy_user}() in the random driver are being optimized away by gcc.

Add a helper, memzero_explicit() (similar to the explicit_bzero()
variants), that can be used in such cases where a variable holding
sensitive data is cleared out at the end. Other use cases might also
be in crypto code.

[ I have put this into lib/string.c though, as it's always built-in
  and doesn't need any dependencies then. ]

Fixes kernel bugzilla: 82041

Reported-by: zatimend@hotmail.co.uk
Signed-off-by: Daniel Borkmann
Acked-by: Hannes Frederic Sowa
Cc: Alexey Dobriyan
Signed-off-by: Theodore Ts'o
Cc: stable@vger.kernel.org
---
 drivers/char/random.c  |  8 ++++----
 include/linux/string.h |  5 +++--
 lib/string.c           | 16 ++++++++++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'lib')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index c18d41db83d8..8c86a95203a0 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out)
 	__mix_pool_bytes(r, hash.w, sizeof(hash.w));
 	spin_unlock_irqrestore(&r->lock, flags);
 
-	memset(workspace, 0, sizeof(workspace));
+	memzero_explicit(workspace, sizeof(workspace));
 
 	/*
 	 * In case the hash function has some recognizable output
@@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out)
 	hash.w[2] ^= rol32(hash.w[2], 16);
 
 	memcpy(out, &hash, EXTRACT_SIZE);
-	memset(&hash, 0, sizeof(hash));
+	memzero_explicit(&hash, sizeof(hash));
 }
 
 /*
@@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 	}
 
 	/* Wipe data just returned from memory */
-	memset(tmp, 0, sizeof(tmp));
+	memzero_explicit(tmp, sizeof(tmp));
 
 	return ret;
 }
@@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
 	}
 
 	/* Wipe data just returned from memory */
-	memset(tmp, 0, sizeof(tmp));
+	memzero_explicit(tmp, sizeof(tmp));
 
 	return ret;
 }
diff --git a/include/linux/string.h b/include/linux/string.h
index d36977e029af..3b42b3732da6 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
 #endif
 
 extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
-			const void *from, size_t available);
+				       const void *from, size_t available);
 
 /**
  * strstarts - does @str start with @prefix?
@@ -144,7 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 
-extern size_t memweight(const void *ptr, size_t bytes);
+size_t memweight(const void *ptr, size_t bytes);
+void memzero_explicit(void *s, size_t count);
 
 /**
  * kbasename - return the last part of a pathname.
diff --git a/lib/string.c b/lib/string.c
index 992bf30af759..3a3120452a1d 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -604,6 +604,22 @@ void *memset(void *s, int c, size_t count)
 EXPORT_SYMBOL(memset);
 #endif
 
+/**
+ * memzero_explicit - Fill a region of memory (e.g. sensitive
+ *		      keying data) with 0s.
+ * @s: Pointer to the start of the area.
+ * @count: The size of the area.
+ *
+ * memzero_explicit() doesn't need an arch-specific version, as
+ * it simply wraps memset().
+ */
+void memzero_explicit(void *s, size_t count)
+{
+	memset(s, 0, count);
+	OPTIMIZER_HIDE_VAR(s);
+}
+EXPORT_SYMBOL(memzero_explicit);
+
 #ifndef __HAVE_ARCH_MEMCPY
 /**
  * memcpy - Copy one area of memory to another
-- 
cgit
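The barrier after the memset() is what distinguishes memzero_explicit() from a plain memset(): when a buffer is zeroed just before it goes out of scope, the compiler may classify the zeroing as a dead store and delete it. OPTIMIZER_HIDE_VAR() was defined for gcc (in include/linux/compiler-gcc.h at the time) as an empty asm that takes the pointer as both input and output:

/* Empty asm statement: the compiler must assume it uses and may alter
 * the pointer, so it can no longer prove the preceding memset() writes
 * to memory that is never read again. */
#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var))

The idea is that the asm body is invisible to the optimizer, so the stores feeding it have to be kept.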
From c21e59d8dc04b2107bdb4ff0f412a9b7ae3349f3 Mon Sep 17 00:00:00 2001
From: Tony Battersby
Date: Thu, 23 Oct 2014 15:10:21 -0400
Subject: lib/scatterlist: fix memory leak with scsi-mq

Fix a memory leak with scsi-mq triggered by commands with large data
transfer length.

Fixes: c53c6d6a68b1 ("scatterlist: allow chaining to preallocated chunks")
Cc: <stable@vger.kernel.org> # 3.17.x
Signed-off-by: Tony Battersby
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 lib/scatterlist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 9cdf62f8accd..c9f2e8c6ccc9 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -203,10 +203,10 @@
 		}
 		table->orig_nents -= sg_size;
 
-		if (!skip_first_chunk) {
-			free_fn(sgl, alloc_size);
+		if (skip_first_chunk)
 			skip_first_chunk = false;
-		}
+		else
+			free_fn(sgl, alloc_size);
 
 		sgl = next;
 	}
-- 
cgit

From ea5d05b34aca25c066e0699512d0ffbd8ee6ac3e Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 29 Oct 2014 14:50:44 -0700
Subject: lib/bitmap.c: fix undefined shift in __bitmap_shift_{left|right}()

If __bitmap_shift_left() or __bitmap_shift_right() are asked to shift by
a multiple of BITS_PER_LONG, they will try to shift a long value by
BITS_PER_LONG bits which is undefined. Change the functions to avoid
the undefined shift.

Coverity id: 1192175
Coverity id: 1192174

Signed-off-by: Jan Kara
Cc: Rasmus Villemoes
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/bitmap.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/bitmap.c b/lib/bitmap.c
index cd250a2e14cb..b499ab6ada29 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -131,7 +131,9 @@
 		lower = src[off + k];
 		if (left && off + k == lim - 1)
 			lower &= mask;
-		dst[k] = upper << (BITS_PER_LONG - rem) | lower >> rem;
+		dst[k] = lower >> rem;
+		if (rem)
+			dst[k] |= upper << (BITS_PER_LONG - rem);
 		if (left && k == lim - 1)
 			dst[k] &= mask;
 	}
@@ -172,7 +174,9 @@
 		upper = src[k];
 		if (left && k == lim - 1)
 			upper &= (1UL << left) - 1;
-		dst[k + off] = lower >> (BITS_PER_LONG - rem) | upper << rem;
+		dst[k + off] = upper << rem;
+		if (rem)
+			dst[k + off] |= lower >> (BITS_PER_LONG - rem);
 		if (left && k + off == lim - 1)
 			dst[k + off] &= (1UL << left) - 1;
 	}
-- 
cgit

From 0c828f2f8395fb5e7faf0a116e476a3ce992a199 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 13 Nov 2014 13:10:48 +0800
Subject: lib: rhashtable - Remove weird non-ASCII characters from comments

My editor spewed garbage that looked like memory corruption on my
screen. It turns out that a number of occurrences of "fi" got turned
into a ligature.

This patch replaces these ligatures with the ASCII letters "fi".

Signed-off-by: Herbert Xu
Acked-by: Thomas Graf
Signed-off-by: David S. Miller
---
 lib/rhashtable.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 081be3ba9ea8..624a0b7c05ef 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -230,7 +230,7 @@ int rhashtable_expand(struct rhashtable *ht, gfp_t flags)
 	ht->shift++;
 
 	/* For each new bucket, search the corresponding old bucket
-	 * for the ﬁrst entry that hashes to the new bucket, and
+	 * for the first entry that hashes to the new bucket, and
 	 * link the new bucket to that entry. Since all the entries
 	 * which will end up in the new bucket appear in the same
 	 * old bucket, this constructs an entirely valid new hash
@@ -248,8 +248,8 @@
 	}
 
 	/* Publish the new table pointer. Lookups may now traverse
-	 * the new table, but they will not beneﬁt from any
-	 * additional eﬃciency until later steps unzip the buckets.
+	 * the new table, but they will not benefit from any
+	 * additional efficiency until later steps unzip the buckets.
 	 */
 	rcu_assign_pointer(ht->tbl, new_tbl);
@@ -306,14 +306,14 @@ int rhashtable_shrink(struct rhashtable *ht, gfp_t flags)
 	ht->shift--;
 
-	/* Link each bucket in the new table to the ﬁrst bucket
+	/* Link each bucket in the new table to the first bucket
 	 * in the old table that contains entries which will hash
 	 * to the new bucket.
 	 */
 	for (i = 0; i < ntbl->size; i++) {
 		ntbl->buckets[i] = tbl->buckets[i];
 
-		/* Link each bucket in the new table to the ﬁrst bucket
+		/* Link each bucket in the new table to the first bucket
 		 * in the old table that contains entries which will hash
 		 * to the new bucket.
 		 */
-- 
cgit
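The undefined behaviour addressed by the __bitmap_shift fix above is easy to demonstrate in userspace: C leaves a shift by the full width of the type undefined, and on x86 the variable shift count is masked, so the unguarded expression quietly returns the wrong value when rem == 0. A standalone sketch mirroring the guarded form used in the patch:

#include <stdio.h>

#define BITS_PER_LONG 64

int main(void)
{
	unsigned long upper = 0xdeadbeefUL;
	unsigned int rem = 0;	/* shift distance % BITS_PER_LONG == 0 */
	unsigned long dst;

	/* Undefined: the shift count equals the type width when rem == 0.
	 * x86 masks variable shift counts, so this commonly evaluates to
	 * upper << 0 instead of the intended 0. */
	dst = upper << (BITS_PER_LONG - rem);
	printf("unguarded: %#lx\n", dst);

	/* Defined for every rem in [0, BITS_PER_LONG): guard the shift,
	 * exactly as the fixed __bitmap_shift_{left,right}() do. */
	dst = 0;
	if (rem)
		dst |= upper << (BITS_PER_LONG - rem);
	printf("guarded:   %#lx\n", dst);

	return 0;
}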