From 090f32ee4ef0a59c738963c6b0a6948cc5dee84c Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 20 Apr 2014 23:44:47 -0400 Subject: ext4: get rid of EXT4_MAP_UNINIT flag Currently EXT4_MAP_UNINIT is used in dioread_nolock case to mark the cases where we're using dioread_nolock and we're writing into either unallocated, or unwritten extent, because we need to make sure that any DIO write into that inode will wait for the extent conversion. However EXT4_MAP_UNINIT is not only entirely misleading name but also unnecessary because we can check for EXT4_MAP_UNWRITTEN in the dioread_nolock case instead. This commit removes EXT4_MAP_UNINIT flag. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6a1a0245474f..9995d3b588a8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -51,7 +51,6 @@ struct extent_status; { EXT4_MAP_MAPPED, "M" }, \ { EXT4_MAP_UNWRITTEN, "U" }, \ { EXT4_MAP_BOUNDARY, "B" }, \ - { EXT4_MAP_UNINIT, "u" }, \ { EXT4_MAP_FROM_CLUSTER, "C" }) #define show_free_flags(flags) __print_flags(flags, "|", \ -- cgit From 556615dcbf38b0a92a9e659f52c06686270dfc16 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 20 Apr 2014 23:45:47 -0400 Subject: ext4: rename uninitialized extents to unwritten Currently in ext4 there is quite a mess when it comes to naming unwritten extents. Sometimes we call it uninitialized and sometimes we refer to it as unwritten. The right name for the extent which has been allocated but does not contain any written data is _unwritten_. Other file systems are using this name consistently, even the buffer head state refers to it as unwritten. We need to fix this confusion in ext4. This commit changes every reference to an uninitialized extent (meaning allocated but unwritten) to unwritten extent. This includes comments, function names and variable names. It even covers abbreviation of the word uninitialized (such as uninit) and some misspellings. This commit does not change any of the code paths at all. This has been confirmed by comparing md5sums of the assembly code of each object file after all the function names were stripped from it. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 16 ++-- fs/ext4/ext4_extents.h | 22 ++--- fs/ext4/extents.c | 218 ++++++++++++++++++++++---------------------- fs/ext4/extents_status.c | 2 +- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 18 ++-- fs/ext4/move_extent.c | 38 ++++---- fs/ext4/super.c | 2 +- include/trace/events/ext4.h | 8 +- 9 files changed, 163 insertions(+), 163 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b681d90b1e87..86c2cda208ea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -183,7 +183,7 @@ struct ext4_map_blocks { #define EXT4_IO_END_UNWRITTEN 0x0001 /* - * For converting uninitialized extents on a work queue. 'handle' is used for + * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { @@ -536,26 +536,26 @@ enum { /* * Flags used by ext4_map_blocks() */ - /* Allocate any needed blocks and/or convert an unitialized + /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 - /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an - unitialized extents if not allocated, split the uninitialized + unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 5074fe23f19e..a867f5ca9991 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -137,21 +137,21 @@ struct ext4_ext_path { * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. + * particular extent is an initialized extent or an unwritten (i.e. * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) -#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ @@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { - /* We can not have an uninitialized extent of zero length! */ + /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 83ed52538ae4..e305a31641f2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,8 +50,8 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ @@ -524,7 +524,7 @@ __read_extent_tree_block(const char *function, unsigned int line, lblk - prev, ~0, EXTENT_STATUS_HOLE); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); @@ -620,7 +620,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -646,7 +646,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -677,7 +677,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -802,7 +802,7 @@ ext4_ext_binsearch(struct inode *inode, ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -1686,11 +1686,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, /* * Make sure that both extents are initialized. We don't merge - * uninitialized extents so that we can be sure that end_io code has + * unwritten extents so that we can be sure that end_io code has * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1707,10 +1707,10 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - if (ext4_ext_is_uninitialized(ex1) && + if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1735,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0, uninit; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1745,11 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1903,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0, uninit; + int mb_flags = 0, unwritten; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1943,21 +1943,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1969,10 +1969,10 @@ prepend: ext_debug("prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, @@ -1980,13 +1980,13 @@ prepend: if (err) return err; - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2046,7 +2046,7 @@ has_space: ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -2057,7 +2057,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -2068,7 +2068,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -2078,7 +2078,7 @@ has_space: "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -2200,7 +2200,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } @@ -2576,7 +2576,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; @@ -2623,13 +2623,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2701,11 +2701,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2854,9 +2854,9 @@ again: end < ee_block + ext4_ext_get_actual_len(ex) - 1) { int split_flag = 0; - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; /* * Split the extent in two so that 'end' is the last @@ -3113,7 +3113,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -3155,10 +3155,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_uninitialized(ex) && + BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2)); + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3170,8 +3170,8 @@ static int ext4_split_extent_at(handle_t *handle, * then we just change the state of the extent, and splitting * is not needed. */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); @@ -3185,8 +3185,8 @@ static int ext4_split_extent_at(handle_t *handle, /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -3200,8 +3200,8 @@ static int ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { @@ -3278,7 +3278,7 @@ static int ext4_split_extent(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; int allocated = map->m_len; @@ -3286,14 +3286,14 @@ static int ext4_split_extent(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, path, @@ -3318,15 +3318,15 @@ static int ext4_split_extent(handle_t *handle, (unsigned long) map->m_lblk); return -EIO; } - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (uninitialized) { - split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT2); + EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); @@ -3341,16 +3341,16 @@ out: /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -3396,12 +3396,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) @@ -3437,7 +3437,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3452,7 +3452,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); @@ -3483,7 +3483,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3498,7 +3498,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); @@ -3603,26 +3603,26 @@ out: /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, @@ -3660,7 +3660,7 @@ static int ext4_split_convert_extents(handle_t *handle, } else if (flags & EXT4_GET_BLOCKS_CONVERT) { split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); @@ -3710,8 +3710,8 @@ static int ext4_convert_initialized_extents(handle_t *handle, err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - /* first mark the extent as uninitialized */ - ext4_ext_mark_uninitialized(ex); + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed @@ -3971,10 +3971,10 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, /* * Make sure that the extent is no bigger than we support with - * uninitialized extent + * unwritten extent */ - if (map->m_len > EXT_UNINIT_MAX_LEN) - map->m_len = EXT_UNINIT_MAX_LEN / 2; + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; ret = ext4_convert_initialized_extents(handle, inode, map, path); @@ -3993,7 +3993,7 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int allocated, ext4_fsblk_t newblock) @@ -4002,19 +4002,19 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* - * When writing into uninitialized space, we should not fail to + * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submit the IO, split the extent */ @@ -4057,7 +4057,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } @@ -4308,7 +4308,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -4327,16 +4327,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ - if ((!ext4_ext_is_uninitialized(ex)) && + if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = ext4_ext_convert_initialized_extent( handle, inode, map, path, flags, allocated, newblock); goto out2; - } else if (!ext4_ext_is_uninitialized(ex)) + } else if (!ext4_ext_is_unwritten(ex)) goto out; - ret = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, allocated, newblock); if (ret < 0) @@ -4408,15 +4408,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -4464,13 +4464,13 @@ got_allocated_blocks: /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. @@ -4603,9 +4603,9 @@ got_allocated_blocks: /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -4679,7 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * that it doesn't get unnecessarily split into multiple * extents. */ - if (len <= EXT_UNINIT_MAX_LEN) + if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* @@ -4771,7 +4771,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4914,7 +4914,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0ebc21204b51..98c90f5834ee 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -433,7 +433,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, ee_start = ext4_ext_pblock(ex); ee_len = ext4_ext_get_actual_len(ex); - ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; es_status = ext4_es_is_unwritten(es) ? 1 : 0; /* diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 063fc1538355..bf0e772b6a03 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -136,7 +136,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, /* * 'err==len' means that all of blocks has been preallocated no * matter they are initialized or not. For excluding - * uninitialized extents, we need to check m_flags. There are + * unwritten extents, we need to check m_flags. There are * two conditions that indicate for initialized extents. * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned; * 2) If we do a real lookup, non-flags are returned. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 297465e07d44..7dfbcbba67d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -489,8 +489,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -622,7 +622,7 @@ found: map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. @@ -2032,7 +2032,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, - * and mark buffers as uninit when we perform writes to uninitialized extents + * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. @@ -2131,7 +2131,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or - * to convert an uninitialized extent to be initialized (in the case + * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in @@ -3071,9 +3071,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten. * * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -3125,12 +3125,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * We could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose + * unwritten to prevent parallel buffered read to expose * the stale data before DIO complete the data IO. * * As to previously fallocated extents, ext4 get_block will * just simply mark the buffer mapped but still keep the - * extents uninitialized. + * extents unwritten. * * For non AIO case, we will convert those unwritten extents * to written after return back from blockdev_direct_IO. diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 58ee7dc87669..1b809fe51da1 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, static void copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) { - if (ext4_ext_is_uninitialized(src)) - ext4_ext_mark_uninitialized(dest); + if (ext4_ext_is_unwritten(src)) + ext4_ext_mark_unwritten(dest); else dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); } @@ -593,14 +593,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @inode: inode in question * @from: block offset of inode * @count: block count to be checked - * @uninit: extents expected to be uninitialized + * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, - int uninit, int *err) + int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; @@ -611,7 +611,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (*err) goto out; ext = path[ext_depth(inode)].p_ext; - if (uninit != ext4_ext_is_uninitialized(ext)) + if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); @@ -894,7 +894,7 @@ out: * @orig_page_offset: page index on original file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped - * @uninit: orig extent is uninitialized or not + * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents @@ -905,7 +905,7 @@ out: static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit, int *err) + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; @@ -962,27 +962,27 @@ again: if (unlikely(*err < 0)) goto stop_journal; /* - * If orig extent was uninitialized it can become initialized + * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - if (uninit) { + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ - uninit = mext_check_coverage(orig_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - uninit &= mext_check_coverage(donor_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - if (!uninit) { + if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } @@ -1259,7 +1259,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; - int uninit; + int unwritten; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1391,8 +1391,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, !last_extent) continue; - /* Is original extent is uninitialized */ - uninit = ext4_ext_is_uninitialized(ext_prev); + /* Is original extent is unwritten */ + unwritten = ext4_ext_is_unwritten(ext_prev); data_offset_in_page = seq_start % blocks_per_page; @@ -1432,8 +1432,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit, - &ret); + block_len_in_page, + unwritten, &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6f9e6fadac04..c4895c195e00 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3337,7 +3337,7 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting - * uninitialized extents in delalloc path. In most cases such + * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 9995d3b588a8..d4f70a7fe876 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -36,7 +36,7 @@ struct extent_status; #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ - { EXT4_GET_BLOCKS_UNINIT_EXT, "UNINIT" }, \ + { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ @@ -1496,7 +1496,7 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_ARGS(inode) ); -/* 'ux' is the uninitialized extent. */ +/* 'ux' is the unwritten extent. */ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), @@ -1532,7 +1532,7 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, ); /* - * 'ux' is the uninitialized extent. + * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. */ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, @@ -1810,7 +1810,7 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_ARGS(sb, group, start, len) ); -TRACE_EVENT(ext4_ext_handle_uninitialized_extents, +TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), -- cgit From 111c0cf566777ebbe026228b72df95788e771831 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 22 Apr 2014 13:23:17 +0200 Subject: ASoC: Remove ASoC level IO tracing The ASoC framework is in the process of migrating all IO operations to regmap. regmap has its own more sophisticated tracing infrastructure for IO operations, which means that the ASoC level IO tracing becomes redundant, hence this patch removes them. There are still a handful of ASoC drivers left that do not use regmap yet, but hopefully the removal of the ASoC IO tracing will be an additional incentive to switch to regmap. Signed-off-by: Lars-Peter Clausen Signed-off-by: Mark Brown --- include/trace/events/asoc.h | 92 --------------------------------------------- sound/soc/soc-io.c | 11 ------ 2 files changed, 103 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index 03996b2bb04f..c75c795a377b 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -11,102 +11,10 @@ struct snd_soc_jack; struct snd_soc_codec; -struct snd_soc_platform; struct snd_soc_card; struct snd_soc_dapm_widget; struct snd_soc_dapm_path; -/* - * Log register events - */ -DECLARE_EVENT_CLASS(snd_soc_reg, - - TP_PROTO(struct snd_soc_codec *codec, unsigned int reg, - unsigned int val), - - TP_ARGS(codec, reg, val), - - TP_STRUCT__entry( - __string( name, codec->name ) - __field( int, id ) - __field( unsigned int, reg ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __assign_str(name, codec->name); - __entry->id = codec->id; - __entry->reg = reg; - __entry->val = val; - ), - - TP_printk("codec=%s.%d reg=%x val=%x", __get_str(name), - (int)__entry->id, (unsigned int)__entry->reg, - (unsigned int)__entry->val) -); - -DEFINE_EVENT(snd_soc_reg, snd_soc_reg_write, - - TP_PROTO(struct snd_soc_codec *codec, unsigned int reg, - unsigned int val), - - TP_ARGS(codec, reg, val) - -); - -DEFINE_EVENT(snd_soc_reg, snd_soc_reg_read, - - TP_PROTO(struct snd_soc_codec *codec, unsigned int reg, - unsigned int val), - - TP_ARGS(codec, reg, val) - -); - -DECLARE_EVENT_CLASS(snd_soc_preg, - - TP_PROTO(struct snd_soc_platform *platform, unsigned int reg, - unsigned int val), - - TP_ARGS(platform, reg, val), - - TP_STRUCT__entry( - __string( name, platform->name ) - __field( int, id ) - __field( unsigned int, reg ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __assign_str(name, platform->name); - __entry->id = platform->id; - __entry->reg = reg; - __entry->val = val; - ), - - TP_printk("platform=%s.%d reg=%x val=%x", __get_str(name), - (int)__entry->id, (unsigned int)__entry->reg, - (unsigned int)__entry->val) -); - -DEFINE_EVENT(snd_soc_preg, snd_soc_preg_write, - - TP_PROTO(struct snd_soc_platform *platform, unsigned int reg, - unsigned int val), - - TP_ARGS(platform, reg, val) - -); - -DEFINE_EVENT(snd_soc_preg, snd_soc_preg_read, - - TP_PROTO(struct snd_soc_platform *platform, unsigned int reg, - unsigned int val), - - TP_ARGS(platform, reg, val) - -); - DECLARE_EVENT_CLASS(snd_soc_card, TP_PROTO(struct snd_soc_card *card, int val), diff --git a/sound/soc/soc-io.c b/sound/soc/soc-io.c index 2ead9edd0d63..7767fbd73eb7 100644 --- a/sound/soc/soc-io.c +++ b/sound/soc/soc-io.c @@ -17,8 +17,6 @@ #include #include -#include - /** * snd_soc_component_read() - Read register value * @component: Component to read from @@ -39,8 +37,6 @@ int snd_soc_component_read(struct snd_soc_component *component, else ret = -EIO; - dev_dbg(component->dev, "read %x => %x\n", reg, *val); - return ret; } EXPORT_SYMBOL_GPL(snd_soc_component_read); @@ -56,8 +52,6 @@ EXPORT_SYMBOL_GPL(snd_soc_component_read); int snd_soc_component_write(struct snd_soc_component *component, unsigned int reg, unsigned int val) { - dev_dbg(component->dev, "write %x = %x\n", reg, val); - if (component->regmap) return regmap_write(component->regmap, reg, val); else if (component->write) @@ -207,7 +201,6 @@ unsigned int snd_soc_read(struct snd_soc_codec *codec, unsigned int reg) ret = snd_soc_component_read(&codec->component, reg, &val); if (ret < 0) return -1; - trace_snd_soc_reg_read(codec, reg, val); return val; } @@ -216,7 +209,6 @@ EXPORT_SYMBOL_GPL(snd_soc_read); int snd_soc_write(struct snd_soc_codec *codec, unsigned int reg, unsigned int val) { - trace_snd_soc_reg_write(codec, reg, val); return snd_soc_component_write(&codec->component, reg, val); } EXPORT_SYMBOL_GPL(snd_soc_write); @@ -269,8 +261,6 @@ int snd_soc_platform_read(struct snd_soc_platform *platform, if (ret < 0) return -1; - trace_snd_soc_preg_read(platform, reg, val); - return val; } EXPORT_SYMBOL_GPL(snd_soc_platform_read); @@ -278,7 +268,6 @@ EXPORT_SYMBOL_GPL(snd_soc_platform_read); int snd_soc_platform_write(struct snd_soc_platform *platform, unsigned int reg, unsigned int val) { - trace_snd_soc_preg_write(platform, reg, val); return snd_soc_component_write(&platform->component, reg, val); } EXPORT_SYMBOL_GPL(snd_soc_platform_write); -- cgit From 62aed044ea4bef36ac3f6885611b013b11c9be2d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 May 2014 16:46:04 +0800 Subject: f2fs: add a tracepoint for f2fs_write_begin This patch adds a tracepoint for f2fs_write_begin to trace write op of user. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ include/trace/events/f2fs.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/trace') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a7bb98f5831a..cde0d69969b9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -900,6 +900,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; + trace_f2fs_write_begin(inode, pos, len, flags); + f2fs_balance_fs(sbi); repeat: err = f2fs_convert_inline_data(inode, pos + len); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 67f38faac589..1e9375aa9340 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -659,6 +659,36 @@ DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, TP_CONDITION(bio) ); +TRACE_EVENT(f2fs_write_begin, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, pos) + __field(unsigned int, len) + __field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, flags = %u", + show_dev_ino(__entry), + (unsigned long long)__entry->pos, + __entry->len, + __entry->flags) +); + DECLARE_EVENT_CLASS(f2fs__page, TP_PROTO(struct page *page, int type), -- cgit From dfb2bf38bf89e15a65f9996566b2945921388a2f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 May 2014 16:47:23 +0800 Subject: f2fs: add a tracepoint for f2fs_write_end This patch adds a tracepoint for f2fs_write_end to trace write op of user. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ include/trace/events/f2fs.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/trace') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cde0d69969b9..ea58fb698ac6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -989,6 +989,8 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_f2fs_write_end(inode, pos, len, copied); + SetPageUptodate(page); set_page_dirty(page); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1e9375aa9340..483804b55742 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -689,6 +689,36 @@ TRACE_EVENT(f2fs_write_begin, __entry->flags) ); +TRACE_EVENT(f2fs_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, pos) + __field(unsigned int, len) + __field(unsigned int, copied) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, copied = %u", + show_dev_ino(__entry), + (unsigned long long)__entry->pos, + __entry->len, + __entry->copied) +); + DECLARE_EVENT_CLASS(f2fs__page, TP_PROTO(struct page *page, int type), -- cgit From ecda0de3430455378f1c02523bf3ad71d91d613a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 May 2014 16:48:26 +0800 Subject: f2fs: add a tracepoint for f2fs_write_{meta,node,data}_page This patch adds a tracepoint for f2fs_write_{meta,node,data}_page to trace when page is writting out. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 2 ++ fs/f2fs/node.c | 2 ++ include/trace/events/f2fs.h | 7 +++++++ 4 files changed, 13 insertions(+) (limited to 'include/trace') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4206f1664be5..c95d62281d7e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -157,6 +157,8 @@ static int f2fs_write_meta_page(struct page *page, struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + trace_f2fs_writepage(page, META); + if (unlikely(sbi->por_doing)) goto redirty_out; if (wbc->for_reclaim) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ea58fb698ac6..b997d552880e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -788,6 +788,8 @@ static int f2fs_write_data_page(struct page *page, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, }; + trace_f2fs_writepage(page, DATA); + if (page->index < end_index) goto write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 059aaf5dda2b..49bdddbcadea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1199,6 +1199,8 @@ static int f2fs_write_node_page(struct page *page, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, }; + trace_f2fs_writepage(page, NODE); + if (unlikely(sbi->por_doing)) goto redirty_out; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 483804b55742..d70991e69e58 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -751,6 +751,13 @@ DECLARE_EVENT_CLASS(f2fs__page, __entry->dirty) ); +DEFINE_EVENT(f2fs__page, f2fs_writepage, + + TP_PROTO(struct page *page, int type), + + TP_ARGS(page, type) +); + DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty, TP_PROTO(struct page *page, int type), -- cgit From e57484343898094bb8f72a2aa1a50929d27aa027 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 May 2014 16:51:24 +0800 Subject: f2fs: add a tracepoint for f2fs_write_{meta,node,data}_pages This patch adds a tracepoint for f2fs_write_{meta,node,data}_pages to trace when pages are fsyncing/flushing. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 2 ++ fs/f2fs/node.c | 2 ++ include/trace/events/f2fs.h | 64 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) (limited to 'include/trace') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c95d62281d7e..fe968c7bfc90 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -186,6 +186,8 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); long diff, written; + trace_f2fs_writepages(mapping->host, wbc, META); + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b997d552880e..21bfafaafe83 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -860,6 +860,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, int ret; long diff; + trace_f2fs_writepages(mapping->host, wbc, DATA); + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 49bdddbcadea..3d60d3d34ed2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1242,6 +1242,8 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); long diff; + trace_f2fs_writepages(mapping->host, wbc, NODE); + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d70991e69e58..91b1fcc5ec93 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -772,6 +772,70 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); +TRACE_EVENT(f2fs_writepages, + + TP_PROTO(struct inode *inode, struct writeback_control *wbc, int type), + + TP_ARGS(inode, wbc, type), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(int, type) + __field(int, dir) + __field(long, nr_to_write) + __field(long, pages_skipped) + __field(loff_t, range_start) + __field(loff_t, range_end) + __field(pgoff_t, writeback_index) + __field(int, sync_mode) + __field(char, for_kupdate) + __field(char, for_background) + __field(char, tagged_writepages) + __field(char, for_reclaim) + __field(char, range_cyclic) + __field(char, for_sync) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->type = type; + __entry->dir = S_ISDIR(inode->i_mode); + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->range_start = wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->writeback_index = inode->i_mapping->writeback_index; + __entry->sync_mode = wbc->sync_mode; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_background = wbc->for_background; + __entry->tagged_writepages = wbc->tagged_writepages; + __entry->for_reclaim = wbc->for_reclaim; + __entry->range_cyclic = wbc->range_cyclic; + __entry->for_sync = wbc->for_sync; + ), + + TP_printk("dev = (%d,%d), ino = %lu, %s, %s, nr_to_write %ld, " + "skipped %ld, start %lld, end %lld, wb_idx %lu, sync_mode %d, " + "kupdate %u background %u tagged %u reclaim %u cyclic %u sync %u", + show_dev_ino(__entry), + show_block_type(__entry->type), + show_file_type(__entry->dir), + __entry->nr_to_write, + __entry->pages_skipped, + __entry->range_start, + __entry->range_end, + (unsigned long)__entry->writeback_index, + __entry->sync_mode, + __entry->for_kupdate, + __entry->for_background, + __entry->tagged_writepages, + __entry->for_reclaim, + __entry->range_cyclic, + __entry->for_sync) +); + TRACE_EVENT(f2fs_submit_page_mbio, TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), -- cgit From c20e89cde669799eff62bf8c00ca9a4819c4e11f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 May 2014 16:53:08 +0800 Subject: f2fs: add a tracepoint for f2fs_read_data_page This patch adds a tracepoint for f2fs_read_data_page to trace when page is readed by user. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ include/trace/events/f2fs.h | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 21bfafaafe83..8c250a5d6f26 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -713,6 +713,8 @@ static int f2fs_read_data_page(struct file *file, struct page *page) struct inode *inode = page->mapping->host; int ret; + trace_f2fs_readpage(page, DATA); + /* If the file has inline data, try to read it directlly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 91b1fcc5ec93..b983990b4a9f 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -732,6 +732,7 @@ DECLARE_EVENT_CLASS(f2fs__page, __field(int, dir) __field(pgoff_t, index) __field(int, dirty) + __field(int, uptodate) ), TP_fast_assign( @@ -741,14 +742,17 @@ DECLARE_EVENT_CLASS(f2fs__page, __entry->dir = S_ISDIR(page->mapping->host->i_mode); __entry->index = page->index; __entry->dirty = PageDirty(page); + __entry->uptodate = PageUptodate(page); ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, dirty = %d", + TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, " + "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), (unsigned long)__entry->index, - __entry->dirty) + __entry->dirty, + __entry->uptodate) ); DEFINE_EVENT(f2fs__page, f2fs_writepage, @@ -758,6 +762,13 @@ DEFINE_EVENT(f2fs__page, f2fs_writepage, TP_ARGS(page, type) ); +DEFINE_EVENT(f2fs__page, f2fs_readpage, + + TP_PROTO(struct page *page, int type), + + TP_ARGS(page, type) +); + DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty, TP_PROTO(struct page *page, int type), -- cgit From 4449bf927b61bdb4389393c6fea6837214d1ace7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 13:10:24 -0400 Subject: tracing: Add __bitmask() macro to trace events to cpumasks and other bitmasks Being able to show a cpumask of events can be useful as some events may affect only some CPUs. There is no standard way to record the cpumask and converting it to a string is rather expensive during the trace as traces happen in hotpaths. It would be better to record the raw event mask and be able to parse it at print time. The following macros were added for use with the TRACE_EVENT() macro: __bitmask() __assign_bitmask() __get_bitmask() To test this, I added this to the sched_migrate_task event, which looked like this: TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu, const struct cpumask *cpus), TP_ARGS(p, dest_cpu, cpus), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) __bitmask( cpumask, num_possible_cpus() ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; __assign_bitmask(cpumask, cpumask_bits(cpus), num_possible_cpus()); ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d cpumask=%s", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu, __get_bitmask(cpumask)) ); With the output of: ksmtuned-3613 [003] d..2 485.220508: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=3 dest_cpu=2 cpumask=00000000,0000000f migration/1-13 [001] d..5 485.221202: sched_migrate_task: comm=ksmtuned pid=3614 prio=120 orig_cpu=1 dest_cpu=0 cpumask=00000000,0000000f awk-3615 [002] d.H5 485.221747: sched_migrate_task: comm=rcu_preempt pid=7 prio=120 orig_cpu=0 dest_cpu=1 cpumask=00000000,000000ff migration/2-18 [002] d..5 485.222062: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=2 dest_cpu=3 cpumask=00000000,0000000f Link: http://lkml.kernel.org/r/1399377998-14870-6-git-send-email-javi.merino@arm.com Link: http://lkml.kernel.org/r/20140506132238.22e136d1@gandalf.local.home Suggested-by: Javi Merino Tested-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ include/linux/trace_seq.h | 10 ++++++++ include/trace/ftrace.h | 57 +++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_output.c | 41 +++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d16da3e53bc7..cff3106ffe2c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -38,6 +38,9 @@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, *symbol_array); #endif +const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); + const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index a32d86ec8bf2..136116924d8d 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -46,6 +46,9 @@ extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, extern void *trace_seq_reserve(struct trace_seq *s, size_t len); extern int trace_seq_path(struct trace_seq *s, const struct path *path); +extern int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits); + #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { @@ -57,6 +60,13 @@ trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) return 0; } +static inline int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + return 0; +} + static inline int trace_print_seq(struct seq_file *m, struct trace_seq *s) { return 0; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0a1a4f7caf09..9b7a989dcbcc 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -53,6 +53,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -128,6 +131,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ struct ftrace_data_offsets_##call { \ @@ -200,6 +206,15 @@ #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) \ + ({ \ + void *__bitmask = __get_dynamic_array(field); \ + unsigned int __bitmask_size; \ + __bitmask_size = (__entry->__data_loc_##field >> 16) & 0xffff; \ + ftrace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ + }) + #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ @@ -322,6 +337,9 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ static int notrace __init \ @@ -372,6 +390,29 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #define __string(item, src) __dynamic_array(char, item, \ strlen((src) ? (const char *)(src) : "(null)") + 1) +/* + * __bitmask_size_in_bytes_raw is the number of bytes needed to hold + * num_possible_cpus(). + */ +#define __bitmask_size_in_bytes_raw(nr_bits) \ + (((nr_bits) + 7) / 8) + +#define __bitmask_size_in_longs(nr_bits) \ + ((__bitmask_size_in_bytes_raw(nr_bits) + \ + ((BITS_PER_LONG / 8) - 1)) / (BITS_PER_LONG / 8)) + +/* + * __bitmask_size_in_bytes is the number of bytes needed to hold + * num_possible_cpus() padded out to the nearest long. This is what + * is saved in the buffer, just to be consistent. + */ +#define __bitmask_size_in_bytes(nr_bits) \ + (__bitmask_size_in_longs(nr_bits) * (BITS_PER_LONG / 8)) + +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ + __bitmask_size_in_longs(nr_bits)) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static inline notrace int ftrace_get_offsets_##call( \ @@ -513,12 +554,22 @@ static inline notrace int ftrace_get_offsets_##call( \ __entry->__data_loc_##item = __data_offsets.item; #undef __string -#define __string(item, src) __dynamic_array(char, item, -1) \ +#define __string(item, src) __dynamic_array(char, item, -1) #undef __assign_str #define __assign_str(dst, src) \ strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __assign_bitmask +#define __assign_bitmask(dst, src, nr_bits) \ + memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) + #undef TP_fast_assign #define TP_fast_assign(args...) args @@ -586,6 +637,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_hex #undef __get_dynamic_array #undef __get_str +#undef __get_bitmask #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) @@ -651,6 +703,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) (__addr = (a)) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -125,6 +125,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + /** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor @@ -398,6 +426,19 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { -- cgit From 62af4f1f7df44ea0bb1a11c94ac9fb384bf1c564 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 May 2014 14:13:05 -0400 Subject: locks: add some tracepoints in the lease handling code v2: add a __break_lease tracepoint for non-blocking case Recently, I needed these to help track down a softlockup when recalling a delegation, but they might be helpful in other situations as well. Cc: "J. Bruce Fields" Signed-off-by: Jeff Layton --- fs/locks.c | 11 +++++ include/trace/events/filelock.h | 96 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 include/trace/events/filelock.h (limited to 'include/trace') diff --git a/fs/locks.c b/fs/locks.c index facf76d9fcb4..da57c9b7e844 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -130,6 +130,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) @@ -1287,6 +1290,7 @@ static void time_out_leases(struct inode *inode) before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(before, F_RDLCK); if (past_time(fl->fl_break_time)) @@ -1374,6 +1378,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } if (i_have_this_lease || (mode & O_NONBLOCK)) { + trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } @@ -1385,10 +1390,12 @@ restart: if (break_time == 0) break_time++; locks_insert_block(flock, new_fl); + trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); spin_lock(&inode->i_lock); + trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { if (error == 0) @@ -1510,6 +1517,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp int error; lease = *flp; + trace_generic_add_lease(inode, lease); + /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock @@ -1599,6 +1608,8 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + trace_generic_delete_lease(inode, *flp); + for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h new file mode 100644 index 000000000000..59d11c22f076 --- /dev/null +++ b/include/trace/events/filelock.h @@ -0,0 +1,96 @@ +/* + * Events for filesystem locks + * + * Copyright 2013 Jeff Layton + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM filelock + +#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FILELOCK_H + +#include +#include +#include +#include + +#define show_fl_flags(val) \ + __print_flags(val, "|", \ + { FL_POSIX, "FL_POSIX" }, \ + { FL_FLOCK, "FL_FLOCK" }, \ + { FL_DELEG, "FL_DELEG" }, \ + { FL_ACCESS, "FL_ACCESS" }, \ + { FL_EXISTS, "FL_EXISTS" }, \ + { FL_LEASE, "FL_LEASE" }, \ + { FL_CLOSE, "FL_CLOSE" }, \ + { FL_SLEEP, "FL_SLEEP" }, \ + { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ + { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ + { FL_OFDLCK, "FL_OFDLCK" }) + +#define show_fl_type(val) \ + __print_symbolic(val, \ + { F_RDLCK, "F_RDLCK" }, \ + { F_WRLCK, "F_WRLCK" }, \ + { F_UNLCK, "F_UNLCK" }) + +DECLARE_EVENT_CLASS(filelock_lease, + + TP_PROTO(struct inode *inode, struct file_lock *fl), + + TP_ARGS(inode, fl), + + TP_STRUCT__entry( + __field(struct file_lock *, fl) + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(struct file_lock *, fl_next) + __field(fl_owner_t, fl_owner) + __field(unsigned int, fl_flags) + __field(unsigned char, fl_type) + __field(unsigned long, fl_break_time) + __field(unsigned long, fl_downgrade_time) + ), + + TP_fast_assign( + __entry->fl = fl; + __entry->s_dev = inode->i_sb->s_dev; + __entry->i_ino = inode->i_ino; + __entry->fl_next = fl->fl_next; + __entry->fl_owner = fl->fl_owner; + __entry->fl_flags = fl->fl_flags; + __entry->fl_type = fl->fl_type; + __entry->fl_break_time = fl->fl_break_time; + __entry->fl_downgrade_time = fl->fl_downgrade_time; + ), + + TP_printk("fl=0x%p dev=0x%x:0x%x ino=0x%lx fl_next=0x%p fl_owner=0x%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", + __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, __entry->fl_next, __entry->fl_owner, + show_fl_flags(__entry->fl_flags), + show_fl_type(__entry->fl_type), + __entry->fl_break_time, __entry->fl_downgrade_time) +); + +DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +DEFINE_EVENT(filelock_lease, generic_add_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl), + TP_ARGS(inode, fl)); + +#endif /* _TRACE_FILELOCK_H */ + +/* This part must be outside protection */ +#include -- cgit From beba4bb096201ceec0e8cfb7ce3172a53015bdaf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 4 Jun 2014 14:29:33 -0400 Subject: tracing: Add __get_dynamic_array_len() macro for trace events If a trace event uses a dynamic array for something other than a string then there's currently no way the TP_printk() can figure out what size it is. A __get_dynamic_array_len() is required to know the length. This also simplifies the __get_bitmask() macro which required it as well, but instead just hardcoded it. Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 9b7a989dcbcc..0fd06fef9fac 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -203,6 +203,10 @@ #define __get_dynamic_array(field) \ ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) +#undef __get_dynamic_array_len +#define __get_dynamic_array_len(field) \ + ((__entry->__data_loc_##field >> 16) & 0xffff) + #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) @@ -211,7 +215,7 @@ ({ \ void *__bitmask = __get_dynamic_array(field); \ unsigned int __bitmask_size; \ - __bitmask_size = (__entry->__data_loc_##field >> 16) & 0xffff; \ + __bitmask_size = __get_dynamic_array_len(field); \ ftrace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ }) @@ -636,6 +640,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_symbolic #undef __print_hex #undef __get_dynamic_array +#undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask @@ -700,6 +705,10 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #define __get_dynamic_array(field) \ ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) +#undef __get_dynamic_array_len +#define __get_dynamic_array_len(field) \ + ((__entry->__data_loc_##field >> 16) & 0xffff) + #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) -- cgit From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. [sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++--- include/linux/memcontrol.h | 2 +- include/linux/slab.h | 11 +------- include/linux/thread_info.h | 2 -- include/trace/events/gfpflags.h | 1 - kernel/fork.c | 6 ++--- mm/memcontrol.c | 11 ++++---- mm/page_alloc.c | 56 +++++++++++++++++++++++++---------------- mm/slab_common.c | 13 ++++++++++ mm/slub.c | 6 ++--- 10 files changed, 68 insertions(+), 50 deletions(-) (limited to 'include/trace') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc7d01a..d382db71e300 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,7 +31,6 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -91,7 +90,6 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node) +extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); +extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, + unsigned int order); + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); -extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); -extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +extern void __free_kmem_pages(struct page *page, unsigned int order); +extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96e5d2573eb0..5155d09e749d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * res_counter_charge_nofail, but we hope those allocations are rare, * and won't be worth the trouble. */ - if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + if (gfp & __GFP_NOFAIL) return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe62387..a6aab2c0dfc5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, #include #endif -static __always_inline void * -kmalloc_order(size_t size, gfp_t flags, unsigned int order) -{ - void *ret; - - flags |= (__GFP_COMP | __GFP_KMEMCG); - ret = (void *) __get_free_pages(flags, order); - kmemleak_alloc(ret, size, 1, flags); - return ret; -} +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec94fda3..ff307b548ed3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif -#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) - /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1557f2..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,7 +34,6 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..59e3dcc5b8f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b3d5a8..7bab1de50f48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c6125632..1950c8f4d1a6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc9831851be6..ddb60795f373 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit From 7fe7047597cf5ebb300802494db4f407327ec94f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:06 -0700 Subject: mm: shrinker trace points: fix negatives I was looking at a trace of the slab shrinkers (attachment in this comment): https://bugs.freedesktop.org/show_bug.cgi?id=72742#c67 and noticed that "total_scan" can go negative in some cases. We used to dump out the "total_scan" variable directly, but some of the shrinker modifications along the way changed that. This patch just dumps it out directly, again. It doesn't make any sense to derive it from new_nr and nr any more since there are now other shrinkers that can be running in parallel and mucking with those values. Here's an example of the negative numbers in the output: > kswapd0-840 [000] 160.869398: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 10 new scan count 39 total_scan 29 last shrinker return val 256 > kswapd0-840 [000] 160.869618: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 39 new scan count 102 total_scan 63 last shrinker return val 256 > kswapd0-840 [000] 160.870031: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 102 new scan count 47 total_scan -55 last shrinker return val 768 > kswapd0-840 [000] 160.870464: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 47 new scan count 45 total_scan -2 last shrinker return val 768 > kswapd0-840 [000] 163.384144: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 45 new scan count 56 total_scan 11 last shrinker return val 0 > kswapd0-840 [000] 163.384297: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 56 new scan count 15 total_scan -41 last shrinker return val 256 > kswapd0-840 [000] 163.384414: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 15 new scan count 117 total_scan 102 last shrinker return val 0 > kswapd0-840 [000] 163.384657: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 117 new scan count 36 total_scan -81 last shrinker return val 512 > kswapd0-840 [000] 163.384880: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 36 new scan count 111 total_scan 75 last shrinker return val 256 > kswapd0-840 [000] 163.385256: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 111 new scan count 34 total_scan -77 last shrinker return val 768 > kswapd0-840 [000] 163.385598: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 34 new scan count 122 total_scan 88 last shrinker return val 512 Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- mm/vmscan.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 132a985aba8b..1dd5e773114c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -226,9 +226,9 @@ TRACE_EVENT(mm_shrink_slab_start, TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int shrinker_retval, - long unused_scan_cnt, long new_scan_cnt), + long unused_scan_cnt, long new_scan_cnt, long total_scan), - TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -245,7 +245,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; - __entry->total_scan = new_scan_cnt - unused_scan_cnt; + __entry->total_scan = total_scan; ), TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a8776eb0f43..15e93158bd0b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); return freed; } -- cgit From df9024a8c5a3e031c5df26386f74ffed1b8fc095 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:07 -0700 Subject: mm: shrinker: add nid to tracepoint output Now that we are doing NUMA-aware shrinking, and can have shrinkers running in parallel, or working on individual nodes, it seems like we should also be sticking the node in the output. Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 15 +++++++++++---- mm/vmscan.c | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 1dd5e773114c..69590b6ffc09 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -191,6 +191,7 @@ TRACE_EVENT(mm_shrink_slab_start, TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) + __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) __field(unsigned long, pgs_scanned) @@ -203,6 +204,7 @@ TRACE_EVENT(mm_shrink_slab_start, TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; + __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; __entry->pgs_scanned = pgs_scanned; @@ -212,9 +214,10 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", __entry->shrink, __entry->shr, + __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->pgs_scanned, @@ -225,13 +228,15 @@ TRACE_EVENT(mm_shrink_slab_start, ); TRACE_EVENT(mm_shrink_slab_end, - TP_PROTO(struct shrinker *shr, int shrinker_retval, + TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan), - TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), + TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, + total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) + __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) @@ -241,6 +246,7 @@ TRACE_EVENT(mm_shrink_slab_end, TP_fast_assign( __entry->shr = shr; + __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; @@ -248,9 +254,10 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, + __entry->nid, __entry->unused_scan, __entry->new_scan, __entry->total_scan, diff --git a/mm/vmscan.c b/mm/vmscan.c index 15e93158bd0b..9253e188000f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } -- cgit From f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:08:32 -0700 Subject: mm/compaction: do not count migratepages when unnecessary During compaction, update_nr_listpages() has been used to count remaining non-migrated and free pages after a call to migrage_pages(). The freepages counting has become unneccessary, and it turns out that migratepages counting is also unnecessary in most cases. The only situation when it's needed to count cc->migratepages is when migrate_pages() returns with a negative error code. Otherwise, the non-negative return value is the number of pages that were not migrated, which is exactly the count of remaining pages in the cc->migratepages list. Furthermore, any non-zero count is only interesting for the tracepoint of mm_compaction_migratepages events, because after that all remaining unmigrated pages are put back and their count is set to 0. This patch therefore removes update_nr_listpages() completely, and changes the tracepoint definition so that the manual counting is done only when the tracepoint is enabled, and only when migrate_pages() returns a negative error code. Furthermore, migrate_pages() and the tracepoints won't be called when there's nothing to migrate. This potentially avoids some wasted cycles and reduces the volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0 nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events. The mm_compaction_isolate_migratepages event is better for determining that nothing was isolated for migration, and this one was just duplicating the info. Signed-off-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Acked-by: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 25 +++++++++++++++++++++---- mm/compaction.c | 31 +++++++------------------------ 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 06f544ef2f6f..c6814b917bdf 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -5,6 +5,7 @@ #define _TRACE_COMPACTION_H #include +#include #include #include @@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TRACE_EVENT(mm_compaction_migratepages, - TP_PROTO(unsigned long nr_migrated, - unsigned long nr_failed), + TP_PROTO(unsigned long nr_all, + int migrate_rc, + struct list_head *migratepages), - TP_ARGS(nr_migrated, nr_failed), + TP_ARGS(nr_all, migrate_rc, migratepages), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages, ), TP_fast_assign( - __entry->nr_migrated = nr_migrated; + unsigned long nr_failed = 0; + struct list_head *page_lru; + + /* + * migrate_pages() returns either a non-negative number + * with the number of pages that failed migration, or an + * error code, in which case we need to count the remaining + * pages manually + */ + if (migrate_rc >= 0) + nr_failed = migrate_rc; + else + list_for_each(page_lru, migratepages) + nr_failed++; + + __entry->nr_migrated = nr_all - nr_failed; __entry->nr_failed = nr_failed; ), diff --git a/mm/compaction.c b/mm/compaction.c index 56331f5124ba..3c60e3d5237e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -822,22 +822,6 @@ static void compaction_free(struct page *page, unsigned long data) cc->nr_freepages++; } -/* - * We cannot control nr_migratepages fully when migration is running as - * migrate_pages() has no knowledge of of compact_control. When migration is - * complete, we count the number of pages on the list by hand. - */ -static void update_nr_listpages(struct compact_control *cc) -{ - int nr_migratepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - - cc->nr_migratepages = nr_migratepages; -} - /* possible outcome of isolate_migratepages */ typedef enum { ISOLATE_ABORT, /* Abort compaction now */ @@ -1032,7 +1016,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1047,20 +1030,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it -- cgit From dfc68f29ae67f2a6e799b44e6a4eb3417dffbfcd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:15 -0700 Subject: sched, trace: Add a tracepoint for IPI-less remote wakeups Remote wakeups of polling CPUs are a valuable performance improvement; add a tracepoint to make it much easier to verify that they're working. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: David Ahern Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16205aee116772aa686814f9b13bccb562108047.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 67e1bbf83695..0a68d5ae584e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); + +/* + * Tracepoint for waking a polling cpu without an IPI. + */ +TRACE_EVENT(sched_wake_idle_without_ipi, + + TP_PROTO(int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field( int, cpu ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d", __entry->cpu) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5976ca579d3e..e4c0ddd3db8e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -564,6 +564,8 @@ void resched_task(struct task_struct *p) if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -647,6 +649,8 @@ static void wake_up_idle_cpu(int cpu) smp_mb(); if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) -- cgit From bb3632c6101b2fad07e6246721466b984b1e0e9d Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 6 Jun 2014 05:40:17 -0700 Subject: PM / sleep: trace events for suspend/resume Adds trace events that give finer resolution into suspend/resume. These events are graphed in the timelines generated by the analyze_suspend.py script. They represent large areas of time consumed that are typical to suspend and resume. The event is triggered by calling the function "trace_suspend_resume" with three arguments: a string (the name of the event to be displayed in the timeline), an integer (case specific number, such as the power state or cpu number), and a boolean (where true is used to denote the start of the timeline event, and false to denote the end). The suspend_resume trace event reproduces the data that the machine_suspend trace event did, so the latter has been removed. Signed-off-by: Todd Brandt Acked-by: Steven Rostedt Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 3 +++ drivers/base/power/main.c | 16 ++++++++++++++++ drivers/base/syscore.c | 5 +++++ include/trace/events/power.h | 42 +++++++++++++++++++++++++----------------- kernel/cpu.c | 5 +++++ kernel/power/hibernate.c | 3 +++ kernel/power/process.c | 3 +++ kernel/power/suspend.c | 14 ++++++++++++-- 8 files changed, 72 insertions(+), 19 deletions(-) (limited to 'include/trace') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c11e3795431b..b3e3cc73ba79 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "sleep.h" @@ -501,6 +502,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) ACPI_FLUSH_CPU_CACHE(); + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, true); switch (acpi_state) { case ACPI_STATE_S1: barrier(); @@ -516,6 +518,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) pr_info(PREFIX "Low-level resume complete\n"); break; } + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, false); /* This violates the spec but is required for bug compatibility. */ acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 343ffad59377..de3fe4fe350a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -545,6 +545,7 @@ static void dpm_resume_noirq(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -587,6 +588,7 @@ static void dpm_resume_noirq(pm_message_t state) dpm_show_time(starttime, state, "noirq"); resume_device_irqs(); cpuidle_resume(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** @@ -664,6 +666,7 @@ static void dpm_resume_early(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -703,6 +706,7 @@ static void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, "early"); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** @@ -834,6 +838,7 @@ void dpm_resume(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -875,6 +880,7 @@ void dpm_resume(pm_message_t state) dpm_show_time(starttime, state, NULL); cpufreq_resume(); + trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** @@ -932,6 +938,7 @@ void dpm_complete(pm_message_t state) { struct list_head list; + trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); @@ -951,6 +958,7 @@ void dpm_complete(pm_message_t state) } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** @@ -1086,6 +1094,7 @@ static int dpm_suspend_noirq(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); cpuidle_pause(); suspend_device_irqs(); mutex_lock(&dpm_list_mtx); @@ -1126,6 +1135,7 @@ static int dpm_suspend_noirq(pm_message_t state) } else { dpm_show_time(starttime, state, "noirq"); } + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } @@ -1222,6 +1232,7 @@ static int dpm_suspend_late(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -1257,6 +1268,7 @@ static int dpm_suspend_late(pm_message_t state) } else { dpm_show_time(starttime, state, "late"); } + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } @@ -1461,6 +1473,7 @@ int dpm_suspend(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); cpufreq_suspend(); @@ -1498,6 +1511,7 @@ int dpm_suspend(pm_message_t state) dpm_save_failed_step(SUSPEND_SUSPEND); } else dpm_show_time(starttime, state, NULL); + trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } @@ -1582,6 +1596,7 @@ int dpm_prepare(pm_message_t state) { int error = 0; + trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -1612,6 +1627,7 @@ int dpm_prepare(pm_message_t state) put_device(dev); } mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index e8d11b6630ee..dbb8350ea8dc 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -10,6 +10,7 @@ #include #include #include +#include static LIST_HEAD(syscore_ops_list); static DEFINE_MUTEX(syscore_ops_lock); @@ -49,6 +50,7 @@ int syscore_suspend(void) struct syscore_ops *ops; int ret = 0; + trace_suspend_resume(TPS("syscore_suspend"), 0, true); pr_debug("Checking wakeup interrupts\n"); /* Return error code if there are any wakeup interrupts pending. */ @@ -70,6 +72,7 @@ int syscore_suspend(void) "Interrupts enabled after %pF\n", ops->suspend); } + trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: @@ -92,6 +95,7 @@ void syscore_resume(void) { struct syscore_ops *ops; + trace_suspend_resume(TPS("syscore_resume"), 0, true); WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core resume.\n"); @@ -103,6 +107,7 @@ void syscore_resume(void) WARN_ONCE(!irqs_disabled(), "Interrupts enabled after %pF\n", ops->resume); } + trace_suspend_resume(TPS("syscore_resume"), 0, false); } EXPORT_SYMBOL_GPL(syscore_resume); #endif /* CONFIG_PM_SLEEP */ diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9a7e08d61258..f88c8573e66c 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -7,6 +7,9 @@ #include #include #include +#include + +#define TPS(x) tracepoint_string(x) DECLARE_EVENT_CLASS(cpu, @@ -97,23 +100,6 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); -TRACE_EVENT(machine_suspend, - - TP_PROTO(unsigned int state), - - TP_ARGS(state), - - TP_STRUCT__entry( - __field( u32, state ) - ), - - TP_fast_assign( - __entry->state = state; - ), - - TP_printk("state=%lu", (unsigned long)__entry->state) -); - TRACE_EVENT(device_pm_report_time, TP_PROTO(struct device *dev, const char *pm_ops, s64 ops_time, @@ -151,6 +137,28 @@ TRACE_EVENT(device_pm_report_time, __entry->ops_time, __entry->error) ); +TRACE_EVENT(suspend_resume, + + TP_PROTO(const char *action, int val, bool start), + + TP_ARGS(action, val, start), + + TP_STRUCT__entry( + __field(const char *, action) + __field(int, val) + __field(bool, start) + ), + + TP_fast_assign( + __entry->action = action; + __entry->val = val; + __entry->start = start; + ), + + TP_printk("%s[%u] %s", __entry->action, (unsigned int)__entry->val, + (__entry->start)?"begin":"end") +); + DECLARE_EVENT_CLASS(wakeup_source, TP_PROTO(const char *name, unsigned int state), diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..76deba01322a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "smpboot.h" @@ -522,7 +523,9 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1); + trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { @@ -566,7 +569,9 @@ void __ref enable_nonboot_cpus(void) arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { + trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1); + trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { printk(KERN_INFO "CPU%d is up\n", cpu); continue; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index df88d55dc436..49e0a20fd010 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -292,7 +293,9 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf1..0ca8d83e2369 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void) struct task_struct *g, *p; struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; @@ -201,6 +203,7 @@ void thaw_processes(void) schedule(); printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 963e6d0f050b..4dd8822f732a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Finish; + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; @@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); goto Platform_wake; } @@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); events_check_enabled = false; } syscore_resume(); @@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state) if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; - trace_machine_suspend(state); if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state) else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) freeze_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: @@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state) { int error; + trace_suspend_resume(TPS("suspend_enter"), state, true); if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { @@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); + trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); @@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; + trace_suspend_resume(TPS("suspend_enter"), state, false); pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); -- cgit From e8bca479c3f269ebb3a3acea5ef63314bb677060 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Tue, 10 Jun 2014 07:31:22 -0700 Subject: PM / sleep: trace events for device PM callbacks Adds two trace events which supply the same info that initcall_debug provides, but via ftrace instead of dmesg. The existing initcall_debug calls require the pm_print_times_enabled var to be set (either via sysfs or via the kernel cmd line). The new trace events provide all the same info as the initcall_debug prints but with less overhead, and also with coverage of device prepare and complete device callbacks. These events replace the device_pm_report_time event (which has been removed). device_pm_callback_start is called first and provides the device and callback info. device_pm_callback_end is called after with the device name and error info. The time and pid are gathered from the trace data headers. Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 14 ++++++++--- include/trace/events/power.h | 60 +++++++++++++++++++++++++++++++------------- 2 files changed, 52 insertions(+), 22 deletions(-) (limited to 'include/trace') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index de3fe4fe350a..bf412961a934 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -214,9 +214,6 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, pr_info("call %s+ returned %d after %Ld usecs\n", dev_name(dev), error, (unsigned long long)nsecs >> 10); } - - trace_device_pm_report_time(dev, info, nsecs, pm_verb(state.event), - error); } /** @@ -387,7 +384,9 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, calltime = initcall_debug_start(dev); pm_dev_dbg(dev, state, info); + trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); + trace_device_pm_callback_end(dev, error); suspend_report_result(cb, error); initcall_debug_report(dev, calltime, error, state, info); @@ -919,7 +918,9 @@ static void device_complete(struct device *dev, pm_message_t state) if (callback) { pm_dev_dbg(dev, state, info); + trace_device_pm_callback_start(dev, info, state.event); callback(dev); + trace_device_pm_callback_end(dev, 0); } device_unlock(dev); @@ -1307,7 +1308,9 @@ static int legacy_suspend(struct device *dev, pm_message_t state, calltime = initcall_debug_start(dev); + trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); + trace_device_pm_callback_end(dev, error); suspend_report_result(cb, error); initcall_debug_report(dev, calltime, error, state, info); @@ -1563,8 +1566,11 @@ static int device_prepare(struct device *dev, pm_message_t state) callback = dev->driver->pm->prepare; } - if (callback) + if (callback) { + trace_device_pm_callback_start(dev, info, state.event); ret = callback(dev); + trace_device_pm_callback_end(dev, ret); + } device_unlock(dev); diff --git a/include/trace/events/power.h b/include/trace/events/power.h index f88c8573e66c..d19840b0cac8 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -93,6 +93,17 @@ TRACE_EVENT(pstate_sample, #define PWR_EVENT_EXIT -1 #endif +#define pm_verb_symbolic(event) \ + __print_symbolic(event, \ + { PM_EVENT_SUSPEND, "suspend" }, \ + { PM_EVENT_RESUME, "resume" }, \ + { PM_EVENT_FREEZE, "freeze" }, \ + { PM_EVENT_QUIESCE, "quiesce" }, \ + { PM_EVENT_HIBERNATE, "hibernate" }, \ + { PM_EVENT_THAW, "thaw" }, \ + { PM_EVENT_RESTORE, "restore" }, \ + { PM_EVENT_RECOVER, "recover" }) + DEFINE_EVENT(cpu, cpu_frequency, TP_PROTO(unsigned int frequency, unsigned int cpu_id), @@ -100,41 +111,54 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); -TRACE_EVENT(device_pm_report_time, +TRACE_EVENT(device_pm_callback_start, - TP_PROTO(struct device *dev, const char *pm_ops, s64 ops_time, - char *pm_event_str, int error), + TP_PROTO(struct device *dev, const char *pm_ops, int event), - TP_ARGS(dev, pm_ops, ops_time, pm_event_str, error), + TP_ARGS(dev, pm_ops, event), TP_STRUCT__entry( __string(device, dev_name(dev)) __string(driver, dev_driver_string(dev)) __string(parent, dev->parent ? dev_name(dev->parent) : "none") __string(pm_ops, pm_ops ? pm_ops : "none ") - __string(pm_event_str, pm_event_str) - __field(s64, ops_time) - __field(int, error) + __field(int, event) ), TP_fast_assign( - const char *tmp = dev->parent ? dev_name(dev->parent) : "none"; - const char *tmp_i = pm_ops ? pm_ops : "none "; + __assign_str(device, dev_name(dev)); + __assign_str(driver, dev_driver_string(dev)); + __assign_str(parent, + dev->parent ? dev_name(dev->parent) : "none"); + __assign_str(pm_ops, pm_ops ? pm_ops : "none "); + __entry->event = event; + ), + + TP_printk("%s %s, parent: %s, %s[%s]", __get_str(driver), + __get_str(device), __get_str(parent), __get_str(pm_ops), + pm_verb_symbolic(__entry->event)) +); + +TRACE_EVENT(device_pm_callback_end, + + TP_PROTO(struct device *dev, int error), + TP_ARGS(dev, error), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(driver, dev_driver_string(dev)) + __field(int, error) + ), + + TP_fast_assign( __assign_str(device, dev_name(dev)); __assign_str(driver, dev_driver_string(dev)); - __assign_str(parent, tmp); - __assign_str(pm_ops, tmp_i); - __assign_str(pm_event_str, pm_event_str); - __entry->ops_time = ops_time; __entry->error = error; ), - /* ops_str has an extra space at the end */ - TP_printk("%s %s parent=%s state=%s ops=%snsecs=%lld err=%d", - __get_str(driver), __get_str(device), __get_str(parent), - __get_str(pm_event_str), __get_str(pm_ops), - __entry->ops_time, __entry->error) + TP_printk("%s %s, err=%d", + __get_str(driver), __get_str(device), __entry->error) ); TRACE_EVENT(suspend_resume, -- cgit From 4af4206be2bd1933cae20c2b6fb2058dbc887f7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:58:54 +0200 Subject: tracing: Fix syscall_*regfunc() vs copy_process() race syscall_regfunc() and syscall_unregfunc() should set/clear TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race with copy_process() and miss the new child which was not added to the process/thread lists yet. Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT under tasklist. Link: http://lkml.kernel.org/p/20140413185854.GB20668@redhat.com Cc: stable@vger.kernel.org # 2.6.33 Fixes: a871bd33a6c0 "tracing: Add syscall tracepoints" Acked-by: Frederic Weisbecker Acked-by: Paul E. McKenney Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- include/trace/syscall.h | 15 +++++++++++++++ kernel/fork.c | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/trace') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index fed853f3d7aa..9674145e2f6a 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -32,4 +33,18 @@ struct syscall_metadata { struct ftrace_event_call *exit_event; }; +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) +static inline void syscall_tracepoint_update(struct task_struct *p) +{ + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + else + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); +} +#else +static inline void syscall_tracepoint_update(struct task_struct *p) +{ +} +#endif + #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1fc952..6a13c46cd87d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) -- cgit From 4d4c9cc839a308be3289a361ccba4447ee140552 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Jun 2014 08:59:16 -0400 Subject: tracing: Add __field_struct macro for TRACE_EVENT() Currently the __field() macro in TRACE_EVENT is only good for primitive values, such as integers and pointers, but it fails on complex data types such as structures or unions. This is because the __field() macro determines if the variable is signed or not with the test of: (((type)(-1)) < (type)1) Unfortunately, that fails when type is a structure. Since trace events should support structures as fields a new macro is created for such a case called __field_struct() which acts exactly the same as __field() does but it does not do the signed type check and just uses a constant false for that answer. Cc: Tony Luck Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 33 ++++++++++++++++++++++++++++++ samples/trace_events/trace-events-sample.h | 3 ++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0fd06fef9fac..26b4f2e13275 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -44,6 +44,12 @@ #undef __field_ext #define __field_ext(type, item, filter_type) type item; +#undef __field_struct +#define __field_struct(type, item) type item; + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) type item; + #undef __array #define __array(type, item, len) type item[len]; @@ -122,6 +128,12 @@ #undef __field_ext #define __field_ext(type, item, filter_type) +#undef __field_struct +#define __field_struct(type, item) + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) @@ -315,9 +327,21 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ if (ret) \ return ret; +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + 0, filter_type); \ + if (ret) \ + return ret; + #undef __field #define __field(type, item) __field_ext(type, item, FILTER_OTHER) +#undef __field_struct +#define __field_struct(type, item) __field_struct_ext(type, item, FILTER_OTHER) + #undef __array #define __array(type, item, len) \ do { \ @@ -379,6 +403,12 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #undef __field_ext #define __field_ext(type, item, filter_type) +#undef __field_struct +#define __field_struct(type, item) + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) @@ -550,6 +580,9 @@ static inline notrace int ftrace_get_offsets_##call( \ #undef __field #define __field(type, item) +#undef __field_struct +#define __field_struct(type, item) + #undef __array #define __array(type, item, len) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 6af373236d73..4b0113f73ee9 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -56,7 +56,8 @@ * struct: This defines the way the data will be stored in the ring buffer. * There are currently two types of elements. __field and __array. * a __field is broken up into (type, name). Where type can be any - * type but an array. + * primitive type (integer, long or pointer). __field_struct() can + * be any static complex data value (struct, union, but not an array). * For an array. there are three fields. (type, name, size). The * type of elements in the array, the name of the field and the size * of the array. -- cgit