#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only
# Copyright (C) 2024 ARM Ltd.
#
# Utility providing smaps-like output detailing transparent hugepage usage.
# For more info, run:
# ./thpmaps --help
#
# Requires numpy:
# pip3 install numpy


import argparse
import collections
import math
import os
import re
import resource
import shutil
import sys
import textwrap
import time
import numpy as np


PMD_SIZE_PATH = '/sys/kernel/mm/transparent_hugepage/hpage_pmd_size'

PAGE_SIZE = resource.getpagesize()
PAGE_SHIFT = int(math.log2(PAGE_SIZE))

try:
    with open(PMD_SIZE_PATH) as f:
        PMD_SIZE = int(f.read())
    PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
except (OSError, ValueError):
    # The PMD size could not be read. Defer the failure to main() so that it
    # is reported as a normal error message (and so that --help still works)
    # rather than as a traceback while the module is being loaded.
    PMD_SIZE = None
    PMD_ORDER = None


def align_forward(v, a):
    # Round v up to the next multiple of a. a must be a power of 2.
    return (v + (a - 1)) & ~(a - 1)


def align_offset(v, a):
    # Return the offset of v within its a-sized block. a must be a power of 2.
    return v & (a - 1)


def kbnr(kb):
    # Convert KB to number of pages.
    return (kb << 10) >> PAGE_SHIFT


def nrkb(nr):
    # Convert number of pages to KB.
    return (nr << PAGE_SHIFT) >> 10


def odkb(order):
    # Convert page order to KB.
    return (PAGE_SIZE << order) >> 10


def cont_ranges_all(search, index):
    # Given a list of arrays, find the ranges for which values are monotonically
    # incrementing in all arrays. all arrays in search and index must be the
    # same size.
    #
    # Returns one (N, 2) array per entry in index; each row holds the values of
    # that index array at the first and last element of a range.
    sz = len(search[0])

    # r holds, per element, how many times it appears in the output: twice if
    # it is a range on its own, once if it starts or ends a range and not at
    # all if it is in the middle of a range.
    r = np.full(sz, 2)
    d = np.diff(search[0]) == 1
    for dd in [np.diff(arr) == 1 for arr in search[1:]]:
        d &= dd
    r[1:] -= d
    r[:-1] -= d
    return [np.repeat(arr, r).reshape(-1, 2) for arr in index]


def size2order(human):
    # Convert a human-readable size (integer with optional K, M or G suffix)
    # into a page order. Raises ArgException if the value is not an integer, is
    # not a power-of-2 number of pages, is smaller than 2 pages or is bigger
    # than PMD size.
    units = {
        "K": 2**10, "M": 2**20, "G": 2**30,
        "k": 2**10, "m": 2**20, "g": 2**30,
    }
    unit = 1
    # Slice rather than index so that an empty string is rejected below as a
    # non-integer instead of raising IndexError.
    if human[-1:] in units:
        unit = units[human[-1]]
        human = human[:-1]
    try:
        size = int(human)
    except ValueError:
        raise ArgException('error: --cont value must be integer size with optional KMG unit')
    size *= unit
    # log2() is undefined for zero and negative values, so reject them here.
    if size <= 0:
        raise ArgException('error: --cont value must be size of at least 2 pages')
    order = int(math.log2(size / PAGE_SIZE))
    if order < 1:
        raise ArgException('error: --cont value must be size of at least 2 pages')
    if (1 << order) * PAGE_SIZE != size:
        raise ArgException('error: --cont value must be size of power-of-2 pages')
    if order > PMD_ORDER:
        raise ArgException('error: --cont value must be less than or equal to PMD order')
    return order


class ArgException(Exception):
    # Raised when a command line argument has an invalid value.
    pass


class FileIOException(Exception):
    # Raised when a file could not be read as required.
    pass


class BinArrayFile:
    # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
    # numpy array. Use inherited class in a with clause to ensure file is
    # closed when it goes out of scope.
    def __init__(self, filename, element_size):
        self.element_size = element_size
        self.filename = filename
        self.fd = os.open(self.filename, os.O_RDONLY)

    def cleanup(self):
        os.close(self.fd)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def _readin(self, offset, buffer):
        # Fill buffer from the file at byte offset. A short read raises
        # FileIOException.
        length = os.preadv(self.fd, (buffer,), offset)
        if len(buffer) != length:
            raise FileIOException('error: {} failed to read {} bytes at {:x}'
                            .format(self.filename, len(buffer), offset))

    def _toarray(self, buf):
        assert self.element_size == 8
        return np.frombuffer(buf, dtype=np.uint64)

    def getv(self, vec):
        # Read multiple ranges of elements. vec is an (N, 2) array where each
        # row is the (first, last) element index of a range, inclusive. Returns
        # all the elements concatenated into a single array.
        #
        # Scale into a new array; the caller's vec must not be modified.
        vec = vec * self.element_size
        offsets = vec[:, 0]
        lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
        buf = bytearray(int(np.sum(lengths)))
        view = memoryview(buf)
        pos = 0
        for offset, length in zip(offsets, lengths):
            offset = int(offset)
            length = int(length)
            self._readin(offset, view[pos:pos+length])
            pos += length
        return self._toarray(buf)

    def get(self, index, nr=1):
        # Read nr elements starting at element index.
        offset = index * self.element_size
        length = nr * self.element_size
        buf = bytearray(length)
        self._readin(offset, buf)
        return self._toarray(buf)


PM_PAGE_PRESENT = 1 << 63
PM_PFN_MASK = (1 << 55) - 1

class PageMap(BinArrayFile):
    # Read ranges of a given pid's pagemap into a numpy array.
    def __init__(self, pid='self'):
        super().__init__(f'/proc/{pid}/pagemap', 8)


KPF_ANON = 1 << 12
KPF_COMPOUND_HEAD = 1 << 15
KPF_COMPOUND_TAIL = 1 << 16
KPF_THP = 1 << 22

class KPageFlags(BinArrayFile):
    # Read ranges of /proc/kpageflags into a numpy array.
    def __init__(self):
        super().__init__('/proc/kpageflags', 8)


vma_all_stats = set([
    "Size",
    "Rss",
    "Pss",
    "Pss_Dirty",
    "Shared_Clean",
    "Shared_Dirty",
    "Private_Clean",
    "Private_Dirty",
    "Referenced",
    "Anonymous",
    "KSM",
    "LazyFree",
    "AnonHugePages",
    "ShmemPmdMapped",
    "FilePmdMapped",
    "Shared_Hugetlb",
    "Private_Hugetlb",
    "Swap",
    "SwapPss",
    "Locked",
])

vma_min_stats = set([
    "Rss",
    "Anonymous",
    "AnonHugePages",
    "ShmemPmdMapped",
    "FilePmdMapped",
])

VMA = collections.namedtuple('VMA', [
    'name',
    'start',
    'end',
    'read',
    'write',
    'execute',
    'private',
    'pgoff',
    'major',
    'minor',
    'inode',
    'stats',
])

class VMAList:
    # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
    # instance to receive VMAs. stats is the collection of smaps field names to
    # record for each VMA.
    def __init__(self, pid='self', stats=()):
        self.vmas = []
        with open(f'/proc/{pid}/smaps', 'r') as file:
            self._parse_lines(file, stats)

    def _parse_lines(self, lines, stats):
        # Parse an iterable of smaps lines, appending a VMA to self.vmas for
        # each header line and recording the requested stats against the most
        # recently seen VMA.
        for line in lines:
            # Limit the split so that a mapping name containing spaces (e.g.
            # "/path/file (deleted)") stays in one piece as the 6th element.
            elements = line.split(maxsplit=5)
            if not elements:
                continue
            if '-' in elements[0]:
                start, end = map(lambda x: int(x, 16), elements[0].split('-'))
                major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
                self.vmas.append(VMA(
                    name=elements[5].rstrip('\n') if len(elements) > 5 else '',
                    start=start,
                    end=end,
                    read=elements[1][0] == 'r',
                    write=elements[1][1] == 'w',
                    execute=elements[1][2] == 'x',
                    private=elements[1][3] == 'p',
                    pgoff=int(elements[2], 16),
                    major=major,
                    minor=minor,
                    # Parsed as hex and printed as hex by vma_print(), so the
                    # digits shown match those in smaps.
                    inode=int(elements[4], 16),
                    stats={},
                ))
            else:
                # Strip the trailing ':' from the field name.
                param = elements[0][:-1]
                if param in stats:
                    value = int(elements[1])
                    self.vmas[-1].stats[param] = {'type': None, 'value': value}

    def __iter__(self):
        yield from self.vmas


def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
    # Given 4 same-sized arrays representing a range within a page table backed
    # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
    # True if page is anonymous, heads: True if page is head of a THP), return a
    # dictionary of statistics describing the mapped THPs.
    #
    # ranges is the output of cont_ranges_all() for [indexes, vfns, pfns] and
    # indexes holds each page's position within the 4 arrays. kpageflags is used
    # to look up the flags of the page following each range. Values in the
    # returned dictionary are in kB.
    stats = {
        'file': {
            'partial': 0,
            'aligned': [0] * (PMD_ORDER + 1),
            'unaligned': [0] * (PMD_ORDER + 1),
        },
        'anon': {
            'partial': 0,
            'aligned': [0] * (PMD_ORDER + 1),
            'unaligned': [0] * (PMD_ORDER + 1),
        },
    }

    for rindex, rpfn in zip(ranges[0], ranges[2]):
        index_next = int(rindex[0])
        index_end = int(rindex[1]) + 1
        pfn_end = int(rpfn[1]) + 1

        folios = indexes[index_next:index_end][heads[index_next:index_end]]

        # Account pages for any partially mapped THP at the front. In that case,
        # the first page of the range is a tail.
        nr = (int(folios[0]) if len(folios) else index_end) - index_next
        stats['anon' if anons[index_next] else 'file']['partial'] += nr

        # Account pages for any partially mapped THP at the back. In that case,
        # the next page after the range is a tail.
        if len(folios):
            flags = int(kpageflags.get(pfn_end)[0])
            if flags & KPF_COMPOUND_TAIL:
                last_head = int(folios[-1])
                nr = index_end - last_head
                folios = folios[:-1]
                index_end = last_head
                # Classify using a page that belongs to the partial THP (its
                # head). The page before it belongs to a different folio, or
                # does not exist if the head is the first page in the arrays.
                stats['anon' if anons[last_head] else 'file']['partial'] += nr

        # Account fully mapped THPs in the middle of the range.
        if len(folios):
            folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
            folio_orders = np.log2(folio_nrs).astype(np.uint64)
            for index, order in zip(folios, folio_orders):
                index = int(index)
                order = int(order)
                nr = 1 << order
                vfn = int(vfns[index])
                align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
                anon = 'anon' if anons[index] else 'file'
                stats[anon][align][order] += nr

    # Account PMD-mapped THPs separately, so filter out of the stats. There is a
    # race between acquiring the smaps stats and reading pagemap, where memory
    # could be deallocated. So clamp to zero in case it would have gone negative.
    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
                      vma.stats['FilePmdMapped']['value']
    stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
    stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))

    rstats = {
        f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
        f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
    }

    # Flatten the per-order counters into the returned dictionary, converting
    # pages to kB. Orders 0 and 1 are not reported.
    for kind in ('anon', 'file'):
        for subtype in ('aligned', 'unaligned'):
            for od, nr in enumerate(stats[kind][subtype][2:], 2):
                rstats[f"{kind}-thp-pte-{subtype}-{odkb(od)}kB"] = {'type': kind, 'value': nrkb(nr)}
        rstats[f"{kind}-thp-pte-partial"] = {'type': kind, 'value': nrkb(stats[kind]['partial'])}

    return rstats


def cont_parse(vma, order, ranges, anons, heads):
    # Given the ranges within a page table that are backed by THPs and are
    # contiguous in both virtual and physical address space (as generated by
    # cont_ranges_all() for [indexes, vfns, pfns]) and 2 arrays describing each
    # page (anons: True if page is anonymous, heads: True if page is head of a
    # THP), return a dictionary of statistics describing the contiguous blocks
    # of the given order. Values in the returned dictionary are in kB.
    nr_cont = 1 << order
    nr_anon = 0
    nr_file = 0

    for rindex, rvfn, rpfn in zip(*ranges):
        index_next = int(rindex[0])
        index_end = int(rindex[1]) + 1
        vfn_start = int(rvfn[0])
        pfn_start = int(rpfn[0])

        # Virtual and physical addresses must share the same offset within a
        # block, else no block in this range can be aligned in both.
        if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
            continue

        # Skip forward to the first block boundary.
        off = align_forward(vfn_start, nr_cont) - vfn_start
        index_next += off

        while index_next + nr_cont <= index_end:
            # A head anywhere other than the first page means the block spans
            # more than one THP, so it is not counted.
            folio_boundary = heads[index_next+1:index_next+nr_cont].any()
            if not folio_boundary:
                if anons[index_next]:
                    nr_anon += nr_cont
                else:
                    nr_file += nr_cont
            index_next += nr_cont

    # Account blocks that are PMD-mapped separately, so filter out of the stats.
    # There is a race between acquiring the smaps stats and reading pagemap,
    # where memory could be deallocated. So clamp to zero in case it would have
    # gone negative.
    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
                      vma.stats['FilePmdMapped']['value']
    nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
    nr_file = max(0, nr_file - kbnr(file_pmd_mapped))

    rstats = {
        f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
        f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
    }

    rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
    rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}

    return rstats


def vma_print(vma, pid):
    # Prints a VMA instance in a format similar to smaps. The main difference is
    # that the pid is included as the first value.
    print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
        .format(
            pid, vma.start, vma.end,
            'r' if vma.read else '-', 'w' if vma.write else '-',
            'x' if vma.execute else '-', 'p' if vma.private else 's',
            vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
        ))


def stats_print(stats, tot_anon, tot_file, inc_empty):
    # Print a statistics dictionary. Each value is printed in kB and, where its
    # type is 'anon' or 'file' and the matching total is non-zero, also as a
    # percentage of that total. Zero values are skipped unless inc_empty is set.
    label_field = 32
    for label, stat in stats.items():
        kind = stat['type']
        value = stat['value']
        if value or inc_empty:
            pad = max(0, label_field - len(label) - 1)
            if kind == 'anon' and tot_anon > 0:
                percent = f' ({value / tot_anon:3.0%})'
            elif kind == 'file' and tot_file > 0:
                percent = f' ({value / tot_file:3.0%})'
            else:
                percent = ''
            print(f"{label}:{' ' * pad}{value:8} kB{percent}")


def vma_parse(vma, pagemap, kpageflags, contorders):
    # Generate thp and cont statistics for a single VMA. Returns a tuple of the
    # statistics dictionary, the VMA's total anonymous memory and its total
    # file-backed memory (both in kB, as reported by smaps).
    start = vma.start >> PAGE_SHIFT
    end = vma.end >> PAGE_SHIFT

    # Only consider pages that are present.
    pmes = pagemap.get(start, end - start)
    present = (pmes & PM_PAGE_PRESENT) != 0
    pfns = pmes & PM_PFN_MASK
    pfns = pfns[present]
    vfns = np.arange(start, end, dtype=np.uint64)
    vfns = vfns[present]

    # Read the flags in as few reads as possible by batching up physically
    # contiguous pages.
    pfn_vec = cont_ranges_all([pfns], [pfns])[0]
    flags = kpageflags.getv(pfn_vec)
    anons = (flags & KPF_ANON) != 0
    heads = (flags & KPF_COMPOUND_HEAD) != 0
    thps = (flags & KPF_THP) != 0

    # Only consider pages that belong to a THP.
    vfns = vfns[thps]
    pfns = pfns[thps]
    anons = anons[thps]
    heads = heads[thps]

    indexes = np.arange(len(vfns), dtype=np.uint64)
    ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])

    thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
    contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]

    tot_anon = vma.stats['Anonymous']['value']
    tot_file = vma.stats['Rss']['value'] - tot_anon

    return {
        **thpstats,
        **{k: v for s in contstats for k, v in s.items()}
    }, tot_anon, tot_file


def do_main(args):
    # Gather the set of pids selected by args, then print the statistics either
    # per-VMA or, with args.rollup, summed over every VMA of every pid.
    pids = set()
    rollup = {}
    rollup_anon = 0
    rollup_file = 0

    # When the user names the pids explicitly (strict), failing to read one of
    # them is an error. Otherwise processes may legitimately come and go while
    # scanning, so failures are ignored.
    if args.cgroup:
        strict = False
        for walk_info in os.walk(args.cgroup):
            cgroup = walk_info[0]
            with open(f'{cgroup}/cgroup.procs') as pidfile:
                for line in pidfile:
                    if line.strip():
                        pids.add(int(line.strip()))
    elif args.pid:
        strict = True
        pids.update(args.pid)
    else:
        strict = False
        for pid in os.listdir('/proc'):
            if pid.isdigit():
                pids.add(int(pid))

    if not args.rollup:
        # Column titles are right-aligned to the field widths used by
        # vma_print().
        print("       PID             START              END PROT   OFFSET   DEV    INODE OBJECT")

    for pid in pids:
        try:
            with PageMap(pid) as pagemap:
                with KPageFlags() as kpageflags:
                    for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
                        if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
                            stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
                        else:
                            stats = {}
                            vma_anon = 0
                            vma_file = 0
                        if args.inc_smaps:
                            stats = {**vma.stats, **stats}
                        if args.rollup:
                            for k, v in stats.items():
                                if k in rollup:
                                    assert rollup[k]['type'] == v['type']
                                    rollup[k]['value'] += v['value']
                                else:
                                    # Copy so that accumulating into the rollup
                                    # does not modify the VMA's own stats.
                                    rollup[k] = dict(v)
                            rollup_anon += vma_anon
                            rollup_file += vma_file
                        else:
                            vma_print(vma, pid)
                            stats_print(stats, vma_anon, vma_file, args.inc_empty)
        except (FileNotFoundError, ProcessLookupError, FileIOException):
            if strict:
                raise

    if args.rollup:
        stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)


def main():
    # Parse the command line and run the scan, once or periodically.
    docs_width = shutil.get_terminal_size().columns
    docs_width -= 2
    docs_width = min(80, docs_width)

    def format_docs(string):
        # Collapse all whitespace, then treat each literal "\n" marker in the
        # text as a paragraph break and wrap each paragraph to docs_width.
        text = re.sub(r'\s+', ' ', string)
        text = re.sub(r'\s*\\n\s*', '\n', text)
        paras = text.split('\n')
        paras = [textwrap.fill(p, width=docs_width) for p in paras]
        return '\n'.join(paras)

    def formatter(prog):
        return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)

    parser = argparse.ArgumentParser(formatter_class=formatter,
        description=format_docs("""Prints information about how
                transparent huge pages are mapped, either system-wide, or for a
                specified process or cgroup.\\n
                \\n
                When run with --pid, the user explicitly specifies the set of
                pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run with
                --cgroup, the user passes either a v1 or v2 cgroup and all pids
                that belong to the cgroup subtree are scanned. When run with
                neither --pid nor --cgroup, the full set of pids on the system
                is gathered from /proc and scanned as if the user had provided
                "--pid 1 --pid 2 ...".\\n
                \\n
                A default set of statistics is always generated for THP
                mappings. However, it is also possible to generate additional
                statistics for "contiguous block mappings" where the block size
                is user-defined.\\n
                \\n
                Statistics are maintained independently for anonymous and
                file-backed (pagecache) memory and are shown both in kB and as
                a percentage of either total anonymous or total file-backed
                memory as appropriate.\\n
                \\n
                THP Statistics\\n
                --------------\\n
                \\n
                Statistics are always generated for fully- and
                contiguously-mapped THPs whose mapping address is aligned to
                their size, for each <size> supported by the system. Separate
                counters describe THPs mapped by PTE vs those mapped by PMD.
                (Although note a THP can only be mapped by PMD if it is
                PMD-sized):\\n
                \\n
                - anon-thp-pte-aligned-<size>kB\\n
                - file-thp-pte-aligned-<size>kB\\n
                - anon-thp-pmd-aligned-<size>kB\\n
                - file-thp-pmd-aligned-<size>kB\\n
                \\n
                Similarly, statistics are always generated for fully- and
                contiguously-mapped THPs whose mapping address is *not* aligned
                to their size, for each <size> supported by the system. Due to
                the unaligned mapping, it is impossible to map by PMD, so there
                are only PTE counters for this case:\\n
                \\n
                - anon-thp-pte-unaligned-<size>kB\\n
                - file-thp-pte-unaligned-<size>kB\\n
                \\n
                Statistics are also always generated for mapped pages that
                belong to a THP but where the is THP is *not* fully- and
                contiguously- mapped. These "partial" mappings are all counted
                in the same counter regardless of the size of the THP that is
                partially mapped:\\n
                \\n
                - anon-thp-pte-partial\\n
                - file-thp-pte-partial\\n
                \\n
                Contiguous Block Statistics\\n
                ---------------------------\\n
                \\n
                An optional, additional set of statistics is generated for
                every contiguous block size specified with `--cont <size>`.
                These statistics show how much memory is mapped in contiguous
                blocks of <size> and also aligned to <size>. A given contiguous
                block must all belong to the same THP, but there is no
                requirement for it to be the *whole* THP. Separate counters
                describe contiguous blocks mapped by PTE vs those mapped by
                PMD:\\n
                \\n
                - anon-cont-pte-aligned-<size>kB\\n
                - file-cont-pte-aligned-<size>kB\\n
                - anon-cont-pmd-aligned-<size>kB\\n
                - file-cont-pmd-aligned-<size>kB\\n
                \\n
                As an example, if monitoring 64K contiguous blocks (--cont
                64K), there are a number of sources that could provide such
                blocks: a fully- and contiguously-mapped 64K THP that is
                aligned to a 64K boundary would provide 1 block. A fully- and
                contiguously-mapped 128K THP that is aligned to at least a 64K
                boundary would provide 2 blocks. Or a 128K THP that maps its
                first 100K, but contiguously and starting at a 64K boundary
                would provide 1 block. A fully- and contiguously-mapped 2M THP
                would provide 32 blocks. There are many other possible
                permutations.\\n"""),
        epilog=format_docs("""Requires root privilege to access pagemap and
                kpageflags."""))

    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--pid',
        metavar='pid', required=False, type=int, default=[], action='append',
        help="""Process id of the target process. Maybe issued multiple times to
            scan multiple processes. --pid and --cgroup are mutually exclusive.
            If neither are provided, all processes are scanned to provide
            system-wide information.""")

    group.add_argument('--cgroup',
        metavar='path', required=False,
        help="""Path to the target cgroup in sysfs. Iterates over every pid in
            the cgroup and its children. --pid and --cgroup are mutually
            exclusive. If neither are provided, all processes are scanned to
            provide system-wide information.""")

    parser.add_argument('--rollup',
        required=False, default=False, action='store_true',
        help="""Sum the per-vma statistics to provide a summary over the whole
            system, process or cgroup.""")

    parser.add_argument('--cont',
        metavar='size[KMG]', required=False, default=[], action='append',
        help="""Adds stats for memory that is mapped in contiguous blocks of
            <size> and also aligned to <size>. May be issued multiple times to
            track multiple sized blocks. Useful to infer e.g. arm64 contpte and
            hpa mappings. Size must be a power-of-2 number of pages.""")

    parser.add_argument('--inc-smaps',
        required=False, default=False, action='store_true',
        help="""Include all numerical, additive /proc/<pid>/smaps stats in the
            output.""")

    parser.add_argument('--inc-empty',
        required=False, default=False, action='store_true',
        help="""Show all statistics including those whose value is 0.""")

    parser.add_argument('--periodic',
        metavar='sleep_ms', required=False, type=int,
        help="""Run in a loop, polling every sleep_ms milliseconds.""")

    args = parser.parse_args()

    # Nothing can be scanned without knowing the PMD size.
    if PMD_ORDER is None:
        raise FileIOException(f'error: failed to read {PMD_SIZE_PATH}; '
                              'is transparent hugepage support enabled?')

    try:
        args.cont = [size2order(cont) for cont in args.cont]
    except ArgException:
        parser.print_usage()
        raise

    if args.periodic:
        while True:
            do_main(args)
            print()
            time.sleep(args.periodic / 1000)
    else:
        do_main(args)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        prog = os.path.basename(sys.argv[0])
        print(f'{prog}: {e}')
        sys.exit(1)