aboutsummaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/alloc_tag.h26
-rw-r--r--include/linux/bcma/bcma_driver_pci.h2
-rw-r--r--include/linux/bpf.h28
-rw-r--r--include/linux/bpf_lsm.h8
-rw-r--r--include/linux/bpf_verifier.h27
-rw-r--r--include/linux/btf.h5
-rw-r--r--include/linux/buildid.h4
-rw-r--r--include/linux/cgroup-defs.h11
-rw-r--r--include/linux/cgroup.h7
-rw-r--r--include/linux/cleanup.h2
-rw-r--r--include/linux/clk-provider.h14
-rw-r--r--include/linux/clk.h33
-rw-r--r--include/linux/cma.h16
-rw-r--r--include/linux/damon.h3
-rw-r--r--include/linux/decompress/unxz.h5
-rw-r--r--include/linux/dma/ipu-dma.h174
-rw-r--r--include/linux/dma/k3-udma-glue.h2
-rw-r--r--include/linux/err.h9
-rw-r--r--include/linux/fault-inject.h36
-rw-r--r--include/linux/file.h53
-rw-r--r--include/linux/filter.h10
-rw-r--r--include/linux/firewire.h22
-rw-r--r--include/linux/fs.h10
-rw-r--r--include/linux/generic-radix-tree.h105
-rw-r--r--include/linux/gfp.h25
-rw-r--r--include/linux/gfp_types.h8
-rw-r--r--include/linux/huge_mm.h158
-rw-r--r--include/linux/hugetlb.h17
-rw-r--r--include/linux/i2c.h3
-rw-r--r--include/linux/iomap.h13
-rw-r--r--include/linux/jbd2.h4
-rw-r--r--include/linux/kernel-page-flags.h3
-rw-r--r--include/linux/kfence.h2
-rw-r--r--include/linux/khugepaged.h1
-rw-r--r--include/linux/leds.h2
-rw-r--r--include/linux/lockd/lockd.h2
-rw-r--r--include/linux/lru_cache.h4
-rw-r--r--include/linux/maple_tree.h20
-rw-r--r--include/linux/memcontrol.h67
-rw-r--r--include/linux/memory_hotplug.h48
-rw-r--r--include/linux/mfd/88pm80x.h2
-rw-r--r--include/linux/mfd/ds1wm.h29
-rw-r--r--include/linux/migrate.h3
-rw-r--r--include/linux/mm.h304
-rw-r--r--include/linux/mm_types.h22
-rw-r--r--include/linux/mm_types_task.h3
-rw-r--r--include/linux/mmc/host.h1
-rw-r--r--include/linux/mmzone.h35
-rw-r--r--include/linux/msi.h2
-rw-r--r--include/linux/nfs4.h17
-rw-r--r--include/linux/numa.h8
-rw-r--r--include/linux/numa_memblks.h58
-rw-r--r--include/linux/page-flags.h202
-rw-r--r--include/linux/page_counter.h27
-rw-r--r--include/linux/pagemap.h124
-rw-r--r--include/linux/pagewalk.h58
-rw-r--r--include/linux/pci-epc.h3
-rw-r--r--include/linux/pci.h11
-rw-r--r--include/linux/pci_ids.h5
-rw-r--r--include/linux/percpu.h1
-rw-r--r--include/linux/pgalloc_tag.h31
-rw-r--r--include/linux/pgtable.h18
-rw-r--r--include/linux/pinctrl/pinconf-generic.h3
-rw-r--r--include/linux/platform_data/amd_qdma.h36
-rw-r--r--include/linux/quota.h2
-rw-r--r--include/linux/ratelimit_types.h2
-rw-r--r--include/linux/ring_buffer.h20
-rw-r--r--include/linux/rmap.h11
-rw-r--r--include/linux/sched.h5
-rw-r--r--include/linux/sched/ext.h215
-rw-r--r--include/linux/sched/mm.h27
-rw-r--r--include/linux/sched/task.h8
-rw-r--r--include/linux/sched/task_stack.h18
-rw-r--r--include/linux/seqlock.h25
-rw-r--r--include/linux/set_memory.h8
-rw-r--r--include/linux/shmem_fs.h15
-rw-r--r--include/linux/slab.h14
-rw-r--r--include/linux/soundwire/sdw.h2
-rw-r--r--include/linux/sunrpc/svc.h44
-rw-r--r--include/linux/sunrpc/svc_rdma.h2
-rw-r--r--include/linux/sunrpc/svcauth.h1
-rw-r--r--include/linux/sunrpc/svcsock.h2
-rw-r--r--include/linux/sunrpc/xdrgen/_builtins.h243
-rw-r--r--include/linux/sunrpc/xdrgen/_defs.h26
-rw-r--r--include/linux/swap.h44
-rw-r--r--include/linux/uaccess.h7
-rw-r--r--include/linux/userfaultfd_k.h19
-rw-r--r--include/linux/vm_event_item.h26
-rw-r--r--include/linux/vmalloc.h4
-rw-r--r--include/linux/vmstat.h1
-rw-r--r--include/linux/writeback.h3
-rw-r--r--include/linux/xz.h81
-rw-r--r--include/linux/zstd.h167
-rw-r--r--include/linux/zswap.h16
94 files changed, 2132 insertions, 918 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8c61ccd161ba..1f0a9ff23a2c 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -70,7 +70,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
/*
* When percpu variables are required to be defined as weak, static percpu
* variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
- * Instead we will accound all module allocations to a single counter.
+ * Instead we will account all module allocations to a single counter.
*/
DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
@@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
/* Caller should verify both ref and tag to be valid */
static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
{
+ alloc_tag_add_check(ref, tag);
+ if (!ref || !tag)
+ return;
+
ref->ct = &tag->ct;
+}
+
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+ __alloc_tag_ref_set(ref, tag);
/*
* We need in increment the call counter every time we have a new
* allocation or when we split a large allocation into smaller ones.
@@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
this_cpu_inc(tag->counters->calls);
}
-static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
-{
- alloc_tag_add_check(ref, tag);
- if (!ref || !tag)
- return;
-
- __alloc_tag_ref_set(ref, tag);
-}
-
static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
{
- alloc_tag_add_check(ref, tag);
- if (!ref || !tag)
- return;
-
- __alloc_tag_ref_set(ref, tag);
+ alloc_tag_ref_set(ref, tag);
this_cpu_add(tag->counters->bytes, bytes);
}
diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h
index 68da8dba5162..dba41b65ae0d 100644
--- a/include/linux/bcma/bcma_driver_pci.h
+++ b/include/linux/bcma/bcma_driver_pci.h
@@ -203,7 +203,7 @@ struct pci_dev;
#define BCMA_CORE_PCI_MDIO_RXCTRL0 0x840
/* PCIE Root Capability Register bits (Host mode only) */
-#define BCMA_CORE_PCI_RC_CRS_VISIBILITY 0x0001
+#define BCMA_CORE_PCI_RC_RRS_VISIBILITY 0x0001
struct bcma_drv_pci;
struct bcma_bus;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3b94ec161e8c..0c3893c47171 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -294,6 +294,7 @@ struct bpf_map {
* same prog type, JITed flag and xdp_has_frags flag.
*/
struct {
+ const struct btf_type *attach_func_proto;
spinlock_t lock;
enum bpf_prog_type type;
bool jited;
@@ -694,6 +695,11 @@ enum bpf_type_flag {
/* DYNPTR points to xdp_buff */
DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+ /* Memory must be aligned on some architectures, used in combination with
+ * MEM_FIXED_SIZE.
+ */
+ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
@@ -731,8 +737,6 @@ enum bpf_arg_type {
ARG_ANYTHING, /* any (initialized) argument is ok */
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
- ARG_PTR_TO_INT, /* pointer to int */
- ARG_PTR_TO_LONG, /* pointer to long */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */
@@ -743,7 +747,7 @@ enum bpf_arg_type {
ARG_PTR_TO_STACK, /* pointer to stack */
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
- ARG_PTR_TO_KPTR, /* pointer to referenced kptr */
+ ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */
ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
__BPF_ARG_TYPE_MAX,
@@ -807,6 +811,12 @@ struct bpf_func_proto {
bool gpl_only;
bool pkt_access;
bool might_sleep;
+ /* set to true if helper follows contract for llvm
+ * attribute bpf_fastcall:
+ * - void functions do not scratch r0
+ * - functions taking N arguments scratch only registers r1-rN
+ */
+ bool allow_fastcall;
enum bpf_return_type ret_type;
union {
struct {
@@ -919,6 +929,7 @@ static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
*/
struct bpf_insn_access_aux {
enum bpf_reg_type reg_type;
+ bool is_ldsx;
union {
int ctx_field_size;
struct {
@@ -927,6 +938,7 @@ struct bpf_insn_access_aux {
};
};
struct bpf_verifier_log *log; /* for verbose logs */
+ bool is_retval; /* is accessing function return value ? */
};
static inline void
@@ -965,6 +977,8 @@ struct bpf_verifier_ops {
struct bpf_insn_access_aux *info);
int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
const struct bpf_prog *prog);
+ int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog,
+ s16 ctx_stack_off);
int (*gen_ld_abs)(const struct bpf_insn *orig,
struct bpf_insn *insn_buf);
u32 (*convert_ctx_access)(enum bpf_access_type type,
@@ -1795,6 +1809,7 @@ struct bpf_struct_ops_common_value {
#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
bool bpf_struct_ops_get(const void *kdata);
void bpf_struct_ops_put(const void *kdata);
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff);
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
void *value);
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
@@ -1851,6 +1866,10 @@ static inline void bpf_module_put(const void *data, struct module *owner)
{
module_put(owner);
}
+static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ return -ENOTSUPP;
+}
static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
void *key,
void *value)
@@ -3184,7 +3203,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
@@ -3192,6 +3213,7 @@ extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto;
+extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto;
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 1de7ece5d36d..aefcd6564251 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/lsm_hooks.h>
#ifdef CONFIG_BPF_LSM
@@ -45,6 +46,8 @@ void bpf_inode_storage_free(struct inode *inode);
void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *range);
#else /* !CONFIG_BPF_LSM */
static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
@@ -78,6 +81,11 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
{
}
+static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_BPF_LSM */
#endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7b776dae36e5..4513372c5bc8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -23,6 +23,8 @@
* (in the "-8,-16,...,-512" form)
*/
#define TMP_STR_BUF_LEN 320
+/* Patch buffer size */
+#define INSN_BUF_SIZE 32
/* Liveness marks, used for registers and spilled-regs (in stack slots).
* Read marks propagate upwards until they find a write mark; they record that
@@ -371,6 +373,10 @@ struct bpf_jmp_history_entry {
u32 prev_idx : 22;
/* special flags, e.g., whether insn is doing register stack spill/load */
u32 flags : 10;
+ /* additional registers that need precision tracking when this
+ * jump is backtracked, vector of six 10-bit records
+ */
+ u64 linked_regs;
};
/* Maximum number of register states that can exist at once */
@@ -572,6 +578,14 @@ struct bpf_insn_aux_data {
bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
u8 alu_state; /* used in combination with alu_limit */
+ /* true if STX or LDX instruction is a part of a spill/fill
+ * pattern for a bpf_fastcall call.
+ */
+ u8 fastcall_pattern:1;
+ /* for CALL instructions, a number of spill/fill pairs in the
+ * bpf_fastcall pattern.
+ */
+ u8 fastcall_spills_num:3;
/* below fields are initialized once */
unsigned int orig_idx; /* original instruction index */
@@ -641,6 +655,10 @@ struct bpf_subprog_info {
u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
u16 stack_depth; /* max. stack depth used by this function */
u16 stack_extra;
+ /* offsets in range [stack_depth .. fastcall_stack_off)
+ * are used for bpf_fastcall spills and fills.
+ */
+ s16 fastcall_stack_off;
bool has_tail_call: 1;
bool tail_call_reachable: 1;
bool has_ld_abs: 1;
@@ -648,6 +666,8 @@ struct bpf_subprog_info {
bool is_async_cb: 1;
bool is_exception_cb: 1;
bool args_cached: 1;
+ /* true if bpf_fastcall stack region is used by functions that can't be inlined */
+ bool keep_fastcall_stack: 1;
u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
@@ -762,6 +782,8 @@ struct bpf_verifier_env {
* e.g., in reg_type_str() to generate reg_type string
*/
char tmp_str_buf[TMP_STR_BUF_LEN];
+ struct bpf_insn insn_buf[INSN_BUF_SIZE];
+ struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
};
static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
@@ -905,6 +927,11 @@ static inline bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static inline bool type_may_be_null(u32 type)
+{
+ return type & PTR_MAYBE_NULL;
+}
+
static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
env->scratched_regs |= 1U << regno;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index cffb43133c68..b8a583194c4a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -580,6 +580,7 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type);
bool btf_types_are_same(const struct btf *btf1, u32 id1,
const struct btf *btf2, u32 id2);
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx);
#else
static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
u32 type_id)
@@ -654,6 +655,10 @@ static inline bool btf_types_are_same(const struct btf *btf1, u32 id1,
{
return false;
}
+static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 20aa3c2d89f7..014a88c41073 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -7,8 +7,8 @@
#define BUILD_ID_SIZE_MAX 20
struct vm_area_struct;
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
- __u32 *size);
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index eb0f6f349496..47ae4c4d924c 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -172,7 +172,11 @@ struct cgroup_subsys_state {
/* reference count - access via css_[try]get() and css_put() */
struct percpu_ref refcnt;
- /* siblings list anchored at the parent's ->children */
+ /*
+ * siblings list anchored at the parent's ->children
+ *
+ * linkage is protected by cgroup_mutex or RCU
+ */
struct list_head sibling;
struct list_head children;
@@ -789,6 +793,11 @@ struct cgroup_subsys {
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+struct cgroup_of_peak {
+ unsigned long value;
+ struct list_head list;
+};
+
/**
* cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
* @tsk: target task
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c60ba0ab1462..f8ef47f8a634 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/nodemask.h>
+#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
@@ -28,8 +29,6 @@
struct kernel_clone_args;
-#ifdef CONFIG_CGROUPS
-
/*
* All weight knobs on the default hierarchy should use the following min,
* default and max values. The default value is the logarithmic center of
@@ -39,6 +38,8 @@ struct kernel_clone_args;
#define CGROUP_WEIGHT_DFL 100
#define CGROUP_WEIGHT_MAX 10000
+#ifdef CONFIG_CGROUPS
+
enum {
CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
@@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);
+
#endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index d9e613803df1..a3d3e888cf1f 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -98,7 +98,7 @@ const volatile void * __must_check_fn(const volatile void *val)
* DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
*
* CLASS(fdget, f)(fd);
- * if (!f.file)
+ * if (!fd_file(f))
* return -EBADF;
*
* // use 'f' without concern
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 4a537260f655..7e43caabb54b 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -394,6 +394,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
__clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \
NULL, (flags), (fixed_rate), 0, 0, true)
/**
+ * devm_clk_hw_register_fixed_rate_parent_data - register fixed-rate clock with
+ * the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_data: parent clk data
+ * @flags: framework-specific flags
+ * @fixed_rate: non-adjustable clock rate
+ */
+#define devm_clk_hw_register_fixed_rate_parent_data(dev, name, parent_data, flags, \
+ fixed_rate) \
+ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \
+ (parent_data), (flags), (fixed_rate), 0, \
+ 0, true)
+/**
* clk_hw_register_fixed_rate_parent_hw - register fixed-rate clock with
* the clock framework
* @dev: device that is registering this clock
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 0fa56d672532..851a0f2cf42c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -641,6 +641,32 @@ struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id);
struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
/**
+ * devm_clk_get_optional_enabled_with_rate - devm_clk_get_optional() +
+ * clk_set_rate() +
+ * clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ * @rate: new clock rate
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno. The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer. If no such clk is found, it returns NULL
+ * which serves as a dummy clk. That's the only difference compared
+ * to devm_clk_get_enabled().
+ *
+ * The returned clk (if valid) is prepared and enabled and rate was set.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_enabled_with_rate(struct device *dev,
+ const char *id,
+ unsigned long rate);
+
+/**
* devm_get_clk_from_child - lookup and obtain a managed reference to a
* clock producer from child node.
* @dev: device for clock "consumer"
@@ -982,6 +1008,13 @@ static inline struct clk *devm_clk_get_optional_enabled(struct device *dev,
return NULL;
}
+static inline struct clk *
+devm_clk_get_optional_enabled_with_rate(struct device *dev, const char *id,
+ unsigned long rate)
+{
+ return NULL;
+}
+
static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
struct clk_bulk_data *clks)
{
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 9db877506ea8..d15b64f51336 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -52,4 +52,20 @@ extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
extern void cma_reserve_pages_on_error(struct cma *cma);
+
+#ifdef CONFIG_CMA
+struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp);
+bool cma_free_folio(struct cma *cma, const struct folio *folio);
+#else
+static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
+{
+ return NULL;
+}
+
+static inline bool cma_free_folio(struct cma *cma, const struct folio *folio)
+{
+ return false;
+}
+#endif
+
#endif
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 27c546bfc6d4..a67f2c4940e9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -233,7 +233,6 @@ struct damos_quota {
unsigned long charge_addr_from;
/* For prioritization */
- unsigned long histogram[DAMOS_MAX_SCORE + 1];
unsigned int min_score;
/* For feedback loop */
@@ -630,6 +629,8 @@ struct damon_ctx {
unsigned long next_ops_update_sis;
/* for waiting until the execution of the kdamond_fn is started */
struct completion kdamond_started;
+ /* for scheme quotas prioritization */
+ unsigned long *regions_score_histogram;
/* public: */
struct task_struct *kdamond;
diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h
index f764e2a7201e..3dd2658a9dab 100644
--- a/include/linux/decompress/unxz.h
+++ b/include/linux/decompress/unxz.h
@@ -1,10 +1,9 @@
+/* SPDX-License-Identifier: 0BSD */
+
/*
* Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
- *
- * This file has been put into the public domain.
- * You can do whatever you want with this file.
*/
#ifndef DECOMPRESS_UNXZ_H
diff --git a/include/linux/dma/ipu-dma.h b/include/linux/dma/ipu-dma.h
deleted file mode 100644
index 6969391580d2..000000000000
--- a/include/linux/dma/ipu-dma.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2008
- * Guennadi Liakhovetski, DENX Software Engineering, <lg@denx.de>
- *
- * Copyright (C) 2005-2007 Freescale Semiconductor, Inc.
- */
-
-#ifndef __LINUX_DMA_IPU_DMA_H
-#define __LINUX_DMA_IPU_DMA_H
-
-#include <linux/types.h>
-#include <linux/dmaengine.h>
-
-/* IPU DMA Controller channel definitions. */
-enum ipu_channel {
- IDMAC_IC_0 = 0, /* IC (encoding task) to memory */
- IDMAC_IC_1 = 1, /* IC (viewfinder task) to memory */
- IDMAC_ADC_0 = 1,
- IDMAC_IC_2 = 2,
- IDMAC_ADC_1 = 2,
- IDMAC_IC_3 = 3,
- IDMAC_IC_4 = 4,
- IDMAC_IC_5 = 5,
- IDMAC_IC_6 = 6,
- IDMAC_IC_7 = 7, /* IC (sensor data) to memory */
- IDMAC_IC_8 = 8,
- IDMAC_IC_9 = 9,
- IDMAC_IC_10 = 10,
- IDMAC_IC_11 = 11,
- IDMAC_IC_12 = 12,
- IDMAC_IC_13 = 13,
- IDMAC_SDC_0 = 14, /* Background synchronous display data */
- IDMAC_SDC_1 = 15, /* Foreground data (overlay) */
- IDMAC_SDC_2 = 16,
- IDMAC_SDC_3 = 17,
- IDMAC_ADC_2 = 18,
- IDMAC_ADC_3 = 19,
- IDMAC_ADC_4 = 20,
- IDMAC_ADC_5 = 21,
- IDMAC_ADC_6 = 22,
- IDMAC_ADC_7 = 23,
- IDMAC_PF_0 = 24,
- IDMAC_PF_1 = 25,
- IDMAC_PF_2 = 26,
- IDMAC_PF_3 = 27,
- IDMAC_PF_4 = 28,
- IDMAC_PF_5 = 29,
- IDMAC_PF_6 = 30,
- IDMAC_PF_7 = 31,
-};
-
-/* Order significant! */
-enum ipu_channel_status {
- IPU_CHANNEL_FREE,
- IPU_CHANNEL_INITIALIZED,
- IPU_CHANNEL_READY,
- IPU_CHANNEL_ENABLED,
-};
-
-#define IPU_CHANNELS_NUM 32
-
-enum pixel_fmt {
- /* 1 byte */
- IPU_PIX_FMT_GENERIC,
- IPU_PIX_FMT_RGB332,
- IPU_PIX_FMT_YUV420P,
- IPU_PIX_FMT_YUV422P,
- IPU_PIX_FMT_YUV420P2,
- IPU_PIX_FMT_YVU422P,
- /* 2 bytes */
- IPU_PIX_FMT_RGB565,
- IPU_PIX_FMT_RGB666,
- IPU_PIX_FMT_BGR666,
- IPU_PIX_FMT_YUYV,
- IPU_PIX_FMT_UYVY,
- /* 3 bytes */
- IPU_PIX_FMT_RGB24,
- IPU_PIX_FMT_BGR24,
- /* 4 bytes */
- IPU_PIX_FMT_GENERIC_32,
- IPU_PIX_FMT_RGB32,
- IPU_PIX_FMT_BGR32,
- IPU_PIX_FMT_ABGR32,
- IPU_PIX_FMT_BGRA32,
- IPU_PIX_FMT_RGBA32,
-};
-
-enum ipu_color_space {
- IPU_COLORSPACE_RGB,
- IPU_COLORSPACE_YCBCR,
- IPU_COLORSPACE_YUV
-};
-
-/*
- * Enumeration of IPU rotation modes
- */
-enum ipu_rotate_mode {
- /* Note the enum values correspond to BAM value */
- IPU_ROTATE_NONE = 0,
- IPU_ROTATE_VERT_FLIP = 1,
- IPU_ROTATE_HORIZ_FLIP = 2,
- IPU_ROTATE_180 = 3,
- IPU_ROTATE_90_RIGHT = 4,
- IPU_ROTATE_90_RIGHT_VFLIP = 5,
- IPU_ROTATE_90_RIGHT_HFLIP = 6,
- IPU_ROTATE_90_LEFT = 7,
-};
-
-/*
- * Enumeration of DI ports for ADC.
- */
-enum display_port {
- DISP0,
- DISP1,
- DISP2,
- DISP3
-};
-
-struct idmac_video_param {
- unsigned short in_width;
- unsigned short in_height;
- uint32_t in_pixel_fmt;
- unsigned short out_width;
- unsigned short out_height;
- uint32_t out_pixel_fmt;
- unsigned short out_stride;
- bool graphics_combine_en;
- bool global_alpha_en;
- bool key_color_en;
- enum display_port disp;
- unsigned short out_left;
- unsigned short out_top;
-};
-
-/*
- * Union of initialization parameters for a logical channel. So far only video
- * parameters are used.
- */
-union ipu_channel_param {
- struct idmac_video_param video;
-};
-
-struct idmac_tx_desc {
- struct dma_async_tx_descriptor txd;
- struct scatterlist *sg; /* scatterlist for this */
- unsigned int sg_len; /* tx-descriptor. */
- struct list_head list;
-};
-
-struct idmac_channel {
- struct dma_chan dma_chan;
- dma_cookie_t completed; /* last completed cookie */
- union ipu_channel_param params;
- enum ipu_channel link; /* input channel, linked to the output */
- enum ipu_channel_status status;
- void *client; /* Only one client per channel */
- unsigned int n_tx_desc;
- struct idmac_tx_desc *desc; /* allocated tx-descriptors */
- struct scatterlist *sg[2]; /* scatterlist elements in buffer-0 and -1 */
- struct list_head free_list; /* free tx-descriptors */
- struct list_head queue; /* queued tx-descriptors */
- spinlock_t lock; /* protects sg[0,1], queue */
- struct mutex chan_mutex; /* protects status, cookie, free_list */
- bool sec_chan_en;
- int active_buffer;
- unsigned int eof_irq;
- char eof_name[16]; /* EOF IRQ name for request_irq() */
-};
-
-#define to_tx_desc(tx) container_of(tx, struct idmac_tx_desc, txd)
-#define to_idmac_chan(c) container_of(c, struct idmac_channel, dma_chan)
-
-#endif /* __LINUX_DMA_IPU_DMA_H */
diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h
index 1e491c5dcac2..2dea217629d0 100644
--- a/include/linux/dma/k3-udma-glue.h
+++ b/include/linux/dma/k3-udma-glue.h
@@ -136,8 +136,6 @@ u32 k3_udma_glue_rx_flow_get_fdq_id(struct k3_udma_glue_rx_channel *rx_chn,
u32 k3_udma_glue_rx_get_flow_id_base(struct k3_udma_glue_rx_channel *rx_chn);
int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn,
u32 flow_num);
-void k3_udma_glue_rx_put_irq(struct k3_udma_glue_rx_channel *rx_chn,
- u32 flow_num);
void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn,
u32 flow_num, void *data,
void (*cleanup)(void *data, dma_addr_t desc_dma),
diff --git a/include/linux/err.h b/include/linux/err.h
index b5d9bb2a2349..a4dacd745fcf 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -41,6 +41,9 @@ static inline void * __must_check ERR_PTR(long error)
return (void *) error;
}
+/* Return the pointer in the percpu address space. */
+#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))
+
/**
* PTR_ERR - Extract the error code from an error pointer.
* @ptr: An error pointer.
@@ -51,6 +54,9 @@ static inline long __must_check PTR_ERR(__force const void *ptr)
return (long) ptr;
}
+/* Read an error pointer from the percpu address space. */
+#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr)))
+
/**
* IS_ERR - Detect an error pointer.
* @ptr: The pointer to check.
@@ -61,6 +67,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
return IS_ERR_VALUE((unsigned long)ptr);
}
+/* Read an error pointer from the percpu address space. */
+#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr)))
+
/**
* IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
* @ptr: The pointer to check.
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 354413950d34..8c829d28dcf3 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -2,13 +2,17 @@
#ifndef _LINUX_FAULT_INJECT_H
#define _LINUX_FAULT_INJECT_H
+#include <linux/err.h>
+#include <linux/types.h>
+
+struct dentry;
+struct kmem_cache;
+
#ifdef CONFIG_FAULT_INJECTION
-#include <linux/types.h>
-#include <linux/debugfs.h>
+#include <linux/atomic.h>
#include <linux/configfs.h>
#include <linux/ratelimit.h>
-#include <linux/atomic.h>
/*
* For explanation of the elements of this struct, see
@@ -51,6 +55,28 @@ int setup_fault_attr(struct fault_attr *attr, char *str);
bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags);
bool should_fail(struct fault_attr *attr, ssize_t size);
+#else /* CONFIG_FAULT_INJECTION */
+
+struct fault_attr {
+};
+
+#define DECLARE_FAULT_ATTR(name) struct fault_attr name = {}
+
+static inline int setup_fault_attr(struct fault_attr *attr, char *str)
+{
+ return 0; /* Note: 0 means error for __setup() handlers! */
+}
+static inline bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags)
+{
+ return false;
+}
+static inline bool should_fail(struct fault_attr *attr, ssize_t size)
+{
+ return false;
+}
+
+#endif /* CONFIG_FAULT_INJECTION */
+
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct dentry *fault_create_debugfs_attr(const char *name,
@@ -87,10 +113,6 @@ static inline void fault_config_init(struct fault_config *config,
#endif /* CONFIG_FAULT_INJECTION_CONFIGFS */
-#endif /* CONFIG_FAULT_INJECTION */
-
-struct kmem_cache;
-
#ifdef CONFIG_FAIL_PAGE_ALLOC
bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order);
#else
diff --git a/include/linux/file.h b/include/linux/file.h
index 6bd9cd9c87e5..f98de143245a 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,51 +36,52 @@ static inline void fput_light(struct file *file, int fput_needed)
fput(file);
}
+/* either a reference to struct file + flags
+ * (cloned vs. borrowed, pos locked), with
+ * flags stored in lower bits of value,
+ * or empty (represented by 0).
+ */
struct fd {
- struct file *file;
- unsigned int flags;
+ unsigned long word;
};
#define FDPUT_FPUT 1
#define FDPUT_POS_UNLOCK 2
-static inline void fdput(struct fd fd)
+#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
+static inline bool fd_empty(struct fd f)
{
- if (fd.flags & FDPUT_FPUT)
- fput(fd.file);
+ return unlikely(!f.word);
}
-extern struct file *fget(unsigned int fd);
-extern struct file *fget_raw(unsigned int fd);
-extern struct file *fget_task(struct task_struct *task, unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
-extern void __f_unlock_pos(struct file *);
-
-static inline struct fd __to_fd(unsigned long v)
+#define EMPTY_FD (struct fd){0}
+static inline struct fd BORROWED_FD(struct file *f)
{
- return (struct fd){(struct file *)(v & ~3),v & 3};
+ return (struct fd){(unsigned long)f};
}
-
-static inline struct fd fdget(unsigned int fd)
+static inline struct fd CLONED_FD(struct file *f)
{
- return __to_fd(__fdget(fd));
+ return (struct fd){(unsigned long)f | FDPUT_FPUT};
}
-static inline struct fd fdget_raw(unsigned int fd)
+static inline void fdput(struct fd fd)
{
- return __to_fd(__fdget_raw(fd));
+ if (fd.word & FDPUT_FPUT)
+ fput(fd_file(fd));
}
-static inline struct fd fdget_pos(int fd)
-{
- return __to_fd(__fdget_pos(fd));
-}
+extern struct file *fget(unsigned int fd);
+extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_task(struct task_struct *task, unsigned int fd);
+extern void __f_unlock_pos(struct file *);
+
+struct fd fdget(unsigned int fd);
+struct fd fdget_raw(unsigned int fd);
+struct fd fdget_pos(unsigned int fd);
static inline void fdput_pos(struct fd f)
{
- if (f.flags & FDPUT_POS_UNLOCK)
- __f_unlock_pos(f.file);
+ if (f.word & FDPUT_POS_UNLOCK)
+ __f_unlock_pos(fd_file(f));
fdput(f);
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 64e1506fefb8..7d7578a8eac1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -437,6 +437,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn)
.off = OFF, \
.imm = 0 })
+/* Unconditional jumps, gotol pc + imm32 */
+
+#define BPF_JMP32_A(IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP32 | BPF_JA, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
/* Relative call */
#define BPF_CALL_REL(TGT) \
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 1cca14cf5652..b632eec3ab52 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -134,6 +134,8 @@ struct fw_card {
__be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
__be32 maint_utility_register;
+
+ struct workqueue_struct *isoc_wq;
};
static inline struct fw_card *fw_card_get(struct fw_card *card)
@@ -509,6 +511,7 @@ union fw_iso_callback {
struct fw_iso_context {
struct fw_card *card;
+ struct work_struct work;
int type;
int channel;
int speed;
@@ -528,6 +531,25 @@ int fw_iso_context_queue(struct fw_iso_context *ctx,
unsigned long payload);
void fw_iso_context_queue_flush(struct fw_iso_context *ctx);
int fw_iso_context_flush_completions(struct fw_iso_context *ctx);
+
+/**
+ * fw_iso_context_schedule_flush_completions() - schedule work item to process isochronous context.
+ * @ctx: the isochronous context
+ *
+ * Schedule a work item on workqueue to process the isochronous context. The registered callback
+ * function is called by the worker when a queued packet buffer with the interrupt flag is
+ * completed, either after transmission in the IT context or after being filled in the IR context.
+ * The callback function is also called when the header buffer in the context becomes full, If it
+ * is required to process the context in the current context, fw_iso_context_flush_completions() is
+ * available instead.
+ *
+ * Context: Any context.
+ */
+static inline void fw_iso_context_schedule_flush_completions(struct fw_iso_context *ctx)
+{
+ queue_work(ctx->card->isoc_wq, &ctx->work);
+}
+
int fw_iso_context_start(struct fw_iso_context *ctx,
int cycle, int sync, int tags);
int fw_iso_context_stop(struct fw_iso_context *ctx);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0df3e5f0dd2b..776298fbfcb4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,7 +3148,14 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
-extern void __iget(struct inode * inode);
+/*
+ * inode->i_lock must be held
+ */
+static inline void __iget(struct inode *inode)
+{
+ atomic_inc(&inode->i_count);
+}
+
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
@@ -3513,7 +3520,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
if (flags & RWF_NOWAIT) {
if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
return -EOPNOTSUPP;
- kiocb_flags |= IOCB_NOIO;
}
if (flags & RWF_ATOMIC) {
if (rw_type != WRITE)
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f3512fddf3d7..5b51c3d582d6 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
+#include <linux/slab.h>
#include <linux/types.h>
struct genradix_root;
@@ -48,10 +49,63 @@ struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
+#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+ return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
+
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
struct __genradix {
struct genradix_root *root;
};
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[GENRADIX_NODE_SIZE];
+ };
+};
+
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kfree(node);
+}
+
/*
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
@@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
+static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
+{
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
+ unsigned shift = genradix_depth_shift(level);
+
+ if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
+ return NULL;
+
+ while (n && shift > GENRADIX_NODE_SHIFT) {
+ shift -= GENRADIX_ARY_SHIFT;
+ n = n->children[offset >> shift];
+ offset &= (1UL << shift) - 1;
+ }
+
+ return n ? &n->data[offset] : NULL;
+}
+
+#define genradix_ptr_inlined(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
void *__genradix_ptr(struct __genradix *, size_t);
/**
@@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
-void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+void *__genradix_ptr_alloc(struct __genradix *, size_t,
+ struct genradix_node **, gfp_t);
+
+#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ NULL, _gfp)))
+
+#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp)))
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
- _gfp))
+ NULL, _gfp))
+
+#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp))
struct genradix_iter {
size_t offset;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53f76e0b17e..a951de920e20 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -319,7 +319,7 @@ static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order
}
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
- return __folio_alloc_node(gfp, order, numa_node_id());
+ return __folio_alloc_node_noprof(gfp, order, numa_node_id());
}
static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *mpol, pgoff_t ilx, int nid)
@@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
+#ifdef CONFIG_CONTIG_ALLOC
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+ int nid, nodemask_t *node)
+{
+ struct page *page;
+
+ if (WARN_ON(!order || !(gfp & __GFP_COMP)))
+ return NULL;
+
+ page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
+
+ return page ? page_folio(page) : NULL;
+}
+#else
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+ int nid, nodemask_t *node)
+{
+ return NULL;
+}
+#endif
+/* This should be paired with folio_put() rather than free_contig_range(). */
+#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
+
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 313be4ad79fd..65db9349f905 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -215,7 +215,8 @@ enum {
* the caller still has to check for failures) while costly requests try to be
* not disruptive and back off even without invoking the OOM killer.
* The following three modifiers might be used to override some of these
- * implicit rules.
+ * implicit rules. Please note that all of them must be used along with
+ * %__GFP_DIRECT_RECLAIM flag.
*
* %__GFP_NORETRY: The VM implementation will try only very lightweight
* memory direct reclaim to get some memory under memory pressure (thus
@@ -246,11 +247,14 @@ enum {
* cannot handle allocation failures. The allocation could block
* indefinitely but will never return with failure. Testing for
* failure is pointless.
+ * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM.
+ * It should _never_ be used in non-sleepable contexts.
* New users should be evaluated carefully (and the flag should be
* used only when there is no reasonable failure policy) but it is
* definitely preferable to use the flag rather than opencode endless
* loop around allocator.
- * Using this flag for costly allocations is _highly_ discouraged.
+ * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is
+ * not supported. Please consider using kvmalloc() instead.
*/
#define __GFP_IO ((__force gfp_t)___GFP_IO)
#define __GFP_FS ((__force gfp_t)___GFP_FS)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..67d0ab3c3bba 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -76,9 +76,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
/*
* Mask of all large folio orders supported for file THP. Folios in a DAX
* file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
- * it.
+ * it. Same to PFNMAPs where there's neither page* nor pagecache.
*/
-#define THP_ORDERS_ALL_FILE_DAX \
+#define THP_ORDERS_ALL_SPECIAL \
(BIT(PMD_ORDER) | BIT(PUD_ORDER))
#define THP_ORDERS_ALL_FILE_DEFAULT \
((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
@@ -87,7 +87,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
* Mask of all large folio orders supported for THP.
*/
#define THP_ORDERS_ALL \
- (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
+ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
#define TVA_IN_PF (1 << 1) /* Page fault handler */
@@ -96,6 +96,8 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
+#define split_folio(f) split_folio_to_list(f, NULL)
+
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PUD_SHIFT PUD_SHIFT
@@ -114,6 +116,53 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT)
+enum mthp_stat_item {
+ MTHP_STAT_ANON_FAULT_ALLOC,
+ MTHP_STAT_ANON_FAULT_FALLBACK,
+ MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_SWPOUT,
+ MTHP_STAT_SWPOUT_FALLBACK,
+ MTHP_STAT_SHMEM_ALLOC,
+ MTHP_STAT_SHMEM_FALLBACK,
+ MTHP_STAT_SHMEM_FALLBACK_CHARGE,
+ MTHP_STAT_SPLIT,
+ MTHP_STAT_SPLIT_FAILED,
+ MTHP_STAT_SPLIT_DEFERRED,
+ MTHP_STAT_NR_ANON,
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+ __MTHP_STAT_COUNT
+};
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
+struct mthp_stat {
+ unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
+
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
+{
+ if (order <= 0 || order > PMD_ORDER)
+ return;
+
+ this_cpu_add(mthp_stats.stats[order][item], delta);
+}
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+ mod_mthp_stat(order, item, 1);
+}
+
+#else
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
+{
+}
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+}
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long transparent_hugepage_flags;
@@ -269,41 +318,6 @@ struct thpsize {
#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
-enum mthp_stat_item {
- MTHP_STAT_ANON_FAULT_ALLOC,
- MTHP_STAT_ANON_FAULT_FALLBACK,
- MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
- MTHP_STAT_SWPOUT,
- MTHP_STAT_SWPOUT_FALLBACK,
- MTHP_STAT_SHMEM_ALLOC,
- MTHP_STAT_SHMEM_FALLBACK,
- MTHP_STAT_SHMEM_FALLBACK_CHARGE,
- MTHP_STAT_SPLIT,
- MTHP_STAT_SPLIT_FAILED,
- MTHP_STAT_SPLIT_DEFERRED,
- __MTHP_STAT_COUNT
-};
-
-struct mthp_stat {
- unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
-};
-
-#ifdef CONFIG_SYSFS
-DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
-
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
- if (order <= 0 || order > PMD_ORDER)
- return;
-
- this_cpu_inc(mthp_stats.stats[order][item]);
-}
-#else
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
-}
-#endif
-
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
@@ -314,14 +328,29 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
unsigned long len, unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags);
-bool can_split_folio(struct folio *folio, int *pextra_pins);
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
+int min_order_for_split(struct folio *folio);
+int split_folio_to_list(struct folio *folio, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
- return split_huge_page_to_list_to_order(page, NULL, 0);
+ struct folio *folio = page_folio(page);
+ int ret = min_order_for_split(folio);
+
+ if (ret < 0)
+ return ret;
+
+ /*
+ * split_huge_page() locks the page before splitting and
+ * expects the same page that has been split to be locked when
+ * returned. split_folio(page_folio(page)) cannot be used here
+ * because it converts the page to folio and passes the head
+ * page to be split.
+ */
+ return split_huge_page_to_list_to_order(page, NULL, ret);
}
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio);
@@ -342,6 +371,17 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
unsigned long address);
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pud_t *pudp, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags);
+#else
+static inline int
+change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pud_t *pudp, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags) { return 0; }
+#endif
+
#define split_huge_pud(__vma, __pud, __address) \
do { \
pud_t *____pud = (__pud); \
@@ -410,11 +450,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
}
-static inline bool is_huge_zero_pud(pud_t pud)
-{
- return false;
-}
-
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
@@ -470,7 +505,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
}
static inline bool
-can_split_folio(struct folio *folio, int *pextra_pins)
+can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
return false;
}
@@ -484,7 +519,13 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-static inline void deferred_split_folio(struct folio *folio) {}
+
+static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
+{
+ return 0;
+}
+
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
@@ -555,11 +596,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return false;
}
-static inline bool is_huge_zero_pud(pud_t pud)
-{
- return false;
-}
-
static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
return;
@@ -585,6 +621,19 @@ static inline int next_order(unsigned long *orders, int prev)
{
return 0;
}
+
+static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long address)
+{
+}
+
+static inline int change_huge_pud(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags)
+{
+ return 0;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
@@ -598,7 +647,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
-#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
-#define split_folio(f) split_folio_to_order(f, 0)
-
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 45bf05ad5c53..98c47c394b89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -127,9 +127,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -899,10 +896,11 @@ static inline bool hugepage_movable_supported(struct hstate *h)
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
+ gfp_t gfp = __GFP_COMP | __GFP_NOWARN;
+
+ gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER;
+
+ return gfp;
}
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
@@ -1251,7 +1249,7 @@ static inline __init void hugetlb_cma_reserve(int order)
}
#endif
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return page_count(virt_to_page(pte)) > 1;
@@ -1287,8 +1285,7 @@ bool __vma_private_lock(struct vm_area_struct *vma);
static inline pte_t *
hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
{
-#if defined(CONFIG_HUGETLB_PAGE) && \
- defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
/*
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 377def497298..388ce71a29a9 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -761,6 +761,9 @@ struct i2c_adapter {
struct regulator *bus_regulator;
struct dentry *debugfs;
+
+ /* 7bit address space */
+ DECLARE_BITMAP(addrs_in_instantiation, 1 << 7);
};
#define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..4ad12a3c8bae 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -257,11 +257,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
}
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
- const struct iomap_ops *ops);
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-
+ const struct iomap_ops *ops, void *private);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
@@ -277,6 +273,13 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
const struct iomap_ops *ops);
+
+typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
+ struct iomap *iomap);
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
+ loff_t length, ssize_t written, unsigned flag,
+ struct iomap *iomap, iomap_punch_t punch);
+
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5157d92b6f23..8aef9bb6ad57 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1086,7 +1086,7 @@ struct journal_s
int j_revoke_records_per_block;
/**
- * @j_transaction_overhead:
+ * @j_transaction_overhead_buffers:
*
* Number of blocks each transaction needs for its own bookkeeping
*/
@@ -1675,7 +1675,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
-int jbd2_fc_release_bufs(journal_t *journal);
+void jbd2_fc_release_bufs(journal_t *journal);
/*
* is_journal_abort
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index 859f4b0c1b2b..196778a087c4 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -10,12 +10,11 @@
*/
#define KPF_RESERVED 32
#define KPF_MLOCKED 33
-#define KPF_MAPPEDTODISK 34
+#define KPF_OWNER_2 34
#define KPF_PRIVATE 35
#define KPF_PRIVATE_2 36
#define KPF_OWNER_PRIVATE 37
#define KPF_ARCH 38
-#define KPF_UNCACHED 39
#define KPF_SOFTDIRTY 40
#define KPF_ARCH_2 41
#define KPF_ARCH_3 42
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 88100cc9caba..0ad1ddbb8b99 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -124,7 +124,7 @@ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp
if (!static_branch_likely(&kfence_allocation_key))
return NULL;
#endif
- if (likely(atomic_read(&kfence_allocation_gate)))
+ if (likely(atomic_read(&kfence_allocation_gate) > 0))
return NULL;
return __kfence_alloc(s, size, flags);
}
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index f68865e19b0b..30baae91b225 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -4,6 +4,7 @@
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6885603f211b..e5968c3ed4ae 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -611,6 +611,8 @@ enum led_trigger_netdev_modes {
TRIGGER_NETDEV_FULL_DUPLEX,
TRIGGER_NETDEV_TX,
TRIGGER_NETDEV_RX,
+ TRIGGER_NETDEV_TX_ERR,
+ TRIGGER_NETDEV_RX_ERR,
/* Keep last */
__TRIGGER_NETDEV_MAX,
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 1b95fe31051f..61c4b9c41904 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -200,7 +200,7 @@ extern const struct svc_procedure nlmsvc_procedures[24];
extern const struct svc_procedure nlmsvc_procedures4[24];
#endif
extern int nlmsvc_grace_period;
-extern unsigned long nlmsvc_timeout;
+extern unsigned long nlm_timeout;
extern bool nsm_use_hostnames;
extern u32 nsm_local_state;
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index c9afcdd9324c..ff82ef85a084 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -119,7 +119,7 @@ write intent log information, three of which are mentioned here.
*/
/* this defines an element in a tracked set
- * .colision is for hash table lookup.
+ * .collision is for hash table lookup.
* When we process a new IO request, we know its sector, thus can deduce the
* region number (label) easily. To do the label -> object lookup without a
* full list walk, we use a simple hash table.
@@ -145,7 +145,7 @@ write intent log information, three of which are mentioned here.
* But it avoids high order page allocations in kmalloc.
*/
struct lc_element {
- struct hlist_node colision;
+ struct hlist_node collision;
struct list_head list; /* LRU list or free list */
unsigned refcnt;
/* back "pointer" into lc_cache->element[index],
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index a53ad4dabd7e..c2c11004085e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -52,9 +52,9 @@
* bit in the node type. This is possible by using bit 1 to indicate if bit 2
* is part of the type or the slot.
*
- * Once the type is decided, the decision of an allocation range type or a range
- * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
- * flag.
+ * Once the type is decided, the decision of an allocation range type or a
+ * range type is done by examining the immutable tree flag for the
+ * MT_FLAGS_ALLOC_RANGE flag.
*
* Node types:
* 0x??1 = Root
@@ -148,6 +148,18 @@ enum maple_type {
maple_arange_64,
};
+enum store_type {
+ wr_invalid,
+ wr_new_root,
+ wr_store_root,
+ wr_exact_fit,
+ wr_spanning_store,
+ wr_split_store,
+ wr_rebalance,
+ wr_append,
+ wr_node_store,
+ wr_slot_store,
+};
/**
* DOC: Maple tree flags
@@ -436,6 +448,7 @@ struct ma_state {
unsigned char offset;
unsigned char mas_flags;
unsigned char end; /* The end of the node */
+ enum store_type store_type; /* The type of store needed for this operation */
};
struct ma_wr_state {
@@ -477,6 +490,7 @@ struct ma_wr_state {
.max = ULONG_MAX, \
.alloc = NULL, \
.mas_flags = 0, \
+ .store_type = wr_invalid, \
}
#define MA_WR_STATE(name, ma_state, wr_entry) \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0e5bf25d324f..34d2da05f2f1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -57,7 +57,7 @@ enum memcg_memory_event {
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
- unsigned int generation;
+ int generation;
};
#ifdef CONFIG_MEMCG
@@ -70,6 +70,7 @@ struct mem_cgroup_id {
};
struct memcg_vmstats_percpu;
+struct memcg1_events_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;
@@ -77,7 +78,7 @@ struct lruvec_stats;
struct mem_cgroup_reclaim_iter {
struct mem_cgroup *position;
/* scan generation, increased every round-trip */
- unsigned int generation;
+ atomic_t generation;
};
/*
@@ -193,6 +194,11 @@ struct mem_cgroup {
struct page_counter memsw; /* v1 only */
};
+ /* registered local peak watchers */
+ struct list_head memory_peaks;
+ struct list_head swap_peaks;
+ spinlock_t peaks_lock;
+
/* Range enforcement for interrupt charges */
struct work_struct high_work;
@@ -270,6 +276,8 @@ struct mem_cgroup {
struct page_counter kmem; /* v1 only */
struct page_counter tcpmem; /* v1 only */
+ struct memcg1_events_percpu __percpu *events_percpu;
+
unsigned long soft_limit;
/* protected by memcg_oom_lock */
@@ -361,11 +369,11 @@ static inline bool folio_memcg_kmem(struct folio *folio);
* After the initialization objcg->memcg is always pointing at
* a valid memcg, but can be atomically swapped to the parent memcg.
*
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
+ * The caller must ensure that the returned memcg won't be released.
*/
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
+ lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex));
return READ_ONCE(objcg->memcg);
}
@@ -439,6 +447,19 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
return __folio_memcg(folio);
}
+/*
+ * folio_memcg_charged - If a folio is charged to a memory cgroup.
+ * @folio: Pointer to the folio.
+ *
+ * Returns true if folio is charged to a memory cgroup, otherwise returns false.
+ */
+static inline bool folio_memcg_charged(struct folio *folio)
+{
+ if (folio_memcg_kmem(folio))
+ return __folio_objcg(folio) != NULL;
+ return __folio_memcg(folio) != NULL;
+}
+
/**
* folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
* @folio: Pointer to the folio.
@@ -455,7 +476,6 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- WARN_ON_ONCE(!rcu_read_lock_held());
if (memcg_data & MEMCG_DATA_KMEM) {
struct obj_cgroup *objcg;
@@ -464,6 +484,8 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
return obj_cgroup_memcg(objcg);
}
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
@@ -677,7 +699,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
+
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
void __mem_cgroup_uncharge(struct folio *folio);
@@ -762,6 +785,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct mem_cgroup *get_mem_cgroup_from_current(void);
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);
+
struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1006,8 +1031,8 @@ static inline void count_memcg_folio_events(struct folio *folio,
count_memcg_events(memcg, idx, nr);
}
-static inline void count_memcg_event_mm(struct mm_struct *mm,
- enum vm_event_item idx)
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+ enum vm_event_item idx, unsigned long count)
{
struct mem_cgroup *memcg;
@@ -1017,10 +1042,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (likely(memcg))
- count_memcg_events(memcg, idx, 1);
+ count_memcg_events(memcg, idx, count);
rcu_read_unlock();
}
+static inline void count_memcg_event_mm(struct mm_struct *mm,
+ enum vm_event_item idx)
+{
+ count_memcg_events_mm(mm, idx, 1);
+}
+
static inline void memcg_memory_event(struct mem_cgroup *memcg,
enum memcg_memory_event event)
{
@@ -1176,7 +1207,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
return 0;
}
-static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr)
{
}
@@ -1240,6 +1271,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
return NULL;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+ return NULL;
+}
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
@@ -1462,6 +1498,11 @@ static inline void count_memcg_folio_events(struct folio *folio,
{
}
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+ enum vm_event_item idx, unsigned long count)
+{
+}
+
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
@@ -1717,7 +1758,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-struct mem_cgroup *mem_cgroup_from_obj(void *p);
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
static inline void count_objcg_event(struct obj_cgroup *objcg,
@@ -1780,11 +1820,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
return -1;
}
-static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
-{
- return NULL;
-}
-
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
return NULL;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ebe876930e78..b27ddce5d324 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,54 +16,6 @@ struct resource;
struct vmem_altmap;
struct dev_pagemap;
-#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
-/*
- * For supporting node-hotadd, we have to allocate a new pgdat.
- *
- * If an arch has generic style NODE_DATA(),
- * node_data[nid] = kzalloc() works well. But it depends on the architecture.
- *
- * In general, generic_alloc_nodedata() is used.
- *
- */
-extern pg_data_t *arch_alloc_nodedata(int nid);
-extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
-
-#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
-#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid)
-
-#ifdef CONFIG_NUMA
-/*
- * XXX: node aware allocation can't work well to get new node's memory at this time.
- * Because, pgdat for the new node is not allocated/initialized yet itself.
- * To use new node's memory, more consideration will be necessary.
- */
-#define generic_alloc_nodedata(nid) \
-({ \
- memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \
-})
-
-extern pg_data_t *node_data[];
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
- node_data[nid] = pgdat;
-}
-
-#else /* !CONFIG_NUMA */
-
-/* never called */
-static inline pg_data_t *generic_alloc_nodedata(int nid)
-{
- BUG();
- return NULL;
-}
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
#ifdef CONFIG_MEMORY_HOTPLUG
struct page *pfn_to_online_page(unsigned long pfn);
diff --git a/include/linux/mfd/88pm80x.h b/include/linux/mfd/88pm80x.h
index def5df6e74bf..551ef1c367d6 100644
--- a/include/linux/mfd/88pm80x.h
+++ b/include/linux/mfd/88pm80x.h
@@ -294,7 +294,7 @@ struct pm80x_chip {
struct i2c_client *client;
struct i2c_client *companion;
struct regmap *regmap;
- struct regmap_irq_chip *regmap_irq_chip;
+ const struct regmap_irq_chip *regmap_irq_chip;
struct regmap_irq_chip_data *irq_data;
int type;
int irq;
diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h
deleted file mode 100644
index 43dfca1c9702..000000000000
--- a/include/linux/mfd/ds1wm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* MFD cell driver data for the DS1WM driver
- *
- * to be defined in the MFD device that is
- * using this driver for one of his sub devices
- */
-
-struct ds1wm_driver_data {
- int active_high;
- int clock_rate;
- /* in milliseconds, the amount of time to
- * sleep following a reset pulse. Zero
- * should work if your bus devices recover
- * time respects the 1-wire spec since the
- * ds1wm implements the precise timings of
- * a reset pulse/presence detect sequence.
- */
- unsigned int reset_recover_delay;
-
- /* Say 1 here for big endian Hardware
- * (only relevant with bus-shift > 0
- */
- bool is_hw_big_endian;
-
- /* left shift of register number to get register address offsett.
- * Only 0,1,2 allowed for 8,16 or 32 bit bus width respectively
- */
- unsigned int bus_shift;
-};
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 644be30b69c8..002e49b2ebd9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -70,6 +70,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free,
unsigned int *ret_succeeded);
struct folio *alloc_migration_target(struct folio *src, unsigned long private);
bool isolate_movable_page(struct page *page, isolate_mode_t mode);
+bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src);
@@ -91,6 +92,8 @@ static inline struct folio *alloc_migration_target(struct folio *src,
{ return NULL; }
static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{ return false; }
+static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
+ { return false; }
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13bff7cf03b7..ecf63d2b0582 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -98,7 +98,11 @@ extern int mmap_rnd_compat_bits __read_mostly;
#endif
#ifndef PHYSMEM_END
+# ifdef MAX_PHYSMEM_BITS
# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+# else
+# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63))
+# endif
#endif
#include <asm/page.h>
@@ -1015,27 +1019,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
return mas_prev(&vmi->mas, 0);
}
-static inline
-struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
-{
- return mas_prev_range(&vmi->mas, 0);
-}
-
-static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
-{
- return vmi->mas.index;
-}
-
-static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
-{
- return vmi->mas.last + 1;
-}
-static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
- unsigned long count)
-{
- return mas_expected_entries(&vmi->mas, count);
-}
-
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
unsigned long start, unsigned long end, gfp_t gfp)
{
@@ -1259,8 +1242,7 @@ static inline int folio_mapcount(const struct folio *folio)
if (likely(!folio_test_large(folio))) {
mapcount = atomic_read(&folio->_mapcount) + 1;
- /* Handle page_has_type() pages */
- if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+ if (page_mapcount_is_type(mapcount))
mapcount = 0;
return mapcount;
}
@@ -1756,6 +1738,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}
}
+
+bool folio_use_access_time(struct folio *folio);
#else /* !CONFIG_NUMA_BALANCING */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
@@ -1809,6 +1793,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
+static inline bool folio_use_access_time(struct folio *folio)
+{
+ return false;
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2158,14 +2146,19 @@ static inline size_t folio_size(const struct folio *folio)
* MM ("mapped shared"), or if the folio is only mapped into a single MM
* ("mapped exclusively").
*
+ * For KSM folios, this function also returns "mapped shared" when a folio is
+ * mapped multiple times into the same MM, because the individual page mappings
+ * are independent.
+ *
* As precise information is not easily available for all folios, this function
* estimates the number of MMs ("sharers") that are currently mapping a folio
* using the number of times the first page of the folio is currently mapped
* into page tables.
*
- * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
- * the return value will be exactly correct, because they can only be mapped
- * at most once into an MM, and they cannot be partially mapped.
+ * For small anonymous folios and anonymous hugetlb folios, the return
+ * value will be exactly correct: non-KSM folios can only be mapped at most once
+ * into an MM, and they cannot be partially mapped. KSM folios are
+ * considered shared even if mapped multiple times into the same MM.
*
* For other folios, the result can be fuzzy:
* #. For partially-mappable large folios (THP), the return value can wrongly
@@ -2174,9 +2167,6 @@ static inline size_t folio_size(const struct folio *folio)
* #. For pagecache folios (including hugetlb), the return value can wrongly
* indicate "mapped shared" (false positive) when two VMAs in the same MM
* cover the same file range.
- * #. For (small) KSM folios, the return value can wrongly indicate "mapped
- * shared" (false positive), when the folio is mapped multiple times into
- * the same MM.
*
* Further, this function only considers current page table mappings that
* are tracked using the folio mapcount(s).
@@ -2210,26 +2200,10 @@ static inline bool folio_likely_mapped_shared(struct folio *folio)
return atomic_read(&folio->_mapcount) > 0;
}
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
- return 0;
-}
-#endif
-
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
- int ret;
- long i, nr = folio_nr_pages(folio);
-
- for (i = 0; i < nr; i++) {
- ret = arch_make_page_accessible(folio_page(folio, i));
- if (ret)
- break;
- }
-
- return ret;
+ return 0;
}
#endif
@@ -2409,11 +2383,40 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct vm_area_struct *vma, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
+struct follow_pfnmap_args {
+ /**
+ * Inputs:
+ * @vma: Pointer to @vm_area_struct struct
+ * @address: the virtual address to walk
+ */
+ struct vm_area_struct *vma;
+ unsigned long address;
+ /**
+ * Internals:
+ *
+ * The caller shouldn't touch any of these.
+ */
+ spinlock_t *lock;
+ pte_t *ptep;
+ /**
+ * Outputs:
+ *
+ * @pfn: the PFN of the address
+ * @pgprot: the pgprot_t of the mapping
+ * @writable: whether the mapping is writable
+ * @special: whether the mapping is a special mapping (real PFN maps)
+ */
+ unsigned long pfn;
+ pgprot_t pgprot;
+ bool writable;
+ bool special;
+};
+int follow_pfnmap_start(struct follow_pfnmap_args *args);
+void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
@@ -2541,11 +2544,6 @@ int set_page_dirty_lock(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
-extern unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
-
/*
* Flags used by change_protection(). For now we make it a bitmap so
* that we can pass in multiple flags just like parameters. However
@@ -2566,21 +2564,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
-bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
-{
- /*
- * We want to check manually if we can change individual PTEs writable
- * if we can't do that automatically for all PTEs in a mapping. For
- * private mappings, that's always the case when we have write
- * permissions as we properly have to handle COW.
- */
- if (vma->vm_flags & VM_SHARED)
- return vma_wants_writenotify(vma, vma->vm_page_prot);
- return !!(vma->vm_flags & VM_WRITE);
-
-}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
@@ -2704,6 +2687,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
}
#endif
+#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+ return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+ return false;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+ return pud;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
@@ -2896,7 +2903,7 @@ static inline void pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
@@ -2954,7 +2961,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
return true;
}
-#else /* !USE_SPLIT_PTE_PTLOCKS */
+#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
@@ -2969,7 +2976,7 @@ static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
@@ -3029,7 +3036,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
-#if USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_SPLIT_PMD_PTLOCKS)
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
@@ -3288,78 +3295,9 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next);
-extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void unlink_file_vma(struct vm_area_struct *);
-extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
- unsigned long addr, unsigned long len, pgoff_t pgoff,
- bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long vm_flags,
- struct mempolicy *policy,
- struct vm_userfaultfd_ctx uffd_ctx,
- struct anon_vma_name *anon_name);
-
-/* We are about to modify the VMA's flags. */
-static inline struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long new_flags)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or anon_name. */
-static inline struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long new_flags,
- struct anon_vma_name *new_name)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
-}
-
-/* We are about to modify the VMA's memory policy. */
-static inline struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct mempolicy *new_pol)
-{
- return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
- new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or uffd context. */
-static inline struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), new_ctx, anon_vma_name(vma));
-}
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3392,10 +3330,6 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags,
const struct vm_special_mapping *spec);
-/* This is an obsolete alternative to _install_special_mapping. */
-extern int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long flags, struct page **pages);
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);
@@ -3421,14 +3355,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
+int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
-extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
@@ -3656,9 +3590,6 @@ static inline vm_fault_t vmf_fs_error(int err)
return VM_FAULT_SIGBUS;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags);
-
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
@@ -4194,18 +4125,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
#ifdef CONFIG_UNACCEPTED_MEMORY
-bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
-void accept_memory(phys_addr_t start, phys_addr_t end);
+bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
+void accept_memory(phys_addr_t start, unsigned long size);
#else
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
- phys_addr_t end)
+ unsigned long size)
{
return false;
}
-static inline void accept_memory(phys_addr_t start, phys_addr_t end)
+static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}
@@ -4213,9 +4144,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
- phys_addr_t paddr = pfn << PAGE_SHIFT;
-
- return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+ return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
}
void vma_pgtable_walk_begin(struct vm_area_struct *vma);
@@ -4233,4 +4162,61 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla
}
#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+ int i;
+ struct alloc_tag *tag;
+ unsigned int nr_pages = 1 << new_order;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = pgalloc_tag_get(&folio->page);
+ if (!tag)
+ return;
+
+ for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+ union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
+
+ if (ref) {
+ /* Set new reference to point to the original tag */
+ alloc_tag_ref_set(ref, tag);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+ struct alloc_tag *tag;
+ union codetag_ref *ref;
+
+ tag = pgalloc_tag_get(&old->page);
+ if (!tag)
+ return;
+
+ ref = get_page_tag_ref(&new->page);
+ if (!ref)
+ return;
+
+ /* Clear the old ref to the original allocation tag. */
+ clear_page_tag_ref(&old->page);
+ /* Decrement the counters of the tag on get_new_folio. */
+ alloc_tag_sub(ref, folio_nr_pages(new));
+
+ __alloc_tag_ref_set(ref, tag);
+
+ put_page_tag_ref(ref);
+}
+#else /* !CONFIG_MEM_ALLOC_PROFILING */
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+}
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..6e3bdf8e38bc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -109,7 +109,7 @@ struct page {
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
- * Used for swp_entry_t if PageSwapCache.
+ * Used for swp_entry_t if swapcache flag set.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
@@ -660,6 +660,9 @@ struct vma_numab_state {
* per VM-area/task. A VM area is any part of the process virtual memory
* space that has a special rule for the page-fault handlers (ie a shared
* library, the executable area etc).
+ *
+ * Only explicitly marked struct members may be accessed by RCU readers before
+ * getting a stable reference.
*/
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
@@ -675,7 +678,11 @@ struct vm_area_struct {
#endif
};
- struct mm_struct *vm_mm; /* The address space we belong to. */
+ /*
+ * The address space we belong to.
+ * Unstable RCU readers are allowed to read this.
+ */
+ struct mm_struct *vm_mm;
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
/*
@@ -688,7 +695,10 @@ struct vm_area_struct {
};
#ifdef CONFIG_PER_VMA_LOCK
- /* Flag to indicate areas detached from the mm->mm_mt tree */
+ /*
+ * Flag to indicate areas detached from the mm->mm_mt tree.
+ * Unstable RCU readers are allowed to read this.
+ */
bool detached;
/*
@@ -706,6 +716,7 @@ struct vm_area_struct {
* slowpath.
*/
int vm_lock_seq;
+ /* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock;
#endif
@@ -947,7 +958,7 @@ struct mm_struct {
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1313,6 +1324,9 @@ struct vm_special_mapping {
int (*mremap)(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma);
+
+ void (*close)(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma);
};
enum tlb_flush_reason {
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a2f6179b672b..bff5706b76e1 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -16,9 +16,6 @@
#include <asm/tlbbatch.h>
#endif
-#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
-#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
- IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
/*
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 6a31ed02d3ff..8fc2b328ec4d 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/fault-inject.h>
+#include <linux/debugfs.h>
#include <linux/mmc/core.h>
#include <linux/mmc/card.h>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1dc6248feb83..17506e4a2835 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -666,11 +666,6 @@ enum zone_watermarks {
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
-#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
-#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
-#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
-#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
-
/*
* Flags used in pcp->flags field.
*
@@ -1016,6 +1011,32 @@ enum zone_flags {
ZONE_BELOW_HIGH, /* zone is below high watermark. */
};
+static inline unsigned long wmark_pages(const struct zone *z,
+ enum zone_watermarks w)
+{
+ return z->_watermark[w] + z->watermark_boost;
+}
+
+static inline unsigned long min_wmark_pages(const struct zone *z)
+{
+ return wmark_pages(z, WMARK_MIN);
+}
+
+static inline unsigned long low_wmark_pages(const struct zone *z)
+{
+ return wmark_pages(z, WMARK_LOW);
+}
+
+static inline unsigned long high_wmark_pages(const struct zone *z)
+{
+ return wmark_pages(z, WMARK_HIGH);
+}
+
+static inline unsigned long promo_wmark_pages(const struct zone *z)
+{
+ return wmark_pages(z, WMARK_PROMO);
+}
+
static inline unsigned long zone_managed_pages(struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
@@ -1688,7 +1709,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
zone = zonelist_zone(z))
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
- for (zone = z->zone; \
+ for (zone = zonelist_zone(z); \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))
@@ -1724,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
nid = first_node(*nodes);
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
- return (!z->zone) ? true : false;
+ return (!zonelist_zone(z)) ? true : false;
}
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 944979763825..b10093c4d00e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -554,6 +554,8 @@ enum {
MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19),
/* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */
MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20),
+ /* PCI MSIs cannot be steered separately to CPU cores */
+ MSI_FLAG_NO_AFFINITY = (1 << 21),
};
/**
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index f9df88091c6d..8d7430d9f218 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -281,15 +281,18 @@ enum nfsstat4 {
/* nfs42 */
NFS4ERR_PARTNER_NOTSUPP = 10088,
NFS4ERR_PARTNER_NO_AUTH = 10089,
- NFS4ERR_UNION_NOTSUPP = 10090,
- NFS4ERR_OFFLOAD_DENIED = 10091,
- NFS4ERR_WRONG_LFS = 10092,
- NFS4ERR_BADLABEL = 10093,
- NFS4ERR_OFFLOAD_NO_REQS = 10094,
+ NFS4ERR_UNION_NOTSUPP = 10090,
+ NFS4ERR_OFFLOAD_DENIED = 10091,
+ NFS4ERR_WRONG_LFS = 10092,
+ NFS4ERR_BADLABEL = 10093,
+ NFS4ERR_OFFLOAD_NO_REQS = 10094,
/* xattr (RFC8276) */
- NFS4ERR_NOXATTR = 10095,
- NFS4ERR_XATTR2BIG = 10096,
+ NFS4ERR_NOXATTR = 10095,
+ NFS4ERR_XATTR2BIG = 10096,
+
+ /* can be used for internal errors */
+ NFS4ERR_FIRST_FREE
};
/* error codes for internal client use */
diff --git a/include/linux/numa.h b/include/linux/numa.h
index eb19503604fe..3567e40329eb 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -30,6 +30,12 @@ static inline bool numa_valid_node(int nid)
#ifdef CONFIG_NUMA
#include <asm/sparsemem.h>
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid) (node_data[nid])
+
+void __init alloc_node_data(int nid);
+void __init alloc_offline_node_data(int nid);
+
/* Generic implementation available */
int numa_nearest_node(int node, unsigned int state);
@@ -57,6 +63,8 @@ static inline int phys_to_target_node(u64 start)
{
return 0;
}
+
+static inline void alloc_offline_node_data(int nid) {}
#endif
#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
new file mode 100644
index 000000000000..cfad6ce7e1bd
--- /dev/null
+++ b/include/linux/numa_memblks.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NUMA_MEMBLKS_H
+#define __NUMA_MEMBLKS_H
+
+#ifdef CONFIG_NUMA_MEMBLKS
+#include <linux/types.h>
+
+#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2)
+
+void __init numa_set_distance(int from, int to, int distance);
+void __init numa_reset_distance(void);
+
+struct numa_memblk {
+ u64 start;
+ u64 end;
+ int nid;
+};
+
+struct numa_meminfo {
+ int nr_blks;
+ struct numa_memblk blk[NR_NODE_MEMBLKS];
+};
+
+int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+
+int __init numa_memblks_init(int (*init_func)(void),
+ bool memblock_force_top_down);
+
+#ifdef CONFIG_NUMA_EMU
+int numa_emu_cmdline(char *str);
+void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
+ unsigned int nr_emu_nids);
+u64 __init numa_emu_dma_end(void);
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt)
+{ }
+static inline int numa_emu_cmdline(char *str)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_NUMA_EMU */
+
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+extern int phys_to_target_node(u64 start);
+#define phys_to_target_node phys_to_target_node
+extern int memory_add_physaddr_to_nid(u64 start);
+#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+#endif /* CONFIG_NUMA_KEEP_MEMINFO */
+
+#endif /* CONFIG_NUMA_MEMBLKS */
+
+#endif /* __NUMA_MEMBLKS_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5769fe6e4950..1b3a76710487 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -66,8 +66,6 @@
* PG_referenced, PG_reclaim are used for page reclaim for anonymous and
* file-backed pagecache (see mm/vmscan.c).
*
- * PG_error is set to indicate that an I/O error occurred on this page.
- *
* PG_arch_1 is an architecture specific page state bit. The generic code
* guarantees that this bit is cleared for a page when it first is entered into
* the page cache.
@@ -103,22 +101,18 @@ enum pageflags {
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_active,
PG_workingset,
- PG_error,
- PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
+ PG_owner_priv_1, /* Owner use. If pagecache, fs may use */
+ PG_owner_2, /* Owner use. If pagecache, fs may use */
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
- PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- PG_uncached, /* Page has been mapped as uncached */
-#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
@@ -126,14 +120,21 @@ enum pageflags {
PG_young,
PG_idle,
#endif
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
PG_arch_2,
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
PG_arch_3,
#endif
__NR_PAGEFLAGS,
PG_readahead = PG_reclaim,
+ /* Anonymous memory (and shmem) */
+ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
+ /* Some filesystems */
+ PG_checked = PG_owner_priv_1,
+
/*
* Depending on the way an anonymous folio can be mapped into a page
* table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
@@ -141,13 +142,13 @@ enum pageflags {
* tail pages of an anonymous folio. For now, we only expect it to be
* set on tail pages for PTE-mapped THP.
*/
- PG_anon_exclusive = PG_mappedtodisk,
-
- /* Filesystems */
- PG_checked = PG_owner_priv_1,
+ PG_anon_exclusive = PG_owner_2,
- /* SwapBacked */
- PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
+ /*
+ * Set if all buffer heads in the folio are mapped.
+ * Filesystems which do not use BHs can use it for their own purpose.
+ */
+ PG_mappedtodisk = PG_owner_2,
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
@@ -183,8 +184,9 @@ enum pageflags {
*/
/* At least one page in this folio has the hwpoison flag set */
- PG_has_hwpoisoned = PG_error,
+ PG_has_hwpoisoned = PG_active,
PG_large_rmappable = PG_workingset, /* anon or file-backed */
+ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -235,7 +237,7 @@ static __always_inline int page_is_fake_head(const struct page *page)
return page_fixed_fake_head(page) != page;
}
-static inline unsigned long _compound_head(const struct page *page)
+static __always_inline unsigned long _compound_head(const struct page *page)
{
unsigned long head = READ_ONCE(page->compound_head);
@@ -506,7 +508,6 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
-PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
__FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
@@ -514,8 +515,9 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
TESTCLEARFLAG(LRU, lru, PF_HEAD)
-PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
- TESTCLEARFLAG(Active, active, PF_HEAD)
+FOLIO_FLAG(active, FOLIO_HEAD_PAGE)
+ __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
PAGEFLAG(Workingset, workingset, PF_HEAD)
TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
@@ -531,9 +533,9 @@ PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
-PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
- __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
- __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
+ __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE)
+ __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE)
/*
* Private page markings that may be used by the filesystem that owns the page
@@ -542,8 +544,9 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
*/
PAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
-PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
- TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+
+/* owner_2 can be set on tail pages for anon memory */
+FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
@@ -556,8 +559,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
- TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)
+FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE)
#ifdef CONFIG_HIGHMEM
/*
@@ -577,34 +580,26 @@ static __always_inline bool folio_test_swapcache(const struct folio *folio)
test_bit(PG_swapcache, const_folio_flags(folio, 0));
}
-static __always_inline bool PageSwapCache(const struct page *page)
-{
- return folio_test_swapcache(page_folio(page));
-}
-
-SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
-CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
+FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE)
+FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE)
#else
-PAGEFLAG_FALSE(SwapCache, swapcache)
+FOLIO_FLAG_FALSE(swapcache)
#endif
-PAGEFLAG(Unevictable, unevictable, PF_HEAD)
- __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
- TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
+FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE)
+ __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
- __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
- TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
-#else
-PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
- TESTSCFLAG_FALSE(Mlocked, mlocked)
-#endif
-
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
+FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE)
+ __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE)
#else
-PAGEFLAG_FALSE(Uncached, uncached)
+FOLIO_FLAG_FALSE(mlocked)
+ __FOLIO_CLEAR_FLAG_NOOP(mlocked)
+ FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked)
+ FOLIO_TEST_SET_FLAG_FALSE(mlocked)
#endif
#ifdef CONFIG_MEMORY_FAILURE
@@ -865,8 +860,18 @@ static inline void ClearPageCompound(struct page *page)
ClearPageHead(page);
}
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+/*
+ * PG_partially_mapped is protected by deferred_split split_queue_lock,
+ * so its safe to use non-atomic set/clear.
+ */
+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
#else
FOLIO_FLAG_FALSE(large_rmappable)
+FOLIO_TEST_FLAG_FALSE(partially_mapped)
+__FOLIO_SET_FLAG_NOOP(partially_mapped)
+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
#endif
#define PG_head_mask ((1UL << PG_head))
@@ -927,79 +932,74 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#endif
/*
- * For pages that are never mapped to userspace,
- * page_type may be used. Because it is initialised to -1, we invert the
- * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
- * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
- * low bits so that an underflow or overflow of _mapcount won't be
- * mistaken for a page type value.
+ * For pages that do not use mapcount, page_type may be used.
+ * The low 24 bits of pagetype may be used for your own purposes, as long
+ * as you are careful to not affect the top 8 bits. The low bits of
+ * pagetype will be overwritten when you clear the page_type from the page.
*/
-
enum pagetype {
- PG_buddy = 0x40000000,
- PG_offline = 0x20000000,
- PG_table = 0x10000000,
- PG_guard = 0x08000000,
- PG_hugetlb = 0x04000000,
- PG_slab = 0x02000000,
- PG_zsmalloc = 0x01000000,
-
- PAGE_TYPE_BASE = 0x80000000,
-
- /*
- * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and
- * allow owners that set a type to reuse the lower 16 bit for their own
- * purposes.
- */
- PAGE_MAPCOUNT_RESERVE = ~0x0000ffff,
+ /* 0x00-0x7f are positive numbers, ie mapcount */
+ /* Reserve 0x80-0xef for mapcount overflow. */
+ PGTY_buddy = 0xf0,
+ PGTY_offline = 0xf1,
+ PGTY_table = 0xf2,
+ PGTY_guard = 0xf3,
+ PGTY_hugetlb = 0xf4,
+ PGTY_slab = 0xf5,
+ PGTY_zsmalloc = 0xf6,
+ PGTY_unaccepted = 0xf7,
+
+ PGTY_mapcount_underflow = 0xff
};
-#define PageType(page, flag) \
- ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
-#define folio_test_type(folio, flag) \
- ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+static inline bool page_type_has_type(int page_type)
+{
+ return page_type < (PGTY_mapcount_underflow << 24);
+}
-static inline int page_type_has_type(unsigned int page_type)
+/* This takes a mapcount which is one more than page->_mapcount */
+static inline bool page_mapcount_is_type(unsigned int mapcount)
{
- return (int)page_type < PAGE_MAPCOUNT_RESERVE;
+ return page_type_has_type(mapcount - 1);
}
-static inline int page_has_type(const struct page *page)
+static inline bool page_has_type(const struct page *page)
{
- return page_type_has_type(READ_ONCE(page->page_type));
+ return page_mapcount_is_type(data_race(page->page_type));
}
#define FOLIO_TYPE_OPS(lname, fname) \
-static __always_inline bool folio_test_##fname(const struct folio *folio)\
+static __always_inline bool folio_test_##fname(const struct folio *folio) \
{ \
- return folio_test_type(folio, PG_##lname); \
+ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
- VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
- folio->page.page_type &= ~PG_##lname; \
+ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
+ folio); \
+ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
- folio->page.page_type |= PG_##lname; \
+ folio->page.page_type = UINT_MAX; \
}
#define PAGE_TYPE_OPS(uname, lname, fname) \
FOLIO_TYPE_OPS(lname, fname) \
static __always_inline int Page##uname(const struct page *page) \
{ \
- return PageType(page, PG_##lname); \
+ return data_race(page->page_type >> 24) == PGTY_##lname; \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
- VM_BUG_ON_PAGE(!PageType(page, 0), page); \
- page->page_type &= ~PG_##lname; \
+ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
+ page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
- page->page_type |= PG_##lname; \
+ page->page_type = UINT_MAX; \
}
/*
@@ -1076,6 +1076,13 @@ FOLIO_TEST_FLAG_FALSE(hugetlb)
PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
+/*
+ * Mark pages that has to be accepted before touched for the first time.
+ *
+ * Serialized with zone lock.
+ */
+PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
+
/**
* PageHuge - Determine if the page belongs to hugetlbfs
* @page: The page to test.
@@ -1175,25 +1182,20 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
#define PAGE_FLAGS_SECOND \
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
- 1UL << PG_large_rmappable)
+ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
/**
- * page_has_private - Determine if page has private stuff
- * @page: The page to be checked
+ * folio_has_private - Determine if folio has private stuff
+ * @folio: The folio to be checked
*
- * Determine if a page has private stuff, indicating that release routines
+ * Determine if a folio has private stuff, indicating that release routines
* should be invoked upon it.
*/
-static inline int page_has_private(const struct page *page)
-{
- return !!(page->flags & PAGE_FLAGS_PRIVATE);
-}
-
-static inline bool folio_has_private(const struct folio *folio)
+static inline int folio_has_private(const struct folio *folio)
{
- return page_has_private(&folio->page);
+ return !!(folio->flags & PAGE_FLAGS_PRIVATE);
}
#undef PF_ANY
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 904c52f97284..79dbd8bc35a7 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -26,11 +26,14 @@ struct page_counter {
atomic_long_t children_low_usage;
unsigned long watermark;
+ /* Latest cg2 reset watermark */
+ unsigned long local_watermark;
unsigned long failcnt;
/* Keep all the read most fields in a separete cacheline. */
CACHELINE_PADDING(_pad2_);
+ bool protection_support;
unsigned long min;
unsigned long low;
unsigned long high;
@@ -44,12 +47,17 @@ struct page_counter {
#define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE)
#endif
+/*
+ * Protection is supported only for the first counter (with id 0).
+ */
static inline void page_counter_init(struct page_counter *counter,
- struct page_counter *parent)
+ struct page_counter *parent,
+ bool protection_support)
{
- atomic_long_set(&counter->usage, 0);
+ counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0);
counter->max = PAGE_COUNTER_MAX;
counter->parent = parent;
+ counter->protection_support = protection_support;
}
static inline unsigned long page_counter_read(struct page_counter *counter)
@@ -78,11 +86,24 @@ int page_counter_memparse(const char *buf, const char *max,
static inline void page_counter_reset_watermark(struct page_counter *counter)
{
- counter->watermark = page_counter_read(counter);
+ unsigned long usage = page_counter_read(counter);
+
+ /*
+ * Update local_watermark first, so it's always <= watermark
+ * (modulo CPU/compiler re-ordering)
+ */
+ counter->local_watermark = usage;
+ counter->watermark = usage;
}
+#ifdef CONFIG_MEMCG
void page_counter_calculate_protection(struct page_counter *root,
struct page_counter *counter,
bool recursive_protection);
+#else
+static inline void page_counter_calculate_protection(struct page_counter *root,
+ struct page_counter *counter,
+ bool recursive_protection) {}
+#endif
#endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e39c3a7ce33c..68a5f1ff3301 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -206,14 +206,21 @@ enum mapping_flags {
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
- AS_LARGE_FOLIO_SUPPORT = 6,
- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
- AS_STABLE_WRITES, /* must wait for writeback before modifying
+ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */
+ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying
folio contents */
- AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping,
- including to move the mapping */
+ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
+ /* Bits 16-25 are used for FOLIO_ORDER */
+ AS_FOLIO_ORDER_BITS = 5,
+ AS_FOLIO_ORDER_MIN = 16,
+ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
};
+#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1)
+#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN)
+#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX)
+#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK)
+
/**
* mapping_set_error - record a writeback error in the address_space
* @mapping: the mapping in which an error should be set
@@ -369,9 +376,64 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
#define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1)
#define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
+/*
+ * mapping_max_folio_size_supported() - Check the max folio size supported
+ *
+ * The filesystem should call this function at mount time if there is a
+ * requirement on the folio mapping size in the page cache.
+ */
+static inline size_t mapping_max_folio_size_supported(void)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
+ return PAGE_SIZE;
+}
+
+/*
+ * mapping_set_folio_order_range() - Set the orders supported by a file.
+ * @mapping: The address space of the file.
+ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
+ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
+ *
+ * The filesystem should call this function in its inode constructor to
+ * indicate which base size (min) and maximum size (max) of folio the VFS
+ * can use to cache the contents of the file. This should only be used
+ * if the filesystem needs special handling of folio sizes (ie there is
+ * something the core cannot know).
+ * Do not tune it based on, eg, i_size.
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_set_folio_order_range(struct address_space *mapping,
+ unsigned int min,
+ unsigned int max)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ if (min > MAX_PAGECACHE_ORDER)
+ min = MAX_PAGECACHE_ORDER;
+
+ if (max > MAX_PAGECACHE_ORDER)
+ max = MAX_PAGECACHE_ORDER;
+
+ if (max < min)
+ max = min;
+
+ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
+ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
+}
+
+static inline void mapping_set_folio_min_order(struct address_space *mapping,
+ unsigned int min)
+{
+ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
+}
+
/**
* mapping_set_large_folios() - Indicate the file supports large folios.
- * @mapping: The file.
+ * @mapping: The address space of the file.
*
* The filesystem should call this function in its inode constructor to
* indicate that the VFS can use large folios to cache the contents of
@@ -382,7 +444,44 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
*/
static inline void mapping_set_large_folios(struct address_space *mapping)
{
- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
+}
+
+static inline unsigned int
+mapping_max_folio_order(const struct address_space *mapping)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX;
+}
+
+static inline unsigned int
+mapping_min_folio_order(const struct address_space *mapping)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
+}
+
+static inline unsigned long
+mapping_min_folio_nrpages(struct address_space *mapping)
+{
+ return 1UL << mapping_min_folio_order(mapping);
+}
+
+/**
+ * mapping_align_index() - Align index for this mapping.
+ * @mapping: The address_space.
+ * @index: The page index.
+ *
+ * The index of a folio must be naturally aligned. If you are adding a
+ * new folio to the page cache and need to know what index to give it,
+ * call this function.
+ */
+static inline pgoff_t mapping_align_index(struct address_space *mapping,
+ pgoff_t index)
+{
+ return round_down(index, mapping_min_folio_nrpages(mapping));
}
/*
@@ -391,20 +490,17 @@ static inline void mapping_set_large_folios(struct address_space *mapping)
*/
static inline bool mapping_large_folio_support(struct address_space *mapping)
{
- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */
+ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */
VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
"Anonymous mapping always supports large folio");
- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+ return mapping_max_folio_order(mapping) > 0;
}
/* Return the maximum folio size for this pagecache mapping, in bytes. */
-static inline size_t mapping_max_folio_size(struct address_space *mapping)
+static inline size_t mapping_max_folio_size(const struct address_space *mapping)
{
- if (mapping_large_folio_support(mapping))
- return PAGE_SIZE << MAX_PAGECACHE_ORDER;
- return PAGE_SIZE;
+ return PAGE_SIZE << mapping_max_folio_order(mapping);
}
static inline int filemap_nr_thps(struct address_space *mapping)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 27cd1e59ccf7..f5eb5a32aeed 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
pgoff_t nr, const struct mm_walk_ops *ops,
void *private);
+typedef int __bitwise folio_walk_flags_t;
+
+/*
+ * Walk migration entries as well. Careful: a large folio might get split
+ * concurrently.
+ */
+#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0))
+
+/* Walk shared zeropages (small + huge) as well. */
+#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1))
+
+enum folio_walk_level {
+ FW_LEVEL_PTE,
+ FW_LEVEL_PMD,
+ FW_LEVEL_PUD,
+};
+
+/**
+ * struct folio_walk - folio_walk_start() / folio_walk_end() data
+ * @page: exact folio page referenced (if applicable)
+ * @level: page table level identifying the entry type
+ * @pte: pointer to the page table entry (FW_LEVEL_PTE).
+ * @pmd: pointer to the page table entry (FW_LEVEL_PMD).
+ * @pud: pointer to the page table entry (FW_LEVEL_PUD).
+ * @ptl: pointer to the page table lock.
+ *
+ * (see folio_walk_start() documentation for more details)
+ */
+struct folio_walk {
+ /* public */
+ struct page *page;
+ enum folio_walk_level level;
+ union {
+ pte_t *ptep;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ };
+ union {
+ pte_t pte;
+ pud_t pud;
+ pmd_t pmd;
+ };
+ /* private */
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+};
+
+struct folio *folio_walk_start(struct folio_walk *fw,
+ struct vm_area_struct *vma, unsigned long addr,
+ folio_walk_flags_t flags);
+
+#define folio_walk_end(__fw, __vma) do { \
+ spin_unlock((__fw)->ptl); \
+ if (likely((__fw)->level == FW_LEVEL_PTE)) \
+ pte_unmap((__fw)->ptep); \
+ vma_pgtable_walk_end(__vma); \
+} while (0)
+
#endif /* _LINUX_PAGEWALK_H */
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 85bdf2adb760..42ef06136bd1 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -128,6 +128,7 @@ struct pci_epc_mem {
* @group: configfs group representing the PCI EPC device
* @lock: mutex to protect pci_epc ops
* @function_num_map: bitmap to manage physical function number
+ * @domain_nr: PCI domain number of the endpoint controller
* @init_complete: flag to indicate whether the EPC initialization is complete
* or not
*/
@@ -145,10 +146,12 @@ struct pci_epc {
/* mutex to protect against concurrent access of EP controller */
struct mutex lock;
unsigned long function_num_map;
+ int domain_nr;
bool init_complete;
};
/**
+ * enum pci_epc_bar_type - configurability of endpoint BAR
* @BAR_PROGRAMMABLE: The BAR mask can be configured by the EPC.
* @BAR_FIXED: The BAR mask is fixed by the hardware.
* @BAR_RESERVED: The BAR should not be touched by an EPF driver.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..573b4c4c2be6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -371,6 +371,7 @@ struct pci_dev {
can be generated */
unsigned int pme_poll:1; /* Poll device's PME status bit */
unsigned int pinned:1; /* Whether this dev is pinned */
+ unsigned int config_rrs_sv:1; /* Config RRS software visibility */
unsigned int imm_ready:1; /* Supports Immediate Readiness */
unsigned int d1_support:1; /* Low power state D1 is supported */
unsigned int d2_support:1; /* Low power state D2 is supported */
@@ -517,6 +518,9 @@ struct pci_dev {
#ifdef CONFIG_PCI_DOE
struct xarray doe_mbs; /* Data Object Exchange mailboxes */
#endif
+#ifdef CONFIG_PCI_NPEM
+ struct npem *npem; /* Native PCIe Enclosure Management */
+#endif
u16 acs_cap; /* ACS Capability offset */
phys_addr_t rom; /* Physical address if not from BAR */
size_t romlen; /* Length if not from BAR */
@@ -1098,7 +1102,7 @@ enum pcie_bus_config_types {
extern enum pcie_bus_config_types pcie_bus_config;
-extern struct bus_type pci_bus_type;
+extern const struct bus_type pci_bus_type;
/* Do NOT directly access these two variables, unless you are arch-specific PCI
* code, or PCI core code. */
@@ -1884,7 +1888,7 @@ static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
{ return 0; }
#endif
int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent);
-void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent);
+void pci_bus_release_domain_nr(struct device *parent, int domain_nr);
#endif
/* Some architectures require additional setup to direct VGA traffic */
@@ -2290,8 +2294,11 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass,
#endif
void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
+void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar,
+ const char *name);
void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
+int pcim_request_region(struct pci_dev *pdev, int bar, const char *name);
int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name);
int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask,
const char *name);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 91182aa1d2ec..4cf6aaed5f35 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2662,6 +2662,8 @@
#define PCI_DEVICE_ID_DCI_PCCOM8 0x0002
#define PCI_DEVICE_ID_DCI_PCCOM2 0x0004
+#define PCI_VENDOR_ID_GLENFLY 0x6766
+
#define PCI_VENDOR_ID_INTEL 0x8086
#define PCI_DEVICE_ID_INTEL_EESSC 0x0008
#define PCI_DEVICE_ID_INTEL_HDA_CML_LP 0x02c8
@@ -2707,6 +2709,9 @@
#define PCI_DEVICE_ID_INTEL_82815_MC 0x1130
#define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132
#define PCI_DEVICE_ID_INTEL_SST_TNG 0x119a
+#define PCI_DEVICE_ID_INTEL_DSA_GNRD 0x11fb
+#define PCI_DEVICE_ID_INTEL_DSA_DMR 0x1212
+#define PCI_DEVICE_ID_INTEL_IAA_DMR 0x1216
#define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221
#define PCI_DEVICE_ID_INTEL_82437 0x122d
#define PCI_DEVICE_ID_INTEL_82371FB_0 0x122e
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4b2047b78b67..b6321fc49159 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,6 @@ extern void __init setup_per_cpu_areas(void);
extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp_t gfp) __alloc_size(1);
-extern size_t pcpu_alloc_size(void __percpu *__pdata);
#define __alloc_percpu_gfp(_size, _align, _gfp) \
alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp))
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 207f0c83c8e9..59a3deb792a8 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
}
}
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr)
-{
- int i;
- struct page_ext *first_page_ext;
- struct page_ext *page_ext;
- union codetag_ref *ref;
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- first_page_ext = page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
- return;
-
- ref = codetag_ref_from_page_ext(page_ext);
- if (!ref->ct)
- goto out;
-
- tag = ct_to_alloc_tag(ref->ct);
- page_ext = page_ext_next(page_ext);
- for (i = 1; i < nr; i++) {
- /* Set new reference to point to the original tag */
- alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag);
- page_ext = page_ext_next(page_ext);
- }
-out:
- page_ext_put(first_page_ext);
-}
-
static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
{
struct alloc_tag *tag = NULL;
@@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {}
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a6a3cccfc36..e8b2ac6bd2ae 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -447,6 +447,12 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
}
#endif
+#ifndef arch_check_zapped_pud
+static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
+{
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
@@ -1950,6 +1956,18 @@ typedef unsigned int pgtbl_mod_mask;
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif
+#ifndef pte_pgprot
+#define pte_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pmd_pgprot
+#define pmd_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pud_pgprot
+#define pud_pgprot(x) ((pgprot_t) {0})
+#endif
+
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index a65d3d078e58..53cfde98433d 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -81,6 +81,8 @@ struct pinctrl_map;
* @PIN_CONFIG_INPUT_SCHMITT_ENABLE: control schmitt-trigger mode on the pin.
* If the argument != 0, schmitt-trigger mode is enabled. If it's 0,
* schmitt-trigger mode is disabled.
+ * @PIN_CONFIG_INPUT_SCHMITT_UV: this will configure an input pin to run in
+ * schmitt-trigger mode. The argument is in uV.
* @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power
* operation, if several modes of operation are supported these can be
* passed in the argument on a custom form, else just use argument 1
@@ -132,6 +134,7 @@ enum pin_config_param {
PIN_CONFIG_INPUT_ENABLE,
PIN_CONFIG_INPUT_SCHMITT,
PIN_CONFIG_INPUT_SCHMITT_ENABLE,
+ PIN_CONFIG_INPUT_SCHMITT_UV,
PIN_CONFIG_MODE_LOW_POWER,
PIN_CONFIG_MODE_PWM,
PIN_CONFIG_OUTPUT,
diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h
new file mode 100644
index 000000000000..576d952f97ed
--- /dev/null
+++ b/include/linux/platform_data/amd_qdma.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _PLATDATA_AMD_QDMA_H
+#define _PLATDATA_AMD_QDMA_H
+
+#include <linux/dmaengine.h>
+
+/**
+ * struct qdma_queue_info - DMA queue information. This information is used to
+ * match queue when DMA channel is requested
+ * @dir: Channel transfer direction
+ */
+struct qdma_queue_info {
+ enum dma_transfer_direction dir;
+};
+
+#define QDMA_FILTER_PARAM(qinfo) ((void *)(qinfo))
+
+struct dma_slave_map;
+
+/**
+ * struct qdma_platdata - Platform specific data for QDMA engine
+ * @max_mm_channels: Maximum number of MM DMA channels in each direction
+ * @device_map: DMA slave map
+ * @irq_index: The index of first IRQ
+ */
+struct qdma_platdata {
+ u32 max_mm_channels;
+ u32 irq_index;
+ struct dma_slave_map *device_map;
+};
+
+#endif /* _PLATDATA_AMD_QDMA_H */
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 07071e64abf3..89a0d83ddad0 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -526,7 +526,7 @@ struct quota_info {
const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */
};
-int register_quota_format(struct quota_format_type *fmt);
+void register_quota_format(struct quota_format_type *fmt);
void unregister_quota_format(struct quota_format_type *fmt);
struct quota_module_name {
diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h
index 002266693e50..765232ce0b5e 100644
--- a/include/linux/ratelimit_types.h
+++ b/include/linux/ratelimit_types.h
@@ -19,8 +19,8 @@ struct ratelimit_state {
int burst;
int printed;
int missed;
+ unsigned int flags;
unsigned long begin;
- unsigned long flags;
};
#define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fd35d4ec12e1..17fbb7855295 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -89,6 +89,14 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
struct trace_buffer *
__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ struct lock_class_key *key);
+
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+ long *data);
+
/*
* Because the ring buffer is generic, if other users of the ring buffer get
* traced by ftrace, it can produce lockdep warnings. We need to keep each
@@ -100,6 +108,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
__ring_buffer_alloc((size), (flags), &__key); \
})
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc_range(size, flags, order, start, range_size) \
+({ \
+ static struct lock_class_key __key; \
+ __ring_buffer_alloc_range((size), (flags), (order), (start), \
+ (range_size), &__key); \
+})
+
typedef bool (*ring_buffer_cond_fn)(void *data);
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
ring_buffer_cond_fn cond, void *data);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0978c64f49d8..d5e93e44322e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -331,7 +331,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
switch (level) {
case RMAP_LEVEL_PTE:
if (!folio_test_large(folio)) {
- atomic_inc(&page->_mapcount);
+ atomic_inc(&folio->_mapcount);
break;
}
@@ -425,7 +425,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
if (!folio_test_large(folio)) {
if (PageAnonExclusive(page))
ClearPageAnonExclusive(page);
- atomic_inc(&page->_mapcount);
+ atomic_inc(&folio->_mapcount);
break;
}
@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+ RMP_LOCKED = 1 << 0,
+ RMP_USE_SHARED_ZEROPAGE = 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
/*
* rmap_walk_control: To control rmap traversing for specific needs
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a1d0c7cab25c..e6ee4258169a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -82,6 +82,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;
+#include <linux/sched/ext.h>
+
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
@@ -830,6 +832,9 @@ struct task_struct {
struct sched_rt_entity rt;
struct sched_dl_entity dl;
struct sched_dl_entity *dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct sched_ext_entity scx;
+#endif
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
new file mode 100644
index 000000000000..1ddbde64a31b
--- /dev/null
+++ b/include/linux/sched/ext.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef _LINUX_SCHED_EXT_H
+#define _LINUX_SCHED_EXT_H
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+
+#include <linux/llist.h>
+#include <linux/rhashtable-types.h>
+
+enum scx_public_consts {
+ SCX_OPS_NAME_LEN = 128,
+
+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
+};
+
+/*
+ * DSQ (dispatch queue) IDs are 64bit of the format:
+ *
+ * Bits: [63] [62 .. 0]
+ * [ B] [ ID ]
+ *
+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
+ * ID: 63 bit ID
+ *
+ * Built-in IDs:
+ *
+ * Bits: [63] [62] [61..32] [31 .. 0]
+ * [ 1] [ L] [ R ] [ V ]
+ *
+ * 1: 1 for built-in DSQs.
+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others
+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
+ */
+enum scx_dsq_id_flags {
+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63,
+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62,
+
+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
+};
+
+/*
+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
+ * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
+ * buffer between the scheduler core and the BPF scheduler. See the
+ * documentation for more details.
+ */
+struct scx_dispatch_q {
+ raw_spinlock_t lock;
+ struct list_head list; /* tasks in dispatch order */
+ struct rb_root priq; /* used to order by p->scx.dsq_vtime */
+ u32 nr;
+ u32 seq; /* used by BPF iter */
+ u64 id;
+ struct rhash_head hash_node;
+ struct llist_node free_node;
+ struct rcu_head rcu;
+};
+
+/* scx_entity.flags */
+enum scx_ent_flags {
+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
+
+ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */
+ SCX_TASK_STATE_BITS = 2,
+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
+
+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
+};
+
+/* scx_entity.flags & SCX_TASK_STATE_MASK */
+enum scx_task_state {
+ SCX_TASK_NONE, /* ops.init_task() not called yet */
+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */
+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */
+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */
+
+ SCX_TASK_NR_STATES,
+};
+
+/* scx_entity.dsq_flags */
+enum scx_ent_dsq_flags {
+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */
+};
+
+/*
+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
+ * everywhere and the following bits track which kfunc sets are currently
+ * allowed for %current. This simple per-task tracking works because SCX ops
+ * nest in a limited way. BPF will likely implement a way to allow and disallow
+ * kfuncs depending on the calling context which will replace this manual
+ * mechanism. See scx_kf_allow().
+ */
+enum scx_kf_mask {
+ SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
+ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
+ /* ops.dequeue (in REST) may be nested inside DISPATCH */
+ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
+ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
+ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
+ SCX_KF_REST = 1 << 4, /* other rq-locked operations */
+
+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
+enum scx_dsq_lnode_flags {
+ SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
+
+ /* high 16 bits can be for iter cursor flags */
+ __SCX_DSQ_LNODE_PRIV_SHIFT = 16,
+};
+
+struct scx_dsq_list_node {
+ struct list_head node;
+ u32 flags;
+ u32 priv; /* can be used by iter cursor */
+};
+
+/*
+ * The following is embedded in task_struct and contains all fields necessary
+ * for a task to be scheduled by SCX.
+ */
+struct sched_ext_entity {
+ struct scx_dispatch_q *dsq;
+ struct scx_dsq_list_node dsq_list; /* dispatch order */
+ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */
+ u32 dsq_seq;
+ u32 dsq_flags; /* protected by DSQ lock */
+ u32 flags; /* protected by rq lock */
+ u32 weight;
+ s32 sticky_cpu;
+ s32 holding_cpu;
+ u32 kf_mask; /* see scx_kf_mask above */
+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
+ atomic_long_t ops_state;
+
+ struct list_head runnable_node; /* rq->scx.runnable_list */
+ unsigned long runnable_at;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 core_sched_at; /* see scx_prio_less() */
+#endif
+ u64 ddsp_dsq_id;
+ u64 ddsp_enq_flags;
+
+ /* BPF scheduler modifiable fields */
+
+ /*
+ * Runtime budget in nsecs. This is usually set through
+ * scx_bpf_dispatch() but can also be modified directly by the BPF
+ * scheduler. Automatically decreased by SCX as the task executes. On
+ * depletion, a scheduling event is triggered.
+ *
+ * This value is cleared to zero if the task is preempted by
+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
+ * task ran. Use p->se.sum_exec_runtime instead.
+ */
+ u64 slice;
+
+ /*
+ * Used to order tasks when dispatching to the vtime-ordered priority
+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
+ * but can also be modified directly by the BPF scheduler. Modifying it
+ * while a task is queued on a dsq may mangle the ordering and is not
+ * recommended.
+ */
+ u64 dsq_vtime;
+
+ /*
+ * If set, reject future sched_setscheduler(2) calls updating the policy
+ * to %SCHED_EXT with -%EACCES.
+ *
+ * Can be set from ops.init_task() while the BPF scheduler is being
+ * loaded (!scx_init_task_args->fork). If set and the task's policy is
+ * already %SCHED_EXT, the task's policy is rejected and forcefully
+ * reverted to %SCHED_NORMAL. The number of such events are reported
+ * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
+ * during fork is not allowed.
+ */
+ bool disallow; /* reject switching into SCX */
+
+ /* cold fields */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ struct cgroup *cgrp_moving_from;
+#endif
+ /* must be the last field, see init_scx_entity() */
+ struct list_head tasks_node;
+};
+
+void sched_ext_free(struct task_struct *p);
+void print_scx_info(const char *log_lvl, struct task_struct *p);
+
+#else /* !CONFIG_SCHED_CLASS_EXT */
+
+static inline void sched_ext_free(struct task_struct *p) {}
+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* _LINUX_SCHED_EXT_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 91546493c43d..07bb8d4181d7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -179,27 +179,20 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
extern void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack);
-extern unsigned long
-arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
- unsigned long, unsigned long);
-extern unsigned long
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags);
+unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags);
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t);
unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
-unsigned long
-arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags, vm_flags_t vm_flags);
-unsigned long
-arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags, vm_flags_t);
-
unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
struct file *filp,
unsigned long addr,
@@ -211,11 +204,11 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags);
+ unsigned long flags, vm_flags_t vm_flags);
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags);
+ unsigned long flags, vm_flags_t vm_flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack) {}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index d362aacf9f89..0f2aeb37bbb0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
return t;
}
+static inline struct task_struct *tryget_task_struct(struct task_struct *t)
+{
+ return refcount_inc_not_zero(&t->usage) ? t : NULL;
+}
+
extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd72b978e1f..bf10bdb487dd 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -95,23 +95,11 @@ static inline int object_is_on_stack(const void *obj)
extern void thread_stack_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
+unsigned long stack_not_used(struct task_struct *p);
+#else
static inline unsigned long stack_not_used(struct task_struct *p)
{
- unsigned long *n = end_of_stack(p);
-
- do { /* Skip over canary */
-# ifdef CONFIG_STACK_GROWSUP
- n--;
-# else
- n++;
-# endif
- } while (!*n);
-
-# ifdef CONFIG_STACK_GROWSUP
- return (unsigned long)end_of_stack(p) - (unsigned long)n;
-# else
- return (unsigned long)n - (unsigned long)end_of_stack(p);
-# endif
+ return 0;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..fffeb754880f 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -157,7 +157,7 @@ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \
static __always_inline unsigned \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
{ \
- unsigned seq = READ_ONCE(s->seqcount.sequence); \
+ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
\
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
return seq; \
@@ -170,7 +170,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
* Re-read the sequence counter since the (possibly \
* preempted) writer made progress. \
*/ \
- seq = READ_ONCE(s->seqcount.sequence); \
+ seq = smp_load_acquire(&s->seqcount.sequence); \
} \
\
return seq; \
@@ -208,7 +208,7 @@ static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
- return READ_ONCE(s->sequence);
+ return smp_load_acquire(&s->sequence);
}
static inline bool __seqprop_preemptible(const seqcount_t *s)
@@ -263,17 +263,9 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
#define seqprop_assert(s) __seqprop(s, assert)(s)
/**
- * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * __read_seqcount_begin() - begin a seqcount_t read section
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
- *
* Return: count to be passed to read_seqcount_retry()
*/
#define __read_seqcount_begin(s) \
@@ -293,13 +285,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
*
* Return: count to be passed to read_seqcount_retry()
*/
-#define raw_read_seqcount_begin(s) \
-({ \
- unsigned _seq = __read_seqcount_begin(s); \
- \
- smp_rmb(); \
- _seq; \
-})
+#define raw_read_seqcount_begin(s) __read_seqcount_begin(s)
/**
* read_seqcount_begin() - begin a seqcount_t read critical section
@@ -328,7 +314,6 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
({ \
unsigned __seq = seqprop_sequence(s); \
\
- smp_rmb(); \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
__seq; \
})
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 95ac8398ee72..e7aec20fb44f 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -8,10 +8,10 @@
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
#include <asm/set_memory.h>
#else
-static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
-static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
-static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
-static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_x(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
#ifndef set_memory_rox
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1d06b1e5408a..515a9a6a3c6f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,20 +111,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
- struct mm_struct *mm, unsigned long vm_flags);
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
- bool global_huge);
+ loff_t write_end, bool shmem_huge_force);
#else
-static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
- struct mm_struct *mm, unsigned long vm_flags)
-{
- return false;
-}
static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
- bool global_huge)
+ loff_t write_end, bool shmem_huge_force)
{
return 0;
}
@@ -150,8 +143,8 @@ enum sgp_type {
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
- enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+ struct folio **foliop, enum sgp_type sgp);
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index da3a546571e7..b35e2db7eb0e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -930,6 +930,16 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz
* @new_n: new number of elements to alloc
* @new_size: new size of a single member of the array
* @flags: the type of memory to allocate (see kmalloc)
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * See krealloc_noprof() for further details.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
*/
static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
size_t new_n,
@@ -1038,8 +1048,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
#define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__))
-extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
- __realloc_size(3);
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+ __realloc_size(2);
#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
extern void kvfree(const void *addr);
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 94fc1b57c57b..5e0dd47a0412 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -704,8 +704,6 @@ struct sdw_master_device {
container_of(d, struct sdw_master_device, dev)
struct sdw_driver {
- const char *name;
-
int (*probe)(struct sdw_slave *sdw,
const struct sdw_device_id *id);
int (*remove)(struct sdw_slave *sdw);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index a7d0406b9ef5..c419a61f60e5 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -21,6 +21,7 @@
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/pagevec.h>
+#include <linux/kthread.h>
/*
*
@@ -33,9 +34,9 @@
* node traffic on multi-node NUMA NFS servers.
*/
struct svc_pool {
- unsigned int sp_id; /* pool id; also node id on NUMA */
+ unsigned int sp_id; /* pool id; also node id on NUMA */
struct lwq sp_xprts; /* pending transports */
- atomic_t sp_nrthreads; /* # of threads in pool */
+ unsigned int sp_nrthreads; /* # of threads in pool */
struct list_head sp_all_threads; /* all server threads */
struct llist_head sp_idle_threads; /* idle server threads */
@@ -232,6 +233,11 @@ struct svc_rqst {
struct net *rq_bc_net; /* pointer to backchannel's
* net namespace
*/
+
+ int rq_err; /* Thread sets this to inidicate
+ * initialisation success.
+ */
+
unsigned long bc_to_initval;
unsigned int bc_to_retries;
void ** rq_lease_breaker; /* The v4 client breaking a lease */
@@ -305,6 +311,31 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
return test_bit(RQ_VICTIM, &rqstp->rq_flags);
}
+/**
+ * svc_thread_init_status - report whether thread has initialised successfully
+ * @rqstp: the thread in question
+ * @err: errno code
+ *
+ * After performing any initialisation that could fail, and before starting
+ * normal work, each sunrpc svc_thread must call svc_thread_init_status()
+ * with an appropriate error, or zero.
+ *
+ * If zero is passed, the thread is ready and must continue until
+ * svc_thread_should_stop() returns true. If a non-zero error is passed
+ * the call will not return - the thread will exit.
+ */
+static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err)
+{
+ rqstp->rq_err = err;
+ /* memory barrier ensures assignment to error above is visible before
+ * waitqueue_active() test below completes.
+ */
+ smp_mb();
+ wake_up_var(&rqstp->rq_err);
+ if (err)
+ kthread_exit(1);
+}
+
struct svc_deferred_req {
u32 prot; /* protocol (UDP or TCP) */
struct svc_xprt *xprt;
@@ -401,17 +432,13 @@ struct svc_procedure {
*/
int sunrpc_set_pool_mode(const char *val);
int sunrpc_get_pool_mode(char *val, size_t size);
-int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
int svc_bind(struct svc_serv *serv, struct net *net);
struct svc_serv *svc_create(struct svc_program *, unsigned int,
int (*threadfn)(void *data));
-struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
- struct svc_pool *pool, int node);
bool svc_rqst_replace_page(struct svc_rqst *rqstp,
struct page *page);
void svc_rqst_release_pages(struct svc_rqst *rqstp);
-void svc_rqst_free(struct svc_rqst *);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *prog,
struct svc_stat *stats,
@@ -446,11 +473,6 @@ int svc_generic_rpcbind_set(struct net *net,
u32 version, int family,
unsigned short proto,
unsigned short port);
-int svc_rpcbind_set_version(struct net *net,
- const struct svc_program *progp,
- u32 version, int family,
- unsigned short proto,
- unsigned short port);
#define RPC_MAX_ADDRBUFLEN (63U)
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d33bab33099a..619fc0bd837a 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -48,6 +48,7 @@
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/rpc_rdma_cid.h>
#include <linux/sunrpc/svc_rdma_pcl.h>
+#include <linux/sunrpc/rdma_rn.h>
#include <linux/percpu_counter.h>
#include <rdma/ib_verbs.h>
@@ -76,6 +77,7 @@ struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
struct list_head sc_accept_q; /* Conn. waiting accept */
+ struct rpcrdma_notification sc_rn; /* removal notification */
int sc_ord; /* RDMA read limit */
int sc_max_send_sges;
bool sc_snd_w_inv; /* OK to use Send With Invalidate */
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 61c455f1e1f5..63cf6fb26dcc 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -151,7 +151,6 @@ struct auth_ops {
struct svc_xprt;
-extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp);
extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp);
extern int svc_authorise(struct svc_rqst *rqstp);
extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp);
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 7c78ec6356b9..bf45d9e8492a 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -58,8 +58,6 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
*/
void svc_recv(struct svc_rqst *rqstp);
void svc_send(struct svc_rqst *rqstp);
-void svc_drop(struct svc_rqst *);
-void svc_sock_update_bufs(struct svc_serv *serv);
int svc_addsock(struct svc_serv *serv, struct net *net,
const int fd, char *name_return, const size_t len,
const struct cred *cred);
diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h
new file mode 100644
index 000000000000..66ca3ece951a
--- /dev/null
+++ b/include/linux/sunrpc/xdrgen/_builtins.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This header defines XDR data type primitives specified in
+ * Section 4 of RFC 4506, used by RPC programs implemented
+ * in the Linux kernel.
+ */
+
+#ifndef _SUNRPC_XDRGEN__BUILTINS_H_
+#define _SUNRPC_XDRGEN__BUILTINS_H_
+
+#include <linux/sunrpc/xdr.h>
+
+static inline bool
+xdrgen_decode_void(struct xdr_stream *xdr)
+{
+ return true;
+}
+
+static inline bool
+xdrgen_encode_void(struct xdr_stream *xdr)
+{
+ return true;
+}
+
+static inline bool
+xdrgen_decode_bool(struct xdr_stream *xdr, bool *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = (*p != xdr_zero);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_bool(struct xdr_stream *xdr, bool val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = val ? xdr_one : xdr_zero;
+ return true;
+}
+
+static inline bool
+xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = be32_to_cpup(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_int(struct xdr_stream *xdr, s32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_int(struct xdr_stream *xdr, u32 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = be32_to_cpup(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_int(struct xdr_stream *xdr, u32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_long(struct xdr_stream *xdr, s32 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = be32_to_cpup(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_long(struct xdr_stream *xdr, s32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_long(struct xdr_stream *xdr, u32 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = be32_to_cpup(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_long(struct xdr_stream *xdr, u32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(!p))
+ return false;
+ *p = cpu_to_be32(val);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_hyper(struct xdr_stream *xdr, s64 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = get_unaligned_be64(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_hyper(struct xdr_stream *xdr, s64 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+ if (unlikely(!p))
+ return false;
+ put_unaligned_be64(val, p);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_hyper(struct xdr_stream *xdr, u64 *ptr)
+{
+ __be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+
+ if (unlikely(!p))
+ return false;
+ *ptr = get_unaligned_be64(p);
+ return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_hyper(struct xdr_stream *xdr, u64 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+ if (unlikely(!p))
+ return false;
+ put_unaligned_be64(val, p);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen)
+{
+ __be32 *p;
+ u32 len;
+
+ if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
+ return false;
+ if (unlikely(maxlen && len > maxlen))
+ return false;
+ if (len != 0) {
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return false;
+ ptr->data = (unsigned char *)p;
+ }
+ ptr->len = len;
+ return true;
+}
+
+static inline bool
+xdrgen_encode_string(struct xdr_stream *xdr, string val, u32 maxlen)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len));
+
+ if (unlikely(!p))
+ return false;
+ xdr_encode_opaque(p, val.data, val.len);
+ return true;
+}
+
+static inline bool
+xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen)
+{
+ __be32 *p;
+ u32 len;
+
+ if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
+ return false;
+ if (unlikely(maxlen && len > maxlen))
+ return false;
+ if (len != 0) {
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return false;
+ ptr->data = (u8 *)p;
+ }
+ ptr->len = len;
+ return true;
+}
+
+static inline bool
+xdrgen_encode_opaque(struct xdr_stream *xdr, opaque val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len));
+
+ if (unlikely(!p))
+ return false;
+ xdr_encode_opaque(p, val.data, val.len);
+ return true;
+}
+
+#endif /* _SUNRPC_XDRGEN__BUILTINS_H_ */
diff --git a/include/linux/sunrpc/xdrgen/_defs.h b/include/linux/sunrpc/xdrgen/_defs.h
new file mode 100644
index 000000000000..be9e62371758
--- /dev/null
+++ b/include/linux/sunrpc/xdrgen/_defs.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This header defines XDR data type primitives specified in
+ * Section 4 of RFC 4506, used by RPC programs implemented
+ * in the Linux kernel.
+ */
+
+#ifndef _SUNRPC_XDRGEN__DEFS_H_
+#define _SUNRPC_XDRGEN__DEFS_H_
+
+#define TRUE (true)
+#define FALSE (false)
+
+typedef struct {
+ u32 len;
+ unsigned char *data;
+} string;
+
+typedef struct {
+ u32 len;
+ u8 *data;
+} opaque;
+
+#endif /* _SUNRPC_XDRGEN__DEFS_H_ */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba7ea95d1c57..ca533b478c21 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -243,22 +243,24 @@ enum {
* free clusters are organized into a list. We fetch an entry from the list to
* get a free cluster.
*
- * The data field stores next cluster if the cluster is free or cluster usage
- * counter otherwise. The flags field determines if a cluster is free. This is
- * protected by swap_info_struct.lock.
+ * The flags field determines if a cluster is free. This is
+ * protected by cluster lock.
*/
struct swap_cluster_info {
spinlock_t lock; /*
* Protect swap_cluster_info fields
- * and swap_info_struct->swap_map
- * elements correspond to the swap
- * cluster
+ * other than list, and swap_info_struct->swap_map
+ * elements corresponding to the swap cluster.
*/
- unsigned int data:24;
- unsigned int flags:8;
+ u16 count;
+ u8 flags;
+ u8 order;
+ struct list_head list;
};
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
+#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */
+#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */
/*
* The first page in the swap file is the swap header, which is always marked
@@ -283,11 +285,6 @@ struct percpu_cluster {
unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
-struct swap_cluster_list {
- struct swap_cluster_info head;
- struct swap_cluster_info tail;
-};
-
/*
* The in-memory structure used to track swap areas.
*/
@@ -299,8 +296,15 @@ struct swap_info_struct {
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
- struct swap_cluster_list free_clusters; /* free clusters list */
+ struct list_head free_clusters; /* free clusters list */
+ struct list_head full_clusters; /* full clusters list */
+ struct list_head nonfull_clusters[SWAP_NR_ORDERS];
+ /* list of cluster that contains at least one free slot */
+ struct list_head frag_clusters[SWAP_NR_ORDERS];
+ /* list of cluster that are fragmented or contented */
+ unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int lowest_bit; /* index of first free in swap_map */
unsigned int highest_bit; /* index of last free in swap_map */
unsigned int pages; /* total of usable pages of swap */
@@ -331,7 +335,7 @@ struct swap_info_struct {
* list.
*/
struct work_struct discard_work; /* discard worker */
- struct swap_cluster_list discard_clusters; /* discard clusters list */
+ struct list_head discard_clusters; /* discard clusters list */
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
* entry per node.
@@ -478,9 +482,9 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
-extern void swap_shmem_alloc(swp_entry_t);
+extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
@@ -545,7 +549,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
return 0;
}
-static inline void swap_shmem_alloc(swp_entry_t swp)
+static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
{
}
@@ -554,7 +558,7 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
-static inline int swapcache_prepare(swp_entry_t swp)
+static inline int swapcache_prepare(swp_entry_t swp, int nr)
{
return 0;
}
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index d8e4105a2f21..39c7cf82b0c2 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -33,6 +33,13 @@
})
#endif
+#ifdef masked_user_access_begin
+ #define can_do_masked_user_access() 1
+#else
+ #define can_do_masked_user_access() 0
+ #define masked_user_access_begin(src) NULL
+#endif
+
/*
* Architectures should provide two primitives (raw_copy_{to,from}_user())
* and get rid of their private instances of copy_{to,from}_user() and
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a12bcf042551..9fc6ce15c499 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
+void userfaultfd_reset_ctx(struct vm_area_struct *vma);
+
+struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end);
+
+int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ unsigned long start, unsigned long end,
+ bool wp_async);
+
+void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
+
+void userfaultfd_release_all(struct mm_struct *mm,
+ struct userfaultfd_ctx *ctx);
+
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..aed952d04132 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -50,6 +50,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSTEAL_ANON,
PGSTEAL_FILE,
#ifdef CONFIG_NUMA
+ PGSCAN_ZONE_RECLAIM_SUCCESS,
PGSCAN_ZONE_RECLAIM_FAILED,
#endif
PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
@@ -104,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
THP_DEFERRED_SPLIT_PAGE,
+ THP_UNDERUSED_SPLIT_PAGE,
THP_SPLIT_PMD,
THP_SCAN_EXCEED_NONE_PTE,
THP_SCAN_EXCEED_SWAP_PTE,
@@ -154,6 +156,30 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
VMA_LOCK_RETRY,
VMA_LOCK_MISS,
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ KSTACK_1K,
+#if THREAD_SIZE > 1024
+ KSTACK_2K,
+#endif
+#if THREAD_SIZE > 2048
+ KSTACK_4K,
+#endif
+#if THREAD_SIZE > 4096
+ KSTACK_8K,
+#endif
+#if THREAD_SIZE > 8192
+ KSTACK_16K,
+#endif
+#if THREAD_SIZE > 16384
+ KSTACK_32K,
+#endif
+#if THREAD_SIZE > 32768
+ KSTACK_64K,
+#endif
+#if THREAD_SIZE > 65536
+ KSTACK_REST,
+#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index e4a631ec430b..ad2ce7a6ab7a 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -189,6 +189,10 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1
extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__))
+void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+ __realloc_size(2);
+#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
+
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 9eb77c9007e6..d2761bf8ff32 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -32,6 +32,7 @@ struct reclaim_stat {
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
unsigned nr_lazyfree_fail;
+ unsigned nr_demoted;
};
/* Stat data for system wide items */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8f651bb0a1a5..d6db822e4bb3 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,6 +79,9 @@ struct writeback_control {
*/
struct swap_iocb **swap_plug;
+ /* Target list for splitting a large folio */
+ struct list_head *list;
+
/* internal fields used by the ->writepages implementation: */
struct folio_batch fbatch;
pgoff_t index;
diff --git a/include/linux/xz.h b/include/linux/xz.h
index 7285ca5d56e9..58ae1d746c6f 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -1,11 +1,10 @@
+/* SPDX-License-Identifier: 0BSD */
+
/*
* XZ decompressor
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <https://7-zip.org/>
- *
- * This file has been put into the public domain.
- * You can do whatever you want with this file.
*/
#ifndef XZ_H
@@ -19,11 +18,6 @@
# include <stdint.h>
#endif
-/* In Linux, this is used to make extern functions static when needed. */
-#ifndef XZ_EXTERN
-# define XZ_EXTERN extern
-#endif
-
/**
* enum xz_mode - Operation mode
*
@@ -143,7 +137,7 @@ struct xz_buf {
size_t out_size;
};
-/**
+/*
* struct xz_dec - Opaque type to hold the XZ decoder state
*/
struct xz_dec;
@@ -191,7 +185,7 @@ struct xz_dec;
* ready to be used with xz_dec_run(). If memory allocation fails,
* xz_dec_init() returns NULL.
*/
-XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
+struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
/**
* xz_dec_run() - Run the XZ decoder
@@ -211,7 +205,7 @@ XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
* get that amount valid data from the beginning of the stream. You must use
* the multi-call decoder if you don't want to uncompress the whole stream.
*/
-XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
+enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
/**
* xz_dec_reset() - Reset an already allocated decoder state
@@ -224,32 +218,38 @@ XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
* xz_dec_run(). Thus, explicit call to xz_dec_reset() is useful only in
* multi-call mode.
*/
-XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
+void xz_dec_reset(struct xz_dec *s);
/**
* xz_dec_end() - Free the memory allocated for the decoder state
* @s: Decoder state allocated using xz_dec_init(). If s is NULL,
* this function does nothing.
*/
-XZ_EXTERN void xz_dec_end(struct xz_dec *s);
+void xz_dec_end(struct xz_dec *s);
-/*
- * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
- * See xz_dec_microlzma_alloc() below for details.
+/**
+ * DOC: MicroLZMA decompressor
+ *
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too. **In most cases one needs the XZ APIs above instead.**
*
- * These functions aren't used or available in preboot code and thus aren't
- * marked with XZ_EXTERN. This avoids warnings about static functions that
- * are never defined.
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with bitwise-negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
*/
-/**
+
+/*
* struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
*/
struct xz_dec_microlzma;
/**
* xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
- * @mode XZ_SINGLE or XZ_PREALLOC
- * @dict_size LZMA dictionary size. This must be at least 4 KiB and
+ * @mode: XZ_SINGLE or XZ_PREALLOC
+ * @dict_size: LZMA dictionary size. This must be at least 4 KiB and
* at most 3 GiB.
*
* In contrast to xz_dec_init(), this function only allocates the memory
@@ -262,40 +262,30 @@ struct xz_dec_microlzma;
* On success, xz_dec_microlzma_alloc() returns a pointer to
* struct xz_dec_microlzma. If memory allocation fails or
* dict_size is invalid, NULL is returned.
- *
- * The compressed format supported by this decoder is a raw LZMA stream
- * whose first byte (always 0x00) has been replaced with bitwise-negation
- * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
- * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
- * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
- * marker must not be used. The unused values are reserved for future use.
- * This MicroLZMA header format was created for use in EROFS but may be used
- * by others too.
*/
-extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
- uint32_t dict_size);
+struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+ uint32_t dict_size);
/**
* xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
- * @s Decoder state allocated using xz_dec_microlzma_alloc()
- * @comp_size Compressed size of the input stream
- * @uncomp_size Uncompressed size of the input stream. A value smaller
+ * @s: Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size: Compressed size of the input stream
+ * @uncomp_size: Uncompressed size of the input stream. A value smaller
* than the real uncompressed size of the input stream can
* be specified if uncomp_size_is_exact is set to false.
* uncomp_size can never be set to a value larger than the
* expected real uncompressed size because it would eventually
* result in XZ_DATA_ERROR.
- * @uncomp_size_is_exact This is an int instead of bool to avoid
+ * @uncomp_size_is_exact: This is an int instead of bool to avoid
* requiring stdbool.h. This should normally be set to true.
* When this is set to false, error detection is weaker.
*/
-extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
- uint32_t comp_size, uint32_t uncomp_size,
- int uncomp_size_is_exact);
+void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
+ uint32_t uncomp_size, int uncomp_size_is_exact);
/**
* xz_dec_microlzma_run() - Run the MicroLZMA decoder
- * @s Decoder state initialized using xz_dec_microlzma_reset()
+ * @s: Decoder state initialized using xz_dec_microlzma_reset()
* @b: Input and output buffers
*
* This works similarly to xz_dec_run() with a few important differences.
@@ -329,15 +319,14 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
* may be changed normally like with XZ_PREALLOC. This way input data can be
* provided from non-contiguous memory.
*/
-extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
- struct xz_buf *b);
+enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, struct xz_buf *b);
/**
* xz_dec_microlzma_end() - Free the memory allocated for the decoder state
* @s: Decoder state allocated using xz_dec_microlzma_alloc().
* If s is NULL, this function does nothing.
*/
-extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
/*
* Standalone build (userspace build or in-kernel build for boot time use)
@@ -358,13 +347,13 @@ extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
* This must be called before any other xz_* function to initialize
* the CRC32 lookup table.
*/
-XZ_EXTERN void xz_crc32_init(void);
+void xz_crc32_init(void);
/*
* Update CRC32 value using the polynomial from IEEE-802.3. To start a new
* calculation, the third argument must be zero. To continue the calculation,
* the previously returned value is passed as the third argument.
*/
-XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc);
+uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc);
#endif
#endif
diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..b2c7cf310c8f 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -77,6 +77,30 @@ int zstd_min_clevel(void);
*/
int zstd_max_clevel(void);
+/**
+ * zstd_default_clevel() - default compression level
+ *
+ * Return: Default compression level.
+ */
+int zstd_default_clevel(void);
+
+/**
+ * struct zstd_custom_mem - custom memory allocation
+ */
+typedef ZSTD_customMem zstd_custom_mem;
+
+/**
+ * struct zstd_dict_load_method - Dictionary load method.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_dictLoadMethod_e zstd_dict_load_method;
+
+/**
+ * struct zstd_dict_content_type - Dictionary context type.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_dictContentType_e zstd_dict_content_type;
+
/* ====== Parameter Selection ====== */
/**
@@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters;
zstd_parameters zstd_get_params(int level,
unsigned long long estimated_src_size);
+
+/**
+ * zstd_get_cparams() - returns zstd_compression_parameters for selected level
+ * @level: The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ * if unknown.
+ * @dict_size: Dictionary size.
+ *
+ * Return: The selected zstd_compression_parameters.
+ */
+zstd_compression_parameters zstd_get_cparams(int level,
+ unsigned long long estimated_src_size, size_t dict_size);
+
/* ====== Single-pass Compression ====== */
typedef ZSTD_CCtx zstd_cctx;
@@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size, const zstd_parameters *parameters);
+/**
+ * zstd_create_cctx_advanced() - Create compression context
+ * @custom_mem: Custom allocator.
+ *
+ * Return: NULL on error, pointer to compression context otherwise.
+ */
+zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_cctx() - Free compression context
+ * @cdict: Pointer to compression context.
+ *
+ * Return: Always 0.
+ */
+size_t zstd_free_cctx(zstd_cctx* cctx);
+
+/**
+ * struct zstd_cdict - Compression dictionary.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_CDict zstd_cdict;
+
+/**
+ * zstd_create_cdict_byreference() - Create compression dictionary
+ * @dict: Pointer to dictionary buffer.
+ * @dict_size: Size of the dictionary buffer.
+ * @dict_load_method: Dictionary load method.
+ * @dict_content_type: Dictionary content type.
+ * @custom_mem: Memory allocator.
+ *
+ * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be
+ * free before zstd_cdict is destroyed.
+ *
+ * Return: NULL on error, pointer to compression dictionary
+ * otherwise.
+ */
+zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size,
+ zstd_compression_parameters cparams,
+ zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_cdict() - Free compression dictionary
+ * @cdict: Pointer to compression dictionary.
+ *
+ * Return: Always 0.
+ */
+size_t zstd_free_cdict(zstd_cdict* cdict);
+
+/**
+ * zstd_compress_using_cdict() - compress src into dst using a dictionary
+ * @cctx: The context. Must have been initialized with zstd_init_cctx().
+ * @dst: The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @src_size: The size of the data to compress.
+ * @cdict: The dictionary to be used.
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst,
+ size_t dst_capacity, const void *src, size_t src_size,
+ const zstd_cdict *cdict);
+
/* ====== Single-pass Decompression ====== */
typedef ZSTD_DCtx zstd_dctx;
@@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size);
+/**
+ * struct zstd_ddict - Decompression dictionary.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_DDict zstd_ddict;
+
+/**
+ * zstd_create_ddict_byreference() - Create decompression dictionary
+ * @dict: Pointer to dictionary buffer.
+ * @dict_size: Size of the dictionary buffer.
+ * @dict_load_method: Dictionary load method.
+ * @dict_content_type: Dictionary content type.
+ * @custom_mem: Memory allocator.
+ *
+ * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be
+ * free before zstd_ddict is destroyed.
+ *
+ * Return: NULL on error, pointer to decompression dictionary
+ * otherwise.
+ */
+zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size,
+ zstd_custom_mem custom_mem);
+/**
+ * zstd_free_ddict() - Free decompression dictionary
+ * @dict: Pointer to the dictionary.
+ *
+ * Return: Always 0.
+ */
+size_t zstd_free_ddict(zstd_ddict *ddict);
+
+/**
+ * zstd_create_dctx_advanced() - Create decompression context
+ * @custom_mem: Custom allocator.
+ *
+ * Return: NULL on error, pointer to decompression context otherwise.
+ */
+zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_dctx() -- Free decompression context
+ * @dctx: Pointer to decompression context.
+ * Return: Always 0.
+ */
+size_t zstd_free_dctx(zstd_dctx *dctx);
+
+/**
+ * zstd_decompress_using_ddict() - decompress src into dst using a dictionary
+ * @dctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @src_size: The exact size of the data to decompress.
+ * @ddict: The dictionary to be used.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_decompress_using_ddict(zstd_dctx *dctx,
+ void *dst, size_t dst_capacity, const void *src, size_t src_size,
+ const zstd_ddict *ddict);
+
+
/* ====== Streaming Buffers ====== */
/**
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 6cecb4a4f68b..9cd1beef0654 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages;
struct zswap_lruvec_state {
/*
- * Number of pages in zswap that should be protected from the shrinker.
- * This number is an estimate of the following counts:
+ * Number of swapped in pages from disk, i.e not found in the zswap pool.
*
- * a) Recent page faults.
- * b) Recent insertion to the zswap LRU. This includes new zswap stores,
- * as well as recent zswap LRU rotations.
- *
- * These pages are likely to be warm, and might incur IO if the are written
- * to swap.
+ * This is consumed and subtracted from the lru size in
+ * zswap_shrinker_count() to penalize past overshrinking that led to disk
+ * swapins. The idea is that had we considered this many more pages in the
+ * LRU active/protected and not written them back, we would not have had to
+ * swapped them in.
*/
- atomic_long_t nr_zswap_protected;
+ atomic_long_t nr_disk_swapins;
};
unsigned long zswap_total_pages(void);