Diffstat (limited to 'include/linux')
232 files changed, 5129 insertions, 2797 deletions
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4db54e928b36..118a18b7ff84 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -424,6 +424,13 @@ extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); +#ifdef CONFIG_ACPI_THERMAL_LIB +int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); +int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); +#endif + #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); @@ -756,6 +763,10 @@ const char *acpi_get_subsystem_id(acpi_handle handle); #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) +/* Get rid of the -Wunused-variable for adev */ +#define acpi_dev_uid_match(adev, uid2) (adev && false) +#define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) + #include <acpi/acpi_numa.h> struct fwnode_handle; @@ -772,17 +783,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) struct acpi_device; -static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2) -{ - return false; -} - -static inline bool -acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) -{ - return false; -} - static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; diff --git a/include/linux/acpi_amd_wbrf.h b/include/linux/acpi_amd_wbrf.h new file mode 100644 index 000000000000..898f31d536d4 --- /dev/null +++ b/include/linux/acpi_amd_wbrf.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Wifi Band Exclusion Interface (AMD ACPI Implementation) + * Copyright (C) 2023 Advanced Micro Devices + */ + +#ifndef _ACPI_AMD_WBRF_H +#define _ACPI_AMD_WBRF_H + +#include <linux/device.h> +#include <linux/notifier.h> + +/* The maximum number of frequency band ranges */ +#define MAX_NUM_OF_WBRF_RANGES 11 + +/* Record actions */ +#define WBRF_RECORD_ADD 0x0 +#define WBRF_RECORD_REMOVE 0x1 + +/** + * struct freq_band_range - Wifi frequency band range definition + * @start: start frequency point (in Hz) + * @end: end frequency point (in Hz) + */ +struct freq_band_range { + u64 start; + u64 end; +}; + +/** + * struct wbrf_ranges_in_out - wbrf ranges info + * @num_of_ranges: total number of band ranges in this struct + * @band_list: array of Wifi band ranges + */ +struct wbrf_ranges_in_out { + u64 num_of_ranges; + struct freq_band_range band_list[MAX_NUM_OF_WBRF_RANGES]; +}; + +/** + * enum wbrf_notifier_actions - wbrf notifier actions index + * @WBRF_CHANGED: there were some frequency band updates. The consumers + * should retrieve the latest active frequency bands.
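The consumer flow this comment describes is easy to sketch: register for WBRF_CHANGED and re-read the active bands on each notification. The following is a minimal sketch only; the device pointer and the probe-time wiring are assumptions, not part of this patch.

#include <linux/acpi_amd_wbrf.h>

/* Hedged sketch of a consumer; "my_dev" is an assumed driver-owned device. */
static struct device *my_dev;

static int my_wbrf_notify(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	struct wbrf_ranges_in_out out = {};

	if (action != WBRF_CHANGED)
		return NOTIFY_DONE;
	if (amd_wbrf_retrieve_freq_band(my_dev, &out))
		return NOTIFY_DONE;
	/* Steer usage away from out.band_list[0 .. out.num_of_ranges - 1]. */
	return NOTIFY_OK;
}

static struct notifier_block my_wbrf_nb = { .notifier_call = my_wbrf_notify };

/* At probe, guarded by acpi_amd_wbrf_supported_consumer(my_dev):
 *	amd_wbrf_register_notifier(&my_wbrf_nb);
 */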
+ */ +enum wbrf_notifier_actions { + WBRF_CHANGED, +}; + +#if IS_ENABLED(CONFIG_AMD_WBRF) +bool acpi_amd_wbrf_supported_producer(struct device *dev); +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in); +bool acpi_amd_wbrf_supported_consumer(struct device *dev); +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out); +int amd_wbrf_register_notifier(struct notifier_block *nb); +int amd_wbrf_unregister_notifier(struct notifier_block *nb); +#else +static inline +bool acpi_amd_wbrf_supported_consumer(struct device *dev) +{ + return false; +} + +static inline +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in) +{ + return -ENODEV; +} + +static inline +bool acpi_amd_wbrf_supported_producer(struct device *dev) +{ + return false; +} +static inline +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out) +{ + return -ENODEV; +} +static inline +int amd_wbrf_register_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +static inline +int amd_wbrf_unregister_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_WBRF */ + +#endif /* _ACPI_AMD_WBRF_H */ diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h deleted file mode 100644 index 720fbb70294a..000000000000 --- a/include/linux/apple-mailbox.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ -/* - * Apple mailbox message format - * - * Copyright (C) 2021 The Asahi Linux Contributors - */ - -#ifndef _LINUX_APPLE_MAILBOX_H_ -#define _LINUX_APPLE_MAILBOX_H_ - -#include <linux/types.h> - -/* encodes a single 96bit message sent over the single channel */ -struct apple_mbox_msg { - u64 msg0; - u32 msg1; -}; - -#endif diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a07b510e7dc5..a63d61ca55af 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) @@ -92,6 +99,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/async.h b/include/linux/async.h index cce4ad31e8fc..33c9ff4afb49 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev) return async_schedule_node(func, dev, dev_to_node(dev)); } +bool async_schedule_dev_nocall(async_func_t func, struct device *dev); + /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously diff --git a/include/linux/audit.h b/include/linux/audit.h index 51b1b7054a23..0050ef288ab3 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -36,6 +36,7 @@ struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; +struct kern_ipc_perm; struct audit_krule { u32 pflags; diff --git a/include/linux/avf/virtchnl.h 
b/include/linux/avf/virtchnl.h index 6b3acf15be5c..8e177b67e82f 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -5,6 +5,7 @@ #define _VIRTCHNL_H_ #include <linux/bitops.h> +#include <linux/bits.h> #include <linux/overflow.h> #include <uapi/linux/if_ether.h> @@ -118,6 +119,7 @@ enum virtchnl_ops { VIRTCHNL_OP_GET_STATS = 15, VIRTCHNL_OP_RSVD = 16, VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */ + VIRTCHNL_OP_CONFIG_RSS_HFUNC = 18, /* opcode 19 is reserved */ VIRTCHNL_OP_IWARP = 20, /* advanced opcode */ VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP, @@ -911,6 +913,29 @@ struct virtchnl_rss_hena { VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + +/* VIRTCHNL_OP_CONFIG_RSS_HFUNC + * VF sends this message to configure the RSS hash function. Only supported + * if both PF and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during + * configuration negotiation. + * The hash function is initialized to VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC + * by the PF. + */ +struct virtchnl_rss_hfunc { + u16 vsi_id; + u16 rss_algorithm; /* enum virtchnl_rss_algorithm */ + u32 reserved; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hfunc); + /* VIRTCHNL_OP_ENABLE_CHANNELS * VIRTCHNL_OP_DISABLE_CHANNELS * VF sends these messages to enable or disable channels based on @@ -1095,14 +1120,6 @@ enum virtchnl_vfr_states { VIRTCHNL_VFR_VFACTIVE, }; -/* Type of RSS algorithm */ -enum virtchnl_rss_algorithm { - VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, - VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, - VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, - VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, -}; - #define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 #define PROTO_HDR_SHIFT 5 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) @@ -1542,6 +1559,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, vrl->lut_entries); } break; + case VIRTCHNL_OP_CONFIG_RSS_HFUNC: + valid_len = sizeof(struct virtchnl_rss_hfunc); + break; case VIRTCHNL_OP_GET_RSS_HENA_CAPS: break; case VIRTCHNL_OP_SET_RSS_HENA: diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h new file mode 100644 index 000000000000..3f1fe1774f1b --- /dev/null +++ b/include/linux/backing-file.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. 
+ */ + +#ifndef _LINUX_BACKING_FILE_H +#define _LINUX_BACKING_FILE_H + +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/fs.h> + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *); +}; + +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); + +#endif /* _LINUX_BACKING_FILE_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 41d417ee1349..ec4db73e5f4e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ enum bip_flags { BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ + BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ + BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ }; /* @@ -718,6 +720,7 @@ static inline bool bioset_initialized(struct bio_set *bs) for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } +static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, + ssize_t len, u32 seed) +{ + return -EINVAL; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1ab3081c82ed..a676e116085f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -830,6 +830,12 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib); */ static inline bool blk_mq_need_time_stamp(struct request *rq) { + /* + * passthrough io doesn't use iostat accounting, cgroup stats + * and io scheduler functionalities. 
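Stepping back to the backing-file helpers declared just above: here is a loose sketch of how a stacking filesystem might delegate its ->read_iter to the backing file. The lookup helper, credential accessor, and accessed callback are invented names; only the backing_file_* calls come from the new header.

#include <linux/backing-file.h>

/* Hedged sketch; these three fs-private helpers are invented names. */
struct file *myfs_real_file(struct file *file);
const struct cred *myfs_creds(struct file *file);
void myfs_file_accessed(struct file *file);

static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *realfile = myfs_real_file(iocb->ki_filp);
	struct backing_file_ctx ctx = {
		.cred		= myfs_creds(iocb->ki_filp),
		.user_file	= iocb->ki_filp,
		.accessed	= myfs_file_accessed,
	};

	return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags,
				      &ctx);
}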
+ */ + if (blk_rq_is_passthrough(rq)) + return false; return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b29ebd53417d..f288c94374b3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -57,20 +57,18 @@ struct block_device { void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; - /* The counter of freeze processes */ - int bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir; - /* Mutex for freeze */ - struct mutex bd_fsfreeze_mutex; - struct super_block *bd_fsfreeze_sb; + atomic_t bd_fsfreeze_count; /* number of freeze requests */ + struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; #ifdef CONFIG_FAIL_MAKE_REQUEST bool bd_make_it_fail; #endif bool bd_ro_warned; + int bd_writers; /* * keep this out-of-line as it's both big and not needed in the fast * path @@ -380,6 +378,8 @@ enum req_op { REQ_OP_DISCARD = (__force blk_opf_t)3, /* securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, + /* write data at the current zone write pointer */ + REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /* Open a zone */ @@ -388,12 +388,10 @@ enum req_op { REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)12, - /* write data at the current zone write pointer */ - REQ_OP_ZONE_APPEND = (__force blk_opf_t)13, /* reset a zone write pointer */ - REQ_OP_ZONE_RESET = (__force blk_opf_t)15, + REQ_OP_ZONE_RESET = (__force blk_opf_t)13, /* reset all the zones present on the device */ - REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17, + REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15, /* Driver private requests */ REQ_OP_DRV_IN = (__force blk_opf_t)34, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88e9dd4b71fb..99e4f5e72213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (special hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ +#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) struct gendisk { /* @@ -264,18 +266,6 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) } /* - * Zoned block device models (zoned limit). - * - * Note: This needs to be ordered from the least to the most severe - * restrictions for the inheritance in blk_stack_limits() to work.
- */ -enum blk_zoned_model { - BLK_ZONED_NONE = 0, /* Regular block device */ - BLK_ZONED_HA, /* Host-aware zoned block device */ - BLK_ZONED_HM, /* Host-managed zoned block device */ -}; - -/* * BLK_BOUNCE_NONE: never bounce (default) * BLK_BOUNCE_HIGH: bounce all highmem pages */ @@ -316,7 +306,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; - enum blk_zoned_model zoned; + bool zoned; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -329,24 +319,15 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model); +void disk_set_zoned(struct gendisk *disk); -#ifdef CONFIG_BLK_DEV_ZONED #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); -unsigned int bdev_nr_zones(struct block_device *bdev); -extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, - gfp_t gfp_mask); + unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, + sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} -#endif /* CONFIG_BLK_DEV_ZONED */ + void (*update_driver_data)(struct gendisk *disk)); /* * Independent access ranges: struct blk_independent_access_range describes @@ -376,59 +357,51 @@ struct blk_independent_access_ranges { }; struct request_queue { - struct request *last_merge; - struct elevator_queue *elevator; - - struct percpu_ref q_usage_counter; + /* + * The queue owner gets to use this for whatever they like. + * ll_rw_blk doesn't touch it. + */ + void *queuedata; - struct blk_queue_stats *stats; - struct rq_qos *rq_qos; - struct mutex rq_qos_mutex; + struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; + /* + * various queue flags, see QUEUE_* below + */ + unsigned long queue_flags; + + unsigned int rq_timeout; + unsigned int queue_depth; + refcount_t refs; + /* hw dispatch queues */ - struct xarray hctx_table; unsigned int nr_hw_queues; + struct xarray hctx_table; - /* - * The queue owner gets to use this for whatever they like. - * ll_rw_blk doesn't touch it. - */ - void *queuedata; - - /* - * various queue flags, see QUEUE_* below - */ - unsigned long queue_flags; - /* - * Number of contexts that have called blk_set_pm_only(). If this - * counter is above zero then only RQF_PM requests are processed. - */ - atomic_t pm_only; + struct percpu_ref q_usage_counter; - /* - * ida allocated id for this queue. Used to index queues from - * ioctx. - */ - int id; + struct request *last_merge; spinlock_t queue_lock; - struct gendisk *disk; + int quiesce_depth; - refcount_t refs; + struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; + struct queue_limits limits; + #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity integrity; #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -439,24 +412,40 @@ struct request_queue { #endif /* - * queue settings + * Number of contexts that have called blk_set_pm_only(). 
If this + * counter is above zero then only RQF_PM requests are processed. */ - unsigned long nr_requests; /* Max # of requests */ + atomic_t pm_only; + + struct blk_queue_stats *stats; + struct rq_qos *rq_qos; + struct mutex rq_qos_mutex; + + /* + * ida allocated id for this queue. Used to index queues from + * ioctx. + */ + int id; unsigned int dma_pad_mask; + /* + * queue settings + */ + unsigned long nr_requests; /* Max # of requests */ + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif - unsigned int rq_timeout; - struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_tags; + unsigned int required_elevator_features; + struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -467,11 +456,12 @@ struct request_queue { struct mutex blkcg_mutex; #endif - struct queue_limits limits; + int node; - unsigned int required_elevator_features; + spinlock_t requeue_lock; + struct list_head requeue_list; + struct delayed_work requeue_work; - int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -481,10 +471,6 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; - struct list_head requeue_list; - spinlock_t requeue_lock; - struct delayed_work requeue_work; - struct mutex sysfs_lock; struct mutex sysfs_dir_lock; @@ -509,8 +495,6 @@ struct request_queue { */ struct mutex mq_freeze_lock; - int quiesce_depth; - struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; @@ -623,26 +607,14 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) } #endif -static inline enum blk_zoned_model -blk_queue_zoned_model(struct request_queue *q) -{ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) - return q->limits.zoned; - return BLK_ZONED_NONE; -} - static inline bool blk_queue_is_zoned(struct request_queue *q) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - case BLK_ZONED_HM: - return true; - default: - return false; - } + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned; } #ifdef CONFIG_BLK_DEV_ZONED +unsigned int bdev_nr_zones(struct block_device *bdev); + static inline unsigned int disk_nr_zones(struct gendisk *disk) { return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0; @@ -687,6 +659,11 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) } #else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return 0; +} + static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; @@ -1080,7 +1057,14 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -#define BLK_DEF_MAX_SECTORS 2560u +/* + * Default upper limit for the software max_sectors limit used for + * regular file system I/O. This can be increased through sysfs. + * + * Not to be confused with the max_hw_sector limit that is entirely + * controlled by the driver, usually based on hardware limits. 
+ */ +#define BLK_DEF_MAX_SECTORS_CAP 2560u static inline unsigned long queue_segment_boundary(const struct request_queue *q) { @@ -1259,11 +1243,6 @@ static inline bool bdev_nowait(struct block_device *bdev) return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); } -static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) -{ - return blk_queue_zoned_model(bdev_get_queue(bdev)); -} - static inline bool bdev_is_zoned(struct block_device *bdev) { return blk_queue_is_zoned(bdev_get_queue(bdev)); @@ -1468,8 +1447,23 @@ struct blk_holder_ops { * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); + + /* + * Freeze the file system mounted on the block device. + */ + int (*freeze)(struct block_device *bdev); + + /* + * Thaw the file system mounted on the block device. + */ + int (*thaw)(struct block_device *bdev); }; +/* + * For filesystems using @fs_holder_ops, the @holder argument passed to + * helpers used to open and claim block devices via + * bd_prepare_to_claim() must point to a superblock. + */ extern const struct blk_holder_ops fs_holder_ops; /* @@ -1477,7 +1471,8 @@ extern const struct blk_holder_ops fs_holder_ops; * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ - (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) + (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ + (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) struct bdev_handle { struct block_device *bdev; @@ -1485,10 +1480,6 @@ blk_mode_t mode; }; -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, @@ -1496,7 +1487,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void blkdev_put(struct block_device *bdev, void *holder); void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ @@ -1541,8 +1531,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) { } #endif /* CONFIG_BLOCK */ -int freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev); +int bdev_freeze(struct block_device *bdev); +int bdev_thaw(struct block_device *bdev); struct io_comp_batch { struct request *req_list; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cff5bb08820e..e30100597d0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> +#include <linux/cfi.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -106,7 +107,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exits.
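One plausible reading of this need_defer contract is an RCU-deferred free, sketched below. The element type and free helper are invented, and real map implementations may defer differently.

/* Hedged sketch only; struct my_elem and my_elem_free() are invented. */
struct my_elem {
	struct rcu_head rcu;
	/* payload */
};

static void my_elem_free(struct my_elem *elem);	/* invented */

static void my_elem_free_rcu(struct rcu_head *rcu)
{
	my_elem_free(container_of(rcu, struct my_elem, rcu));
}

static void my_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
	struct my_elem *elem = ptr;

	if (need_defer)
		/* A program may still touch it; free after a grace period. */
		call_rcu(&elem->rcu, my_elem_free_rcu);
	else
		my_elem_free(elem);
}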
+ */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, @@ -186,8 +191,8 @@ enum btf_field_type { BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | - BPF_RB_NODE | BPF_RB_ROOT, + BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, + BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), }; @@ -272,7 +277,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -288,6 +297,9 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; @@ -1044,6 +1056,17 @@ struct btf_func_model { */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) +/* + * Indicate the trampoline should be suitable to receive indirect calls; + * without this indirectly calling the generated code can result in #UD/#CP, + * depending on the CFI options. + * + * Used by bpf_struct_ops. + * + * Incompatible with FENTRY usage, overloads @func_addr argument. + */ +#define BPF_TRAMP_F_INDIRECT BIT(8) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ @@ -1083,10 +1106,17 @@ struct bpf_tramp_run_ctx; * fexit = a set of program to run after original function */ struct bpf_tramp_image; -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call); + void *func_addr); +void *arch_alloc_bpf_trampoline(unsigned int size); +void arch_free_bpf_trampoline(void *image, unsigned int size); +void arch_protect_bpf_trampoline(void *image, unsigned int size); +void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr); + u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, @@ -1119,6 +1149,7 @@ enum bpf_tramp_prog_type { struct bpf_tramp_image { void *image; + int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; @@ -1188,7 +1219,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1226,6 +1261,8 @@ enum bpf_dynptr_type { int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); #ifdef CONFIG_BPF_JIT int 
bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); @@ -1278,7 +1315,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ @@ -1301,7 +1338,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); @@ -1345,6 +1382,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { @@ -1410,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; @@ -1426,6 +1466,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; @@ -1438,7 +1481,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; - unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); + u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif @@ -1636,6 +1679,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -1649,6 +1693,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, + void *stub_func, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { @@ -2422,12 +2467,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); -int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, bool is_ex_cb); +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, 
const struct btf_type *pt, diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b21308..aaf004d94322 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; @@ -21,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. * Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size. */ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aa4d19d0bc94..d07d857ca67f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -275,6 +275,11 @@ struct bpf_reference_state { int callback_ref; }; +struct bpf_retval_range { + s32 minval; + s32 maxval; +}; + /* state of the program: * type of all registers and stack info */ @@ -297,8 +302,8 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; + struct bpf_retval_range callback_ret_range; bool in_callback_fn; - struct tnum callback_ret_range; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible @@ -316,16 +321,48 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; - int allocated_stack; + /* The state of the stack. Each element of the array describes BPF_REG_SIZE + * (i.e. 8) bytes worth of stack memory. + * stack[0] represents bytes [*(r10-8)..*(r10-1)] + * stack[1] represents bytes [*(r10-16)..*(r10-9)] + * ... + * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] + */ struct bpf_stack_state *stack; + /* Size of the current stack, in bytes. The stack state is tracked below, in + * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. 
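The slot indexing spelled out in the stack comment above boils down to a one-line mapping from a negative r10-relative offset to a stack[] index. A sketch of that arithmetic, not code from the patch:

/* Sketch: offsets -8..-1 map to stack[0], -16..-9 to stack[1], and so on. */
static int off_to_slot(int off)
{
	return (-off - 1) / BPF_REG_SIZE;	/* BPF_REG_SIZE is 8 */
}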
+ */ + int allocated_stack; +}; + +#define MAX_CALL_FRAMES 8 + +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +enum { + /* instruction references stack slot through PTR_TO_STACK register; + * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) + * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, + * 8 bytes per slot, so slot index (spi) is [0, 63]) + */ + INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ + + INSN_F_SPI_MASK = 0x3f, /* 6 bits */ + INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + + INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ }; -struct bpf_idx_pair { - u32 prev_idx; +static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); +static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); + +struct bpf_jmp_history_entry { u32 idx; + /* insn idx can't be bigger than 1 million */ + u32 prev_idx : 22; + /* special flags, e.g., whether insn is doing register stack spill/load */ + u32 flags : 10; }; -#define MAX_CALL_FRAMES 8 /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { @@ -408,7 +445,7 @@ struct bpf_verifier_state { * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ - struct bpf_idx_pair *jmp_history; + struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; @@ -569,17 +606,28 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_arg_info { + enum bpf_arg_type arg_type; + union { + u32 mem_size; + }; +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ - bool has_tail_call; - bool tail_call_reachable; - bool has_ld_abs; - bool is_cb; - bool is_async_cb; - bool is_exception_cb; + bool has_tail_call: 1; + bool tail_call_reachable: 1; + bool has_ld_abs: 1; + bool is_cb: 1; + bool is_async_cb: 1; + bool is_exception_cb: 1; + bool args_cached: 1; + + u8 arg_cnt; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; @@ -618,6 +666,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ + bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; @@ -630,6 +679,10 @@ struct bpf_verifier_env { int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; + /* Allow access to uninitialized stack memory. Writes with fixed offset are + * always allowed, so this refers to reads (with fixed or variable offset), + * to writes with variable offset and to indirect (helper) accesses. 
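Returning to the bpf_jmp_history_entry flags introduced above, the documented bit layout implies packing and unpacking along these lines (a sketch derived from the masks in the diff, not verifier code):

/* Sketch: frame number in bits 0-2, slot index in bits 3-8, access in bit 9. */
static u32 insn_stack_flags(u32 frameno, u32 spi)
{
	return INSN_F_STACK_ACCESS |
	       (spi << INSN_F_SPI_SHIFT) |
	       (frameno & INSN_F_FRAMENO_MASK);
}

static u32 insn_flags_spi(u32 flags)
{
	return (flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
}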
+ */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; @@ -650,6 +703,7 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ @@ -684,6 +738,16 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + +static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, @@ -695,6 +759,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...); + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; @@ -717,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size); - /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) @@ -794,4 +854,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type) return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } +static inline bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + +static inline bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + +static inline bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + type = base_type(type); + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + +static inline bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; +} + +static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static inline void 
mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); +const char *dynptr_type_str(enum bpf_dynptr_type type); +const char *iter_type_str(const struct btf *btf, u32 btf_id); +const char *iter_state_str(enum bpf_iter_state state); + +void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h deleted file mode 100644 index 736ded4905e0..000000000000 --- a/include/linux/bpfilter.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BPFILTER_H -#define _LINUX_BPFILTER_H - -#include <uapi/linux/bpfilter.h> -#include <linux/usermode_driver.h> -#include <linux/sockptr.h> - -struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen); - -struct bpfilter_umh_ops { - struct umd_info info; - /* since ip_getsockopt() can run in parallel, serialize access to umh */ - struct mutex lock; - int (*sockopt)(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set); - int (*start)(void); -}; -extern struct bpfilter_umh_ops bpfilter_ops; -#endif diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5f23ee599889..d78454a4dd1f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -252,11 +251,10 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[], * address_spaces. 
*/ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler); + get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, @@ -270,7 +268,6 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); -void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); diff --git a/include/linux/cache.h b/include/linux/cache.h index 9900d20b76c2..0ecb17bb6883 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,6 +85,31 @@ #define cache_line_size() L1_CACHE_BYTES #endif +#ifndef __cacheline_group_begin +#define __cacheline_group_begin(GROUP) \ + __u8 __cacheline_group_begin__##GROUP[0] +#endif + +#ifndef __cacheline_group_end +#define __cacheline_group_end(GROUP) \ + __u8 __cacheline_group_end__##GROUP[0] +#endif + +#ifndef CACHELINE_ASSERT_GROUP_MEMBER +#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \ + BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \ + offsetofend(TYPE, MEMBER) <= \ + offsetof(TYPE, __cacheline_group_end__##GROUP))) +#endif + +#ifndef CACHELINE_ASSERT_GROUP_SIZE +#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \ + BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \ + SIZE) +#endif + /* * Helper to add padding within a struct to ensure data fall into separate * cachelines. diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 3552ec82b725..f0df518e11dd 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -9,6 +9,14 @@ #include <linux/bug.h> #include <linux/module.h> +#include <asm/cfi.h> + +#ifndef cfi_get_offset +static inline int cfi_get_offset(void) +{ + return 0; +} +#endif #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -38,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr, #endif /* CONFIG_ARCH_USES_CFI_TRAPS */ #endif /* CONFIG_MODULES */ +#ifndef CFI_NOSEAL +#define CFI_NOSEAL(x) +#endif + #endif /* _LINUX_CFI_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a6b6b77ccb6..ea48c861cd36 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,20 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * Add padding to separate the read mostly rstat_cpu and + * rstat_css_list into a different cacheline from the following + * rstat_flush_next and *bstat fields which can have frequent updates. + */ + CACHELINE_PADDING(_pad_); + + /* + * A singly-linked list of cgroup structures to be rstat flushed. 
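Stepping back to the cacheline-group helpers added to cache.h above: they bracket hot members inside a struct so the grouping can be enforced at build time. A minimal sketch with an invented struct follows; only the macros come from cache.h.

#include <linux/cache.h>
#include <linux/types.h>

/* Invented struct; only the macros come from cache.h. */
struct my_stats {
	__cacheline_group_begin(hot);
	u64 rx_packets;
	u64 rx_bytes;
	__cacheline_group_end(hot);
	u64 rarely_touched;
};

static void my_stats_layout_asserts(void)
{
	/* Build breaks if rx_bytes ever moves out of the "hot" group... */
	CACHELINE_ASSERT_GROUP_MEMBER(struct my_stats, hot, rx_bytes);
	/* ...or if the group outgrows one 64-byte cacheline. */
	CACHELINE_ASSERT_GROUP_SIZE(struct my_stats, hot, 64);
}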
+ * This is a scratch field to be used exclusively by + * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + */ + struct cgroup *rstat_flush_next; + /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; @@ -548,6 +562,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; + /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the @@ -561,9 +579,6 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; - /* A list running through the active hierarchies */ - struct list_head root_list; - /* Hierarchy-specific flags */ unsigned int flags; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0ef0af66080e..34aaf0e87def 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -69,6 +69,7 @@ struct css_task_iter { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> @@ -386,7 +387,6 @@ static inline void cgroup_unlock(void) * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU -extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ @@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 9f1a9c455b68..c2d09bc4f976 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -125,25 +125,55 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * trivial wrapper around DEFINE_CLASS() above specifically * for locks. * + * DEFINE_GUARD_COND(name, ext, condlock) + * wrapper around EXTEND_CLASS above to add conditional lock + * variants to a base class, eg. mutex_trylock() or + * mutex_lock_interruptible(). + * * guard(name): - * an anonymous instance of the (guard) class + * an anonymous instance of the (guard) class, not recommended for + * conditional locks. * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declared in a for-loop such that its scope is * bound to the next (compound) statement. * + * for conditional locks the loop body is skipped when the lock is not + * acquired. + * + * scoped_cond_guard (name, fail, args...) { }: + * similar to scoped_guard(), except it executes the given fail + * statement when the lock acquire fails.
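The conditional-guard machinery being documented here is easiest to see in use. A short sketch, assuming the mutex guard classes the kernel defines via DEFINE_GUARD()/DEFINE_GUARD_COND() in linux/mutex.h; the work function is a placeholder.

static DEFINE_MUTEX(my_mutex);
static void do_work_locked(void);	/* placeholder */

static void plain_guard(void)
{
	guard(mutex)(&my_mutex);	/* dropped automatically on return */
	do_work_locked();
}

static void try_guard(void)
{
	/* With a conditional class such as mutex_try, the body is skipped
	 * when mutex_trylock() fails.
	 */
	scoped_guard(mutex_try, &my_mutex)
		do_work_locked();
}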
+ * */ #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T) + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return *_T; } + +#define DEFINE_GUARD_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \ + class_##_name##_t _T) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } #define guard(_name) \ CLASS(_name, __UNIQUE_ID(guard)) +#define __guard_ptr(_name) class_##_name##_lock_ptr + #define scoped_guard(_name, args...) \ for (CLASS(_name, scope)(args), \ - *done = NULL; !done; done = (void *)1) + *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1) + +#define scoped_cond_guard(_name, _fail, args...) \ + for (CLASS(_name, scope)(args), \ + *done = NULL; !done; done = (void *)1) \ + if (!__guard_ptr(_name)(&scope)) _fail; \ + else /* * Additional helper macros for generating lock guards with types, either for @@ -152,6 +182,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * * DEFINE_LOCK_GUARD_0(name, lock, unlock, ...) * DEFINE_LOCK_GUARD_1(name, type, lock, unlock, ...) + * DEFINE_LOCK_GUARD_1_COND(name, ext, condlock) * * will result in the following type: * @@ -173,6 +204,11 @@ typedef struct { \ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (_T->lock) { _unlock; } \ +} \ + \ +static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ +{ \ + return _T->lock; \ } @@ -201,4 +237,14 @@ __DEFINE_LOCK_GUARD_1(_name, _type, _lock) __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__) \ __DEFINE_LOCK_GUARD_0(_name, _lock) +#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ + if (_T->lock && !(_condlock)) _T->lock = NULL; \ + _t; }), \ + typeof_member(class_##_name##_t, lock) l) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } + + #endif /* __LINUX_GUARDS_H */ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index ace3a4ce2fc9..1293c38ddb7f 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -448,8 +448,8 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, */ #define clk_hw_register_fixed_rate_with_accuracy_parent_hw(dev, name, \ parent_hw, flags, fixed_rate, fixed_accuracy) \ - __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw) \ - NULL, NULL, (flags), (fixed_rate), \ + __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw), \ + NULL, (flags), (fixed_rate), \ (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec1..aebb65bf95a7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -136,7 +136,7 @@ #endif #define __diag_ignore_all(option, comment) \ - __diag_GCC(8, ignore, option) + __diag(__diag_GCC_ignore option) /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" diff --git a/include/linux/connector.h b/include/linux/connector.h index cec2d99ae902..70bc1160f3d8 100644 --- a/include/linux/connector.h +++ 
b/include/linux/connector.h @@ -100,8 +100,7 @@ void cn_del_callback(const struct cb_id *id); */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask, - int (*filter)(struct sock *dsk, struct sk_buff *skb, - void *data), + netlink_filter_fn filter, void *filter_data); /** diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1c5ca92a0555..afda5f24d3dd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus, { } #endif + /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index efc0c0b07efb..172d0a743e5d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -66,15 +66,12 @@ enum cpuhp_state { CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, - CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, - /* Must be after CPUHP_MM_VMSTAT_DEAD */ - CPUHP_MM_DEMOTION_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, @@ -96,7 +93,6 @@ enum cpuhp_state { CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_IOVA_DEAD, - CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_AP_DTPM_CPU_DEAD, @@ -108,7 +104,6 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, - CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, @@ -118,13 +113,11 @@ enum cpuhp_state { CPUHP_XEN_EVTCHN_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, - CPUHP_NET_FLOW_PREPARE, CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, - CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, @@ -151,18 +144,14 @@ enum cpuhp_state { CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, - CPUHP_AP_IRQ_RISCV_STARTING, CPUHP_AP_IRQ_LOONGARCH_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, - CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, - CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, @@ -179,7 +168,6 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, - CPUHP_AP_MARCO_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, @@ -217,9 +205,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, - CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, - CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, @@ -251,9 +237,7 @@ enum cpuhp_state { CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, - CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, - /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */ - 
CPUHP_AP_MM_DEMOTION_ONLINE, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d629094fac6e..875d12598bd2 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) return false; } +static inline bool cpuset_cpu_is_isolated(int cpu) +{ + return false; +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 5126a4fecb44..9eaeaafe0cad 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -87,12 +87,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#endif - int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, bool *high); diff --git a/include/linux/crc-ccitt.h b/include/linux/crc-ccitt.h index 72c92c396bb8..cd4f420231ba 100644 --- a/include/linux/crc-ccitt.h +++ b/include/linux/crc-ccitt.h @@ -5,19 +5,12 @@ #include <linux/types.h> extern u16 const crc_ccitt_table[256]; -extern u16 const crc_ccitt_false_table[256]; extern u16 crc_ccitt(u16 crc, const u8 *buffer, size_t len); -extern u16 crc_ccitt_false(u16 crc, const u8 *buffer, size_t len); static inline u16 crc_ccitt_byte(u16 crc, const u8 c) { return (crc >> 8) ^ crc_ccitt_table[(crc ^ c) & 0xff]; } -static inline u16 crc_ccitt_false_byte(u16 crc, const u8 c) -{ - return (crc << 8) ^ crc_ccitt_false_table[(crc >> 8) ^ c]; -} - #endif /* _LINUX_CRC_CCITT_H */ diff --git a/include/linux/damon.h b/include/linux/damon.h index e00ddf1ed39c..5881e4ac30be 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -2,7 +2,7 @@ /* * DAMON api * - * Author: SeongJae Park <sjpark@amazon.de> + * Author: SeongJae Park <sj@kernel.org> */ #ifndef _DAMON_H_ @@ -136,6 +136,9 @@ enum damos_action { * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -153,6 +156,17 @@ enum damos_action { * You could customize the prioritization logic by setting &weight_sz, * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. 
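
/*
 * [Editor's sketch, not part of the diff] A minimal illustration of filling
 * in the quota knobs described above. All values are arbitrary, and the
 * metric callback sketch_pressure_score() is invented; get_score and
 * get_score_arg are the new fields documented in the hunk that follows.
 */
static unsigned long sketch_pressure_score(void *arg)
{
	/* should return the current feedback score; 10,000 is the target */
	return *(unsigned long *)arg;
}

static struct damos_quota sketch_quota = {
	.ms = 100,			/* hard limit: 100ms CPU per reset_interval */
	.sz = 512 * 1024 * 1024,	/* hard limit: 512MiB per reset_interval */
	.reset_interval = 1000,		/* in milliseconds */
	.weight_nr_accesses = 1000,	/* prefer frequently accessed regions */
	.weight_age = 100,
	.get_score = sketch_pressure_score,
};
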
+ * + * If @get_score function pointer is set, DAMON calls it back with + * @get_score_arg and gets its return value for every @reset_interval. + * Then, DAMON adjusts the effective quota using the return value as a feedback + * score to the current quota, using its internal feedback loop algorithm. + * + * The feedback loop algorithm assumes the quota input and the feedback score + * output are in a positive proportional relationship, and the goal of the + * tuning is getting the feedback score value of 10,000. If @ms and/or @sz are + * set together, those work as a hard limit quota. If neither @ms nor @sz are + * set, the mechanism starts from the quota of one byte. */ struct damos_quota { unsigned long ms; @@ -163,6 +177,9 @@ struct damos_quota { unsigned int weight_nr_accesses; unsigned int weight_age; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -179,6 +196,9 @@ struct damos_quota { /* For prioritization */ unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; + + /* For feedback loop */ + unsigned long esz_bp; }; /** diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3da2f0545d5d..1666c387861f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,12 +68,12 @@ extern const struct qstr dotdot_name; * large memory footprint increase). */ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 32 /* 192 bytes */ +# define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ -# else # define DNAME_INLINE_LEN 40 /* 128 bytes */ +# else +# define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif #endif @@ -101,8 +101,8 @@ struct dentry { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; - struct list_head d_child; /* child of parent list */ - struct list_head d_subdirs; /* our children */ + struct hlist_node d_sib; /* child of parent list */ + struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ @@ -111,7 +111,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -} __randomize_layout; +}; /* * dentry->d_lock spinlock nesting subclasses: @@ -151,13 +151,13 @@ struct dentry_operations { */ /* d_flags entries */ -#define DCACHE_OP_HASH 0x00000001 -#define DCACHE_OP_COMPARE 0x00000002 -#define DCACHE_OP_REVALIDATE 0x00000004 -#define DCACHE_OP_DELETE 0x00000008 -#define DCACHE_OP_PRUNE 0x00000010 +#define DCACHE_OP_HASH BIT(0) +#define DCACHE_OP_COMPARE BIT(1) +#define DCACHE_OP_REVALIDATE BIT(2) +#define DCACHE_OP_DELETE BIT(3) +#define DCACHE_OP_PRUNE BIT(4) -#define DCACHE_DISCONNECTED 0x00000020 +#define DCACHE_DISCONNECTED BIT(5) /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -168,50 +168,46 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. 
*/ -#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ +#define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ -#define DCACHE_CANT_MOUNT 0x00000100 -#define DCACHE_GENOCIDE 0x00000200 -#define DCACHE_SHRINK_LIST 0x00000400 +#define DCACHE_CANT_MOUNT BIT(8) +#define DCACHE_SHRINK_LIST BIT(10) -#define DCACHE_OP_WEAK_REVALIDATE 0x00000800 +#define DCACHE_OP_WEAK_REVALIDATE BIT(11) -#define DCACHE_NFSFS_RENAMED 0x00001000 +#define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED 0x00008000 +#define DCACHE_DENTRY_KILLED BIT(15) -#define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST 0x00080000 +#define DCACHE_LRU_LIST BIT(19) -#define DCACHE_ENTRY_TYPE 0x00700000 -#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ -#define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ -#define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ -#define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ -#define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ +#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ +#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ -#define DCACHE_MAY_FREE 0x00800000 -#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ -#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL 0x04000000 +#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(26) -#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR 0x20000000 -#define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(29) +#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ extern seqlock_t rename_lock; @@ -220,8 +216,6 @@ extern seqlock_t rename_lock; */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); -extern struct dentry * 
d_instantiate_unique(struct dentry *, struct inode *); -extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); @@ -242,15 +236,11 @@ extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_for_umount(struct super_block *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); -/* <clickety>-<click> the ramfs-type tree */ -extern void d_genocide(struct dentry *); - extern void d_mark_tmpfile(struct file *, struct inode *); extern void d_tmpfile(struct file *, struct inode *); @@ -274,12 +264,8 @@ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); -/* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); -extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); -extern struct dentry *__d_lookup_rcu(const struct dentry *parent, - const struct qstr *name, unsigned *seq); static inline unsigned d_count(const struct dentry *dentry) { @@ -301,20 +287,40 @@ extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** - * dget, dget_dlock - get a reference to a dentry - * @dentry: dentry to get a reference to + * dget_dlock - get a reference to a dentry + * @dentry: dentry to get a reference to * - * Given a dentry or %NULL pointer increment the reference count - * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. + * Given a live dentry, increment the reference count and return the dentry. + * Caller must hold @dentry->d_lock. Making sure that dentry is alive is + * caller's responsibility. There are many conditions sufficient to guarantee + * that; e.g. anything with non-negative refcount is alive, so's anything + * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { - if (dentry) - dentry->d_lockref.count++; + dentry->d_lockref.count++; return dentry; } + +/** + * dget - get a reference to a dentry + * @dentry: dentry to get a reference to + * + * Given a dentry or %NULL pointer increment the reference count + * if appropriate and return the dentry. A dentry will not be + * destroyed when it has references. Conversely, a dentry with + * no references can disappear for any number of reasons, starting + * with memory pressure. In other words, that primitive is + * used to clone an existing reference; using it on something with + * zero refcount is a bug. + * + * NOTE: it will spin if @dentry->d_lock is held. From the deadlock + * avoidance point of view it is equivalent to spin_lock()/increment + * refcount/spin_unlock(), so calling it under @dentry->d_lock is + * always a bug; so's calling it under ->d_lock on any of its descendants. 
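
/*
 * [Editor's sketch, not part of the diff] With the NULL check gone,
 * dget_dlock() is only legal on a dentry known to be alive, under its
 * ->d_lock. A minimal illustration of the intended pattern (the helper
 * name is invented):
 */
static struct dentry *sketch_try_grab(struct dentry *dentry)
{
	struct dentry *ref = NULL;

	spin_lock(&dentry->d_lock);
	if (!d_unhashed(dentry))	/* hashed implies alive, per the comment above */
		ref = dget_dlock(dentry);
	spin_unlock(&dentry->d_lock);
	return ref;
}
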
+ * + */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) @@ -325,12 +331,11 @@ static inline struct dentry *dget(struct dentry *dentry) extern struct dentry *dget_parent(struct dentry *dentry); /** - * d_unhashed - is dentry hashed - * @dentry: entry to check + * d_unhashed - is dentry hashed + * @dentry: entry to check * - * Returns true if the dentry passed is not currently hashed. + * Returns true if the dentry passed is not currently hashed. */ - static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); @@ -490,14 +495,6 @@ static inline int simple_positive(const struct dentry *dentry) return d_really_is_positive(dentry) && !d_unhashed(dentry); } -extern void d_set_fallthru(struct dentry *dentry); - -static inline bool d_is_fallthru(const struct dentry *dentry) -{ - return dentry->d_flags & DCACHE_FALLTHRU; -} - - extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) @@ -547,21 +544,6 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) } /** - * d_backing_dentry - Get upper or lower dentry we should be using - * @upper: The upper layer - * - * This is the helper that should be used to get the dentry of the inode that - * will be used if this dentry were opened as a file. It may be the upper - * dentry or it may be a lower dentry pinned by the upper. - * - * Normal filesystems should not use this to access their own dentries. - */ -static inline struct dentry *d_backing_dentry(struct dentry *upper) -{ - return upper; -} - -/** * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) @@ -600,4 +582,14 @@ struct name_snapshot { void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); +static inline struct dentry *d_first_child(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); +} + +static inline struct dentry *d_next_sibling(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); +} + #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 3f31baa3293f..8ff4add71f88 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -343,16 +343,19 @@ struct dma_buf { /** * @exp_name: * - * Name of the exporter; useful for debugging. See the - * DMA_BUF_SET_NAME IOCTL. + * Name of the exporter; useful for debugging. Must not be NULL. */ const char *exp_name; /** * @name: * - * Userspace-provided name; useful for accounting and debugging, - * protected by dma_resv_lock() on @resv and @name_lock for read access. + * Userspace-provided name. Default value is NULL. If not NULL, + * length cannot be longer than DMA_BUF_NAME_LEN, including the NUL + * char. Useful for accounting and debugging. 
Read/Write accesses + * are protected by @name_lock + * + * See the IOCTLs DMA_BUF_SET_NAME or DMA_BUF_SET_NAME_A/B */ const char *name; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884..3eb3589ff43e 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -21,7 +21,6 @@ struct bus_dma_region { phys_addr_t cpu_start; dma_addr_t dma_start; u64 size; - u64 offset; }; static inline dma_addr_t translate_phys_to_dma(struct device *dev, @@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size) - return (dma_addr_t)paddr - m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = paddr - m->cpu_start; + + if (paddr >= m->cpu_start && offset < m->size) + return m->dma_start + offset; + } /* make sure dma_capable fails when no translation is available */ return DMA_MAPPING_ERROR; @@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size) - return (phys_addr_t)dma_addr + m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = dma_addr - m->dma_start; + + if (dma_addr >= m->dma_start && offset < m->size) + return m->cpu_start + offset; + } return (phys_addr_t)-1; } diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index b3772edca2e6..e06bad467f55 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/printk.h> #include <linux/rcupdate.h> +#include <linux/timekeeping.h> struct dma_fence; struct dma_fence_ops; diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 578fc5fa3750..9cf896ea1d41 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -17,9 +17,6 @@ struct dpll_pin; struct dpll_device_ops { int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_mode *mode, struct netlink_ext_ack *extack); - bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv, - const enum dpll_mode mode, - struct netlink_ext_ack *extack); int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_lock_status *status, struct netlink_ext_ack *extack); @@ -80,6 +77,9 @@ struct dpll_pin_ops { const struct dpll_device *dpll, void *dpll_priv, const s32 phase_adjust, struct netlink_ext_ack *extack); + int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, void *dpll_priv, + s64 *ffo, struct netlink_ext_ack *extack); }; struct dpll_pin_frequency { diff --git a/include/linux/edac.h b/include/linux/edac.h index fa4bda2a70f6..1174beb94ab6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. * @MEM_HBM2: High bandwidth Memory Gen 2. + * @MEM_HBM3: High bandwidth Memory Gen 3. 
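
/*
 * [Editor's illustration, not part of the diff] The rewritten dma-direct
 * helpers above translate by offset between cpu_start and dma_start.
 * With a made-up single-entry map:
 *
 *	static const struct bus_dma_region sketch_map[] = {
 *		{ .cpu_start = 0x80000000, .dma_start = 0x00000000,
 *		  .size = 0x40000000 },
 *		{ }	(an entry with size == 0 terminates the walk)
 *	};
 *
 * translate_phys_to_dma(dev, 0x80001000) yields 0x00001000, and
 * translate_dma_to_phys(dev, 0x00001000) yields back 0x80001000;
 * addresses outside every range return DMA_MAPPING_ERROR and
 * (phys_addr_t)-1 respectively.
 */
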
*/ enum mem_type { MEM_EMPTY = 0, @@ -218,6 +219,7 @@ enum mem_type { MEM_NVDIMM, MEM_WIO2, MEM_HBM2, + MEM_HBM3, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -248,6 +250,7 @@ enum mem_type { #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) +#define MEM_FLAG_HBM3 BIT(MEM_HBM3) /** * enum edac_type - Error Detection and Correction capabilities and mode diff --git a/include/linux/efi.h b/include/linux/efi.h index 9cc5bf32f6f2..c74f47711f0b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -40,6 +40,7 @@ struct screen_info; #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_ACCESS_DENIED (15 | (1UL << (BITS_PER_LONG-1))) #define EFI_TIMEOUT (18 | (1UL << (BITS_PER_LONG-1))) #define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) @@ -1348,4 +1349,15 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table) umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n); +/* + * efivar ops event type + */ +#define EFIVAR_OPS_RDONLY 0 +#define EFIVAR_OPS_RDWR 1 + +extern struct blocking_notifier_head efivar_ops_nh; + +void efivars_generic_ops_register(void); +void efivars_generic_ops_unregister(void); + #endif /* _LINUX_EFI_H */ diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b9caa01dfac4..88d91e087471 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util, unsigned long allowed_cpu_cap) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int cpu; @@ -241,11 +241,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; + ref_freq = arch_scale_freq_ref(cpu); - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d95ab85f96ba..b0fb775a600d 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -7,6 +7,11 @@ #include <linux/syscalls.h> #include <linux/seccomp.h> #include <linux/sched.h> +#include <linux/context_tracking.h> +#include <linux/livepatch.h> +#include <linux/resume_user_mode.h> +#include <linux/tick.h> +#include <linux/kmsan.h> #include <asm/entry-common.h> @@ -98,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} * done between establishing state and enabling interrupts. The caller must * enable interrupts before invoking syscall_enter_from_user_mode_work(). 
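
/*
 * [Editor's sketch, not part of the diff] How an architecture's syscall
 * glue is expected to use the entry/exit helpers that this diff turns into
 * __always_inline functions. Purely schematic and invented; real arch code
 * also dispatches the syscall table between the two calls.
 */
static void sketch_arch_syscall(struct pt_regs *regs, long nr)
{
	nr = syscall_enter_from_user_mode(regs, nr);
	/* ... invoke sys_call_table[nr] here ... */
	syscall_exit_to_user_mode(regs);
}
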
*/ -void enter_from_user_mode(struct pt_regs *regs); +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_enter_from_user_mode(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(__ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); +} /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts @@ -117,6 +134,9 @@ void enter_from_user_mode(struct pt_regs *regs); */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work); + /** * syscall_enter_from_user_mode_work - Check and handle work before invoking * a syscall @@ -140,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); + + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); + + return syscall; +} /** * syscall_enter_from_user_mode - Establish state and check and handle work @@ -159,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. */ -long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = syscall_enter_from_user_mode_work(regs, syscall); + instrumentation_end(); + + return ret; +} /** * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() @@ -259,6 +299,43 @@ static __always_inline void arch_exit_to_user_mode(void) { } void arch_do_signal_or_restart(struct pt_regs *regs); /** + * exit_to_user_mode_loop - do any pending work before leaving to user space + */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work); + +/** + * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * 1) check that interrupts are disabled + * 2) call tick_nohz_user_enter_prepare() + * 3) call exit_to_user_mode_loop() if any flags from + * EXIT_TO_USER_MODE_WORK are set + * 4) check that interrupts are still disabled + */ +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work; + + lockdep_assert_irqs_disabled(); + + /* Flush pending rcuog wakeup before the last need_resched() check */ + tick_nohz_user_enter_prepare(); + + ti_work = read_thread_flags(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that kernel state is sane for a return to userspace */ + kmap_assert_nomap(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + +/** * exit_to_user_mode - Fixup state when exiting to user mode * * Syscall/interrupt exit enables interrupts, but the kernel state is @@ -276,7 +353,17 @@ void arch_do_signal_or_restart(struct 
pt_regs *regs); * non-instrumentable. * The caller has to invoke syscall_exit_to_user_mode_work() before this. */ -void exit_to_user_mode(void); +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} /** * syscall_exit_to_user_mode_work - Handle work before returning to user mode diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 689028257fcc..325e0778e937 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -95,6 +95,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len + * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -102,6 +103,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_TX_PUSH = BIT(2), ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), + ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) @@ -409,8 +411,10 @@ struct ethtool_pause_stats { * not entire FEC data blocks. This is a non-standard statistic. * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. * - * @lane: per-lane/PCS-instance counts as defined by the standard - * @total: error counts for the entire port, for drivers incapable of reporting + * For each of the above fields, the two substructure members are: + * + * - @lanes: per-lane/PCS-instance counts as defined by the standard + * - @total: error counts for the entire port, for drivers incapable of reporting * per-lane stats * * Drivers should fill in either only total or per-lane statistics, core @@ -595,9 +599,46 @@ struct ethtool_mm_stats { }; /** + * struct ethtool_rxfh_param - RXFH (RSS) parameters + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @indir_size: On SET, the array size of the user buffer for the + * indirection table, which may be zero, or + * %ETH_RXFH_INDIR_NO_CHANGE. On GET (read from the driver), + * the array size of the hardware indirection table. + * @indir: The indirection table of size @indir_size entries. + * @key_size: On SET, the array size of the user buffer for the hash key, + * which may be zero. On GET (read from the driver), the size of the + * hardware hash key. + * @key: The hash key of size @key_size bytes. + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. On SET, %ETH_RXFH_CONTEXT_ALLOC is used + * to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @rss_delete: Set to non-ZERO to remove the @rss_context context. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. + */ +struct ethtool_rxfh_param { + u8 hfunc; + u32 indir_size; + u32 *indir; + u32 key_size; + u8 *key; + u32 rss_context; + u8 rss_delete; + u8 input_xfrm; +}; + +/** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. 
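
/*
 * [Editor's sketch, not part of the diff] A driver-side get_rxfh converted
 * to the consolidated struct ethtool_rxfh_param above. The foo_* driver
 * names and sizes are invented.
 */
struct foo_priv {
	u32 rss_indir[128];
	u8 rss_key[40];
};

static int foo_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh)
{
	struct foo_priv *priv = netdev_priv(dev);

	rxfh->hfunc = ETH_RSS_HASH_TOP;
	if (rxfh->indir)
		memcpy(rxfh->indir, priv->rss_indir, sizeof(priv->rss_indir));
	if (rxfh->key)
		memcpy(rxfh->key, priv->rss_key, sizeof(priv->rss_key));
	return 0;
}
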
+ * @cap_rss_ctx_supported: indicates if the driver supports RSS + * contexts. + * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor + * RSS. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -694,15 +735,6 @@ struct ethtool_mm_stats { * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. - * @get_rxfh_context: Get the contents of the RX flow hash indirection table, - * hash key, and/or hash function assiciated to the given rss context. - * Returns a negative error code or zero. - * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting - * the contents of the RX flow hash indirection table, hash key, and/or - * hash function associated to the given context. Arguments which are set - * to %NULL or zero will remain unchanged. - * Returns a negative error code or zero. An error code must be returned - * if at least one unsupported change was requested. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. @@ -785,6 +817,8 @@ struct ethtool_mm_stats { */ struct ethtool_ops { u32 cap_link_lanes_supported:1; + u32 cap_rss_ctx_supported:1; + u32 cap_rss_sym_xor_supported:1; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); @@ -844,15 +878,9 @@ struct ethtool_ops { int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); - int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc); - int (*set_rxfh)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc); - int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc, u32 rss_context); - int (*set_rxfh_context)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc, - u32 *rss_context, bool delete); + int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); + int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, + struct netlink_ext_ack *extack); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); @@ -1044,6 +1072,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, } /** + * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. + * @dev: pointer to net_device structure + * @info: buffer to hold the result + * Returns zero on success, non-zero otherwise. + */ +int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info); + +/** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update * @fmt: Format of string to write @@ -1053,6 +1089,19 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); +/** + * ethtool_puts - Write string to ethtool string data + * @data: Pointer to a pointer to the start of string to update + * @str: String to write + * + * Write string to *data without a trailing newline. Update *data + * to point at start of next string. 
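
/*
 * [Editor's sketch, not part of the diff] Typical ethtool_puts() use in a
 * driver's get_strings callback; foo_gstrings is an invented string table.
 */
static const char foo_gstrings[][ETH_GSTRING_LEN] = {
	"rx_packets", "tx_packets",
};

static void foo_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	int i;

	if (sset != ETH_SS_STATS)
		return;
	for (i = 0; i < ARRAY_SIZE(foo_gstrings); i++)
		ethtool_puts(&data, foo_gstrings[i]);
}
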
+ * + * Prefer this function to ethtool_sprintf() when given only + * two arguments or if @fmt is just "%s". + */ +extern void ethtool_puts(u8 **data, const char *str); + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index b9d83652c097..e32bee4345fb 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -58,15 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { - return -ENOSYS; -} - -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, - unsigned mask) -{ - return -ENOSYS; } static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) @@ -92,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) #endif +static inline void eventfd_signal(struct eventfd_ctx *ctx) +{ + eventfd_signal_mask(ctx, 0); +} + #endif /* _LINUX_EVENTFD_H */ diff --git a/include/linux/evm.h b/include/linux/evm.h index 01fc495a83e2..36ec884320d9 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -31,6 +31,7 @@ extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); +extern int evm_inode_copy_up_xattr(const char *name); extern int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, @@ -117,6 +118,11 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry, return; } +static inline int evm_inode_copy_up_xattr(const char *name) +{ + return 0; +} + static inline int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 039fe0ce8d83..053137a0fe45 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -441,7 +441,7 @@ struct f2fs_sit_block { * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ #define ENTRIES_IN_SUM (F2FS_BLKSIZE / 8) -#define SUMMARY_SIZE (7) /* sizeof(struct summary) */ +#define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ #define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) diff --git a/include/linux/fb.h b/include/linux/fb.h index 94e2c44c6569..05dc9624897d 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -536,6 +536,7 @@ extern ssize_t fb_io_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos); extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf, size_t count, loff_t *ppos); +int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma); #define __FB_DEFAULT_IOMEM_OPS_RDWR \ .fb_read = fb_io_read, \ @@ -547,7 +548,7 
@@ extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf, .fb_imageblit = cfb_imageblit #define __FB_DEFAULT_IOMEM_OPS_MMAP \ - .fb_mmap = NULL /* default implementation */ + .fb_mmap = fb_io_mmap #define FB_DEFAULT_IOMEM_OPS \ __FB_DEFAULT_IOMEM_OPS_RDWR, \ @@ -848,7 +849,10 @@ static inline bool fb_modesetting_disabled(const char *drvname) } #endif -/* Convenience logging macros */ +/* + * Convenience logging macros + */ + #define fb_err(fb_info, fmt, ...) \ pr_err("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_notice(info, fmt, ...) \ @@ -860,4 +864,12 @@ static inline bool fb_modesetting_disabled(const char *drvname) #define fb_dbg(fb_info, fmt, ...) \ pr_debug("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) +#define fb_warn_once(fb_info, fmt, ...) \ + pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) + +#define fb_WARN_ONCE(fb_info, condition, fmt, ...) \ + WARN_ONCE(condition, "fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) +#define fb_WARN_ON_ONCE(fb_info, x) \ + fb_WARN_ONCE(fb_info, (x), "%s", "fb_WARN_ON_ONCE(" __stringify(x) ")") + #endif /* _LINUX_FB_H */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index bc4c3287a65e..78c8326d74ae 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -83,12 +83,17 @@ struct dentry; static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) { - fd = array_index_nospec(fd, fdt->max_fds); - return rcu_dereference_raw(fdt->fd[fd]); - } - return NULL; + unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); + struct file *needs_masking; + + /* + * 'mask' is zero for an out-of-bounds fd, all ones for ok. + * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. + * + * Accessing fdt->fd[0] is ok, but needs masking of the result. 
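+ * [Editor's note] For example, with fdt->max_fds == 256: fd == 3 gives
+ * mask == ~0UL, so fdt->fd[3] is loaded and returned unchanged, while
+ * fd == 300 gives mask == 0, so fdt->fd[0] is loaded but the final
+ * AND with the mask forces the returned pointer to NULL.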
+ */ + needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); + return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) @@ -114,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern struct file *close_fd_get_file(unsigned int fd); +extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); diff --git a/include/linux/file.h b/include/linux/file.h index 6e9099d29343..6834a29338c4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,18 +96,8 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); -extern int receive_fd(struct file *file, unsigned int o_flags); - -static inline int receive_fd_user(struct file *file, int __user *ufd, - unsigned int o_flags) -{ - if (ufd == NULL) - return -EFAULT; - return __receive_fd(file, ufd, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index a4953fafc8cb..68fb6c8142fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1067,7 +1067,7 @@ struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_prog_pack_free(struct bpf_binary_header *hdr); +void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { diff --git a/include/linux/firmware.h b/include/linux/firmware.h index de7fea3bca51..0311858b46ce 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -27,6 +27,7 @@ struct firmware { * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry + * @FW_UPLOAD_ERR_FW_INVALID: invalid firmware file * @FW_UPLOAD_ERR_MAX: Maximum error code marker */ enum fw_upload_err { @@ -38,6 +39,7 @@ enum fw_upload_err { FW_UPLOAD_ERR_INVALID_SIZE, FW_UPLOAD_ERR_RW_ERROR, FW_UPLOAD_ERR_WEAROUT, + FW_UPLOAD_ERR_FW_INVALID, FW_UPLOAD_ERR_MAX }; diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h new file mode 100644 index 000000000000..782cd5fc83d5 --- /dev/null +++ b/include/linux/framer/framer-provider.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer provider header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina <herve.codina@bootlin.com> + */ + +#ifndef __DRIVERS_PROVIDER_FRAMER_H +#define __DRIVERS_PROVIDER_FRAMER_H + +#include <linux/export.h> +#include <linux/framer/framer.h> +#include <linux/types.h> + +#define FRAMER_FLAG_POLL_STATUS BIT(0) + +/** + * struct framer_ops - set of function pointers for performing framer operations + * @init: operation to be performed for initializing the framer + * @exit: operation to be performed while exiting + * @power_on: powering on the framer + * @power_off: powering 
off the framer + * @flags: OR-ed flags (FRAMER_FLAG_*) to ask for core functionality + * - @FRAMER_FLAG_POLL_STATUS: + * Ask the core to poll for the framer status and + * notify consumers on change. + * The framer should call @framer_notify_status_change() when it + * detects a status change. This is usually done using interrupts. + * If the framer cannot detect this change, it can ask the core for + * status polling. The core will call @get_status() periodically + * and, on change detected, it will notify the consumer. + * @owner: the module owner containing the ops + */ +struct framer_ops { + int (*init)(struct framer *framer); + void (*exit)(struct framer *framer); + int (*power_on)(struct framer *framer); + int (*power_off)(struct framer *framer); + + /** + * @get_status: + * + * Optional. + * + * Used to get the framer status. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, a negative error code otherwise + */ + int (*get_status)(struct framer *framer, struct framer_status *status); + + /** + * @set_config: + * + * Optional. + * + * Used to set the framer configuration. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, a negative error code otherwise + */ + int (*set_config)(struct framer *framer, const struct framer_config *config); + + /** + * @get_config: + * + * Optional. + * + * Used to get the framer configuration. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, a negative error code otherwise + */ + int (*get_config)(struct framer *framer, struct framer_config *config); + + u32 flags; + struct module *owner; +}; + +/** + * struct framer_provider - represents the framer provider + * @dev: framer provider device + * @owner: the module owner having of_xlate + * @list: to maintain a linked list of framer providers + * @of_xlate: function pointer to obtain framer instance from the phandle arguments + */ +struct framer_provider { + struct device *dev; + struct module *owner; + struct list_head list; + struct framer * (*of_xlate)(struct device *dev, + struct of_phandle_args *args); +}; + +static inline void framer_set_drvdata(struct framer *framer, void *data) +{ + dev_set_drvdata(&framer->dev, data); +} + +static inline void *framer_get_drvdata(struct framer *framer) +{ + return dev_get_drvdata(&framer->dev); +} + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) + +/* Create and destroy a framer */ +struct framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); +void framer_destroy(struct framer *framer); + +/* devm version */ +struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); + +struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args); + +struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_provider_of_unregister(struct framer_provider *framer_provider); + +struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_notify_status_change(struct framer *framer); + +#else /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ + +static inline struct 
framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_destroy(struct framer *framer) +{ +} + +/* devm version */ +static inline struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_provider_of_unregister(struct framer_provider *framer_provider) +{ +} + +static inline struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_notify_status_change(struct framer *framer) +{ +} + +#endif /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ + +#define framer_provider_of_register(dev, xlate) \ + __framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#define devm_framer_provider_of_register(dev, xlate) \ + __devm_framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#endif /* __DRIVERS_PROVIDER_FRAMER_H */ diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h new file mode 100644 index 000000000000..9a9b88962c29 --- /dev/null +++ b/include/linux/framer/framer.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina <herve.codina@bootlin.com> + */ + +#ifndef __DRIVERS_FRAMER_H +#define __DRIVERS_FRAMER_H + +#include <linux/err.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/of.h> +#include <linux/device.h> +#include <linux/workqueue.h> + +/** + * enum framer_iface - Framer interface + * @FRAMER_IFACE_E1: E1 interface + * @FRAMER_IFACE_T1: T1 interface + */ +enum framer_iface { + FRAMER_IFACE_E1, + FRAMER_IFACE_T1, +}; + +/** + * enum framer_clock_type - Framer clock type + * @FRAMER_CLOCK_EXT: External clock + * @FRAMER_CLOCK_INT: Internal clock + */ +enum framer_clock_type { + FRAMER_CLOCK_EXT, + FRAMER_CLOCK_INT, +}; + +/** + * struct framer_config - Framer configuration + * @iface: Framer line interface + * @clock_type: Framer clock type + * @line_clock_rate: Framer line clock rate + */ +struct framer_config { + enum framer_iface iface; + enum framer_clock_type clock_type; + unsigned long line_clock_rate; +}; + +/** + * struct framer_status - Framer status + * @link_is_on: Framer link state: true if the link is on, false if it is off. 
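
/*
 * [Editor's sketch, not part of the diff] Consumer-side use of the API
 * declared below, as a probe routine might do it; error unwinding is
 * trimmed and foo_probe is an invented driver function.
 */
static int foo_probe(struct device *dev)
{
	struct framer *framer = devm_framer_get(dev, NULL);
	int ret;

	if (IS_ERR(framer))
		return PTR_ERR(framer);
	ret = framer_init(framer);
	if (ret)
		return ret;
	return framer_power_on(framer);
}
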
+ */ +struct framer_status { + bool link_is_on; +}; + +/** + * enum framer_event - Event available for notification + * @FRAMER_EVENT_STATUS: Event notified on framer_status changes + */ +enum framer_event { + FRAMER_EVENT_STATUS, +}; + +/** + * struct framer - represents the framer device + * @dev: framer device + * @id: id of the framer device + * @ops: function pointers for performing framer operations + * @mutex: mutex to protect framer_ops + * @init_count: used to protect when the framer is used by multiple consumers + * @power_count: used to protect when the framer is used by multiple consumers + * @pwr: power regulator associated with the framer + * @notify_status_work: work structure used for status notifications + * @notifier_list: notifier list used for notifications + * @polling_work: delayed work structure used for the polling task + * @prev_status: previous read status used by the polling task to detect changes + */ +struct framer { + struct device dev; + int id; + const struct framer_ops *ops; + struct mutex mutex; /* Protect framer */ + int init_count; + int power_count; + struct regulator *pwr; + struct work_struct notify_status_work; + struct blocking_notifier_head notifier_list; + struct delayed_work polling_work; + struct framer_status prev_status; +}; + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) +int framer_pm_runtime_get(struct framer *framer); +int framer_pm_runtime_get_sync(struct framer *framer); +int framer_pm_runtime_put(struct framer *framer); +int framer_pm_runtime_put_sync(struct framer *framer); +int framer_init(struct framer *framer); +int framer_exit(struct framer *framer); +int framer_power_on(struct framer *framer); +int framer_power_off(struct framer *framer); +int framer_get_status(struct framer *framer, struct framer_status *status); +int framer_get_config(struct framer *framer, struct framer_config *config); +int framer_set_config(struct framer *framer, const struct framer_config *config); +int framer_notifier_register(struct framer *framer, struct notifier_block *nb); +int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb); + +struct framer *framer_get(struct device *dev, const char *con_id); +void framer_put(struct device *dev, struct framer *framer); + +struct framer *devm_framer_get(struct device *dev, const char *con_id); +struct framer *devm_framer_optional_get(struct device *dev, const char *con_id); +#else +static inline int framer_pm_runtime_get(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_get_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_init(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_exit(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_on(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_off(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_get_status(struct framer *framer, struct framer_status *status) +{ + return -ENOSYS; +} + +static inline int framer_get_config(struct framer *framer, struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_set_config(struct framer *framer, const struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_notifier_register(struct framer *framer, + struct 
notifier_block *nb) +{ + return -ENOSYS; +} + +static inline int framer_notifier_unregister(struct framer *framer, + struct notifier_block *nb) +{ + return -ENOSYS; +} + +static inline struct framer *framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_put(struct device *dev, struct framer *framer) +{ +} + +static inline struct framer *devm_framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *devm_framer_optional_get(struct device *dev, const char *con_id) +{ + return NULL; +} + +#endif + +#endif /* __DRIVERS_FRAMER_H */ diff --git a/include/linux/framer/pef2256.h b/include/linux/framer/pef2256.h new file mode 100644 index 000000000000..71d80af58c40 --- /dev/null +++ b/include/linux/framer/pef2256.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PEF2256 consumer API + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina <herve.codina@bootlin.com> + */ +#ifndef __PEF2256_H__ +#define __PEF2256_H__ + +#include <linux/types.h> + +struct pef2256; +struct regmap; + +/* Retrieve the PEF2256 regmap */ +struct regmap *pef2256_get_regmap(struct pef2256 *pef2256); + +/* PEF2256 hardware versions */ +enum pef2256_version { + PEF2256_VERSION_UNKNOWN, + PEF2256_VERSION_1_2, + PEF2256_VERSION_2_1, + PEF2256_VERSION_2_2, +}; + +/* Get the PEF2256 hardware version */ +enum pef2256_version pef2256_get_version(struct pef2256 *pef2256); + +#endif /* __PEF2256_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..e6ba0cc6f2ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -434,7 +434,7 @@ struct address_space_operations { bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, @@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -484,9 +484,9 @@ struct address_space { unsigned long flags; struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; - void *private_data; + spinlock_t i_private_lock; + struct list_head i_private_list; + void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) */ struct file { union { + /* fput() uses task work when closing and freeing file (default). */ + struct callback_head f_task_work; + /* fput() must use workqueue (most kernel threads). 
*/ struct llist_node f_llist; - struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; @@ -1164,6 +1166,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ @@ -1185,7 +1188,8 @@ enum { struct sb_writers { unsigned short frozen; /* Is sb frozen? */ - unsigned short freeze_holders; /* Who froze fs? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -1645,9 +1649,70 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ static inline bool sb_write_started(const struct super_block *sb) { - return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * file_write_started - check if SB_FREEZE_WRITE is held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_started(file_inode(file)->i_sb); +} + +/** + * file_write_not_started - check if SB_FREEZE_WRITE is not held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. 
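
/*
 * [Editor's sketch, not part of the diff] These helpers are meant for
 * lockdep-backed assertions in write paths; the helper name is invented:
 */
static void sketch_assert_write_context(struct file *file)
{
	/* vfs_write() takes SB_FREEZE_WRITE for regular files beforehand */
	WARN_ON_ONCE(!file_write_started(file));
}
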
+ */ +static inline bool file_write_not_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_not_started(file_inode(file)->i_sb); } /** @@ -2029,9 +2094,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, @@ -2051,9 +2113,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { @@ -2517,26 +2594,31 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* - * file_user_path - get the path to display for memory mapped file - * * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying - * filesystem. When the mapped file path is displayed to user (e.g. via - * /proc/<pid>/maps), this helper should be used to get the path to display - * to the user, which is the path of the fd that user has requested to map. + * filesystem. When the mapped file path and inode number are displayed to + * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the + * path and inode number to display to the user, which is the path of the fd + * that user has requested to map and the inode number that would be returned + * by fstat() on that same fd. 
*/ +/* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } +/* Get the inode whose inode number to display in /proc/<pid>/maps */ +static inline const struct inode *file_user_inode(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return d_inode(backing_file_user_path(f)->dentry); + return file_inode(f); +} static inline struct file *file_clone_open(struct file *file) { @@ -2991,8 +3073,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); extern void @@ -3121,7 +3201,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bcb6609b54b3..11e6434b8e71 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -100,29 +100,49 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } -/* Simple call site for access decisions */ -static inline int fsnotify_perm(struct file *file, int mask) +/* + * fsnotify_file_area_perm - permission hook before access to file range + */ +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) { - int ret; - __u32 fsnotify_mask = 0; + __u32 fsnotify_mask = FS_ACCESS_PERM; + + /* + * filesystem may be modified in the context of permission events + * (e.g. by HSM filling a file on access), so sb freeze protection + * must not be held. 
+ */ + lockdep_assert_once(file_write_not_started(file)); - if (!(mask & (MAY_READ | MAY_OPEN))) + if (!(perm_mask & MAY_READ)) return 0; - if (mask & MAY_OPEN) { - fsnotify_mask = FS_OPEN_PERM; + return fsnotify_file(file, fsnotify_mask); +} + +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); +} - if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); +/* + * fsnotify_open_perm - permission hook before file open + */ +static inline int fsnotify_open_perm(struct file *file) +{ + int ret; - if (ret) - return ret; - } - } else if (mask & MAY_READ) { - fsnotify_mask = FS_ACCESS_PERM; + if (file->f_flags & __FMODE_EXEC) { + ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + if (ret) + return ret; } - return fsnotify_file(file, fsnotify_mask); + return fsnotify_file(file, FS_OPEN_PERM); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c0892d75ce33..7f63be5ca0f1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ -#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ - __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; @@ -530,6 +528,8 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 +#define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 +#define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; @@ -763,11 +763,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid); + int add_flags); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid); + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, @@ -775,15 +774,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, - NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } /* given a group and a mark, flag mark to be freed when all references are dropped */ diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c5..1b6053da8754 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. 
We have a concept - * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). + * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules + * implicit rules. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus * it can sleep). It will avoid disruptive actions like OOM killer. The * caller must handle the failure which is quite likely to happen under * heavy memory pressure. The flag is suitable when failure can easily be - * handled at small cost, such as reduced throughput + * handled at small cost, such as reduced throughput. * * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim * procedures that have previously failed if there is some indication - * that progress has been made else where. It can wait for other - * tasks to attempt high level approaches to freeing memory such as + * that progress has been made elsewhere. It can wait for other + * tasks to attempt high-level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is * a larger limit than with %__GFP_NORETRY. @@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t; * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting * memory tags at the same time as zeroing memory has minimal additional - * performace impact. + * performance impact. * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by @@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t; * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct - * reclaim, start physical IO or use any filesystem callback. + * reclaim, start physical IO or use any filesystem callback. It is very + * likely to fail to allocate memory, even for very small allocations. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. 
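For context on the GFP_NOWAIT documentation above and the GFP_NOWAIT definition change in the hunk below, here is a minimal caller-side sketch. It is not part of the patch; fetch_buf() and its callers are hypothetical, invented only to illustrate the pattern that the updated comments imply: an atomic-safe attempt that must tolerate silent failure, with a sleeping fallback where the context allows it.

#include <linux/gfp.h>
#include <linux/slab.h>

/* Hypothetical helper, not from the patch: try a non-sleeping allocation
 * first, optionally falling back to a sleeping one. */
static void *fetch_buf(size_t len, bool can_sleep)
{
	/* GFP_NOWAIT now carries __GFP_NOWARN, so a failure here is silent... */
	void *buf = kmalloc(len, GFP_NOWAIT);

	if (buf || !can_sleep)
		return buf;
	/*
	 * ...and, per the comment above, failure is likely under pressure even
	 * for small sizes, so it must always be handled. Retry with direct
	 * reclaim allowed when the caller may sleep.
	 */
	return kmalloc(len, GFP_KERNEL);
}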
@@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t; #define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) -#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 0aed62f0c633..e846bd4e7559 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -531,19 +531,40 @@ struct gpio_chip { #endif /* CONFIG_OF_GPIO */ }; -const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset); +char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset); + + +struct _gpiochip_for_each_data { + const char **label; + unsigned int *i; +}; + +DEFINE_CLASS(_gpiochip_for_each_data, + struct _gpiochip_for_each_data, + if (*_T.label) kfree(*_T.label), + ({ + struct _gpiochip_for_each_data _data = { label, i }; + *_data.i = 0; + _data; + }), + const char **label, int *i) /** * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range - * @chip: the chip to query - * @i: loop variable - * @base: first GPIO in the range - * @size: amount of GPIOs to check starting from @base - * @label: label of current GPIO + * @_chip: the chip to query + * @_i: loop variable + * @_base: first GPIO in the range + * @_size: amount of GPIOs to check starting from @_base + * @_label: label of current GPIO */ -#define for_each_requested_gpio_in_range(chip, i, base, size, label) \ - for (i = 0; i < size; i++) \ - if ((label = gpiochip_is_requested(chip, base + i)) == NULL) {} else +#define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label) \ + for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ + *_data.i < _size; \ + (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ + if ((*_data.label = \ + gpiochip_dup_line_label(_chip, _base + *_data.i)) == NULL) {} \ + else if (IS_ERR(*_data.label)) {} \ + else /* Iterates over all requested GPIO of the given @chip */ #define for_each_requested_gpio(chip, i, label) \ @@ -701,7 +722,6 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, #else #include <asm/bug.h> -#include <asm/errno.h> static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain) @@ -786,11 +806,10 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc); /* struct gpio_device getters */ int gpio_device_get_base(struct gpio_device *gdev); +const char *gpio_device_get_label(struct gpio_device *gdev); #else /* CONFIG_GPIOLIB */ -#include <linux/err.h> - #include <asm/bug.h> static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 86ea7c63a0d2..f316c8d0f3fc 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -659,6 +659,12 @@ enum pq_init_status { * number (nonce) provided by the host to prevent replay attacks. * public key and certificate also provided as part of the FW response. * + * CPUCP_PACKET_INFO_SIGNED_GET - + * Get the device information signed by the Trusted Platform device. + * device info data is also hashed with some unique number (nonce) provided + * by the host to prevent replay attacks.
public key and certificate also + provided as part of the FW response. + * * CPUCP_PACKET_MONITOR_DUMP_GET - * Get monitors registers dump from the CpuCP kernel. * The CPU will put the registers dump in a buffer allocated by the driver @@ -733,7 +739,7 @@ enum cpucp_packet_id { CPUCP_PACKET_ENGINE_CORE_ASID_SET, /* internal */ CPUCP_PACKET_RESERVED2, /* not used */ CPUCP_PACKET_SEC_ATTEST_GET, /* internal */ - CPUCP_PACKET_RESERVED3, /* not used */ + CPUCP_PACKET_INFO_SIGNED_GET, /* internal */ CPUCP_PACKET_RESERVED4, /* not used */ CPUCP_PACKET_MONITOR_DUMP_GET, /* debugfs */ CPUCP_PACKET_RESERVED5, /* not used */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index be20cff4ba73..451c1dff0e87 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -484,6 +484,82 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, } /** + * folio_zero_tail - Zero the tail of a folio. + * @folio: The folio to zero. + * @offset: The byte offset in the folio to start zeroing at. + * @kaddr: The address the folio is currently mapped to. + * + * If you have already used kmap_local_folio() to map a folio, written + * some data to it and now need to zero the end of the folio (and flush + * the dcache), you can use this function. If you do not have the + * folio kmapped (eg the folio has been partially populated by DMA), + * use folio_zero_range() or folio_zero_segment() instead. + * + * Return: An address which can be passed to kunmap_local(). + */ +static inline __must_check void *folio_zero_tail(struct folio *folio, + size_t offset, void *kaddr) +{ + size_t len = folio_size(folio) - offset; + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memset(kaddr, 0, max); + kunmap_local(kaddr); + len -= max; + offset += max; + max = PAGE_SIZE; + kaddr = kmap_local_folio(folio, offset); + } + } + + memset(kaddr, 0, len); + flush_dcache_folio(folio); + + return kaddr; +} + +/** + * folio_fill_tail - Copy some data to a folio and pad with zeroes. + * @folio: The destination folio. + * @offset: The offset into @folio at which to start copying. + * @from: The data to copy. + * @len: How many bytes of data to copy. + * + * This function is most useful for filesystems which support inline data. + * When they want to copy data from the inode into the page cache, this + * function does everything for them. It supports large folios even on + * HIGHMEM configurations. + */ +static inline void folio_fill_tail(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + char *to = kmap_local_folio(folio, offset); + + VM_BUG_ON(offset + len > folio_size(folio)); + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memcpy(to, from, max); + kunmap_local(to); + len -= max; + from += max; + offset += max; + max = PAGE_SIZE; + to = kmap_local_folio(folio, offset); + } + } + + memcpy(to, from, len); + to = folio_zero_tail(folio, offset + len, to + len); + kunmap_local(to); +} + +/** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from.
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index ddc7ebb70523..5f4c74facf6a 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -108,17 +108,13 @@ enum qm_stop_reason { }; enum qm_state { - QM_INIT = 0, - QM_START, - QM_CLOSE, + QM_WORK = 0, QM_STOP, }; enum qp_state { - QP_INIT = 1, - QP_START, + QP_START = 1, QP_STOP, - QP_CLOSE, }; enum qm_hw_ver { @@ -160,6 +156,11 @@ enum qm_cap_bits { QM_SUPPORT_RPM, }; +struct qm_dev_alg { + u64 alg_msk; + const char *alg; +}; + struct dfx_diff_registers { u32 *regs; u32 reg_offset; @@ -265,6 +266,16 @@ struct hisi_qm_cap_info { u32 v3_val; }; +struct hisi_qm_cap_record { + u32 type; + u32 cap_val; +}; + +struct hisi_qm_cap_tables { + struct hisi_qm_cap_record *qm_cap_table; + struct hisi_qm_cap_record *dev_cap_table; +}; + struct hisi_qm_list { struct mutex lock; struct list_head list; @@ -365,7 +376,6 @@ struct hisi_qm { struct work_struct rst_work; struct work_struct cmd_process; - const char *algs; bool use_sva; resource_size_t phys_base; @@ -376,6 +386,8 @@ struct hisi_qm { u32 mb_qos; u32 type_rate; struct qm_err_isolate isolate_data; + + struct hisi_qm_cap_tables cap_tables; }; struct hisi_qp_status { @@ -563,6 +575,8 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset); u32 hisi_qm_get_hw_info(struct hisi_qm *qm, const struct hisi_qm_cap_info *info_table, u32 index, bool is_read); +int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs, + u32 dev_algs_size); /* Used by VFIO ACC live migration driver */ struct pci_driver *hisi_sec_get_pf_driver(void); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f2044d5a652b..87e3bedf8eb0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,13 +13,13 @@ #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> -#include <linux/rbtree.h> +#include <linux/hrtimer_types.h> #include <linux/init.h> #include <linux/list.h> -#include <linux/percpu.h> +#include <linux/percpu-defs.h> +#include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/timer.h> -#include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -60,14 +60,6 @@ enum hrtimer_mode { }; /* - * Return values for the callback function - */ -enum hrtimer_restart { - HRTIMER_NORESTART, /* Timer is not restarted */ - HRTIMER_RESTART, /* Timer must be restarted */ -}; - -/* * Values to track state of the timer * * Possible states: @@ -95,38 +87,6 @@ enum hrtimer_restart { #define HRTIMER_STATE_ENQUEUED 0x01 /** - * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, - * the absolute expiry time in the hrtimers internal - * representation. The time is related to the clock on - * which the timer is based. Is setup by adding - * slack to the _softexpires value. For non range timers - * identical to _softexpires. - * @_softexpires: the absolute earliest expiry time of the hrtimer. - * The time which was given as expiry time when the timer - * was armed. - * @function: timer expiry callback function - * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) - * @is_rel: Set if the timer was armed relative - * @is_soft: Set if hrtimer will be expired in soft interrupt context. - * @is_hard: Set if hrtimer will be expired in hard interrupt context - * even on RT. 
- * - * The hrtimer structure must be initialized by hrtimer_init() - */ -struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); - struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; -}; - -/** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h new file mode 100644 index 000000000000..ad66a3081735 --- /dev/null +++ b/include/linux/hrtimer_types.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HRTIMER_TYPES_H +#define _LINUX_HRTIMER_TYPES_H + +#include <linux/types.h> +#include <linux/timerqueue_types.h> + +struct hrtimer_clock_base; + +/* + * Return values for the callback function + */ +enum hrtimer_restart { + HRTIMER_NORESTART, /* Timer is not restarted */ + HRTIMER_RESTART, /* Timer must be restarted */ +}; + +/** + * struct hrtimer - the basic hrtimer structure + * @node: timerqueue node, which also manages node.expires, + * the absolute expiry time in the hrtimers internal + * representation. The time is related to the clock on + * which the timer is based. Is setup by adding + * slack to the _softexpires value. For non range timers + * identical to _softexpires. + * @_softexpires: the absolute earliest expiry time of the hrtimer. + * The time which was given as expiry time when the timer + * was armed. + * @function: timer expiry callback function + * @base: pointer to the timer base (per cpu and per clock) + * @state: state information (See bit values above) + * @is_rel: Set if the timer was armed relative + * @is_soft: Set if hrtimer will be expired in soft interrupt context. + * @is_hard: Set if hrtimer will be expired in hard interrupt context + * even on RT. + * + * The hrtimer structure must be initialized by hrtimer_init() + */ +struct hrtimer { + struct timerqueue_node node; + ktime_t _softexpires; + enum hrtimer_restart (*function)(struct hrtimer *); + struct hrtimer_clock_base *base; + u8 state; + u8 is_rel; + u8 is_soft; + u8 is_hard; +}; + +#endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa0350b0812a..5adb86af35fc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -67,6 +67,26 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) +/* + * Mask of all large folio orders supported for anonymous THP; all orders up to + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1 + * (which is a limitation of the THP implementation). + */ +#define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) + +/* + * Mask of all large folio orders supported for file THP. + */ +#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) + +/* + * Mask of all large folio orders supported for THP. 
+ */ +#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) + +#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) @@ -77,45 +97,105 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; +extern unsigned long huge_anon_orders_always; +extern unsigned long huge_anon_orders_madvise; +extern unsigned long huge_anon_orders_inherit; -#define hugepage_flags_enabled() \ - (transparent_hugepage_flags & \ - ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ - (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) -#define hugepage_flags_always() \ - (transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_FLAG)) +static inline bool hugepage_global_enabled(void) +{ + return transparent_hugepage_flags & + ((1<<TRANSPARENT_HUGEPAGE_FLAG) | + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)); +} + +static inline bool hugepage_global_always(void) +{ + return transparent_hugepage_flags & + (1<<TRANSPARENT_HUGEPAGE_FLAG); +} + +static inline bool hugepage_flags_enabled(void) +{ + /* + * We cover both the anon and the file-backed case here; we must return + * true if globally enabled, even when all anon sizes are set to never. + * So we don't need to look at huge_anon_orders_inherit. + */ + return hugepage_global_enabled() || + huge_anon_orders_always || + huge_anon_orders_madvise; +} + +static inline int highest_order(unsigned long orders) +{ + return fls_long(orders) - 1; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + *orders &= ~BIT(prev); + return highest_order(*orders); +} /* * Do the below checks: * - For file vma, check if the linear page offset of vma is - * HPAGE_PMD_NR aligned within the file. The hugepage is - * guaranteed to be hugepage-aligned within the file, but we must - * check that the PMD-aligned addresses in the VMA map to - * PMD-aligned offsets within the file, else the hugepage will - * not be PMD-mappable. - * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE + * order-aligned within the file. The hugepage is + * guaranteed to be order-aligned within the file, but we must + * check that the order-aligned addresses in the VMA map to + * order-aligned offsets within the file, else the hugepage will + * not be mappable. + * - For all vmas, check if the haddr is in an aligned hugepage * area. */ -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { + unsigned long hpage_size = PAGE_SIZE << order; unsigned long haddr; /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) + hpage_size >> PAGE_SHIFT)) return false; } - haddr = addr & HPAGE_PMD_MASK; + haddr = ALIGN_DOWN(addr, hpage_size); - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } +/* + * Filter the bitfield of input orders to the ones suitable for use in the vma. + * See thp_vma_suitable_order(). + * All orders that pass the checks are returned as a bitfield. 
+ */ +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) +{ + int order; + + /* + * Iterate over orders, highest to lowest, removing orders that don't + * meet alignment requirements from the set. Exit loop at first order + * that meets requirements, since all lower orders must also meet + * requirements. + */ + + order = highest_order(orders); + + while (orders) { + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + return orders; +} + static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -126,12 +206,55 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) inode = vma->vm_file->f_inode; return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - (vma->vm_flags & VM_EXEC) && !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs); +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders); + +/** + * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma + * @vma: the vm area to check + * @vm_flags: use these vm_flags instead of vma->vm_flags + * @smaps: whether answer will be used for smaps file + * @in_pf: whether answer will be used by page fault handler + * @enforce_sysfs: whether sysfs config should be taken into account + * @orders: bitfield of all orders to consider + * + * Calculates the intersection of the requested hugepage orders and the allowed + * hugepage orders for the provided vma. Permitted orders are encoded as a set + * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 + * corresponds to order-3, etc). Order-0 is never considered a hugepage order. + * + * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage + * orders are allowed. + */ +static inline +unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Optimization to check if required orders are enabled early. 
*/ + if (enforce_sysfs && vma_is_anonymous(vma)) { + unsigned long mask = READ_ONCE(huge_anon_orders_always); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_anon_orders_madvise); + if (hugepage_global_always() || + ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) + mask |= READ_ONCE(huge_anon_orders_inherit); + + orders &= mask; + if (!orders) + return 0; + } + + return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, + enforce_sysfs, orders); +} #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -267,17 +390,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { return false; } -static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs) +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) { - return false; + return 0; +} + +static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + return 0; } static inline void folio_prep_large_rmappable(struct folio *folio) {} diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 236ec7b63c54..c1ee640d87b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_ORDER; + return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2b5e500bf093..83c4d060a559 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -172,11 +172,11 @@ #define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) -/* PV1 Layout 11ah 9.8.3.1 */ +/* PV1 Layout IEEE 802.11-2020 9.8.3.1 */ #define IEEE80211_PV1_FCTL_VERS 0x0003 #define IEEE80211_PV1_FCTL_FTYPE 0x001c #define IEEE80211_PV1_FCTL_STYPE 0x00e0 -#define IEEE80211_PV1_FCTL_TODS 0x0100 +#define IEEE80211_PV1_FCTL_FROMDS 0x0100 #define IEEE80211_PV1_FCTL_MOREFRAGS 0x0200 #define IEEE80211_PV1_FCTL_PM 0x0400 #define IEEE80211_PV1_FCTL_MOREDATA 0x0800 @@ -2720,6 +2720,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) #define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 #define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 +#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 /** * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3028af87716e..c1645c86eed9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) - return -EINVAL; + return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; @@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, return 0; } else { *vlan_tci = 0; - return -EINVAL; + return -ENODATA; } } diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce..adb83a42a6b9 100644 
--- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -11,7 +11,7 @@ * @__VA_ARGS__: arguments for @f * * Avoid retpoline overhead for known builtin, checking @f vs each of them and - * eventually invoking directly the builtin function. The functions are check + * eventually invoking directly the builtin function. The functions are checked * in the given order. Fallback to the indirect call. */ #define INDIRECT_CALL_1(f, f1, ...) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 40fc5813cf93..bccb3f1f6262 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -37,13 +37,6 @@ extern struct cred init_cred; #define INIT_TASK_COMM "swapper" -/* Attach to the init_task data structure for proper alignment */ -#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -#define __init_task_data __section(".data..init_task") -#else -#define __init_task_data /**/ -#endif - /* Attach to the thread_info data structure for proper alignment */ #define __init_thread_info __section(".data..init_thread_info") diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index ee07393445f9..a3529b962be6 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -12,6 +12,19 @@ #define TPMI_MINOR_VERSION(val) FIELD_GET(GENMASK(4, 0), val) #define TPMI_MAJOR_VERSION(val) FIELD_GET(GENMASK(7, 5), val) +/* + * List of supported TPMI IDs. + * Some TPMI IDs are not used by Linux, so the numbers are not consecutive. + */ +enum intel_tpmi_id { + TPMI_ID_RAPL = 0, /* Running Average Power Limit */ + TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */ + TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */ + TPMI_ID_SST = 5, /* Speed Select Technology */ + TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */ + TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ +}; + /** * struct intel_tpmi_plat_info - Platform information for a TPMI device instance * @package_id: CPU Package id @@ -32,7 +45,6 @@ struct intel_tpmi_plat_info { struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); - -int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, int *locked, - int *disabled); +int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked, + bool *write_blocked); #endif diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index aefb73eeeebf..68ed6697fece 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -6,71 +6,13 @@ #include <linux/xarray.h> #include <uapi/linux/io_uring.h> -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* the request is executed from poll, it should not be freed */ - IO_URING_F_MULTISHOT = 4, - /* executed by io-wq */ - IO_URING_F_IOWQ = 8, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, - - /* ctx state flags, for URING_CMD */ - IO_URING_F_SQE128 = (1 << 8), - IO_URING_F_CQE32 = (1 << 9), - IO_URING_F_IOPOLL = (1 << 10), - - /* set when uring wants to cancel a previously issued command */ - IO_URING_F_CANCEL = (1 << 11), - IO_URING_F_COMPAT = (1 << 12), -}; - -/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ -#define IORING_URING_CMD_CANCELABLE (1U << 30) -#define IORING_URING_CMD_POLLED (1U << 31)
- -struct io_uring_cmd { - struct file *file; - const struct io_uring_sqe *sqe; - union { - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); - /* used for polled completion */ - void *cookie; - }; - u32 cmd_op; - u32 flags; - u8 pdu[32]; /* available inline for free use */ -}; - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} - #if defined(CONFIG_IO_URING) -int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, - unsigned issue_flags); -struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), - unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { @@ -89,32 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); -void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) -{ - return -EOPNOTSUPP; -} -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) -{ -} -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline struct sock *io_uring_get_socket(struct file *file) -{ - return NULL; -} static inline void io_uring_task_cancel(void) { } @@ -133,13 +50,9 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ -} -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +static inline bool io_is_uring_fops(struct file *file) { - return NULL; + return false; } #endif diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h new file mode 100644 index 000000000000..e453a997c060 --- /dev/null +++ b/include/linux/io_uring/cmd.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_CMD_H +#define _LINUX_IO_URING_CMD_H + +#include <uapi/linux/io_uring.h> +#include <linux/io_uring_types.h> + +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) + +struct 
io_uring_cmd { + struct file *file; + const struct io_uring_sqe *sqe; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + u32 cmd_op; + u32 flags; + u8 pdu[32]; /* available inline for free use */ +}; + +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); + +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); + +#else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2, unsigned issue_flags) +{ +} +static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) +{ +} +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +#endif + +/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); +} + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} + +#endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 239a4f68801b..854ad67a5f70 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,37 @@ #include <linux/llist.h> #include <uapi/linux/io_uring.h> +enum { + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used with requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+ */ + IOU_F_TWQ_LAZY_WAKE = 1, +}; + +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* the request is executed from poll, it should not be freed */ + IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = (1 << 8), + IO_URING_F_CQE32 = (1 << 9), + IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), + IO_URING_F_COMPAT = (1 << 12), +}; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -358,9 +389,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif /* hashed buffered write serialization */ struct io_wq_hash *hash_map; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 14f5cfabbbc8..db7fe25f3370 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -331,6 +331,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h index 1b06d074ade0..e3649a6563dd 100644 --- a/include/linux/iosys-map.h +++ b/include/linux/iosys-map.h @@ -168,9 +168,9 @@ struct iosys_map { * about the use of uninitialized variable. */ #define IOSYS_MAP_INIT_OFFSET(map_, offset_) ({ \ - struct iosys_map copy = *map_; \ - iosys_map_incr(©, offset_); \ - copy; \ + struct iosys_map copy_ = *map_; \ + iosys_map_incr(©_, offset_); \ + copy_; \ }) /** @@ -391,14 +391,14 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset, * Returns: * The value read from the mapping. */ -#define iosys_map_rd(map__, offset__, type__) ({ \ - type__ val; \ - if ((map__)->is_iomem) { \ - __iosys_map_rd_io(val, (map__)->vaddr_iomem + (offset__), type__);\ - } else { \ - __iosys_map_rd_sys(val, (map__)->vaddr + (offset__), type__); \ - } \ - val; \ +#define iosys_map_rd(map__, offset__, type__) ({ \ + type__ val_; \ + if ((map__)->is_iomem) { \ + __iosys_map_rd_io(val_, (map__)->vaddr_iomem + (offset__), type__); \ + } else { \ + __iosys_map_rd_sys(val_, (map__)->vaddr + (offset__), type__); \ + } \ + val_; \ }) /** @@ -413,13 +413,13 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset, * or if pointer may be unaligned (and problematic for the architecture * supported), use iosys_map_memcpy_to() */ -#define iosys_map_wr(map__, offset__, type__, val__) ({ \ - type__ val = (val__); \ - if ((map__)->is_iomem) { \ - __iosys_map_wr_io(val, (map__)->vaddr_iomem + (offset__), type__);\ - } else { \ - __iosys_map_wr_sys(val, (map__)->vaddr + (offset__), type__); \ - } \ +#define iosys_map_wr(map__, offset__, type__, val__) ({ \ + type__ val_ = (val__); \ + if ((map__)->is_iomem) { \ + __iosys_map_wr_io(val_, (map__)->vaddr_iomem + (offset__), type__); \ + } else { \ + __iosys_map_wr_sys(val_, (map__)->vaddr + (offset__), type__); \ + } \ }) /** @@ -485,9 +485,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset, * The value read from the mapping. 
*/ #define iosys_map_rd_field(map__, struct_offset__, struct_type__, field__) ({ \ - struct_type__ *s; \ + struct_type__ *s_; \ iosys_map_rd(map__, struct_offset__ + offsetof(struct_type__, field__), \ - typeof(s->field__)); \ + typeof(s_->field__)); \ }) /** @@ -508,9 +508,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset, * usage and memory layout. */ #define iosys_map_wr_field(map__, struct_offset__, struct_type__, field__, val__) ({ \ - struct_type__ *s; \ + struct_type__ *s_; \ iosys_map_wr(map__, struct_offset__ + offsetof(struct_type__, field__), \ - typeof(s->field__), val__); \ + typeof(s_->field__), val__); \ }) #endif /* __IOSYS_MAP_H__ */ diff --git a/include/linux/ipc.h b/include/linux/ipc.h index e1c9eea6015b..9b1434247aab 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -2,7 +2,7 @@ #ifndef _LINUX_IPC_H #define _LINUX_IPC_H -#include <linux/spinlock.h> +#include <linux/spinlock_types.h> #include <linux/uidgid.h> #include <linux/rhashtable-types.h> #include <uapi/linux/ipc.h> diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 8cd11a223260..136f2980cba3 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -66,6 +66,9 @@ void irq_work_sync(struct irq_work *work); void irq_work_run(void); bool irq_work_needs_cpu(void); void irq_work_single(void *arg); + +void arch_irq_work_raise(void); + #else static inline bool irq_work_needs_cpu(void) { return false; } static inline void irq_work_run(void) { } diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 2b665c32f5fe..147feebd508c 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,6 +12,7 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#include <linux/irqflags_types.h> #include <linux/typecheck.h> #include <linux/cleanup.h> #include <asm/irqflags.h> @@ -34,19 +35,6 @@ #ifdef CONFIG_TRACE_IRQFLAGS -/* Per-task IRQ trace events information. */ -struct irqtrace_events { - unsigned int irq_events; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; -}; - DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); diff --git a/include/linux/irqflags_types.h b/include/linux/irqflags_types.h new file mode 100644 index 000000000000..c13f0d915097 --- /dev/null +++ b/include/linux/irqflags_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQFLAGS_TYPES_H +#define _LINUX_IRQFLAGS_TYPES_H + +#ifdef CONFIG_TRACE_IRQFLAGS + +/* Per-task IRQ trace events information. 
*/ +struct irqtrace_events { + unsigned int irq_events; + unsigned long hardirq_enable_ip; + unsigned long hardirq_disable_ip; + unsigned int hardirq_enable_event; + unsigned int hardirq_disable_event; + unsigned long softirq_disable_ip; + unsigned long softirq_enable_ip; + unsigned int softirq_disable_event; + unsigned int softirq_enable_event; +}; + +#endif + +#endif /* _LINUX_IRQFLAGS_TYPES_H */ diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a4c204df3da..5428edd90982 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -86,7 +86,6 @@ int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); -u8 *ism_get_seid(void); const struct smcd_ops *ism_get_smcd_ops(void); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index beb30719ee16..971f3e826e15 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -756,11 +756,6 @@ struct journal_s unsigned long j_flags; /** - * @j_atomic_flags: Atomic journaling state flags. - */ - unsigned long j_atomic_flags; - - /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior @@ -999,6 +994,13 @@ struct journal_s struct block_device *j_fs_dev; /** + * @j_fs_dev_wb_err: + * + * Records the errseq of the client fs's backing block device. + */ + errseq_t j_fs_dev_wb_err; + + /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; @@ -1400,12 +1402,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) JBD2_JOURNAL_FLUSH_ZEROOUT) /* - * Journal atomic flag definitions - */ -#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing - * buffer back to disk */ - -/* * Function declarations for the journaling transaction and buffer * management */ @@ -1698,6 +1694,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle) handle->h_aborted = 1; } +static inline void jbd2_init_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + /* + * Save the original wb_err value of client fs's bdev mapping which + * could be used to detect the client fs's metadata async write error. + */ + errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); +} + +static inline int jbd2_check_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + return errseq_check(&mapping->wb_err, + READ_ONCE(journal->j_fs_dev_wb_err)); +} + #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72cb693b075b..dbb06d789e74 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -4,6 +4,7 @@ #include <linux/bug.h> #include <linux/kasan-enabled.h> +#include <linux/kasan-tags.h> #include <linux/kernel.h> #include <linux/static_key.h> #include <linux/types.h> @@ -129,20 +130,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab) __kasan_poison_slab(slab); } -void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, +void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_unpoison_new_object - Temporarily unpoison a new slab object. + * @cache: Cache the object belongs to.
+ * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * temporarily unpoisons an object from a newly allocated slab without doing + * anything else. The object must later be repoisoned by + * kasan_poison_new_object(). + */ +static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_unpoison_object_data(cache, object); + __kasan_unpoison_new_object(cache, object); } -void __kasan_poison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, +void __kasan_poison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_poison_new_object - Repoison a new slab object. + * @cache: Cache the object belongs to. + * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * repoisons an object that was previously unpoisoned by + * kasan_unpoison_new_object() without doing anything else. + */ +static __always_inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_poison_object_data(cache, object); + __kasan_poison_new_object(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, @@ -172,13 +192,6 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } -void __kasan_slab_free_mempool(void *ptr, unsigned long ip); -static __always_inline void kasan_slab_free_mempool(void *ptr) -{ - if (kasan_enabled()) - __kasan_slab_free_mempool(ptr, _RET_IP_); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( @@ -219,6 +232,113 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } +bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_poison_pages - Check and poison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function is similar to kasan_mempool_poison_object() but operates on + * page allocations. + * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_pages(). + * + * Return: true if the allocation can be safely reused; false otherwise. + */ +static __always_inline bool kasan_mempool_poison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + return __kasan_mempool_poison_pages(page, order, _RET_IP_); + return true; +} + +void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function unpoisons a page allocation that was previously poisoned by + * kasan_mempool_poison_pages() without zeroing the allocation's memory. For + * the tag-based modes, this function assigns a new tag to the allocation.
+ */ +static __always_inline void kasan_mempool_unpoison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_pages(page, order, _RET_IP_); +} + +bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); +/** + * kasan_mempool_poison_object - Check and poison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). + * + * This function poisons a slab allocation and saves a free stack trace for it + * without initializing the allocation's memory and without putting it into the + * quarantine (for the Generic mode). + * + * This function also performs checks to detect double-free and invalid-free + * bugs and reports them. The caller can use the return value of this function + * to find out if the allocation is buggy. + * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_object(). + * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + * + * Return: true if the allocation can be safely reused; false otherwise. + */ +static __always_inline bool kasan_mempool_poison_object(void *ptr) +{ + if (kasan_enabled()) + return __kasan_mempool_poison_object(ptr, _RET_IP_); + return true; +} + +void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); +/** + * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * @size: Size to be unpoisoned. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). + * + * This function unpoisons a slab allocation that was previously poisoned via + * kasan_mempool_poison_object() and saves an alloc stack trace for it without + * initializing the allocation's memory. For the tag-based modes, this function + * does not assign a new tag to the allocation and instead restores the + * original tags based on the pointer value. + * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + */ +static __always_inline void kasan_mempool_unpoison_object(void *ptr, + size_t size) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_object(ptr, size, _RET_IP_); +} + /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. 
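Together with the kasan_mempool_poison_object()/kasan_mempool_unpoison_object() pair documented above, these page hooks replace the single kasan_slab_free_mempool() removed earlier in this hunk. A sketch of an element cache using the object variants; struct pool and its push/pop helpers are hypothetical stand-ins for mempool internals:

static void pool_put(struct pool *p, void *element)
{
	/*
	 * Poison the element and run double-free/invalid-free checks. On
	 * failure KASAN has already printed a report; drop the buggy
	 * element rather than caching it.
	 */
	if (kasan_mempool_poison_object(element))
		pool_push(p, element);
}

static void *pool_get(struct pool *p, size_t size)
{
	void *element = pool_pop(p);

	if (element)
		/* Make it usable again; memory contents are preserved. */
		kasan_mempool_unpoison_object(element, size);
	return element;
}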
@@ -242,9 +362,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, return false; } static inline void kasan_poison_slab(struct slab *slab) {} -static inline void kasan_unpoison_object_data(struct kmem_cache *cache, +static inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) {} -static inline void kasan_poison_object_data(struct kmem_cache *cache, +static inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) @@ -256,7 +376,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init return false; } static inline void kasan_kfree_large(void *ptr) {} -static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { @@ -276,6 +395,17 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } +static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order) +{ + return true; +} +static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {} +static inline bool kasan_mempool_poison_object(void *ptr) +{ + return true; +} +static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {} + static inline bool kasan_check_byte(const void *address) { return true; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8227455192b7..400cb6c02176 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; @@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; } static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } #endif +extern bool kexec_file_dbg_print; + +#define kexec_dprintk(fmt, ...) \ + printk("%s" fmt, \ + kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \ + ##__VA_ARGS__) + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 401af4757514..88100cc9caba 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -223,6 +223,8 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla #else /* CONFIG_KFENCE */ +#define kfence_sample_interval (0) + static inline bool is_kfence_address(const void *addr) { return false; } static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 8bfa6c98176d..929287981afe 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -9,6 +9,8 @@ #ifndef _LINUX_KMSAN_TYPES_H #define _LINUX_KMSAN_TYPES_H +#include <linux/types.h> + /* These constants are defined in the MSan LLVM instrumentation pass. 
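The kexec_dprintk() macro introduced above picks its loglevel at run time: messages go out at KERN_DEBUG normally, but are promoted to KERN_INFO when userspace passed the newly permitted KEXEC_FILE_DEBUG flag to kexec_file_load(), which sets kexec_file_dbg_print. A usage sketch, assuming a struct kimage pointer in scope:

/* Logged at KERN_DEBUG normally, KERN_INFO under KEXEC_FILE_DEBUG. */
kexec_dprintk("kernel: %p kernel_size: %#lx\n",
	      image->kernel_buf, image->kernel_buf_len);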
*/ #define KMSAN_RETVAL_SIZE 800 #define KMSAN_PARAM_SIZE 800 diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c2dd786a30e1..401348e9f92b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,8 +76,8 @@ static inline void ksm_exit(struct mm_struct *mm) * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ -struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); +struct folio *ksm_might_need_to_copy(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); @@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) +static inline struct folio *ksm_might_need_to_copy(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr) { - return page; + return folio; } static inline void rmap_walk_ksm(struct folio *folio, diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 73f20deb497d..3a4e723eae0f 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -21,12 +21,10 @@ #ifndef _LINUX_KTIME_H #define _LINUX_KTIME_H -#include <linux/time.h> -#include <linux/jiffies.h> #include <asm/bug.h> - -/* Nanosecond scalar representation for kernel time values */ -typedef s64 ktime_t; +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/types.h> /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 7303b4bc2ce0..287f590ed56b 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst) bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline void linkmode_fill(unsigned long *dst) +{ + bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + static inline void linkmode_copy(unsigned long *dst, const unsigned long *src) { bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/include/linux/list.h b/include/linux/list.h index 1837caedf723..059aa1fff41e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1119,6 +1119,26 @@ static inline void hlist_move_list(struct hlist_head *old, old->first = NULL; } +/** + * hlist_splice_init() - move all entries from one list to another + * @from: hlist_head from which entries will be moved + * @last: last entry on the @from list + * @to: hlist_head to which entries will be moved + * + * @to can be empty, @from must contain at least @last. 
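hlist_splice_init(), whose kernel-doc ends here and whose implementation follows just below, transfers a whole hlist in O(1); because hlist_head does not record its tail, the caller must already know the last node. A hypothetical caller:

/* Drain @pending into @work in O(1): pending's entries end up at the
 * head of work in their original order, and pending becomes empty.
 * The producer is assumed to track @pending_tail itself. */
static void drain_pending(struct hlist_head *pending,
			  struct hlist_node *pending_tail,
			  struct hlist_head *work)
{
	if (!hlist_empty(pending))
		hlist_splice_init(pending, pending_tail, work);
}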
+ */ +static inline void hlist_splice_init(struct hlist_head *from, + struct hlist_node *last, + struct hlist_head *to) +{ + if (to->first) + to->first->pprev = &last->next; + last->next = to->first; + to->first = from->first; + from->first->pprev = &to->first; + from->first = NULL; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index b35968ee9fb5..7675a48a0701 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -73,8 +73,10 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren /** * list_lru_add: add an element to the lru list's tail - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be added. + * @nid: the node id of the sublist to add the item to. + * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns without doing * anything. Therefore the caller does not need to keep state about whether or @@ -83,24 +85,54 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * the caller organizes itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it - * to @list_lru + * to @lru. + * + * Return: true if the list was updated, false otherwise + */ +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_add_obj: add an element to the lru list's tail + * @lru: the lru pointer + * @item: the item to be added. + * + * This function is similar to list_lru_add(), but the NUMA node and the + * memcg of the sublist are determined by the @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. * * Return value: true if the list was updated, false otherwise */ -bool list_lru_add(struct list_lru *lru, struct list_head *item); +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); /** - * list_lru_del: delete an element to the lru list - * @list_lru: the lru pointer + * list_lru_del: delete an element from the lru list + * @lru: the lru pointer * @item: the item to be deleted. + * @nid: the node id of the sublist to delete the item from. + * @memcg: the cgroup of the sublist to delete the item from. * - * This function works analogously as list_lru_add in terms of list + * This function works analogously to list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del. + * a list are also valid for list_lru_del(). * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ -bool list_lru_del(struct list_lru *lru, struct list_head *item); +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_del_obj: delete an element from the lru list + * @lru: the lru pointer + * @item: the item to be deleted. + * + * This function is similar to list_lru_del(), but the NUMA node and the + * memcg of the sublist are determined by the @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise.
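The list_lru API is split here: list_lru_add()/list_lru_del() now take the NUMA node and memcg explicitly, while the new *_obj variants (such as list_lru_del_obj(), declared just below) derive both from the object's own address, which holds for slab-backed objects like dentries and inodes. A sketch of the object variant, with a hypothetical entry type:

/* Hypothetical slab object with embedded LRU linkage. */
struct cache_entry {
	struct list_head lru;
	/* ... payload ... */
};

static void entry_make_reclaimable(struct list_lru *lru,
				   struct cache_entry *e)
{
	/* The node id and memcg are derived from e's backing slab page. */
	if (!list_lru_add_obj(lru, &e->lru))
		pr_debug("entry was already on an LRU\n");
}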
+ */ +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru @@ -108,9 +140,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * @nid: the node id to count from. * @memcg: the cgroup to count from. * - * Always return a non-negative number, 0 for empty lists. There is no - * guarantee that the list is not updated while the count is being computed. - * Callers that want such a guarantee need to provide an outer lock. + * There is no guarantee that the list is not updated while the count is being + * computed. Callers that want such a guarantee need to provide an outer lock. + * + * Return: 0 for empty lists, otherwise the number of objects + * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); @@ -136,12 +170,28 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); +/** + * list_lru_putback: undo list_lru_isolate + * @lru: the lru pointer. + * @item: the item to put back. + * @nid: the node id of the sublist to put the item back to. + * @memcg: the cgroup of the sublist to put the item back to. + * + * Put back an isolated item into its original LRU. Note that unlike + * list_lru_add, this does not increment the node LRU count (as + * list_lru_isolate does not originally decrement this count). + * + * Since we might have dropped the LRU lock in between, recompute list_lru_one + * from the node's id and memcg. + */ +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -150,24 +200,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * This function will scan all elements in a particular list_lru, calling the + * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback - * will return an enum lru_status telling the list_lru infrastructure what to + * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * - * Please note that nr_to_walk does not mean how many objects will be freed, + * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * - * Return value: the number of objects effectively removed from the LRU. + * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** - * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. 
+ * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -176,7 +226,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * Same as @list_lru_walk_one except that the spinlock is acquired with + * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index dc2844b071c2..08b0d1d9d78b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -82,63 +82,6 @@ struct lock_chain { u64 chain_key; }; -#define MAX_LOCKDEP_KEYS_BITS 13 -#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define INITIAL_CHAIN_KEY -1 - -struct held_lock { - /* - * One-way hash of the dependency chain up to this point. We - * hash the hashes step by step as the dependency chain grows. - * - * We use it for dependency-caching and we skip detection - * passes and dependency-updates if there is a cache-hit, so - * it is absolutely critical for 100% coverage of the validator - * to have a unique key value for every unique dependency path - * that can occur in the system, to make a unique hash value - * as likely as possible - hence the 64-bit width. - * - * The task struct holds the current hash value (initialized - * with zero), here we store the previous hash value: - */ - u64 prev_chain_key; - unsigned long acquire_ip; - struct lockdep_map *instance; - struct lockdep_map *nest_lock; -#ifdef CONFIG_LOCK_STAT - u64 waittime_stamp; - u64 holdtime_stamp; -#endif - /* - * class_idx is zero-indexed; it points to the element in - * lock_classes this held lock instance belongs to. class_idx is in - * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. - */ - unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; - /* - * The lock-stack is unified in that the lock chains of interrupt - * contexts nest ontop of process context chains, but we 'separate' - * the hashes by starting with 0 if we cross into an interrupt - * context, and we also keep do not add cross-context lock - * dependencies - the lock usage graph walking covers that area - * anyway, and we'd just unnecessarily increase the number of - * dependencies otherwise. [Note: hardirq and softirq contexts - * are separated from each other too.] 
- * - * The following field is used to detect when we cross into an - * interrupt context: - */ - unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ - unsigned int trylock:1; /* 16 bits */ - - unsigned int read:2; /* see lock_acquire() comment */ - unsigned int check:1; /* see lock_acquire() comment */ - unsigned int hardirqs_off:1; - unsigned int sync:1; - unsigned int references:11; /* 32 bits */ - unsigned int pin_count; -}; - /* * Initialization, self-test and debugging-output methods: */ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ebc323d345a..70d30d40ea4a 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -127,12 +127,12 @@ struct lock_class { unsigned long usage_mask; const struct lock_trace *usage_traces[LOCK_TRACE_STATES]; + const char *name; /* * Generation counter, when doing certain classes of graph walking, * to ensure that we check one node only once: */ int name_version; - const char *name; u8 wait_type_inner; u8 wait_type_outer; @@ -198,6 +198,63 @@ struct lockdep_map { struct pin_cookie { unsigned int val; }; +#define MAX_LOCKDEP_KEYS_BITS 13 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) +#define INITIAL_CHAIN_KEY -1 + +struct held_lock { + /* + * One-way hash of the dependency chain up to this point. We + * hash the hashes step by step as the dependency chain grows. + * + * We use it for dependency-caching and we skip detection + * passes and dependency-updates if there is a cache-hit, so + * it is absolutely critical for 100% coverage of the validator + * to have a unique key value for every unique dependency path + * that can occur in the system, to make a unique hash value + * as likely as possible - hence the 64-bit width. + * + * The task struct holds the current hash value (initialized + * with zero), here we store the previous hash value: + */ + u64 prev_chain_key; + unsigned long acquire_ip; + struct lockdep_map *instance; + struct lockdep_map *nest_lock; +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif + /* + * class_idx is zero-indexed; it points to the element in + * lock_classes this held lock instance belongs to. class_idx is in + * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. + */ + unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; + /* + * The lock-stack is unified in that the lock chains of interrupt + * contexts nest ontop of process context chains, but we 'separate' + * the hashes by starting with 0 if we cross into an interrupt + * context, and we also keep do not add cross-context lock + * dependencies - the lock usage graph walking covers that area + * anyway, and we'd just unnecessarily increase the number of + * dependencies otherwise. [Note: hardirq and softirq contexts + * are separated from each other too.] 
+ * + * The following field is used to detect when we cross into an + * interrupt context: + */ + unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ + unsigned int trylock:1; /* 16 bits */ + + unsigned int read:2; /* see lock_acquire() comment */ + unsigned int check:1; /* see lock_acquire() comment */ + unsigned int hardirqs_off:1; + unsigned int sync:1; + unsigned int references:11; /* 32 bits */ + unsigned int pin_count; +}; + #else /* !CONFIG_LOCKDEP */ /* diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..185924c56378 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -171,6 +171,8 @@ LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) +LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, + unsigned long arg) LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) @@ -262,6 +264,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) +LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, + struct lsm_ctx __user *ctx, size_t *size, u32 flags) +LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, + struct lsm_ctx *ctx, size_t size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index dcb5e5b5eb13..a2ade0ffe9e7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,6 +25,7 @@ #ifndef __LINUX_LSM_HOOKS_H #define __LINUX_LSM_HOOKS_H +#include <uapi/linux/lsm.h> #include <linux/security.h> #include <linux/init.h> #include <linux/rculist.h> @@ -42,6 +43,18 @@ struct security_hook_heads { #undef LSM_HOOK } __randomize_layout; +/** + * struct lsm_id - Identify a Linux Security Module. + * @name: name of the LSM, must be approved by the LSM maintainers + * @id: LSM ID number from uapi/linux/lsm.h + * + * Contains the information that identifies the LSM. + */ +struct lsm_id { + const char *name; + u64 id; +}; + /* * Security module hook list structure. * For use with generic list macros for common operations.
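With struct lsm_id above, each security module now registers a stable name/ID pair (the IDs live in uapi/linux/lsm.h) instead of a bare name string; the reworked security_add_hooks() in the next hunk takes it, and the new getselfattr/setselfattr hooks report these IDs to userspace. A registration sketch modeled on how in-tree modules adopt this, with the hook table itself elided:

static const struct lsm_id apparmor_lsmid = {
	.name = "apparmor",
	.id = LSM_ID_APPARMOR,	/* this module's ID from uapi/linux/lsm.h */
};

static int __init apparmor_lsm_init(void)
{
	security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
			   &apparmor_lsmid);
	return 0;
}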
@@ -50,7 +63,7 @@ struct security_hook_list { struct hlist_node list; struct hlist_head *head; union security_list_options hook; - const char *lsm; + const struct lsm_id *lsmid; } __randomize_layout; /* @@ -104,7 +117,7 @@ extern struct security_hook_heads security_hook_heads; extern char *lsm_names; extern void security_add_hooks(struct security_hook_list *hooks, int count, - const char *lsm); + const struct lsm_id *lsmid); #define LSM_FLAG_LEGACY_MAJOR BIT(0) #define LSM_FLAG_EXCLUSIVE BIT(1) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index d01e850b570f..b3d63123b945 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -256,6 +256,8 @@ struct maple_tree { struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_lock_nested(mas, subclass) \ + spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* @@ -327,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); + void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); @@ -345,6 +350,36 @@ static inline bool mtree_empty(const struct maple_tree *mt) /* Advanced API */ /* + * Maple State Status + * ma_active means the maple state is pointing to a node and offset and can + * continue operating on the tree. + * ma_start means we have not searched the tree. + * ma_root means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * ma_none means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * ma_pause means the data within the maple state may be stale, restart the + * operation + * ma_overflow means the search has reached the upper limit of the search + * ma_underflow means the search has reached the lower limit of the search + * ma_error means there was an error, check the node for the error number. + */ +enum maple_status { + ma_active, + ma_start, + ma_root, + ma_none, + ma_pause, + ma_overflow, + ma_underflow, + ma_error, +}; + +/* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. @@ -376,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt) * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. + * + * The status of the state is used to determine how the next action should treat + * the state. For instance, if the status is ma_start then the next action + * should start at the root of the tree and walk down. If the status is + * ma_pause then the node may be stale data and should be discarded. If the + * status is ma_overflow, then the last action hit the upper limit. 
+ * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ @@ -385,9 +427,11 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ + enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; + unsigned char end; /* The end of the node */ }; struct ma_wr_state { @@ -397,7 +441,6 @@ struct ma_wr_state { unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ - unsigned char node_end; /* mas->node end */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ @@ -406,30 +449,16 @@ struct ma_wr_state { }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_lock_nested(mas, subclass) \ + spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) - /* * Special values for ma_state.node. - * MAS_START means we have not searched the tree. - * MAS_ROOT means we have searched the tree and the entry we found lives in - * the root of the tree (ie it has index 0, length 1 and is the only entry in - * the tree). - * MAS_NONE means we have searched the tree and there is no node in the - * tree for this entry. For example, we searched for index 1 in an empty - * tree. Or we have a tree which points to a full leaf node and we - * searched for an entry which is larger than can be contained in that - * leaf node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. 
*/ -#define MAS_START ((struct maple_enode *)1UL) -#define MAS_ROOT ((struct maple_enode *)5UL) -#define MAS_NONE ((struct maple_enode *)9UL) -#define MAS_PAUSE ((struct maple_enode *)17UL) -#define MAS_OVERFLOW ((struct maple_enode *)33UL) -#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) @@ -438,7 +467,8 @@ struct ma_wr_state { .tree = mt, \ .index = first, \ .last = end, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ @@ -469,7 +499,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); -bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); @@ -498,28 +527,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; - mas->node = MAS_START; -} - -/* Checks if a mas has not found anything */ -static inline bool mas_is_none(const struct ma_state *mas) -{ - return mas->node == MAS_NONE; + mas->status = ma_start; + mas->node = NULL; } -/* Checks if a mas has been paused */ -static inline bool mas_is_paused(const struct ma_state *mas) +static inline bool mas_is_active(struct ma_state *mas) { - return mas->node == MAS_PAUSE; + return mas->status == ma_active; } -/* Check if the mas is pointing to a node or not */ -static inline bool mas_is_active(struct ma_state *mas) +static inline bool mas_is_err(struct ma_state *mas) { - if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) - return true; - - return false; + return mas->status == ma_error; } /** @@ -532,9 +551,10 @@ static inline bool mas_is_active(struct ma_state *mas) * * Context: Any context. 
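The sentinel node pointers (MAS_START and friends) removed above become the explicit enum maple_status field, so predicates such as mas_is_err() are now plain status comparisons. A sketch of the advanced API under the new scheme, assuming a maple tree populated elsewhere and xa_mk_value() from <linux/xarray.h>:

static int demo(struct maple_tree *tree)
{
	MA_STATE(mas, tree, 0, 0);	/* starts with .status == ma_start */
	void *entry;

	rcu_read_lock();
	mas_for_each(&mas, entry, ULONG_MAX)
		pr_info("[%lu, %lu] -> %p\n", mas.index, mas.last, entry);
	rcu_read_unlock();

	/* mas_set_range() now resets the status to ma_start as well. */
	mas_set_range(&mas, 10, 20);
	mas_lock(&mas);
	if (mas_store_gfp(&mas, xa_mk_value(42), GFP_KERNEL))
		pr_err("store failed\n");	/* mas_is_err(&mas) is true */
	mas_unlock(&mas);
	return 0;
}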
*/ -static inline void mas_reset(struct ma_state *mas) +static __always_inline void mas_reset(struct ma_state *mas) { - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } /** @@ -550,6 +570,131 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) + +#ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); +void mt_validate(struct maple_tree *mt); +void mt_cache_shrink(void); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) +#else +#define MT_BUG_ON(__tree, __x) 
BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, __x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. @@ -563,6 +708,9 @@ static inline void mas_reset(struct ma_state *mas) static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { + /* Ensure the range starts within the current slot */ + MAS_WARN_ON(mas, mas_is_active(mas) && + (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } @@ -580,8 +728,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { + mas_reset(mas); __mas_set_range(mas, start, last); - mas->node = MAS_START; } /** @@ -706,129 +854,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) - -#ifdef CONFIG_DEBUG_MAPLE_TREE -enum mt_dump_format { - mt_dump_dec, - mt_dump_hex, -}; - -extern atomic_t maple_tree_tests_run; -extern atomic_t maple_tree_tests_passed; - -void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); -void mas_dump(const struct ma_state *mas); -void mas_wr_dump(const struct ma_wr_state *wr_mas); -void mt_validate(struct maple_tree *mt); -void mt_cache_shrink(void); -#define MT_BUG_ON(__tree, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_BUG_ON(__mas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_WR_BUG_ON(__wrmas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MT_WARN_ON(__tree, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WARN_ON(__mas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - 
__func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) -#else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) -#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) -#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) -#define MT_WARN_ON(__tree, __x) WARN_ON(__x) -#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) -#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) -#endif /* CONFIG_DEBUG_MAPLE_TREE */ - #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 007fd9c3e4b6..79ceee3c8673 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -38,6 +38,7 @@ struct mdio_device { /* Bus address of the MDIO device (0-31) */ int addr; int flags; + int reset_state; struct gpio_desc *reset_gpio; struct reset_control *reset_ctrl; unsigned int reset_assert_delay; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ae3bde302f70..b695f9e946da 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); +bool memblock_validate_numa_coverage(unsigned long threshold_bytes); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bdcf3020d7a..20ff87f8e001 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,6 +219,12 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) unsigned long zswap_max; + + /* + * Prevent pages from this memcg from being written back from zswap to + * swap, and from being swapped out on zswap store failures. 
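The new zswap_writeback flag above, surfaced to userspace in this series as the memory.zswap.writeback cgroup file, lets a cgroup forbid zswap from writing its pages to the backing swap device; mem_cgroup_zswap_writeback_enabled(), declared later in this diff, is the accessor, and its !CONFIG stub returns true. A hypothetical reclaim-side guard:

static bool zswap_may_writeback(struct mem_cgroup *memcg)
{
	/* True unless the owning cgroup disabled zswap writeback. */
	return mem_cgroup_zswap_writeback_enabled(memcg);
}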
+ */ + bool zswap_writeback; #endif unsigned long soft_limit; @@ -324,7 +330,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif @@ -821,6 +827,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return !memcg || css_tryget(&memcg->css); } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget_online(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1046,8 +1057,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } -void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats(struct mem_cgroup *memcg); +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1187,6 +1198,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; @@ -1349,6 +1365,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return true; } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } @@ -1548,11 +1569,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void mem_cgroup_flush_stats(void) +static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } @@ -1926,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { @@ -1939,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } +static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return true; +} #endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4aae6c06c5f2..7be1e32e6d42 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; +extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1314d9c5f05b..744c830f4b13 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct 
dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, @@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) return false; } -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - return 0; -} - -static inline void vmem_altmap_free(struct vmem_altmap *altmap, - unsigned long nr_pfns) -{ -} - /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index fa940bbaf8ae..26b04f73f214 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -9,6 +9,7 @@ #include <linux/device.h> #include <linux/ethtool.h> #include <linux/skbuff.h> +#include <linux/net_tstamp.h> struct phy_device; @@ -51,7 +52,8 @@ struct mii_timestamper { struct sk_buff *skb, int type); int (*hwtstamp)(struct mii_timestamper *mii_ts, - struct ifreq *ifreq); + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 820bca965fb6..01275c6e8468 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } -static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe) { return cqe->l4_l3_hdr_type & 0x1; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d2b8d4a74a30..7ee5b79ff3d6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -150,6 +150,7 @@ enum { MLX5_REG_MTPPSE = 0x9054, MLX5_REG_MTUTC = 0x9055, MLX5_REG_MPEGC = 0x9056, + MLX5_REG_MPIR = 0x9059, MLX5_REG_MCQS = 0x9060, MLX5_REG_MCQI = 0x9061, MLX5_REG_MCC = 0x9062, @@ -678,6 +679,8 @@ struct mlx5e_resources { struct mlx5_td td; u32 mkey; struct mlx5_sq_bfreg bfreg; +#define MLX5_MAX_NUM_TC 8 + u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3f7b664d625b..fee20fc010c2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -435,7 +435,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 flow_table_modify[0x1]; u8 reformat[0x1]; u8 decap[0x1]; - u8 reserved_at_9[0x1]; + u8 reset_root_to_default[0x1]; u8 pop_vlan[0x1]; u8 push_vlan[0x1]; u8 reserved_at_c[0x1]; @@ -1801,7 +1801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_uc[0x1]; u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; - u8 reserved_at_3e8[0x2]; + u8 reserved_at_3e8[0x1]; + u8 silent_mode[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; @@ -1818,7 +1819,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_460[0x1]; u8 ats[0x1]; - u8 reserved_at_462[0x1]; + u8 cross_vhca_rqt[0x1]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x1]; u8 crypto[0x1]; @@ -1943,6 +1944,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { enum { 
MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS = 0x80000, + MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE = (1ULL << 20), }; enum { @@ -1992,7 +1994,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_260[0x120]; u8 reserved_at_380[0x10]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0x460]; + + u8 reserved_at_3a0[0x10]; + u8 max_rqt_vhca_id[0x10]; + + u8 reserved_at_3c0[0x440]; }; enum mlx5_ifc_flow_destination_type { @@ -2151,6 +2157,13 @@ struct mlx5_ifc_rq_num_bits { u8 rq_num[0x18]; }; +struct mlx5_ifc_rq_vhca_bits { + u8 reserved_at_0[0x8]; + u8 rq_num[0x18]; + u8 reserved_at_20[0x10]; + u8 rq_vhca_id[0x10]; +}; + struct mlx5_ifc_mac_address_layout_bits { u8 reserved_at_0[0x10]; u8 mac_addr_47_32[0x10]; @@ -3901,7 +3914,10 @@ struct mlx5_ifc_rqtc_bits { u8 reserved_at_e0[0x6a0]; - struct mlx5_ifc_rq_num_bits rq_num[]; + union { + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_num_bits, rq_num); + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_vhca_bits, rq_vhca); + }; }; enum { @@ -4744,7 +4760,10 @@ struct mlx5_ifc_set_l2_table_entry_in_bits { u8 reserved_at_c0[0x20]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x10]; + u8 silent_mode_valid[0x1]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; @@ -10089,6 +10108,20 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +struct mlx5_ifc_mpir_reg_bits { + u8 sdm[0x1]; + u8 reserved_at_1[0x1b]; + u8 host_buses[0x4]; + + u8 reserved_at_20[0x20]; + + u8 local_port[0x8]; + u8 reserved_at_28[0x15]; + u8 sd_group[0x3]; + + u8 reserved_at_60[0x20]; +}; + enum { MLX5_MTUTC_FREQ_ADJ_UNITS_PPB = 0x0, MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM = 0x1, @@ -10103,7 +10136,10 @@ enum { struct mlx5_ifc_mtutc_reg_bits { u8 reserved_at_0[0x5]; u8 freq_adj_units[0x3]; - u8 reserved_at_8[0x14]; + u8 reserved_at_8[0x3]; + u8 log_max_freq_adjustment[0x5]; + + u8 reserved_at_10[0xc]; u8 operation[0x4]; u8 freq_adjustment[0x20]; diff --git a/include/linux/mm.h b/include/linux/mm.h index da5219b48d52..f5a97dec5169 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, return mas_expected_entries(&vmi->mas, count); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { @@ -1804,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) static inline u8 page_kasan_tag(const struct page *page) { - u8 tag = 0xff; + u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; @@ -1833,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) - page_kasan_tag_set(page, 0xff); + page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ @@ -1953,15 +1964,15 @@ static inline bool page_maybe_dma_pinned(struct page *page) * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. 
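The hunk just below converts page_needs_cow_for_dma() to folios, replacing the per-page pin check with folio_maybe_dma_pinned(). A sketch of the fork-time decision it serves (the wrapper is hypothetical; the real caller sits in the copy_present_pte() path in mm/memory.c):

/* If the source folio may be pinned for DMA, sharing it COW-style at
 * fork() could corrupt the pinner's view, so the caller copies it
 * eagerly instead of mapping it read-only in both processes. */
static bool must_copy_at_fork(struct vm_area_struct *src_vma,
			      struct folio *folio)
{
	/* Called under the PT lock with write_protect_seq held odd. */
	return folio_needs_cow_for_dma(src_vma, folio);
}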
*/ -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) +static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, + struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; - return page_maybe_dma_pinned(page); + return folio_maybe_dma_pinned(folio); } /** @@ -2373,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int generic_error_remove_page(struct address_space *mapping, struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); @@ -3859,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, @@ -3904,6 +3942,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..b2d3a88a34d1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -125,7 +125,7 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ @@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a); * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. - * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. Used for s390 page tables. + * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. 
Do not modify without a good @@ -438,7 +438,7 @@ struct ptdesc { #endif }; unsigned int __page_type; - atomic_t _refcount; + atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif @@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); -TABLE_MATCH(_refcount, _refcount); +TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif @@ -600,6 +600,9 @@ struct vma_numab_state { */ unsigned long pids_active[2]; + /* MM scan sequence ID when scan first started after VMA creation */ + int start_scan_seq; + /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq @@ -958,7 +961,7 @@ struct mm_struct { */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; @@ -973,7 +976,7 @@ struct mm_struct { struct mem_cgroup *memcg; #endif } lru_gen; -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* @@ -1011,11 +1014,13 @@ struct lru_gen_mm_list { spinlock_t lock; }; +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); -#ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm); -#endif static inline void lru_gen_init_mm(struct mm_struct *mm) { @@ -1036,7 +1041,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) WRITE_ONCE(mm->lru_gen.bitmap, -1); } -#else /* !CONFIG_LRU_GEN */ +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { @@ -1046,11 +1051,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm) { } -#ifdef CONFIG_MEMCG static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } -#endif static inline void lru_gen_init_mm(struct mm_struct *mm) { @@ -1060,7 +1063,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) { } -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; @@ -1071,7 +1074,8 @@ struct vma_iterator { .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ }, \ } diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index aa44fff8bb9d..a2f6179b672b 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -9,9 +9,6 @@ */ #include <linux/types.h> -#include <linux/threads.h> -#include <linux/atomic.h> -#include <linux/cpumask.h> #include <asm/page.h> @@ -36,6 +33,8 @@ enum { NR_MM_COUNTERS }; +struct page; + struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9db36e197712..4ed33b127821 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,18 +22,21 @@ #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> +#include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. 
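The mmzone.h hunk that follows renames MAX_ORDER to MAX_PAGE_ORDER and introduces NR_PAGE_ORDERS, retiring the error-prone "MAX_ORDER + 1" sizing idiom. A sketch of walking a zone's free lists under the new names, assuming a struct zone pointer obtained elsewhere:

static void count_free_pages(struct zone *zone,
			     unsigned long nr_free[NR_PAGE_ORDERS])
{
	unsigned int order;

	/* NR_PAGE_ORDERS == MAX_PAGE_ORDER + 1, covering orders 0..max. */
	for (order = 0; order < NR_PAGE_ORDERS; order++)
		nr_free[order] = zone->free_area[order].nr_free;
}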
*/ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 10 +#define MAX_PAGE_ORDER 10 #else -#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER +#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) +#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should @@ -95,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order <= MAX_ORDER; order++) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -207,6 +210,10 @@ enum node_stat_item { PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif + /* PGDEMOTE_*: pages demoted */ + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, NR_VM_NODE_STAT_ITEMS }; @@ -435,14 +442,12 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; -#ifdef CONFIG_MEMCG /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; -#endif }; enum { @@ -488,11 +493,6 @@ struct lru_gen_mm_walk { bool force_scan; }; -void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - -#ifdef CONFIG_MEMCG - /* * For each node, memcgs are divided into two generations: the old and the * young. 
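
The point of NR_PAGE_ORDERS above is that order-indexed arrays hold MAX_PAGE_ORDER + 1 entries, so loops and array declarations can use the conventional exclusive bound rather than the off-by-one-prone `<= MAX_ORDER` form this diff removes. A sketch of the pattern (assuming a valid zone pointer; compare the free_area declaration later in this diff):

	unsigned long nr_free[NR_PAGE_ORDERS];
	unsigned int order;

	for (order = 0; order < NR_PAGE_ORDERS; order++)
		nr_free[order] = zone->free_area[order].nr_free;
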
For each generation, memcgs are randomly sharded into multiple bins @@ -550,6 +550,8 @@ struct lru_gen_memcg { }; void lru_gen_init_pgdat(struct pglist_data *pgdat); +void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -558,19 +560,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); -#else /* !CONFIG_MEMCG */ - -#define MEMCG_NR_GENS 1 - -struct lru_gen_memcg { -}; - -static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -{ -} - -#endif /* CONFIG_MEMCG */ - #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) @@ -585,8 +574,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { } -#ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -611,8 +598,6 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } -#endif /* CONFIG_MEMCG */ - #endif /* CONFIG_LRU_GEN */ struct lruvec { @@ -635,12 +620,15 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif +#endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ @@ -947,10 +935,10 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER + 1]; + struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY - /* Pages to be accepted. All pages on the list are MAX_ORDER */ + /* Pages to be accepted. 
All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; #endif @@ -1760,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS -#error Allocator MAX_ORDER exceeds SECTION_SIZE +#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) @@ -1793,6 +1781,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { + struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif @@ -1986,7 +1975,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); - return test_bit(idx, ms->usage->subsection_map); + return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) @@ -2010,6 +1999,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; + int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the @@ -2023,13 +2013,19 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); - if (!valid_section(ms)) + rcu_read_lock(); + if (!valid_section(ms)) { + rcu_read_unlock(); return 0; + } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ - return early_section(ms) || pfn_section_valid(ms, pfn); + ret = early_section(ms) || pfn_section_valid(ms, pfn); + rcu_read_unlock(); + + return ret; } #endif diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b8da2db4ecd2..cd4d5c8781f5 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb); - #endif /* _LINUX_MNT_IDMAPPING_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a98e188cf37b..96bc462872c0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -540,6 +540,8 @@ struct module { struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) + int num_kunit_init_suites; + struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif @@ -668,7 +670,7 @@ extern void __module_get(struct module *module); * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. - * This call will fail if the module is already being removed. + * This call will fail if the module is in the process of being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. 
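
A note on the pfn_valid() change above (include/linux/mmzone.h): the rcu_read_lock() pairs with an RCU-deferred free of the section's usage map on the memory hot-remove side, which is also why struct mem_section_usage gained an rcu head and why pfn_section_valid() now reads ms->usage with READ_ONCE(). A sketch of the assumed remove-side pairing (simplified; section_deactivate() here stands in for the real sparse.c teardown path):

	static void section_deactivate(struct mem_section *ms)
	{
		struct mem_section_usage *usage = ms->usage;

		WRITE_ONCE(ms->usage, NULL);
		kfree_rcu(usage, rcu);	/* pfn_valid() readers inside
					 * rcu_read_lock() stay safe */
	}
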
This can be gauranteed through two means: diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 4fa9726bc328..bfb85fd13e1f 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2); */ extern bool parameqn(const char *name1, const char *name2, size_t n); +typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); + /* Called on module insert or kernel boot */ extern char *parse_args(const char *name, char *args, @@ -392,9 +394,7 @@ extern char *parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, - void *arg, - int (*unknown)(char *param, char *val, - const char *doing, void *arg)); + void *arg, parse_unknown_fn unknown); /* Called by module remove. */ #ifdef CONFIG_SYSFS diff --git a/include/linux/mount.h b/include/linux/mount.h index ac3dd2876197..c34c18b4e8f3 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,8 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ - MNT_CURSOR) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) #define MNT_INTERNAL 0x4000 @@ -65,7 +64,7 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_CURSOR 0x10000000 +#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index c29ace15a053..e84522e31301 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1003,6 +1003,8 @@ struct nand_op_parser { /** * struct nand_operation - NAND operation descriptor * @cs: the CS line to select for this NAND operation + * @deassert_wp: set to true when the operation requires the WP pin to be + * de-asserted (ERASE, PROG, ...) * @instrs: array of instructions to execute * @ninstrs: length of the @instrs array * @@ -1010,6 +1012,7 @@ struct nand_op_parser { */ struct nand_operation { unsigned int cs; + bool deassert_wp; const struct nand_op_instr *instrs; unsigned int ninstrs; }; @@ -1021,6 +1024,14 @@ struct nand_operation { .ninstrs = ARRAY_SIZE(_instrs), \ } +#define NAND_DESTRUCTIVE_OPERATION(_cs, _instrs) \ + { \ + .cs = _cs, \ + .deassert_wp = true, \ + .instrs = _instrs, \ + .ninstrs = ARRAY_SIZE(_instrs), \ + } + int nand_op_parser_exec_op(struct nand_chip *chip, const struct nand_op_parser *parser, const struct nand_operation *op, bool check_only); @@ -1104,6 +1115,7 @@ struct nand_controller_ops { * the bus without restarting an entire read operation nor * changing the column. * @supported_op.cont_read: The controller supports sequential cache reads. + * @controller_wp: the controller is in charge of handling the WP pin. 
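
The NAND_DESTRUCTIVE_OPERATION() macro above differs from NAND_OPERATION() only in setting .deassert_wp, letting the core drop the WP pin before destructive instructions reach the chip. A sketch of an erase built with it (assuming the usual NAND_OP_* instruction helpers from this header and a row address in `page`; the timeout value is illustrative):

	u8 addrs[3] = { page, page >> 8, page >> 16 };
	struct nand_op_instr instrs[] = {
		NAND_OP_CMD(NAND_CMD_ERASE1, 0),
		NAND_OP_ADDR(3, addrs, 0),
		NAND_OP_CMD(NAND_CMD_ERASE2, 0),
		NAND_OP_WAIT_RDY(400, 0),
	};
	struct nand_operation op = NAND_DESTRUCTIVE_OPERATION(chip->cur_cs, instrs);

	return nand_exec_op(chip, &op);
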
*/ struct nand_controller { struct mutex lock; @@ -1112,6 +1124,7 @@ struct nand_controller { unsigned int data_only_read: 1; unsigned int cont_read: 1; } supported_op; + bool controller_wp; }; static inline void nand_controller_init(struct nand_controller *nfc) @@ -1265,6 +1278,7 @@ struct nand_secure_region { * @cont_read: Sequential page read internals * @cont_read.ongoing: Whether a continuous read is ongoing or not * @cont_read.first_page: Start of the continuous read operation + * @cont_read.pause_page: End of the current sequential cache read operation * @cont_read.last_page: End of the continuous read operation * @controller: The hardware controller structure which is shared among multiple * independent devices @@ -1321,6 +1335,7 @@ struct nand_chip { struct { bool ongoing; unsigned int first_page; + unsigned int pause_page; unsigned int last_page; } cont_read; diff --git a/include/linux/mutex.h b/include/linux/mutex.h index a33aa9eb9fc3..7e208d46ba5b 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,7 @@ #include <linux/osq_lock.h> #include <linux/debug_locks.h> #include <linux/cleanup.h> +#include <linux/mutex_types.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -33,49 +34,6 @@ #ifndef CONFIG_PREEMPT_RT -/* - * Simple, straightforward mutexes with strict semantics: - * - * - only one task can hold the mutex at a time - * - only the owner can unlock the mutex - * - multiple unlocks are not permitted - * - recursive locking is not permitted - * - a mutex object must be initialized via the API - * - a mutex object must not be initialized via memset or copying - * - task may not exit with mutex held - * - memory areas where held locks reside must not be freed - * - held mutexes must not be reinitialized - * - mutexes may not be used in hardware or software interrupt - * contexts such as tasklets and timers - * - * These semantics are fully enforced when DEBUG_MUTEXES is - * enabled. Furthermore, besides enforcing the above rules, the mutex - * debugging code also implements a number of additional features - * that make lock debugging easier and faster: - * - * - uses symbolic names of mutexes, whenever they are printed in debug output - * - point-of-acquire tracking, symbolic lookup of function names - * - list of all locks held in the system, printout of them - * - owner tracking - * - detects self-recursing locks and prints out all relevant info - * - detects multi-task circular deadlocks and prints out all affected - * locks and tasks (and only those tasks) - */ -struct mutex { - atomic_long_t owner; - raw_spinlock_t wait_lock; -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_queue osq; /* Spinner MCS lock */ -#endif - struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifdef CONFIG_DEBUG_MUTEXES #define __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -131,14 +89,6 @@ extern bool mutex_is_locked(struct mutex *lock); /* * Preempt-RT variant based on rtmutexes. 
*/ -#include <linux/rtmutex.h> - -struct mutex { - struct rt_mutex_base rtmutex; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; #define __MUTEX_INITIALIZER(mutexname) \ { \ @@ -221,6 +171,7 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) -DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T)) +DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) +DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h new file mode 100644 index 000000000000..fdf7f515fde8 --- /dev/null +++ b/include/linux/mutex_types.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MUTEX_TYPES_H +#define __LINUX_MUTEX_TYPES_H + +#include <linux/atomic.h> +#include <linux/lockdep_types.h> +#include <linux/osq_lock.h> +#include <linux/spinlock_types.h> +#include <linux/types.h> + +#ifndef CONFIG_PREEMPT_RT + +/* + * Simple, straightforward mutexes with strict semantics: + * + * - only one task can hold the mutex at a time + * - only the owner can unlock the mutex + * - multiple unlocks are not permitted + * - recursive locking is not permitted + * - a mutex object must be initialized via the API + * - a mutex object must not be initialized via memset or copying + * - task may not exit with mutex held + * - memory areas where held locks reside must not be freed + * - held mutexes must not be reinitialized + * - mutexes may not be used in hardware or software interrupt + * contexts such as tasklets and timers + * + * These semantics are fully enforced when DEBUG_MUTEXES is + * enabled. Furthermore, besides enforcing the above rules, the mutex + * debugging code also implements a number of additional features + * that make lock debugging easier and faster: + * + * - uses symbolic names of mutexes, whenever they are printed in debug output + * - point-of-acquire tracking, symbolic lookup of function names + * - list of all locks held in the system, printout of them + * - owner tracking + * - detects self-recursing locks and prints out all relevant info + * - detects multi-task circular deadlocks and prints out all affected + * locks and tasks (and only those tasks) + */ +struct mutex { + atomic_long_t owner; + raw_spinlock_t wait_lock; +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + struct optimistic_spin_queue osq; /* Spinner MCS lock */ +#endif + struct list_head wait_list; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#else /* !CONFIG_PREEMPT_RT */ +/* + * Preempt-RT variant based on rtmutexes. 
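
Back in mutex.h, the DEFINE_GUARD_COND() lines replace the old DEFINE_FREE() helper with proper conditional guard classes (mutex_try, mutex_intr) for the <linux/cleanup.h> machinery. A short sketch of the interruptible form, assuming a driver-private structure with a mutex member (scoped_cond_guard() comes from cleanup.h in the same series):

	scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &priv->lock) {
		priv->state = new_state;	/* runs only with the lock held */
	}					/* dropped automatically on every exit path */
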
+ */ +#include <linux/rtmutex.h> + +struct mutex { + struct rt_mutex_base rtmutex; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#endif /* CONFIG_PREEMPT_RT */ + +#endif /* __LINUX_MUTEX_TYPES_H */ diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h index ed42bd5f639f..0aa4411528fc 100644 --- a/include/linux/net/intel/i40e_client.h +++ b/include/linux/net/intel/i40e_client.h @@ -45,7 +45,7 @@ struct i40e_qv_info { struct i40e_qvlist_info { u32 num_vectors; - struct i40e_qv_info qv_info[]; + struct i40e_qv_info qv_info[] __counted_by(num_vectors); }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2564e209465e..118c40258d07 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -382,6 +382,7 @@ struct napi_struct { /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; + int irq; }; enum { @@ -665,6 +666,10 @@ struct netdev_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; /* * write-mostly part */ @@ -1324,6 +1329,9 @@ struct netdev_net_notifier { * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. + * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], + * struct netlink_ext_ack *extack); + * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink @@ -1606,6 +1614,9 @@ struct net_device_ops { int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); + int (*ndo_mdb_del_bulk)(struct net_device *dev, + struct nlattr *tb[], + struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); @@ -1865,6 +1876,7 @@ enum netdev_stat_type { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. + * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2091,6 +2103,72 @@ enum netdev_stat_type { */ struct net_device { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/net_device.rst. + * Please update the document when adding new fields. + */ + + /* TX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_tx); + unsigned long long priv_flags; + const struct net_device_ops *netdev_ops; + const struct header_ops *header_ops; + struct netdev_queue *_tx; + netdev_features_t gso_partial_features; + unsigned int real_num_tx_queues; + unsigned int gso_max_size; + unsigned int gso_ipv4_max_size; + u16 gso_max_segs; + s16 num_tc; + /* Note : dev->mtu is often read without holding a lock. + * Writers usually hold RTNL. + * It is recommended to use READ_ONCE() to annotate the reads, + * and to use WRITE_ONCE() to annotate the writes. 
+ */ + unsigned int mtu; + unsigned short needed_headroom; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; +#ifdef CONFIG_XPS + struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; +#endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_egress; +#endif + __cacheline_group_end(net_device_read_tx); + + /* TXRX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_txrx); + unsigned int flags; + unsigned short hard_header_len; + netdev_features_t features; + struct inet6_dev __rcu *ip6_ptr; + __cacheline_group_end(net_device_read_txrx); + + /* RX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_rx); + struct bpf_prog __rcu *xdp_prog; + struct list_head ptype_specific; + int ifindex; + unsigned int real_num_rx_queues; + struct netdev_rx_queue *_rx; + unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; + unsigned int gro_max_size; + unsigned int gro_ipv4_max_size; + rx_handler_func_t __rcu *rx_handler; + void __rcu *rx_handler_data; + possible_net_t nd_net; +#ifdef CONFIG_NETPOLL + struct netpoll_info __rcu *npinfo; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_ingress; +#endif + __cacheline_group_end(net_device_read_rx); + char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; @@ -2115,7 +2193,6 @@ struct net_device { struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; - struct list_head ptype_specific; struct { struct list_head upper; @@ -2123,31 +2200,18 @@ struct net_device { } adj_list; /* Read-mostly cache-line for fast-path access */ - unsigned int flags; xdp_features_t xdp_features; - unsigned long long priv_flags; - const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; - int ifindex; + const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; unsigned short gflags; - unsigned short hard_header_len; - /* Note : dev->mtu is often read without holding a lock. - * Writers usually hold RTNL. - * It is recommended to use READ_ONCE() to annotate the reads, - * and to use WRITE_ONCE() to annotate the writes. - */ - unsigned int mtu; - unsigned short needed_headroom; unsigned short needed_tailroom; - netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; - netdev_features_t gso_partial_features; unsigned int min_mtu; unsigned int max_mtu; @@ -2185,8 +2249,6 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - const struct header_ops *header_ops; - unsigned char operstate; unsigned char link_mode; @@ -2227,9 +2289,7 @@ struct net_device { /* Protocol-specific pointers */ - struct in_device __rcu *ip_ptr; - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2264,26 +2324,13 @@ struct net_device { /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; - struct netdev_rx_queue *_rx; unsigned int num_rx_queues; - unsigned int real_num_rx_queues; - - struct bpf_prog __rcu *xdp_prog; - unsigned long gro_flush_timeout; - int napi_defer_hard_irqs; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. 
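
The __cacheline_group_begin()/__cacheline_group_end() markers above compile to zero-size labels; the grouping is enforced separately by build-time assertions on member placement. A sketch of the assumed checking pattern, using the CACHELINE_ASSERT_GROUP_MEMBER() helper from <linux/cache.h> that accompanies these markers (the function name is illustrative; in this series the real checks live on the net/core side):

	static void __used netdev_cacheline_check(void)
	{
		CACHELINE_ASSERT_GROUP_MEMBER(struct net_device,
					      net_device_read_tx, netdev_ops);
		CACHELINE_ASSERT_GROUP_MEMBER(struct net_device,
					      net_device_read_tx, mtu);
		CACHELINE_ASSERT_GROUP_MEMBER(struct net_device,
					      net_device_read_txrx, flags);
		CACHELINE_ASSERT_GROUP_MEMBER(struct net_device,
					      net_device_read_rx, xdp_prog);
	}
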
*/ #define GRO_MAX_SIZE (8 * 65535u) - unsigned int gro_max_size; - unsigned int gro_ipv4_max_size; unsigned int xdp_zc_max_segs; - rx_handler_func_t __rcu *rx_handler; - void __rcu *rx_handler_data; -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_ingress; -#endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; @@ -2298,25 +2345,13 @@ struct net_device { /* * Cache lines mostly used on transmit path */ - struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; - unsigned int real_num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; -#ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; -#endif -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_egress; -#endif -#ifdef CONFIG_NETFILTER_EGRESS - struct nf_hook_entries __rcu *nf_hooks_egress; -#endif - #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif @@ -2355,12 +2390,6 @@ struct net_device { bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); -#ifdef CONFIG_NETPOLL - struct netpoll_info __rcu *npinfo; -#endif - - possible_net_t nd_net; - /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; @@ -2395,20 +2424,15 @@ struct net_device { */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) - unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; - u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; - unsigned int gso_ipv4_max_size; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - s16 num_tc; - struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) @@ -2447,6 +2471,10 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif +#if IS_ENABLED(CONFIG_PAGE_POOL) + /** @page_pools: page pools created for this netdevice */ + struct hlist_head page_pools; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2651,6 +2679,15 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, + struct napi_struct *napi); + +static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +{ + napi->irq = irq; +} + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ @@ -3964,6 +4001,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev, int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, @@ -4196,6 +4236,15 @@ static inline void netdev_ref_replace(struct net_device *odev, void linkwatch_fire_event(struct net_device *dev); /** + * linkwatch_sync_dev - sync linkwatch for the given device + * @dev: network device to sync linkwatch for + * + * Sync linkwatch for the given device, removing it from the + * pending work list (if queued). 
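
The napi->irq field and the queue-to-NAPI association added above are filled in by drivers at ring setup so the netdev netlink family can report the mapping to userspace. A sketch of the expected driver wiring (the ring layout and poll function are hypothetical; NETDEV_QUEUE_TYPE_RX comes from the uapi netdev family):

	netif_napi_add(netdev, &ring->napi, my_driver_poll);
	netif_napi_set_irq(&ring->napi, ring->irq);

	/* hold RTNL, as the netdev_queue->napi comment above requires */
	netif_queue_set_napi(netdev, ring->qid, NETDEV_QUEUE_TYPE_RX, &ring->napi);

	/* on teardown, clear the association before deleting the NAPI */
	netif_queue_set_napi(netdev, ring->qid, NETDEV_QUEUE_TYPE_RX, NULL);
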
+ */ +void linkwatch_sync_dev(struct net_device *dev); + +/** * netif_carrier_ok - test if carrier present * @dev: network device * diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 7834c0be2831..61aa48f46dd7 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -51,7 +51,7 @@ struct nf_ipv6_ops { u32 (*cookie_init_sequence)(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); int (*cookie_v6_check)(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie); + const struct tcphdr *th); #endif void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -179,16 +179,16 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, } static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie) + const struct tcphdr *th) { #if IS_ENABLED(CONFIG_SYN_COOKIES) #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (v6_ops) - return v6_ops->cookie_v6_check(iph, th, cookie); + return v6_ops->cookie_v6_check(iph, th); #elif IS_BUILTIN(CONFIG_IPV6) - return __cookie_v6_check(iph, th, cookie); + return __cookie_v6_check(iph, th); #endif #endif return 0; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 75d7de34c908..1a4445bf2ab9 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -228,10 +228,12 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); + +typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); + int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, - struct sk_buff *skb, void *data), + netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); @@ -351,5 +353,6 @@ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); +struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c11c4db34639..ef8d2d618d5b 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -869,4 +869,26 @@ enum { RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15, }; +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + + /* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + + /* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, + + OP_CB_ILLEGAL = 10044, +}; + #endif diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 279262057a92..f5ce7b101146 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -595,7 +595,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); * linux/fs/nfs/write.c */ extern int nfs_congestion_kb; -extern int nfs_writepage(struct page 
*page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct folio *folio); extern int nfs_update_folio(struct file *file, struct folio *folio, diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1..b61438313a73 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -93,10 +93,10 @@ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> +#include <linux/nodemask_types.h> #include <linux/numa.h> #include <linux/random.h> -typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; /** diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h new file mode 100644 index 000000000000..6b28d97ea6ed --- /dev/null +++ b/include/linux/nodemask_types.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_NODEMASK_TYPES_H +#define __LINUX_NODEMASK_TYPES_H + +#include <linux/bitops.h> +#include <linux/numa.h> + +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + +#endif /* __LINUX_NODEMASK_TYPES_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 771cb0285872..5601d14e2886 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -2,6 +2,7 @@ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H +#include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> diff --git a/include/linux/nubus.h b/include/linux/nubus.h index bdcd85e622d8..4d103ac8f5c7 100644 --- a/include/linux/nubus.h +++ b/include/linux/nubus.h @@ -89,8 +89,6 @@ struct nubus_driver { void (*remove)(struct nubus_board *board); }; -extern struct bus_type nubus_bus_type; - /* Generic NuBus interface functions, modelled after the PCI interface */ #ifdef CONFIG_PROC_FS extern bool nubus_populate_procfs; diff --git a/include/linux/numa.h b/include/linux/numa.h index a904861de800..915033a75731 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H +#include <linux/init.h> #include <linux/types.h> #ifdef CONFIG_NODES_SHIFT @@ -22,34 +23,26 @@ #endif #ifdef CONFIG_NUMA -#include <linux/printk.h> #include <asm/sparsemem.h> /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); #ifndef memory_add_physaddr_to_nid -static inline int memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int memory_add_physaddr_to_nid(u64 start); #endif + #ifndef phys_to_target_node -static inline int phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int phys_to_target_node(u64 start); #endif + #ifndef numa_fill_memblks static inline int __init numa_fill_memblks(u64 start, u64 end) { return NUMA_NO_MEMBLK; } #endif + #else /* !CONFIG_NUMA */ static inline int numa_nearest_node(int node, unsigned int state) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88e64acebfe..735cddc13d20 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -772,19 +772,14 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -bool __folio_start_writeback(struct folio *folio, bool keep_write); -bool set_page_writeback(struct page 
*page); +void __folio_start_writeback(struct folio *folio, bool keep_write); +void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static inline bool test_set_page_writeback(struct page *page) -{ - return set_page_writeback(page); -} - static __always_inline bool folio_test_head(struct folio *folio) { return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c095041..3f2409b968ec 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/pci.h b/include/linux/pci.h index dea043bc1e38..58a4c976c39b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,6 +1239,8 @@ int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val); int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val); int pci_write_config_word(const struct pci_dev *dev, int where, u16 val); int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val); +void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos, + u32 clear, u32 set); int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 275799b5f535..a0c75e467df3 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2605,6 +2605,8 @@ #define PCI_VENDOR_ID_TEKRAM 0x1de1 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29 +#define PCI_VENDOR_ID_ALIBABA 0x1ded + #define PCI_VENDOR_ID_TEHUTI 0x1fc9 #define PCI_DEVICE_ID_TEHUTI_3009 0x3009 #define PCI_DEVICE_ID_TEHUTI_3010 0x3010 @@ -3065,6 +3067,7 @@ #define PCI_DEVICE_ID_INTEL_82443GX_0 0x71a0 #define PCI_DEVICE_ID_INTEL_82443GX_2 0x71a2 #define PCI_DEVICE_ID_INTEL_82372FB_1 0x7601 +#define PCI_DEVICE_ID_INTEL_HDA_ARL 0x7728 #define PCI_DEVICE_ID_INTEL_HDA_RPL_S 0x7a50 #define PCI_DEVICE_ID_INTEL_HDA_ADL_S 0x7ad0 #define PCI_DEVICE_ID_INTEL_HDA_MTL 0x7e28 diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 143fbc10ecfe..b3b34f6670cf 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -60,12 +60,6 @@ struct pmu_hw_events { DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS); /* - * Hardware lock to serialize accesses to PMU registers. Needed for the - * read/modify/write sequences. - */ - raw_spinlock_t pmu_lock; - - /* * When using percpu IRQs, we need a percpu dev_id. Place it here as we * already have to allocate this struct per cpu. */ @@ -189,4 +183,26 @@ void armpmu_free_irq(int irq, int cpu); #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" #define ARMV8_TRBE_PDEV_NAME "arm,trbe" +/* Why does everything I do descend into this? */ +#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + (lo) == (hi) ? 
#cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi + +#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) + +#define GEN_PMU_FORMAT_ATTR(name) \ + PMU_FORMAT_ATTR(name, \ + _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI)) + +#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ + ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) + +#define ATTR_CFG_GET_FLD(attr, name) \ + _ATTR_CFG_GET_FLD(attr, \ + ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI) + #endif /* __ARM_PMU_H__ */ diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 9c226adf938a..46377e134d67 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -215,21 +215,27 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f -#define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ +/* Mask for writable bits */ +#define ARMV8_PMU_PMCR_MASK (ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \ + ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \ + ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \ + ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP) /* * PMOVSR: counters overflow flag status reg */ -#define ARMV8_PMU_OVSR_MASK 0xffffffff /* Mask for writable bits */ -#define ARMV8_PMU_OVERFLOWED_MASK ARMV8_PMU_OVSR_MASK +#define ARMV8_PMU_OVSR_P GENMASK(30, 0) +#define ARMV8_PMU_OVSR_C BIT(31) +/* Mask for writable bits is both P and C fields */ +#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C) /* * PMXEVTYPER: Event selection reg */ -#define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ -#define ARMV8_PMU_EVTYPE_EVENT 0xffff /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_TH GENMASK_ULL(43, 32) /* arm64 only */ +#define ARMV8_PMU_EVTYPE_TC GENMASK_ULL(63, 61) /* arm64 only */ /* * Event filters for PMUv3 @@ -244,19 +250,19 @@ /* * PMUSERENR: user enable reg */ -#define ARMV8_PMU_USERENR_MASK 0xf /* Mask for writable bits */ #define ARMV8_PMU_USERENR_EN (1 << 0) /* PMU regs can be accessed at EL0 */ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +/* Mask for writable bits */ +#define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ + ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) /* PMMIR_EL1.SLOTS mask */ -#define ARMV8_PMU_SLOTS_MASK 0xff - -#define ARMV8_PMU_BUS_SLOTS_SHIFT 8 -#define ARMV8_PMU_BUS_SLOTS_MASK 0xff -#define ARMV8_PMU_BUS_WIDTH_SHIFT 16 -#define ARMV8_PMU_BUS_WIDTH_MASK 0xf +#define ARMV8_PMU_SLOTS GENMASK(7, 0) +#define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) +#define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) +#define ARMV8_PMU_THWIDTH GENMASK(23, 20) /* * This code is really good diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5547ba68e6e4..d2a15c0c6f8a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1143,6 +1143,15 @@ static inline bool branch_sample_priv(const struct perf_event *event) return 
event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } +static inline bool branch_sample_counters(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; +} + +static inline bool branch_sample_call_stack(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; +} struct perf_sample_data { /* @@ -1177,6 +1186,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; + u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; @@ -1254,7 +1264,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, - struct perf_branch_stack *brs) + struct perf_branch_stack *brs, + u64 *brs_cntr) { int size = sizeof(u64); /* nr */ @@ -1262,7 +1273,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); + /* + * The extension space for counters is appended after the + * struct perf_branch_stack. It is used to store the occurrences + * of events of each branch. + */ + if (brs_cntr) + size += brs->nr * sizeof(u64); + data->br_stack = brs; + data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index af7639c3b0a3..466cf477551a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd) } #endif +#ifndef pmd_dirty +static inline int pmd_dirty(pmd_t pmd) +{ + return 0; +} +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode @@ -375,7 +382,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void) */ static inline bool arch_has_hw_pte_young(void) { - return false; + return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif diff --git a/include/linux/phy.h b/include/linux/phy.h index bd285950972c..684efaeca07c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,7 +327,8 @@ struct mdio_bus_stats { /** * struct phy_package_shared - Shared information in PHY packages - * @addr: Common PHY address used to combine PHYs in one package + * @base_addr: Base PHY address of PHY package used to combine PHYs + * in one package and for offset calculation of phy_package_read/write * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv @@ -338,7 +339,7 @@ struct mdio_bus_stats { * phy_package_leave(). */ struct phy_package_shared { - int addr; + u8 base_addr; refcount_t refcnt; unsigned long flags; size_t priv_size; @@ -604,6 +605,8 @@ struct macsec_ops; * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value + * @possible_interfaces: bitmap if interface modes that the attached PHY + * will switch between depending on media speed. 
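
Stepping back to the perf_event.h hunk: the new brs_cntr argument to perf_sample_save_brstack() is simply NULL for PMUs without branch counters, so existing call sites only grow an argument. A sketch of the expected call site in a PMU overflow handler (the cpuc->lbr_* fields are assumptions modelled on the x86 LBR code):

	if (has_branch_stack(event)) {
		if (branch_sample_counters(event))
			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack,
						 cpuc->lbr_counters);
		else
			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack,
						 NULL);
	}
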
* @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics @@ -673,6 +676,7 @@ struct phy_device { u32 dev_flags; phy_interface_t interface; + DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) @@ -1559,9 +1563,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev) return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } -static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) +static inline int phy_hwtstamp(struct phy_device *phydev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { - return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); + return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, @@ -1860,6 +1866,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); @@ -1969,10 +1976,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev, int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size); +int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, - int addr, size_t priv_size); + int base_addr, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); @@ -1995,48 +2002,83 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -static inline int phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_address(struct phy_device *phydev, + unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; + u8 base_addr = shared->base_addr; - if (!shared) + if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; - return mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + /* we know that addr will be in the range 0..31 and thus the + * implicit cast to a signed int is not a problem. 
+ */ + return base_addr + addr_offset; } -static inline int __phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; + + return mdiobus_read(phydev->mdio.bus, addr, regnum); +} + +static inline int __phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) +{ + int addr = phy_package_address(phydev, addr_offset); + + if (addr < 0) + return addr; - return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } +int __phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int __phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + +int phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 875439ab45de..d589f89c612c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -99,72 +99,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode) } /** - * phylink_pcs_neg_mode() - helper to determine PCS inband mode - * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. - * @interface: interface mode to be used - * @advertising: adertisement ethtool link mode mask - * - * Determines the negotiation mode to be used by the PCS, and returns - * one of: - * - * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband - * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY) - * will be used. - * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg - * disabled - * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled - * - * Note: this is for cases where the PCS itself is involved in negotiation - * (e.g. Clause 37, SGMII and similar) not Clause 73. 
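
On the phy.h package rework above: callers now address package members by offset from the stored base address instead of poking a single shared address, with phy_package_address() bounds-checking the result against PHY_MAX_ADDR. A sketch of a PHY driver using it (the register name and private struct are hypothetical):

	/* join the package rooted at the bundle's lowest PHY address */
	ret = devm_phy_package_join(dev, phydev, base_addr,
				    sizeof(struct my_phy_priv));
	if (ret)
		return ret;

	/* read a package-global register living at base_addr + 1 */
	val = phy_package_read(phydev, 1, MY_GLOBAL_STATUS_REG);
	if (val < 0)
		return val;
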
- */ -static inline unsigned int phylink_pcs_neg_mode(unsigned int mode, - phy_interface_t interface, - const unsigned long *advertising) -{ - unsigned int neg_mode; - - switch (interface) { - case PHY_INTERFACE_MODE_SGMII: - case PHY_INTERFACE_MODE_QSGMII: - case PHY_INTERFACE_MODE_QUSGMII: - case PHY_INTERFACE_MODE_USXGMII: - /* These protocols are designed for use with a PHY which - * communicates its negotiation result back to the MAC via - * inband communication. Note: there exist PHYs that run - * with SGMII but do not send the inband data. - */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - break; - - case PHY_INTERFACE_MODE_1000BASEX: - case PHY_INTERFACE_MODE_2500BASEX: - /* 1000base-X is designed for use media-side for Fibre - * connections, and thus the Autoneg bit needs to be - * taken into account. We also do this for 2500base-X - * as well, but drivers may not support this, so may - * need to override this. - */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, - advertising)) - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; - break; - - default: - neg_mode = PHYLINK_PCS_NEG_NONE; - break; - } - - return neg_mode; -} - -/** * struct phylink_link_state - link state structure * @advertising: ethtool bitmask containing advertised link modes * @lp_advertising: ethtool bitmask containing link partner advertised link diff --git a/include/linux/pid.h b/include/linux/pid.h index 653a527574c4..395cacce1179 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,18 +2,12 @@ #ifndef _LINUX_PID_H #define _LINUX_PID_H +#include <linux/pid_types.h> #include <linux/rculist.h> -#include <linux/wait.h> +#include <linux/rcupdate.h> #include <linux/refcount.h> - -enum pid_type -{ - PIDTYPE_PID, - PIDTYPE_TGID, - PIDTYPE_PGID, - PIDTYPE_SID, - PIDTYPE_MAX, -}; +#include <linux/sched.h> +#include <linux/wait.h> /* * What is struct pid? @@ -110,9 +104,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); -struct pid_namespace; -extern struct pid_namespace init_pid_ns; - extern int pid_max; extern int pid_max_min, pid_max_max; @@ -215,4 +206,127 @@ pid_t pid_vnr(struct pid *pid); } \ task = tg___; \ } while_each_pid_task(pid, type, task) + +static inline struct pid *task_pid(struct task_struct *task) +{ + return task->thread_pid; +} + +/* + * the helpers to get the task's different pids as they are seen + * from various namespaces + * + * task_xid_nr() : global id, i.e. the id seen from the init namespace; + * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of + * current. 
+ * task_xid_nr_ns() : id seen from the ns specified; + * + * see also pid_nr() etc in include/linux/pid.h + */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); + +static inline pid_t task_pid_nr(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} + +static inline pid_t task_pid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); +} + + +static inline pid_t task_tgid_nr(struct task_struct *tsk) +{ + return tsk->tgid; +} + +/** + * pid_alive - check that a task structure is not stale + * @p: Task structure to be checked. + * + * Test if a process is not yet dead (at most zombie state) + * If pid_alive fails, then pointers within the task structure + * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. + */ +static inline int pid_alive(const struct task_struct *p) +{ + return p->thread_pid != NULL; +} + +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} + +static inline pid_t task_pgrp_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); +} + + +static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} + +static inline pid_t task_session_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); +} + +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + +/* Obsolete, do not use: */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + +/** + * is_global_init - check if a task structure is init. Since init + * is free to have sub-threads we need to check tgid. + * @tsk: Task structure to be checked. + * + * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. 
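
The pid helpers consolidated above keep the naming convention the comment describes: _nr is the global id, _vnr translates into current's pid namespace, and _nr_ns into an explicit one. A short sketch (ns is assumed to be a valid struct pid_namespace pointer):

	pid_t global = task_tgid_nr(tsk);
	pid_t local  = task_tgid_vnr(tsk);
	pid_t in_ns  = task_tgid_nr_ns(tsk, ns);

	pr_debug("%s: tgid %d (init ns) / %d (current's ns) / %d (given ns)\n",
		 tsk->comm, global, local, in_ns);
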
+ */ +static inline int is_global_init(struct task_struct *tsk) +{ + return task_tgid_nr(tsk) == 1; +} + #endif /* _LINUX_PID_H */ diff --git a/include/linux/pid_types.h b/include/linux/pid_types.h new file mode 100644 index 000000000000..c2aee1d91dcf --- /dev/null +++ b/include/linux/pid_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_TYPES_H +#define _LINUX_PID_TYPES_H + +enum pid_type { + PIDTYPE_PID, + PIDTYPE_TGID, + PIDTYPE_PGID, + PIDTYPE_SID, + PIDTYPE_MAX, +}; + +struct pid_namespace; +extern struct pid_namespace init_pid_ns; + +#endif /* _LINUX_PID_TYPES_H */ diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index ea1cc6d829e9..f177416635a2 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -20,10 +20,31 @@ #define __MICROCHIP_KSZ_H #include <linux/types.h> +#include <linux/platform_data/dsa.h> + +enum ksz_chip_id { + KSZ8563_CHIP_ID = 0x8563, + KSZ8795_CHIP_ID = 0x8795, + KSZ8794_CHIP_ID = 0x8794, + KSZ8765_CHIP_ID = 0x8765, + KSZ8830_CHIP_ID = 0x8830, + KSZ9477_CHIP_ID = 0x00947700, + KSZ9896_CHIP_ID = 0x00989600, + KSZ9897_CHIP_ID = 0x00989700, + KSZ9893_CHIP_ID = 0x00989300, + KSZ9563_CHIP_ID = 0x00956300, + KSZ9567_CHIP_ID = 0x00956700, + LAN9370_CHIP_ID = 0x00937000, + LAN9371_CHIP_ID = 0x00937100, + LAN9372_CHIP_ID = 0x00937200, + LAN9373_CHIP_ID = 0x00937300, + LAN9374_CHIP_ID = 0x00937400, +}; struct ksz_platform_data { + /* Must be first such that dsa_register_switch() can access it */ + struct dsa_chip_data cd; u32 chip_id; - u16 enabled_ports; }; #endif diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h index c71a2dd66143..5f412a615532 100644 --- a/include/linux/platform_data/si5351.h +++ b/include/linux/platform_data/si5351.h @@ -105,10 +105,12 @@ struct si5351_clkout_config { * @clk_xtal: xtal input clock * @clk_clkin: clkin input clock * @pll_src: array of pll source clock setting + * @pll_reset: array indicating if plls should be reset after setting the rate * @clkout: array of clkout configuration */ struct si5351_platform_data { enum si5351_pll_src pll_src[2]; + bool pll_reset[2]; struct si5351_clkout_config clkout[8]; }; diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h index 41df326583f9..7f132029316a 100644 --- a/include/linux/platform_data/x86/clk-lpss.h +++ b/include/linux/platform_data/x86/clk-lpss.h @@ -15,6 +15,6 @@ struct lpss_clk_data { struct clk *clk; }; -extern int lpss_atom_clk_init(void); +int lpss_atom_clk_init(void); #endif /* __CLK_LPSS_H */ diff --git a/include/linux/plist.h b/include/linux/plist.h index 0f352c1d3c80..8c1c8adf7fe9 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -75,20 +75,10 @@ #include <linux/container_of.h> #include <linux/list.h> -#include <linux/types.h> +#include <linux/plist_types.h> #include <asm/bug.h> -struct plist_head { - struct list_head node_list; -}; - -struct plist_node { - int prio; - struct list_head prio_list; - struct list_head node_list; -}; - /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h new file mode 100644 index 000000000000..c37e784330af --- /dev/null +++ b/include/linux/plist_types.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_PLIST_TYPES_H +#define _LINUX_PLIST_TYPES_H + +#include 
<linux/types.h> + +struct plist_head { + struct list_head node_list; +}; + +struct plist_node { + int prio; + struct list_head prio_list; + struct list_head node_list; +}; + +#endif /* _LINUX_PLIST_TYPES_H */ diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index ccd97bcef269..76dcb7f37bcd 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -45,18 +45,6 @@ struct dev_pm_opp_supply { unsigned long u_watt; }; -/** - * struct dev_pm_opp_icc_bw - Interconnect bandwidth values - * @avg: Average bandwidth corresponding to this OPP (in icc units) - * @peak: Peak bandwidth corresponding to this OPP (in icc units) - * - * This structure stores the bandwidth values for a single interconnect path. - */ -struct dev_pm_opp_icc_bw { - u32 avg; - u32 peak; -}; - typedef int (*config_regulators_t)(struct device *dev, struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp, struct regulator **regulators, unsigned int count); @@ -74,8 +62,10 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table, * @supported_hw_count: Number of elements in the array. * @regulator_names: Array of pointers to the names of the regulator, NULL terminated. * @genpd_names: Null terminated array of pointers containing names of genpd to - * attach. - * @virt_devs: Pointer to return the array of virtual devices. + * attach. Mutually exclusive with required_devs. + * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually + * exclusive with required_devs. + * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs. * * This structure contains platform specific OPP configurations for the device. */ @@ -90,11 +80,15 @@ struct dev_pm_opp_config { const char * const *regulator_names; const char * const *genpd_names; struct device ***virt_devs; + struct device **required_devs; }; +#define OPP_LEVEL_UNSET U32_MAX + /** * struct dev_pm_opp_data - The data to use to initialize an OPP. - * @level: The performance level for the OPP. + * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if + * level field isn't used. * @freq: The clock rate in Hz for the OPP. * @u_volt: The voltage in uV for the OPP. 
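+ *
+ * A minimal initializer sketch (illustrative values only):
+ *
+ *	struct dev_pm_opp_data data = {
+ *		.level  = OPP_LEVEL_UNSET,
+ *		.freq   = 800000000,
+ *		.u_volt = 900000,
+ *	};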
*/ @@ -157,7 +151,7 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, unsigned int *level); struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev, - unsigned long *level); + unsigned int *level); struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, unsigned int *bw, int index); @@ -324,7 +318,7 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, } static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev, - unsigned long *level) + unsigned int *level) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/include/linux/poison.h b/include/linux/poison.h index 851a855d3868..27a7dad17eef 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -83,6 +83,8 @@ /********** net/core/skbuff.c **********/ #define SKB_LIST_POISON_NEXT ((void *)(0x800 + POISON_POINTER_DELTA)) +/********** net/ **********/ +#define NET_PTR_POISON ((void *)(0x801 + POISON_POINTER_DELTA)) /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index d607f51404fc..dc7b738de299 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -2,40 +2,16 @@ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H -#include <linux/spinlock.h> +#include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> -#include <linux/alarmtimer.h> +#include <linux/posix-timers_types.h> +#include <linux/spinlock.h> #include <linux/timerqueue.h> struct kernel_siginfo; struct task_struct; -/* - * Bit fields within a clockid: - * - * The most significant 29 bits hold either a pid or a file descriptor. - * - * Bit 2 indicates whether a cpu clock refers to a thread or a process. - * - * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. - * - * A clockid is invalid if bits 2, 1, and 0 are all set. - */ -#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) -#define CPUCLOCK_PERTHREAD(clock) \ - (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) - -#define CPUCLOCK_PERTHREAD_MASK 4 -#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) -#define CPUCLOCK_CLOCK_MASK 3 -#define CPUCLOCK_PROF 0 -#define CPUCLOCK_VIRT 1 -#define CPUCLOCK_SCHED 2 -#define CPUCLOCK_MAX 3 -#define CLOCKFD CPUCLOCK_MAX -#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) - static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { @@ -109,44 +85,6 @@ static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) ctmr->node.expires = exp; } -/** - * posix_cputimer_base - Container per posix CPU clock - * @nextevt: Earliest-expiration cache - * @tqhead: timerqueue head for cpu_timers - */ -struct posix_cputimer_base { - u64 nextevt; - struct timerqueue_head tqhead; -}; - -/** - * posix_cputimers - Container for posix CPU timer related data - * @bases: Base container for posix CPU clocks - * @timers_active: Timers are queued. - * @expiry_active: Timer expiry is active. 
Used for - * process wide timers to avoid multiple - * task trying to handle expiry concurrently - * - * Used in task_struct and signal_struct - */ -struct posix_cputimers { - struct posix_cputimer_base bases[CPUCLOCK_MAX]; - unsigned int timers_active; - unsigned int expiry_active; -}; - -/** - * posix_cputimers_work - Container for task work based posix CPU timer expiry - * @work: The task work to be scheduled - * @mutex: Mutex held around expiry in context of this task work - * @scheduled: @work has been scheduled already, no further processing - */ -struct posix_cputimers_work { - struct callback_head work; - struct mutex mutex; - unsigned int scheduled; -}; - static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); @@ -179,7 +117,6 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else -struct posix_cputimers { }; struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h new file mode 100644 index 000000000000..a4712c1008c9 --- /dev/null +++ b/include/linux/posix-timers_types.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _linux_POSIX_TIMERS_TYPES_H +#define _linux_POSIX_TIMERS_TYPES_H + +#include <linux/mutex_types.h> +#include <linux/timerqueue_types.h> +#include <linux/types.h> + +/* + * Bit fields within a clockid: + * + * The most significant 29 bits hold either a pid or a file descriptor. + * + * Bit 2 indicates whether a cpu clock refers to a thread or a process. + * + * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. + * + * A clockid is invalid if bits 2, 1, and 0 are all set. + */ +#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) +#define CPUCLOCK_PERTHREAD(clock) \ + (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) + +#define CPUCLOCK_PERTHREAD_MASK 4 +#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) +#define CPUCLOCK_CLOCK_MASK 3 +#define CPUCLOCK_PROF 0 +#define CPUCLOCK_VIRT 1 +#define CPUCLOCK_SCHED 2 +#define CPUCLOCK_MAX 3 +#define CLOCKFD CPUCLOCK_MAX +#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) + +#ifdef CONFIG_POSIX_TIMERS + +/** + * posix_cputimer_base - Container per posix CPU clock + * @nextevt: Earliest-expiration cache + * @tqhead: timerqueue head for cpu_timers + */ +struct posix_cputimer_base { + u64 nextevt; + struct timerqueue_head tqhead; +}; + +/** + * posix_cputimers - Container for posix CPU timer related data + * @bases: Base container for posix CPU clocks + * @timers_active: Timers are queued. + * @expiry_active: Timer expiry is active. 
Used for + * process wide timers to avoid multiple + * task trying to handle expiry concurrently + * + * Used in task_struct and signal_struct + */ +struct posix_cputimers { + struct posix_cputimer_base bases[CPUCLOCK_MAX]; + unsigned int timers_active; + unsigned int expiry_active; +}; + +/** + * posix_cputimers_work - Container for task work based posix CPU timer expiry + * @work: The task work to be scheduled + * @mutex: Mutex held around expiry in context of this task work + * @scheduled: @work has been scheduled already, no further processing + */ +struct posix_cputimers_work { + struct callback_head work; + struct mutex mutex; + unsigned int scheduled; +}; + +#else /* CONFIG_POSIX_TIMERS */ + +struct posix_cputimers { }; + +#endif /* CONFIG_POSIX_TIMERS */ + +#endif /* _linux_POSIX_TIMERS_TYPES_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6..f7f1e5251c67 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,7 +10,6 @@ #include <linux/types.h> #include <linux/once.h> -#include <linux/percpu.h> #include <linux/random.h> struct rnd_state { diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 9aa6358a1a16..7233e9cf1bab 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,7 +9,7 @@ #include <linux/linkage.h> #include <linux/cleanup.h> -#include <linux/list.h> +#include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption @@ -360,7 +360,9 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { - INIT_HLIST_NODE(¬ifier->link); + /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ + notifier->link.next = NULL; + notifier->link.pprev = NULL; notifier->ops = ops; } diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a8..1f1166133e6b 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -80,6 +80,16 @@ int fwnode_property_match_string(const struct fwnode_handle *fwnode, bool fwnode_device_is_available(const struct fwnode_handle *fwnode); +static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode) +{ + if (fwnode_property_present(fwnode, "big-endian")) + return true; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && + fwnode_property_present(fwnode, "native-endian")) + return true; + return false; +} + static inline bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat) { @@ -87,6 +97,22 @@ bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char } /** + * device_is_big_endian - check if a device has BE registers + * @dev: Pointer to the struct device + * + * Returns: true if the device has a "big-endian" property, or if the kernel + * was compiled for BE *and* the device has a "native-endian" property. + * Returns false otherwise. + * + * Callers would nominally use ioread32be/iowrite32be if + * device_is_big_endian() == true, or readl/writel otherwise. 
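+ *
+ * For example (@base and MY_REG are hypothetical):
+ *
+ *	if (device_is_big_endian(dev))
+ *		val = ioread32be(base + MY_REG);
+ *	else
+ *		val = readl(base + MY_REG);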
+ */ +static inline bool device_is_big_endian(const struct device *dev) +{ + return fwnode_device_is_big_endian(dev_fwnode(dev)); +} + +/** * device_is_compatible - match 'compatible' property of the device with a given string * @dev: Pointer to the struct device * @compat: The string to match 'compatible' property with @@ -489,6 +515,13 @@ struct software_node { const struct property_entry *properties; }; +#define SOFTWARE_NODE(_name_, _properties_, _parent_) \ + (struct software_node) { \ + .name = _name_, \ + .properties = _properties_, \ + .parent = _parent_, \ + } + bool is_software_node(const struct fwnode_handle *fwnode); const struct software_node * to_software_node(const struct fwnode_handle *fwnode); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 4fa4ef0a173a..06cc8888199e 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -74,7 +74,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); -int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); +void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); @@ -257,10 +257,9 @@ static inline void __dquot_free_space(struct inode *inode, qsize_t number, inode_sub_bytes(inode, number); } -static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); - return 0; } static inline int dquot_reclaim_space_nodirty(struct inode *inode, @@ -358,14 +357,10 @@ static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } -static inline int dquot_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { - int ret; - - ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); - if (!ret) - mark_inode_dirty_sync(inode); - return ret; + dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 5e0f74f2f8ca..d07f0848802e 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -8,6 +8,7 @@ #include <linux/rcupdate.h> #include <linux/completion.h> +#include <linux/sched.h> /* * Structure allowing asynchronous waiting on RCU. @@ -55,4 +56,13 @@ do { \ #define synchronize_rcu_mult(...) 
\ _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index c4cc3b89ced1..abcdde4df697 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,7 +177,17 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void hw_protection_shutdown(const char *reason, int ms_until_forced); +void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +static inline void hw_protection_reboot(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, false); +} + +static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, true); +} /* * Emergency restart, callable from an interrupt handler. diff --git a/include/linux/refcount.h b/include/linux/refcount.h index a62fcca97486..85c6df0d1bef 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -96,22 +96,11 @@ #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> +#include <linux/refcount_types.h> #include <linux/spinlock_types.h> struct mutex; -/** - * typedef refcount_t - variant of atomic_t specialized for reference counts - * @refs: atomic_t counter field - * - * The counter saturates at REFCOUNT_SATURATED and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free bugs. - */ -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) diff --git a/include/linux/refcount_types.h b/include/linux/refcount_types.h new file mode 100644 index 000000000000..162004f06edf --- /dev/null +++ b/include/linux/refcount_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_REFCOUNT_TYPES_H +#define _LINUX_REFCOUNT_TYPES_H + +#include <linux/types.h> + +/** + * typedef refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at REFCOUNT_SATURATED and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. + */ +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +#endif /* _LINUX_REFCOUNT_TYPES_H */ diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 39b666b40ea6..4660582a3302 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -33,6 +33,7 @@ #include <linux/err.h> #include <linux/suspend.h> +#include <regulator/regulator.h> struct device; struct notifier_block; @@ -85,52 +86,6 @@ struct regulator_dev; #define REGULATOR_MODE_STANDBY 0x8 /* - * Regulator notifier events. - * - * UNDER_VOLTAGE Regulator output is under voltage. - * OVER_CURRENT Regulator output current is too high. - * REGULATION_OUT Regulator output is out of regulation. - * FAIL Regulator output has failed. - * OVER_TEMP Regulator over temp. - * FORCE_DISABLE Regulator forcibly shut down by software. - * VOLTAGE_CHANGE Regulator voltage changed. - * Data passed is old voltage cast to (void *). - * DISABLE Regulator was disabled. 
- * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. - * Data passed is "struct pre_voltage_change_data" - * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. - * Data passed is old voltage cast to (void *). - * PRE_DISABLE Regulator is about to be disabled - * ABORT_DISABLE Regulator disable failed for some reason - * - * NOTE: These events can be OR'ed together when passed into handler. - */ - -#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 -#define REGULATOR_EVENT_OVER_CURRENT 0x02 -#define REGULATOR_EVENT_REGULATION_OUT 0x04 -#define REGULATOR_EVENT_FAIL 0x08 -#define REGULATOR_EVENT_OVER_TEMP 0x10 -#define REGULATOR_EVENT_FORCE_DISABLE 0x20 -#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 -#define REGULATOR_EVENT_DISABLE 0x80 -#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 -#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 -#define REGULATOR_EVENT_PRE_DISABLE 0x400 -#define REGULATOR_EVENT_ABORT_DISABLE 0x800 -#define REGULATOR_EVENT_ENABLE 0x1000 -/* - * Following notifications should be emitted only if detected condition - * is such that the HW is likely to still be working but consumers should - * take a recovery action to prevent problems esacalating into errors. - */ -#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 -#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 -#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 -#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 -#define REGULATOR_EVENT_WARN_MASK 0x1E000 - -/* * Regulator errors that can be queried using regulator_get_error_flags * * UNDER_VOLTAGE Regulator output is under voltage. diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4b7eceb3828b..22a07c0900a4 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -51,12 +51,7 @@ enum regulator_detection_severity { /* Initialize struct linear_range for regulators */ #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) \ -{ \ - .min = _min_uV, \ - .min_sel = _min_sel, \ - .max_sel = _max_sel, \ - .step = _step_uV, \ -} + LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) /** * struct regulator_ops - regulator operations. diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 621b7f4a3639..0cd76d264727 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -49,6 +49,13 @@ struct regulator; #define DISABLE_IN_SUSPEND 1 #define ENABLE_IN_SUSPEND 2 +/* + * Default time window (in milliseconds) following a critical under-voltage + * event during which less critical actions can be safely carried out by the + * system. + */ +#define REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS 10 + /* Regulator active discharge flags */ enum regulator_active_discharge { REGULATOR_ACTIVE_DISCHARGE_DEFAULT, @@ -127,6 +134,8 @@ struct notification_limit { * @ramp_disable: Disable ramp delay when initialising or when setting voltage. * @soft_start: Enable soft start so that voltage ramps slowly. * @pull_down: Enable pull down when regulator is disabled. + * @system_critical: Set if the regulator is critical to system stability or + * functionality. * @over_current_protection: Auto disable on over current event. * * @over_current_detection: Configure over current limits. @@ -153,6 +162,13 @@ struct notification_limit { * regulator_active_discharge values are used for * initialisation. 
* @enable_time: Turn-on time of the rails (unit: microseconds) + * @uv_less_critical_window_ms: Specifies the time window (in milliseconds) + * following a critical under-voltage (UV) event + * during which less critical actions can be + * safely carried out by the system (for example + * logging). After this time window more critical + * actions should be done (for example prevent + * HW damage). */ struct regulation_constraints { @@ -204,6 +220,7 @@ struct regulation_constraints { unsigned int settling_time_up; unsigned int settling_time_down; unsigned int enable_time; + unsigned int uv_less_critical_window_ms; unsigned int active_discharge; @@ -214,6 +231,7 @@ struct regulation_constraints { unsigned ramp_disable:1; /* disable ramp delay */ unsigned soft_start:1; /* ramp voltage slowly */ unsigned pull_down:1; /* pull down resistor when regulator off */ + unsigned system_critical:1; /* critical to system stability */ unsigned over_current_protection:1; /* auto disable on over current */ unsigned over_current_detection:1; /* notify on over current */ unsigned over_voltage_detection:1; /* notify on over voltage */ diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 980a65594412..13f17676c5f4 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -7,8 +7,8 @@ #include <linux/compiler.h> #include <linux/types.h> -#include <linux/time64.h> +struct __kernel_timespec; struct timespec; struct old_timespec32; struct pollfd; diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index f8f3e958e9cf..e0135e0adae0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/task_work.h> #include <linux/memcontrol.h> +#include <linux/rseq.h> #include <linux/blk-cgroup.h> /** diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 57467cbf4c5b..b6f3797277ff 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,7 +12,7 @@ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/mutex.h> -#include <linux/workqueue.h> +#include <linux/workqueue_types.h> struct rhash_head { struct rhash_head __rcu *next; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b26fe858fd44..b7944a833668 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) down_write(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); @@ -172,133 +177,323 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio); typedef int __bitwise rmap_t; /* - * No special request: if the page is a subpage of a compound page, it is - * mapped via a PTE. The mapped (sub)page is possibly shared between processes. + * No special request: A mapped anonymous (sub)page is possibly shared between + * processes. */ #define RMAP_NONE ((__force rmap_t)0) -/* The (sub)page is exclusive to a single process. */ +/* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) /* - * The compound page is not mapped via PTEs, but instead via a single PMD and - * should be accounted accordingly. + * Internally, we're using an enum to specify the granularity. 
We make the + * compiler emit specialized code for each granularity. */ -#define RMAP_COMPOUND ((__force rmap_t)BIT(1)) +enum rmap_level { + RMAP_LEVEL_PTE = 0, + RMAP_LEVEL_PMD, +}; + +static inline void __folio_rmap_sanity_checks(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + /* hugetlb folios are handled separately. */ + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + + /* + * TODO: we get driver-allocated folios that have nothing to do with + * the rmap using vm_insert_page(); therefore, we cannot assume that + * folio_test_large_rmappable() holds for large folios. We should + * handle any desired mapcount+stats accounting for these folios in + * VM_MIXEDMAP VMAs separately, and then sanity-check here that + * we really only get rmappable folios. + */ + + VM_WARN_ON_ONCE(nr_pages <= 0); + VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); + VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); + + switch (level) { + case RMAP_LEVEL_PTE: + break; + case RMAP_LEVEL_PMD: + /* + * We don't support folios larger than a single PMD yet. So + * when RMAP_LEVEL_PMD is set, we assume that we are creating + * a single "entire" mapping of the folio. + */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); + break; + default: + VM_WARN_ON_ONCE(true); + } +} /* * rmap interfaces called when adding or removing pte of page */ void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, rmap_t flags); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); +void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *, unsigned long address, rmap_t flags); +#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \ + folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) +void folio_add_anon_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -void page_add_file_rmap(struct page *, struct vm_area_struct *, - bool compound); -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, - struct vm_area_struct *, bool compound); -void page_remove_rmap(struct page *, struct vm_area_struct *, - bool compound); - -void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *, +void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_add_file_rmap_pte(folio, page, vma) \ + folio_add_file_rmap_ptes(folio, page, 1, vma) +void folio_add_file_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); +void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_remove_rmap_pte(folio, page, vma) \ + folio_remove_rmap_ptes(folio, page, 1, vma) +void folio_remove_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); + +void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, +void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -static inline void __page_dup_rmap(struct page *page, bool compound) +/* See folio_try_dup_anon_rmap_*() */ 
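+/*
+ * Fork-time sketch for the hugetlb helper below: a hypothetical copy
+ * path that falls back to copying the folio when duplication fails.
+ *
+ *	if (hugetlb_try_dup_anon_rmap(folio, src_vma))
+ *		goto copy_for_child;
+ *
+ * where copy_for_child is a hypothetical fallback that copies the page
+ * for the child instead of sharing it.
+ */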
+static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, + struct vm_area_struct *vma) { - if (compound) { - struct folio *folio = (struct folio *)page; + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - VM_BUG_ON_PAGE(compound && !PageHead(page), page); - atomic_inc(&folio->_entire_mapcount); - } else { - atomic_inc(&page->_mapcount); + if (PageAnonExclusive(&folio->page)) { + if (unlikely(folio_needs_cow_for_dma(vma, folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); } + atomic_inc(&folio->_entire_mapcount); + return 0; +} + +/* See folio_try_share_anon_rmap_*() */ +static inline int hugetlb_try_share_anon_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(folio_maybe_dma_pinned(folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); + return 0; +} + +static inline void hugetlb_add_file_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); + + atomic_inc(&folio->_entire_mapcount); +} + +static inline void hugetlb_remove_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + + atomic_dec(&folio->_entire_mapcount); } -static inline void page_dup_file_rmap(struct page *page, bool compound) +static __always_inline void __folio_dup_file_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) { - __page_dup_rmap(page, compound); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + switch (level) { + case RMAP_LEVEL_PTE: + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + atomic_inc(&folio->_entire_mapcount); + break; + } } /** - * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped - * anonymous page - * @page: the page to duplicate the mapping for - * @compound: the page is mapped as compound or as a small page - * @vma: the source vma + * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated * - * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq. + * The page range of the folio is defined by [page, page + nr_pages) * - * Duplicating the mapping can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. + * The caller needs to hold the page table lock. 
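+ *
+ * For a single PTE, use the folio_dup_file_rmap_pte() wrapper below,
+ * which expands to a call with nr_pages == 1.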
+ */ +static inline void folio_dup_file_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages) +{ + __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); +} +#define folio_dup_file_rmap_pte(folio, page) \ + folio_dup_file_rmap_ptes(folio, page, 1) + +/** + * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of * - * If duplicating the mapping succeeds, the page has to be mapped R/O into - * the parent and the child. It must *not* get mapped writable after this call. + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * - * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. + * The caller needs to hold the page table lock. */ -static inline int page_try_dup_anon_rmap(struct page *page, bool compound, - struct vm_area_struct *vma) +static inline void folio_dup_file_rmap_pmd(struct folio *folio, + struct page *page) { - VM_BUG_ON_PAGE(!PageAnon(page), page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); +#else + WARN_ON_ONCE(true); +#endif +} - /* - * No need to check+clear for already shared pages, including KSM - * pages. - */ - if (!PageAnonExclusive(page)) - goto dup; +static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma, + enum rmap_level level) +{ + bool maybe_pinned; + int i; + + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); /* - * If this page may have been pinned by the parent process, - * don't allow to duplicate the mapping but instead require to e.g., - * copy the page immediately for the child so that we'll always - * guarantee the pinned page won't be randomly replaced in the + * If this folio may have been pinned by the parent process, + * don't allow to duplicate the mappings but instead require to e.g., + * copy the subpage immediately for the child so that we'll always + * guarantee the pinned folio won't be randomly replaced in the * future on write faults. */ - if (likely(!is_device_private_page(page) && - unlikely(page_needs_cow_for_dma(vma, page)))) - return -EBUSY; + maybe_pinned = likely(!folio_is_device_private(folio)) && + unlikely(folio_needs_cow_for_dma(src_vma, folio)); - ClearPageAnonExclusive(page); /* - * It's okay to share the anon page between both processes, mapping - * the page R/O into both processes. + * No need to check+clear for already shared PTEs/PMDs of the + * folio. But if any page is PageAnonExclusive, we must fallback to + * copying if the folio maybe pinned. 
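+	 * With RMAP_LEVEL_PTE this works in two passes: first reject the
+	 * duplication if any subpage is still PageAnonExclusive while the
+	 * folio may be pinned; only then clear the bits and bump the
+	 * per-page mapcounts.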
*/ -dup: - __page_dup_rmap(page, compound); + switch (level) { + case RMAP_LEVEL_PTE: + if (unlikely(maybe_pinned)) { + for (i = 0; i < nr_pages; i++) + if (PageAnonExclusive(page + i)) + return -EBUSY; + } + do { + if (PageAnonExclusive(page)) + ClearPageAnonExclusive(page); + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + if (PageAnonExclusive(page)) { + if (unlikely(maybe_pinned)) + return -EBUSY; + ClearPageAnonExclusive(page); + } + atomic_inc(&folio->_entire_mapcount); + break; + } return 0; } /** - * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly - * shared to prepare for KSM or temporary unmapping - * @page: the exclusive anonymous page to try marking possibly shared + * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range + * of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * @src_vma: The vm area from which the mappings are duplicated + * + * The page range of the folio is defined by [page, page + nr_pages) * - * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated. + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. * - * This is similar to page_try_dup_anon_rmap(), however, not used during fork() - * to duplicate a mapping, but instead to prepare for KSM or temporarily - * unmapping a page (swap, migration) via page_remove_rmap(). + * Duplicating the mappings can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. * - * Marking the page shared can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. + * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. * - * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY - * otherwise. + * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. */ -static inline int page_try_share_anon_rmap(struct page *page) +static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma) { - VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); + return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, + RMAP_LEVEL_PTE); +} +#define folio_try_dup_anon_rmap_pte(folio, page, vma) \ + folio_try_dup_anon_rmap_ptes(folio, page, 1, vma) - /* device private pages cannot get pinned via GUP. */ - if (unlikely(is_device_private_page(page))) { +/** + * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range + * of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * @src_vma: The vm area from which the mapping is duplicated + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. + * + * Duplicating the mapping can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. 
+ * + * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. + * + * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. + */ +static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, + struct page *page, struct vm_area_struct *src_vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + +static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + /* device private folios cannot get pinned via GUP. */ + if (unlikely(folio_is_device_private(folio))) { ClearPageAnonExclusive(page); return 0; } @@ -349,7 +544,7 @@ static inline int page_try_share_anon_rmap(struct page *page) if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_mb(); - if (unlikely(page_maybe_dma_pinned(page))) + if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(page); @@ -362,6 +557,68 @@ static inline int page_try_share_anon_rmap(struct page *page) return 0; } +/** + * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page + * mapped by a PTE possibly shared to prepare + * for KSM or temporary unmapping + * @folio: The folio to share a mapping of + * @page: The mapped exclusive page + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during + * fork() to duplicate mappings, but instead to prepare for KSM or temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte(). + * + * Marking the mapped page shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped page possibly shared succeeded. Returns + * -EBUSY otherwise. + */ +static inline int folio_try_share_anon_rmap_pte(struct folio *folio, + struct page *page) +{ + return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); +} + +/** + * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page + * range mapped by a PMD possibly shared to + * prepare for temporary unmapping + * @folio: The folio to share the mapping of + * @page: The first page to share the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during + * fork() to duplicate a mapping, but instead to prepare for temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd(). + * + * Marking the mapped pages shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped pages possibly shared succeeded. Returns + * -EBUSY otherwise. 
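+ *
+ * A migration-style sketch (hypothetical caller and label, with the
+ * page table entries already cleared/invalidated):
+ *
+ *	if (folio_try_share_anon_rmap_pmd(folio, page))
+ *		goto keep_exclusive;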
+ */ +static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + /* * Called from mm/vmscan.c to handle paging out */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h new file mode 100644 index 000000000000..bc8af3eb5598 --- /dev/null +++ b/include/linux/rseq.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _LINUX_RSEQ_H +#define _LINUX_RSEQ_H + +#ifdef CONFIG_RSEQ + +#include <linux/preempt.h> +#include <linux/sched.h> + +/* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. + */ +enum rseq_event_mask_bits { + RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, + RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, + RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, +}; + +enum rseq_event_mask { + RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), + RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), + RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), +}; + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); + +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(ksig, regs); +} + +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ + preempt_disable(); + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + preempt_enable(); + rseq_handle_notify_resume(ksig, regs); +} + +/* rseq_preempt() requires preemption to be disabled. */ +static inline void rseq_preempt(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* rseq_migrate() requires preemption to be disabled. */ +static inline void rseq_migrate(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set. 
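+ * A CLONE_VM child (i.e. a new thread) must register its own rseq
+ * area via the rseq() syscall; the parent's registration points into
+ * the parent's thread-local memory.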
+ */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_VM) { + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; + } else { + t->rseq = current->rseq; + t->rseq_len = current->rseq_len; + t->rseq_sig = current->rseq_sig; + t->rseq_event_mask = current->rseq_event_mask; + } +} + +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; +} + +#else + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_preempt(struct task_struct *t) +{ +} +static inline void rseq_migrate(struct task_struct *t) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} + +#endif + +#ifdef CONFIG_DEBUG_RSEQ + +void rseq_syscall(struct pt_regs *regs); + +#else + +static inline void rseq_syscall(struct pt_regs *regs) +{ +} + +#endif + +#endif /* _LINUX_RSEQ_H */ diff --git a/include/linux/rslib.h b/include/linux/rslib.h index 238bb85243d3..a04dacbdc8ae 100644 --- a/include/linux/rslib.h +++ b/include/linux/rslib.h @@ -10,7 +10,6 @@ #ifndef _RSLIB_H_ #define _RSLIB_H_ -#include <linux/list.h> #include <linux/types.h> /* for gfp_t */ #include <linux/gfp.h> /* for GFP_KERNEL */ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3d6cf306cd55..410529fca18b 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -10,6 +10,13 @@ #include <uapi/linux/rtnetlink.h> extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); + +static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net, + u32 pid, u32 group, int echo) +{ + return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo); +} + extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); @@ -72,6 +79,18 @@ static inline bool lockdep_rtnl_is_held(void) #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) +/** + * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning + * its old value + * @rp: RCU pointer, whose value is returned + * @p: regular pointer + * + * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated + * pointer. 
The old value of @rp is returned, and @rp is set to @p + */ +#define rcu_replace_pointer_rtnl(rp, p) \ + rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); @@ -130,4 +149,26 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, extern void rtnl_offload_xstats_notify(struct net_device *dev); +static inline int rtnl_has_listeners(const struct net *net, u32 group) +{ + struct sock *rtnl = net->rtnl; + + return netlink_has_listeners(rtnl, group); +} + +/** + * rtnl_notify_needed - check if notification is needed + * @net: Pointer to the net namespace + * @nlflags: netlink ingress message flags + * @group: rtnl group + * + * Based on the ingress message flags and rtnl group, returns true + * if a notification is needed, false otherwise. + */ +static inline bool +rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) +{ + return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); +} + #endif /* __LINUX_RTNETLINK_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 1dd530ce8b45..9c29689ff505 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -203,11 +203,11 @@ extern void up_read(struct rw_semaphore *sem); extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) -DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) - -DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T)) -DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T)) +DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) +DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0) +DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) +DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) /* * downgrade write lock to read lock diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..9a66147915b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -10,33 +10,41 @@ #include <uapi/linux/sched.h> #include <asm/current.h> - -#include <linux/pid.h> -#include <linux/sem.h> +#include <asm/processor.h> +#include <linux/thread_info.h> +#include <linux/preempt.h> +#include <linux/cpumask.h> + +#include <linux/cache.h> +#include <linux/irqflags_types.h> +#include <linux/smp_types.h> +#include <linux/pid_types.h> +#include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> -#include <linux/mutex.h> -#include <linux/plist.h> -#include <linux/hrtimer.h> -#include <linux/irqflags.h> -#include <linux/seccomp.h> -#include <linux/nodemask.h> -#include <linux/rcupdate.h> -#include <linux/refcount.h> +#include <linux/mutex_types.h> +#include <linux/plist_types.h> +#include <linux/hrtimer_types.h> +#include <linux/timer_types.h> +#include <linux/seccomp_types.h> +#include <linux/nodemask_types.h> +#include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> -#include <linux/syscall_user_dispatch.h> +#include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> -#include <linux/posix-timers.h> -#include <linux/rseq.h> -#include <linux/seqlock.h> +#include <linux/posix-timers_types.h> +#include <linux/restart_block.h> +#include <uapi/linux/rseq.h> +#include 
<linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/livepatch_sched.h> +#include <linux/uidgid_types.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ @@ -63,11 +71,13 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; /* @@ -413,42 +423,6 @@ struct load_weight { u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - * - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg - * updates. When a task is dequeued, its util_est should not be updated if its - * util_avg has not been updated in the meantime. - * This information is mapped into the MSB bit of util_est.enqueued at dequeue - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg - * for a task) it is safe to use MSB. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -#define UTIL_AVG_UNCHANGED 0x80000000 -} __attribute__((__aligned__(sizeof(u64)))); - /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). @@ -503,9 +477,20 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. 
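+ *
+ * Sketch of recovering the plain estimate (assuming @avg points to a
+ * struct sched_avg):
+ *
+ *	unsigned int est = READ_ONCE(avg->util_est) & ~UTIL_AVG_UNCHANGED;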
+ */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; @@ -523,7 +508,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; @@ -553,7 +538,7 @@ struct sched_entity { struct load_weight load; struct rb_node run_node; u64 deadline; - u64 min_deadline; + u64 min_vruntime; struct list_head group_node; unsigned int on_rq; @@ -607,6 +592,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; @@ -654,6 +642,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -668,7 +657,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* @@ -795,6 +797,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE @@ -1561,114 +1564,6 @@ struct task_struct { */ }; -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->thread_pid; -} - -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. - * task_xid_nr_ns() : id seen from the ns specified; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); - -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} - -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} - -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} - - -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - * - * Return: 1 if the process is alive. 0 otherwise. 
- */ -static inline int pid_alive(const struct task_struct *p) -{ - return p->thread_pid != NULL; -} - -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} - -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} - - -static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} - -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} - -static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); -} - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); -} - -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - -/* Obsolete, do not use: */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return task_pgrp_nr_ns(tsk, &init_pid_ns); -} - #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1712,20 +1607,6 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } -/** - * is_global_init - check if a task structure is init. Since init - * is free to have sub-threads we need to check tgid. - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - * - * Return: 1 if the task structure is init. 0 otherwise. - */ -static inline int is_global_init(struct task_struct *tsk) -{ - return task_tgid_nr(tsk) == 1; -} - extern struct pid *cad_pid; /* @@ -1955,9 +1836,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; -#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif @@ -2177,15 +2056,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) -{ -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif -} - #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); @@ -2227,37 +2097,6 @@ static inline bool preempt_model_preemptible(void) return preempt_model_full() || preempt_model_rt(); } -/* - * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPTION, - * but a general need for low latency) - */ -static inline int spin_needbreak(spinlock_t *lock) -{ -#ifdef CONFIG_PREEMPTION - return spin_is_contended(lock); -#else - return 0; -#endif -} - -/* - * Check if a rwlock is contended. - * Returns non-zero if there is another task waiting on the rwlock. - * Returns zero if the lock is not contended or the system / underlying - * rwlock implementation does not support contention detection. - * Technically does not depend on CONFIG_PREEMPTION, but a general need - * for low latency. 
- */ -static inline int rwlock_needbreak(rwlock_t *lock) -{ -#ifdef CONFIG_PREEMPTION - return rwlock_is_contended(lock); -#else - return 0; -#endif -} - static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); @@ -2292,6 +2131,8 @@ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); +#include <linux/spinlock.h> + /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. @@ -2328,129 +2169,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ -#ifdef CONFIG_RSEQ - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); - -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); -} - -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); - rseq_handle_notify_resume(ksig, regs); -} - -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set. 
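The rseq_fork() logic moved out of this header encodes a user-visible rule: a restartable-sequences registration is inherited across fork() but cleared for CLONE_VM clones, so every new thread registers its own area. A hedged userspace sketch of that registration (DEMO_RSEQ_SIG is an arbitrary value; recent glibc registers rseq itself, in which case this call fails with EBUSY):

#include <linux/rseq.h>
#include <sys/syscall.h>
#include <unistd.h>

#define DEMO_RSEQ_SIG 0x53053053        /* arbitrary signature, sketch only */

static __thread struct rseq demo_rseq __attribute__((aligned(32)));

static long demo_register_rseq(void)
{
        /* Per-thread registration; only fork() children inherit it. */
        return syscall(__NR_rseq, &demo_rseq, sizeof(demo_rseq), 0,
                       DEMO_RSEQ_SIG);
}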
- */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; - } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; -} - -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} - -#endif - -#ifdef CONFIG_DEBUG_RSEQ - -void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif - #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index fe1a46f30d24..2b461129d1fa 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -2,6 +2,7 @@ #define _LINUX_SCHED_ISOLATION_H #include <linux/cpumask.h> +#include <linux/cpuset.h> #include <linux/init.h> #include <linux/tick.h> @@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type) static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || - !housekeeping_test_cpu(cpu, HK_TYPE_TICK); + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..4b7664c56208 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -9,6 +9,7 @@ #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> +#include <linux/pid.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> @@ -432,7 +433,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. 
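With the cpuset_cpu_is_isolated() term added above, cpu_is_isolated() now covers CPUs placed in isolated cpuset partitions at runtime, not only boot-time isolcpus/nohz_full housekeeping. A minimal sketch of the usual consumer pattern, skipping such CPUs when spreading background work:

#include <linux/cpumask.h>
#include <linux/sched/isolation.h>

static void demo_spread_background_work(void)
{
        int cpu;

        for_each_online_cpu(cpu) {
                /* Now also true for CPUs in isolated cpuset partitions. */
                if (cpu_is_isolated(cpu))
                        continue;
                /* ...queue per-CPU housekeeping work here (sketch only)... */
        }
}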
*/ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); @@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a23af225c898..d362aacf9f89 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -7,6 +7,8 @@ * functionality: */ +#include <linux/rcupdate.h> +#include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> @@ -226,4 +228,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index f158b025c175..ccd72b978e1f 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/magic.h> +#include <linux/refcount.h> #ifdef CONFIG_THREAD_INFO_IN_TASK diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index de545ba85218..a6e04b4a21d7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 175079552f68..709ad84809e1 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,6 +3,7 @@ #define _LINUX_SECCOMP_H #include <uapi/linux/seccomp.h> +#include <linux/seccomp_types.h> #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_LOG | \ @@ -21,25 +22,6 @@ #include <linux/atomic.h> #include <asm/seccomp.h> -struct seccomp_filter; -/** - * struct seccomp - the state of a seccomp'ed process - * - * @mode: indicates one of the valid values above for controlled - * system calls available to a process. - * @filter_count: number of seccomp filters - * @filter: must always point to a valid seccomp-filter or NULL as it is - * accessed without locking during system call entry. - * - * @filter must only be accessed from the context of current as there - * is no read locking. 
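Two of the additions above in one hedged sketch: the DEFINE_GUARD() wrapper pairs task_lock()/task_unlock() automatically at scope exit, and for_other_threads() visits every thread of a group except the one passed in. The locking shown is illustrative only; thread iteration normally runs under RCU or tasklist_lock:

#include <linux/cleanup.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>

static void demo_inspect(struct task_struct *p)
{
        struct task_struct *t;

        scoped_guard(task_lock, p) {
                /* p->alloc_lock is held here, dropped at the closing brace. */
        }

        rcu_read_lock();
        for_other_threads(p, t)
                pr_info("thread %d is in p's group\n", task_pid_nr(t));
        rcu_read_unlock();
}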
- */ -struct seccomp { - int mode; - atomic_t filter_count; - struct seccomp_filter *filter; -}; - #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) @@ -64,8 +46,6 @@ static inline int seccomp_mode(struct seccomp *s) #include <linux/errno.h> -struct seccomp { }; -struct seccomp_filter { }; struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER @@ -126,6 +106,8 @@ static inline long seccomp_get_metadata(struct task_struct *task, #ifdef CONFIG_SECCOMP_CACHE_DEBUG struct seq_file; +struct pid_namespace; +struct pid; int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); diff --git a/include/linux/seccomp_types.h b/include/linux/seccomp_types.h new file mode 100644 index 000000000000..cf0a0355024f --- /dev/null +++ b/include/linux/seccomp_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SECCOMP_TYPES_H +#define _LINUX_SECCOMP_TYPES_H + +#include <linux/types.h> + +#ifdef CONFIG_SECCOMP + +struct seccomp_filter; +/** + * struct seccomp - the state of a seccomp'ed process + * + * @mode: indicates one of the valid values above for controlled + * system calls available to a process. + * @filter_count: number of seccomp filters + * @filter: must always point to a valid seccomp-filter or NULL as it is + * accessed without locking during system call entry. + * + * @filter must only be accessed from the context of current as there + * is no read locking. + */ +struct seccomp { + int mode; + atomic_t filter_count; + struct seccomp_filter *filter; +}; + +#else + +struct seccomp { }; +struct seccomp_filter { }; + +#endif + +#endif /* _LINUX_SECCOMP_TYPES_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881..d0eb20f90b26 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/sockptr.h> +#include <uapi/linux/lsm.h> struct linux_binprm; struct cred; @@ -60,6 +61,7 @@ struct fs_parameter; enum fs_value_type; struct watch; struct watch_notification; +struct lsm_ctx; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -138,6 +140,8 @@ enum lockdown_reason { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; +extern u32 lsm_active_cnt; +extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, @@ -261,6 +265,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb); /* prototypes */ extern int security_init(void); extern int early_security_init(void); +extern u64 lsm_name_to_attr(const char *name); /* Security operations */ int security_binder_set_context_mgr(const struct cred *mgr); @@ -389,6 +394,8 @@ int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int security_file_ioctl_compat(struct file *file, unsigned int cmd, + unsigned long arg); int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags); int security_mmap_addr(unsigned long addr); @@ -470,10 +477,13 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); 
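A hedged sketch of the consumer side of the new self-attribute plumbing: roughly how an LSM's getselfattr hook could hand one attribute back through lsm_fill_user_ctx(). Only the lsm_fill_user_ctx() prototype and the uapi constants come from this patch; the demo_ names and label are hypothetical, and real hooks return the number of attributes written:

#include <linux/security.h>     /* now pulls in <uapi/linux/lsm.h> */

static int demo_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
                            size_t *size, u32 flags)
{
        static const char label[] = "demo:unconfined";  /* hypothetical */
        int rc;

        if (attr != LSM_ATTR_CURRENT)
                return -EOPNOTSUPP;

        /* Copies the value out and updates *size; returns 0 on success. */
        rc = lsm_fill_user_ctx(ctx, size, (void *)label, sizeof(label),
                               LSM_ID_UNDEF /* a real LSM uses its own id */, 0);
        return rc ? rc : 1;     /* one attribute reported */
}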
void security_d_instantiate(struct dentry *dentry, struct inode *inode); -int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags); +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t size, u32 flags); +int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); -int security_setprocattr(const char *lsm, const char *name, void *value, - size_t size); +int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); @@ -484,6 +494,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); +int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, + void *val, size_t val_len, u64 id, u64 flags); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -501,6 +513,11 @@ static inline int unregister_blocking_lsm_notifier(struct notifier_block *nb) return 0; } +static inline u64 lsm_name_to_attr(const char *name) +{ + return LSM_ATTR_UNDEF; +} + static inline void security_free_mnt_opts(void **mnt_opts) { } @@ -987,6 +1004,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd, return 0; } +static inline int security_file_ioctl_compat(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + return 0; +} + static inline int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { @@ -1337,14 +1361,28 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } -static inline int security_getprocattr(struct task_struct *p, const char *lsm, +static inline int security_getselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int security_setselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t size, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { return -EINVAL; } -static inline int security_setprocattr(const char *lsm, char *name, - void *value, size_t size) +static inline int security_setprocattr(int lsmid, char *name, void *value, + size_t size) { return -EINVAL; } @@ -1395,6 +1433,12 @@ static inline int security_locked_down(enum lockdown_reason what) { return 0; } +static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, + size_t *uctx_len, void *val, size_t val_len, + u64 id, u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_SECURITY */ #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) diff --git a/include/linux/sem.h b/include/linux/sem.h index 5608a500c43e..c4deefe42aeb 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -3,25 +3,17 @@ #define _LINUX_SEM_H #include <uapi/linux/sem.h> +#include <linux/sem_types.h> struct task_struct; -struct sem_undo_list; #ifdef CONFIG_SYSVIPC -struct sysv_sem { - struct sem_undo_list *undo_list; -}; - extern int copy_semundo(unsigned long clone_flags, struct task_struct 
*tsk); extern void exit_sem(struct task_struct *tsk); #else -struct sysv_sem { - /* empty */ -}; - static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) { return 0; diff --git a/include/linux/sem_types.h b/include/linux/sem_types.h new file mode 100644 index 000000000000..73df1971a7ae --- /dev/null +++ b/include/linux/sem_types.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SEM_TYPES_H +#define _LINUX_SEM_TYPES_H + +struct sem_undo_list; + +struct sysv_sem { +#ifdef CONFIG_SYSVIPC + struct sem_undo_list *undo_list; +#endif +}; + +#endif /* _LINUX_SEM_TYPES_H */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e92f9d5577ba..d90d8ee29d81 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -18,6 +18,7 @@ #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> +#include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> @@ -37,37 +38,6 @@ */ #define KCSAN_SEQLOCK_REGION_MAX 1000 -/* - * Sequence counters (seqcount_t) - * - * This is the raw counting mechanism, without any writer protection. - * - * Write side critical sections must be serialized and non-preemptible. - * - * If readers can be invoked from hardirq or softirq contexts, - * interrupts or bottom halves must also be respectively disabled before - * entering the write section. - * - * This mechanism can't be used if the protected data contains pointers, - * as the writer can invalidate a pointer that a reader is following. - * - * If the write serialization mechanism is one of the common kernel - * locking primitives, use a sequence counter with associated lock - * (seqcount_LOCKNAME_t) instead. - * - * If it's desired to automatically handle the sequence counter writer - * serialization and non-preemptibility requirements, use a sequential - * lock (seqlock_t) instead. - * - * See Documentation/locking/seqlock.rst - */ -typedef struct seqcount { - unsigned sequence; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} seqcount_t; - static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { @@ -132,28 +102,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) */ /* - * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot - * disable preemption. It can lead to higher latencies, and the write side - * sections will not be able to acquire locks which become sleeping locks - * (e.g. spinlock_t). - * - * To remain preemptible while avoiding a possible livelock caused by the - * reader preempting the writer, use a different technique: let the reader - * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the - * case, acquire then release the associated LOCKNAME writer serialization - * lock. This will allow any possibly-preempted writer to make progress - * until the end of its writer serialization lock critical section. - * - * This lock-unlock technique must be implemented for all of PREEMPT_RT - * sleeping locks. 
See Documentation/locking/locktypes.rst - */ -#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) -#define __SEQ_LOCK(expr) expr -#else -#define __SEQ_LOCK(expr) -#endif - -/* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock @@ -194,11 +142,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ -typedef struct seqcount_##lockname { \ - seqcount_t seqcount; \ - __SEQ_LOCK(locktype *lock); \ -} seqcount_##lockname##_t; \ - \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ @@ -284,6 +227,7 @@ SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) +#undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t @@ -794,25 +738,6 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s) smp_wmb(); /* increment "sequence" before following stores */ } -/* - * Sequential locks (seqlock_t) - * - * Sequence counters with an embedded spinlock for writer serialization - * and non-preemptibility. - * - * For more info, see: - * - Comments on top of seqcount_t - * - Documentation/locking/seqlock.rst - */ -typedef struct { - /* - * Make sure that readers don't starve writers on PREEMPT_RT: use - * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). - */ - seqcount_spinlock_t seqcount; - spinlock_t lock; -} seqlock_t; - #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ diff --git a/include/linux/seqlock_types.h b/include/linux/seqlock_types.h new file mode 100644 index 000000000000..dfdf43e3fa3d --- /dev/null +++ b/include/linux/seqlock_types.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_SEQLOCK_TYPES_H +#define __LINUX_SEQLOCK_TYPES_H + +#include <linux/lockdep_types.h> +#include <linux/mutex_types.h> +#include <linux/spinlock_types.h> + +/* + * Sequence counters (seqcount_t) + * + * This is the raw counting mechanism, without any writer protection. + * + * Write side critical sections must be serialized and non-preemptible. + * + * If readers can be invoked from hardirq or softirq contexts, + * interrupts or bottom halves must also be respectively disabled before + * entering the write section. + * + * This mechanism can't be used if the protected data contains pointers, + * as the writer can invalidate a pointer that a reader is following. + * + * If the write serialization mechanism is one of the common kernel + * locking primitives, use a sequence counter with associated lock + * (seqcount_LOCKNAME_t) instead. + * + * If it's desired to automatically handle the sequence counter writer + * serialization and non-preemptibility requirements, use a sequential + * lock (seqlock_t) instead. + * + * See Documentation/locking/seqlock.rst + */ +typedef struct seqcount { + unsigned sequence; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} seqcount_t; + +/* + * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot + * disable preemption. 
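Moving seqcount_t and seqlock_t into seqlock_types.h changes no semantics: writers still serialize through the embedded spinlock and readers loop until they observe a stable sequence. A minimal reader/writer sketch with the long-standing API:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_seqlock);
static u64 demo_a, demo_b;

static void demo_write_pair(u64 a, u64 b)
{
        write_seqlock(&demo_seqlock);   /* takes the embedded spinlock */
        demo_a = a;
        demo_b = b;
        write_sequnlock(&demo_seqlock);
}

static u64 demo_read_sum(void)
{
        unsigned int seq;
        u64 sum;

        do {
                seq = read_seqbegin(&demo_seqlock);
                sum = demo_a + demo_b;
        } while (read_seqretry(&demo_seqlock, seq));    /* raced a writer? retry */

        return sum;
}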
It can lead to higher latencies, and the write side + * sections will not be able to acquire locks which become sleeping locks + * (e.g. spinlock_t). + * + * To remain preemptible while avoiding a possible livelock caused by the + * reader preempting the writer, use a different technique: let the reader + * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the + * case, acquire then release the associated LOCKNAME writer serialization + * lock. This will allow any possibly-preempted writer to make progress + * until the end of its writer serialization lock critical section. + * + * This lock-unlock technique must be implemented for all of PREEMPT_RT + * sleeping locks. See Documentation/locking/locktypes.rst + */ +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) +#define __SEQ_LOCK(expr) expr +#else +#define __SEQ_LOCK(expr) +#endif + +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ +typedef struct seqcount_##lockname { \ + seqcount_t seqcount; \ + __SEQ_LOCK(locktype *lock); \ +} seqcount_##lockname##_t; + +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) +#undef SEQCOUNT_LOCKNAME + +/* + * Sequential locks (seqlock_t) + * + * Sequence counters with an embedded spinlock for writer serialization + * and non-preemptibility. + * + * For more info, see: + * - Comments on top of seqcount_t + * - Documentation/locking/seqlock.rst + */ +typedef struct { + /* + * Make sure that readers don't starve writers on PREEMPT_RT: use + * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). + */ + seqcount_spinlock_t seqcount; + spinlock_t lock; +} seqlock_t; + +#endif /* __LINUX_SEQLOCK_TYPES_H */ diff --git a/include/linux/shm.h b/include/linux/shm.h index d8e69aed3d32..c55bef0538e5 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -2,12 +2,12 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ -#include <linux/list.h> +#include <linux/types.h> #include <asm/page.h> -#include <uapi/linux/shm.h> #include <asm/shmparam.h> struct file; +struct task_struct; #ifdef CONFIG_SYSVIPC struct sysv_shm { diff --git a/include/linux/signal.h b/include/linux/signal.h index 3b98e7a28538..f19816832f05 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -3,6 +3,7 @@ #define _LINUX_SIGNAL_H #include <linux/bug.h> +#include <linux/list.h> #include <linux/signal_types.h> #include <linux/string.h> diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index a70b2bdbf4d9..caf4f7a59ab9 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -6,7 +6,7 @@ * Basic signal handling related data type definitions: */ -#include <linux/list.h> +#include <linux/types.h> #include <uapi/linux/signal.h> typedef struct kernel_siginfo { diff --git a/include/linux/sizes.h b/include/linux/sizes.h index 84aa448d8bb3..c3a00b967d18 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -47,8 +47,17 @@ #define SZ_8G _AC(0x200000000, ULL) #define SZ_16G _AC(0x400000000, ULL) #define SZ_32G _AC(0x800000000, ULL) +#define SZ_64G _AC(0x1000000000, ULL) +#define SZ_128G _AC(0x2000000000, ULL) +#define SZ_256G _AC(0x4000000000, ULL) +#define SZ_512G _AC(0x8000000000, ULL) #define SZ_1T _AC(0x10000000000, ULL) +#define SZ_2T _AC(0x20000000000, ULL) +#define SZ_4T _AC(0x40000000000, ULL) +#define SZ_8T _AC(0x80000000000, ULL) +#define SZ_16T 
_AC(0x100000000000, ULL) +#define SZ_32T _AC(0x200000000000, ULL) #define SZ_64T _AC(0x400000000000, ULL) #endif /* __LINUX_SIZES_H__ */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..a5ae952454c8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -566,6 +566,15 @@ struct ubuf_info_msgzc { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); +/* Preserve some data across TX submission and completion. + * + * Note, this state is stored in the driver. Extending the layout + * might need some special care. + */ +struct xsk_tx_metadata_compl { + __u64 *tx_timestamp; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -578,7 +587,10 @@ struct skb_shared_info { /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; - struct skb_shared_hwtstamps hwtstamps; + union { + struct skb_shared_hwtstamps hwtstamps; + struct xsk_tx_metadata_compl xsk_meta; + }; unsigned int gso_type; u32 tskey; @@ -742,7 +754,6 @@ typedef unsigned char *sk_buff_data_t; * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) - * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header @@ -776,7 +787,6 @@ typedef unsigned char *sk_buff_data_t; * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) - * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash @@ -1057,7 +1067,7 @@ struct sk_buff { refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS - /* only useable after checking ->active_extensions != 0 */ + /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; @@ -3299,7 +3309,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. - * 1. This is for device Rx, therefor a cold page is preferred. + * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page @@ -3997,6 +4007,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); @@ -4235,10 +4246,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). 
*/ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + BITS_PER_LONG != 64) + goto slow; + + /* Using more efficient variant than plain call to memcmp(). */ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) @@ -4258,11 +4272,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; + default: +slow: + return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; -#else - return memcmp(a - meta_len, b - meta_len, meta_len); -#endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c953b8c0d2f4..888a4b217829 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -100,6 +100,11 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + /* psock_update_sk_prot may be called with restore=false many times + * so the handler must be safe for this case. It will be called + * exactly once with restore=true when the psock is being destroyed + * and psock refcnt is zero, but before an RCU grace period. + */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..b5f5ee8308d0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -24,7 +24,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. + * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) @@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void) * Kmalloc array related definitions */ -#ifdef CONFIG_SLAB /* - * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
*/ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) -#ifndef KMALLOC_SHIFT_LOW -#define KMALLOC_SHIFT_LOW 5 -#endif -#endif - -#ifdef CONFIG_SLUB -#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -#endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) @@ -788,12 +778,4 @@ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); -#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -int slab_prepare_cpu(unsigned int cpu); -int slab_dead_cpu(unsigned int cpu); -#else -#define slab_prepare_cpu NULL -#define slab_dead_cpu NULL -#endif - #endif /* _LINUX_SLAB_H */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h deleted file mode 100644 index a61e7d55d0d3..000000000000 --- a/include/linux/slab_def.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLAB_DEF_H -#define _LINUX_SLAB_DEF_H - -#include <linux/kfence.h> -#include <linux/reciprocal_div.h> - -/* - * Definitions unique to the original Linux SLAB allocator. - */ - -struct kmem_cache { - struct array_cache __percpu *cpu_cache; - -/* 1) Cache tunables. Protected by slab_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int size; - struct reciprocal_value reciprocal_buffer_size; -/* 2) touched by every alloc & free from the backend */ - - slab_flags_t flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 3) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t allocflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int freelist_size; - - /* constructor func */ - void (*ctor)(void *obj); - -/* 4) cache creation/removal */ - const char *name; - struct list_head list; - int refcount; - int object_size; - int align; - -/* 5) statistics */ -#ifdef CONFIG_DEBUG_SLAB - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; - - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. 'size' contains the total - * object size including these internal fields, while 'obj_offset' - * and 'object_size' contain the offset to the user object and its - * size. 
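With SLAB gone there is a single set of KMALLOC_SHIFT_* limits: slab-backed kmalloc tops out at an order-1 page, and larger requests go straight to the page allocator up to MAX_PAGE_ORDER + PAGE_SHIFT. Callers that can use a bucket's slack may still query the rounded size; a small sketch:

#include <linux/slab.h>

static void *demo_alloc_full_bucket(size_t want, size_t *got)
{
        /* e.g. want = 1000 rounds up to the 1024-byte kmalloc bucket */
        size_t full = kmalloc_size_roundup(want);
        void *buf = kmalloc(full, GFP_KERNEL);

        if (buf)
                *got = full;    /* the whole bucket is usable */
        return buf;
}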
- */ - int obj_offset; -#endif /* CONFIG_DEBUG_SLAB */ - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) -{ - void *object = x - (x - slab->s_mem) % cache->size; - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - if (is_kfence_address(slab_address(slab))) - return 1; - return cache->num; -} - -#endif /* _LINUX_SLAB_DEF_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h deleted file mode 100644 index deb90cf4bffb..000000000000 --- a/include/linux/slub_def.h +++ /dev/null @@ -1,204 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLUB_DEF_H -#define _LINUX_SLUB_DEF_H - -/* - * SLUB : A Slab allocator without object queues. - * - * (C) 2007 SGI, Christoph Lameter - */ -#include <linux/kfence.h> -#include <linux/kobject.h> -#include <linux/reciprocal_div.h> -#include <linux/local_lock.h> - -enum stat_item { - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ - FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ - FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ - FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ - ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ - CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ - NR_SLUB_STAT_ITEMS -}; - -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are 
still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - union { - struct { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_aba_t freelist_tid; - }; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ -#endif - local_lock_t lock; /* Protects the fields above */ -#ifdef CONFIG_SLUB_STATS - unsigned stat[NR_SLUB_STAT_ITEMS]; -#endif -}; -#endif /* CONFIG_SLUB_TINY */ - -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - -/* - * Word size structure that can be atomically updated or read and that - * contains both the order and the number of objects that a slab of the - * given order would contain. - */ -struct kmem_cache_order_objects { - unsigned int x; -}; - -/* - * Slab cache management. - */ -struct kmem_cache { -#ifndef CONFIG_SLUB_TINY - struct kmem_cache_cpu __percpu *cpu_slab; -#endif - /* Used for retrieving partial slabs, etc. */ - slab_flags_t flags; - unsigned long min_partial; - unsigned int size; /* The size of an object including metadata */ - unsigned int object_size;/* The size of an object without metadata */ - struct reciprocal_value reciprocal_size; - unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif - struct kmem_cache_order_objects oo; - - /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects min; - gfp_t allocflags; /* gfp flags to use on each alloc */ - int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(void *); - unsigned int inuse; /* Offset to metadata */ - unsigned int align; /* Alignment */ - unsigned int red_left_pad; /* Left redzone padding size */ - const char *name; /* Name (only for display!) */ - struct list_head list; /* List of slab caches */ -#ifdef CONFIG_SYSFS - struct kobject kobj; /* For sysfs */ -#endif -#ifdef CONFIG_SLAB_FREELIST_HARDENED - unsigned long random; -#endif - -#ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. 
- */ - unsigned int remote_node_defrag_ratio; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) -#define SLAB_SUPPORTS_SYSFS -void sysfs_slab_unlink(struct kmem_cache *); -void sysfs_slab_release(struct kmem_cache *); -#else -static inline void sysfs_slab_unlink(struct kmem_cache *s) -{ -} -static inline void sysfs_slab_release(struct kmem_cache *s) -{ -} -#endif - -void *fixup_red_left(struct kmem_cache *s, void *p); - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) { - void *object = x - (x - slab_address(slab)) % cache->size; - void *last_object = slab_address(slab) + - (slab->objects - 1) * cache->size; - void *result = (unlikely(object > last_object)) ? last_object : object; - - result = fixup_red_left(cache, result); - return result; -} - -/* Determine object index from a given position */ -static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) -{ - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); -} - -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - if (is_kfence_address(obj)) - return 0; - return __obj_to_index(cache, slab_address(slab), obj); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - return slab->objects; -} -#endif /* _LINUX_SLUB_DEF_H */ diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index fc456f75c131..8c9ca857ccf6 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -161,24 +161,6 @@ int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message, struct completion *completion, bool atomic); /* - * Send a message to the given endpoint and wait until it has been submitted - * to the hardware FIFO. - * Will return zero on success and a negative error code on failure - * (e.g. -ETIME when the message couldn't be written within the given - * timeout) - * - * @rtk: RTKit reference - * @ep: target endpoint - * @message: message to be sent - * @timeout: timeout in milliseconds to allow the message transmission - * to be completed - * @atomic: if set to true this function can be called from atomic - * context. - */ -int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message, - unsigned long timeout, bool atomic); - -/* * Process incoming messages in atomic context. 
* This only guarantees that messages arrive as far as the recv_message_early * callback; drivers expecting to handle incoming messages synchronously diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 2475ef914746..4885b065b849 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -62,6 +62,14 @@ enum mtk_ddp_comp_id { DDP_COMPONENT_OVL_2L1, DDP_COMPONENT_OVL_2L2, DDP_COMPONENT_OVL1, + DDP_COMPONENT_PADDING0, + DDP_COMPONENT_PADDING1, + DDP_COMPONENT_PADDING2, + DDP_COMPONENT_PADDING3, + DDP_COMPONENT_PADDING4, + DDP_COMPONENT_PADDING5, + DDP_COMPONENT_PADDING6, + DDP_COMPONENT_PADDING7, DDP_COMPONENT_POSTMASK0, DDP_COMPONENT_PWM0, DDP_COMPONENT_PWM1, diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 6b0a7dc48a4b..f866d5c8ed32 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -233,6 +233,8 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem) * limitations) * @supports_op: check if an operation is supported by the controller * @exec_op: execute a SPI memory operation + * not all drivers provide supports_op(), so it can return -EOPNOTSUPP + * if the op is not supported by the driver/controller * @get_name: get a custom name for the SPI mem device from the controller. * This might be needed if the controller driver has been ported * to use the SPI mem layer and a custom name is used to keep diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5..471fe2ff9066 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -20,6 +20,9 @@ #include <uapi/linux/spi/spi.h> +/* Max no. of CS supported per spi device */ +#define SPI_CS_CNT_MAX 4 + struct dma_chan; struct software_node; struct ptp_system_timestamp; @@ -132,7 +135,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Chipselect, distinguishing chips handled by @controller. + * @chip_select: Array of physical chipselects; spi->chip_select[i] gives + * the corresponding physical CS for logical CS i. * @mode: The spi mode defines how data is clocked out and in. * This may be changed by the device's driver. * The "active low" default for chipselect mode can be overridden @@ -157,8 +161,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: GPIO descriptor of the chipselect line (optional, NULL when - * not using a GPIO line) + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -167,6 +171,7 @@ * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. * @pcpu_statistics: statistics for the spi_device + * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * * A @spi_device is used to interchange data between an SPI slave * (usually a discrete chip) and CPU memory.
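With chip_select and cs_gpiod becoming arrays, code is expected to go through the per-index accessors (shown in the next hunk) rather than touch the fields directly. A hedged controller-side sketch of driving every chip select flagged in cs_index_mask:

#include <linux/bits.h>
#include <linux/gpio/consumer.h>
#include <linux/spi/spi.h>

static void demo_set_active_cs(struct spi_device *spi, bool enable)
{
        u8 idx;

        for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
                if (!(spi->cs_index_mask & BIT(idx)))
                        continue;       /* this logical CS is not in use */
                if (spi_get_csgpiod(spi, idx))
                        gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx),
                                                 enable ? 1 : 0);
                /* else the native CS spi_get_chipselect(spi, idx) is
                 * toggled by the controller hardware (sketch only) */
        }
}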
@@ -182,7 +187,7 @@ struct spi_device { struct spi_controller *controller; struct spi_controller *master; /* Compatibility layer */ u32 max_speed_hz; - u8 chip_select; + u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -213,7 +218,7 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod; /* Chip select GPIO descriptor */ + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ struct spi_delay word_delay; /* Inter-word delay */ /* CS delays */ struct spi_delay cs_setup; @@ -223,6 +228,13 @@ struct spi_device { /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; + /* Bit mask of the chipselect(s) that the driver needs to use from + * the chipselect array. When the controller is capable of handling + * multiple chip selects and memories are connected in parallel, + * more than one bit needs to be set in cs_index_mask. + */ + u32 cs_index_mask : SPI_CS_CNT_MAX; + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: @@ -279,22 +291,33 @@ static inline void *spi_get_drvdata(const struct spi_device *spi) static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx) { - return spi->chip_select; + return spi->chip_select[idx]; } static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect) { - spi->chip_select = chipselect; + spi->chip_select[idx] = chipselect; } static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx) { - return spi->cs_gpiod; + return spi->cs_gpiod[idx]; } static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod) { - spi->cs_gpiod = csgpiod; + spi->cs_gpiod[idx] = csgpiod; +} + +static inline bool spi_is_csgpiod(struct spi_device *spi) +{ + u8 idx; + + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi_get_csgpiod(spi, idx)) + return true; + } + return false; } /** @@ -399,6 +422,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use + * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert + * more than one chip select at once. * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. @@ -461,10 +486,13 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem - * can issue the next transfer. Note: transfer_one and - * transfer_one_message are mutually exclusive; when both - * are set, the generic subsystem does not call your - * transfer_one callback. + * can issue the next transfer. If the transfer fails, the + * driver must set the flag SPI_TRANS_FAIL_IO in + * spi_transfer->error first, before calling + * spi_finalize_current_transfer(). + * Note: transfer_one and transfer_one_message are mutually + * exclusive; when both are set, the generic subsystem does + * not call your transfer_one callback.
* @handle_err: the subsystem calls the driver to handle an error that occurs * in the generic implementation of transfer_one_message(). * @mem_ops: optimized/dedicated operations for interactions with SPI memory. @@ -567,6 +595,11 @@ struct spi_controller { #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ #define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select slave */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ + /* + * The spi-controller has multi chip select capability and can + * assert/de-assert more than one chip select at once. + */ +#define SPI_CONTROLLER_MULTI_CS BIT(7) /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; @@ -677,7 +710,8 @@ struct spi_controller { bool rt; bool auto_runtime_pm; bool cur_msg_mapped; - char last_cs; + char last_cs[SPI_CS_CNT_MAX]; + char last_cs_index_mask; bool last_cs_mode_high; bool fallback; struct completion xfer_completion; @@ -1040,6 +1074,7 @@ struct spi_transfer { unsigned len; #define SPI_TRANS_FAIL_NO_START BIT(0) +#define SPI_TRANS_FAIL_IO BIT(1) u16 error; dma_addr_t tx_dma; @@ -1638,8 +1673,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) /* Compatibility layer */ #define spi_master spi_controller -#define SPI_MASTER_HALF_DUPLEX SPI_CONTROLLER_HALF_DUPLEX - #define spi_master_get_devdata(_ctlr) spi_controller_get_devdata(_ctlr) #define spi_master_set_devdata(_ctlr, _data) \ spi_controller_set_devdata(_ctlr, _data) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 31d3d747a9db..eaac8b0da25b 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -449,6 +449,37 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } +/* + * Does a critical section need to be broken due to another + * task waiting?: (technically does not depend on CONFIG_PREEMPTION, + * but a general need for low latency) + */ +static inline int spin_needbreak(spinlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return spin_is_contended(lock); +#else + return 0; +#endif +} + +/* + * Check if a rwlock is contended. + * Returns non-zero if there is another task waiting on the rwlock. + * Returns zero if the lock is not contended or the system / underlying + * rwlock implementation does not support contention detection. + * Technically does not depend on CONFIG_PREEMPTION, but a general need + * for low latency. 
+ */ +static inline int rwlock_needbreak(rwlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return rwlock_is_contended(lock); +#else + return 0; +#endif +} + #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ @@ -507,6 +538,8 @@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) @@ -515,23 +548,62 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, + raw_spin_trylock_irqsave(_T->lock, _T->flags)) + DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, + spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, + spin_trylock_irqsave(_T->lock, _T->flags)) + +DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, + read_lock(_T->lock), + read_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, + read_lock_irq(_T->lock), + read_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, + read_lock_irqsave(_T->lock, _T->flags), + read_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + +DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, + write_lock(_T->lock), + write_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, + write_lock_irq(_T->lock), + write_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, + write_lock_irqsave(_T->lock, _T->flags), + write_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/splice.h b/include/linux/splice.h index 6c461573434d..9dec4861d09f 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -68,28 +68,37 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, typedef int (splice_direct_actor)(struct pipe_inode_info *, struct splice_desc *); -extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int, - splice_actor *); -extern ssize_t __splice_from_pipe(struct pipe_inode_info *, - struct splice_desc *, splice_actor *); -extern ssize_t splice_to_pipe(struct pipe_inode_info *, - struct splice_pipe_desc *); -extern ssize_t add_to_pipe(struct pipe_inode_info *, - struct pipe_buffer *); -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); -extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); -extern long do_splice(struct file *in, loff_t *off_in, - struct file *out, loff_t *off_out, - size_t len, unsigned int flags); 
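Back to the spinlock guard additions above: the new _try variants plug into the conditional-guard machinery of <linux/cleanup.h>. A hedged sketch (assuming scoped_cond_guard() from that header) of bailing out instead of spinning:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static int demo_try_update(spinlock_t *lock, int *val)
{
        /* Body runs only if spin_trylock() succeeds, else returns -EBUSY;
         * the lock is dropped automatically at the end of the scope. */
        scoped_cond_guard(spinlock_try, return -EBUSY, lock) {
                (*val)++;
        }
        return 0;
}

The same pattern works for the irq/irqsave variants and, unconditionally, for the new read_lock/write_lock guards.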
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor); +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, + struct splice_desc *sd, splice_actor *actor); +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf); +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); +ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, + splice_direct_actor *actor); +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); -extern long do_tee(struct file *in, struct file *out, size_t len, - unsigned int flags); -extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags); +static inline long splice_copy_file_range(struct file *in, loff_t pos_in, + struct file *out, loff_t pos_out, + size_t len) +{ + return splice_file_range(in, &pos_in, out, &pos_out, len); +} + +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags); +ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); /* * for dynamic pipe sizing diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e58306783d8e..adcbb8f23600 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,8 +11,6 @@ * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free * stack traces often repeat, using stack depot allows to save about 100x space. * - * Stack traces are never removed from the stack depot. - * * Author: Alexander Potapenko <glider@google.com> * Copyright (C) 2016 Google, Inc. * @@ -32,6 +30,18 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +typedef u32 depot_flags_t; + +/* + * Flags that can be passed to stack_depot_save_flags(); see the comment next + * to its declaration for more details. + */ +#define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) +#define STACK_DEPOT_FLAG_GET ((depot_flags_t)0x0002) + +#define STACK_DEPOT_FLAGS_NUM 2 +#define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) + /* * Using stack depot requires its initialization, which can be done in 3 ways: * @@ -69,31 +79,39 @@ static inline int stack_depot_early_init(void) { return 0; } #endif /** - * __stack_depot_save - Save a stack trace to stack depot + * stack_depot_save_flags - Save a stack trace to stack depot * * @entries: Pointer to the stack trace * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) + * @depot_flags: Stack depot flags + * + * Saves a stack trace from @entries array of size @nr_entries. + * + * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can + * replenish the stack pools in case no space is left (allocates using GFP + * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and + * fails if no space is left to store the stack trace. 
 *
- * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
- * %true, stack depot can replenish the stack pools in case no space is left
- * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids
- * any allocations and fails if no space is left to store the stack trace.
+ * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
+ * the refcount on the saved stack trace if it already exists in stack depot.
+ * Users of this flag must also call stack_depot_put() when keeping the stack
+ * trace is no longer required to avoid overflowing the refcount.
 *
 * If the provided stack trace comes from the interrupt context, only the part
 * up to the interrupt entry is saved.
 *
- * Context: Any context, but setting @can_alloc to %false is required if
+ * Context: Any context, but not setting STACK_DEPOT_FLAG_CAN_ALLOC is required if
 *          alloc_pages() cannot be used from the current context. Currently
 *          this is the case for contexts where neither %GFP_ATOMIC nor
 *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
 *
 * Return: Handle of the stack struct stored in depot, 0 on failure
 */
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
-					unsigned int nr_entries,
-					gfp_t gfp_flags, bool can_alloc);
+depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
+					    unsigned int nr_entries,
+					    gfp_t gfp_flags,
+					    depot_flags_t depot_flags);
 
 /**
  * stack_depot_save - Save a stack trace to stack depot
@@ -102,8 +120,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
  * @nr_entries:	Number of frames in the stack
  * @alloc_flags:	Allocation GFP flags
  *
- * Context: Contexts where allocations via alloc_pages() are allowed.
- *          See __stack_depot_save() for more details.
+ * Does not increment the refcount on the saved stack trace; see
+ * stack_depot_save_flags() for more details.
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed;
+ *          see stack_depot_save_flags() for more details.
  *
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
@@ -142,6 +163,18 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 			int spaces);
 
 /**
+ * stack_depot_put - Drop a reference to a stack trace from stack depot
+ *
+ * @handle: Stack depot handle returned from stack_depot_save()
+ *
+ * The stack trace is evicted from stack depot once all references to it have
+ * been dropped (once the number of stack_depot_put() calls matches the
+ * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for
+ * this stack trace).
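Putting the two flags together: a caller that wants a refcounted record saves with STACK_DEPOT_FLAG_GET and later drops the reference with stack_depot_put(). A minimal sketch (stack_trace_save() is from <linux/stacktrace.h>; struct foo and its handle field are hypothetical):

struct foo { depot_stack_handle_t alloc_handle; };	/* hypothetical */

static void foo_track_object(struct foo *obj)
{
	unsigned long entries[64];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	obj->alloc_handle = stack_depot_save_flags(entries, nr, GFP_NOWAIT,
						   STACK_DEPOT_FLAG_CAN_ALLOC |
						   STACK_DEPOT_FLAG_GET);
}

static void foo_untrack_object(struct foo *obj)
{
	if (obj->alloc_handle)
		stack_depot_put(obj->alloc_handle); /* drop the _GET reference */
}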
+ */ +void stack_depot_put(depot_stack_handle_t handle); + +/** * stack_depot_set_extra_bits - Set extra bits in a stack depot handle * * @handle: Stack depot handle returned from stack_depot_save() diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index db30a159f9d5..f22bf915dcf6 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -20,7 +20,8 @@ #ifdef CONFIG_SUNRPC_BACKCHANNEL struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid); void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied); -void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task); +void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, + const struct rpc_timeout *to); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index e9d4377d03c6..5e9d1469c6fa 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -252,7 +252,6 @@ void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *, const char *rpc_proc_name(const struct rpc_task *task); -void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 8ada7dc802d3..2d61987b3545 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -38,6 +38,17 @@ struct rpc_wait { }; /* + * This describes a timeout strategy + */ +struct rpc_timeout { + unsigned long to_initval, /* initial timeout */ + to_maxval, /* max timeout */ + to_increment; /* if !exponential */ + unsigned int to_retries; /* max # of retries */ + unsigned char to_exponential; +}; + +/* * This is the RPC task struct */ struct rpc_task { @@ -205,7 +216,8 @@ struct rpc_wait_queue { */ struct rpc_task *rpc_new_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_task(const struct rpc_task_setup *); -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, + struct rpc_timeout *timeout); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b10f987509cc..67cf1c9efd80 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; - struct kref sv_refcnt; unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -97,31 +96,13 @@ struct svc_serv { #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; -/** - * svc_get() - increment reference count on a SUNRPC serv - * @serv: the svc_serv to have count incremented - * - * Returns: the svc_serv that was passed in. 
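With the kref gone, an svc_serv has a single owner and its lifetime collapses to a create/destroy pair; as the hunk below shows, svc_destroy() now takes a double pointer so it can clear the caller's reference. A sketch of the new shape (the program, buffer size and thread function are hypothetical):

static int foo_start_service(void)
{
	struct svc_serv *serv;

	serv = svc_create_pooled(&foo_program, 64 * 1024, foo_threadfn);
	if (!serv)
		return -ENOMEM;
	/* ... svc_set_num_threads(), run the service ... */
	svc_destroy(&serv);	/* tears down the serv and NULLs the pointer */
	return 0;
}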
- */
-static inline struct svc_serv *svc_get(struct svc_serv *serv)
-{
-	kref_get(&serv->sv_refcnt);
-	return serv;
-}
-
-void svc_destroy(struct kref *);
+/* This is used by pool_stats to find and lock an svc */
+struct svc_info {
+	struct svc_serv *serv;
+	struct mutex *mutex;
+};
 
-/**
- * svc_put - decrement reference count on a SUNRPC serv
- * @serv:  the svc_serv to have count decremented
- *
- * When the reference count reaches zero, svc_destroy()
- * is called to clean up and free the serv.
- */
-static inline void svc_put(struct svc_serv *serv)
-{
-	kref_put(&serv->sv_refcnt, svc_destroy);
-}
+void svc_destroy(struct svc_serv **svcp);
 
 /*
  * Maximum payload size supported by a kernel RPC server.
@@ -250,6 +231,8 @@ struct svc_rqst {
 	struct net		*rq_bc_net;	/* pointer to backchannel's
 						 * net namespace
 						 */
+	unsigned long	bc_to_initval;
+	unsigned int	bc_to_retries;
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
 	unsigned int		rq_status_counter; /* RPC processing counter */
 };
@@ -260,8 +243,6 @@ enum {
 	RQ_LOCAL,		/* local request */
 	RQ_USEDEFERRAL,		/* use deferral */
 	RQ_DROPME,		/* drop current reply */
-	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
-				 * encrypting page cache pages */
 	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_DATA,		/* request has data */
 };
@@ -433,7 +414,7 @@ void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
-int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
+int		   svc_pool_stats_open(struct svc_info *si, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
 void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index a5ee0af2a310..e7595ae62fe2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -65,6 +65,7 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
+extern struct workqueue_struct *svcrdma_wq;
 
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
@@ -97,6 +98,7 @@ struct svcxprt_rdma {
 	u32		     sc_pending_recvs;
 	u32		     sc_recv_batch;
 	struct list_head     sc_rq_dto_q;
+	struct list_head     sc_read_complete_q;
 	spinlock_t	     sc_rq_dto_lock;
 	struct ib_qp	     *sc_qp;
 	struct ib_cq	     *sc_rq_cq;
@@ -115,6 +117,13 @@ struct svcxprt_rdma {
 /* sc_flags */
 #define RDMAXPRT_CONN_PENDING	3
 
+static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	return container_of(xprt, struct svcxprt_rdma, sc_xprt);
+}
+
 /*
  * Default connection parameters
  */
@@ -126,6 +135,43 @@ enum {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/**
+ * svc_rdma_recv_cid_init - Initialize a Receive Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+					  struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+/**
+ * svc_rdma_send_cid_init - Initialize a Send Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_send_cid_init(struct
svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + +/* + * A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. + */ +struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; + struct ib_cqe cc_cqe; + struct list_head cc_rwctxts; + ktime_t cc_posttime; + int cc_sqecount; +}; + struct svc_rdma_recv_ctxt { struct llist_node rc_node; struct list_head rc_list; @@ -136,22 +182,33 @@ struct svc_rdma_recv_ctxt { void *rc_recv_buf; struct xdr_stream rc_stream; u32 rc_byte_len; - unsigned int rc_page_count; u32 rc_inv_rkey; __be32 rc_msgtype; + /* State for pulling a Read chunk */ + unsigned int rc_pageoff; + unsigned int rc_curpage; + unsigned int rc_readbytes; + struct xdr_buf rc_saved_arg; + struct svc_rdma_chunk_ctxt rc_cc; + struct svc_rdma_pcl rc_call_pcl; struct svc_rdma_pcl rc_read_pcl; struct svc_rdma_chunk *rc_cur_result_payload; struct svc_rdma_pcl rc_write_pcl; struct svc_rdma_pcl rc_reply_pcl; + + unsigned int rc_page_count; + struct page *rc_pages[RPCSVC_MAXPAGES]; }; struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; + struct work_struct sc_work; + struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; @@ -180,6 +237,11 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); +extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); @@ -200,7 +262,8 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr); extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 6f90203edbf8..61c455f1e1f5 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -131,8 +131,11 @@ enum svc_auth_status { * This call releases a domain. * * set_client() - * Givens a pending request (struct svc_rqst), finds and assigns + * Given a pending request (struct svc_rqst), finds and assigns * an appropriate 'auth_domain' as the client. + * + * pseudoflavor() + * Returns RPC_AUTH pseudoflavor in use by @rqstp. 
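The new pseudoflavor() hook backs the svc_auth_flavor() helper declared below: it lets an auth implementation (notably GSS) report a pseudoflavor rather than the raw cr_flavor. A plausible shape for that helper (a sketch only; the real body lives in net/sunrpc/svcauth.c):

rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp)
{
	struct auth_ops *aops = rqstp->rq_authop;

	/* fall back to the wire flavor when the hook is not implemented */
	if (!aops->pseudoflavor)
		return rqstp->rq_cred.cr_flavor;
	return aops->pseudoflavor(rqstp);
}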
*/ struct auth_ops { char * name; @@ -143,11 +146,13 @@ struct auth_ops { int (*release)(struct svc_rqst *rqstp); void (*domain_release)(struct auth_domain *dom); enum svc_auth_status (*set_client)(struct svc_rqst *rqstp); + rpc_authflavor_t (*pseudoflavor)(struct svc_rqst *rqstp); }; struct svc_xprt; extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); +extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f85d3a0daca2..464f6a9492ab 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -30,17 +30,6 @@ #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) -/* - * This describes a timeout strategy - */ -struct rpc_timeout { - unsigned long to_initval, /* initial timeout */ - to_maxval, /* max timeout */ - to_increment; /* if !exponential */ - unsigned int to_retries; /* max # of retries */ - unsigned char to_exponential; -}; - enum rpc_display_format_t { RPC_DISPLAY_ADDR = 0, RPC_DISPLAY_PORT, diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h index 5c4ae1a26183..d8dbef6b7fc2 100644 --- a/include/linux/surface_aggregator/serial_hub.h +++ b/include/linux/surface_aggregator/serial_hub.h @@ -12,7 +12,7 @@ #ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H #define _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H -#include <linux/crc-ccitt.h> +#include <linux/crc-itu-t.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/list.h> @@ -188,7 +188,7 @@ static_assert(sizeof(struct ssh_command) == 8); */ static inline u16 ssh_crc(const u8 *buf, size_t len) { - return crc_ccitt_false(0xffff, buf, len); + return crc_itu_t(0xffff, buf, len); } /* diff --git a/include/linux/swap.h b/include/linux/swap.h index f6dd6575b905..4db00ddad261 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -extern void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma); - /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, @@ -490,13 +487,12 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -extern struct swap_info_struct *page_swap_info(struct page *); -extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); +struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); -sector_t swap_page_sector(struct page *page); +sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h index 641ca8880995..3858a6ffdd5c 100644 --- 
a/include/linux/syscall_user_dispatch.h +++ b/include/linux/syscall_user_dispatch.h @@ -6,16 +6,10 @@ #define _SYSCALL_USER_DISPATCH_H #include <linux/thread_info.h> +#include <linux/syscall_user_dispatch_types.h> #ifdef CONFIG_GENERIC_ENTRY -struct syscall_user_dispatch { - char __user *selector; - unsigned long offset; - unsigned long len; - bool on_dispatch; -}; - int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, unsigned long len, char __user *selector); @@ -29,7 +23,6 @@ int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long siz void __user *data); #else -struct syscall_user_dispatch {}; static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, unsigned long len, char __user *selector) diff --git a/include/linux/syscall_user_dispatch_types.h b/include/linux/syscall_user_dispatch_types.h new file mode 100644 index 000000000000..3be36b06c7d7 --- /dev/null +++ b/include/linux/syscall_user_dispatch_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SYSCALL_USER_DISPATCH_TYPES_H +#define _SYSCALL_USER_DISPATCH_TYPES_H + +#include <linux/types.h> + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +#else + +struct syscall_user_dispatch {}; + +#endif + +#endif /* _SYSCALL_USER_DISPATCH_TYPES_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e92..5c0dbef55792 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -71,9 +71,12 @@ struct clone_args; struct open_how; struct mount_attr; struct landlock_ruleset_attr; +struct lsm_ctx; enum landlock_rule_type; struct cachestat_range; struct cachestat; +struct statmount; +struct mnt_id_req; #include <linux/types.h> #include <linux/aio_abi.h> @@ -407,6 +410,12 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf); asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf); +asmlinkage long sys_statmount(const struct mnt_id_req __user *req, + struct statmount __user *buf, size_t bufsize, + unsigned int flags); +asmlinkage long sys_listmount(const struct mnt_id_req __user *req, + u64 __user *buf, size_t bufsize, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); #if BITS_PER_LONG == 32 @@ -949,6 +958,11 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t *size, __u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, __u32 flags); +asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags); /* * Architecture-specific system calls diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 61b40ea81f4d..ee7d33b89e9e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -210,11 +210,6 @@ struct ctl_table_root { int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; -/* struct ctl_path describes where in the hierarchy a table is added */ -struct ctl_path { - const char *procname; -}; - #define 
register_sysctl(path, table) \ register_sysctl_sz(path, table, ARRAY_SIZE(table)) @@ -255,8 +250,6 @@ extern int unaligned_enabled; extern int unaligned_dump_stack; extern int no_unaligned_warning; -#define SYSCTL_PERM_EMPTY_DIR (1 << 0) - #else /* CONFIG_SYSCTL */ static inline void register_sysctl_init(const char *path, struct ctl_table *table) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b646b574b060..89b290d8c8dc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -190,23 +190,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req) #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/tcp_sock.rst. + * Please update the document when adding new fields. + */ + /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; - u16 tcp_header_len; /* Bytes of tcp header to send */ + + /* TX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_tx); + /* timestamp of last sent data packet (for restart window) */ + u32 max_window; /* Maximal window ever seen from peer */ + u32 rcv_ssthresh; /* Current window clamp */ + u32 reordering; /* Packet reordering metric. */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ + /* from STCP, retrans queue hinting */ + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __cacheline_group_end(tcp_sock_read_tx); + + /* TXRX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_txrx); + u32 tsoffset; /* timestamp offset */ + u32 snd_wnd; /* The window we expect to receive */ + u32 mss_cache; /* Cached effective mss, not including SACKS */ + u32 snd_cwnd; /* Sending congestion window */ + u32 prr_out; /* Total number of pkts sent during Recovery. */ + u32 lost_out; /* Lost packets */ + u32 sacked_out; /* SACK'd packets */ + u16 tcp_header_len; /* Bytes of tcp header to send */ + u8 chrono_type : 2, /* current chronograph type */ + repair : 1, + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_rx); + u32 copied_seq; /* Head of yet unread data */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ + u32 snd_wl1; /* Sequence for window update */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 retrans_out; /* Retransmitted packets out */ + u16 advmss; /* Advertised MSS */ + u16 urg_data; /* Saved octet of OOB data and control flags */ + u32 lost; /* Total data packets lost incl. rexmits */ + struct minmax rtt_min; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ + __cacheline_group_end(tcp_sock_read_rx); + /* TX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; + u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. 
+ */ + u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 pushed_seq; /* Last pushed seq, required to talk to windows */ + u32 lsndtime; + u32 mdev_us; /* medium deviation */ + u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ + u64 tcp_mstamp; /* most recent packet received/sent */ + u32 rtt_seq; /* sequence number to update rttvar */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ + struct sk_buff *highest_sack; /* skb just after the highest + * skb with SACKed bit set + * (validity guaranteed only if + * sacked_out > 0) + */ + u8 ecn_flags; /* ECN status bits. */ + __cacheline_group_end(tcp_sock_write_tx); + + /* TXRX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; - + u32 rcv_nxt; /* What we want to receive next */ + u32 snd_nxt; /* Next sequence we send */ + u32 snd_una; /* First byte we want an ack for */ + u32 window_clamp; /* Maximal window to advertise */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 packets_out; /* Packets which are "in flight" */ + u32 snd_up; /* Urgent pointer */ + u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 app_limited; /* limited until "delivered" reaches this val */ + u32 rcv_wnd; /* Current receiver window */ /* - * RFC793 variables by their proper names. This means you can - * read the code and the spec side by side (and laugh ...) - * See RFC793 and RFC1122. The RFC writes these in capitals. + * Options received (usually on last packet, some only on SYN packets). */ - u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_rx); + u64 bytes_received; + /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ @@ -216,45 +314,44 @@ struct tcp_sock { u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ - u32 rcv_nxt; /* What we want to receive next */ - u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ - u32 snd_nxt; /* Next sequence we send */ - u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut - * The total number of segments sent. - */ - u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut - * total number of data segments sent. - */ - u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut - * total number of data bytes sent. 
- */ + u32 max_packets_out; /* max packets_out in last window */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ + u32 rcv_rtt_last_tsecr; + u64 first_tx_mstamp; /* start of window send phase */ + u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ + struct { + u32 rtt_us; + u32 seq; + u64 time; + } rcv_rtt_est; +/* Receiver queue space */ + struct { + u32 space; + u32 seq; + u64 time; + } rcvq_space; + __cacheline_group_end(tcp_sock_write_rx); + /* End of Hot Path */ + +/* + * RFC793 variables by their proper names. This means you can + * read the code and the spec side by side (and laugh ...) + * See RFC793 and RFC1122. The RFC writes these in capitals. + */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ - u32 snd_una; /* First byte we want an ack for */ - u32 snd_sml; /* Last byte of the most recently transmitted small packet */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ - u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; - - u32 tsoffset; /* timestamp offset */ - struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ - - u32 snd_wl1; /* Sequence for window update */ - u32 snd_wnd; /* The window we expect to receive */ - u32 max_window; /* Maximal window ever seen from peer */ - u32 mss_cache; /* Cached effective mss, not including SACKS */ - u32 window_clamp; /* Maximal window to advertise */ - u32 rcv_ssthresh; /* Current window clamp */ u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { @@ -268,24 +365,16 @@ struct tcp_sock { dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; - u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ tcp_usec_ts:1, /* TSval values in usec */ unused:4; - u32 chrono_start; /* Start time in jiffies of a TCP chrono */ - u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ - u8 chrono_type:2, /* current chronograph type */ - rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ - is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - fastopen_client_fail:2; /* reason why fastopen failed */ - u8 nonagle : 4,/* Disable Nagle algorithm? */ - thin_lto : 1,/* Use linear timeouts for thin streams */ - recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ - repair : 1, + fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ @@ -293,45 +382,19 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. 
option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ - u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ - u64 tcp_wstamp_ns; /* departure time for next sent data packet */ - u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ - u64 tcp_mstamp; /* most recent packet received/sent */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rtt_seq; /* sequence number to update rttvar */ - struct minmax rtt_min; - u32 packets_out; /* Packets which are "in flight" */ - u32 retrans_out; /* Retransmitted packets out */ - u32 max_packets_out; /* max packets_out in last window */ - u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ - - u16 urg_data; /* Saved octet of OOB data and control flags */ - u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ - u32 snd_up; /* Urgent pointer */ - -/* - * Options received (usually on last packet, some only on SYN packets). - */ - struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ - u32 snd_ssthresh; /* Slow start size threshold */ - u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; @@ -339,32 +402,10 @@ struct tcp_sock { u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ - u32 prr_out; /* Total number of pkts sent during Recovery. */ - u32 delivered; /* Total data packets delivered incl. rexmits */ - u32 delivered_ce; /* Like the above but only ECE marked packets */ - u32 lost; /* Total data packets lost incl. rexmits */ - u32 app_limited; /* limited until "delivered" reaches this val */ - u64 first_tx_mstamp; /* start of window send phase */ - u64 delivered_mstamp; /* time we reached "delivered" */ - u32 rate_delivered; /* saved rate sample: packets delivered */ - u32 rate_interval_us; /* saved rate sample: time elapsed */ - - u32 rcv_wnd; /* Current receiver window */ - u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ - u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ - u32 pushed_seq; /* Last pushed seq, required to talk to windows */ - u32 lost_out; /* Lost packets */ - u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; - /* from STCP, retrans queue hinting */ - struct sk_buff* lost_skb_hint; - struct sk_buff *retransmit_skb_hint; - - /* OOO segments go in this rbtree. Socket lock must be held. 
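The __cacheline_group_begin()/__cacheline_group_end() markers above partition tcp_sock by access pattern (TX vs RX, read-mostly vs read-write) so fields touched together share cache lines. The resulting layout is then locked in with assertions; a sketch of that pattern (macro names from <linux/cache.h>; the size constant here is illustrative, not the real group size):

/* called once at init time; a build/boot-time layout check */
static void tcp_struct_check(void)
{
	/* each field must land inside the group it was assigned to ... */
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
	/* ... and the whole group must stay within its cache-line budget */
	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
}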
*/ - struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ @@ -373,12 +414,6 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - struct sk_buff *highest_sack; /* skb just after the highest - * skb with SACKed bit set - * (validity guaranteed only if - * sacked_out > 0) - */ - int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ @@ -429,21 +464,6 @@ struct tcp_sock { u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ -/* Receiver side RTT estimation */ - u32 rcv_rtt_last_tsecr; - struct { - u32 rtt_us; - u32 seq; - u64 time; - } rcv_rtt_est; - -/* Receiver queue space */ - struct { - u32 space; - u32 seq; - u64 time; - } rcvq_space; - /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 17eb1c5205d3..911ddf92dcee 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -84,6 +84,7 @@ struct tee_param { * @release: release this open file * @open_session: open a new session * @close_session: close a session + * @system_session: declare session as a system session * @invoke_func: invoke a trusted function * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command @@ -100,6 +101,7 @@ struct tee_driver_ops { struct tee_ioctl_open_session_arg *arg, struct tee_param *param); int (*close_session)(struct tee_context *ctx, u32 session); + int (*system_session)(struct tee_context *ctx, u32 session); int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); @@ -430,6 +432,20 @@ int tee_client_open_session(struct tee_context *ctx, int tee_client_close_session(struct tee_context *ctx, u32 session); /** + * tee_client_system_session() - Declare session as a system session + * @ctx: TEE Context + * @session: Session id + * + * This function requests TEE to provision an entry context ready to use for + * that session only. The provisioned entry context is used for command + * invocation and session closure, not for command cancelling requests. + * TEE releases the provisioned context upon session closure. + * + * Return < 0 on error else 0 if an entry context has been provisioned. 
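Typical use, following the kernel-doc above: open the session first, then promote it. A sketch (foo_ta_uuid is a placeholder byte array; error handling abbreviated):

static int foo_open_system_session(struct tee_context *ctx, u32 *session)
{
	struct tee_ioctl_open_session_arg arg = {};
	int rc;

	memcpy(arg.uuid, foo_ta_uuid, TEE_IOCTL_UUID_LEN);	/* placeholder UUID */
	rc = tee_client_open_session(ctx, &arg, NULL);
	if (rc || arg.ret)
		return rc ? rc : -EINVAL;

	*session = arg.session;
	/* ask the TEE for a dedicated entry context for this session */
	return tee_client_system_session(ctx, *session);
}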
+ */
+int tee_client_system_session(struct tee_context *ctx, u32 session);
+
+/**
  * tee_client_invoke_func() - Invoke a function in a Trusted Application
  * @ctx:	TEE Context
  * @arg:	Invoke arguments, see description of
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index cee814d5d1ac..bf84595a4e86 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -51,18 +51,23 @@ enum thermal_notify_event {
 	THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
 	THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
+	THERMAL_TZ_BIND_CDEV, /* Cooling dev is bound to the thermal zone */
+	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbound from the thermal zone */
+	THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
 };
 
 /**
  * struct thermal_trip - representation of a point in temperature domain
  * @temperature: temperature value in milliCelsius
  * @hysteresis: relative hysteresis in milliCelsius
+ * @threshold: trip crossing notification threshold, in milliCelsius
  * @type: trip point type
  * @priv: pointer to driver data associated with this trip
  */
 struct thermal_trip {
 	int temperature;
 	int hysteresis;
+	int threshold;
 	enum thermal_trip_type type;
 	void *priv;
 };
@@ -115,6 +120,7 @@ struct thermal_cooling_device {
 * @id:		unique id number for each thermal zone
 * @type:	the thermal zone device type
 * @device:	&struct device for this thermal zone
+ * @removal:	removal completion
 * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
 * @trip_type_attrs:	attributes for trip points for sysfs: trip type
 * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
@@ -149,11 +155,13 @@ struct thermal_cooling_device {
 * @node:	node in thermal_tz_list (in thermal_core.c)
 * @poll_queue:	delayed work for polling
 * @notify_event: Last notification event
+ * @suspended: thermal zone suspend indicator
 */
 struct thermal_zone_device {
 	int id;
 	char type[THERMAL_NAME_LENGTH];
 	struct device device;
+	struct completion removal;
 	struct attribute_group trips_attribute_group;
 	struct thermal_attr *trip_temp_attrs;
 	struct thermal_attr *trip_type_attrs;
@@ -181,6 +189,7 @@ struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+	bool suspended;
 };
 
 /**
@@ -193,6 +202,8 @@ struct thermal_zone_device {
 *	thermal zone.
 * @throttle:	callback called for every trip point even if temperature is
 *	below the trip point temperature
+ * @update_tz:	callback called when thermal zone internals have changed, e.g.
+ * thermal cooling instance was added/removed * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { @@ -201,6 +212,8 @@ struct thermal_governor { void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, const struct thermal_trip *trip); + void (*update_tz)(struct thermal_zone_device *tz, + enum thermal_notify_event reason); struct list_head governor_list; }; @@ -280,10 +293,6 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); - -int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id, - const struct thermal_trip *trip); - int for_each_thermal_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); @@ -291,16 +300,11 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); int thermal_zone_get_num_trips(struct thermal_zone_device *tz); +void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, + struct thermal_trip *trip, int temp); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); -#ifdef CONFIG_THERMAL_ACPI -int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); -int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); -int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); -int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); -#endif - #ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 03d9c5ac01d1..876e31b4461d 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -7,10 +7,13 @@ #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> +#include <linux/time64.h> struct user_namespace; extern struct user_namespace init_user_ns; +struct vm_area_struct; + struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fe1e467ba046..7c43e98cf211 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/clocksource_ids.h> +#include <linux/ktime.h> /* Included from linux/ktime.h */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 26a545bb0153..f18a2f1eb79e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -7,21 +7,7 @@ #include <linux/stddef.h> #include <linux/debugobjects.h> #include <linux/stringify.h> - -struct timer_list { - /* - * All fields that change during normal runtime grouped to the - * same cacheline - */ - struct hlist_node entry; - unsigned long expires; - void (*function)(struct timer_list *); - u32 flags; - -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; +#include <linux/timer_types.h> #ifdef CONFIG_LOCKDEP /* diff --git a/include/linux/timer_types.h b/include/linux/timer_types.h new file mode 100644 index 000000000000..fae5a388f914 --- /dev/null +++ b/include/linux/timer_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMER_TYPES_H +#define _LINUX_TIMER_TYPES_H + +#include <linux/lockdep_types.h> +#include <linux/types.h> + +struct timer_list { + /* + * All fields 
that change during normal runtime grouped to the + * same cacheline + */ + struct hlist_node entry; + unsigned long expires; + void (*function)(struct timer_list *); + u32 flags; + +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +#endif /* _LINUX_TIMER_TYPES_H */ diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index adc80e29168e..62973f7d4610 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -3,18 +3,7 @@ #define _LINUX_TIMERQUEUE_H #include <linux/rbtree.h> -#include <linux/ktime.h> - - -struct timerqueue_node { - struct rb_node node; - ktime_t expires; -}; - -struct timerqueue_head { - struct rb_root_cached rb_root; -}; - +#include <linux/timerqueue_types.h> extern bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h new file mode 100644 index 000000000000..dc298d0923e3 --- /dev/null +++ b/include/linux/timerqueue_types.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMERQUEUE_TYPES_H +#define _LINUX_TIMERQUEUE_TYPES_H + +#include <linux/rbtree_types.h> +#include <linux/types.h> + +struct timerqueue_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerqueue_head { + struct rb_root_cached rb_root; +}; + +#endif /* _LINUX_TIMERQUEUE_TYPES_H */ diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 1c3948a1d6ad..3c13240077b8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a); struct tnum tnum_subreg(struct tnum a); /* Returns the tnum with the lower 32-bit subreg cleared */ struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower + * 32-bit subreg in *subreg* + */ +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg); /* Returns the tnum with the lower 32-bit subreg set to value */ struct tnum tnum_const_subreg(struct tnum a, u32 value); /* Returns true if 32-bit subreg @a is a known constant*/ diff --git a/include/linux/torture.h b/include/linux/torture.h index c98d0c83d117..1541454da03e 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -21,6 +21,7 @@ #include <linux/debugobjects.h> #include <linux/bug.h> #include <linux/compiler.h> +#include <linux/hrtimer.h> /* Definitions for a non-string torture-test module parameter. */ #define torture_param(type, name, init, msg) \ diff --git a/include/linux/types.h b/include/linux/types.h index 253168bb3fe1..2bc8766ba20c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -120,6 +120,9 @@ typedef s64 int64_t; #define aligned_be64 __aligned_be64 #define aligned_le64 __aligned_le64 +/* Nanosecond scalar representation for kernel time values */ +typedef s64 ktime_t; + /** * The type used for indexing onto a disc or disc partition. * diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb..f85ec5613721 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -12,20 +12,12 @@ * to detect when we overlook these differences. 
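The kuid_t/kgid_t wrappers now come from <linux/uidgid_types.h>, but the conversion helpers keep their semantics: make_kuid() maps a namespace-relative ID to a kernel-internal ID, from_kuid() maps it back into some namespace's view. A reminder of the round trip (the uid value and function name are illustrative only):

static int foo_check_uid(void)
{
	kuid_t kuid = make_kuid(current_user_ns(), 1000); /* ns view -> kernel kuid */

	if (!uid_valid(kuid))
		return -EOVERFLOW;	/* uid 1000 has no mapping in this ns */
	return from_kuid(&init_user_ns, kuid);	/* kernel kuid -> init-ns view */
}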
* */ -#include <linux/types.h> +#include <linux/uidgid_types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; - -typedef struct { - uid_t val; -} kuid_t; - - -typedef struct { - gid_t val; -} kgid_t; +struct uid_gid_map; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } @@ -138,6 +130,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +181,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ diff --git a/include/linux/uidgid_types.h b/include/linux/uidgid_types.h new file mode 100644 index 000000000000..b35ac4955a33 --- /dev/null +++ b/include/linux/uidgid_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UIDGID_TYPES_H +#define _LINUX_UIDGID_TYPES_H + +#include <linux/types.h> + +typedef struct { + uid_t val; +} kuid_t; + +typedef struct { + gid_t val; +} kgid_t; + +#endif /* _LINUX_UIDGID_TYPES_H */ diff --git a/include/linux/uio.h b/include/linux/uio.h index b6214cbf2a43..bea9c89922d9 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); -int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2dc19f40d05..e4056547fbe6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm, extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); +/* move_pages */ +void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); +void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); +ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, + unsigned long dst_start, unsigned long src_start, + unsigned long len, __u64 flags); +int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr); + /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..c82089dee0c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int 
virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa1240040..747943bc8cc2 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, - PGDEMOTE_KSWAPD, - PGDEMOTE_DIRECT, - PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, @@ -145,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fed855bae6d8..343906a98d6e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -void __mod_lruvec_page_state(struct page *page, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void mod_lruvec_page_state(struct page *page, +static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_page_state(page, idx, val); + __lruvec_stat_mod_folio(folio, idx, val); local_irq_restore(flags); } +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + lruvec_stat_mod_folio(page_folio(page), idx, val); +} + #else static inline void __mod_lruvec_state(struct lruvec *lruvec, @@ -583,37 +589,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(page_pgdat(page), idx, val); -} - -static inline void mod_lruvec_page_state(struct page *page, +static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - mod_node_page_state(page_pgdat(page), idx, val); + __mod_node_page_state(folio_pgdat(folio), idx, val); } -#endif /* CONFIG_MEMCG */ - -static inline void __inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) { - __mod_lruvec_page_state(page, idx, 1); + mod_node_page_state(folio_pgdat(folio), idx, val); } -static inline void __dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) { - __mod_lruvec_page_state(page, idx, -1); + mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - __mod_lruvec_page_state(&folio->page, idx, val); -} +#endif /* CONFIG_MEMCG */ static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) @@ -627,24 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - 
mod_lruvec_page_state(page, idx, -1); -} - -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { diff --git a/include/linux/wait.h b/include/linux/wait.h index 3473b663176f..8aa3372f21a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -9,7 +9,6 @@ #include <linux/spinlock.h> #include <asm/current.h> -#include <uapi/linux/wait.h> typedef struct wait_queue_entry wait_queue_entry_t; diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 763bd382cf2d..686291b87852 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -11,7 +11,6 @@ #include <linux/device.h> #include <linux/acpi.h> #include <linux/mod_devicetable.h> -#include <uapi/linux/wmi.h> /** * struct wmi_device - WMI device structure @@ -22,11 +21,17 @@ */ struct wmi_device { struct device dev; - - /* private: used by the WMI driver core */ bool setable; }; +/** + * to_wmi_device() - Helper macro to cast a device to a wmi_device + * @device: device struct + * + * Cast a struct device to a struct wmi_device. + */ +#define to_wmi_device(device) container_of(device, struct wmi_device, dev) + extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, const struct acpi_buffer *in, @@ -35,9 +40,9 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); -u8 wmidev_instance_count(struct wmi_device *wdev); +acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); -extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); +u8 wmidev_instance_count(struct wmi_device *wdev); /** * struct wmi_driver - WMI driver structure @@ -47,11 +52,8 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); * @probe: Callback for device binding * @remove: Callback for device unbinding * @notify: Callback for receiving WMI events - * @filter_callback: Callback for filtering device IOCTLs * * This represents WMI drivers which handle WMI devices. - * @filter_callback is only necessary for drivers which - * want to set up a WMI IOCTL interface. 
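With the IOCTL plumbing removed, a WMI driver now only fills in probe/remove/notify. A minimal skeleton (the driver name and GUID are placeholders):

static const struct wmi_device_id foo_wmi_id_table[] = {
	{ .guid_string = "ABBC0F6E-8EA1-11D1-00A0-C90629100000" }, /* placeholder */
	{ }
};
MODULE_DEVICE_TABLE(wmi, foo_wmi_id_table);

static void foo_wmi_notify(struct wmi_device *wdev, union acpi_object *data)
{
	dev_info(&wdev->dev, "WMI event, type %u\n", data ? data->type : 0);
}

static struct wmi_driver foo_wmi_driver = {
	.driver = { .name = "foo-wmi" },
	.id_table = foo_wmi_id_table,
	.notify = foo_wmi_notify,
};
module_wmi_driver(foo_wmi_driver);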
*/ struct wmi_driver { struct device_driver driver; @@ -61,8 +63,6 @@ struct wmi_driver { int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); - long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, - struct wmi_ioctl_buffer *arg); }; extern int __must_check __wmi_driver_register(struct wmi_driver *driver, diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..2cc0a9606175 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -14,12 +14,7 @@ #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> - -struct workqueue_struct; - -struct work_struct; -typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(struct timer_list *t); +#include <linux/workqueue_types.h> /* * The first word is the work queue pointer and the flags rolled into @@ -95,15 +90,6 @@ enum { #define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) -struct work_struct { - atomic_long_t data; - struct list_head entry; - work_func_t func; -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; - #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) @@ -491,7 +477,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/include/linux/workqueue_types.h b/include/linux/workqueue_types.h new file mode 100644 index 000000000000..4c38824f3ab4 --- /dev/null +++ b/include/linux/workqueue_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_WORKQUEUE_TYPES_H +#define _LINUX_WORKQUEUE_TYPES_H + +#include <linux/atomic.h> +#include <linux/lockdep_types.h> +#include <linux/timer_types.h> +#include <linux/types.h> + +struct workqueue_struct; + +struct work_struct; +typedef void (*work_func_t)(struct work_struct *work); +void delayed_work_timer_fn(struct timer_list *t); + +struct work_struct { + atomic_long_t data; + struct list_head entry; + work_func_t func; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +#endif /* _LINUX_WORKQUEUE_TYPES_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c..6d0a14f7019d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. 
 */
 static inline void wait_on_inode(struct inode *inode)
 {
-	might_sleep();
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..0b709f5bc65f 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -5,19 +5,41 @@
 #include <linux/types.h>
 #include <linux/mm_types.h>
 
+struct lruvec;
+
 extern u64 zswap_pool_total_size;
 extern atomic_t zswap_stored_pages;
 
 #ifdef CONFIG_ZSWAP
 
+struct zswap_lruvec_state {
+	/*
+	 * Number of pages in zswap that should be protected from the shrinker.
+	 * This number is an estimate of the following counts:
+	 *
+	 * a) Recent page faults.
+	 * b) Recent insertion to the zswap LRU. This includes new zswap stores,
+	 *    as well as recent zswap LRU rotations.
+	 *
+	 * These pages are likely to be warm, and might incur IO if they are
+	 * written to swap.
+	 */
+	atomic_long_t nr_zswap_protected;
+};
+
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
-
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
+void zswap_lruvec_state_init(struct lruvec *lruvec);
+void zswap_folio_swapin(struct folio *folio);
+bool is_zswap_enabled(void);
 #else
 
+struct zswap_lruvec_state {};
+
 static inline bool zswap_store(struct folio *folio)
 {
 	return false;
@@ -31,6 +53,14 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
+static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
+static inline void zswap_folio_swapin(struct folio *folio) {}
+
+static inline bool is_zswap_enabled(void)
+{
+	return false;
+}
 #endif
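The !CONFIG_ZSWAP stubs above let the swap I/O path call into zswap unconditionally. Roughly how the read side consumes zswap_load() (a sketch modeled on mm/page_io.c, with the function name made up and the bio details elided):

static void foo_swap_read_folio(struct folio *folio)
{
	if (zswap_load(folio)) {
		/* decompressed straight out of the zswap pool; no bio needed */
		folio_mark_uptodate(folio);
		folio_unlock(folio);
		return;
	}
	/* ... otherwise submit a real swap-in bio ... */
}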