From 7b4ff1adb57ad96d8f12a05d8c661a3d8c4d2be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 11 May 2017 10:17:45 -0300 Subject: mutex, futex: adjust kernel-doc markups to generate ReST There are a few issues on some kernel-doc markups that was causing troubles with kernel-doc output on ReST format: ./kernel/futex.c:492: WARNING: Inline emphasis start-string without end-string. ./kernel/futex.c:1264: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:1721: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2338: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2426: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2899: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2972: WARNING: Block quote ends without a blank line; unexpected unindent. Fix them. No functional changes. Acked-by: Darren Hart (VMware) Signed-off-by: Mauro Carvalho Chehab --- kernel/futex.c | 40 ++++++++++++++++++++-------------------- kernel/locking/mutex.c | 6 +++--- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..b8ae87d227da 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key) * * Return: a negative error code or 0 * - * The key words are stored in *key on success. + * The key words are stored in @key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). @@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: - * 0 - ready to wait; - * 1 - acquired the lock; - * <0 - error + * - 0 - ready to wait; + * - 1 - acquired the lock; + * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. */ @@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * hb1 and hb2 must be held by the caller. * * Return: - * 0 - failed to acquire the lock atomically; - * >0 - acquired the lock, return value is vpid of the top_waiter - * <0 - error + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, @@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * uaddr2 atomically on behalf of the top waiter. * * Return: - * >=0 - on success, the number of tasks requeued or woken; - * <0 - on error + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_requeue, @@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * be paired with exactly one earlier call to queue_me(). 
* * Return: - * 1 - if the futex_q was still queued (and we removed unqueued it); - * 0 - if the futex_q was already removed by the waking thread + * - 1 - if the futex_q was still queued (and we removed unqueued it); + * - 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) { @@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart); * acquire the lock. Must be called with the hb lock held. * * Return: - * 1 - success, lock taken; - * 0 - success, lock not taken; - * <0 - on error (-EFAULT) + * - 1 - success, lock taken; + * - 0 - success, lock not taken; + * - <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { @@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * with no q.key reference on failure. * * Return: - * 0 - uaddr contains val and hb has been locked; - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - 0 - uaddr contains val and hb has been locked; + * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2895,8 +2895,8 @@ pi_faulted: * called with the hb lock held. * * Return: - * 0 = no early wakeup detected; - * <0 = -ETIMEDOUT or -ERESTARTNOINTR + * - 0 = no early wakeup detected; + * - <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * If 4 or 7, we cleanup and return with -ETIMEDOUT. * * Return: - * 0 - On success; - * <0 - On error + * - 0 - On success; + * - <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 198527a62149..858a07590e39 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock); * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) + * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging) * * This function is similar to (but not equivalent to) down(). */ -- cgit From c0c6e0850514c16814c37f64a9a1bcc6c52a19ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 14 May 2017 12:03:39 -0300 Subject: irq: update genericirq book location This book got converted from DocBook. Update its references to point to the current location. Signed-off-by: Mauro Carvalho Chehab --- kernel/irq/chip.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/irqdesc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..4188a0a7691f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -7,7 +7,7 @@ * This file contains the core interrupt handling code, for irq-chip * based architectures. 
* - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst */ #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..bbf9a7174283 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -6,7 +6,7 @@ * * This file contains the core interrupt handling code. * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..22e443133987 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -4,7 +4,7 @@ * * This file contains the interrupt descriptor management code * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ #include -- cgit From af777cd1b83e95138e7285fde87c795ef0ae7c4d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 13 May 2017 04:51:40 -0700 Subject: doc: ReSTify credentials.txt This updates the credentials API documentation to ReST markup and moves it under the security subsection of kernel API documentation. Cc: David Howells Signed-off-by: Kees Cook Signed-off-by: Jonathan Corbet --- Documentation/security/00-INDEX | 2 - Documentation/security/credentials.rst | 554 +++++++++++++++++++++++++++++++ Documentation/security/credentials.txt | 581 --------------------------------- Documentation/security/index.rst | 1 + include/linux/cred.h | 2 +- kernel/cred.c | 2 +- 6 files changed, 557 insertions(+), 585 deletions(-) create mode 100644 Documentation/security/credentials.rst delete mode 100644 Documentation/security/credentials.txt (limited to 'kernel') diff --git a/Documentation/security/00-INDEX b/Documentation/security/00-INDEX index 414235c1fcfc..c4df62a9ae5b 100644 --- a/Documentation/security/00-INDEX +++ b/Documentation/security/00-INDEX @@ -10,8 +10,6 @@ Yama.txt - documentation on the Yama Linux Security Module. apparmor.txt - documentation on the AppArmor security extension. -credentials.txt - - documentation about credentials in Linux. keys-ecryptfs.txt - description of the encryption keys for the ecryptfs filesystem. keys-request-key.txt diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst new file mode 100644 index 000000000000..038a7e19eff9 --- /dev/null +++ b/Documentation/security/credentials.rst @@ -0,0 +1,554 @@ +==================== +Credentials in Linux +==================== + +By: David Howells + +.. contents:: :local: + +Overview +======== + +There are several parts to the security check performed by Linux when one +object acts upon another: + + 1. Objects. + + Objects are things in the system that may be acted upon directly by + userspace programs. Linux has a variety of actionable objects, including: + + - Tasks + - Files/inodes + - Sockets + - Message queues + - Shared memory segments + - Semaphores + - Keys + + As a part of the description of all these objects there is a set of + credentials. What's in the set depends on the type of object. + + 2. Object ownership. + + Amongst the credentials of most objects, there will be a subset that + indicates the ownership of that object. This is used for resource + accounting and limitation (disk quotas and task rlimits for example). + + In a standard UNIX filesystem, for instance, this will be defined by the + UID marked on the inode. + + 3. The objective context. 
+ + Also amongst the credentials of those objects, there will be a subset that + indicates the 'objective context' of that object. This may or may not be + the same set as in (2) - in standard UNIX files, for instance, this is the + defined by the UID and the GID marked on the inode. + + The objective context is used as part of the security calculation that is + carried out when an object is acted upon. + + 4. Subjects. + + A subject is an object that is acting upon another object. + + Most of the objects in the system are inactive: they don't act on other + objects within the system. Processes/tasks are the obvious exception: + they do stuff; they access and manipulate things. + + Objects other than tasks may under some circumstances also be subjects. + For instance an open file may send SIGIO to a task using the UID and EUID + given to it by a task that called ``fcntl(F_SETOWN)`` upon it. In this case, + the file struct will have a subjective context too. + + 5. The subjective context. + + A subject has an additional interpretation of its credentials. A subset + of its credentials forms the 'subjective context'. The subjective context + is used as part of the security calculation that is carried out when a + subject acts. + + A Linux task, for example, has the FSUID, FSGID and the supplementary + group list for when it is acting upon a file - which are quite separate + from the real UID and GID that normally form the objective context of the + task. + + 6. Actions. + + Linux has a number of actions available that a subject may perform upon an + object. The set of actions available depends on the nature of the subject + and the object. + + Actions include reading, writing, creating and deleting files; forking or + signalling and tracing tasks. + + 7. Rules, access control lists and security calculations. + + When a subject acts upon an object, a security calculation is made. This + involves taking the subjective context, the objective context and the + action, and searching one or more sets of rules to see whether the subject + is granted or denied permission to act in the desired manner on the + object, given those contexts. + + There are two main sources of rules: + + a. Discretionary access control (DAC): + + Sometimes the object will include sets of rules as part of its + description. This is an 'Access Control List' or 'ACL'. A Linux + file may supply more than one ACL. + + A traditional UNIX file, for example, includes a permissions mask that + is an abbreviated ACL with three fixed classes of subject ('user', + 'group' and 'other'), each of which may be granted certain privileges + ('read', 'write' and 'execute' - whatever those map to for the object + in question). UNIX file permissions do not allow the arbitrary + specification of subjects, however, and so are of limited use. + + A Linux file might also sport a POSIX ACL. This is a list of rules + that grants various permissions to arbitrary subjects. + + b. Mandatory access control (MAC): + + The system as a whole may have one or more sets of rules that get + applied to all subjects and objects, regardless of their source. + SELinux and Smack are examples of this. + + In the case of SELinux and Smack, each object is given a label as part + of its credentials. When an action is requested, they take the + subject label, the object label and the action and look for a rule + that says that this action is either granted or denied. 
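+
+As a hedged illustration of the DAC rules in (a) above, the traditional
+permissions-mask calculation simply selects one of the three fixed subject
+classes from the objective context and compares it against the wanted
+access.  The sketch below is illustrative only (``dac_may()`` is an
+invented name); the kernel's real check, ``generic_permission()``, also
+deals with supplementary groups, POSIX ACLs and capability overrides::
+
+	/* Subjective context: fsuid/fsgid.  Objective: i_uid/i_gid/i_mode. */
+	static bool dac_may(uid_t fsuid, gid_t fsgid,
+			    uid_t i_uid, gid_t i_gid,
+			    umode_t i_mode, umode_t want)
+	{
+		umode_t mode = i_mode;
+
+		if (fsuid == i_uid)
+			mode >>= 6;		/* 'user' class */
+		else if (fsgid == i_gid)
+			mode >>= 3;		/* 'group' class */
+						/* else: 'other' class */
+		return (mode & want) == want;	/* want: 4=read 2=write 1=exec */
+	}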
+ + +Types of Credentials +==================== + +The Linux kernel supports the following types of credentials: + + 1. Traditional UNIX credentials. + + - Real User ID + - Real Group ID + + The UID and GID are carried by most, if not all, Linux objects, even if in + some cases it has to be invented (FAT or CIFS files for example, which are + derived from Windows). These (mostly) define the objective context of + that object, with tasks being slightly different in some cases. + + - Effective, Saved and FS User ID + - Effective, Saved and FS Group ID + - Supplementary groups + + These are additional credentials used by tasks only. Usually, an + EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID + will be used as the objective. For tasks, it should be noted that this is + not always true. + + 2. Capabilities. + + - Set of permitted capabilities + - Set of inheritable capabilities + - Set of effective capabilities + - Capability bounding set + + These are only carried by tasks. They indicate superior capabilities + granted piecemeal to a task that an ordinary task wouldn't otherwise have. + These are manipulated implicitly by changes to the traditional UNIX + credentials, but can also be manipulated directly by the ``capset()`` + system call. + + The permitted capabilities are those caps that the process might grant + itself to its effective or permitted sets through ``capset()``. This + inheritable set might also be so constrained. + + The effective capabilities are the ones that a task is actually allowed to + make use of itself. + + The inheritable capabilities are the ones that may get passed across + ``execve()``. + + The bounding set limits the capabilities that may be inherited across + ``execve()``, especially when a binary is executed that will execute as + UID 0. + + 3. Secure management flags (securebits). + + These are only carried by tasks. These govern the way the above + credentials are manipulated and inherited over certain operations such as + execve(). They aren't used directly as objective or subjective + credentials. + + 4. Keys and keyrings. + + These are only carried by tasks. They carry and cache security tokens + that don't fit into the other standard UNIX credentials. They are for + making such things as network filesystem keys available to the file + accesses performed by processes, without the necessity of ordinary + programs having to know about security details involved. + + Keyrings are a special type of key. They carry sets of other keys and can + be searched for the desired key. Each process may subscribe to a number + of keyrings: + + Per-thread keying + Per-process keyring + Per-session keyring + + When a process accesses a key, if not already present, it will normally be + cached on one of these keyrings for future accesses to find. + + For more information on using keys, see Documentation/security/keys.txt. + + 5. LSM + + The Linux Security Module allows extra controls to be placed over the + operations that a task may do. Currently Linux supports several LSM + options. + + Some work by labelling the objects in a system and then applying sets of + rules (policies) that say what operations a task with one label may do to + an object with another label. + + 6. AF_KEY + + This is a socket-based approach to credential management for networking + stacks [RFC 2367]. It isn't discussed by this document as it doesn't + interact directly with task and file credentials; rather it keeps system + level credentials. 
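+
+To make item 2 above concrete: privileged operations in the kernel test a
+single effective capability rather than ``uid == 0``.  A minimal,
+illustrative pattern (the function is invented for the example;
+``capable()`` is the real interface, and it also gives the LSM a chance
+to veto the grant)::
+
+	static int set_system_clock(const struct timespec64 *ts)
+	{
+		/* Needs CAP_SYS_TIME in the caller's effective set. */
+		if (!capable(CAP_SYS_TIME))
+			return -EPERM;
+
+		/* ... actually program the clock ... */
+		return 0;
+	}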
+ + +When a file is opened, part of the opening task's subjective context is +recorded in the file struct created. This allows operations using that file +struct to use those credentials instead of the subjective context of the task +that issued the operation. An example of this would be a file opened on a +network filesystem where the credentials of the opened file should be presented +to the server, regardless of who is actually doing a read or a write upon it. + + +File Markings +============= + +Files on disk or obtained over the network may have annotations that form the +objective security context of that file. Depending on the type of filesystem, +this may include one or more of the following: + + * UNIX UID, GID, mode; + * Windows user ID; + * Access control list; + * LSM security label; + * UNIX exec privilege escalation bits (SUID/SGID); + * File capabilities exec privilege escalation bits. + +These are compared to the task's subjective security context, and certain +operations allowed or disallowed as a result. In the case of execve(), the +privilege escalation bits come into play, and may allow the resulting process +extra privileges, based on the annotations on the executable file. + + +Task Credentials +================ + +In Linux, all of a task's credentials are held in (uid, gid) or through +(groups, keys, LSM security) a refcounted structure of type 'struct cred'. +Each task points to its credentials by a pointer called 'cred' in its +task_struct. + +Once a set of credentials has been prepared and committed, it may not be +changed, barring the following exceptions: + + 1. its reference count may be changed; + + 2. the reference count on the group_info struct it points to may be changed; + + 3. the reference count on the security data it points to may be changed; + + 4. the reference count on any keyrings it points to may be changed; + + 5. any keyrings it points to may be revoked, expired or have their security + attributes changed; and + + 6. the contents of any keyrings to which it points may be changed (the whole + point of keyrings being a shared set of credentials, modifiable by anyone + with appropriate access). + +To alter anything in the cred struct, the copy-and-replace principle must be +adhered to. First take a copy, then alter the copy and then use RCU to change +the task pointer to make it point to the new copy. There are wrappers to aid +with this (see below). + +A task may only alter its _own_ credentials; it is no longer permitted for a +task to alter another's credentials. This means the ``capset()`` system call +is no longer permitted to take any PID other than the one of the current +process. Also ``keyctl_instantiate()`` and ``keyctl_negate()`` functions no +longer permit attachment to process-specific keyrings in the requesting +process as the instantiating process may need to create them. + + +Immutable Credentials +--------------------- + +Once a set of credentials has been made public (by calling ``commit_creds()`` +for example), it must be considered immutable, barring two exceptions: + + 1. The reference count may be altered. + + 2. Whilst the keyring subscriptions of a set of credentials may not be + changed, the keyrings subscribed to may have their contents altered. + +To catch accidental credential alteration at compile time, struct task_struct +has _const_ pointers to its credential sets, as does struct file. 
Furthermore, +certain functions such as ``get_cred()`` and ``put_cred()`` operate on const +pointers, thus rendering casts unnecessary, but require to temporarily ditch +the const qualification to be able to alter the reference count. + + +Accessing Task Credentials +-------------------------- + +A task being able to alter only its own credentials permits the current process +to read or replace its own credentials without the need for any form of locking +-- which simplifies things greatly. It can just call:: + + const struct cred *current_cred() + +to get a pointer to its credentials structure, and it doesn't have to release +it afterwards. + +There are convenience wrappers for retrieving specific aspects of a task's +credentials (the value is simply returned in each case):: + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + void *current_security(void) Current's LSM security pointer + struct user_struct *current_user(void) Current's user account + +There are also convenience wrappers for retrieving specific associated pairs of +a task's credentials:: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +which return these pairs of values through their arguments after retrieving +them from the current task's credentials. + + +In addition, there is a function for obtaining a reference on the current +process's current set of credentials:: + + const struct cred *get_current_cred(void); + +and functions for getting references to one of the credentials that don't +actually live in struct cred:: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +which get references to the current process's user accounting structure and +supplementary groups list respectively. + +Once a reference has been obtained, it must be released with ``put_cred()``, +``free_uid()`` or ``put_group_info()`` as appropriate. + + +Accessing Another Task's Credentials +------------------------------------ + +Whilst a task may access its own credentials without the need for locking, the +same is not true of a task wanting to access another task's credentials. It +must use the RCU read lock and ``rcu_dereference()``. + +The ``rcu_dereference()`` is wrapped by:: + + const struct cred *__task_cred(struct task_struct *task); + +This should be used inside the RCU read lock, as in the following example:: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +Should it be necessary to hold another task's credentials for a long period of +time, and possibly to sleep whilst doing so, then the caller should get a +reference on them using:: + + const struct cred *get_task_cred(struct task_struct *task); + +This does all the RCU magic inside of it. The caller must call put_cred() on +the credentials so obtained when they're finished with. + +.. note:: + The result of ``__task_cred()`` should not be passed directly to + ``get_cred()`` as this may race with ``commit_cred()``. 
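+
+For example, a caller that needs to sleep while examining another task's
+credentials should take its own reference first.  A minimal sketch,
+reusing the ``foo_data`` structure from the example above::
+
+	void foo_sleepy(struct task_struct *t, struct foo_data *f)
+	{
+		const struct cred *tcred = get_task_cred(t);
+
+		/* Safe to sleep here: the reference keeps tcred alive. */
+		f->uid = tcred->uid;
+		f->gid = tcred->gid;
+
+		put_cred(tcred);
+	}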
+ +There are a couple of convenience functions to access bits of another task's +credentials, hiding the RCU magic from the caller:: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +If the caller is holding the RCU read lock at the time anyway, then:: + + __task_cred(task)->uid + __task_cred(task)->euid + +should be used instead. Similarly, if multiple aspects of a task's credentials +need to be accessed, RCU read lock should be used, ``__task_cred()`` called, +the result stored in a temporary pointer and then the credential aspects called +from that before dropping the lock. This prevents the potentially expensive +RCU magic from being invoked multiple times. + +Should some other single aspect of another task's credentials need to be +accessed, then this can be used:: + + task_cred_xxx(task, member) + +where 'member' is a non-pointer member of the cred struct. For instance:: + + uid_t task_cred_xxx(task, suid); + +will retrieve 'struct cred::suid' from the task, doing the appropriate RCU +magic. This may not be used for pointer members as what they point to may +disappear the moment the RCU read lock is dropped. + + +Altering Credentials +-------------------- + +As previously mentioned, a task may only alter its own credentials, and may not +alter those of another task. This means that it doesn't need to use any +locking to alter its own credentials. + +To alter the current process's credentials, a function should first prepare a +new set of credentials by calling:: + + struct cred *prepare_creds(void); + +this locks current->cred_replace_mutex and then allocates and constructs a +duplicate of the current process's credentials, returning with the mutex still +held if successful. It returns NULL if not successful (out of memory). + +The mutex prevents ``ptrace()`` from altering the ptrace state of a process +whilst security checks on credentials construction and changing is taking place +as the ptrace state may alter the outcome, particularly in the case of +``execve()``. + +The new credentials set should be altered appropriately, and any security +checks and hooks done. Both the current and the proposed sets of credentials +are available for this purpose as current_cred() will return the current set +still at this point. + + +When the credential set is ready, it should be committed to the current process +by calling:: + + int commit_creds(struct cred *new); + +This will alter various aspects of the credentials and the process, giving the +LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to +actually commit the new credentials to ``current->cred``, it will release +``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it +will notify the scheduler and others of the changes. + +This function is guaranteed to return 0, so that it can be tail-called at the +end of such functions as ``sys_setresuid()``. + +Note that this function consumes the caller's reference to the new credentials. +The caller should _not_ call ``put_cred()`` on the new credentials afterwards. + +Furthermore, once this function has been called on a new set of credentials, +those credentials may _not_ be changed further. + + +Should the security checks fail or some other error occur after +``prepare_creds()`` has been called, then the following function should be +invoked:: + + void abort_creds(struct cred *new); + +This releases the lock on ``current->cred_replace_mutex`` that +``prepare_creds()`` got and then releases the new credentials. 
+ + +A typical credentials alteration function would look something like this:: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +Managing Credentials +-------------------- + +There are some functions to help manage credentials: + + - ``void put_cred(const struct cred *cred);`` + + This releases a reference to the given set of credentials. If the + reference count reaches zero, the credentials will be scheduled for + destruction by the RCU system. + + - ``const struct cred *get_cred(const struct cred *cred);`` + + This gets a reference on a live set of credentials, returning a pointer to + that set of credentials. + + - ``struct cred *get_new_cred(struct cred *cred);`` + + This gets a reference on a set of credentials that is under construction + and is thus still mutable, returning a pointer to that set of credentials. + + +Open File Credentials +===================== + +When a new file is opened, a reference is obtained on the opening task's +credentials and this is attached to the file struct as ``f_cred`` in place of +``f_uid`` and ``f_gid``. Code that used to access ``file->f_uid`` and +``file->f_gid`` should now access ``file->f_cred->fsuid`` and +``file->f_cred->fsgid``. + +It is safe to access ``f_cred`` without the use of RCU or locking because the +pointer will not change over the lifetime of the file struct, and nor will the +contents of the cred struct pointed to, barring the exceptions listed above +(see the Task Credentials section). + + +Overriding the VFS's Use of Credentials +======================================= + +Under some circumstances it is desirable to override the credentials used by +the VFS, and that can be done by calling into such as ``vfs_mkdir()`` with a +different set of credentials. This is done in the following places: + + * ``sys_faccessat()``. + * ``do_coredump()``. + * nfs4recover.c. diff --git a/Documentation/security/credentials.txt b/Documentation/security/credentials.txt deleted file mode 100644 index 86257052e31a..000000000000 --- a/Documentation/security/credentials.txt +++ /dev/null @@ -1,581 +0,0 @@ - ==================== - CREDENTIALS IN LINUX - ==================== - -By: David Howells - -Contents: - - (*) Overview. - - (*) Types of credentials. - - (*) File markings. - - (*) Task credentials. - - - Immutable credentials. - - Accessing task credentials. - - Accessing another task's credentials. - - Altering credentials. - - Managing credentials. - - (*) Open file credentials. - - (*) Overriding the VFS's use of credentials. - - -======== -OVERVIEW -======== - -There are several parts to the security check performed by Linux when one -object acts upon another: - - (1) Objects. - - Objects are things in the system that may be acted upon directly by - userspace programs. Linux has a variety of actionable objects, including: - - - Tasks - - Files/inodes - - Sockets - - Message queues - - Shared memory segments - - Semaphores - - Keys - - As a part of the description of all these objects there is a set of - credentials. What's in the set depends on the type of object. - - (2) Object ownership. - - Amongst the credentials of most objects, there will be a subset that - indicates the ownership of that object. This is used for resource - accounting and limitation (disk quotas and task rlimits for example). 
- - In a standard UNIX filesystem, for instance, this will be defined by the - UID marked on the inode. - - (3) The objective context. - - Also amongst the credentials of those objects, there will be a subset that - indicates the 'objective context' of that object. This may or may not be - the same set as in (2) - in standard UNIX files, for instance, this is the - defined by the UID and the GID marked on the inode. - - The objective context is used as part of the security calculation that is - carried out when an object is acted upon. - - (4) Subjects. - - A subject is an object that is acting upon another object. - - Most of the objects in the system are inactive: they don't act on other - objects within the system. Processes/tasks are the obvious exception: - they do stuff; they access and manipulate things. - - Objects other than tasks may under some circumstances also be subjects. - For instance an open file may send SIGIO to a task using the UID and EUID - given to it by a task that called fcntl(F_SETOWN) upon it. In this case, - the file struct will have a subjective context too. - - (5) The subjective context. - - A subject has an additional interpretation of its credentials. A subset - of its credentials forms the 'subjective context'. The subjective context - is used as part of the security calculation that is carried out when a - subject acts. - - A Linux task, for example, has the FSUID, FSGID and the supplementary - group list for when it is acting upon a file - which are quite separate - from the real UID and GID that normally form the objective context of the - task. - - (6) Actions. - - Linux has a number of actions available that a subject may perform upon an - object. The set of actions available depends on the nature of the subject - and the object. - - Actions include reading, writing, creating and deleting files; forking or - signalling and tracing tasks. - - (7) Rules, access control lists and security calculations. - - When a subject acts upon an object, a security calculation is made. This - involves taking the subjective context, the objective context and the - action, and searching one or more sets of rules to see whether the subject - is granted or denied permission to act in the desired manner on the - object, given those contexts. - - There are two main sources of rules: - - (a) Discretionary access control (DAC): - - Sometimes the object will include sets of rules as part of its - description. This is an 'Access Control List' or 'ACL'. A Linux - file may supply more than one ACL. - - A traditional UNIX file, for example, includes a permissions mask that - is an abbreviated ACL with three fixed classes of subject ('user', - 'group' and 'other'), each of which may be granted certain privileges - ('read', 'write' and 'execute' - whatever those map to for the object - in question). UNIX file permissions do not allow the arbitrary - specification of subjects, however, and so are of limited use. - - A Linux file might also sport a POSIX ACL. This is a list of rules - that grants various permissions to arbitrary subjects. - - (b) Mandatory access control (MAC): - - The system as a whole may have one or more sets of rules that get - applied to all subjects and objects, regardless of their source. - SELinux and Smack are examples of this. - - In the case of SELinux and Smack, each object is given a label as part - of its credentials. 
When an action is requested, they take the - subject label, the object label and the action and look for a rule - that says that this action is either granted or denied. - - -==================== -TYPES OF CREDENTIALS -==================== - -The Linux kernel supports the following types of credentials: - - (1) Traditional UNIX credentials. - - Real User ID - Real Group ID - - The UID and GID are carried by most, if not all, Linux objects, even if in - some cases it has to be invented (FAT or CIFS files for example, which are - derived from Windows). These (mostly) define the objective context of - that object, with tasks being slightly different in some cases. - - Effective, Saved and FS User ID - Effective, Saved and FS Group ID - Supplementary groups - - These are additional credentials used by tasks only. Usually, an - EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID - will be used as the objective. For tasks, it should be noted that this is - not always true. - - (2) Capabilities. - - Set of permitted capabilities - Set of inheritable capabilities - Set of effective capabilities - Capability bounding set - - These are only carried by tasks. They indicate superior capabilities - granted piecemeal to a task that an ordinary task wouldn't otherwise have. - These are manipulated implicitly by changes to the traditional UNIX - credentials, but can also be manipulated directly by the capset() system - call. - - The permitted capabilities are those caps that the process might grant - itself to its effective or permitted sets through capset(). This - inheritable set might also be so constrained. - - The effective capabilities are the ones that a task is actually allowed to - make use of itself. - - The inheritable capabilities are the ones that may get passed across - execve(). - - The bounding set limits the capabilities that may be inherited across - execve(), especially when a binary is executed that will execute as UID 0. - - (3) Secure management flags (securebits). - - These are only carried by tasks. These govern the way the above - credentials are manipulated and inherited over certain operations such as - execve(). They aren't used directly as objective or subjective - credentials. - - (4) Keys and keyrings. - - These are only carried by tasks. They carry and cache security tokens - that don't fit into the other standard UNIX credentials. They are for - making such things as network filesystem keys available to the file - accesses performed by processes, without the necessity of ordinary - programs having to know about security details involved. - - Keyrings are a special type of key. They carry sets of other keys and can - be searched for the desired key. Each process may subscribe to a number - of keyrings: - - Per-thread keying - Per-process keyring - Per-session keyring - - When a process accesses a key, if not already present, it will normally be - cached on one of these keyrings for future accesses to find. - - For more information on using keys, see Documentation/security/keys.txt. - - (5) LSM - - The Linux Security Module allows extra controls to be placed over the - operations that a task may do. Currently Linux supports several LSM - options. - - Some work by labelling the objects in a system and then applying sets of - rules (policies) that say what operations a task with one label may do to - an object with another label. - - (6) AF_KEY - - This is a socket-based approach to credential management for networking - stacks [RFC 2367]. 
It isn't discussed by this document as it doesn't - interact directly with task and file credentials; rather it keeps system - level credentials. - - -When a file is opened, part of the opening task's subjective context is -recorded in the file struct created. This allows operations using that file -struct to use those credentials instead of the subjective context of the task -that issued the operation. An example of this would be a file opened on a -network filesystem where the credentials of the opened file should be presented -to the server, regardless of who is actually doing a read or a write upon it. - - -============= -FILE MARKINGS -============= - -Files on disk or obtained over the network may have annotations that form the -objective security context of that file. Depending on the type of filesystem, -this may include one or more of the following: - - (*) UNIX UID, GID, mode; - - (*) Windows user ID; - - (*) Access control list; - - (*) LSM security label; - - (*) UNIX exec privilege escalation bits (SUID/SGID); - - (*) File capabilities exec privilege escalation bits. - -These are compared to the task's subjective security context, and certain -operations allowed or disallowed as a result. In the case of execve(), the -privilege escalation bits come into play, and may allow the resulting process -extra privileges, based on the annotations on the executable file. - - -================ -TASK CREDENTIALS -================ - -In Linux, all of a task's credentials are held in (uid, gid) or through -(groups, keys, LSM security) a refcounted structure of type 'struct cred'. -Each task points to its credentials by a pointer called 'cred' in its -task_struct. - -Once a set of credentials has been prepared and committed, it may not be -changed, barring the following exceptions: - - (1) its reference count may be changed; - - (2) the reference count on the group_info struct it points to may be changed; - - (3) the reference count on the security data it points to may be changed; - - (4) the reference count on any keyrings it points to may be changed; - - (5) any keyrings it points to may be revoked, expired or have their security - attributes changed; and - - (6) the contents of any keyrings to which it points may be changed (the whole - point of keyrings being a shared set of credentials, modifiable by anyone - with appropriate access). - -To alter anything in the cred struct, the copy-and-replace principle must be -adhered to. First take a copy, then alter the copy and then use RCU to change -the task pointer to make it point to the new copy. There are wrappers to aid -with this (see below). - -A task may only alter its _own_ credentials; it is no longer permitted for a -task to alter another's credentials. This means the capset() system call is no -longer permitted to take any PID other than the one of the current process. -Also keyctl_instantiate() and keyctl_negate() functions no longer permit -attachment to process-specific keyrings in the requesting process as the -instantiating process may need to create them. - - -IMMUTABLE CREDENTIALS ---------------------- - -Once a set of credentials has been made public (by calling commit_creds() for -example), it must be considered immutable, barring two exceptions: - - (1) The reference count may be altered. - - (2) Whilst the keyring subscriptions of a set of credentials may not be - changed, the keyrings subscribed to may have their contents altered. 
- -To catch accidental credential alteration at compile time, struct task_struct -has _const_ pointers to its credential sets, as does struct file. Furthermore, -certain functions such as get_cred() and put_cred() operate on const pointers, -thus rendering casts unnecessary, but require to temporarily ditch the const -qualification to be able to alter the reference count. - - -ACCESSING TASK CREDENTIALS --------------------------- - -A task being able to alter only its own credentials permits the current process -to read or replace its own credentials without the need for any form of locking -- which simplifies things greatly. It can just call: - - const struct cred *current_cred() - -to get a pointer to its credentials structure, and it doesn't have to release -it afterwards. - -There are convenience wrappers for retrieving specific aspects of a task's -credentials (the value is simply returned in each case): - - uid_t current_uid(void) Current's real UID - gid_t current_gid(void) Current's real GID - uid_t current_euid(void) Current's effective UID - gid_t current_egid(void) Current's effective GID - uid_t current_fsuid(void) Current's file access UID - gid_t current_fsgid(void) Current's file access GID - kernel_cap_t current_cap(void) Current's effective capabilities - void *current_security(void) Current's LSM security pointer - struct user_struct *current_user(void) Current's user account - -There are also convenience wrappers for retrieving specific associated pairs of -a task's credentials: - - void current_uid_gid(uid_t *, gid_t *); - void current_euid_egid(uid_t *, gid_t *); - void current_fsuid_fsgid(uid_t *, gid_t *); - -which return these pairs of values through their arguments after retrieving -them from the current task's credentials. - - -In addition, there is a function for obtaining a reference on the current -process's current set of credentials: - - const struct cred *get_current_cred(void); - -and functions for getting references to one of the credentials that don't -actually live in struct cred: - - struct user_struct *get_current_user(void); - struct group_info *get_current_groups(void); - -which get references to the current process's user accounting structure and -supplementary groups list respectively. - -Once a reference has been obtained, it must be released with put_cred(), -free_uid() or put_group_info() as appropriate. - - -ACCESSING ANOTHER TASK'S CREDENTIALS ------------------------------------- - -Whilst a task may access its own credentials without the need for locking, the -same is not true of a task wanting to access another task's credentials. It -must use the RCU read lock and rcu_dereference(). - -The rcu_dereference() is wrapped by: - - const struct cred *__task_cred(struct task_struct *task); - -This should be used inside the RCU read lock, as in the following example: - - void foo(struct task_struct *t, struct foo_data *f) - { - const struct cred *tcred; - ... - rcu_read_lock(); - tcred = __task_cred(t); - f->uid = tcred->uid; - f->gid = tcred->gid; - f->groups = get_group_info(tcred->groups); - rcu_read_unlock(); - ... - } - -Should it be necessary to hold another task's credentials for a long period of -time, and possibly to sleep whilst doing so, then the caller should get a -reference on them using: - - const struct cred *get_task_cred(struct task_struct *task); - -This does all the RCU magic inside of it. The caller must call put_cred() on -the credentials so obtained when they're finished with. 
- - [*] Note: The result of __task_cred() should not be passed directly to - get_cred() as this may race with commit_cred(). - -There are a couple of convenience functions to access bits of another task's -credentials, hiding the RCU magic from the caller: - - uid_t task_uid(task) Task's real UID - uid_t task_euid(task) Task's effective UID - -If the caller is holding the RCU read lock at the time anyway, then: - - __task_cred(task)->uid - __task_cred(task)->euid - -should be used instead. Similarly, if multiple aspects of a task's credentials -need to be accessed, RCU read lock should be used, __task_cred() called, the -result stored in a temporary pointer and then the credential aspects called -from that before dropping the lock. This prevents the potentially expensive -RCU magic from being invoked multiple times. - -Should some other single aspect of another task's credentials need to be -accessed, then this can be used: - - task_cred_xxx(task, member) - -where 'member' is a non-pointer member of the cred struct. For instance: - - uid_t task_cred_xxx(task, suid); - -will retrieve 'struct cred::suid' from the task, doing the appropriate RCU -magic. This may not be used for pointer members as what they point to may -disappear the moment the RCU read lock is dropped. - - -ALTERING CREDENTIALS --------------------- - -As previously mentioned, a task may only alter its own credentials, and may not -alter those of another task. This means that it doesn't need to use any -locking to alter its own credentials. - -To alter the current process's credentials, a function should first prepare a -new set of credentials by calling: - - struct cred *prepare_creds(void); - -this locks current->cred_replace_mutex and then allocates and constructs a -duplicate of the current process's credentials, returning with the mutex still -held if successful. It returns NULL if not successful (out of memory). - -The mutex prevents ptrace() from altering the ptrace state of a process whilst -security checks on credentials construction and changing is taking place as -the ptrace state may alter the outcome, particularly in the case of execve(). - -The new credentials set should be altered appropriately, and any security -checks and hooks done. Both the current and the proposed sets of credentials -are available for this purpose as current_cred() will return the current set -still at this point. - - -When the credential set is ready, it should be committed to the current process -by calling: - - int commit_creds(struct cred *new); - -This will alter various aspects of the credentials and the process, giving the -LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually -commit the new credentials to current->cred, it will release -current->cred_replace_mutex to allow ptrace() to take place, and it will notify -the scheduler and others of the changes. - -This function is guaranteed to return 0, so that it can be tail-called at the -end of such functions as sys_setresuid(). - -Note that this function consumes the caller's reference to the new credentials. -The caller should _not_ call put_cred() on the new credentials afterwards. - -Furthermore, once this function has been called on a new set of credentials, -those credentials may _not_ be changed further. 
- - -Should the security checks fail or some other error occur after prepare_creds() -has been called, then the following function should be invoked: - - void abort_creds(struct cred *new); - -This releases the lock on current->cred_replace_mutex that prepare_creds() got -and then releases the new credentials. - - -A typical credentials alteration function would look something like this: - - int alter_suid(uid_t suid) - { - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - new->suid = suid; - ret = security_alter_suid(new); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); - } - - -MANAGING CREDENTIALS --------------------- - -There are some functions to help manage credentials: - - (*) void put_cred(const struct cred *cred); - - This releases a reference to the given set of credentials. If the - reference count reaches zero, the credentials will be scheduled for - destruction by the RCU system. - - (*) const struct cred *get_cred(const struct cred *cred); - - This gets a reference on a live set of credentials, returning a pointer to - that set of credentials. - - (*) struct cred *get_new_cred(struct cred *cred); - - This gets a reference on a set of credentials that is under construction - and is thus still mutable, returning a pointer to that set of credentials. - - -===================== -OPEN FILE CREDENTIALS -===================== - -When a new file is opened, a reference is obtained on the opening task's -credentials and this is attached to the file struct as 'f_cred' in place of -'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid -should now access file->f_cred->fsuid and file->f_cred->fsgid. - -It is safe to access f_cred without the use of RCU or locking because the -pointer will not change over the lifetime of the file struct, and nor will the -contents of the cred struct pointed to, barring the exceptions listed above -(see the Task Credentials section). - - -======================================= -OVERRIDING THE VFS'S USE OF CREDENTIALS -======================================= - -Under some circumstances it is desirable to override the credentials used by -the VFS, and that can be done by calling into such as vfs_mkdir() with a -different set of credentials. This is done in the following places: - - (*) sys_faccessat(). - - (*) do_coredump(). - - (*) nfs4recover.c. diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index 07335659ce8d..415be8e0b013 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -5,5 +5,6 @@ Security Documentation .. toctree:: :maxdepth: 1 + credentials IMA-templates tpm/index diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..c728d515e5e2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management - see Documentation/security/credentials.txt +/* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/kernel/cred.c b/kernel/cred.c index 2bc66075740f..ecf03657e71c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/security/credentials.txt +/* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) -- cgit From 719f6a7040f1bdaf96fcc709d272548facb88e90 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 20 Apr 2017 10:52:31 +0200 Subject: printk: Use the main logbuf in NMI when logbuf_lock is available The commit 42a0bb3f71383b457a7d ("printk/nmi: generic solution for safe printk in NMI") caused that printk stores messages into a temporary buffer in NMI context. The buffer is per-CPU and therefore the size is rather limited. It works quite well for NMI backtraces. But there are longer logs that might get printed in NMI context, for example, lockdep warnings, ftrace_dump_on_oops. The temporary buffer is used to avoid deadlocks caused by logbuf_lock. Also it is needed to avoid races with the other temporary buffer that is used when PRINTK_SAFE_CONTEXT is entered. But the main buffer can be used in NMI if the lock is available and we did not interrupt PRINTK_SAFE_CONTEXT. The lock is checked using raw_spin_is_locked(). It might cause false negatives when the lock is taken on another CPU and this CPU is in the safe context from other reasons. Note that the safe context is used also to get console semaphore or when calling console drivers. For this reason, we do the check in printk_nmi_enter(). It makes the handling consistent for the entire NMI handler and avoids reshuffling of the messages. The patch also defines special printk context that allows to use printk_deferred() in NMI. Note that we could not flush the messages to the consoles because console drivers might use many other internal locks. The newly created vprintk_deferred() disables the preemption only around the irq work handling. It is needed there to keep the consistency between the two per-CPU variables. But there is no reason to disable preemption around vprintk_emit(). Finally, the patch puts back explicit serialization of the NMI backtraces from different CPUs. It was removed by the commit a9edc88093287183ac934b ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). It was not needed because the flushing of the temporary per-CPU buffers was serialized. 
Link: http://lkml.kernel.org/r/1493912763-24873-1-git-send-email-pmladek@suse.com Cc: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Russell King Cc: Daniel Thompson Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Chris Metcalf Cc: x86@kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Suggested-by: Sergey Senozhatsky Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 6 ++++-- kernel/printk/printk.c | 19 ++++++++++++++----- kernel/printk/printk_safe.c | 26 ++++++++++++++++++++++++-- lib/nmi_backtrace.c | 3 +++ 4 files changed, 45 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1db044f808b7..2a7d04049af4 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -18,12 +18,14 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff +#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; __printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..e5278e7d1922 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2722,16 +2722,13 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_deferred(const char *fmt, ...) +int vprintk_deferred(const char *fmt, va_list args) { - va_list args; int r; - preempt_disable(); - va_start(args, fmt); r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - va_end(args); + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); @@ -2739,6 +2736,18 @@ int printk_deferred(const char *fmt, ...) return r; } +int printk_deferred(const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_deferred(fmt, args); + va_end(args); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 033e50a7d706..03a42a539b20 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,12 +308,24 @@ static int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + /* + * The size of the extra per-CPU buffer is limited. Use it only when + * the main one is locked. If this CPU is not in the safe context, + * the lock must be taken on another CPU and we could wait for it. + */ + if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && + raw_spin_is_locked(&logbuf_lock)) { + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + } else { + this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); + } } void printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_and(printk_context, + ~(PRINTK_NMI_CONTEXT_MASK | + PRINTK_NMI_DEFERRED_CONTEXT_MASK)); } #else @@ -351,12 +363,22 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. 
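In effect, printk_nmi_enter() now chooses the target buffer as follows
(condensed sketch of the logic added by this patch):

	if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
	    raw_spin_is_locked(&logbuf_lock)) {
		/* The lock might be held by this CPU: per-CPU NMI buffer. */
		this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
	} else {
		/* Safe to wait for the lock: main logbuf, deferred consoles. */
		this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
	}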
*/ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); + /* Use extra buffer to prevent a recursion deadlock in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); + /* + * Use the main logbuf when logbuf_lock is available in NMI. + * But avoid calling console drivers that might have their own locks. + */ + if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) + return vprintk_deferred(fmt, args); + + /* No obstacles. */ return vprintk_default(fmt, args); } diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 4e8a30d1c22f..0bc0a3535a8a 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -86,9 +86,11 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, bool nmi_cpu_backtrace(struct pt_regs *regs) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + arch_spin_lock(&lock); if (regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", cpu, instruction_pointer(regs)); @@ -99,6 +101,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } + arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit From 7e95a225901a5d2fd140f14b4302805cecc22da7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:52:01 -0400 Subject: move compat wait4 and waitid next to native variants Signed-off-by: Al Viro --- kernel/compat.c | 66 ------------------------------------------------------ kernel/exit.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..b4cdba6bbd02 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -543,72 +543,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -COMPAT_SYSCALL_DEFINE4(wait4, - compat_pid_t, pid, - compat_uint_t __user *, stat_addr, - int, options, - struct compat_rusage __user *, ru) -{ - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; - } -} - -COMPAT_SYSCALL_DEFINE5(waitid, - int, which, compat_pid_t, pid, - struct compat_siginfo __user *, uinfo, int, options, - struct compat_rusage __user *, uru) -{ - siginfo_t info; - struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); - - set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? 
(struct rusage __user *)&ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); - else - ret = put_compat_rusage(&ru, uru); - if (ret) - return -EFAULT; - } - - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(uinfo, &info); -} - static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) { diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..f98782bd27b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1735,3 +1736,71 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) } #endif + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) +{ + if (!ru) { + return sys_wait4(pid, stat_addr, options, NULL); + } else { + struct rusage r; + int ret; + unsigned int status; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_wait4(pid, + (stat_addr ? + (unsigned int __user *) &status : NULL), + options, (struct rusage __user *) &r); + set_fs (old_fs); + + if (ret > 0) { + if (put_compat_rusage(&r, ru)) + return -EFAULT; + if (stat_addr && put_user(status, stat_addr)) + return -EFAULT; + } + return ret; + } +} + +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, infop, int, options, + struct compat_rusage __user *, uru) +{ + siginfo_t info; + struct rusage ru; + long ret; + mm_segment_t old_fs = get_fs(); + + memset(&info, 0, sizeof(info)); + + set_fs(KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? (struct rusage __user *)&ru : NULL); + set_fs(old_fs); + + if ((ret < 0) || (info.si_signo == 0)) + return ret; + + if (uru) { + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); + if (ret) + return -EFAULT; + } + + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return copy_siginfo_to_user32(infop, &info); +} +#endif -- cgit From ce72a16fa705f960ca2352e95a7c5f4801475e75 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:25:02 -0400 Subject: wait4(2)/waitid(2): separate copying rusage to userland New helpers: kernel_waitid() and kernel_wait4(). sys_waitid(), sys_wait4() and their compat variants switched to those. Copying struct rusage to userland is left to syscall itself. For compat_sys_wait4() that eliminates the use of set_fs() completely. For compat_sys_waitid() it's still needed (for siginfo handling); that will change shortly. 
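The resulting calling convention, condensed from the diff below (native wait4() shown; compat_sys_wait4() is the same except that it copies out with put_compat_rusage()):

	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
			int, options, struct rusage __user *, ru)
	{
		struct rusage r;
		long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

		if (err > 0) {
			/* the helper filled r; copying to userland happens here */
			if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
				return -EFAULT;
		}
		return err;
	}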
Signed-off-by: Al Viro --- include/linux/resource.h | 2 +- kernel/exit.c | 89 ++++++++++++++++++++++++++++-------------------- kernel/sys.c | 16 ++++----- 3 files changed, 59 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/resource.h b/include/linux/resource.h index 5bc3116e649c..277afdad6589 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -6,7 +6,7 @@ struct task_struct; -int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +void getrusage(struct task_struct *p, int who, struct rusage *ru); int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim); diff --git a/kernel/exit.c b/kernel/exit.c index f98782bd27b6..d44f12948c5f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1003,7 +1003,7 @@ struct wait_opts { struct siginfo __user *wo_info; int __user *wo_stat; - struct rusage __user *wo_rusage; + struct rusage *wo_rusage; wait_queue_t child_wait; int notask_error; @@ -1054,8 +1054,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { struct siginfo __user *infop; - int retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + int retval = 0; + + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; @@ -1182,8 +1184,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_unlock_irq(¤t->sighand->siglock); } - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; if (!retval && wo->wo_stat) @@ -1316,8 +1319,9 @@ unlock_sig: if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; if (!retval && wo->wo_stat) retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); @@ -1377,8 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (!wo->wo_info) { - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; put_task_struct(p); if (!retval && wo->wo_stat) retval = put_user(0xffff, wo->wo_stat); @@ -1618,8 +1623,8 @@ end: return retval; } -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) +static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1687,8 +1692,21 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return ret; } -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_waitid(which, upid, infop, options, ru ? 
&r : NULL); + + if (!err) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + +static long kernel_wait4(pid_t upid, int __user *stat_addr, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1724,6 +1742,19 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, return ret; } +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); + + if (err > 0) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + #ifdef __ARCH_WANT_SYS_WAITPID /* @@ -1744,29 +1775,13 @@ COMPAT_SYSCALL_DEFINE4(wait4, int, options, struct compat_rusage __user *, ru) { - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; + struct rusage r; + long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); + if (err > 0) { + if (ru && put_compat_rusage(&r, ru)) + return -EFAULT; } + return err; } COMPAT_SYSCALL_DEFINE5(waitid, @@ -1782,8 +1797,8 @@ COMPAT_SYSCALL_DEFINE5(waitid, memset(&info, 0, sizeof(info)); set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); + ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? &ru : NULL); set_fs(old_fs); if ((ret < 0) || (info.si_signo == 0)) diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..dab1a0658a92 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1552,7 +1552,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) r->ru_oublock += task_io_get_oublock(t); } -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; @@ -1626,20 +1626,16 @@ out: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; - return getrusage(current, who, ru); + + getrusage(current, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -1651,7 +1647,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) who != RUSAGE_THREAD) return -EINVAL; - k_getrusage(current, who, &r); + getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif -- cgit From 359566faefa850504d146839d74496f0cf12d3b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:39:39 -0400 Subject: kernel_wait4()/kernel_waitid(): delay copying status to userland Signed-off-by: Al Viro --- kernel/exit.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d44f12948c5f..94cdccf8e7e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,7 +1002,7 @@ struct wait_opts { struct pid *wo_pid; struct siginfo __user *wo_info; - int __user *wo_stat; + int wo_stat; struct rusage *wo_rusage; wait_queue_t child_wait; @@ -1189,8 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; - if (!retval && wo->wo_stat) - retval = put_user(status, wo->wo_stat); + wo->wo_stat = status; infop = wo->wo_info; if (!retval && infop) @@ -1322,8 +1321,7 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); retval = 0; - if (!retval && wo->wo_stat) - retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (!retval && infop) @@ -1383,12 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) if (!wo->wo_info) { if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; put_task_struct(p); - if (!retval && wo->wo_stat) - retval = put_user(0xffff, wo->wo_stat); - if (!retval) - retval = pid; + wo->wo_stat = 0xffff; + retval = pid; } else { retval = wait_noreap_copyout(wo, p, pid, uid, CLD_CONTINUED, SIGCONT); @@ -1662,7 +1657,6 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; - wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); @@ -1734,10 +1728,12 @@ static long kernel_wait4(pid_t upid, int __user *stat_addr, wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; - wo.wo_stat = stat_addr; + wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); + if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) + ret = -EFAULT; return ret; } -- cgit From 67d7ddded322db99f451a7959d56ed6c70a6c4aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:53:13 -0400 Subject: waitid(2): leave copyout of siginfo to syscall itself have kernel_waitid() collect the information needed for siginfo into a small structure (waitid_info) passed to it; deal with copyout in sys_waitid()/compat_sys_waitid(). 
Signed-off-by: Al Viro --- kernel/exit.c | 168 ++++++++++++++++++++++------------------------------------ 1 file changed, 64 insertions(+), 104 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 94cdccf8e7e7..42f26480b3cc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -996,12 +996,19 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + struct wait_opts { enum pid_type wo_type; int wo_flags; struct pid *wo_pid; - struct siginfo __user *wo_info; + struct waitid_info *wo_info; int wo_stat; struct rusage *wo_rusage; @@ -1053,8 +1060,7 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { - struct siginfo __user *infop; - int retval = 0; + struct waitid_info *infop; if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); @@ -1062,22 +1068,12 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->cause = why; + infop->pid = pid; + infop->uid = uid; + infop->status = status; } - if (!retval) - retval = pid; - return retval; + return pid; } /* @@ -1088,10 +1084,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - int state, retval, status; + int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); - struct siginfo __user *infop; + struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; @@ -1186,36 +1182,22 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - + if (infop) { if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; + infop->cause = CLD_EXITED; + infop->status = status >> 8; } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; + infop->cause = (status & 0x80) ? 
CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->pid = pid; + infop->uid = uid; } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); @@ -1232,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); - return retval; + return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) @@ -1268,8 +1250,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { - struct siginfo __user *infop; - int retval, exit_code, *p_code, why; + struct waitid_info *infop; + int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; @@ -1320,28 +1302,19 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; + if (infop) { + infop->cause = why; + infop->status = exit_code; + infop->pid = pid; + infop->uid = uid; + } put_task_struct(p); - BUG_ON(!retval); - return retval; + BUG_ON(!pid); + return pid; } /* @@ -1618,7 +1591,7 @@ end: return retval; } -static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; @@ -1660,27 +1633,8 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) { + if (ret > 0) ret = 0; - } else if (infop) { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); - } put_pid(pid); return ret; @@ -1690,12 +1644,24 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; - long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } + if (!infop) + return err; + + if (put_user(err ? 
0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } @@ -1785,33 +1751,27 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { - siginfo_t info; struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); - set_fs(KERNEL_DS); - ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? &ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ + if (!err && uru) { + /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); + err = copy_to_user(uru, &ru, sizeof(ru)); else - ret = put_compat_rusage(&ru, uru); - if (ret) + err = put_compat_rusage(&ru, uru); + if (err) return -EFAULT; } - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(infop, &info); + if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } #endif -- cgit From e61a250229fbf0f003e93676bf4d8a555a8c9eec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:25:03 -0400 Subject: lift getrusage() from wait_noreap_copyout() Signed-off-by: Al Viro --- kernel/exit.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 42f26480b3cc..d4f5097da85a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1062,9 +1062,6 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, { struct waitid_info *infop; - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - put_task_struct(p); infop = wo->wo_info; if (infop) { @@ -1099,6 +1096,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; @@ -1296,12 +1295,12 @@ unlock_sig: why = ptrace ? 
CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; @@ -1350,10 +1349,10 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (!wo->wo_info) { - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); wo->wo_stat = 0xffff; retval = pid; -- cgit From bb380ec33a7d8ee048e722889627869d21a5d527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:33:21 -0400 Subject: kill wait_noreap_copyout() folds into callers Signed-off-by: Al Viro --- kernel/exit.c | 65 +++++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d4f5097da85a..f01ebaab978a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1057,22 +1057,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) return 1; } -static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, - pid_t pid, uid_t uid, int why, int status) -{ - struct waitid_info *infop; - - put_task_struct(p); - infop = wo->wo_info; - if (infop) { - infop->cause = why; - infop->pid = pid; - infop->uid = uid; - infop->status = status; - } - return pid; -} - /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold @@ -1091,22 +1075,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; + infop = wo->wo_info; + if (infop) { + if ((exit_code & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = exit_code >> 8; + } else { + infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = exit_code & 0x7f; + } + infop->pid = pid; + infop->uid = uid; } - return wait_noreap_copyout(wo, p, pid, uid, why, status); + return pid; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. 
@@ -1297,11 +1286,10 @@ unlock_sig: sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (unlikely(wo->wo_flags & WNOWAIT)) - return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - - wo->wo_stat = (exit_code << 8) | 0x7f; + if (likely(!(wo->wo_flags & WNOWAIT))) + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { @@ -1310,9 +1298,6 @@ unlock_sig: infop->pid = pid; infop->uid = uid; } - put_task_struct(p); - - BUG_ON(!pid); return pid; } @@ -1324,7 +1309,7 @@ unlock_sig: */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { - int retval; + struct waitid_info *infop; pid_t pid; uid_t uid; @@ -1351,18 +1336,18 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (!wo->wo_info) { - put_task_struct(p); + infop = wo->wo_info; + if (!infop) { wo->wo_stat = 0xffff; - retval = pid; } else { - retval = wait_noreap_copyout(wo, p, pid, uid, - CLD_CONTINUED, SIGCONT); - BUG_ON(retval == 0); + infop->cause = CLD_CONTINUED; + infop->pid = pid; + infop->uid = uid; + infop->status = SIGCONT; } - - return retval; + return pid; } /* -- cgit From 76d9871e1122aabc086e7aade5251b1e5124cbb9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:38:26 -0400 Subject: wait_task_zombie: consolidate info logics Signed-off-by: Al Viro --- kernel/exit.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f01ebaab978a..97db9ee03f90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1074,28 +1074,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; - + status = p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); - - infop = wo->wo_info; - if (infop) { - if ((exit_code & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = exit_code >> 8; - } else { - infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = exit_code & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - return pid; + goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. @@ -1174,19 +1160,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; - infop = wo->wo_info; - if (infop) { - if ((status & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = status >> 8; - } else { - infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = status & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ @@ -1202,6 +1175,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); +out_info: + infop = wo->wo_info; + if (infop) { + if ((status & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = status >> 8; + } else { + infop->cause = (status & 0x80) ? 
CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; + } + infop->pid = pid; + infop->uid = uid; + } + return pid; } -- cgit From 4c48abe91be03d191d0c20cc755877da2cb35622 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:27:32 -0400 Subject: waitid(): switch copyout of siginfo to unsafe_put_user() Signed-off-by: Al Viro --- kernel/exit.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 97db9ee03f90..f3b8c3a87bc1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1625,15 +1625,18 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; - + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } static long kernel_wait4(pid_t upid, int __user *stat_addr, @@ -1736,13 +1739,20 @@ COMPAT_SYSCALL_DEFINE5(waitid, return -EFAULT; } - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; + if (!infop) + return err; + + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } #endif -- cgit From 92ebce5ac55dba258c608248dddf59eca3f7f514 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 23:54:33 -0400 Subject: osf_wait4: switch to kernel_wait4() ... 
and sanitize copying rusage to userland Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 53 ++++++++++++--------------------------------- include/linux/sched/task.h | 2 ++ kernel/exit.c | 4 ++-- 3 files changed, 18 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ce93124a850b..b23d6fbbb225 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1183,48 +1183,23 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options, struct rusage32 __user *, ur) { - struct rusage r; - long ret, err; unsigned int status = 0; - mm_segment_t old_fs; - + struct rusage r; + long err = kernel_wait4(pid, &status, options, &r); + if (err <= 0) + return err; + if (put_user(status, ustatus)) + return -EFAULT; if (!ur) - return sys_wait4(pid, ustatus, options, NULL); - - old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, (unsigned int __user *) &status, options, - (struct rusage __user *) &r); - set_fs (old_fs); - - if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur))) + return err; + if (put_tv32(&ur->ru_utime, &r.ru_utime)) return -EFAULT; - - err = put_user(status, ustatus); - if (ret < 0) - return err ? err : ret; - - err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec); - err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec); - err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec); - err |= __put_user(r.ru_stime.tv_usec, &ur->ru_stime.tv_usec); - err |= __put_user(r.ru_maxrss, &ur->ru_maxrss); - err |= __put_user(r.ru_ixrss, &ur->ru_ixrss); - err |= __put_user(r.ru_idrss, &ur->ru_idrss); - err |= __put_user(r.ru_isrss, &ur->ru_isrss); - err |= __put_user(r.ru_minflt, &ur->ru_minflt); - err |= __put_user(r.ru_majflt, &ur->ru_majflt); - err |= __put_user(r.ru_nswap, &ur->ru_nswap); - err |= __put_user(r.ru_inblock, &ur->ru_inblock); - err |= __put_user(r.ru_oublock, &ur->ru_oublock); - err |= __put_user(r.ru_msgsnd, &ur->ru_msgsnd); - err |= __put_user(r.ru_msgrcv, &ur->ru_msgrcv); - err |= __put_user(r.ru_nsignals, &ur->ru_nsignals); - err |= __put_user(r.ru_nvcsw, &ur->ru_nvcsw); - err |= __put_user(r.ru_nivcsw, &ur->ru_nivcsw); - - return err ? 
err : ret; + if (put_tv32(&ur->ru_stime, &r.ru_stime)) + return -EFAULT; + if (copy_to_user(&ur->ru_maxrss, &r.ru_maxrss, + sizeof(struct rusage32) - offsetof(struct rusage32, ru_maxrss))) + return -EFAULT; + return err; } /* diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a978d7189cfd..6b830fd9d809 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -9,6 +9,7 @@ #include struct task_struct; +struct rusage; union thread_union; /* @@ -74,6 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern long kernel_wait4(pid_t, int *, int, struct rusage *); extern void free_task(struct task_struct *tsk); diff --git a/kernel/exit.c b/kernel/exit.c index f3b8c3a87bc1..462fc25eec6e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,8 +1639,8 @@ Efault: return -EFAULT; } -static long kernel_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage *ru) +long kernel_wait4(pid_t upid, int __user *stat_addr, int options, + struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; -- cgit From 6f9a22bc5775d231ab8fbe2c2f3c88e45e3e7c28 Mon Sep 17 00:00:00 2001 From: Michael Hernandez Date: Thu, 18 May 2017 10:47:47 -0700 Subject: PCI/MSI: Ignore affinity if pre/post vector count is more than min_vecs min_vecs is the minimum amount of vectors needed to operate in MSI-X mode which may just include the vectors that don't need affinity. Disabling affinity settings causes the qla2xxx driver scsi_add_host() to fail when blk_mq is enabled as the blk_mq_pci_map_queues() expects affinity masks on each vector. Fixes: dfef358bd1be ("PCI/MSI: Don't apply affinity if there aren't enough vectors left") Signed-off-by: Michael Hernandez Signed-off-by: Himanshu Madhani Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org # v4.10+ --- drivers/pci/msi.c | 14 ++------------ include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 13 ++++++++++++- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index ba44fdfda66b..9e1569107cd6 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1058,7 +1058,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, for (;;) { if (affd) { - nvec = irq_calc_affinity_vectors(nvec, affd); + nvec = irq_calc_affinity_vectors(minvec, nvec, affd); if (nvec < minvec) return -ENOSPC; } @@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, for (;;) { if (affd) { - nvec = irq_calc_affinity_vectors(nvec, affd); + nvec = irq_calc_affinity_vectors(minvec, nvec, affd); if (nvec < minvec) return -ENOSPC; } @@ -1165,16 +1165,6 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, if (flags & PCI_IRQ_AFFINITY) { if (!affd) affd = &msi_default_affd; - - if (affd->pre_vectors + affd->post_vectors > min_vecs) - return -EINVAL; - - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. 
- */ - if (affd->pre_vectors + affd->post_vectors == min_vecs) - affd = NULL; } else { if (WARN_ON(affd)) affd = NULL; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a6fba4804672..0991f973f8ca 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -291,7 +291,7 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ @@ -331,7 +331,7 @@ irq_create_affinity_masks(int nvec, const struct irq_affinity *affd) } static inline int -irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { return maxvec; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..9b71406d2eec 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -66,6 +66,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) struct cpumask *masks; cpumask_var_t nmsk; + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (!affv) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -140,15 +147,19 @@ out: /** * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @minvec: The minimum number of vectors available * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; int cpus; + if (resv > minvec) + return 0; + /* Stabilize the cpumasks */ get_online_cpus(); cpus = cpumask_weight(cpu_online_mask); -- cgit From 4b3e4ed6b0d958d7fb2f160bb8ebfb4f0db19382 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 20 Apr 2017 13:07:30 -0400 Subject: audit: unswing cap_* fields in PATH records The cap_* fields swing in and out of PATH records. If no capabilities are set, the cap_* fields are completely missing and when one of the cap_fi or cap_fp values is empty, that field is omitted. Original: type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fe=1 cap_fver=2 Normalize the PATH record by always printing all 4 cap_* fields. 
Fixed: type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL cap_fp=none cap_fi=none cap_fe=0 cap_fver=0 type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fi=none cap_fe=1 cap_fver=2 See: https://github.com/linux-audit/audit-kernel/issues/42 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a7c6a50477aa..b2e877100242 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1999,22 +1999,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); } static inline int audit_copy_fcaps(struct audit_names *name, -- cgit From 490194269665d6d4915a4a5774f002885c5a2d8f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:26 -0700 Subject: module: Pass struct load_info into symbol checks Since we're already using values from struct load_info, just pass this pointer in directly and use what's needed as we need it. This allows us to access future fields in struct load_info too. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..ca4509b13400 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1278,12 +1278,13 @@ static u32 resolve_rel_crc(const s32 *crc) return *(u32 *)((void *)crc + *crc); } -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) { + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; @@ -1326,8 +1327,7 @@ bad_version: return 0; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { const s32 *crc; @@ -1343,8 +1343,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, BUG(); } preempt_enable(); - return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc); + return check_version(info, VMLINUX_SYMBOL_STR(module_layout), + mod, crc); } /* First part is kernel version, which we ignore if module has crcs. 
*/ @@ -1358,8 +1358,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } #else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) @@ -1367,8 +1366,7 @@ static inline int check_version(Elf_Shdr *sechdrs, return 1; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { return 1; @@ -1404,7 +1402,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -2971,7 +2969,7 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + if (!check_modstruct_version(info, mod)) return ERR_PTR(-ENOEXEC); return mod; -- cgit From 3e2e857f9c3a19d55ee0ba7b428b8be5008960bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:27 -0700 Subject: module: Add module name to modinfo Accessing the mod structure (e.g. for mod->name) prior to having completed check_modstruct_version() can result in writing garbage to the error logs if the layout of the mod structure loaded from disk doesn't match the running kernel's mod structure layout. This kind of mismatch will become much more likely if a kernel is built with different randomization seed for the struct layout randomization plugin. Instead, add and use a new modinfo string for logging the module name. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++++++------- scripts/mod/modpost.c | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ca4509b13400..3803449ca219 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,6 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { + char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -1318,12 +1319,12 @@ static int check_version(const struct load_info *info, } /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", - mod->name, symname); + info->name, symname); return 0; } @@ -2913,9 +2914,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) info->index.vers = 0; /* Pretend no __versions section! 
*/ else info->index.vers = find_sec(info, "__versions"); + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; } @@ -2955,14 +2962,22 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo name field)"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; + /* + * If we didn't load the .modinfo 'name' field, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = mod->name; + if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", info->name); return ERR_PTR(-ENOEXEC); } @@ -2990,7 +3005,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { pr_err("%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); + info->name, modmagic, vermagic); return -ENOEXEC; } @@ -3270,7 +3285,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; - if (blacklisted(mod->name)) + if (blacklisted(info->name)) return ERR_PTR(-EPERM); err = check_modinfo(mod, info, flags); diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 30d752a4a6a6..48397feb08fb 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2126,6 +2126,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#include \n"); buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); + buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); buf_printf(b, "\n"); buf_printf(b, "__visible struct module __this_module\n"); buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n"); -- cgit From f36776fafbaa0094390dd4e7e3e29805e0b82730 Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Tue, 9 May 2017 15:22:30 +0200 Subject: kobject: support passing in variables for synthetic uevents This patch makes it possible to pass additional arguments in addition to uevent action name when writing /sys/.../uevent attribute. These additional arguments are then inserted into generated synthetic uevent as additional environment variables. Before, we were not able to pass any additional uevent environment variables for synthetic uevents. This made it hard to identify such uevents properly in userspace to make proper distinction between genuine uevents originating from kernel and synthetic uevents triggered from userspace. Also, it was not possible to pass any additional information which would make it possible to optimize and change the way the synthetic uevents are processed back in userspace based on the originating environment of the triggering action in userspace. 
With the extra variables, we are able to pass through the extra information needed, and it also becomes possible to synchronize with such synthetic uevents since they can be clearly identified back in userspace. The format for writing the uevent attribute is the following: ACTION [UUID [KEY=VALUE ...]] There's no change in how "ACTION" is recognized - it stays the same ("add", "change", "remove"). The "ACTION" is the only argument required to generate a synthetic uevent; the rest of the arguments, which this patch adds support for, are optional. The "UUID" is considered a transaction identifier, so it's possible to use the same UUID value for one or more synthetic uevents, in which case we logically group these uevents together for any userspace listeners. The "UUID" is expected to be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" format where "x" is a hex digit. The value appears in the uevent as the "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment variable. The "KEY=VALUE" pairs can contain alphanumeric characters only. It's possible to define zero or more pairs - each pair is then delimited by a space character " ". Each pair appears in synthetic uevents as a "SYNTH_ARG_KEY=VALUE" environment variable. That means the KEY name gains a "SYNTH_ARG_" prefix to avoid possible collisions with existing variables. To pass the "KEY=VALUE" pairs, it's also required to pass in the "UUID" part for the synthetic uevent first. If "UUID" is not passed in, the generated synthetic uevent gains the "SYNTH_UUID=0" environment variable automatically, so it's possible to identify this situation in userspace when reading the generated uevent and we can still distinguish between genuine and synthetic uevents. Signed-off-by: Peter Rajnoha Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-uevent | 47 ++++++++++ drivers/base/bus.c | 10 +- drivers/base/core.c | 7 +- include/linux/kobject.h | 4 +- kernel/module.c | 5 +- lib/kobject_uevent.c | 167 ++++++++++++++++++++++++++++++--- 6 files changed, 207 insertions(+), 33 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-uevent (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-uevent b/Documentation/ABI/testing/sysfs-uevent new file mode 100644 index 000000000000..d7ac99072a16 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-uevent @@ -0,0 +1,47 @@ +What: /sys/.../uevent +Date: May 2017 +KernelVersion: 4.12 +Contact: Linux kernel mailing list +Description: + Enable passing additional variables for synthetic uevents that + are generated by writing /sys/.../uevent file. + + Recognized extended format is ACTION [UUID [KEY=VALUE ...]]. + + The ACTION is compulsory - it is the name of the uevent action + ("add", "change", "remove"). There is no change compared to + previous functionality here. The rest of the extended format + is optional. + + You need to pass UUID first before any KEY=VALUE pairs. + The UUID must be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + format where 'x' is a hex digit. The UUID is considered to be + a transaction identifier so it's possible to use the same UUID + value for one or more synthetic uevents in which case we + logically group these uevents together for any userspace + listeners. The UUID value appears in uevent as + "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment + variable. + + If UUID is not passed in, the generated synthetic uevent gains + "SYNTH_UUID=0" environment variable automatically. + + The KEY=VALUE pairs can contain alphanumeric characters only. 
+ It's possible to define zero or more pairs - each pair is then + delimited by a space character ' '. Each pair appears in + synthetic uevent as "SYNTH_ARG_KEY=VALUE". That means the KEY + name gains "SYNTH_ARG_" prefix to avoid possible collisions + with existing variables. + + Example of valid sequence written to the uevent file: + + add fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed A=1 B=abc + + This generates synthetic uevent including these variables: + + ACTION=add + SYNTH_ARG_A=1 + SYNTH_ARG_B=abc + SYNTH_UUID=fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed +Users: + udev, userspace tools generating synthetic uevents diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 6470eb8088f4..f945f2f0ee06 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -648,10 +648,7 @@ static void remove_probe_files(struct bus_type *bus) static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&drv->p->kobj, action); + kobject_synth_uevent(&drv->p->kobj, buf, count); return count; } static DRIVER_ATTR_WO(uevent); @@ -868,10 +865,7 @@ static void klist_devices_put(struct klist_node *n) static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&bus->p->subsys.kobj, action); + kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); return count; } static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); diff --git a/drivers/base/core.c b/drivers/base/core.c index bbecaf9293be..6564339d7f59 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -981,12 +981,9 @@ out: static ssize_t uevent_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - enum kobject_action action; + if (kobject_synth_uevent(&dev->kobj, buf, count)) + dev_err(dev, "uevent: failed to send synthetic uevent\n"); - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&dev->kobj, action); - else - dev_err(dev, "uevent: unknown action-string\n"); return count; } static DEVICE_ATTR_RW(uevent); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..eeab34b0f589 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -217,11 +217,9 @@ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); +int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); -int kobject_action_type(const char *buf, size_t count, - enum kobject_action *type); - #endif /* _KOBJECT_H_ */ diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..d7eb41d772c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1202,10 +1202,7 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buffer, count, &action) == 0) - kobject_uevent(&mk->kobj, action); + kobject_synth_uevent(&mk->kobj, buffer, count); return count; } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9a2b811966eb..719c155fce20 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include 
@@ -52,19 +54,13 @@ static const char *kobject_actions[] = { [KOBJ_OFFLINE] = "offline", }; -/** - * kobject_action_type - translate action string to numeric type - * - * @buf: buffer containing the action string, newline is ignored - * @count: length of buffer - * @type: pointer to the location to store the action type - * - * Returns 0 if the action string was recognized. - */ -int kobject_action_type(const char *buf, size_t count, - enum kobject_action *type) +static int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type, + const char **args) { enum kobject_action action; + size_t count_first; + const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) @@ -73,11 +69,20 @@ int kobject_action_type(const char *buf, size_t count, if (!count) goto out; + args_start = strnchr(buf, count, ' '); + if (args_start) { + count_first = args_start - buf; + args_start = args_start + 1; + } else + count_first = count; + for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { - if (strncmp(kobject_actions[action], buf, count) != 0) + if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; - if (kobject_actions[action][count] != '\0') + if (kobject_actions[action][count_first] != '\0') continue; + if (args) + *args = args_start; *type = action; ret = 0; break; @@ -86,6 +91,142 @@ out: return ret; } +static const char *action_arg_word_end(const char *buf, const char *buf_end, + char delim) +{ + const char *next = buf; + + while (next <= buf_end && *next != delim) + if (!isalnum(*next++)) + return NULL; + + if (next == buf) + return NULL; + + return next; +} + +static int kobject_action_args(const char *buf, size_t count, + struct kobj_uevent_env **ret_env) +{ + struct kobj_uevent_env *env = NULL; + const char *next, *buf_end, *key; + int key_len; + int r = -EINVAL; + + if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) + count--; + + if (!count) + return -EINVAL; + + env = kzalloc(sizeof(*env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* first arg is UUID */ + if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || + add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) + goto out; + + /* + * the rest are custom environment variables in KEY=VALUE + * format with ' ' delimiter between each KEY=VALUE pair + */ + next = buf + UUID_STRING_LEN; + buf_end = buf + count - 1; + + while (next <= buf_end) { + if (*next != ' ') + goto out; + + /* skip the ' ', key must follow */ + key = ++next; + if (key > buf_end) + goto out; + + buf = next; + next = action_arg_word_end(buf, buf_end, '='); + if (!next || next > buf_end || *next != '=') + goto out; + key_len = next - buf; + + /* skip the '=', value must follow */ + if (++next > buf_end) + goto out; + + buf = next; + next = action_arg_word_end(buf, buf_end, ' '); + if (!next) + goto out; + + if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", + key_len, key, (int) (next - buf), buf)) + goto out; + } + + r = 0; +out: + if (r) + kfree(env); + else + *ret_env = env; + return r; +} + +/** + * kobject_synth_uevent - send synthetic uevent with arguments + * + * @kobj: struct kobject for which synthetic uevent is to be generated + * @buf: buffer containing action type and action args, newline is ignored + * @count: length of buffer + * + * Returns 0 if kobject_synthetic_uevent() is completed with success or the + * corresponding error when it fails. 
+ */ +int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) +{ + char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; + enum kobject_action action; + const char *action_args; + struct kobj_uevent_env *env; + const char *msg = NULL, *devpath; + int r; + + r = kobject_action_type(buf, count, &action, &action_args); + if (r) { + msg = "unknown uevent action string\n"; + goto out; + } + + if (!action_args) { + r = kobject_uevent_env(kobj, action, no_uuid_envp); + goto out; + } + + r = kobject_action_args(action_args, + count - (action_args - buf), &env); + if (r == -EINVAL) { + msg = "incorrect uevent action arguments\n"; + goto out; + } + + if (r) + goto out; + + r = kobject_uevent_env(kobj, action, env->envp); + kfree(env); +out: + if (r) { + devpath = kobject_get_path(kobj, GFP_KERNEL); + printk(KERN_WARNING "synth uevent: %s: %s", + devpath ?: "unknown device", + msg ?: "failed to send uevent"); + kfree(devpath); + } + return r; +} + #ifdef CONFIG_NET static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) { -- cgit From 9ab6055f959032258c0f83a070cd0d26ed7a8fc5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 24 May 2017 17:55:10 -0600 Subject: kernel/locking: Fix compile error with qrwlock.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saw these compile errors on SPARC when queued rwlock feature is enabled. CC kernel/locking/qrwlock.o kernel/locking/qrwlock.c: In function ‘queued_read_lock_slowpath’: kernel/locking/qrwlock.c:89: error: implicit declaration of function ‘arch_spin_lock’ kernel/locking/qrwlock.c:102: error: implicit declaration of function ‘arch_spin_unlock’ make[4]: *** [kernel/locking/qrwlock.o] Error 1 Include spinlock.h in qrwlock.c to fix it. Signed-off-by: Babu Moger Reviewed-by: HÃ¥kon Bugge Reviewed-by: Jane Chu Reviewed-by: Shannon Nelson Reviewed-by: Vijay Kumar Signed-off-by: David S. Miller --- kernel/locking/qrwlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* -- cgit From 613763a1f056211522bac77ff39f25706e678fdd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 May 2017 22:04:29 -0400 Subject: take compat_sys_old_getrlimit() to native syscall ... 
and sanitize the ifdefs in there Signed-off-by: Al Viro --- arch/powerpc/include/asm/compat.h | 1 - arch/s390/include/asm/compat.h | 1 - arch/x86/include/asm/compat.h | 1 - include/linux/syscalls.h | 2 +- kernel/compat.c | 29 ----------------------------- kernel/sys.c | 24 ++++++++++++++++++++++++ 6 files changed, 25 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 4f2df589ec1d..f256e1d14a14 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -109,7 +109,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 0ddd37e6c29d..b9300f8aee10 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -178,7 +178,6 @@ struct compat_statfs64 { u32 f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 24118c0b4640..5343c19814b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -116,7 +116,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9b06f8..3cb15ea48aee 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -650,7 +650,7 @@ asmlinkage long sys_olduname(struct oldold_utsname __user *); asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim); -#if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64)) +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim); #endif asmlinkage long sys_setrlimit(unsigned int resource, diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..860f674fa556 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -468,35 +468,6 @@ COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, return do_prlimit(current, resource, &r, NULL); } -#ifdef COMPAT_RLIM_OLD_INFINITY - -COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); - set_fs(old_fs); - - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -#endif - COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..3778a8a417b6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1328,6 +1328,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, return copy_to_user(rlim, &x, sizeof(x)) ? 
-EFAULT : 0; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + r = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (r.rlim_cur > 0x7FFFFFFF) + r.rlim_cur = 0x7FFFFFFF; + if (r.rlim_max > 0x7FFFFFFF) + r.rlim_max = 0x7FFFFFFF; + + if (put_user(r.rlim_cur, &rlim->rlim_cur) || + put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + return 0; +} +#endif + #endif static inline bool rlim64_is_infinity(__u64 rlim64) -- cgit From bcfe8ad8ef55a1cf3c935c2667e8e5ae598b3b7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 27 May 2017 00:29:34 -0400 Subject: do_sigaltstack(): lift copying to/from userland into callers Signed-off-by: Al Viro --- kernel/signal.c | 107 ++++++++++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..d1eed0d7ca64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3113,78 +3113,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) } static int -do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) +do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) { - stack_t oss; - int error; + struct task_struct *t = current; - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp) | - (current->sas_ss_flags & SS_FLAG_BITS); + if (oss) { + memset(oss, 0, sizeof(stack_t)); + oss->ss_sp = (void __user *) t->sas_ss_sp; + oss->ss_size = t->sas_ss_size; + oss->ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); + } - if (uss) { - void __user *ss_sp; - size_t ss_size; - unsigned ss_flags; + if (ss) { + void __user *ss_sp = ss->ss_sp; + size_t ss_size = ss->ss_size; + unsigned ss_flags = ss->ss_flags; int ss_mode; - error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) - goto out; - error = __get_user(ss_sp, &uss->ss_sp) | - __get_user(ss_flags, &uss->ss_flags) | - __get_user(ss_size, &uss->ss_size); - if (error) - goto out; - - error = -EPERM; - if (on_sig_stack(sp)) - goto out; + if (unlikely(on_sig_stack(sp))) + return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; - error = -EINVAL; - if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && - ss_mode != 0) - goto out; + if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0)) + return -EINVAL; if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { - error = -ENOMEM; - if (ss_size < MINSIGSTKSZ) - goto out; + if (unlikely(ss_size < MINSIGSTKSZ)) + return -ENOMEM; } - current->sas_ss_sp = (unsigned long) ss_sp; - current->sas_ss_size = ss_size; - current->sas_ss_flags = ss_flags; - } - - error = 0; - if (uoss) { - error = -EFAULT; - if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) - goto out; - error = __put_user(oss.ss_sp, &uoss->ss_sp) | - __put_user(oss.ss_size, &uoss->ss_size) | - __put_user(oss.ss_flags, &uoss->ss_flags); + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; } - -out: - return error; + return 0; } + SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { - return do_sigaltstack(uss, uoss, current_user_stack_pointer()); + stack_t new, old; + int err; + if (uss && copy_from_user(&new, uss, 
sizeof(stack_t))) + return -EFAULT; + err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, + current_user_stack_pointer()); + if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) + err = -EFAULT; + return err; } int restore_altstack(const stack_t __user *uss) { - int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + stack_t new; + if (copy_from_user(&new, uss, sizeof(stack_t))) + return -EFAULT; + (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); /* squash all but EFAULT for now */ - return err == -EFAULT ? err : 0; + return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) @@ -3207,29 +3197,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, { stack_t uss, uoss; int ret; - mm_segment_t seg; if (uss_ptr) { compat_stack_t uss32; - - memset(&uss, 0, sizeof(stack_t)); if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), - (stack_t __force __user *) &uoss, + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer()); - set_fs(seg); if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || - __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || - __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || - __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + compat_stack_t old; + memset(&old, 0, sizeof(old)); + old.ss_sp = ptr_to_compat(uoss.ss_sp); + old.ss_flags = uoss.ss_flags; + old.ss_size = uoss.ss_size; + if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; -- cgit From 7786f6b6dfc12d17eea2df04116de6ebac50c884 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 7 Apr 2017 10:17:27 -0400 Subject: audit: add ambient capabilities to CAPSET and BPRM_FCAPS records Capabilities were augmented to include ambient capabilities in v4.3 commit 58319057b784 ("capabilities: ambient capabilities"). Add ambient capabilities to the audit BPRM_FCAPS and CAPSET records. The record contains fields "old_pp", "old_pi", "old_pe", "new_pp", "new_pi", "new_pe" so in keeping with the previous record normalizations, change the "new_*" variants to simply drop the "new_" prefix. 
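For context, the ambient set reported in the new "pa"/"cap_pa" fields is populated from userspace via prctl(). A minimal sketch, assuming the capability is already present in the process's permitted and inheritable sets (illustrative only, not part of this patch):

#include <sys/prctl.h>
#include <linux/capability.h>
#include <stdio.h>

int main(void)
{
	/* Raise CAP_NET_BIND_SERVICE into the ambient set so that it
	 * survives execve() of an unprivileged binary and shows up in
	 * the "pa"/"cap_pa" audit fields. */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0))
		perror("PR_CAP_AMBIENT_RAISE");
	return 0;
}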
A sample of the replaced BPRM_FCAPS record: RAW: type=BPRM_FCAPS msg=audit(1491468034.252:237): fver=2 fp=0000000000200000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 old_pa=0000000000000000 pp=0000000000200000 pi=0000000000000000 pe=0000000000200000 pa=0000000000000000 INTERPRET: type=BPRM_FCAPS msg=audit(04/06/2017 04:40:34.252:237): fver=2 fp=sys_admin fi=none fe=chown old_pp=none old_pi=none old_pe=none old_pa=none pp=sys_admin pi=none pe=sys_admin pa=none A sample of the replaced CAPSET record: RAW: type=CAPSET msg=audit(1491469502.371:242): pid=833 cap_pi=0000003fffffffff cap_pp=0000003fffffffff cap_pe=0000003fffffffff cap_pa=0000000000000000 INTERPRET: type=CAPSET msg=audit(04/06/2017 05:05:02.371:242) : pid=833 cap_pi=chown,dac_override,dac_read_search,fowner,fsetid,kill, setgid,setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,sys_time, sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pp=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pe=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pa=none See: https://github.com/linux-audit/audit-kernel/issues/40 Signed-off-by: Richard Guy Briggs Acked-by: Serge Hallyn Signed-off-by: Paul Moore --- kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index ddfce2ea4891..bb3a4e14b7e5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -68,6 +68,7 @@ struct audit_cap_data { unsigned int fE; /* effective bit of file cap */ kernel_cap_t effective; /* effective set of process */ }; + kernel_cap_t ambient; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b2dcbe637b7c..5fa68d10032f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1260,6 +1260,7 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, @@ -1381,9 +1382,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); - audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); - 
audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); - audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); + audit_log_cap(ab, "pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "pe", &axs->new_pcap.effective); + audit_log_cap(ab, "pa", &axs->new_pcap.ambient); break; } } @@ -2341,10 +2344,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->old_pcap.permitted = old->cap_permitted; ax->old_pcap.inheritable = old->cap_inheritable; ax->old_pcap.effective = old->cap_effective; + ax->old_pcap.ambient = old->cap_ambient; ax->new_pcap.permitted = new->cap_permitted; ax->new_pcap.inheritable = new->cap_inheritable; ax->new_pcap.effective = new->cap_effective; + ax->new_pcap.ambient = new->cap_ambient; return 0; } @@ -2363,6 +2368,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; + context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; } -- cgit From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:27 -0700 Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode free up BPF_JMP | BPF_CALL | BPF_X opcode to be used by actual indirect call by register and use kernel internal opcode to mark call instruction into bpf_tail_call() helper. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 3 +++ kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- 8 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 71f930501ade..b1d38eeb24f6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -586,7 +586,7 @@ emit_cond_jmp: break; } /* tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: if (emit_bpf_tail_call(ctx)) return -EFAULT; break; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index aee2bb817ac6..a01366584a4b 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -938,7 +938,7 @@ common_load: /* * Tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); break; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6e97a2e3fd8d..42ad3832586c 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -991,7 +991,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i } break; } - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: /* * Implicit input: * B1: pointer to ctx diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 21de77419f48..4a52d34facf9 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1217,7 +1217,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) } /* tail call */ - case BPF_JMP | BPF_CALL |BPF_X: + case BPF_JMP | BPF_TAIL_CALL: 
emit_tail_call(ctx); break; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f58939393eef..fec12eaa0dec 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -877,7 +877,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index 62d948f80730..a20ba40fcb73 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,6 +57,9 @@ struct bpf_prog_aux; #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +/* unused opcode to mark special call to bpf_tail_call() helper */ +#define BPF_TAIL_CALL 0xf0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dedf367f59bb..339289402b96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, - [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, + [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 339c8a1371de..28113d0e8e92 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3469,7 +3469,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * that doesn't support bpf_tail_call yet */ insn->imm = 0; - insn->code |= BPF_X; + insn->code = BPF_JMP | BPF_TAIL_CALL; continue; } -- cgit From f696b8f471ec987e987e38206b8eb23c39ee5a86 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:28 -0700 Subject: bpf: split bpf core interpreter split __bpf_prog_run() interpreter into stack allocation and execution parts. The code section shrinks which helps interpreter performance in some cases. text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before 25777 10328 624 36729 8f79 kernel/bpf/core.o.after Very short programs got slower (due to extra function call): Before: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 7 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 8 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 7 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 11 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 7 PASS After: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 11 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 11 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 11 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 14 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 10 PASS Longer programs got faster: Before: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 20286 20513 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 31853 31768 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9815 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 6 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13959 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... 
jited:0 210 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 21724 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19118 PASS After: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 19008 18827 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 29238 28450 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9485 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 12 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13257 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 213 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 19389 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19583 PASS For real world production programs the difference is noise. This patch is first step towards reducing interpreter stack consumption. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 339289402b96..abd410d394bc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + u64 *stack) { - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; + u64 tmp; static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ @@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - select_insn: goto *jumptable[insn->code]; @@ -1219,7 +1216,17 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } -STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ +STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ + +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG]; + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + return ___bpf_prog_run(regs, insn, stack); +} bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) -- cgit From 8726679a0fa317f8e83d0843b266453f31bff092 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:29 -0700 Subject: bpf: teach verifier to track stack depth teach verifier to track bpf program stack depth Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6bb38d76faf4..fcc80ca11045 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -171,6 +171,7 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + u32 stack_depth; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28113d0e8e92..d96f27ff9f6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } + + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (t == BPF_WRITE) { if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && @@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -3167,7 +3174,8 @@ process_bpf_exit: insn_idx++; } - verbose("processed %d insns\n", insn_processed); + verbose("processed %d insns, stack depth %d\n", + insn_processed, env->prog->aux->stack_depth); return 0; } -- cgit From 80a58d02559465b0ea403ff91c8bca9a733b1b0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:30 -0700 Subject: bpf: reconcile bpf_tail_call and stack_depth The next set of patches will take advantage of stack_depth tracking, so make sure that the program that does bpf_tail_call() has stack depth large enough for the callee. We could have tracked the stack depth of the prog_array owner program and only allow insertion of the programs with stack depth less than the owner, but it will break existing applications. Some of them have trivial root bpf program that only does multiple bpf_tail_calls and at init time the prog array is empty. In the future we may add a flag to do such tracking optionally, but for now play simple and safe. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d96f27ff9f6f..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3470,6 +3470,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; + env->prog->aux->stack_depth = MAX_BPF_STACK; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal -- cgit From b870aa901f4be1d32c13faf9e8f40bf2a8562e19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:33 -0700 Subject: bpf: use different interpreter depending on required stack size 16 __bpf_prog_run() interpreters for various stack sizes add .text but not a lot comparing to run-time stack savings text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before_split 25777 10328 624 36729 8f79 kernel/bpf/core.o.after_split 26970 10328 624 37922 9422 kernel/bpf/core.o.now Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/core.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index abd410d394bc..774069ca18a7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1218,16 +1218,38 @@ load_byte: } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG]; - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - return ___bpf_prog_run(regs, insn, stack); +#define PROG_NAME(stack_size) __bpf_prog_run##stack_size +#define DEFINE_BPF_PROG_RUN(stack_size) \ +static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + ARG1 = (u64) (unsigned long) ctx; \ + return ___bpf_prog_run(regs, insn, stack); \ } +#define EVAL1(FN, X) FN(X) +#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) +#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) +#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) +#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) +#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) + +EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); + +#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), + +static unsigned int (*interpreters[])(const void *ctx, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1275,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = (void *) __bpf_prog_run; + fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit From fb9a307d11d62749d75b404f15517d73f5d6e148 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:15:59 -0700 Subject: bpf: Allow CGROUP_SKB eBPF program to access sk_buff This allows a cgroup eBPF program to classify packets based on their protocol or other details. Currently, attaching a cgroup eBPF program requires the CAP_NET_ADMIN privilege, and a process with CAP_NET_ADMIN can already see all packets on the system, for example by creating an iptables rule that causes packets to be passed to userspace via NFLOG. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..8acae64df255 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,6 +2426,7 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit From 80b7d81912d807f161d55e9c2c9cc81061666f83 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:16:00 -0700 Subject: bpf: Remove the capability check for cgroup skb eBPF program Currently, loading a cgroup skb eBPF program requires the CAP_SYS_ADMIN capability, while attaching the program to a cgroup only requires the CAP_NET_ADMIN privilege. We can skip the capability check when loading the program, just like for socket filter programs, to make the capability requirement consistent. Changes since v1: adjusted the code style to comply with checkpatch.pl preferences. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 265a0d854e33..59da103adb85 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -815,7 +815,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + if (type != BPF_PROG_TYPE_SOCKET_FILTER && + type != BPF_PROG_TYPE_CGROUP_SKB && + !capable(CAP_SYS_ADMIN)) return -EPERM; /* plain bpf_prog allocation */ -- cgit From f91840a32deef5cb1bf73338bc5010f843b01426 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Jun 2017 21:03:52 -0700 Subject: perf, bpf: Add BPF support to all perf_event types Allow BPF_PROG_TYPE_PERF_EVENT program types to attach to all perf_event types, including HW_CACHE, RAW, and dynamic pmu events. Only tracepoint/kprobe events are treated differently; they require the BPF_PROG_TYPE_TRACEPOINT/BPF_PROG_TYPE_KPROBE program types, respectively. Also add support for reading all event counters using the bpf_perf_event_read() helper. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 28 +++++++-------------------- kernel/events/core.c | 47 +++++++++++++++++++++++++++------------------- kernel/trace/bpf_trace.c | 22 ++++++++-------------- 4 files changed, 48 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a635887f28..8fc5f0fada5e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -896,7 +896,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -extern u64 perf_event_read_local(struct perf_event *event); +int perf_event_read_local(struct perf_event *event, u64 *value); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1301,7 +1301,10 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline u64 perf_event_read_local(struct perf_event *event) { return -EINVAL; } +static inline int perf_event_read_local(struct perf_event *event, u64 *value) +{ + return -EINVAL; +} static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 172dc8ee0e3b..ecb43542246e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -452,38 +452,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - const struct perf_event_attr *attr; struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; + u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; + ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - ee = ERR_PTR(-EINVAL); - - attr = perf_event_attrs(event); - if (IS_ERR(attr) || attr->inherit) + if (perf_event_read_local(event, &value) == -EOPNOTSUPP) goto err_out; - switch (attr->type) { - case PERF_TYPE_SOFTWARE: - if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) - goto err_out; - /* fall-through */ - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - ee = bpf_event_entry_gen(perf_file, map_file); - if (ee) - return ee; - ee = ERR_PTR(-ENOMEM); - /* fall-through */ - default: - break; - } - + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..51e40e4876c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3636,10 +3636,10 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -u64 perf_event_read_local(struct perf_event *event) +int perf_event_read_local(struct perf_event *event, u64 *value) { unsigned long flags; - u64 val; + int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context @@ -3647,25 +3647,37 @@ u64 perf_event_read_local(struct perf_event *event) */ local_irq_save(flags); - /* If this is a per-task event, it must be for current */ - WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && - event->hw.target != current); - - /* If this is a per-CPU event, it must be for this CPU */ - WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) 
&& - event->cpu != smp_processor_id()); - /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ - WARN_ON_ONCE(event->attr.inherit); + if (event->attr.inherit) { + ret = -EOPNOTSUPP; + goto out; + } /* * It must not have a pmu::count method, those are not * NMI safe. */ - WARN_ON_ONCE(event->pmu->count); + if (event->pmu->count) { + ret = -EOPNOTSUPP; + goto out; + } + + /* If this is a per-task event, it must be for current */ + if ((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current) { + ret = -EINVAL; + goto out; + } + + /* If this is a per-CPU event, it must be for this CPU */ + if (!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()) { + ret = -EINVAL; + goto out; + } /* * If the event is currently on this CPU, its either a per-task event, @@ -3675,10 +3687,11 @@ u64 perf_event_read_local(struct perf_event *event) if (event->oncpu == smp_processor_id()) event->pmu->read(event); - val = local64_read(&event->count); + *value = local64_read(&event->count); +out: local_irq_restore(flags); - return val; + return ret; } static int perf_event_read(struct perf_event *event, bool group) @@ -8037,12 +8050,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) bool is_kprobe, is_tracepoint; struct bpf_prog *prog; - if (event->attr.type == PERF_TYPE_HARDWARE || - event->attr.type == PERF_TYPE_SOFTWARE) - return perf_event_set_bpf_handler(event, prog_fd); - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; + return perf_event_set_bpf_handler(event, prog_fd); if (event->tp_event->prog) return -EEXIST; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 460a031c77e5..08eb072430b9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -234,7 +234,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - struct perf_event *event; + u64 value = 0; + int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -247,21 +248,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - event = ee->event; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - - /* make sure event is local and doesn't have pmu::count */ - if (unlikely(event->oncpu != cpu || event->pmu->count)) - return -EINVAL; - + err = perf_event_read_local(ee->event, &value); /* - * we don't know if the function is run successfully by the - * return value. It can be judged in other places, such as - * eBPF programs. + * this api is ugly since we miss [-22..-2] range of valid + * counter values, but that's uapi */ - return perf_event_read_local(event); + if (err) + return err; + return value; } static const struct bpf_func_proto bpf_perf_event_read_proto = { -- cgit From dc4bb0e2356149aee4cdae061936f3bbdd45595c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:46 -0700 Subject: bpf: Introduce bpf_prog ID This patch generates a unique ID for each BPF_PROG_LOAD-ed prog. It is worth noting that each BPF_PROG_LOAD-ed prog will have a different ID even if they have the same bpf instructions. The ID is generated by the existing idr_alloc_cyclic(). The ID range is [1, INT_MAX). It is allocated in a cyclic manner, so an ID will only get reused after roughly 2 billion BPF_PROG_LOAD calls.
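Internally, the allocation follows the standard cyclic IDR pattern; a condensed sketch of the idea (example_idr and example_alloc_id() are illustrative names only — the real helper, bpf_prog_alloc_id(), appears in the diff below):

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(example_idr);
static DEFINE_SPINLOCK(example_lock);

static int example_alloc_id(void *obj)
{
	int id;

	spin_lock_bh(&example_lock);
	/* hand out 1, 2, 3, ... from [1, INT_MAX); 0 is never used,
	 * so it can mean "no ID assigned" */
	id = idr_alloc_cyclic(&example_idr, obj, 1, INT_MAX, GFP_ATOMIC);
	spin_unlock_bh(&example_lock);

	/* returns the new ID (>= 1) or a negative errno */
	return id;
}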
bpf_prog_alloc_id() is done after bpf_prog_select_runtime() because the jit process may have allocated a new prog. Hence, we need to ensure that the value of the pointer 'prog' will no longer change before storing the prog in the prog_idr. After bpf_prog_select_runtime(), the prog is read-only. Hence, the id is stored in 'struct bpf_prog_aux'. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fcc80ca11045..c5946d19f2ca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -172,6 +172,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 stack_depth; + u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 59da103adb85..2a1b32b470f1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -22,8 +22,11 @@ #include #include #include +#include DEFINE_PER_CPU(int, bpf_prog_active); +static DEFINE_IDR(prog_idr); +static DEFINE_SPINLOCK(prog_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -650,6 +653,34 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } +static int bpf_prog_alloc_id(struct bpf_prog *prog) +{ + int id; + + spin_lock_bh(&prog_idr_lock); + id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + prog->aux->id = id; + spin_unlock_bh(&prog_idr_lock); + + /* id is in [1, INT_MAX) */ + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_prog_free_id(struct bpf_prog *prog) +{ + /* cBPF to eBPF migrations are currently not in the idr store. */ + if (!prog->aux->id) + return; + + spin_lock_bh(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); + spin_unlock_bh(&prog_idr_lock); +} + static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -663,6 +694,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -857,15 +889,21 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; + err = bpf_prog_alloc_id(prog); + if (err) + goto free_used_maps; + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ - goto free_used_maps; + goto free_id; bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; +free_id: + bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: -- cgit From f3f1c054c288bb6e503005e6d73611151ed20e91 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:47 -0700 Subject: bpf: Introduce bpf_map ID This patch generates a unique ID for each created bpf_map. The approach is similar to the earlier patch for the bpf_prog ID. It is worth noting that the bpf_map's ID and the bpf_prog's ID are in two independent ID spaces and both have the same valid range: [1, INT_MAX). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c5946d19f2ca..c32bace66d3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; u32 pages; + u32 id; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a1b32b470f1..4c3075b5d840 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,8 @@ DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); +static DEFINE_IDR(map_idr); +static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -117,6 +119,29 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) free_uid(user); } +static int bpf_map_alloc_id(struct bpf_map *map) +{ + int id; + + spin_lock_bh(&map_idr_lock); + id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + map->id = id; + spin_unlock_bh(&map_idr_lock); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_map_free_id(struct bpf_map *map) +{ + spin_lock_bh(&map_idr_lock); + idr_remove(&map_idr, map->id); + spin_unlock_bh(&map_idr_lock); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -141,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -239,14 +265,20 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; + err = bpf_map_alloc_id(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ - goto free_map; + goto free_id; trace_bpf_map_create(map, err); return err; +free_id: + bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: -- cgit From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:48 -0700 Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID to allow userspace to iterate over all bpf_prog IDs and bpf_map IDs. The API tries to be consistent with the existing BPF_MAP_GET_NEXT_KEY. It is currently limited to CAP_SYS_ADMIN, which we can consider lifting in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e78aece03628..629747a3f273 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,8 @@ enum bpf_cmd { BPF_PROG_ATTACH, BPF_PROG_DETACH, BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, }; enum bpf_map_type { @@ -209,6 +211,11 @@ union bpf_attr { __u32 repeat; __u32 duration; } test; + + struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ + __u32 start_id; + __u32 next_id; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c3075b5d840..2405feedb8c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); @@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr, return ret; } +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; default: err = -EINVAL; break; -- cgit From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:49 -0700 Subject: bpf: Add BPF_PROG_GET_FD_BY_ID Add the BPF_PROG_GET_FD_BY_ID command to allow the user to get a fd from a bpf_prog's ID. bpf_prog_inc_not_zero() is added and is called with prog_idr_lock held. __bpf_prog_put() is also added; it has a 'bool do_idr_lock' param to decide if the prog_idr_lock should be acquired when freeing the prog->id. In the error path of bpf_prog_inc_not_zero(), it may have to call __bpf_prog_put(prog, false), which does not need to take the prog_idr_lock when freeing the prog->id. It is currently limited to CAP_SYS_ADMIN, which we can consider lifting in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 8 +++-- kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 629747a3f273..d70cfed19d5e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_cmd { BPF_PROG_TEST_RUN, BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, }; enum bpf_map_type { @@ -212,8 +213,11 @@ union bpf_attr { __u32 duration; } test; - struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ - __u32 start_id; + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + }; __u32 next_id; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2405feedb8c1..dc6253bb8ebb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; - spin_lock_bh(&prog_idr_lock); + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); - spin_unlock_bh(&prog_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog); + bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); +} EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) @@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. 
+ */ + bpf_prog_put(prog); + return err; + } bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; -free_id: - bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: @@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:50 -0700 Subject: bpf: Add BPF_MAP_GET_FD_BY_ID Add BPF_MAP_GET_FD_BY_ID command to allow a user to get an fd from a bpf_map's ID. bpf_map_inc_not_zero() is added and is called with map_idr_lock held. __bpf_map_put() is also added which has the 'bool do_idr_lock' param to decide if the map_idr_lock should be acquired when freeing the map->id. In the error path of bpf_map_inc_not_zero(), it may have to call __bpf_map_put(map, false) which does not need to take the map_idr_lock when freeing the map->id. It is currently limited to CAP_SYS_ADMIN, which we can consider lifting in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 95 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d70cfed19d5e..dd23f47ff00c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_cmd { BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, }; enum bpf_map_type { @@ -217,6 +218,7 @@ union bpf_attr { union { __u32 start_id; __u32 prog_id; + __u32 map_id; }; __u32 next_id; }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc6253bb8ebb..1802bb9c47d9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ?
0 : id; } -static void bpf_map_free_id(struct bpf_map *map) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { - spin_lock_bh(&map_idr_lock); + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + idr_remove(&map_idr, map->id); - spin_unlock_bh(&map_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); } /* called from workqueue */ @@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ - bpf_map_free_id(map); + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr) goto free_map; err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } trace_bpf_map_create(map, err); return err; -free_id: - bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: @@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single 
BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out whether the fd is associated with a bpf_prog or a bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be complete, and completing them is not the goal of this patch. New fields can be added in future patches. The focus of this patch is to create the interface, the BPF_OBJ_GET_INFO_BY_FD cmd, for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr.
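Before the diff, a sketch of the calling convention this creates, assuming the uapi structures added below; the prog_info() helper name is illustrative, not from the patch. Userspace zeroes a struct bpf_prog_info, points attr.info.info at it, and the kernel writes the struct back together with the length it actually filled in:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* illustrative helper, not from the patch: query an fd known to be a prog */
static int prog_info(int prog_fd)
{
        struct bpf_prog_info info;
        union bpf_attr attr;

        memset(&info, 0, sizeof(info));
        memset(&attr, 0, sizeof(attr));
        attr.info.bpf_fd = prog_fd;
        attr.info.info_len = sizeof(info);
        attr.info.info = (__u64)(unsigned long)&info;

        if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
                return -1;

        /* attr.info.info_len now holds how much the kernel wrote back */
        printf("prog type %u id %u (%u info bytes)\n",
               info.type, info.id, attr.info.info_len);
        return 0;
}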
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 - include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/syscall.c | 163 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 174 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e2dddf21f3b..1fa26dc562ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -69,8 +69,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -#define BPF_TAG_SIZE 8 - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd23f47ff00c..9b2c10b45733 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_cmd { BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, }; enum bpf_map_type { @@ -222,6 +223,12 @@ union bpf_attr { }; __u32 next_id; }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -686,4 +693,25 @@ struct xdp_md { __u32 data_end; }; +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1802bb9c47d9..8942c820d620 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet.
*/ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit From 92046578ac88e0a93f8ef03240e6c832b0189aa7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Jun 2017 18:38:04 +0200 Subject: bpf: cgroup skb progs cannot access ld_abs/ind Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB type to use ld_abs/ind instructions. However, at this point, we cannot use them, since offsets relative to SKF_LL_OFF will end up pointing skb_mac_header(skb) out of bounds since in the egress path it is not yet set at that point in time, but only after __dev_queue_xmit() did a general reset on the mac header. bpf_internal_load_pointer_neg_helper() will then end up reading data from a wrong offset. BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes() already to access packet data, which is also more flexible than the insns carried over from cBPF. Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Chenbo Feng Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acae64df255..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,7 +2426,6 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit From c3ca46ef719580eb01994fc6032db470fde92d85 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 23 May 2017 15:05:50 +0900 Subject: ftrace/kprobes: selftests: Check kretprobe maxactive is supported Check the kretprobe maxactive is supported by kprobe_events interface. To ensure the kernel feature, this changes ftrace README to describe it. 
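As a sketch of the syntax being tested, not part of the patch, the following assumes tracefs under /sys/kernel/debug/tracing and uses do_sys_open as an arbitrary probe target; the "r16:" prefix requests a kretprobe with maxactive set to 16:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* assumed mount point; tracefs may also live under /sys/kernel/tracing */
        const char *path = "/sys/kernel/debug/tracing/kprobe_events";
        /* "r16:" asks for a kretprobe with maxactive = 16 */
        const char *def = "r16:myretprobe do_sys_open\n";
        int fd = open(path, O_WRONLY | O_APPEND);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, def, strlen(def)) < 0)
                perror("write"); /* kernels without maxactive reject this */
        close(fd);
        return 0;
}

The selftest hunk below performs the matching feature check by grepping the README for the new r[maxactive] line.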
Signed-off-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- kernel/trace/trace.c | 3 ++- tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..dc3f91e70345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4473,7 +4473,8 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p|r[:[/]] []\n" + "\t Format: p[:[/]] []\n" + "\t r[maxactive][:[/]] []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc index 57abdf1caabf..7ec6f2639ad6 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc @@ -2,6 +2,7 @@ # description: Kretprobe dynamic event with maxactive [ -f kprobe_events ] || exit_unsupported # this is configurable +grep -q 'r\[maxactive\]' README || exit_unsupported # this is older version echo > kprobe_events -- cgit From 67e707bd68269aac70904943f07a979eeb163b13 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 8 Jun 2017 18:09:09 +0530 Subject: config: android-recommended: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Reviewed-at: https://android-review.googlesource.com/#/c/238388/ Signed-off-by: Jeff Vander Stoep [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 28ee064b6744..a86faa41bfd2 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -11,6 +11,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y -- cgit From 0c9238c7a1cfd834d8bb96a2b1fabe0b1a5961df Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:10 +0530 Subject: config: android-recommended: enable CONFIG_ARM64_SW_TTBR0_PAN Enable PAN emulation using TTBR0_EL1 switching. 
Reviewed-at: https://android-review.googlesource.com/#/c/325997/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel and updated the commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a86faa41bfd2..a02c447769f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,6 +6,7 @@ # CONFIG_NF_CONNTRACK_SIP is not set # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set +CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y -- cgit From c1ebc2febdb85a73a4f91a9b9eaab6387619eaa6 Mon Sep 17 00:00:00 2001 From: Max Shi Date: Thu, 8 Jun 2017 18:09:11 +0530 Subject: config: android-base: disable CONFIG_USELIB and CONFIG_FHANDLE Turn off the two kernel configs to disable related system ABI. Reviewed-at: https://android-review.googlesource.com/#/c/264976/ Signed-off-by: Max Shi [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 26a06e09a5bd..efe5ff86767e 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,10 +1,12 @@ # KEEP ALPHABETICALLY SORTED # CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set # CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -- cgit From fb0b1538983c1cf7d2a2242b332a34a953753624 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:12 +0530 Subject: config: android-recommended: enable CONFIG_CPU_SW_DOMAIN_PAN Enable CPU domain PAN to ensure that normal kernel accesses are unable to access userspace addresses. 
Reviewed-at: https://android-review.googlesource.com/#/c/334035/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel, updated the commit message and re-placed the CONFIG_STRICT_KERNEL_RWX config in sorted order] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a02c447769f7..946fb92418f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -14,7 +14,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y -CONFIG_STRICT_KERNEL_RWX=y +CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y @@ -107,6 +107,7 @@ CONFIG_SCHEDSTATS=y CONFIG_SMARTJOYPLUS_FF=y CONFIG_SND=y CONFIG_SOUND=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -- cgit From 5b89db2fa545b473dc352689ac3afe407367ea34 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:13 +0530 Subject: config: android-base: add CONFIG_IKCONFIG option This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a requirement for the O release. Reviewed-at: https://android-review.googlesource.com/#/c/364553/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index efe5ff86767e..e12cfec25758 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -25,6 +25,8 @@ CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y -- cgit From 2096e1706336d83cd66ca744e4d904af4d63e25c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:14 +0530 Subject: config: android-base: add CONFIG_MODULES option This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS which are required by the O release. 
Reviewed-at: https://android-review.googlesource.com/#/c/364554/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index e12cfec25758..62cb392fc34b 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,7 +3,6 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set -# CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set @@ -64,6 +63,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y -- cgit From 2edfe6be206adc4c1055e053322d27267f8952bc Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 8 Jun 2017 18:09:15 +0530 Subject: config: android-base: add CGROUP_BPF Add CONFIG_CGROUP_BPF as a default configuration in android base config since it is used to replace XT_QTAGUID in future. Reviewed-at: https://android-review.googlesource.com/#/c/400374/ Signed-off-by: Chenbo Feng [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 62cb392fc34b..cdde5af6b332 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -14,6 +14,7 @@ CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y +CONFIG_CGROUP_BPF=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y -- cgit From 9e69dd0179c346dfb5d08b8d46d5f5c9c81ab1b7 Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Thu, 8 Jun 2017 18:09:16 +0530 Subject: config: android-base: disable CONFIG_NFSD and CONFIG_NFS_FS Disable Network file system support. 
Reviewed-at: https://android-review.googlesource.com/#/c/409559/ Signed-off-by: Roberto Pereira [AmitP: cherry-picked this change from Android common kernel and updated commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index cdde5af6b332..d70829033bb7 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -- cgit From 1e1fc133483ef3b56c20bf3cd9241146c41042f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 May 2017 00:29:38 -0400 Subject: compat_{get,put}_bitmap(): use unsafe_{get,put}_user() unroll the inner loops, while we are at it Signed-off-by: Al Viro --- include/linux/compat.h | 3 +- kernel/compat.c | 81 +++++++++++++++++--------------------------------- 2 files changed, 29 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c5f3152cbb5..94ceb0348a25 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -388,8 +388,7 @@ asmlinkage long compat_sys_wait4(compat_pid_t pid, #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) -#define BITS_TO_COMPAT_LONGS(bits) \ - (((bits)+BITS_PER_COMPAT_LONG-1)/BITS_PER_COMPAT_LONG) +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); diff --git a/kernel/compat.c b/kernel/compat.c index 860f674fa556..9c2a8f3788d5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -871,84 +871,59 @@ int get_compat_sigevent(struct sigevent *event, long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = 0; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - /* - * We dont want to read past the end of the userspace - * bitmap. We must however ensure the end of the - * kernel bitmap is zeroed. 
- */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__get_user(um, umask)) - return -EFAULT; - } else { - um = 0; - } - - umask++; - m |= (long)um << (j * BITS_PER_COMPAT_LONG); - } - *mask++ = m; + user_access_begin(); + while (nr_compat_longs > 1) { + compat_ulong_t l1, l2; + unsafe_get_user(l1, umask++, Efault); + unsafe_get_user(l2, umask++, Efault); + *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_get_user(*mask, umask++, Efault); + user_access_end(); return 0; + +Efault: + user_access_end(); + return -EFAULT; } long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = *mask++; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - um = m; - - /* - * We dont want to write past the end of the userspace - * bitmap. - */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__put_user(um, umask)) - return -EFAULT; - } - - umask++; - m >>= 4*sizeof(um); - m >>= 4*sizeof(um); - } + user_access_begin(); + while (nr_compat_longs > 1) { + unsigned long m = *mask++; + unsafe_put_user((compat_ulong_t)m, umask++, Efault); + unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); + user_access_end(); return 0; +Efault: + user_access_end(); + return -EFAULT; } void -- cgit From ca2406ed58fef3f7c8ef6470cba807bfc3415605 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:22:44 -0400 Subject: times(2): move compat to native Signed-off-by: Al Viro --- include/linux/time.h | 3 --- kernel/compat.c | 24 ------------------------ kernel/sys.c | 28 +++++++++++++++++++++++++++- 3 files changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index c0543f5f25de..f769ea88250d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -171,9 +171,6 @@ extern int do_getitimer(int which, struct itimerval *value); extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); -struct tms; -extern void do_sys_times(struct tms *); - /* * Similar to the struct tm in userspace , but it needs to be here so * that the kernel source is self contained. diff --git a/kernel/compat.c b/kernel/compat.c index 9c2a8f3788d5..99408252762e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,30 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -static compat_clock_t clock_t_to_compat_clock_t(clock_t x) -{ - return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); -} - -COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) -{ - if (tbuf) { - struct tms tms; - struct compat_tms tmp; - - do_sys_times(&tms); - /* Convert our struct tms to the compat version. 
*/ - tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); - tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); - tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); - tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); - if (copy_to_user(tbuf, &tmp, sizeof(tmp))) - return -EFAULT; - } - force_successful_syscall_return(); - return compat_jiffies_to_clock_t(jiffies); -} - #ifdef __ARCH_WANT_SYS_SIGPENDING /* diff --git a/kernel/sys.c b/kernel/sys.c index 3778a8a417b6..161b5eae9c77 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -void do_sys_times(struct tms *tms) +static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; @@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) return (long) jiffies_64_to_clock_t(get_jiffies_64()); } +#ifdef CONFIG_COMPAT +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) +{ + if (tbuf) { + struct tms tms; + struct compat_tms tmp; + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + force_successful_syscall_return(); + return compat_jiffies_to_clock_t(jiffies); +} +#endif + /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully -- cgit From d9e968cb9f849770288f5fde3d8d3a5f7e339052 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:33:51 -0400 Subject: getrlimit()/setrlimit(): move compat to native Signed-off-by: Al Viro --- kernel/compat.c | 38 -------------------------------------- kernel/sys.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 99408252762e..58b8e57398d1 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -427,44 +427,6 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - - if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || - __get_user(r.rlim_cur, &rlim->rlim_cur) || - __get_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - - if (r.rlim_cur == COMPAT_RLIM_INFINITY) - r.rlim_cur = RLIM_INFINITY; - if (r.rlim_max == COMPAT_RLIM_INFINITY) - r.rlim_max = RLIM_INFINITY; - return do_prlimit(current, resource, &r, NULL); -} - -COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - - ret = do_prlimit(current, resource, NULL, &r); - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || diff --git 
a/kernel/sys.c b/kernel/sys.c index 161b5eae9c77..873e6eaa314f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1332,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) return ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + struct compat_rlimit r32; + + if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) + return -EFAULT; + + if (r32.rlim_cur == COMPAT_RLIM_INFINITY) + r.rlim_cur = RLIM_INFINITY; + else + r.rlim_cur = r32.rlim_cur; + if (r32.rlim_max == COMPAT_RLIM_INFINITY) + r.rlim_max = RLIM_INFINITY; + else + r.rlim_max = r32.rlim_max; + return do_prlimit(current, resource, &r, NULL); +} + +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + int ret; + + ret = do_prlimit(current, resource, NULL, &r); + if (!ret) { + struct rlimit r32; + if (r.rlim_cur > COMPAT_RLIM_INFINITY) + r32.rlim_cur = COMPAT_RLIM_INFINITY; + else + r32.rlim_cur = r.rlim_cur; + if (r.rlim_max > COMPAT_RLIM_INFINITY) + r32.rlim_max = COMPAT_RLIM_INFINITY; + else + r32.rlim_max = r.rlim_max; + + if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) + return -EFAULT; + } + return ret; +} + +#endif + #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* -- cgit From 8f13621abcedb278cfecf9703583743f9c474c97 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:42:07 -0400 Subject: sigpending(): move compat to native ... and kill set_fs() use Signed-off-by: Al Viro --- kernel/compat.c | 23 ----------------------- kernel/signal.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 58b8e57398d1..195e23469854 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,29 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -#ifdef __ARCH_WANT_SYS_SIGPENDING - -/* - * Assumption: old_sigset_t and compat_old_sigset_t are both - * types that can be passed to put_user()/get_user(). 
- */ - -COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sigpending((old_sigset_t __user *) &s); - set_fs(old_fs); - if (ret == 0) - ret = put_user(s, set); - return ret; -} - -#endif - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/signal.c b/kernel/signal.c index d1eed0d7ca64..6237f492adfc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3254,6 +3254,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) +{ + sigset_t set; + int err = do_sigpending(&set, sizeof(old_sigset_t)); + if (err == 0) + if (copy_to_user(set32, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; +} +#endif + #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -- cgit From 7668b679c3b7931f6436d1eef50904831209e749 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 06:39:31 -0400 Subject: put_compat_rusage(): switch to copy_to_user() Signed-off-by: Al Viro --- kernel/compat.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 195e23469854..64e772aabdb5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -406,25 +406,27 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { - if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || - __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || - __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || - __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || - __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || - __put_user(r->ru_maxrss, &ru->ru_maxrss) || - __put_user(r->ru_ixrss, &ru->ru_ixrss) || - __put_user(r->ru_idrss, &ru->ru_idrss) || - __put_user(r->ru_isrss, &ru->ru_isrss) || - __put_user(r->ru_minflt, &ru->ru_minflt) || - __put_user(r->ru_majflt, &ru->ru_majflt) || - __put_user(r->ru_nswap, &ru->ru_nswap) || - __put_user(r->ru_inblock, &ru->ru_inblock) || - __put_user(r->ru_oublock, &ru->ru_oublock) || - __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || - __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || - __put_user(r->ru_nsignals, &ru->ru_nsignals) || - __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || - __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) + struct compat_rusage r32; + memset(&r32, 0, sizeof(r32)); + r32.ru_utime.tv_sec = r->ru_utime.tv_sec; + r32.ru_utime.tv_usec = r->ru_utime.tv_usec; + r32.ru_stime.tv_sec = r->ru_stime.tv_sec; + r32.ru_stime.tv_usec = r->ru_stime.tv_usec; + r32.ru_maxrss = r->ru_maxrss; + r32.ru_ixrss = r->ru_ixrss; + r32.ru_idrss = r->ru_idrss; + r32.ru_isrss = r->ru_isrss; + r32.ru_minflt = r->ru_minflt; + r32.ru_majflt = r->ru_majflt; + r32.ru_nswap = r->ru_nswap; + r32.ru_inblock = r->ru_inblock; + r32.ru_oublock = r->ru_oublock; + r32.ru_msgsnd = r->ru_msgsnd; + r32.ru_msgrcv = r->ru_msgrcv; + r32.ru_nsignals = r->ru_nsignals; + r32.ru_nvcsw = r->ru_nvcsw; + r32.ru_nivcsw = r->ru_nivcsw; + if (copy_to_user(ru, &r32, sizeof(r32))) return -EFAULT; return 0; } -- cgit From 1b3c872c8342803d0fcd8042e4e007d173191db6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:46:17 -0400 Subject: rt_sigtimedwait(): move compat to native Signed-off-by: Al Viro --- include/linux/signal.h | 2 -- kernel/compat.c | 32 
-------------------------------- kernel/signal.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 1f5a16620693..231603ac20a3 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -246,8 +246,6 @@ extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); -extern int do_sigtimedwait(const sigset_t *, siginfo_t *, - const struct timespec *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); diff --git a/kernel/compat.c b/kernel/compat.c index 64e772aabdb5..36e6e7c405e3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -866,38 +866,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) } } -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, - struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) -{ - compat_sigset_t s32; - sigset_t s; - struct timespec t; - siginfo_t info; - long ret; - - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&s, &s32); - - if (uts) { - if (compat_get_timespec(&t, uts)) - return -EFAULT; - } - - ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - - if (ret > 0 && uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 6237f492adfc..fe5b3608c31c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2768,7 +2768,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ -int do_sigtimedwait(const sigset_t *which, siginfo_t *info, +static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, const struct timespec *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; @@ -2857,6 +2857,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + struct timespec t; + siginfo_t info; + long ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + + if (uts) { + if (compat_get_timespec(&t, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&s, &info, uts ? 
&t : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} +#endif + /** * sys_kill - send a signal to a process * @pid: the PID of the process -- cgit From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:40 +0200 Subject: bpf: avoid excessive stack usage for perf_sample_data perf_sample_data consumes 386 bytes on stack, reduce excessive stack usage and move it to per cpu buffer. It's allowed due to preemption being disabled for tracing, xdp and tc programs, thus at all times only one program can run on a specific CPU and programs cannot run from interrupt. We similarly also handle bpf_pt_regs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08eb072430b9..051d7fca0c09 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw) { struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; @@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = raw; - perf_event_output(event, &sample_data, regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = raw; + perf_event_output(event, sd, regs); return 0; } -- cgit From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:41 +0200 Subject: bpf: don't check spilled reg state for non-STACK_SPILLed type slots spilled_regs[] state is only used for stack slots of type STACK_SPILL, never for STACK_MISC. Right now, in states_equal(), even if we have old and current stack state of type STACK_MISC, we compare spilled_regs[] for that particular offset. Just skip these like we do everywhere else. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..d031b3b0752e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (i % BPF_REG_SIZE) continue; + if (old->stack_slot_type[i] != STACK_SPILL) + continue; if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], sizeof(old->spilled_regs[0]))) -- cgit From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:42 +0200 Subject: bpf: reset id on CONST_IMM transition Whenever we set the register to the type CONST_IMM, we currently don't reset the id to 0. 
id member is not used in CONST_IMM case, so don't let it become stale, where pruning won't be able to match later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d031b3b0752e..d195d825515a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].id = 0; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); @@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].id = 0; return 0; } -- cgit From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:43 +0200 Subject: bpf: reset id on spilled regs in clear_all_pkt_pointers Right now, we don't reset the id of spilled registers in case of clear_all_pkt_pointers(). Given pkt_pointers are highly likely to contain an id, do so by reusing __mark_reg_unknown_value(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d195d825515a..519a6144d3d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - reg->type = UNKNOWN_VALUE; - reg->imm = 0; + __mark_reg_unknown_value(state->spilled_regs, + i / BPF_REG_SIZE); } } -- cgit From e4c1a0d15b62a3ed2f8cd560e9fa589dbe880c13 Mon Sep 17 00:00:00 2001 From: Derek Robson Date: Mon, 12 Jun 2017 14:33:03 +1200 Subject: audit: style fix Fixed checkpatch.pl warnings of "function definition argument FOO should also have an identifier name" Signed-off-by: Derek Robson Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index bb3a4e14b7e5..b331d9b83f63 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -248,13 +248,13 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *); +int audit_send_list(void *_dest); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; -extern int audit_del_rule(struct audit_entry *); -extern void audit_free_rule_rcu(struct rcu_head *); +extern int audit_del_rule(struct audit_entry *entry); +extern void audit_free_rule_rcu(struct rcu_head *head); extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); @@ -302,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE -extern struct audit_chunk *audit_tree_lookup(const struct inode *); -extern void audit_put_chunk(struct audit_chunk *); -extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); 
-extern int audit_make_tree(struct audit_krule *, char *, u32); -extern int audit_add_tree_rule(struct audit_krule *); -extern int audit_remove_tree_rule(struct audit_krule *); +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *); -extern void audit_put_tree(struct audit_tree *); -extern void audit_kill_trees(struct list_head *); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct list_head *list); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -324,7 +324,7 @@ extern void audit_kill_trees(struct list_head *); #define audit_kill_trees(list) BUG() #endif -extern char *audit_unpack_string(void **, size_t *, size_t); +extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; @@ -334,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype); #ifdef CONFIG_AUDITSYSCALL extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED -- cgit From 6a5ae63a0cc5d3ac73a96cb412fde66f3a71f98e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 8 Jun 2017 19:53:24 -0700 Subject: tracing: Remove unused declaration of trace_stop_cmdline_recording trace_stop_cmdline_recording declaration isn't in use, remove it. Link: http://lkml.kernel.org/r/20170609025327.9508-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..63deff9cdf2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1910,8 +1910,6 @@ static void tracing_stop_tr(struct trace_array *tr) raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -void trace_stop_cmdline_recording(void); - static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; -- cgit From f4e981cba2dec675d40ac4f270b7e8ac164c9004 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Wed, 24 May 2017 07:49:50 +0200 Subject: printk: add __printf attributes to internal functions When compiling with -Wsuggest-attribute=format, gcc complains that some functions in kernel/printk/printk_safe.c transmit their argument to printf-like functions without having a printf attribute. Silence these warnings by adding relevant __printf attributes. 
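To make the pattern concrete, a small standalone sketch; the local #define stands in for the kernel's __printf() macro, which expands to the same attribute. Index 0 as the second argument marks a function that consumes a va_list, so gcc validates the format string without expecting variadic arguments there, while __printf(1, 2) on the variadic wrapper gives full checking at its call sites:

#include <stdarg.h>
#include <stdio.h>

#define __printf(a, b) __attribute__((format(printf, a, b)))

static __printf(1, 0) int log_store(const char *fmt, va_list args)
{
        return vprintf(fmt, args); /* stand-in for the real sink */
}

static __printf(1, 2) int log_fmt(const char *fmt, ...)
{
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = log_store(fmt, args);
        va_end(args);
        return ret;
}

int main(void)
{
        /* log_fmt("%s", 42) would now draw a -Wformat warning */
        return log_fmt("%d widgets\n", 3) < 0;
}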
Link: http://lkml.kernel.org/r/20170524054950.6722-1-nicolas.iooss_linux@m4x.org Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Nicolas Iooss Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk_safe.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 03a42a539b20..3cdaeaef9ce1 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) * happen, printk_safe_log_store() will notice the buffer->len mismatch * and repeat the write. */ -static int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) +static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { int add; size_t len; @@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void) * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); @@ -330,7 +330,7 @@ void printk_nmi_exit(void) #else -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { return 0; } @@ -342,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args) * into itself. It uses a per-CPU buffer to store the message, just like * NMI. */ -static int vprintk_safe(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); -- cgit From c81be52a3ac0267aa830a2c4cb769030ea3483c9 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 09:35:24 -0400 Subject: audit: fix a race condition with the auditd tracking code Originally reported by Adam and Dusty, it appears we have a small race window in kauditd_thread(), as documented in the Fedora BZ: * https://bugzilla.redhat.com/show_bug.cgi?id=1459326#c35 "This issue is partly due to the read-copy nature of RCU, and partly due to how we sync the auditd_connection state across kauditd_thread and the audit control channel. The kauditd_thread thread is always running so it can service the record queues and emit the multicast messages, if it happens to be just past the "main_queue" label, but before the "if (sk == NULL || ...)" if-statement which calls auditd_reset() when the new auditd connection is registered it could end up resetting the auditd connection, regardless of if it is valid or not. This is a rather small window and the variable nature of multi-core scheduling explains why this is proving rather difficult to reproduce." The fix is to have functions only call auditd_reset() when they believe that the kernel/auditd connection is still valid, e.g. non-NULL, and to have these callers pass their local copy of the auditd_connection pointer to auditd_reset() where it can be compared with the current connection state before resetting. If the caller has a stale state tracking pointer then the reset is ignored. We also make a small change to kauditd_thread() so that if the kernel/auditd connection is dead we skip the retry queue and send the records straight to the hold queue. 
This is necessary as we used to rely on auditd_reset() to occasionally purge the retry queue but we are going to be calling the reset function much less now and we want to make sure the retry queue doesn't grow unbounded. Reported-by: Adam Williamson Reported-by: Dusty Mabe Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b2e877100242..e1e2b3abfb93 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb) /** * auditd_reset - Disconnect the auditd connection + * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the - * hold queue in case auditd reconnects. + * hold queue in case auditd reconnects. It is important to note that the @ac + * pointer should never be dereferenced inside this function as it may be NULL + * or invalid, you can only compare the memory address! If @ac is NULL then + * the connection will always be reset. */ -static void auditd_reset(void) +static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; @@ -590,6 +594,11 @@ static void auditd_reset(void) spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); + if (ac && ac != ac_old) { + /* someone already registered a new auditd connection */ + spin_unlock_irqrestore(&auditd_conn_lock, flags); + return; + } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); @@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) return rc; err: - if (rc == -ECONNREFUSED) - auditd_reset(); + if (ac && rc == -ECONNREFUSED) + auditd_reset(ac); return rc; } @@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -815,12 +824,13 @@ main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the - * multicast send and move the record to the retry queue */ + * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, - kauditd_retry_skb); - if (sk == NULL || rc < 0) - auditd_reset(); + (sk ? 
+ kauditd_retry_skb : kauditd_hold_skb)); + if (ac && rc < 0) + auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ @@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) auditd_pid, 1); /* unregister the auditd connection */ - auditd_reset(); + auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { -- cgit From 02fd7f68f5342bc7e8054cb05ea4a07f26d41d12 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:42 -0500 Subject: trace: rename kernel enum section to eval The kernel and its modules have sections containing the enum string to value conversions. Rename this section because we intend to store more than enums in it. Link: http://lkml.kernel.org/r/20170531215653.3240-2-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 6 +++--- include/trace/trace_events.h | 2 +- kernel/module.c | 2 +- kernel/trace/trace.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 314a0b9219c6..800f9f9677a6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -125,9 +125,9 @@ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ KEEP(*(_ftrace_events)) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ - VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ - KEEP(*(_ftrace_enum_map)) \ - VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; + VMLINUX_SYMBOL(__start_ftrace_eval_maps) = .; \ + KEEP(*(_ftrace_eval_map)) \ + VMLINUX_SYMBOL(__stop_ftrace_eval_maps) = .; #else #define FTRACE_EVENTS() #endif diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 00f643164ca2..4bdd84023f5b 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -43,7 +43,7 @@ TRACE_MAKE_SYSTEM_STR(); .enum_value = a \ }; \ static struct trace_enum_map __used \ - __attribute__((section("_ftrace_enum_map"))) \ + __attribute__((section("_ftrace_eval_map"))) \ *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a /* diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..9ec4713c5eee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,7 +3077,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_enum_map", + mod->trace_enums = section_objs(info, "_ftrace_eval_map", sizeof(*mod->trace_enums), &mod->num_trace_enums); #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63deff9cdf2c..acd3eb4d56a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7732,15 +7732,15 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_enum_maps[]; -extern struct trace_enum_map *__stop_ftrace_enum_maps[]; +extern struct trace_enum_map *__start_ftrace_eval_maps[]; +extern struct trace_enum_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { int len; - len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; - trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -- cgit From 00f4b652b6f1dbfd4e1d5419d7f1cc23b1374da8 Mon Sep 17 00:00:00 2001 From: Jeremy 
Linton Date: Wed, 31 May 2017 16:56:43 -0500 Subject: trace: rename trace_enum_map to trace_eval_map Each enum is loaded into the trace_enum_map, as we are now using this for more than enums rename it. Link: http://lkml.kernel.org/r/20170531215653.3240-3-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/linux/module.h | 2 +- include/linux/tracepoint.h | 6 +++--- include/trace/trace_events.h | 8 ++++---- kernel/trace/trace.c | 24 ++++++++++++------------ kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events.c | 14 +++++++------- 6 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..46b48043d741 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -442,7 +442,7 @@ struct module { #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; - struct trace_enum_map **trace_enums; + struct trace_eval_map **trace_enums; unsigned int num_trace_enums; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index cc48cb2ce209..f7b0f5525e46 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -25,10 +25,10 @@ struct module; struct tracepoint; struct notifier_block; -struct trace_enum_map { +struct trace_eval_map { const char *system; - const char *enum_string; - unsigned long enum_value; + const char *eval_string; + unsigned long eval_value; }; #define TRACEPOINT_DEFAULT_PRIO 10 diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4bdd84023f5b..49cce5fb54ee 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -35,14 +35,14 @@ TRACE_MAKE_SYSTEM_STR(); #undef TRACE_DEFINE_ENUM #define TRACE_DEFINE_ENUM(a) \ - static struct trace_enum_map __used __initdata \ + static struct trace_eval_map __used __initdata \ __##TRACE_SYSTEM##_##a = \ { \ .system = TRACE_SYSTEM_STRING, \ - .enum_string = #a, \ - .enum_value = a \ + .eval_string = #a, \ + .eval_value = a \ }; \ - static struct trace_enum_map __used \ + static struct trace_eval_map __used \ __attribute__((section("_ftrace_eval_map"))) \ *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acd3eb4d56a0..46fac3f63af1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ union trace_enum_map_item; struct trace_enum_map_tail { /* * "end" is first and points to NULL as it must be different - * than "mod" or "enum_string" + * than "mod" or "eval_string" */ union trace_enum_map_item *next; const char *end; /* points to NULL */ @@ -148,7 +148,7 @@ static DEFINE_MUTEX(trace_enum_mutex); * pointer to the next array of saved enum_map items. 
*/ union trace_enum_map_item { - struct trace_enum_map map; + struct trace_eval_map map; struct trace_enum_map_head head; struct trace_enum_map_tail tail; }; @@ -4748,7 +4748,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { static union trace_enum_map_item * update_enum_map(union trace_enum_map_item *ptr) { - if (!ptr->map.enum_string) { + if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ @@ -4808,7 +4808,7 @@ static int enum_map_show(struct seq_file *m, void *v) union trace_enum_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", - ptr->map.enum_string, ptr->map.enum_value, + ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; @@ -4844,11 +4844,11 @@ trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, +trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, int len) { - struct trace_enum_map **stop; - struct trace_enum_map **map; + struct trace_eval_map **stop; + struct trace_eval_map **map; union trace_enum_map_item *map_array; union trace_enum_map_item *ptr; @@ -4902,13 +4902,13 @@ static void trace_create_enum_file(struct dentry *d_tracer) #else /* CONFIG_TRACE_ENUM_MAP_FILE */ static inline void trace_create_enum_file(struct dentry *d_tracer) { } static inline void trace_insert_enum_map_file(struct module *mod, - struct trace_enum_map **start, int len) { } + struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ static void trace_insert_enum_map(struct module *mod, - struct trace_enum_map **start, int len) + struct trace_eval_map **start, int len) { - struct trace_enum_map **map; + struct trace_eval_map **map; if (len <= 0) return; @@ -7732,8 +7732,8 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_eval_maps[]; -extern struct trace_enum_map *__stop_ftrace_eval_maps[]; +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39fd77330aab..a9667297ae49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_enum_map **map, int len); +void trace_event_enum_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7973e10398c..cf5b9aa4d732 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,18 +2067,18 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the enum value as a string */ - elen = snprintf(ptr, 0, "%ld", map->enum_value); + elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace 
the string with the value */ if (len < elen) return NULL; - snprintf(ptr, elen + 1, "%ld", map->enum_value); + snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); @@ -2090,11 +2090,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) } static void update_event_printk(struct trace_event_call *call, - struct trace_enum_map *map) + struct trace_eval_map *map) { char *ptr; int quote = 0; - int len = strlen(map->enum_string); + int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { @@ -2125,7 +2125,7 @@ static void update_event_printk(struct trace_event_call *call, continue; } if (isalpha(*ptr) || *ptr == '_') { - if (strncmp(map->enum_string, ptr, len) == 0 && + if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = enum_replace(ptr, map, len); /* Hmm, enum string smaller than value */ @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_enum_map **map, int len) +void trace_event_enum_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit From 99be647c5841d570a23b5dfa65bfecada8b6e6b5 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:44 -0500 Subject: trace: rename struct module entry for trace enums Each module has a list of enum's its contributing to the enum map, rename that entry to reflect its use by more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-4-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/linux/module.h | 4 ++-- kernel/module.c | 6 +++--- kernel/trace/trace.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 46b48043d741..8eb9a1e693e5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -442,8 +442,8 @@ struct module { #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; - struct trace_eval_map **trace_enums; - unsigned int num_trace_enums; + struct trace_eval_map **trace_evals; + unsigned int num_trace_evals; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; diff --git a/kernel/module.c b/kernel/module.c index 9ec4713c5eee..df1c4a9e7abb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,9 +3077,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_eval_map", - sizeof(*mod->trace_enums), - &mod->num_trace_enums); + mod->trace_evals = section_objs(info, "_ftrace_eval_map", + sizeof(*mod->trace_evals), + &mod->num_trace_evals); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46fac3f63af1..061abd8ba101 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7746,7 +7746,7 @@ static void __init trace_enum_init(void) #ifdef CONFIG_MODULES static void trace_module_add_enums(struct module *mod) { - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; /* @@ -7756,7 +7756,7 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) 
return; - trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); + trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE @@ -7765,7 +7765,7 @@ static void trace_module_remove_enums(struct module *mod) union trace_enum_map_item *map; union trace_enum_map_item **last = &trace_enum_maps; - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); -- cgit From 23bf8cb8dc86c0368d2471ebb4622e7edd38190b Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:45 -0500 Subject: trace: rename trace enum data structures in trace.c The enum map entries can be exported to userspace via a sys enum_map file. Rename those functions and structures to reflect the fact that we are using them for more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-5-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 061abd8ba101..13c81f4f2bd7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -122,38 +122,38 @@ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_ENUM_MAP_FILE /* Map of enums to their values, for "enum_map" file */ -struct trace_enum_map_head { +struct trace_eval_map_head { struct module *mod; unsigned long length; }; -union trace_enum_map_item; +union trace_eval_map_item; -struct trace_enum_map_tail { +struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ - union trace_enum_map_item *next; + union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_enum_mutex); /* - * The trace_enum_maps are saved in an array with two extra elements, + * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved enum_map items. */ -union trace_enum_map_item { +union trace_eval_map_item { struct trace_eval_map map; - struct trace_enum_map_head head; - struct trace_enum_map_tail tail; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; }; -static union trace_enum_map_item *trace_enum_maps; +static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4745,8 +4745,8 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { }; #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static union trace_enum_map_item * -update_enum_map(union trace_enum_map_item *ptr) +static union trace_eval_map_item * +update_enum_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4761,7 +4761,7 @@ update_enum_map(union trace_enum_map_item *ptr) static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. 
@@ -4782,12 +4782,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) static void *enum_map_start(struct seq_file *m, loff_t *pos) { - union trace_enum_map_item *v; + union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_enum_mutex); - v = trace_enum_maps; + v = trace_eval_maps; if (v) v++; @@ -4805,7 +4805,7 @@ static void enum_map_stop(struct seq_file *m, void *v) static int enum_map_show(struct seq_file *m, void *v) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, @@ -4836,8 +4836,8 @@ static const struct file_operations tracing_enum_map_fops = { .release = seq_release, }; -static inline union trace_enum_map_item * -trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +static inline union trace_eval_map_item * +trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4849,13 +4849,13 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, { struct trace_eval_map **stop; struct trace_eval_map **map; - union trace_enum_map_item *map_array; - union trace_enum_map_item *ptr; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; stop = start + len; /* - * The trace_enum_maps contains the map plus a head and tail item, + * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ @@ -4867,10 +4867,10 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, mutex_lock(&trace_enum_mutex); - if (!trace_enum_maps) - trace_enum_maps = map_array; + if (!trace_eval_maps) + trace_eval_maps = map_array; else { - ptr = trace_enum_maps; + ptr = trace_eval_maps; for (;;) { ptr = trace_enum_jmp_to_tail(ptr); if (!ptr->tail.next) @@ -7762,15 +7762,15 @@ static void trace_module_add_enums(struct module *mod) #ifdef CONFIG_TRACE_ENUM_MAP_FILE static void trace_module_remove_enums(struct module *mod) { - union trace_enum_map_item *map; - union trace_enum_map_item **last = &trace_enum_maps; + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); - map = trace_enum_maps; + map = trace_eval_maps; while (map) { if (map->head.mod == mod) -- cgit From 1793ed939b2a0e18b06467e10d15b66925d75d5f Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:46 -0500 Subject: trace: rename trace_enum_mutex to trace_eval_mutex There is a lock protecting the trace_enum_map, rename it to reflect the use by more than enums. 
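[Editorial side note, not part of the patch: the locking pattern visible in these hunks takes the mutex in the seq_file ->start() callback and releases it in ->stop(), so the lock is held across every ->next()/->show() step of a read. A schematic kernel-side sketch with illustrative names; some_first_item() stands in for the real cursor logic:

#include <linux/mutex.h>
#include <linux/seq_file.h>

static DEFINE_MUTEX(example_mutex);	/* plays the role of trace_eval_mutex */

static void *example_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&example_mutex);	/* held for the whole traversal */
	return some_first_item(pos);	/* returning NULL ends the iteration */
}

static void example_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&example_mutex);	/* always reached, see below */
}

Because the seq_file core pairs every ->start() with a ->stop(), even on error paths, a reader can never return to user space with the mutex held.]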
Link: http://lkml.kernel.org/r/20170531215653.3240-6-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 13c81f4f2bd7..e0279f5dc83f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -138,7 +138,7 @@ struct trace_eval_map_tail { const char *end; /* points to NULL */ }; -static DEFINE_MUTEX(trace_enum_mutex); +static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, @@ -4785,7 +4785,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) union trace_eval_map_item *v; loff_t l = 0; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) @@ -4800,7 +4800,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) static void enum_map_stop(struct seq_file *m, void *v) { - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static int enum_map_show(struct seq_file *m, void *v) @@ -4865,7 +4865,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -4890,7 +4890,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, } memset(map_array, 0, sizeof(*map_array)); - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static void trace_create_enum_file(struct dentry *d_tracer) @@ -7768,7 +7768,7 @@ static void trace_module_remove_enums(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); map = trace_eval_maps; @@ -7785,7 +7785,7 @@ static void trace_module_remove_enums(struct module *mod) *last = trace_enum_jmp_to_tail(map)->tail.next; kfree(map); out: - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_enums(struct module *mod) { } -- cgit From 5f60b351a7e37bf243725bcc131708be2c8ea497 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:47 -0500 Subject: trace: rename trace.c enum functions Rename the init and trace_enum_jmp_to_tail() routines to reflect their use by more than enumerated types. 
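[Editorial gloss, reconstructed from the comments in kernel/trace/trace.c quoted above rather than new code: each saved block is laid out as

	[ head | map[0] | map[1] | ... | map[len - 1] | tail ]

where head.length holds len, head.mod records the owning module (NULL when built in), and tail.next points at the next such block. Jumping from a head to its tail is therefore plain pointer arithmetic over the union items,

	tail = ptr + ptr->head.length + 1;	/* skip the head plus all maps */

which is exactly what trace_eval_jmp_to_tail() computes, and why the module-removal loop can hop from block to block through map->tail.next.]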
Link: http://lkml.kernel.org/r/20170531215653.3240-7-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e0279f5dc83f..d703f429bbd9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4837,7 +4837,7 @@ static const struct file_operations tracing_enum_map_fops = { }; static inline union trace_eval_map_item * -trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4872,7 +4872,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, else { ptr = trace_eval_maps; for (;;) { - ptr = trace_enum_jmp_to_tail(ptr); + ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; @@ -7735,7 +7735,7 @@ struct dentry *tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_enum_init(void) +static void __init trace_eval_init(void) { int len; @@ -7775,14 +7775,14 @@ static void trace_module_remove_enums(struct module *mod) while (map) { if (map->head.mod == mod) break; - map = trace_enum_jmp_to_tail(map); + map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) goto out; - *last = trace_enum_jmp_to_tail(map)->tail.next; + *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); out: mutex_unlock(&trace_eval_mutex); @@ -7839,7 +7839,7 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); - trace_enum_init(); + trace_eval_init(); trace_create_enum_file(d_tracer); -- cgit From f57a41434fc51732dd5e35e0e1aa9e607f1a05d6 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:48 -0500 Subject: trace: rename enum_map functions Rename the core trace enum routines to use eval, to reflect their use by more than just enum to value mapping. Link: http://lkml.kernel.org/r/20170531215653.3240-8-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++----------------------- kernel/trace/trace.h | 4 +-- kernel/trace/trace_events.c | 2 +- 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d703f429bbd9..d830be7f0ba6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_map items. + * pointer to the next array of saved enum_eval/enum_map items. */ union trace_eval_map_item { struct trace_eval_map map; @@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. - * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. 
By defining "C(a, b) b", TRACE_FLAGS becomes a list - * of strings in the order that the enums were defined. + * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b @@ -4746,7 +4746,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { #ifdef CONFIG_TRACE_ENUM_MAP_FILE static union trace_eval_map_item * -update_enum_map(union trace_eval_map_item *ptr) +update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4759,7 +4759,7 @@ update_enum_map(union trace_eval_map_item *ptr) return ptr; } -static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; @@ -4767,7 +4767,7 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; @@ -4775,12 +4775,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); return ptr; } -static void *enum_map_start(struct seq_file *m, loff_t *pos) +static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; @@ -4792,18 +4792,18 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) v++; while (v && l < *pos) { - v = enum_map_next(m, v, &l); + v = eval_map_next(m, v, &l); } return v; } -static void enum_map_stop(struct seq_file *m, void *v) +static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } -static int enum_map_show(struct seq_file *m, void *v) +static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; @@ -4814,23 +4814,23 @@ static int enum_map_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations tracing_enum_map_seq_ops = { - .start = enum_map_start, - .next = enum_map_next, - .stop = enum_map_stop, - .show = enum_map_show, +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, }; -static int tracing_enum_map_open(struct inode *inode, struct file *filp) +static int tracing_eval_map_open(struct inode *inode, struct file *filp) { if (tracing_disabled) return -ENODEV; - return seq_open(filp, &tracing_enum_map_seq_ops); + return seq_open(filp, &tracing_eval_map_seq_ops); } -static const struct file_operations tracing_enum_map_fops = { - .open = tracing_enum_map_open, +static const struct file_operations tracing_eval_map_fops = { + .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -4844,7 +4844,7 @@ trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; @@ -4861,7 +4861,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warn("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace eval mapping\n"); return; } @@ -4893,19 +4893,19 @@ trace_insert_enum_map_file(struct module *mod, 
struct trace_eval_map **start, mutex_unlock(&trace_eval_mutex); } -static void trace_create_enum_file(struct dentry *d_tracer) +static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("enum_map", 0444, d_tracer, - NULL, &tracing_enum_map_fops); + NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_ENUM_MAP_FILE */ -static inline void trace_create_enum_file(struct dentry *d_tracer) { } -static inline void trace_insert_enum_map_file(struct module *mod, +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ -static void trace_insert_enum_map(struct module *mod, +static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; @@ -4915,9 +4915,9 @@ static void trace_insert_enum_map(struct module *mod, map = start; - trace_event_enum_update(map, len); + trace_event_eval_update(map, len); - trace_insert_enum_map_file(mod, start, len); + trace_insert_eval_map_file(mod, start, len); } static ssize_t @@ -7740,11 +7740,11 @@ static void __init trace_eval_init(void) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); + trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -static void trace_module_add_enums(struct module *mod) +static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) return; @@ -7756,11 +7756,11 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); + trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static void trace_module_remove_enums(struct module *mod) +static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; @@ -7788,7 +7788,7 @@ static void trace_module_remove_enums(struct module *mod) mutex_unlock(&trace_eval_mutex); } #else -static inline void trace_module_remove_enums(struct module *mod) { } +static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, @@ -7798,10 +7798,10 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - trace_module_add_enums(mod); + trace_module_add_evals(mod); break; case MODULE_STATE_GOING: - trace_module_remove_enums(mod); + trace_module_remove_evals(mod); break; } @@ -7841,7 +7841,7 @@ static __init int tracer_init_tracefs(void) trace_eval_init(); - trace_create_enum_file(d_tracer); + trace_create_eval_file(d_tracer); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a9667297ae49..69a3ab3ee4f5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_eval_map **map, int len); +void trace_event_eval_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } +static 
inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cf5b9aa4d732..e6897b005947 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_eval_map **map, int len) +void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit From 67ec0d85955630924b971e04c0954370a74b8706 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:49 -0500 Subject: tracing: Rename enum_replace to eval_replace The enum_replace stanza works as is for sizeof() calls as well as enums. Rename it as well. Link: http://lkml.kernel.org/r/20170531215653.3240-9-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e6897b005947..83dfd0dbbbfe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,12 +2067,12 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) +static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; - /* Find the length of the enum value as a string */ + /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) @@ -2127,14 +2127,14 @@ static void update_event_printk(struct trace_event_call *call, if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { - ptr = enum_replace(ptr, map, len); - /* Hmm, enum string smaller than value */ + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* - * No need to decrement here, as enum_replace() + * No need to decrement here, as eval_replace() * returns the pointer to the character passed - * the enum, and two enums can not be placed + * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ -- cgit From 681bec0367c2606b6154060310a2ffa543175980 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:53 -0500 Subject: tracing: Rename update the enum_map file The enum_map file is used to display a list of symbol to name conversions. As its now used to resolve sizeof lets update the name and description. 
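[Editorial illustration of what ends up in this file; the event system and enum names below are hypothetical, not from the patch. A trace header feeds entries into the map with the TRACE_DEFINE_ENUM() macro shown earlier in this series, roughly:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM sample

enum sample_state { SAMPLE_IDLE, SAMPLE_BUSY };

TRACE_DEFINE_ENUM(SAMPLE_IDLE);
TRACE_DEFINE_ENUM(SAMPLE_BUSY);

With the option enabled, each entry is then rendered by the seq_printf() format shown in the earlier hunks as one "name value (system)" line, e.g.

SAMPLE_BUSY 1 (sample)]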
Link: http://lkml.kernel.org/r/20170531215653.3240-13-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 22 +++++++++++----------- kernel/trace/trace.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e06f04e98fe..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N -config TRACE_ENUM_MAP_FILE - bool "Show enum mappings for trace events" +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" depends on TRACING help - The "print fmt" of the trace events will show the enum names instead - of their values. This can cause problems for user space tools that - use this string to parse the raw data as user space does not know + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. This can cause problems for user space tools + that use this string to parse the raw data as user space does not know how to convert the string to its value. To fix this, there's a special macro in the kernel that can be used - to convert the enum into its value. If this macro is used, then the - print fmt strings will have the enums converted to their values. + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. If something does not get converted properly, this option can be - used to show what enums the kernel tried to convert. + used to show what enums/sizeof the kernel tried to convert. - This option is for debugging the enum conversions. A file is created - in the tracing directory called "enum_map" that will show the enum + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the names matched with their values and what trace event system they belong too. Normally, the mapping of the strings to values will be freed after boot up or module load. With this option, they will not be freed, as - they are needed for the "enum_map" file. Enabling this option will + they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. If unsure, say N diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d830be7f0ba6..19ac2088d10a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,8 +120,8 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -/* Map of enums to their values, for "enum_map" file */ +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_eval/enum_map items. + * pointer to the next array of saved eval_map items. 
*/ union trace_eval_map_item { struct trace_eval_map map; @@ -154,7 +154,7 @@ union trace_eval_map_item { }; static union trace_eval_map_item *trace_eval_maps; -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4744,7 +4744,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { @@ -4895,15 +4895,15 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("enum_map", 0444, d_tracer, + trace_create_file("eval_map", 0444, d_tracer, NULL, &tracing_eval_map_fops); } -#else /* CONFIG_TRACE_ENUM_MAP_FILE */ +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } -#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) @@ -7759,7 +7759,7 @@ static void trace_module_add_evals(struct module *mod) trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; @@ -7789,7 +7789,7 @@ static void trace_module_remove_evals(struct module *mod) } #else static inline void trace_module_remove_evals(struct module *mod) { } -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) -- cgit From 31fd85816dbe3a714bcc3f67c17c3dd87011f79e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Jun 2017 15:52:13 -0700 Subject: bpf: permits narrower load from bpf program context fields Currently, verifier will reject a program if it contains an narrower load from the bpf context structure. For example, __u8 h = __sk_buff->hash, or __u16 p = __sk_buff->protocol __u32 sample_period = bpf_perf_event_data->sample_period which are narrower loads of 4-byte or 8-byte field. This patch solves the issue by: . Introduce a new parameter ctx_field_size to carry the field size of narrower load from prog type specific *__is_valid_access validator back to verifier. . The non-zero ctx_field_size for a memory access indicates (1). underlying prog type specific convert_ctx_accesses supporting non-whole-field access (2). the current insn is a narrower or whole field access. . In verifier, for such loads where load memory size is less than ctx_field_size, verifier transforms it to a full field load followed by proper masking. . Currently, __sk_buff and bpf_perf_event_data->sample_period are supporting narrowing loads. . Narrower stores are still not allowed as typical ctx stores are just normal stores. Because of this change, some tests in verifier will fail and these tests are removed. As a bonus, rename some out of bound __sk_buff->cb access to proper field name and remove two redundant "skb cb oob" tests. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 71 ++++++++++++++++------ kernel/trace/bpf_trace.c | 21 +++++-- net/core/filter.c | 56 +++++++++++++----- tools/testing/selftests/bpf/test_verifier.c | 92 ++++------------------------- 6 files changed, 124 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c32bace66d3d..1bcbf0a71f75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -157,7 +157,7 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type); + enum bpf_reg_type *reg_type, int *ctx_field_size); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d5093b52b485..189741c0da85 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -73,6 +73,7 @@ struct bpf_insn_aux_data { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; + int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 519a6144d3d3..44b97d958fb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -758,15 +758,26 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + int ctx_field_size = 0; + /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { + /* a non zero ctx_field_size indicates: + * . For this field, the prog type specific ctx conversion algorithm + * only supports whole field access. + * . This ctx access is a candiate for later verifier transformation + * to load the whole field and then apply a mask to get correct result. 
+ */ + if (ctx_field_size) + env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -868,7 +879,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { @@ -911,7 +922,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value_and_range(state->regs, value_regno); @@ -972,7 +983,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; int err; @@ -994,13 +1005,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; /* check whether atomic_add can read the memory */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); if (err) return err; /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn->dst_reg, insn->off, + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); } @@ -1416,7 +1427,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); if (err) return err; } @@ -2993,18 +3004,12 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn->src_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W && - BPF_SIZE(insn->code) != BPF_DW) { - insn_idx++; - continue; - } - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { @@ -3032,7 +3037,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn); + err = check_xadd(env, insn_idx, insn); if (err) return err; insn_idx++; @@ -3051,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) @@ -3080,7 +3085,7 @@ static int do_check(struct bpf_verifier_env *env) return err; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) @@ -3383,7 +3388,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, delta = 0; + int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3423,11 +3428,39 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; + off = insn->off; + size = bpf_size_to_bytes(BPF_SIZE(insn->code)); + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; + is_narrower_load = (type == BPF_READ && size < ctx_field_size); + + /* If the read access is a narrower load of the field, + * convert to a 4/8-byte load, to minimum program type specific + * convert_ctx_access changes. If conversion is successful, + * we will apply proper mask to the result. 
+ */ + if (is_narrower_load) { + int size_code = BPF_H; + + if (ctx_field_size == 4) + size_code = BPF_W; + else if (ctx_field_size == 8) + size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); + insn->code = BPF_LDX | BPF_MEM | size_code; + } cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } + if (is_narrower_load) { + if (ctx_field_size <= 4) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + else + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 051d7fca0c09..9d3ec8253131 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,17 +581,26 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { + int sample_period_off; + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (off == offsetof(struct bpf_perf_event_data, sample_period)) { - if (size != sizeof(u64)) - return false; + + /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ + sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); + if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { + *ctx_field_size = 8; +#ifdef __LITTLE_ENDIAN + return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; +#else + return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; +#endif } else { if (size != sizeof(long)) return false; diff --git a/net/core/filter.c b/net/core/filter.c index a65a3b25e104..60ed6f343a63 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2856,7 +2856,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static bool __is_valid_access(int off, int size) +static bool __is_valid_access(int off, int size, enum bpf_access_type type, + int *ctx_field_size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2872,9 +2873,27 @@ static bool __is_valid_access(int off, int size) offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) return false; break; - default: + case offsetof(struct __sk_buff, data) ... + offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data_end) ... 
+ offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: if (size != sizeof(__u32)) return false; + break; + default: + /* permit narrower load for not cb/data/data_end fields */ + *ctx_field_size = 4; + if (type == BPF_WRITE) { + if (size != sizeof(__u32)) + return false; + } else { + if (size != sizeof(__u32)) +#ifdef __LITTLE_ENDIAN + return (off & 0x3) == 0 && (size == 1 || size == 2); +#else + return (off & 0x3) + size == 4 && (size == 1 || size == 2); +#endif + } } return true; @@ -2882,12 +2901,16 @@ static bool __is_valid_access(int off, int size) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): - case offsetof(struct __sk_buff, data): - case offsetof(struct __sk_buff, data_end): + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data) ... + offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data_end) ... + offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: return false; } @@ -2901,15 +2924,17 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: return false; } @@ -2934,12 +2959,13 @@ static bool lwt_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) { switch (off) { @@ -3002,7 +3028,8 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) { switch (off) { @@ -3027,7 +3054,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool __is_valid_xdp_access(int off, int size) @@ -3044,7 +3071,8 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) return false; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4ee4708b0d60..13341700930c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1073,44 +1073,22 @@ static struct bpf_test tests[] = { .result = ACCEPT, }, { - "check cb access: byte, oob 1", + "__sk_buff->hash, offset 0, byte store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 4), + offsetof(struct 
__sk_buff, hash)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: byte, oob 2", + "__sk_buff->tc_index, offset 3, byte store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 1), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: byte, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 4), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: byte, oob 4", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[0]) - 1), + offsetof(struct __sk_buff, tc_index) + 3), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", @@ -1188,44 +1166,22 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check cb access: half, oob 1", + "check __sk_buff->hash, offset 0, half store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 4), + offsetof(struct __sk_buff, hash)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: half, oob 2", + "check __sk_buff->tc_index, offset 2, half store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 2), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: half, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 4), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: half, oob 4", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[0]) - 2), + offsetof(struct __sk_buff, tc_index) + 2), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", @@ -1366,28 +1322,6 @@ static struct bpf_test tests[] = { }, { "check cb access: double, oob 2", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 8), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: double, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 8), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: double, oob 4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, @@ -1398,22 +1332,22 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check cb access: double, oob 5", + "check __sk_buff->ifindex dw store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 8), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, ifindex)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: double, oob 6", + "check __sk_buff->ifindex dw load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - 
offsetof(struct __sk_buff, cb[0]) - 8), + offsetof(struct __sk_buff, ifindex)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", -- cgit From 73a7242a06ff995d771fbe243e72b516feaa6e3d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:01 -0400 Subject: cgroup: Keep accurate count of tasks in each css_set The reference count in the css_set data structure was used as a proxy of the number of tasks attached to that css_set. However, that count is actually not an accurate measure especially with thread mode support. So a new variable nr_tasks is added to the css_set to keep track of the actual task count. This new variable is protected by the css_set_lock. Functions that require the actual task count are updated to use the new variable. tj: s/task_count/nr_tasks/ for consistency with cgroup_root->nr_cgrps. Refreshed on top of cgroup/for-v4.13 which dropped on css_set_populated() -> nr_tasks conversion. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/cgroup-v1.c | 6 +----- kernel/cgroup/cgroup.c | 10 ++++++++++ 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ec47101cb1bf..3bc4196bf217 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -166,6 +166,9 @@ struct css_set { /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; + /* internal task count, protected by css_set_lock */ + int nr_tasks; + /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..e9ea5f201fac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -334,10 +334,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { @@ -346,7 +342,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += refcount_read(&link->cset->refcount); + count += link->cset->nr_tasks; spin_unlock_irq(&css_set_lock); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8d4e85eae42c..dbfd7028b1c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set + * + * css_set_populated() should be the same as !!cset->nr_tasks at steady + * state. However, css_set_populated() can be called while a task is being + * added to or removed from the linked list before the nr_tasks is + * properly updated. Hence, we can't just look at ->nr_tasks here. 
*/ static bool css_set_populated(struct css_set *cset) { @@ -1598,6 +1603,7 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); + cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -2064,8 +2070,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); + to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); put_css_set_locked(from_cset); + from_cset->nr_tasks--; } } spin_unlock_irq(&css_set_lock); @@ -4789,6 +4797,7 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); + cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } spin_unlock_irq(&css_set_lock); @@ -4838,6 +4847,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); -- cgit From a28f8f5e995fe5964ae304444913536058f26e37 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:02 -0400 Subject: cgroup: Move debug cgroup to its own file The debug cgroup currently resides within cgroup-v1.c and is enabled only for v1 cgroup. To enable the debug cgroup also for v2, it makes sense to put the code into its own file as it will no longer be v1 specific. There is no change to the debug cgroup specific code. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/Makefile | 1 + kernel/cgroup/cgroup-internal.h | 2 + kernel/cgroup/cgroup-v1.c | 149 +------------------------------------- kernel/cgroup/debug.c | 153 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 148 deletions(-) create mode 100644 kernel/cgroup/debug.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int cgroup_task_count(const struct cgroup *cgrp); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e9ea5f201fac..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -335,7 +335,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, * cgroup_task_count - count the number of tasks in a cgroup. 
* @cgrp: the cgroup in question */ -static int cgroup_task_count(const struct cgroup *cgrp) +int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; @@ -1259,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); - - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %pK\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* 
CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..1c209aa43733 --- /dev/null +++ b/kernel/cgroup/debug.c @@ -0,0 +1,153 @@ +#include +#include +#include + +#include "cgroup-internal.h" + +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +/* + * debug_taskcount_read - return the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = refcount_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = 
debug_files, +}; -- cgit From 23b0be480f341db26ce0dee7d3f6e67f8e0e166f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:03 -0400 Subject: cgroup: Make Kconfig prompt of debug cgroup more accurate Make the Kconfig prompt and description of the debug cgroup controller more accurate by saying that it is for debugging purposes only and that its interfaces are unstable. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- init/Kconfig | 7 +++++-- kernel/cgroup/debug.c | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 1d3475fc9496..5f9d13929ae0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1205,11 +1205,14 @@ config CGROUP_BPF inet sockets. config CGROUP_DEBUG - bool "Example controller" + bool "Debug controller" default n + depends on DEBUG_KERNEL help This option enables a simple controller that exports - debugging information about the cgroups framework. + debugging information about the cgroups framework. This + controller is for control cgroup debugging only. Its + interfaces are not stable. Say N. diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 1c209aa43733..cbe77a2087c5 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,9 @@ +/* + * Debug controller + * + * WARNING: This controller is for cgroup core debugging only. + * Its interfaces are unstable and subject to changes at any time. + */ #include #include #include -- cgit From 575313f40ff33d0c2aff2701dfb2ccfcd6211d55 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:04 -0400 Subject: cgroup: Make debug cgroup support v2 and thread mode Besides supporting cgroup v2 and thread mode, the following changes are also made: 1) current_* cgroup files now reside only at the root as we don't need duplicated files of the same function all over the cgroup hierarchy. 2) The cgroup_css_links_read() function is modified to report the number of tasks that are skipped because of overflow. 3) The number of extra unaccounted references is displayed. 4) The current_css_set_read() function now prints out the addresses of the css'es associated with the current css_set. 5) A new cgroup_subsys_states file is added to display the css objects associated with a cgroup. 6) A new cgroup_masks file is added to display the various controller bit masks in the cgroup. tj: Dropped thread mode related information for now so that debug controller changes aren't blocked on the thread mode.
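For illustration (an editorial sketch, not part of the original posting; the controller names shown are hypothetical), reading the new cgroup_masks file produces one line per mask, formatted by cgroup_masks_read() in the diff below:

	subtree_control  : cpu, memory
	subtree_ss_mask  : cpu, memory, pids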
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 170 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 153 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index cbe77a2087c5..057d9b07f461 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -36,10 +36,37 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int current_css_set_read(struct seq_file *seq, void *v) { - return (u64)(unsigned long)current->cgroups; + struct css_set *cset; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + int i, refcnt; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + refcnt = refcount_read(&cset->refcount); + seq_printf(seq, "css_set %pK %d", cset, refcnt); + if (refcnt > cset->nr_tasks) + seq_printf(seq, " +%d", refcnt - cset->nr_tasks); + seq_puts(seq, "\n"); + + /* + * Print the css'es stored in the current css_set. + */ + for_each_subsys(ss, i) { + css = cset->subsys[ss->id]; + if (!css) + continue; + seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, + (unsigned long)css, css->id); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + return 0; } static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, @@ -84,31 +111,126 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; + int dead_cnt = 0, extra_refs = 0; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, "css_set %pK\n", cset); + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } + seq_puts(seq, "\n"); list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } - continue; - overflow: - seq_puts(seq, " ...\n"); + /* show # of overflowed tasks */ + if (count > MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " ... 
(%d)\n", + count - MAX_TASKS_SHOWN_PER_CSS); + + if (cset->dead) { + seq_puts(seq, " [dead]\n"); + dead_cnt++; + } + + WARN_ON(count != cset->nr_tasks); } spin_unlock_irq(&css_set_lock); + + if (!dead_cnt && !extra_refs) + return 0; + + seq_puts(seq, "\n"); + if (extra_refs) + seq_printf(seq, "extra references = %d\n", extra_refs); + if (dead_cnt) + seq_printf(seq, "dead css_sets = %d\n", dead_cnt); + + return 0; +} + +static int cgroup_subsys_states_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + char pbuf[16]; + int i; + + mutex_lock(&cgroup_mutex); + for_each_subsys(ss, i) { + css = rcu_dereference_check(cgrp->subsys[ss->id], true); + if (!css) + continue; + + pbuf[0] = '\0'; + + /* Show the parent CSS if applicable*/ + if (css->parent) + snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", + css->parent->id); + seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, + (unsigned long)css, css->id, + atomic_read(&css->online_cnt), pbuf); + } + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + int i, j; + struct { + u16 *mask; + char *name; + } mask_list[] = { + { &cgrp->subtree_control, "subtree_control" }, + { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, + }; + + mutex_lock(&cgroup_mutex); + for (i = 0; i < ARRAY_SIZE(mask_list); i++) { + u16 mask = *mask_list[i].mask; + bool first = true; + + seq_printf(seq, "%-17s: ", mask_list[i].name); + for_each_subsys(ss, j) { + if (!(mask & (1 << ss->id))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; + } + seq_putc(seq, '\n'); + } + + mutex_unlock(&cgroup_mutex); return 0; } @@ -126,17 +248,20 @@ static struct cftype debug_files[] = { { .name = "current_css_set", - .read_u64 = current_css_set_read, + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_refcount", .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_cg_links", .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { @@ -144,6 +269,16 @@ static struct cftype debug_files[] = { .seq_show = cgroup_css_links_read, }, + { + .name = "cgroup_subsys_states", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "cgroup_masks", + .seq_show = cgroup_masks_read, + }, + { .name = "releasable", .read_u64 = releasable_read, @@ -153,7 +288,8 @@ static struct cftype debug_files[] = { }; struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, + .dfl_cftypes = debug_files, }; -- cgit From 8cc38fa7fa317d44710f24475576b1f9ee205da9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:32 -0400 Subject: cgroup: make debug an implicit controller on cgroup2 Make debug an implicit controller on cgroup2 which is enabled by "cgroup_debug" boot param. 
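An illustrative usage sketch (editorial; the mount point and the "debug." file-name prefix are assumptions based on the usual cgroup2 conventions):

	# boot with "cgroup_debug" on the kernel command line,
	# then, with cgroup2 mounted at /sys/fs/cgroup:
	# cat /sys/fs/cgroup/debug.masks
	subtree_control  :
	subtree_ss_mask  :

Because the controller is implicit, "debug" is not listed in cgroup.controllers and cannot be enabled through cgroup.subtree_control.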
Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 057d9b07f461..d61e692a5338 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -240,7 +240,7 @@ static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) !css_has_online_children(&css->cgroup->self)); } -static struct cftype debug_files[] = { +static struct cftype debug_legacy_files[] = { { .name = "taskcount", .read_u64 = debug_taskcount_read, @@ -287,9 +287,62 @@ static struct cftype debug_files[] = { { } /* terminate */ }; +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "csses", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "masks", + .seq_show = cgroup_masks_read, + }, + + { } /* terminate */ +}; + struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .legacy_cftypes = debug_files, - .dfl_cftypes = debug_files, + .legacy_cftypes = debug_legacy_files, }; + +/* + * On v2, debug is an implicit controller enabled by "cgroup_debug" boot + * parameter. + */ +static int __init enable_cgroup_debug(char *str) +{ + debug_cgrp_subsys.dfl_cftypes = debug_files; + debug_cgrp_subsys.implicit_on_dfl = true; + return 1; +} +__setup("cgroup_debug", enable_cgroup_debug); -- cgit From 2866c0b4cf25274040f0b4cc045ad1f44b885dd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:36 -0400 Subject: cgroup: refactor cgroup_masks_read() in the debug controller Factor out cgroup_masks_read_one() out of cgroup_masks_read() for simplicity. 
Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index d61e692a5338..163fdbd7adf6 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -200,36 +200,32 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) return 0; } -static int cgroup_masks_read(struct seq_file *seq, void *v) +static void cgroup_masks_read_one(struct seq_file *seq, const char *name, + u16 mask) { - struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss; - int i, j; - struct { - u16 *mask; - char *name; - } mask_list[] = { - { &cgrp->subtree_control, "subtree_control" }, - { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, - }; + int ssid; + bool first = true; - mutex_lock(&cgroup_mutex); - for (i = 0; i < ARRAY_SIZE(mask_list); i++) { - u16 mask = *mask_list[i].mask; - bool first = true; - - seq_printf(seq, "%-17s: ", mask_list[i].name); - for_each_subsys(ss, j) { - if (!(mask & (1 << ss->id))) - continue; - if (!first) - seq_puts(seq, ", "); - seq_puts(seq, ss->name); - first = false; - } - seq_putc(seq, '\n'); + seq_printf(seq, "%-17s: ", name); + for_each_subsys(ss, ssid) { + if (!(mask & (1 << ssid))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; } + seq_putc(seq, '\n'); +} +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + mutex_lock(&cgroup_mutex); + cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); + cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); mutex_unlock(&cgroup_mutex); return 0; } -- cgit From b6053d40e3387c916d56d319ba6bfdb2c42a67c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:41 -0400 Subject: cgroup: fix lockdep warning in debug controller The debug controller grabs cgroup_mutex from interface file show functions which can deadlock and triggers lockdep warnings. Fix it by using cgroup_kn_lock_live()/cgroup_kn_unlock() instead. 
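In schematic form (condensed from the hunks below; the function name and body comment here are editorial), each show function now goes through the kernfs helpers instead of taking cgroup_mutex directly:

	static int debug_show(struct seq_file *seq, void *v)
	{
		struct kernfs_open_file *of = seq->private;
		struct cgroup *cgrp;

		cgrp = cgroup_kn_lock_live(of->kn, false);
		if (!cgrp)
			return -ENODEV;

		/* ... emit debug state; cgroup_mutex is held here ... */

		cgroup_kn_unlock(of->kn);
		return 0;
	}

cgroup_kn_lock_live() drops the kernfs active protection before taking cgroup_mutex, which is what breaks the dependency chain lockdep complained about.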
Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 163fdbd7adf6..dac46af22782 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -38,12 +38,15 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, static int current_css_set_read(struct seq_file *seq, void *v) { + struct kernfs_open_file *of = seq->private; struct css_set *cset; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; int i, refcnt; - mutex_lock(&cgroup_mutex); + if (!cgroup_kn_lock_live(of->kn, false)) + return -ENODEV; + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); @@ -65,7 +68,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return 0; } @@ -174,13 +177,17 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static int cgroup_subsys_states_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; char pbuf[16]; int i; - mutex_lock(&cgroup_mutex); + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + for_each_subsys(ss, i) { css = rcu_dereference_check(cgrp->subsys[ss->id], true); if (!css) @@ -196,7 +203,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) (unsigned long)css, css->id, atomic_read(&css->online_cnt), pbuf); } - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } @@ -221,12 +229,17 @@ static void cgroup_masks_read_one(struct seq_file *seq, const char *name, static int cgroup_masks_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; - mutex_lock(&cgroup_mutex); cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } -- cgit From 33e4f80ee69b5168badf37edbfed796eb48434b9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Jun 2017 22:56:34 +0200 Subject: ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For these reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops.
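The resulting callback set (matching the include/linux/suspend.h hunk further below; the trailing comments are editorial):

	struct platform_freeze_ops {
		int (*begin)(void);
		int (*prepare)(void);
		void (*wake)(void);	/* called after freeze_enter() returns */
		void (*sync)(void);	/* flush pending wakeup event queues */
		void (*restore)(void);
		void (*end)(void);
	};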
In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled. However, to preserve the existing behavior with respect to suspend-to-RAM, this only is done in the suspend-to-idle case and only if an SCI has occurred while suspended. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/battery.c | 2 +- drivers/acpi/button.c | 5 +++-- drivers/acpi/device_pm.c | 9 ++++++++- drivers/acpi/internal.h | 2 ++ drivers/acpi/sleep.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/base/power/main.c | 5 ----- drivers/base/power/wakeup.c | 18 ++++++++++++------ include/acpi/acpi_bus.h | 6 +++++- include/linux/suspend.h | 7 +++++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 35 +++++++++++++++++++++++++++++------ 11 files changed, 103 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index d42eeef9d928..1cbb88d938e5 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -782,7 +782,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume) if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) || (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) && (battery->capacity_now <= battery->alarm))) - pm_wakeup_event(&battery->device->dev, 0); + acpi_pm_wakeup_event(&battery->device->dev); return result; } diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index e19f530f1083..91cfdf377df7 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -217,7 +217,7 @@ static int acpi_lid_notify_state(struct acpi_device *device, int state) } if (state) - pm_wakeup_event(&device->dev, 0); + acpi_pm_wakeup_event(&device->dev); ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device); if (ret == NOTIFY_DONE) @@ -402,7 +402,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) } else { int keycode; - pm_wakeup_event(&device->dev, 0); + acpi_pm_wakeup_event(&device->dev); if (button->suspended) break; @@ -534,6 +534,7 @@ static int acpi_button_add(struct acpi_device *device) lid_device = device; } + device_init_wakeup(&device->dev, true); printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); return 0; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index f13c62c4b117..ca0210213773 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -385,6 +386,12 @@ EXPORT_SYMBOL(acpi_bus_power_manageable); #ifdef CONFIG_PM static DEFINE_MUTEX(acpi_pm_notifier_lock); +void acpi_pm_wakeup_event(struct device *dev) +{ + pm_wakeup_dev_event(dev, 0, acpi_s2idle_wakeup()); +} +EXPORT_SYMBOL_GPL(acpi_pm_wakeup_event); + static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) { struct acpi_device *adev; @@ -399,7 +406,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) 
mutex_lock(&acpi_pm_notifier_lock); if (adev->wakeup.flags.notifier_present) { - __pm_wakeup_event(adev->wakeup.ws, 0); + pm_wakeup_ws_event(adev->wakeup.ws, 0, acpi_s2idle_wakeup()); if (adev->wakeup.context.func) adev->wakeup.context.func(&adev->wakeup.context); } diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 66229ffa909b..75924ea69071 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -198,8 +198,10 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); Suspend/Resume -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT +extern bool acpi_s2idle_wakeup(void); extern int acpi_sleep_init(void); #else +static inline bool acpi_s2idle_wakeup(void) { return false; } static inline int acpi_sleep_init(void) { return -ENXIO; } #endif diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a4782c75ebdd..555de11a56b6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -650,6 +650,8 @@ static const struct platform_suspend_ops acpi_suspend_ops_old = { .recover = acpi_pm_finish, }; +static bool s2idle_wakeup; + static int acpi_freeze_begin(void) { acpi_scan_lock_acquire(); @@ -666,6 +668,33 @@ static int acpi_freeze_prepare(void) return 0; } +static void acpi_freeze_wake(void) +{ + /* + * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means + * that the SCI has triggered while suspended, so cancel the wakeup in + * case it has not been a wakeup event (the GPEs will be checked later). + */ + if (acpi_sci_irq_valid() && + !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) { + pm_system_cancel_wakeup(); + s2idle_wakeup = true; + } +} + +static void acpi_freeze_sync(void) +{ + /* + * Process all pending events in case there are any wakeup ones. + * + * The EC driver uses the system workqueue, so that one needs to be + * flushed too. + */ + acpi_os_wait_events_complete(); + flush_scheduled_work(); + s2idle_wakeup = false; +} + static void acpi_freeze_restore(void) { if (acpi_sci_irq_valid()) @@ -682,6 +711,8 @@ static void acpi_freeze_end(void) static const struct platform_freeze_ops acpi_freeze_ops = { .begin = acpi_freeze_begin, .prepare = acpi_freeze_prepare, + .wake = acpi_freeze_wake, + .sync = acpi_freeze_sync, .restore = acpi_freeze_restore, .end = acpi_freeze_end, }; @@ -700,9 +731,15 @@ static void acpi_sleep_suspend_setup(void) } #else /* !CONFIG_SUSPEND */ +#define s2idle_wakeup (false) static inline void acpi_sleep_suspend_setup(void) {} #endif /* !CONFIG_SUSPEND */ +bool acpi_s2idle_wakeup(void) +{ + return s2idle_wakeup; +} + #ifdef CONFIG_PM_SLEEP static u32 saved_bm_rld; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 253f860e8981..ef5b6a6e5045 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1095,11 +1095,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (async_error) goto Complete; - if (pm_wakeup_pending()) { - async_error = -EBUSY; - goto Complete; - } - if (dev->power.syscore || dev->power.direct_complete) goto Complete; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index c313b600d356..9c36b27996fc 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -28,8 +28,8 @@ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ unsigned int pm_wakeup_irq __read_mostly; -/* If set and the system is suspending, terminate the suspend. 
*/ -static bool pm_abort_suspend __read_mostly; +/* If greater than 0 and the system is suspending, terminate the suspend. */ +static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. @@ -855,20 +855,26 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret || pm_abort_suspend; + return ret || atomic_read(&pm_abort_suspend) > 0; } void pm_system_wakeup(void) { - pm_abort_suspend = true; + atomic_inc(&pm_abort_suspend); freeze_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); -void pm_wakeup_clear(void) +void pm_system_cancel_wakeup(void) +{ + atomic_dec(&pm_abort_suspend); +} + +void pm_wakeup_clear(bool reset) { - pm_abort_suspend = false; pm_wakeup_irq = 0; + if (reset) + atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 79c0af419300..63a90a624a0f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -598,15 +598,19 @@ static inline bool acpi_device_always_present(struct acpi_device *adev) #endif #ifdef CONFIG_PM +void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_device_run_wake(struct device *, bool); #else +static inline void acpi_pm_wakeup_event(struct device *dev) +{ +} static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, - void (*work_func)(struct work_struct *work)) + void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0b1cf32edfd7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... 
"); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..3ecf275d7e44 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -84,11 +86,9 @@ static void freeze_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); - pr_debug("PM: suspend-to-idle\n"); /* Make the current CPU wait so it can enter the idle loop too. */ wait_event(suspend_freeze_wait_head, suspend_freeze_state == FREEZE_STATE_WAKE); - pr_debug("PM: resume from suspend-to-idle\n"); cpuidle_pause(); put_online_cpus(); @@ -98,6 +98,31 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + pr_debug("PM: suspend-to-idle\n"); + + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); + + pr_debug("PM: resume from suspend-to-idle\n"); } void freeze_wake(void) @@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); -- cgit From cd33f5f2cbfaadc21270f3ddac7c3c33e0a1a28c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 11:53:09 -0400 Subject: audit: make sure we never skip the multicast broadcast When the auditd connection is reset, either intentionally or due to a failure, any records that were in the main backlog queue would not be sent in a multicast broadcast. This patch fixes this problem by not flushing the main backlog queue on a connection reset, the main kauditd_thread() will take care of that normally. 
Resolves: https://github.com/linux-audit/audit-kernel/issues/41 Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e1e2b3abfb93..7cad70214b81 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -605,11 +605,10 @@ static void auditd_reset(const struct auditd_connection *ac) if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); - /* flush all of the main and retry queues to the hold queue */ + /* flush the retry queue to the hold queue, but don't touch the main + * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb); - while ((skb = skb_dequeue(&audit_queue))) - kauditd_hold_skb(skb); } /** -- cgit From feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jun 2017 17:04:55 -0400 Subject: tracing: Show address when function names are not found Currently, when a function is not found in kallsyms, instead of simply showing the function address, it shows nothing at all: # echo ':mod:kvm_intel' > /sys/kernel/tracing/set_ftrace_filter # echo function > /sys/kernel/tracing/current_tracer # qemu -enable-kvm /home/my-qemu-image # rmmod kvm_intel # cat /sys/kernel/tracing/trace qemu-system-x86-2408 [001] d..2 135.013238: <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: <-__do_cpuid_ent When it should show: qemu-system-x86-2408 [001] d..2 135.013238: 0xffffffffa02a39f0 <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: 0xffffffffa02a2ba0 <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: 0xffffffffa029e4e0 <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: 0xffffffffa02a1380 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e160 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e180 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e520 <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: 0xffffffffa02a13b0 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: 0xffffffffa02a1380 <-__do_cpuid_ent instead.
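The fix, condensed from the trace_output.c diff below, is to fall back to printing the raw address whenever the kallsyms lookup yields an empty name:

	name = kretprobed(str);
	if (name && strlen(name)) {
		trace_seq_printf(s, fmt, name);
		return;
	}
	/* no symbol found, print the raw address instead */
	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
	trace_seq_printf(s, fmt, str);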
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 08f9bab8089e..01ff99969ca7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; sprint_symbol(str, address); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } #ifndef CONFIG_64BIT -- cgit From 239946314e57711d7da546b67964d0b387a3ee42 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jun 2017 15:07:39 -0700 Subject: bpf: possibly avoid extra masking for narrower load in verifier Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") permits narrower load for certain ctx fields. The commit however will already generate a masking even if the prog-specific ctx conversion produces the result with narrower size. For example, for __sk_buff->protocol, the ctx conversion loads the data into register with 2-byte load. A narrower 2-byte load should not generate masking. For __sk_buff->vlan_present, the conversion function set the result as either 0 or 1, essentially a byte. The narrower 2-byte or 1-byte load should not generate masking. To avoid unnecessary masking, prog-specific *_is_valid_access now passes converted_op_size back to verifier, which indicates the valid data width after perceived future conversion. Based on this information, verifier is able to avoid unnecessary marking. Since we want more information back from prog-specific *_is_valid_access checking, all of them are packed into one data structure for more clarity. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/bpf.h | 11 +++++- include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 29 ++++++++++---- kernel/trace/bpf_trace.c | 17 +++++--- net/core/filter.c | 92 +++++++++++++++++++++++++------------------- 5 files changed, 97 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1bcbf0a71f75..deca4e7f2845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -149,6 +149,15 @@ enum bpf_reg_type { struct bpf_prog; +/* The information passed from prog-specific *_is_valid_access + * back to the verifier. 
+ */ +struct bpf_insn_access_aux { + enum bpf_reg_type reg_type; + int ctx_field_size; + int converted_op_size; +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -157,7 +166,7 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size); + struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 189741c0da85..621076f56251 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -73,7 +73,8 @@ struct bpf_insn_aux_data { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; - int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */ + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ + int converted_op_size; /* the valid value width after perceived conversion */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44b97d958fb7..74ea96ea391b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -761,22 +761,34 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - int ctx_field_size = 0; + struct bpf_insn_access_aux info = { .reg_type = *reg_type }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { - /* a non zero ctx_field_size indicates: + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + /* a non zero info.ctx_field_size indicates: * . For this field, the prog type specific ctx conversion algorithm * only supports whole field access. * . This ctx access is a candiate for later verifier transformation * to load the whole field and then apply a mask to get correct result. + * a non zero info.converted_op_size indicates perceived actual converted + * value width in convert_ctx_access. 
*/ - if (ctx_field_size) - env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + if ((info.ctx_field_size && !info.converted_op_size) || + (!info.ctx_field_size && info.converted_op_size)) { + verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", + env->prog->type, off, size); + return -EACCES; + } + + if (info.ctx_field_size) { + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; + } + *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3388,7 +3400,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; + int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3431,7 +3443,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) off = insn->off; size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - is_narrower_load = (type == BPF_READ && size < ctx_field_size); + converted_op_size = env->insn_aux_data[i + delta].converted_op_size; + is_narrower_load = type == BPF_READ && size < ctx_field_size; /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific @@ -3453,7 +3466,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load) { + if (is_narrower_load && size < converted_op_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9d3ec8253131..97c46b440cd6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,7 +581,7 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { int sample_period_off; @@ -595,12 +595,17 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - *ctx_field_size = 8; + int allowed; + #ifdef __LITTLE_ENDIAN - return (off & 0x7) == 0 && size <= 8 && (size & (size - 
1)) == 0; + allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; #else - return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; + allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; #endif + if (!allowed) + return false; + info->ctx_field_size = 8; + info->converted_op_size = 8; } else { if (size != sizeof(long)) return false; diff --git a/net/core/filter.c b/net/core/filter.c index 60ed6f343a63..4b788007415f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2856,8 +2856,37 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } +static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info) +{ + info->ctx_field_size = 4; + switch (off) { + case offsetof(struct __sk_buff, pkt_type) ... + offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_present) ... + offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1: + info->converted_op_size = 1; + break; + case offsetof(struct __sk_buff, queue_mapping) ... + offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, protocol) ... + offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_tci) ... + offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_proto) ... + offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, tc_index) ... + offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + info->converted_op_size = 2; + break; + default: + info->converted_op_size = 4; + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2875,24 +2904,32 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, break; case offsetof(struct __sk_buff, data) ... offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + if (size != sizeof(__u32)) + return false; + info->reg_type = PTR_TO_PACKET; + break; case offsetof(struct __sk_buff, data_end) ... offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: if (size != sizeof(__u32)) return false; + info->reg_type = PTR_TO_PACKET_END; break; default: - /* permit narrower load for not cb/data/data_end fields */ - *ctx_field_size = 4; if (type == BPF_WRITE) { if (size != sizeof(__u32)) return false; } else { - if (size != sizeof(__u32)) + int allowed; + + /* permit narrower load for not cb/data/data_end fields */ #ifdef __LITTLE_ENDIAN - return (off & 0x3) == 0 && (size == 1 || size == 2); + allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0; #else - return (off & 0x3) + size == 4 && (size == 1 || size == 2); + allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0; #endif + if (!allowed) + return false; + __set_access_aux_info(off, info); } } @@ -2901,8 +2938,7 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { switch (off) { case offsetof(struct __sk_buff, tc_classid) ... 
@@ -2924,13 +2960,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { switch (off) { case offsetof(struct __sk_buff, tc_classid) ... @@ -2950,22 +2985,12 @@ static bool lwt_is_valid_access(int off, int size, } } - switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; - break; - } - - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -3028,8 +3053,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -3045,16 +3069,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, } } - switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; - break; - } - - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3071,18 +3086,17 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; switch (off) { case offsetof(struct xdp_md, data): - *reg_type = PTR_TO_PACKET; + info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_end): - *reg_type = PTR_TO_PACKET_END; + info->reg_type = PTR_TO_PACKET_END; break; } -- cgit From f59dd9c886acb3abb188e8e94a99436560976835 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:02 -0700 Subject: time: add get_timespec64 and put_timespec64 Add helper functions to convert between struct timespec64 and struct timespec at userspace boundaries. This is a preparatory patch to use timespec64 as the basic type internally in the kernel as timespec is not y2038 safe on 32 bit systems. The patch helps the cause by containing all data conversions at the userspace boundaries within these functions. 
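As an illustration of that boundary pattern, a minimal sketch of a caller (hypothetical syscall, for illustration only, not part of this patch; it assumes just the two helpers below plus the existing timespec64_valid()):

/* Hypothetical example: struct timespec exists only at the copy step. */
SYSCALL_DEFINE1(example_settime, const struct timespec __user *, utp)
{
	struct timespec64 ts;

	if (get_timespec64(&ts, utp))	/* copy in and widen at the boundary */
		return -EFAULT;
	if (!timespec64_valid(&ts))
		return -EINVAL;
	/* ... everything past this point works on timespec64 ... */
	return 0;
}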
Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/compat.h | 2 ++ include/linux/time.h | 5 +++++ kernel/compat.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/time.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 425563c7647b..3eb04016ffa9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -164,6 +164,8 @@ extern int compat_get_timespec(struct timespec *, const void __user *); extern int compat_put_timespec(const struct timespec *, void __user *); extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); +extern int compat_get_timespec64(struct timespec64 *, const void __user *); +extern int compat_put_timespec64(const struct timespec64 *, void __user *); /* * This function convert a timespec if necessary and returns a *user diff --git a/include/linux/time.h b/include/linux/time.h index c0543f5f25de..36afb579495f 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -8,6 +8,11 @@ extern struct timezone sys_tz; +int get_timespec64(struct timespec64 *ts, + const struct timespec __user *uts); +int put_timespec64(const struct timespec64 *ts, + struct timespec __user *uts); + #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) static inline int timespec_equal(const struct timespec *a, diff --git a/kernel/compat.c b/kernel/compat.c index ebd8bdc3fd68..73f26ba44a8a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +static int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +static int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? 
-EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) diff --git a/kernel/time/time.c b/kernel/time/time.c index 7c89e437c4d7..adb9853ca6b0 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -890,3 +890,31 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } + +int get_timespec64(struct timespec64 *ts, + const struct timespec __user *uts) +{ + struct timespec kts; + int ret; + + ret = copy_from_user(&kts, uts, sizeof(kts)); + if (ret) + return -EFAULT; + + ts->tv_sec = kts.tv_sec; + ts->tv_nsec = kts.tv_nsec; + + return 0; +} +EXPORT_SYMBOL_GPL(get_timespec64); + +int put_timespec64(const struct timespec64 *ts, + struct timespec __user *uts) +{ + struct timespec kts = { + .tv_sec = ts->tv_sec, + .tv_nsec = ts->tv_nsec + }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_timespec64); -- cgit From d5b7ffbfbdacc29e4db035f90665951668fa9c58 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:03 -0700 Subject: time: introduce {get,put}_itimerspec64 As we change the user space type for the timerfd and posix timer functions to newer data types, we need some form of conversion helpers to avoid duplicating that logic. Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/compat.h | 4 ++++ include/linux/posix-timers.h | 1 - include/linux/time.h | 13 +++++++++++++ kernel/compat.c | 21 +++++++++++++++++++++ kernel/time/time.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 3eb04016ffa9..2ed54020ace0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -166,6 +166,10 @@ extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); extern int compat_get_timespec64(struct timespec64 *, const void __user *); extern int compat_put_timespec64(const struct timespec64 *, void __user *); +extern int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits); +extern int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits); /* * This function convert a timespec if necessary and returns a *user diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 29f1b7f09ced..62839fd04dce 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -113,5 +113,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); void posixtimer_rearm(struct siginfo *info); - #endif diff --git a/include/linux/time.h b/include/linux/time.h index 36afb579495f..f9858d7e6361 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -12,6 +12,10 @@ int get_timespec64(struct timespec64 *ts, const struct timespec __user *uts); int put_timespec64(const struct timespec64 *ts, struct timespec __user *uts); +int get_itimerspec64(struct itimerspec64 *it, + const struct itimerspec __user *uit); +int put_itimerspec64(const struct itimerspec64 *it, + struct itimerspec __user *uit); #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) @@ -275,4 +279,13 @@ static __always_inline void timespec_add_ns(struct 
timespec *a, u64 ns) a->tv_nsec = ns; } +static inline bool itimerspec64_valid(const struct itimerspec64 *its) +{ + if (!timespec64_valid(&(its->it_interval)) || + !timespec64_valid(&(its->it_value))) + return false; + + return true; +} + #endif diff --git a/kernel/compat.c b/kernel/compat.c index 73f26ba44a8a..a350deda503a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -586,6 +586,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } +int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits) +{ + + if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || + __compat_get_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_compat_itimerspec64); + +int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits) +{ + if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || + __compat_put_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_compat_itimerspec64); + /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/time.c b/kernel/time/time.c index adb9853ca6b0..44a8c1402133 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -918,3 +918,33 @@ int put_timespec64(const struct timespec64 *ts, return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); + +int get_itimerspec64(struct itimerspec64 *it, + const struct itimerspec __user *uit) +{ + int ret; + + ret = get_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = get_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(get_itimerspec64); + +int put_itimerspec64(const struct itimerspec64 *it, + struct itimerspec __user *uit) +{ + int ret; + + ret = put_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = put_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(put_itimerspec64); -- cgit From 63a766a1780f9581e8885bdb64270a594a84f81a Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:04 -0700 Subject: posix-stubs: Conditionally include COMPAT_SYS_NI defines These apis only need to be defined if CONFIG_COMPAT is enabled. 
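Taken together with the {get,put}_itimerspec64 helpers above, the native syscall path ends up looking roughly like the following sketch (hypothetical syscall, illustration only, not from these patches):

/* Hypothetical example of the itimerspec64 boundary pattern. */
SYSCALL_DEFINE4(example_timer_settime, timer_t, timer_id, int, flags,
		const struct itimerspec __user *, new_setting,
		struct itimerspec __user *, old_setting)
{
	struct itimerspec64 new64, old64 = {};

	if (get_itimerspec64(&new64, new_setting))
		return -EFAULT;
	if (!itimerspec64_valid(&new64))
		return -EINVAL;
	/* ... arm the timer from new64, saving the old state in old64 ... */
	if (old_setting && put_itimerspec64(&old64, old_setting))
		return -EFAULT;
	return 0;
}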
Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 38f3b20efa29..65878221cbfb 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -41,12 +41,6 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif -COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC @@ -138,6 +132,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, } #ifdef CONFIG_COMPAT +COMPAT_SYS_NI(timer_create); +COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); + COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { -- cgit From 1ba5c08b58a0c21fca222f1bf2fde184aa26103f Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 6 Jun 2017 14:17:39 +0200 Subject: kernel/module.c: suppress warning about unused nowarn variable This patch fix the following warning: kernel/module.c: In function 'add_usage_links': kernel/module.c:1653:6: warning: variable 'nowarn' set but not used [-Wunused-but-set-variable] [jeyu: folded in first patch since it only swapped the function order so that del_usage_links can be called from add_usage_links] Signed-off-by: Corentin Labbe Signed-off-by: Jessica Yu --- kernel/module.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3803449ca219..f546d574f436 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1666,31 +1666,36 @@ static inline void remove_notes_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static void add_usage_links(struct module *mod) +static void del_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; - int nowarn; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); mutex_unlock(&module_mutex); #endif } -static void del_usage_links(struct module *mod) +static int add_usage_links(struct module *mod) { + int ret = 0; #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); #endif + return ret; } static int module_add_modinfo_attrs(struct module *mod) @@ -1801,13 +1806,18 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; - add_usage_links(mod); + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + add_sect_attrs(mod, info); add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod); out_unreg_param: 
module_param_sysfs_remove(mod); out_unreg_holders: -- cgit From 673feb9d76ab3eddde7acfd94b206e321cfc90b9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 15:26:26 -0400 Subject: ftrace: Add :mod: caching infrastructure to trace_array This is the start of the infrastructure work to allow for tracing module functions before it is loaded. Currently the following command: # echo :mod:some-mod > set_ftrace_filter will enable tracing of all functions within the module "some-mod" if it is loaded. What we want, is if the module is not loaded, that line will be saved. When the module is loaded, then the "some-mod" will have that line executed on it, so that the functions within it starts being traced. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 12 +++++ 2 files changed, 148 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e5841dc14b5..1867edec6269 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1293,6 +1293,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) FTRACE_WARN_ON(hash->count); } +static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) +{ + list_del(&ftrace_mod->list); + kfree(ftrace_mod->module); + kfree(ftrace_mod->func); + kfree(ftrace_mod); +} + +static void clear_ftrace_mod_list(struct list_head *head) +{ + struct ftrace_mod_load *p, *n; + + /* stack tracer isn't supported yet */ + if (!head) + return; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(p, n, head, list) + free_ftrace_mod(p); + mutex_unlock(&ftrace_lock); +} + static void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) @@ -1346,6 +1368,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return hash; } + +static int ftrace_add_mod(struct trace_array *tr, + const char *func, const char *module, + int enable) +{ + struct ftrace_mod_load *ftrace_mod; + struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; + + ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + if (!ftrace_mod) + return -ENOMEM; + + ftrace_mod->func = kstrdup(func, GFP_KERNEL); + ftrace_mod->module = kstrdup(module, GFP_KERNEL); + ftrace_mod->enable = enable; + + if (!ftrace_mod->func || !ftrace_mod->module) + goto out_free; + + list_add(&ftrace_mod->list, mod_head); + + return 0; + + out_free: + free_ftrace_mod(ftrace_mod); + + return -ENOMEM; +} + static struct ftrace_hash * alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { @@ -3457,6 +3508,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, { struct ftrace_iterator *iter; struct ftrace_hash *hash; + struct list_head *mod_head; + struct trace_array *tr = ops->private; int ret = 0; ftrace_ops_init(ops); @@ -3478,18 +3531,23 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, mutex_lock(&ops->func_hash->regex_lock); - if (flag & FTRACE_ITER_NOTRACE) + if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - else + mod_head = tr ? &tr->mod_trace : NULL; + } else { hash = ops->func_hash->filter_hash; + mod_head = tr ? 
&tr->mod_notrace : NULL; + } if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; - if (file->f_flags & O_TRUNC) + if (file->f_flags & O_TRUNC) { iter->hash = alloc_ftrace_hash(size_bits); - else + clear_ftrace_mod_list(mod_head); + } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + } if (!iter->hash) { trace_parser_put(&iter->parser); @@ -3761,6 +3819,68 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return ret; } +static bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + const char this_mod[] = "__this_module"; + const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; + char modname[modname_size + 1]; + unsigned long val; + int n; + + n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + + if (n > modname_size) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static int cache_mod(struct trace_array *tr, + const char *func, char *module, int enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; + int ret; + + mutex_lock(&ftrace_lock); + + /* We do not cache inverse filters */ + if (func[0] == '!') { + func++; + ret = -EINVAL; + + /* Look to remove this hash */ + list_for_each_entry_safe(ftrace_mod, n, head, list) { + if (strcmp(ftrace_mod->module, module) != 0) + continue; + + /* no func matches all */ + if (!func || strcmp(func, "*") == 0 || + (ftrace_mod->func && + strcmp(ftrace_mod->func, func) == 0)) { + ret = 0; + free_ftrace_mod(ftrace_mod); + continue; + } + } + goto out; + } + + ret = -EINVAL; + /* We only care about modules that have not been loaded yet */ + if (module_exists(module)) + goto out; + + /* Save this string off, and execute it when the module is loaded */ + ret = ftrace_add_mod(tr, func, module, enable); + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -3768,10 +3888,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static int ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *func, char *cmd, char *module, int enable) + char *func_orig, char *cmd, char *module, int enable) { + char *func; int ret; + /* match_records() modifies func, and we need the original */ + func = kstrdup(func_orig, GFP_KERNEL); + if (!func) + return -ENOMEM; + /* * cmd == 'mod' because we only registered this func * for the 'mod' ftrace_func_command. @@ -3780,8 +3906,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, * parameter. 
*/ ret = match_records(hash, func, strlen(func), module); + kfree(func); + if (!ret) - return -EINVAL; + return cache_mod(tr, func_orig, module, enable); if (ret < 0) return ret; return 0; @@ -5570,6 +5698,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { INIT_LIST_HEAD(&tr->func_probes); + INIT_LIST_HEAD(&tr->mod_trace); + INIT_LIST_HEAD(&tr->mod_notrace); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 69a3ab3ee4f5..d63550cdbdfa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -263,7 +263,10 @@ struct trace_array { struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; #ifdef CONFIG_DYNAMIC_FTRACE + /* All of these are protected by the ftrace_lock */ struct list_head func_probes; + struct list_head mod_trace; + struct list_head mod_notrace; #endif /* function tracing enabled */ int function_enabled; @@ -761,6 +764,15 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash; + +struct ftrace_mod_load { + struct list_head list; + char *func; + char *module; + int enable; +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; -- cgit From 5985ea8bd5d1b820b909af49fbc2767a990080a6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 16:05:11 -0400 Subject: ftrace: Have the cached module list show in set_ftrace_filter When writing in a module filter into set_ftrace_filter for a module that is not yet loaded, it it cached, and will be executed when the module is loaded (although that is not implemented yet at this commit). Display the list of cached modules to be traced. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +- kernel/trace/ftrace.c | 112 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 102 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1b6992e994e6..9fb9a67dc9d4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -446,7 +446,8 @@ enum { FTRACE_ITER_PRINTALL = (1 << 2), FTRACE_ITER_DO_PROBES = (1 << 3), FTRACE_ITER_PROBE = (1 << 4), - FTRACE_ITER_ENABLED = (1 << 5), + FTRACE_ITER_MOD = (1 << 5), + FTRACE_ITER_ENABLED = (1 << 6), }; void arch_ftrace_update_code(int command); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1867edec6269..bfdbce78064b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3112,6 +3112,7 @@ ftrace_allocate_pages(unsigned long num_to_init) struct ftrace_iterator { loff_t pos; loff_t func_pos; + loff_t mod_pos; struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; @@ -3119,6 +3120,8 @@ struct ftrace_iterator { struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; + struct trace_array *tr; + struct list_head *mod_list; int pidx; int idx; unsigned flags; @@ -3203,13 +3206,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; - if (iter->func_pos > *pos) + if (iter->mod_pos > *pos) return NULL; iter->probe = NULL; iter->probe_entry = NULL; iter->pidx = 0; - for (l = 0; l <= (*pos - iter->func_pos); ) { + for (l = 0; l <= (*pos - iter->mod_pos); ) { p = t_probe_next(m, &l); if (!p) break; @@ -3247,6 +3250,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) return 0; } +static void * +t_mod_next(struct seq_file *m, 
loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + + (*pos)++; + iter->pos = *pos; + + iter->mod_list = iter->mod_list->next; + + if (iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) { + iter->flags &= ~FTRACE_ITER_MOD; + return NULL; + } + + iter->mod_pos = *pos; + + return iter; +} + +static void *t_mod_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->mod_pos = iter->func_pos; + + /* probes are only available if tr is set */ + if (!iter->tr) + return NULL; + + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_mod_next(m, &l); + if (!p) + break; + } + if (!p) { + iter->flags &= ~FTRACE_ITER_MOD; + return t_probe_start(m, pos); + } + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_MOD; + + return iter; +} + +static int +t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_mod_load *ftrace_mod; + struct trace_array *tr = iter->tr; + + if (WARN_ON_ONCE(!iter->mod_list) || + iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) + return -EIO; + + ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); + + if (ftrace_mod->func) + seq_printf(m, "%s", ftrace_mod->func); + else + seq_putc(m, '*'); + + seq_printf(m, ":mod:%s\n", ftrace_mod->module); + + return 0; +} + static void * t_func_next(struct seq_file *m, loff_t *pos) { @@ -3288,7 +3367,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - loff_t l = *pos; /* t_hash_start() must use original pos */ + loff_t l = *pos; /* t_probe_start() must use original pos */ void *ret; if (unlikely(ftrace_disabled)) @@ -3297,16 +3376,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_next(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_next(m, pos); + if (iter->flags & FTRACE_ITER_PRINTALL) { /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_probe_start(m, &l); + return t_mod_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_probe_start(m, &l); + return t_mod_start(m, &l); return ret; } @@ -3315,7 +3397,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3344,15 +3426,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_probe_start(m, pos); + return t_mod_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_PROBE) - return t_probe_start(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3368,7 +3450,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_probe_start(m, pos); + return t_mod_start(m, pos); return iter; } @@ -3402,6 +3484,9 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_show(m, iter); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_show(m, iter); + 
if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) seq_puts(m, "#### no functions disabled ####\n"); @@ -3528,17 +3613,20 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; + iter->tr = tr; mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - mod_head = tr ? &tr->mod_trace : NULL; + mod_head = tr ? &tr->mod_notrace : NULL; } else { hash = ops->func_hash->filter_hash; - mod_head = tr ? &tr->mod_notrace : NULL; + mod_head = tr ? &tr->mod_trace : NULL; } + iter->mod_list = mod_head; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; -- cgit From d7fbf8df7ca0a5c7e85db79f7005f99cb461c525 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 10:57:21 -0400 Subject: ftrace: Implement cached modules tracing on module load If a module is cached in the set_ftrace_filter, and that module is loaded, then enable tracing on that module as if the cached module text was written into set_ftrace_filter just as the module is loaded. # echo ":mod:kvm_intel" > # cat /sys/kernel/tracing/set_ftrace_filter #### all functions enabled #### :mod:kvm_intel # modprobe kvm_intel # cat /sys/kernel/tracing/set_ftrace_filter vmx_get_rflags [kvm_intel] vmx_get_pkru [kvm_intel] vmx_get_interrupt_shadow [kvm_intel] vmx_rdtscp_supported [kvm_intel] vmx_invpcid_supported [kvm_intel] [..] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bfdbce78064b..f1ccf8be9df7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3969,6 +3969,97 @@ static int cache_mod(struct trace_array *tr, return ret; } +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable); + +static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, + char *mod, bool enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct ftrace_hash **orig_hash, *new_hash; + LIST_HEAD(process_mods); + char *func; + int ret; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, + *orig_hash); + if (!new_hash) + return; /* Warn? */ + + mutex_lock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, head, list) { + + if (strcmp(ftrace_mod->module, mod) != 0) + continue; + + if (ftrace_mod->func) + func = kstrdup(ftrace_mod->func, GFP_KERNEL); + else + func = kstrdup("*", GFP_KERNEL); + + if (!func) /* warn? 
*/ + continue; + + list_del(&ftrace_mod->list); + list_add(&ftrace_mod->list, &process_mods); + + /* Use the newly allocated func, as it may be "*" */ + kfree(ftrace_mod->func); + ftrace_mod->func = func; + } + + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { + + func = ftrace_mod->func; + + /* Grabs ftrace_lock, which is why we have this extra step */ + match_records(new_hash, func, strlen(func), mod); + free_ftrace_mod(ftrace_mod); + } + + mutex_lock(&ftrace_lock); + + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + new_hash, enable); + mutex_unlock(&ftrace_lock); + + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(new_hash); +} + +static void process_cached_mods(const char *mod_name) +{ + struct trace_array *tr; + char *mod; + + mod = kstrdup(mod_name, GFP_KERNEL); + if (!mod) + return; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!list_empty(&tr->mod_trace)) + process_mod_list(&tr->mod_trace, tr->ops, mod, true); + if (!list_empty(&tr->mod_notrace)) + process_mod_list(&tr->mod_notrace, tr->ops, mod, false); + } + mutex_unlock(&trace_types_lock); + + kfree(mod); +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -5682,6 +5773,8 @@ void ftrace_module_enable(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + + process_cached_mods(mod->name); } void ftrace_module_init(struct module *mod) -- cgit From 8c08f0d5c6fb10ff93ffb1cbf416f4f1c3a52a80 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 11:47:31 -0400 Subject: ftrace: Have cached module filters be an active filter When a module filter is added to set_ftrace_filter, if the module is not loaded, it is cached. This should be considered an active filter, and function tracing should be filtered by this. That is, if a cached module filter is the only filter set, then no function tracing should be happening, as all the functions available will be filtered out. This makes sense, as the reason to add a cached module filter, is to trace the module when you load it. There shouldn't be any other tracing happening until then. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 20 +++++++++++++++----- kernel/trace/trace.h | 7 ++++++- 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9fb9a67dc9d4..5857390ac35a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -120,6 +120,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * this ops will fail to register or set_filter_ip. * PID - Is affected by set_ftrace_pid (allows filtering on those pids) * RCU - Set when the ops can only be called when RCU is watching. + * TRACE_ARRAY - The ops->private points to a trace_array descriptor. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -138,6 +139,7 @@ enum { FTRACE_OPS_FL_IPMODIFY = 1 << 13, FTRACE_OPS_FL_PID = 1 << 14, FTRACE_OPS_FL_RCU = 1 << 15, + FTRACE_OPS_FL_TRACE_ARRAY = 1 << 16, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ccf8be9df7..914539e3e301 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1410,6 +1410,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) if (!new_hash) return NULL; + if (hash) + new_hash->flags = hash->flags; + /* Empty hash? 
*/ if (ftrace_hash_empty(hash)) return new_hash; @@ -1454,7 +1457,7 @@ __ftrace_hash_move(struct ftrace_hash *src) /* * If the new source is empty, just return the empty_hash. */ - if (!src->count) + if (ftrace_hash_empty(src)) return EMPTY_HASH; /* @@ -1471,6 +1474,8 @@ __ftrace_hash_move(struct ftrace_hash *src) if (!new_hash) return NULL; + new_hash->flags = src->flags; + size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; @@ -1701,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, struct dyn_ftrace *rec; bool update = false; int count = 0; - int all = 0; + int all = false; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -1722,7 +1727,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, hash = ops->func_hash->filter_hash; other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) - all = 1; + all = true; } else { inc = !inc; hash = ops->func_hash->notrace_hash; @@ -4028,6 +4033,9 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, free_ftrace_mod(ftrace_mod); } + if (enable && list_empty(head)) + new_hash->flags &= ~FTRACE_HASH_FL_MOD; + mutex_lock(&ftrace_lock); ret = ftrace_hash_move_and_update_ops(ops, orig_hash, @@ -5035,9 +5043,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); - if (filter_hash) + if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - else + if (!list_empty(&iter->tr->mod_trace)) + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } else orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d63550cdbdfa..13823951e42b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,10 +773,15 @@ struct ftrace_mod_load { int enable; }; +enum { + FTRACE_HASH_FL_MOD = (1 << 0), +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; + unsigned long flags; struct rcu_head rcu; }; @@ -785,7 +790,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { - return !hash || !hash->count; + return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); } /* Standard output formatting function used for function return traces */ -- cgit From 131b63515932d18a3b1a60db3958f3c0dd5462bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Feb 2017 09:24:24 -0800 Subject: seccomp: Clean up core dump logic This just cleans up the core dumping logic to avoid the braces around the RET_KILL case. Signed-off-by: Kees Cook --- kernel/seccomp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 65f61077ad50..fce83885b7ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: { - siginfo_t info; + default: audit_seccomp(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { + siginfo_t info; + /* Show the original registers in the dump. */ syscall_rollback(current, task_pt_regs(current)); /* Trigger a manual coredump since do_exit skips it. 
*/ @@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } do_exit(SIGSYS); } - } unreachable(); -- cgit From 0b5fa2290637a3235898d18dc0e7a136783f1bd2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 26 Jun 2017 09:24:00 -0700 Subject: seccomp: Switch from atomic_t to refcount_t This switches the seccomp usage tracking from atomic_t to refcount_t to gain refcount overflow protections. Cc: Elena Reshetova Cc: David Windsor Cc: Hans Liljestrand Signed-off-by: Kees Cook --- kernel/seccomp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fce83885b7ef..98b59b5db90b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,7 +13,7 @@ * of Berkeley Packet Filters/Linux Socket Filters. */ -#include <linux/atomic.h> +#include <linux/refcount.h> #include <linux/audit.h> #include <linux/compat.h> #include <linux/sched.h> @@ -56,7 +56,7 @@ * to a task_struct (other than @usage). */ struct seccomp_filter { - atomic_t usage; + refcount_t usage; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - atomic_set(&sfilter->usage, 1); + refcount_set(&sfilter->usage, 1); return sfilter; } @@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; /* Reference count is bounded by the number of total processes. */ - atomic_inc(&orig->usage); + refcount_inc(&orig->usage); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ - while (orig && atomic_dec_and_test(&orig->usage)) { + while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); -- cgit From 49368a47f6dc1b256e3b83813da5c9b0731fe268 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sat, 3 Jun 2017 20:52:32 +1000 Subject: PM / hibernate: Use CONFIG_HAVE_SET_MEMORY for include condition Kbuild reported a build failure when CONFIG_STRICT_KERNEL_RWX was enabled on powerpc. We don't yet have ARCH_HAS_SET_MEMORY and ppc32 saw a build failure. I've only done a basic compile test with a config that has hibernation enabled. Fixes: 50327ddfbc92 (kernel/power/snapshot.c: use set_memory.h header) Reported-by: Christophe Leroy Signed-off-by: Balbir Singh Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fa46606f3356..71730d672290 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,13 +36,13 @@ #include #include #include -#ifdef CONFIG_STRICT_KERNEL_RWX +#ifdef CONFIG_ARCH_HAS_SET_MEMORY #include <asm/set_memory.h> #endif #include "power.h" -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -77,7 +77,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_STRICT_KERNEL_RWX */ +#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); -- cgit From eba74c294467d55c697e2199c37dfaf8126fe396 Mon Sep 17 00:00:00 2001 From: BaoJun Luo Date: Tue, 27 Jun 2017 02:10:44 +0200 Subject: PM / hibernate: Drop redundant parameter of swsusp_alloc() The first parameter of swsusp_alloc is not used, so drop it. Signed-off-by: BaoJun Luo [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71730d672290..b7708e319941 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1929,8 +1929,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, * also be located in the high memory, because of the way in which * copy_data_pages() works. */ -static int swsusp_alloc(struct memory_bitmap *orig_bm, - struct memory_bitmap *copy_bm, +static int swsusp_alloc(struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { @@ -1976,7 +1975,7 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) { + if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) { printk(KERN_ERR "PM: Memory allocation failed\n"); return -ENOMEM; } -- cgit From 6a9c981b1e9657ca5866d10aa38b8a4fe1159138 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:02:49 -0400 Subject: ftrace: Remove unused function ftrace_arch_read_dyn_info() ftrace_arch_read_dyn_info() was used so that archs could add their own debug information into the dyn_ftrace_total_info in the tracefs file system. That file is for debugging usage of dynamic ftrace. No arch uses that function anymore, so just get rid of it. This also allows for tracing_read_dyn_info() to be cleaned up a bit. 
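The resulting read path leans entirely on simple_read_from_buffer(), which takes care of short reads and of advancing *ppos. The same pattern as a stand-alone sketch (hypothetical tracefs file; assumes filp->private_data points at the counter):

/* Minimal sketch of the small-stack-buffer tracefs read pattern. */
static ssize_t example_counter_read(struct file *filp, char __user *ubuf,
				    size_t cnt, loff_t *ppos)
{
	unsigned long *p = filp->private_data;
	char buf[64];	/* small enough for a shallow stack */
	int r;

	r = scnprintf(buf, sizeof(buf), "%ld\n", *p);
	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}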
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 19ac2088d10a..14318ce92b13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6737,33 +6737,18 @@ static const struct file_operations tracing_stats_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + char buf[64]; /* Not too big for a shallow stack */ int r; - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + r = scnprintf(buf, 63, "%ld", *p); buf[r++] = '\n'; - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - mutex_unlock(&dyn_info_mutex); - - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static const struct file_operations tracing_dyn_info_fops = { -- cgit From 83dd14933e33a45e9b366c572e15505982b46845 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:04:40 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info file The dyn_ftrace_total_info file is used to show how many functions have been converted into nops and can be used by ftrace. The problem is that it does not get decremented when functions are removed (init boot code being freed, and modules being freed). That means the number is very inaccurate every time functions are removed from the ftrace tables. Decrement it when functions are removed. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 914539e3e301..7509ef9810bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5705,6 +5705,7 @@ void ftrace_release_mod(struct module *mod) if (pg == ftrace_pages) ftrace_pages = next_to_ftrace_page(last_pg); + ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); -- cgit From d914ba37d7145acb9fd3bb23075c2d56e5a44eb6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jun 2017 19:01:55 -0700 Subject: tracing: Add support for recording tgid of tasks In order to support recording of tgid, the following changes are made: * Introduce a new API (tracing_record_taskinfo) to additionally record the tgid along with the task's comm at the same time. This has the benefit of not setting trace_cmdline_save before all the information for a task is saved. * Add a new API tracing_record_taskinfo_sched_switch to record task information for two tasks at a time (previous and next) and use it from the sched_switch probe. * Preserve the old API (tracing_record_cmdline) and create it as a wrapper around the new one so that existing callers aren't affected. * Reuse the existing sched_switch and sched_wakeup probes to record tgid information and add a new option 'record-tgid' to enable recording of tgid. When the record-tgid option isn't enabled to begin with, we take care to make sure that there isn't any memory or runtime overhead. 
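Illustrative usage once this patch is applied, in the style of the earlier set_ftrace_filter example (session sketch; output omitted, and the TGID column itself only appears with the display patch later in this series):

# cd /sys/kernel/tracing
# echo 1 > options/record-tgid
# echo 1 > events/sched/sched_switch/enable
# cat trace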
Link: http://lkml.kernel.org/r/20170627020155.5139-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 13 ++++- kernel/trace/trace.c | 105 ++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 7 +++ kernel/trace/trace_events.c | 42 ++++++++++++++- kernel/trace/trace_sched_switch.c | 72 +++++++++++++++++++++----- 5 files changed, 213 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a556805eff8a..f73cedfa2e0b 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -151,7 +151,15 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); +#define TRACE_RECORD_CMDLINE BIT(0) +#define TRACE_RECORD_TGID BIT(1) + +void tracing_record_taskinfo(struct task_struct *task, int flags); +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags); + +void tracing_record_cmdline(struct task_struct *task); +void tracing_record_tgid(struct task_struct *task); int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); @@ -290,6 +298,7 @@ struct trace_subsystem_dir; enum { EVENT_FILE_FL_ENABLED_BIT, EVENT_FILE_FL_RECORDED_CMD_BIT, + EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, EVENT_FILE_FL_SOFT_MODE_BIT, @@ -303,6 +312,7 @@ enum { * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch + * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED @@ -315,6 +325,7 @@ enum { enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), + EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 14318ce92b13..ab9db750dd29 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_cmdline_save); +static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). 
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { - __this_cpu_write(trace_cmdline_save, true); + __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { @@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) } } +static int *tgid_map; + #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer { static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_taskinfo_disabled __read_mostly; static inline char *get_saved_cmdlines(int idx) { @@ -1990,16 +1992,87 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -void tracing_record_cmdline(struct task_struct *tsk) +int trace_find_tgid(int pid) +{ + if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) + return 0; + + return tgid_map[pid]; +} + +static int trace_save_tgid(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + return 0; + + tgid_map[tsk->pid] = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task - task to record + * @flags - TRACE_RECORD_CMDLINE for recording comm + * - TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) + return; + if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) + return; + if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) return; - if (!__this_cpu_read(trace_cmdline_save)) + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev - previous task during sched_switch + * @next - next task during sched_switch + * @flags - TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) return; - if (trace_save_cmdline(tsk)) - __this_cpu_write(trace_cmdline_save, false); + if ((flags & TRACE_RECORD_CMDLINE) && + (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) + return; + + if ((flags & TRACE_RECORD_TGID) && + (!trace_save_tgid(prev) || !trace_save_tgid(next))) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); } /* @@ -3144,7 +3217,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) #endif if (!iter->snapshot) - atomic_inc(&trace_record_cmdline_disabled); + 
atomic_inc(&trace_record_taskinfo_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -3189,7 +3262,7 @@ static void s_stop(struct seq_file *m, void *p) #endif if (!iter->snapshot) - atomic_dec(&trace_record_cmdline_disabled); + atomic_dec(&trace_record_taskinfo_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -4236,6 +4309,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_RECORD_TGID) { + if (!tgid_map) + tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + GFP_KERNEL); + if (!tgid_map) { + tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + } + if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13823951e42b..6ade1c55cc3a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -640,6 +640,9 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_start_tgid_record(void); +void tracing_stop_tgid_record(void); + int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -700,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE @@ -1124,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ + C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ @@ -1440,6 +1445,8 @@ struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); +extern void trace_event_enable_tgid_record(bool enable); + extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 83dfd0dbbbfe..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) mutex_unlock(&event_mutex); } +void trace_event_enable_tgid_record(bool enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + mutex_lock(&event_mutex); + do_for_each_event_file(tr, file) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } else { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, + &file->flags); + } + } while_for_each_event_file(); + mutex_unlock(&event_mutex); +} + static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { + tracing_stop_tgid_record(); + 
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + tgid = true; + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { - tracing_stop_cmdline_record(); + if (cmd) + tracing_stop_cmdline_record(); + if (tgid) + tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -12,27 +12,38 @@ #include "trace.h" -static int sched_ref; +#define RECORD_CMDLINE 1 +#define RECORD_TGID 2 + +static int sched_cmdline_ref; +static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(prev); - tracing_record_cmdline(next); + if (!flags) + return; + tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(current); + if (!flags) + return; + tracing_record_taskinfo(current, flags); } static int tracing_sched_register(void) @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } -static void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(int ops) { + bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref++; + break; + + case RECORD_TGID: + sched_tgid_ref++; + break; + } + + if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } -static void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref--; + break; + + case RECORD_TGID: + sched_tgid_ref--; + break; + } + + if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { - tracing_start_sched_switch(); + tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { - tracing_stop_sched_switch(); + tracing_stop_sched_switch(RECORD_CMDLINE); +} + +void tracing_start_tgid_record(void) +{ + 
tracing_start_sched_switch(RECORD_TGID); +} + +void tracing_stop_tgid_record(void) +{ + tracing_stop_sched_switch(RECORD_TGID); } -- cgit From 441dae8f2f2975c68101a84bc3f528ec95ecf7c3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 25 Jun 2017 22:38:43 -0700 Subject: tracing: Add support for display of tgid in trace output Earlier patches introduced the ability to record the tgid using the 'record-tgid' option. Here we read the tgid and output it if the option is enabled. Link: http://lkml.kernel.org/r/20170626053844.5746-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 ++++++++++++++++++++++-------------- kernel/trace/trace_output.c | 9 +++++++++ 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab9db750dd29..c579dea4a0eb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3319,23 +3319,29 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER_RECORD_TGID; + print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" - "# | | | | |\n"); + + seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { - print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n" - "# / _----=> need-resched\n" - "# | / _---=> hardirq/softirq\n" - "# || / _--=> preempt-depth\n" - "# ||| / delay\n" - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" - "# | | | |||| | |\n"); + bool tgid = flags & TRACE_ITER_RECORD_TGID; + + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); + seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); + seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | | %s|||| | |\n", tgid ? 
" | " : ""); } void @@ -3651,9 +3657,11 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + print_func_help_header_irq(iter->trace_buffer, + m, trace_flags); else - print_func_help_header(iter->trace_buffer, m); + print_func_help_header(iter->trace_buffer, m, + trace_flags); } } } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 01ff99969ca7..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -597,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + unsigned int tgid = trace_find_tgid(entry->pid); + + if (!tgid) + trace_seq_printf(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); -- cgit From 93437353daeff31bd5b11810daa4d2d509d1a64e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 26 May 2017 14:12:25 -0700 Subject: module: use list_for_each_entry_rcu() on find_module_all() The module list has been using RCU in a lot of other calls for a while now, we just overlooked changing this one over to use RCU. Signed-off-by: Luis R. Rodriguez Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f546d574f436..afc6ede7bcdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -603,7 +603,7 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) -- cgit From 165d1cc0074b2f938586274776d029b9bce914c4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 23 Jun 2017 12:19:12 -0700 Subject: kmod: reduce atomic operations on kmod_concurrent and simplify When checking if we want to allow a kmod thread to kick off we increment, then read to see if we should enable a thread. If we were over the allowed limit, we decrement. Splitting the increment far apart from the decrement means there can be a window in which two increments happen, potentially giving a false failure for a thread which should have been allowed. CPU1 CPU2 atomic_inc() atomic_inc() atomic_read() atomic_read() atomic_dec() atomic_dec() In this case the read on CPU1 sees both atomic_inc()s and could wrongly refuse a kmod thread that should have been allowed. We could try to prevent this with a lock or preemption but that is overkill. We can fix this by reducing the number of atomic operations. We do this by inverting the logic of the enabler: instead of incrementing kmod_concurrent as we get new kmod users, we define the variable kmod_concurrent_max as the max number of currently allowed kmod users, and as we get new kmod users we just decrement it if it's still positive. This combines the dec and read in one atomic operation.
In this case we no longer get the same false failure: CPU1 CPU2 atomic_dec_if_positive() atomic_dec_if_positive() atomic_inc() atomic_inc() The number of threads is computed at init, and since the current computation of kmod_concurrent includes the thread count we can avoid setting kmod_concurrent_max later in boot through an init call by simply sticking to 50 as the kmod_concurrent_max. The assumption here is that a system with modules must have at least ~16 MiB of RAM. Suggested-by: Petr Mladek Suggested-by: Dmitry Torokhov Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Signed-off-by: Jessica Yu --- kernel/kmod.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..ff68198fe83b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ #include -extern int max_threads; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -56,6 +54,20 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES +/* + * Assuming: + * + * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + * (u64) THREAD_SIZE * 8UL); + * + * Needing fewer than 50 threads would mean we're dealing with systems + * smaller than 3200 pages. This assumes you are capable of having ~13M memory, + * and this would only be an upper limit, after which the OOM killer + * would take effect. Systems like these are very unlikely if modules are + * enabled. + */ +#define MAX_KMOD_CONCURRENT 50 +static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); /* modprobe_path is set via /proc/sys. @@ -127,10 +139,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; int ret; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; /* @@ -154,21 +163,7 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - /* If modprobe needs a service that is in a module, we get a recursive * loop. Limit the number of running kmod threads to max_threads/2 or * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method * would be to run the parents of this process, counting how many times * kmod was invoked. That would mean accessing the internals of the * process tables to get the command line, proc_pid_cmdline is static * and it is not worth changing the proc code just to handle this case. * KAO. * * "trace the ppid" is simple, but will fail if someone's * parent exits. I think this is as good as it gets. --RR */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { + if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { /* We may be blaming an innocent here, but unlikely */ if (kmod_loop_msg < 5) { printk(KERN_ERR @@ -176,7 +171,6 @@ int __request_module(bool wait, const char *fmt, ...) module_name); kmod_loop_msg++; } - atomic_dec(&kmod_concurrent); return -ENOMEM; } @@ -184,10 +178,12 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? 
UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_dec(&kmod_concurrent); + atomic_inc(&kmod_concurrent_max); + return ret; } EXPORT_SYMBOL(__request_module); + #endif /* CONFIG_MODULES */ static void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit From 3b58a3c72f484393c65995a551902945f5a18c70 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 09:09:38 -0400 Subject: ftrace: Unlock hash mutex on failed allocation in process_mod_list() If the new_hash fails to allocate, then unlock the hash mutex on error. Reported-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7509ef9810bf..2c79630cd267 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3997,7 +3997,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!new_hash) - return; /* Warn? */ + goto out; /* warn? */ mutex_lock(&ftrace_lock); @@ -4042,6 +4042,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash, enable); mutex_unlock(&ftrace_lock); + out: mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(new_hash); -- cgit From 4ec78467858739c0119569c0610676aa50dfa8fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 11:57:03 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info for init functions Init boot up functions may be traced, but they are also freed when the kernel finishes booting. These are removed from the ftrace tables, and the debug variable for dyn_ftrace_total_info needs to reflect that as well. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c79630cd267..e392f750a1cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5825,6 +5825,7 @@ void __init ftrace_free_init_mem(void) if (!rec) continue; pg->index--; + ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); -- cgit From 824ecbe01c5d833b8c8a371c209e3ac3a76cd18a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 25 Jun 2017 00:27:59 -0400 Subject: cgroup: restructure cgroup_procs_write_permission() Restructure cgroup_procs_write_permission() to make extending permission logic easier. This patch doesn't cause any functional changes. 
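For intuition, the check on the default hierarchy boils down to walking up from the source cgroup to the first ancestor that also contains the destination; a self-contained toy sketch of that walk (hypothetical node type, illustration only, not the kernel code):

    struct node {
        struct node *parent;    /* NULL at the root */
    };

    /* true if n equals ancestor or lies below it */
    int is_descendant(const struct node *n, const struct node *ancestor)
    {
        for (; n; n = n->parent)
            if (n == ancestor)
                return 1;
        return 0;
    }

    /* first ancestor of src that also contains dst */
    struct node *common_ancestor(struct node *src, struct node *dst)
    {
        struct node *com = src;

        while (!is_descendant(dst, com))
            com = com->parent;
        return com;
    }

In the restructured function below, it is write permission on this common ancestor's "cgroup.procs" file that gates the migration.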
Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 57 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dbfd7028b1c6..d48069ee84c2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2363,27 +2363,12 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - int ret = 0; - - if (cgroup_on_dfl(dst_cgrp)) { - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup *cgrp; - struct inode *inode; - - spin_lock_irq(&css_set_lock); - cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - while (!cgroup_is_descendant(dst_cgrp, cgrp)) - cgrp = cgroup_parent(cgrp); + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *src_cgrp, *com_cgrp; + struct inode *inode; + int ret; - ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_file.kn); - if (inode) { - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - } - } else { + if (!cgroup_on_dfl(dst_cgrp)) { const struct cred *cred = current_cred(); const struct cred *tcred = get_task_cred(task); @@ -2391,14 +2376,38 @@ static int cgroup_procs_write_permission(struct task_struct *task, * even if we're attaching all tasks in the thread group, * we only need to check permissions on one of them. */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->euid, tcred->suid)) + ret = 0; + else ret = -EACCES; + put_cred(tcred); + return ret; } - return ret; + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* and the common ancestor */ + com_cgrp = src_cgrp; + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + return 0; } /* -- cgit From 5136f6365ce3eace5a926e10f16ed2a233db5ba9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jun 2017 14:30:28 -0400 Subject: cgroup: implement "nsdelegate" mount option Currently, cgroup only supports delegation to !root users and cgroup namespaces don't get any special treatment. This limits the usefulness of cgroup namespaces as they by themselves can't be safe delegation boundaries. A process inside a namespace can change the resource control knobs of the parent in the namespace root and may move processes in and out of the namespace if cgroups outside its namespace are visible somehow. This patch adds a new mount option "nsdelegate" which makes cgroup namespaces delegation boundaries. If set, cgroup behaves as if write-permission-based delegation took place at namespace boundaries - writes to the resource control knobs from the namespace root are denied and migrations crossing the namespace boundary aren't allowed from inside the namespace. This allows a cgroup namespace to function as a delegation boundary by itself. v2: Silently ignore nsdelegate specified on !init mounts.
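As a usage sketch (not part of the patch), the option is passed as ordinary cgroup2 mount data, e.g. from C:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* equivalent to: mount -t cgroup2 -o nsdelegate none /sys/fs/cgroup */
        if (mount("none", "/sys/fs/cgroup", "cgroup2", 0, "nsdelegate")) {
            perror("mount");
            return 1;
        }
        return 0;
    }

The mount point path here is just an example; per the v2 note above, the option only takes effect on mounts from the init namespace.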
Signed-off-by: Tejun Heo Cc: Aravind Anbudurai Cc: Serge Hallyn Cc: Eric Biederman --- Documentation/cgroup-v2.txt | 61 +++++++++++++++++++++---------- include/linux/cgroup-defs.h | 9 +++++ kernel/cgroup/cgroup.c | 88 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 135 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 0260ed053efd..558c3a739baf 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. +cgroup v2 currently supports the following mount options. + + nsdelegate + + Consider cgroup namespaces as delegation boundaries. This + option is system wide and can only be set on mount or modified + through remount from the init namespace. The mount option is + ignored on non-init namespace mounts. Please refer to the + Delegation section for details. + 2-2. Organizing Processes @@ -308,19 +318,27 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" and -"cgroup.subtree_control" files to the user. Note that resource -control interface files in a given directory control the distribution -of the parent's resources and thus must not be delegated along with -the directory. - -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it received from the parent. The limits and other settings of all -resource controllers are hierarchical and regardless of what happens -in the delegated sub-hierarchy, nothing can escape the resource -restrictions imposed by the parent. +A cgroup can be delegated in two ways. First, to a less privileged +user by granting write access of the directory and its "cgroup.procs" +and "cgroup.subtree_control" files to the user. Second, if the +"nsdelegate" mount option is set, automatically to a cgroup namespace +on namespace creation. + +Because the resource control interface files in a given directory +control the distribution of the parent's resources, the delegatee +shouldn't be allowed to write to them. For the first method, this is +achieved by not granting access to these files. For the second, the +kernel rejects writes to all files other than "cgroup.procs" and +"cgroup.subtree_control" on a namespace root from inside the +namespace. + +The end results are equivalent for both delegation types. Once +delegated, the user can build sub-hierarchy under the directory, +organize processes inside it as it sees fit and further distribute the +resources it received from the parent. The limits and other settings +of all resource controllers are hierarchical and regardless of what +happens in the delegated sub-hierarchy, nothing can escape the +resource restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -330,10 +348,12 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. 
For -a process with a non-root euid to migrate a target process into a -cgroup by writing its PID to the "cgroup.procs" file, the following -conditions must be met. +can't be moved into or out of the sub-hierarchy by the delegatee. + +For delegations to a less privileged user, this is achieved by +requiring the following conditions for a process with a non-root euid +to migrate a target process into a cgroup by writing its PID to the +"cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file. @@ -360,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would not have write access to its "cgroup.procs" files and thus the write will be denied with -EACCES. +For delegations to namespaces, containment is achieved by requiring +that both the source and destination cgroups are reachable from the +namespace of the process which is attempting the migration. If either +is not reachable, the migration is rejected with -ENOENT. + 2-6. Guidelines @@ -1414,7 +1439,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All mount options and remounting are not supported. +- All v1 mount options are not supported. - The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3bc4196bf217..09f4c7df1478 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -67,12 +67,21 @@ enum { enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ + + /* + * Consider namespaces as delegation boundaries. If this flag is + * set, controller specific interface files in a namespace root + * aren't writeable from inside the namespace. 
+ */ + CGRP_ROOT_NS_DELEGATE = (1 << 3), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d48069ee84c2..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } +static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +{ + char *token; + + *root_flags = 0; + + if (!data) + return 0; + + while ((token = strsep(&data, ",")) != NULL) { + if (!strcmp(token, "nsdelegate")) { + *root_flags |= CGRP_ROOT_NS_DELEGATE; + continue; + } + + pr_err("cgroup2: unknown option \"%s\"\n", token); + return -EINVAL; + } + + return 0; +} + +static void apply_cgroup_root_flags(unsigned int root_flags) +{ + if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { + if (root_flags & CGRP_ROOT_NS_DELEGATE) + cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + } +} + +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) + seq_puts(seq, ",nsdelegate"); + return 0; +} + static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - pr_err("remount is not allowed\n"); - return -EINVAL; + unsigned int root_flags; + int ret; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) + return ret; + + apply_cgroup_root_flags(root_flags); + return 0; } /* @@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; + int ret; get_cgroup_ns(ns); @@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + unsigned int root_flags; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) { put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); + return ERR_PTR(ret); } + cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); + if (!IS_ERR(dentry)) + apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); @@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct kernfs_open_file *of) { struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; struct cgroup *src_cgrp, *com_cgrp; struct inode *inode; int ret; @@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (ret) return ret; + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. 
+ */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, root_cgrp) || + !cgroup_is_descendant(dst_cgrp, root_cgrp))) + return -ENOENT; + return 0; } @@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; + /* + * If namespaces are delegation boundaries, disallow writes to + * files in an non-init namespace root from inside the namespace + * except for the files explicitly marked delegatable - + * cgroup.procs and cgroup.subtree_control. + */ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; + if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", + .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, @@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.subtree_control", + .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, -- cgit From 96b5b19459b3c2aed2872bac42cbe19edfae710f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 28 Jun 2017 18:32:31 -0700 Subject: module: make the modinfo name const This can be accomplished by making blacklisted() also accept const. Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook [jeyu: fix typo] Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index afc6ede7bcdf..d07287707557 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,7 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { - char *name; + const char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -3265,7 +3265,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, /* module_blacklist is a comma-separated list of module names */ static char *module_blacklist; -static bool blacklisted(char *module_name) +static bool blacklisted(const char *module_name) { const char *p; size_t len; -- cgit From 14dc6f04f49dc12614d7e90928b495b8d73cd471 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 27 Jun 2017 23:08:34 -0700 Subject: bpf: Add syscall lookup support for fd array and htab This patch allows userspace to do BPF_MAP_LOOKUP_ELEM on BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS. The lookup returns a prog-id or map-id to the userspace. The userspace can then use the BPF_PROG_GET_FD_BY_ID or BPF_MAP_GET_FD_BY_ID to get a fd. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 27 +++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ kernel/bpf/map_in_map.c | 5 +++++ kernel/bpf/map_in_map.h | 1 + kernel/bpf/syscall.c | 16 +++++++++++++--- 6 files changed, 70 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index deca4e7f2845..5175729270d7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,6 +36,7 @@ struct bpf_map_ops { int fd); void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + u32 (*map_fd_sys_lookup_elem)(void *ptr); }; struct bpf_map { @@ -288,9 +289,11 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); void bpf_fd_array_map_clear(struct bpf_map *map); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ecb43542246e..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -334,6 +334,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* only called from syscall */ +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **elem, *ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + elem = array_map_lookup_elem(map, key); + if (elem && (ptr = READ_ONCE(*elem))) + *value = map->ops->map_fd_sys_lookup_elem(ptr); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) bpf_prog_put(ptr); } +static u32 prog_fd_array_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_prog *)ptr)->aux->id; +} + /* decrement refcnt of all bpf_progs that are stored in this map */ void bpf_fd_array_map_clear(struct bpf_map *map) { @@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, + .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -585,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 004334ea13ba..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1243,6 +1243,26 @@ static void fd_htab_map_free(struct bpf_map *map) htab_map_free(map); } +/* only called from syscall */ +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + 
rcu_read_lock(); + ptr = htab_map_lookup_elem(map, key); + if (ptr) + *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 59bcdf821ae4..1da574612bea 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr) */ bpf_map_put(ptr); } + +u32 bpf_map_fd_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_map *)ptr)->id; +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 177fadb689dc..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); +u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8942c820d620..4409ccca8831 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -24,6 +24,13 @@ #include #include +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -411,6 +418,8 @@ static int map_lookup_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + value_size = sizeof(u32); else value_size = map->value_size; @@ -426,9 +435,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - err = -ENOTSUPP; + } else if (IS_FD_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); -- cgit From 8007e40a24e12d35189203370268c7278f29ab74 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 28 Jun 2017 10:41:24 -0700 Subject: bpf: Fix out-of-bound access on interpreters[] The index is off-by-one when fp->aux->stack_depth has already been rounded up to 32. In particular, if stack_depth is 512, the index will be 16. The fix is to round up and then subtract one, instead of rounding down.
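A standalone illustration of the arithmetic (generic macros, not the kernel's bit-trick versions; per the commit text the table has 16 entries, so valid indices are 0..15):

    #include <stdio.h>

    #define round_up(x, y)   ((((x) + (y) - 1) / (y)) * (y))
    #define round_down(x, y) (((x) / (y)) * (y))

    int main(void)
    {
        unsigned int stack_depth = 512;    /* already a multiple of 32 */

        /* old: 512/32 = 16, one past the last valid index */
        printf("old index: %u\n", round_down(stack_depth, 32) / 32);
        /* new: 16 - 1 = 15, the last valid slot */
        printf("new index: %u\n", round_up(stack_depth, 32) / 32 - 1);
        return 0;
    }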
[ 22.318680] ================================================================== [ 22.319745] BUG: KASAN: global-out-of-bounds in bpf_prog_select_runtime+0x48a/0x670 [ 22.320737] Read of size 8 at addr ffffffff82aadae0 by task sockex3/1946 [ 22.321646] [ 22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: G W 4.12.0-rc6-01680-g2ee87db3a287 #22 [ 22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 22.324260] Call Trace: [ 22.324612] dump_stack+0x67/0x99 [ 22.325081] print_address_description+0x1e8/0x290 [ 22.325734] ? bpf_prog_select_runtime+0x48a/0x670 [ 22.326360] kasan_report+0x265/0x350 [ 22.326860] __asan_report_load8_noabort+0x19/0x20 [ 22.327484] bpf_prog_select_runtime+0x48a/0x670 [ 22.328109] bpf_prog_load+0x626/0xd40 [ 22.328637] ? __bpf_prog_charge+0xc0/0xc0 [ 22.329222] ? check_nnp_nosuid.isra.61+0x100/0x100 [ 22.329890] ? __might_fault+0xf6/0x1b0 [ 22.330446] ? lock_acquire+0x360/0x360 [ 22.331013] SyS_bpf+0x67c/0x24d0 [ 22.331491] ? trace_hardirqs_on+0xd/0x10 [ 22.332049] ? __getnstimeofday64+0xaf/0x1c0 [ 22.332635] ? bpf_prog_get+0x20/0x20 [ 22.333135] ? __audit_syscall_entry+0x300/0x600 [ 22.333770] ? syscall_trace_enter+0x540/0xdd0 [ 22.334339] ? exit_to_usermode_loop+0xe0/0xe0 [ 22.334950] ? do_syscall_64+0x48/0x410 [ 22.335446] ? bpf_prog_get+0x20/0x20 [ 22.335954] do_syscall_64+0x181/0x410 [ 22.336454] entry_SYSCALL64_slow_path+0x25/0x25 [ 22.337121] RIP: 0033:0x7f263fe81f19 [ 22.337618] RSP: 002b:00007ffd9a3440c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 22.338619] RAX: ffffffffffffffda RBX: 0000000000aac5fb RCX: 00007f263fe81f19 [ 22.339600] RDX: 0000000000000030 RSI: 00007ffd9a3440d0 RDI: 0000000000000005 [ 22.340470] RBP: 0000000000a9a1e0 R08: 0000000000a9a1e0 R09: 0000009d00000001 [ 22.341430] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000010000 [ 22.342411] R13: 0000000000a9a023 R14: 0000000000000001 R15: 0000000000000003 [ 22.343369] [ 22.343593] The buggy address belongs to the variable: [ 22.344241] interpreters+0x80/0x980 [ 22.344708] [ 22.344908] Memory state around the buggy address: [ 22.345556] ffffffff82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa fa [ 22.346449] ffffffff82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 00 [ 22.347361] >ffffffff82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa [ 22.348301] ^ [ 22.349142] ffffffff82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00 [ 22.350058] ffffffff82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa fa [ 22.350984] ================================================================== Fixes: b870aa901f4b ("bpf: use different interpreter depending on required stack size") Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 774069ca18a7..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; /* eBPF JITs can rewrite the program in case constant * blinding is active. 
However, in case of error during -- cgit From 59494fe2c89e46e85d83de9bc45dd1d528955c49 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 16:58:40 +0530 Subject: PM: hibernate: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 6332 488 308 7128 1bd8 kernel/power/hibernate.o File size after adding 'const': text data bss dec hex filename 6396 424 308 7128 1bd8 kernel/power/hibernate.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a8b978c35a6a..e1914c7b85b1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1108,7 +1108,7 @@ static struct attribute * g[] = { }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit From a9bd8dfa539493db265e46a496c1a89279ab31d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:39:01 -0400 Subject: kimage_file_prepare_segments(): don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/kexec_file.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..766e7e4d3ad9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -162,16 +162,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; + image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); + if (IS_ERR(image->cmdline_buf)) { + ret = PTR_ERR(image->cmdline_buf); + image->cmdline_buf = NULL; goto out; } -- cgit From e4448ed87ccdbacb74871736f63220642242b32f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:43:00 -0400 Subject: bpf: don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/bpf/syscall.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..4b8b10bddfde 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,14 +322,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -488,14 +482,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } preempt_disable(); __this_cpu_inc(bpf_prog_active); @@ -507,7 +498,6 @@ static int map_delete_elem(union bpf_attr *attr) if (!err) trace_bpf_map_delete_elem(map, ufd, key); -free_key: kfree(key); err_put: fdput(f); @@ -536,14 +526,11 @@ static int map_get_next_key(union bpf_attr *attr) return PTR_ERR(map); if (ukey) { - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } } else { key = NULL; } -- cgit From 5c4994102fb508d4a0f7a8afa46560c314c1ebd4 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:05 -0700 Subject: posix-timers: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_gettime, clock_settime, clock_getres and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 83 +++++++++++++++++++++++++--------------------- kernel/time/posix-timers.c | 63 ++++++++++++++--------------------- 2 files changed, 70 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 65878221cbfb..06f34feb635e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -51,40 +51,52 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) +int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; + case CLOCK_REALTIME: + ktime_get_real_ts64(tp); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(tp); + break; + case CLOCK_BOOTTIME: + get_monotonic_boottime64(tp); + break; + default: + return -EINVAL; } - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return 0; +} +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if 
(put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -93,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + if (put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -142,41 +154,35 @@ COMPAT_SYS_NI(setitimer); COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct compat_timespec __user *,tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + int ret; + struct timespec64 kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; - } + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (compat_put_timespec(&kernel_tp, tp)) + if (compat_put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -185,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec(&rtn_tp, tp)) + if (compat_put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: return -EINVAL; } } + COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 82d67be7d9d1..39322ae5dd87 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1049,34 +1049,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + struct 
timespec64 kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + error = kc->clock_get(which_clock, &kernel_tp); - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; @@ -1109,17 +1105,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; + struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; @@ -1131,38 +1125,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&ts, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + err = kc->clock_get(which_clock, &ts); - if (!error && compat_put_timespec(&kernel_tp, tp)) - error = -EFAULT; + if (!err && compat_put_timespec64(&ts, tp)) + err = -EFAULT; - return error; + return err; } COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, @@ -1193,21 +1182,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); - - if (!error && tp && compat_put_timespec(&rtn_tp, tp)) - error = -EFAULT; + err = kc->clock_getres(which_clock, &ts); + if (!err && tp && compat_put_timespec64(&ts, tp)) + return -EFAULT; - return error; + return err; } + #endif /* -- cgit From c0edd7c9acd0eaee149ab6cb4441cc71a1af87f0 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:06 -0700 Subject: nanosleep: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_nanosleep and nanosleep and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. 
Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/hrtimer.h | 2 +- kernel/time/alarmtimer.c | 4 ++-- kernel/time/hrtimer.c | 30 +++++++++++++----------------- kernel/time/posix-cpu-timers.c | 8 ++------ kernel/time/posix-timers.c | 20 ++++++++------------ 5 files changed, 26 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 255edd5e7a74..012c37fdb688 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -453,7 +453,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ -extern int nanosleep_copyout(struct restart_block *, struct timespec *); +extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c991cf212c6d..0b8ff7d257ea 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarmtimer_freezerset(absexp, type); restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { - struct timespec rmt; + struct timespec64 rmt; ktime_t rem; rem = ktime_sub(absexp, alarm_bases[type].gettime()); if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 81da124f1115..88f75f92ef36 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); -int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT case TT_COMPAT: - if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: - if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: @@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); - struct timespec rmt; + struct timespec64 rmt; if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } @@ -1546,19 +1546,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (copy_from_user(&tu, rqtp, sizeof(tu))) + if (get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #ifdef CONFIG_COMPAT @@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (compat_get_timespec(&tu, rqtp)) + if (compat_get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9df618ee64cf..7323da5950cc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1314,12 +1314,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, */ restart = &current->restart_block; restart->nanosleep.expires = expires; - if (restart->nanosleep.type != TT_NONE) { - struct timespec ts; - - ts = timespec64_to_timespec(it.it_value); - error = nanosleep_copyout(restart, &ts); - } + if (restart->nanosleep.type != TT_NONE) + error = nanosleep_copyout(restart, &it.it_value); } return error; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 39322ae5dd87..4b0fc3b0a1c4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1213,26 +1213,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT @@ -1241,26 +1239,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ?
TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #endif -- cgit From 725816e8aabb1c183baa2bc9572ab9a0d26b9ea1 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:08 -0700 Subject: posix_clocks: Use get_itimerspec64() and put_itimerspec64() Using these APIs and their compat versions makes the timer_settime and timer_gettime syscalls and their compat implementations simpler. This patch also serves as a preparatory patch for changing syscalls to use new time_t data types to support the y2038 effort by isolating the processing of user pointers through these APIs. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-timers.c | 44 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b0fc3b0a1c4..13d6881f908b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (put_compat_itimerspec(setting, &cur_setting)) + if (put_compat_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old_setting ?
&old_spec : NULL; int error = 0; if (!new_setting) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; @@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct compat_itimerspec __user *, new, struct compat_itimerspec __user *, old) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old ? &old_spec : NULL; int error = 0; if (!new) return -EINVAL; - if (get_compat_itimerspec(&new_spec, new)) + if (get_compat_itimerspec64(&new_spec, new)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (put_compat_itimerspec(old, &old_spec)) + if (put_compat_itimerspec64(&old_spec, old)) error = -EFAULT; } return error; -- cgit From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain function pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch.
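As a minimal sketch of the annotation pattern used throughout this patch (struct example_ops is hypothetical, not one of the structures below):

	struct example_ops {
		int (*open)(struct inode *inode);
		void (*release)(struct inode *inode);
		struct list_head list;
	} __randomize_layout;

When the randstruct gcc plugin is enabled, the layout of annotated structures is shuffled at build time; without the plugin, __randomize_layout expands to nothing, so the annotation costs nothing on non-randomized builds.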
Signed-off-by: Kees Cook --- arch/x86/include/asm/processor.h | 2 +- fs/mount.h | 4 ++-- fs/namei.c | 2 +- fs/proc/internal.h | 6 +++--- include/linux/binfmts.h | 4 ++-- include/linux/cdev.h | 2 +- include/linux/cred.h | 4 ++-- include/linux/dcache.h | 2 +- include/linux/fs.h | 17 +++++++++-------- include/linux/fs_struct.h | 2 +- include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/key-type.h | 4 ++-- include/linux/kmod.h | 2 +- include/linux/kobject.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/mm_types.h | 4 ++-- include/linux/module.h | 4 ++-- include/linux/mount.h | 2 +- include/linux/msg.h | 2 +- include/linux/path.h | 2 +- include/linux/pid_namespace.h | 2 +- include/linux/proc_ns.h | 2 +- include/linux/sched.h | 2 +- include/linux/sched/signal.h | 2 +- include/linux/sem.h | 2 +- include/linux/shm.h | 2 +- include/linux/sysctl.h | 2 +- include/linux/tty.h | 2 +- include/linux/tty_driver.h | 4 ++-- include/linux/user_namespace.h | 2 +- include/linux/utsname.h | 2 +- include/net/af_unix.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/sock.h | 2 +- kernel/futex.c | 4 ++-- security/keys/internal.h | 2 +- 38 files changed, 57 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..e2335edb9fc5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,7 +129,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -}; +} __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..e406b286fba1 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,7 +16,7 @@ struct mnt_namespace { u64 event; unsigned int mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; -}; +} __randomize_layout; struct mnt_pcp { int mnt_count; @@ -68,7 +68,7 @@ struct mount { struct hlist_head mnt_pins; struct fs_pin mnt_umount; struct dentry *mnt_ex_mountpoint; -}; +} __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index 6571a5f5112e..1764620ac383 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -524,7 +524,7 @@ struct nameidata { struct inode *link_inode; unsigned root_seq; int dfd; -}; +} __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c5ae09b6c726..07b16318223f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,7 +51,7 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ u8 namelen; char name[]; -}; +} __randomize_layout; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -70,7 +70,7 @@ struct proc_inode { struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; -}; +} __randomize_layout; /* * General functions @@ -279,7 +279,7 @@ struct proc_maps_private { #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif -}; +} __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 05488da3aee9..3ae9013eeaaa 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -46,7 +46,7 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; -}; +} 
__randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) @@ -81,7 +81,7 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ -}; +} __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index 408bc09ce497..cb28eb21e3ca 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -17,7 +17,7 @@ struct cdev { struct list_head list; dev_t dev; unsigned int count; -}; +} __randomize_layout; void cdev_init(struct cdev *, const struct file_operations *); diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..82c8a9e1aabb 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -31,7 +31,7 @@ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; -}; +} __randomize_layout; /** * get_group_info - Get a reference to a group info structure @@ -145,7 +145,7 @@ struct cred { struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ -}; +} __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d2e38dc6172c..7eb262e13d3c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -113,7 +113,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -}; +} __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..8f28143486c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; -}; +} __randomize_layout; static inline bool is_sync_kiocb(struct kiocb *kiocb) { @@ -392,7 +392,7 @@ struct address_space { gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ -} __attribute__((aligned(sizeof(long)))); +} __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit @@ -435,7 +435,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -}; +} __randomize_layout; /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache @@ -653,7 +653,7 @@ struct inode { #endif void *i_private; /* fs or device private pointer */ -}; +} __randomize_layout; static inline unsigned int i_blocksize(const struct inode *node) { @@ -868,7 +868,8 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ +} __randomize_layout + __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; @@ -1005,7 +1006,7 @@ struct file_lock { int state; /* state of grant or error if -ve */ } afs; } fl_u; -}; +} __randomize_layout; struct file_lock_context { spinlock_t flc_lock; @@ -1404,7 +1405,7 @@ struct super_block { spinlock_t 
s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ -}; +} __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can @@ -1690,7 +1691,7 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); -}; +} __randomize_layout; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 0efc3e62843a..7a026240cbb1 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -12,7 +12,7 @@ struct fs_struct { int umask; int in_exec; struct path root, pwd; -}; +} __randomize_layout; extern struct kmem_cache *fs_cachep; diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 71fd92d81b26..ea0eb0b5f98c 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -20,6 +20,6 @@ struct kern_ipc_perm { umode_t mode; unsigned long seq; void *security; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 848e5796400e..65327ee0936b 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -61,7 +61,7 @@ struct ipc_namespace { struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 8496cf64575c..9520fc3c3b9a 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,7 +45,7 @@ struct key_preparsed_payload { size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time_t expiry; /* Expiry time of key */ -}; +} __randomize_layout; typedef int (*request_key_actor_t)(struct key_construction *key, const char *op, void *aux); @@ -158,7 +158,7 @@ struct key_type { /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ -}; +} __randomize_layout; extern struct key_type key_type_keyring; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4e441e00db5..655082c88fd9 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -64,7 +64,7 @@ struct subprocess_info { int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; -}; +} __randomize_layout; extern int call_usermodehelper(const char *path, char **argv, char **envp, int wait); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..084513350317 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -172,7 +172,7 @@ struct kset { spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; -}; +} __randomize_layout; extern void kset_init(struct kset *kset); extern int __must_check kset_register(struct kset *kset); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 080f34e66017..565163fc9ad4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1876,7 +1876,7 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ -}; +} __randomize_layout; /* * Security module hook list structure. 
@@ -1887,7 +1887,7 @@ struct security_hook_list { struct list_head *head; union security_list_options hook; char *lsm; -}; +} __randomize_layout; /* * Initializing a security_hook_list structure takes diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 45cdb27791a3..ff151814a02d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,7 +342,7 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -}; +} __randomize_layout; struct core_thread { struct task_struct *task; @@ -500,7 +500,7 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; -}; +} __randomize_layout; extern struct mm_struct init_mm; diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..d93111d7def6 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -45,7 +45,7 @@ struct module_kobject { struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; -}; +} __randomize_layout; struct module_attribute { struct attribute attr; @@ -475,7 +475,7 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif -} ____cacheline_aligned; +} ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif diff --git a/include/linux/mount.h b/include/linux/mount.h index 8e0352af06b7..1ce85e6fd95f 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,7 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; -}; +} __randomize_layout; struct file; /* forward dec */ struct path; diff --git a/include/linux/msg.h b/include/linux/msg.h index f3f302f9c197..a001305f5a79 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -29,7 +29,7 @@ struct msg_queue { struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; -}; +} __randomize_layout; /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, diff --git a/include/linux/path.h b/include/linux/path.h index d1372186f431..cde895cc4af4 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -7,7 +7,7 @@ struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; -}; +} __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c2a989dee876..b09136f88cf4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -52,7 +52,7 @@ struct pid_namespace { int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -}; +} __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 58ab28d81fc2..06844b54dfc1 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -21,7 +21,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); -}; +} __randomize_layout; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..f833254fce00 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -408,7 +408,7 @@ struct sched_rt_entity { /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif -}; +} __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c06d63b3a583..2a0dd40b15db 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -222,7 +222,7 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ -}; +} __randomize_layout; /* * Bits in flags field of signal_struct. diff --git a/include/linux/sem.h b/include/linux/sem.h index 9edec926e9d9..23bcbdfad4a6 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,7 +21,7 @@ struct sem_array { int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ -}; +} __randomize_layout; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/shm.h b/include/linux/shm.h index 04e881829625..0fb7061ec54c 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -22,7 +22,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; struct list_head shm_clist; /* list by creator */ -}; +} __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..9ddeef2c03e2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -117,7 +117,7 @@ struct ctl_table struct ctl_table_poll *poll; void *extra1; void *extra2; -}; +} __randomize_layout; struct ctl_node { struct rb_node node; diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..73f8d0977bb0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -333,7 +333,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; -}; +} __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b742b5e47cc2..00b2213f6a35 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -291,7 +291,7 @@ struct tty_operations { void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif const struct file_operations *proc_fops; -}; +} __randomize_layout; struct tty_driver { int magic; /* magic number for this structure */ @@ -325,7 +325,7 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; -}; +} __randomize_layout; extern struct list_head tty_drivers; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 32354b4b4b2b..b3575ce29148 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -66,7 +66,7 @@ struct user_namespace { #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; -}; +} __randomize_layout; struct ucounts { struct hlist_node node; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 60f0bb83b313..da826ed059cf 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -26,7 +26,7 @@ struct uts_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; -}; +} 
__randomize_layout; extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..64e2a1e24a2c 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_skb_parms { u32 secid; /* Security ID */ #endif u32 consumed; -}; +} __randomize_layout; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e4dd3a214034..a62959d2b3f7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -155,7 +155,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; -}; +} __randomize_layout; struct neigh_ops { int family; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fe80bb48ab1f..a224196d16ac 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -147,7 +147,7 @@ struct net { #endif struct sock *diag_nlsk; atomic_t fnhe_genid; -}; +} __randomize_layout; #include diff --git a/include/net/sock.h b/include/net/sock.h index f33e3d134e0b..d349297db9e9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1113,7 +1113,7 @@ struct proto { atomic_t socks; #endif int (*diag_destroy)(struct sock *sk, int err); -}; +} __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..5616511abf39 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ diff --git a/security/keys/internal.h b/security/keys/internal.h index c0f8682eba69..6494954e9980 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -197,7 +197,7 @@ struct request_key_auth { void *callout_info; size_t callout_len; pid_t pid; -}; +} __randomize_layout; extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, -- cgit From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 30 Jun 2017 20:02:40 -0700 Subject: bpf: BPF support for sock_ops Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. The program will be called at appropriate times to set relevant connection parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Although there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e.
do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it does not require application changes and it can be updated easily at any time. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) because the existing type expects to be called only once during the connection's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has an "op" field to specify the type of operation requested. The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use Facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type, following patches add the functionality to set various connection parameters. This patch defines a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a new bpf attach type, BPF_CGROUP_SOCK_OPS, through which programs of this type are attached to a cgroup. Two new corresponding structs (one for the kernel, one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version * Some fields are in network byte order reflecting the sock struct * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to * convert them to host byte order. */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; /* In network byte order */ __u32 local_ip4; /* In network byte order */ __u32 remote_ip6[4]; /* In network byte order */ __u32 local_ip6[4]; /* In network byte order */ __u32 remote_port; /* In network byte order */ __u32 local_port; /* In host byte order */ }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function; the return value is ignored. The reply fields of the bpf_sock_ops struct are there in case a bpf program needs to return a value larger than an integer. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf-cgroup.h | 18 +++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 9 +++ include/net/tcp.h | 36 ++++++++++ include/uapi/linux/bpf.h | 30 ++++++++ kernel/bpf/cgroup.c | 37 ++++++++++ kernel/bpf/syscall.c | 5 ++ net/core/filter.c | 168 +++++++++++++++++++++++++++++++++++++++++++++ samples/bpf/bpf_load.c | 13 +++- 9 files changed, 314 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c970a25d2a49..360c082e885c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -7,6 +7,7 @@ struct sock; struct cgroup; struct sk_buff; +struct bpf_sock_ops_kern; #ifdef CONFIG_CGROUP_BPF @@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + } \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 03bf223f18be..3d137c33d664 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1fa26dc562ce..738f8b14f025 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_ops_kern { + struct sock *sk; + u32 op; + union { + u32 reply; + u32 replylong[4]; + }; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0751b79d99c..e58500825006 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -46,6 +46,10 @@ #include #include +#include +#include +#include + extern struct inet_hashinfo tcp_hashinfo; extern struct percpu_counter tcp_orphan_count; @@ -2021,4 +2025,36 @@ int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +/* Call BPF_SOCK_OPS program that returns an int. If the return value + * is < 0, then the BPF op failed (for example if the loaded BPF + * program does not support the chosen operation or there is no BPF + * program loaded). 
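+ *
+ * Illustrative use only (this example is not part of the patch): a
+ * caller might fall back to a default when no BPF program supplied a
+ * value, e.g.
+ *
+ *	val = tcp_call_bpf(sk, op);
+ *	if (val < 0)
+ *		val = default_val;	/* hypothetical fallback */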
+ */ +#ifdef CONFIG_BPF +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + struct bpf_sock_ops_kern sock_ops; + int ret; + + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + + memset(&sock_ops, 0, sizeof(sock_ops)); + sock_ops.sk = sk; + sock_ops.op = op; + + ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); + if (ret == 0) + ret = sock_ops.reply; + else + ret = -1; + return ret; +} +#else +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + return -EPERM; +} +#endif + #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f94b48b168dc..01cd485ccd4f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,12 +120,14 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, }; enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -720,4 +722,32 @@ struct bpf_map_info { __u32 map_flags; } __attribute__((aligned(8))); +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 reply; + __u32 replylong[4]; + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + +/** + * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock + * @sk: socket to get cgroup from + * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains + * sk with connection information (IP addresses, etc.) May not contain + * cgroup info if it is a req sock. + * @type: The type of program to be exectuted + * + * socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock_ops + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 
0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4409ccca8831..d4d47de75bba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_SOCK_OPS: + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; default: return -EINVAL; } @@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); @@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } + #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration diff --git a/net/core/filter.c b/net/core/filter.c index b39c869d22e3..1f6a26c4f8b9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3110,6 +3110,36 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool __is_valid_sock_ops_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + break; + default: + return false; + } + } + + return __is_valid_sock_ops_access(off, size); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3379,6 +3409,138 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op) ... 
+ offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, op); + off += offsetof(struct bpf_sock_ops_kern, op); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
+ offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + } + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -3428,6 +3590,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = { .convert_ctx_access = sock_filter_convert_ctx_access, }; +const struct bpf_verifier_ops sock_ops_prog_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index a91c57dd8571..a4be7cfa6519 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -64,6 +64,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_perf_event = strncmp(event, "perf_event", 10) == 0; bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; + bool is_sockops = strncmp(event, "sockops", 7) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -89,6 +90,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_CGROUP_SKB; } else if (is_cgroup_sk) { prog_type = BPF_PROG_TYPE_CGROUP_SOCK; + } else if (is_sockops) { + prog_type = BPF_PROG_TYPE_SOCK_OPS; } else { printf("Unknown event '%s'\n", event); return -1; @@ -106,8 +109,11 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket) { - event += 6; + if (is_socket || is_sockops) { + if (is_socket) + event += 6; + else + event += 7; if (*event != '/') return 0; event++; @@ -560,7 +566,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0) + memcmp(shname, "cgroup/", 7) == 0 || + memcmp(shname, "sockops", 7) == 0) 
load_and_attach(shname, data->d_buf, data->d_size); } -- cgit From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:27 +0200 Subject: bpf: simplify narrower ctx access This work tries to make the semantics and code around the narrower ctx access a bit easier to follow. Right now everything is done inside the .is_valid_access(). Offset matching is done differently for read/write types, meaning writes don't support narrower access and thus matching only on offsetof(struct foo, bar) is enough, whereas for the read case that supports narrower access we must check the range from offsetof(struct foo, bar) up to offsetof(struct foo, bar) + sizeof(bar) - 1 for each of the cases. For read cases of individual members that don't support narrower access (like packet pointers or the skb->cb[] case, which has its own narrow access logic), we check as usual only offsetof(struct foo, bar) like in the write case. Then, for the case where narrower access is allowed, we also need to set the aux info for the access. Meaning, ctx_field_size and converted_op_size have to be set. The first is the original field size, e.g. sizeof(bar) as in the above example, from the user facing ctx, and the latter is the target size after the actual rewrite happened, thus for the kernel facing ctx. Also here we need the range match, and we need to keep convert_ctx_access() and the converted_op_size set in is_valid_access() in sync, even though the two are not at the same location. We can simplify the code a bit: check_ctx_access() becomes simpler in that we only store ctx_field_size as meta data and later in convert_ctx_accesses() we fetch the target_size right from the location where we do convert. Should the verifier be misconfigured, we reject BPF_WRITE cases and cases where target_size is not provided. For the subsystems, we always work on ranges in is_valid_access() and add small helpers for ranges and narrow access; convert_ctx_accesses() sets target_size for the relevant instruction. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Yonghong Song Signed-off-by: David S.
Miller --- include/linux/bpf.h | 9 +- include/linux/filter.h | 47 ++++++++++ kernel/bpf/verifier.c | 78 +++++++--------- kernel/trace/bpf_trace.c | 31 +++--- net/core/filter.c | 239 +++++++++++++++++++++-------------------------- 5 files changed, 209 insertions(+), 195 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5175729270d7..b69e7a5869ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -156,9 +156,14 @@ struct bpf_prog; struct bpf_insn_access_aux { enum bpf_reg_type reg_type; int ctx_field_size; - int converted_op_size; }; +static inline void +bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) +{ + aux->ctx_field_size = size; +} + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -173,7 +178,7 @@ struct bpf_verifier_ops { u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, - struct bpf_prog *prog); + struct bpf_prog *prog, u32 *target_size); int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 738f8b14f025..f1fc9baa3509 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -337,6 +337,22 @@ struct bpf_prog_aux; bpf_size; \ }) +#define bpf_size_to_bytes(bpf_size) \ +({ \ + int bytes = -EINVAL; \ + \ + if (bpf_size == BPF_B) \ + bytes = sizeof(u8); \ + else if (bpf_size == BPF_H) \ + bytes = sizeof(u16); \ + else if (bpf_size == BPF_W) \ + bytes = sizeof(u32); \ + else if (bpf_size == BPF_DW) \ + bytes = sizeof(u64); \ + \ + bytes; \ +}) + #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ @@ -351,6 +367,13 @@ struct bpf_prog_aux; __size; \ }) +#define BPF_LDST_BYTES(insn) \ + ({ \ + const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \ + WARN_ON(__size < 0); \ + __size; \ + }) + #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) @@ -401,6 +424,18 @@ struct bpf_prog_aux; #define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +#define bpf_ctx_range(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ + offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 + +#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ + ({ \ + BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \ + *(PTR_SIZE) = (SIZE); \ + offsetof(TYPE, MEMBER); \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. 
*/ struct compat_sock_fprog { @@ -564,6 +599,18 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +{ + bool off_ok; +#ifdef __LITTLE_ENDIAN + off_ok = (off & (size_default - 1)) == 0; +#else + off_ok = (off & (size_default - 1)) + size == size_default; +#endif + return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) #ifdef CONFIG_ARCH_HAS_SET_MEMORY diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ea2adcb233b..6f820a044079 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return 0; } -static int bpf_size_to_bytes(int bpf_size) -{ - if (bpf_size == BPF_W) - return 4; - else if (bpf_size == BPF_H) - return 2; - else if (bpf_size == BPF_B) - return 1; - else if (bpf_size == BPF_DW) - return 8; - else - return -EINVAL; -} - static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -761,7 +747,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - struct bpf_insn_access_aux info = { .reg_type = *reg_type }; + struct bpf_insn_access_aux info = { + .reg_type = *reg_type, + }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) @@ -769,25 +757,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, &info)) { - /* a non zero info.ctx_field_size indicates: - * . For this field, the prog type specific ctx conversion algorithm - * only supports whole field access. - * . This ctx access is a candiate for later verifier transformation - * to load the whole field and then apply a mask to get correct result. - * a non zero info.converted_op_size indicates perceived actual converted - * value width in convert_ctx_access. + /* A non zero info.ctx_field_size indicates that this field is a + * candidate for later verifier transformation to load the whole + * field and then apply a mask when accessed with a narrower + * access than actual ctx access size. A zero info.ctx_field_size + * will only allow for whole field access and rejects any other + * type of narrower access. 
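+ * For example (illustrative only): a 1 byte read of a 4 byte
+ * context field is rewritten in convert_ctx_accesses() below into a
+ * 4 byte load followed by a BPF_AND with 0xff.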
*/ - if ((info.ctx_field_size && !info.converted_op_size) || - (!info.ctx_field_size && info.converted_op_size)) { - verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", - env->prog->type, off, size); - return -EACCES; - } - - if (info.ctx_field_size) { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; - } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ @@ -3401,11 +3378,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; + int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; + bool is_narrower_load; + u32 target_size; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3445,39 +3424,50 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - off = insn->off; - size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - converted_op_size = env->insn_aux_data[i + delta].converted_op_size; - is_narrower_load = type == BPF_READ && size < ctx_field_size; + size = BPF_LDST_BYTES(insn); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific * convert_ctx_access changes. If conversion is successful, * we will apply proper mask to the result. 
*/ + is_narrower_load = size < ctx_field_size; if (is_narrower_load) { - int size_code = BPF_H; + u32 off = insn->off; + u8 size_code; + + if (type == BPF_WRITE) { + verbose("bpf verifier narrow ctx access misconfigured\n"); + return -EINVAL; + } + size_code = BPF_H; if (ctx_field_size == 4) size_code = BPF_W; else if (ctx_field_size == 8) size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + + target_size = 0; + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + (ctx_field_size && !target_size)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load && size < converted_op_size) { + + if (is_narrower_load && size < target_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); else insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 97c46b440cd6..5c6d538dbf43 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,7 +583,8 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - int sample_period_off; + const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, + sample_period); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -592,43 +593,35 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type if (off % size != 0) return false; - /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ - sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); - if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - int allowed; - -#ifdef __LITTLE_ENDIAN - allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; -#else - allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; -#endif - if (!allowed) + switch (off) { + case bpf_ctx_range(struct bpf_perf_event_data, sample_period): + bpf_ctx_record_field_size(info, size_sp); + if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) return false; - info->ctx_field_size = 8; - info->converted_op_size = 8; - } else { + break; + default: if (size != sizeof(long)) return false; } + return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): - BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, - offsetof(struct perf_sample_data, period)); + bpf_target_off(struct perf_sample_data, period, 8, + target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, diff --git a/net/core/filter.c b/net/core/filter.c index 
29620df45b7c..94169572d002 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3088,38 +3088,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info) +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { - info->ctx_field_size = 4; - switch (off) { - case offsetof(struct __sk_buff, pkt_type) ... - offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_present) ... - offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1: - info->converted_op_size = 1; - break; - case offsetof(struct __sk_buff, queue_mapping) ... - offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, protocol) ... - offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_tci) ... - offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_proto) ... - offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_index) ... - offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: - info->converted_op_size = 2; - break; - default: - info->converted_op_size = 4; - } -} + const int size_default = sizeof(__u32); -static bool __is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -3128,40 +3101,24 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, return false; switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - if (off + size > - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; - case offsetof(struct __sk_buff, data) ... - offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: - if (size != sizeof(__u32)) + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) return false; - info->reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end) ... - offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: - if (size != sizeof(__u32)) - return false; - info->reg_type = PTR_TO_PACKET_END; break; default: + /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { - if (size != sizeof(__u32)) + if (size != size_default) return false; } else { - int allowed; - - /* permit narrower load for not cb/data/data_end fields */ -#ifdef __LITTLE_ENDIAN - allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0; -#else - allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0; -#endif - if (!allowed) + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; - __set_access_aux_info(off, info); } } @@ -3173,26 +3130,22 @@ static bool sk_filter_is_valid_access(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, data) ... 
- offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, data_end) ... - offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, @@ -3200,24 +3153,31 @@ static bool lwt_is_valid_access(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, tc_classid): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, @@ -3289,19 +3249,27 @@ static bool tc_cls_act_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, tc_index): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3374,98 +3342,108 @@ static bool sock_ops_is_valid_access(int off, int size, static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, len)); + bpf_target_off(struct sk_buff, len, 4, + target_size)); break; case offsetof(struct __sk_buff, protocol): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, protocol)); + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); break; case offsetof(struct __sk_buff, vlan_proto): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, vlan_proto)); + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); break; case offsetof(struct __sk_buff, priority): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, skb_iif)); + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); break; case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; case offsetof(struct __sk_buff, hash): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, hash)); + bpf_target_off(struct sk_buff, hash, 4, + target_size)); break; case offsetof(struct __sk_buff, mark): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + 
bpf_target_off(struct sk_buff, mark, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + bpf_target_off(struct sk_buff, mark, 4, + target_size)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, - si->src_reg, insn); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, - si->src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, queue_mapping, 2, + target_size)); + break; case offsetof(struct __sk_buff, vlan_present): - return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - si->dst_reg, si->src_reg, insn); - case offsetof(struct __sk_buff, vlan_tci): - return convert_skb_access(SKF_AD_VLAN_TAG, - si->dst_reg, si->src_reg, insn); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + if (si->off == offsetof(struct __sk_buff, vlan_tci)) { + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, + ~VLAN_TAG_PRESENT); + } else { + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + } + break; case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % @@ -3491,6 +3469,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, off); @@ -3516,14 +3495,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); #else if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); @@ -3534,10 +3513,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, napi_id)); + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else @@ -3552,7 +3530,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { 
struct bpf_insn *insn = insn_buf; @@ -3596,22 +3574,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; default: - return bpf_convert_ctx_access(type, si, insn_buf, prog); + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); } return insn - insn_buf; @@ -3620,7 +3598,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; @@ -3643,7 +3621,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, + u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; -- cgit From 9780c0ab1a4e64ef6998c4d83f9df5be806a02dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:28 +0200 Subject: bpf: export whether tail call has jited owner We already export through fdinfo whether a prog is JITed or not. Since a program load can fail when either the prog or the tail call map has the JITed property but the other does not, also exporting owner_jited of the tail call map facilitates error reporting in loaders like iproute2. We already export owner_prog_type through this facility, so the parser can pick up both for comparison. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d4d47de75bba..18980472f5b0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -216,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_map *map = filp->private_data; const struct bpf_array *array; u32 owner_prog_type = 0; + u32 owner_jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); owner_prog_type = array->owner_prog_type; + owner_jited = array->owner_jited; } seq_printf(m, @@ -236,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->map_flags, map->pages * 1ULL << PAGE_SHIFT); - if (owner_prog_type) + if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", owner_prog_type); + seq_printf(m, "owner_jited:\t%u\n", + owner_jited); + } } #endif -- cgit From 7bda4b40c5624c3f1c69227f8ebfd46a4b83f2ef Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:29 +0200 Subject: bpf: extend bpf_trace_printk to support %i Currently, bpf_trace_printk does not support the common formatting specifier '%i', even though vsprintf, which eventually gets called by the bpf helper, does. If users are used to '%i' and currently make use of it, bpf_trace_printk just returns an error without dumping anything to the trace pipe, so add support for '%i' to the helper. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5c6d538dbf43..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) } /* - * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed + * Only limited trace_printk() conversion specifiers allowed: + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } - if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; fmt_cnt++; } -- cgit From 43188702b3d98d2792969a3377a30957f05695e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:30 +0200 Subject: bpf, verifier: add additional patterns to evaluate_reg_imm_alu Currently the verifier does not track imm across alu operations when the source register is of unknown type. This adds additional pattern matching to catch this and track imm. We've seen LLVM generating this pattern while working on cilium. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f820a044079..6a86723c5b64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1657,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; + struct bpf_reg_state *src_reg = &regs[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); + + /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ + if (src_reg->imm > 0 && dst_reg->imm) { + switch (opcode) { + case BPF_ADD: + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + dst_reg->imm--; + break; + case BPF_AND: + /* dreg &= sreg + * AND can not extend zero bits only shrink + * Ex. 0x00..00ffffff + * & 0x0f..ffffffff + * ---------------- + * 0x00..00ffffff + */ + dst_reg->imm = max(src_reg->imm, 63 - imm_log2); + break; + case BPF_OR: + /* dreg |= sreg + * OR can only extend zero bits + * Ex. 0x00..00ffffff + * | 0x0f..ffffffff + * ---------------- + * 0x0f..00ffffff + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + break; + case BPF_SUB: + case BPF_MUL: + case BPF_RSH: + case BPF_LSH: + /* These may be flushed out later */ + default: + mark_reg_unknown_value(regs, insn->dst_reg); + } + } else { + mark_reg_unknown_value(regs, insn->dst_reg); + } + + dst_reg->type = UNKNOWN_VALUE; + return 0; +} + static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1666,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u64 dst_imm = dst_reg->imm; + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) + return evaluate_reg_imm_alu_unknown(env, insn); + /* dst_reg->type == CONST_IMM here. Simulate execution of insns * containing ALU ops. Don't care about overflow or negative * values, just add/sub/... them; registers are in u64. -- cgit From 3b9c08ae3dd44201b3a188aef34d6ddf73434015 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Jul 2017 11:53:40 +0200 Subject: Revert "sched/cputime: Refactor the cputime_adjust() code" This reverts commit 72298e5c92c50edd8cb7cfda4519483ce65fa166. As Peter explains:

> Argh, no... That code was perfectly fine. The new code otoh is
> convoluted.
>
> The old code had the following form:
>
>         if (exception1)
>                 deal with exception1
>
>         if (exception2)
>                 deal with exception2
>
>         do normal stuff
>
> Which is as simple and straight forward as it gets.
>
> The new code otoh reads like:
>
>         if (!exception1) {
>                 if (exception2)
>                         deal with exception 2
>                 else
>                         do normal stuff
>         }

So restore the old form. Also fix the comment describing the logic, as it was confusing. Requested-by: Peter Zijlstra Cc: Gustavo A. R.
Silva Cc: Frans Klaver Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Wanpeng Li Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 67c70e287647..84a419bdf5aa 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; /* - * If either stime or both stime and utime are 0, assume all runtime is - * userspace. Once a task gets some ticks, the monotonicy code at - * 'update' will ensure things converge to the observed ratio. + * If either stime or utime are 0, assume all runtime is userspace. + * Once a task gets some ticks, the monotonicy code at 'update:' + * will ensure things converge to the observed ratio. */ - if (stime != 0) { - if (utime == 0) - stime = rtime; - else - stime = scale_stime(stime, rtime, stime + utime); + if (stime == 0) { + utime = rtime; + goto update; } + if (utime == 0) { + stime = rtime; + goto update; + } + + stime = scale_stime(stime, rtime, stime + utime); + +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. -- cgit From e5682b4eecb2b73282853d0ef314d3164b986997 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 4 Jul 2017 11:25:15 +0200 Subject: genirq/debugfs: Fix build for !CONFIG_IRQ_DOMAIN Fix this build error: kernel/irq/internals.h:440:20: error: inlining failed in call to always_inline 'irq_domain_debugfs_init': function body not available kernel/irq/debugfs.c:202:2: note: called from here irq_domain_debugfs_init(root_dir); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Sebastian Ott Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1707041124000.1712@schleppi --- kernel/irq/internals.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9da14d125df4..dbfba9933ed2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -437,7 +437,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -static inline void irq_domain_debugfs_init(struct dentry *root); +static inline void irq_domain_debugfs_init(struct dentry *root) +{ +} # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) -- cgit From 2372a519f63829b8effcdde5f4564a7e036294f0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Jul 2017 12:06:01 +0200 Subject: genirq: Force inlining of __irq_startup_managed to prevent build failure If CONFIG_SMP=n, and gcc (e.g. 4.1.2) decides not to inline __irq_startup_managed(), the build fails with: kernel/built-in.o: In function `irq_startup': (.text+0x38ed8): undefined reference to `irq_set_affinity_locked' Fix this by forcing inlining of __irq_startup_managed(). 
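In sketch form (hypothetical names, not the kernel's actual code), the pattern this fix relies on is that force-inlining a constant-returning stub lets the compiler fold away the branch that references a symbol absent from the configuration:

	#define __always_inline inline __attribute__((always_inline))

	extern void smp_only_helper(void);	/* not built when the feature is off */

	static __always_inline int startup_mode(void)
	{
		return 0;			/* the !CONFIG_SMP stub */
	}

	void startup(void)
	{
		/*
		 * With startup_mode() force-inlined, the condition folds to
		 * false at compile time and no call to smp_only_helper() is
		 * ever emitted. If the compiler keeps a plain static function
		 * out of line, the dead branch survives and leaves an
		 * undefined reference at link time.
		 */
		if (startup_mode() != 0)
			smp_only_helper();
	}
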
Fixes: 761ea388e8c4e3ac ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1499162761-12398-1-git-send-email-geert@linux-m68k.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2e30d925a40d..aa5497dfb29e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -234,7 +234,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) return IRQ_STARTUP_MANAGED; } #else -static int +static __always_inline int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; -- cgit From 3a90795e1e885167209056a1a90be965add30e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:36 +0200 Subject: genirq: Move bus locking into __setup_irq() There is no point in having the irq_bus_lock() protection around all callers to __setup_irq(). Move it into __setup_irq(). This is also a preparatory patch for addressing the issues with the irq resource callbacks. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214343.960949031@linutronix.de --- kernel/irq/manage.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5c11c1730ba5..0934e02fa04e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + chip_bus_lock(desc); + /* * The following block of code has to be executed atomically */ @@ -1347,6 +1349,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); irq_setup_timings(desc, new); @@ -1378,6 +1381,8 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1417,9 +1422,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1674,9 +1677,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); @@ -1924,9 +1925,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1980,9 +1979,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); -- cgit From 9114014cf4e6df0b22d764380ae1fc54f1a7a8b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:37 +0200 Subject: 
genirq: Add mutex to irq desc to serialize request/free_irq() The irq_request/release_resources() callbacks are currently invoked under desc->lock with interrupts disabled. This is a source of problems on RT and conceptually not required. Add a separate mutex to struct irq_desc which makes it possible to serialize request/free_irq(), which can be used to move the resource functions out of the desc->lock held region. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.039220922@linutronix.de --- include/linux/irqdesc.h | 3 +++ kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 8 ++++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index d425a3a09722..3e90a094798d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -3,6 +3,7 @@ #include <linux/rcupdate.h> #include <linux/kobject.h> +#include <linux/mutex.h> /* * Core internal functions to deal with irq descriptors @@ -45,6 +46,7 @@ struct pt_regs; * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs + * @request_mutex: mutex to protect request/free before locking desc->lock * @dir: /proc/irq/ procfs entry * @debugfs_file: dentry for the debugfs file * @name: flow handler name for /proc/interrupts output @@ -96,6 +98,7 @@ struct irq_desc { struct rcu_head rcu; struct kobject kobj; #endif + struct mutex request_mutex; int parent_irq; struct module *owner; const char *name; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 948b50e78549..906a67e58391 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, affinity, owner); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0934e02fa04e..0139908b8d53 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + mutex_lock(&desc->request_mutex); + chip_bus_lock(desc); /* @@ -1350,6 +1352,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); @@ -1383,6 +1386,8 @@ out_unlock: chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1446,6 +1451,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); @@ -1521,6 +1527,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + mutex_unlock(&desc->request_mutex); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); -- cgit From 46e48e257360f0845fe17089713cbad4db611e70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:38 +0200 Subject: genirq: Move irq resource handling out of spinlocked region
Aside from being conceptually wrong, there is also an actual (hard to trigger and mostly theoretical) problem.

     CPU0                        CPU1
     free_irq(X)                 interrupt X
                                 spin_lock(desc->lock)
                                 wake irq thread()
                                 spin_unlock(desc->lock)
     spin_lock(desc->lock)
     remove action()
     shutdown_irq()
     release_resources()         thread_handler()
     spin_unlock(desc->lock)       access released resources.
     synchronize_irq()

Move the release resources invocation after synchronize_irq() so it's guaranteed that the threaded handler has finished. Move the resource request call out of the desc->lock held region as well, so the invocation context is the same for both request and release. This solves the problems with those functions on RT as well. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.117028181@linutronix.de --- kernel/irq/manage.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0139908b8d53..3e693430bfe1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1168,6 +1168,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags &= ~IRQF_ONESHOT; mutex_lock(&desc->request_mutex); + if (!desc->action) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mutex; + } + } chip_bus_lock(desc); @@ -1271,13 +1279,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - ret = irq_request_resources(desc); - if (ret) { - pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", - new->name, irq, desc->irq_data.chip->name); - goto out_unlock; - } - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1386,6 +1387,10 @@ out_unlock: chip_bus_sync_unlock(desc); + if (!desc->action) + irq_release_resources(desc); + +out_mutex: mutex_unlock(&desc->request_mutex); out_thread: @@ -1484,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_release_resources(desc); irq_remove_timings(desc); } @@ -1527,6 +1531,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + if (!desc->action) + irq_release_resources(desc); + mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); -- cgit From 2343877fbda701599653e63f8dcc318aa1bf15ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:39 +0200 Subject: genirq/timings: Move free timings out of spinlocked region No point in doing memory management from an interrupt-disabled, spin-locked region.
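In sketch form (details elided), the teardown order in __free_irq() after this series, per the diffs above and below, becomes:

	raw_spin_lock_irqsave(&desc->lock, flags);
	/* ... unlink the action, irq_shutdown() if it was the last one ... */
	raw_spin_unlock_irqrestore(&desc->lock, flags);

	if (!desc->action) {
		irq_release_resources(desc);
		irq_remove_timings(desc);	/* frees memory outside the raw spinlock */
	}
	mutex_unlock(&desc->request_mutex);
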
Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Daniel Lezcano Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.196130646@linutronix.de --- kernel/irq/manage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3e693430bfe1..91e1f2390752 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1489,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_remove_timings(desc); } #ifdef CONFIG_SMP @@ -1531,8 +1530,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } - if (!desc->action) + if (!desc->action) { irq_release_resources(desc); + irq_remove_timings(desc); + } mutex_unlock(&desc->request_mutex); -- cgit From 1d0c6e593023ac5dafc2ea2b3f23d96f1c1f2fa2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 30 Jun 2017 10:22:14 +0530 Subject: PM / sleep: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const.

File size before:
   text    data     bss     dec     hex filename
   3802     624      32    4458    116a kernel/power/main.o

File size after adding 'const':
   text    data     bss     dec     hex filename
   3866     560      32    4458    116a kernel/power/main.o

Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -705,7 +705,7 @@ static struct attribute * g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:07 +0200 Subject: vtime, sched/cputime: Remove vtime_account_user() It's an unnecessary function between vtime_user_exit() and account_user_time().
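In other words, the call chain collapses from

	vtime_user_exit() -> vtime_account_user() -> account_user_time()

to

	vtime_user_exit() -> account_user_time()
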
Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 9 +-------- kernel/sched/cputime.c | 12 ++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 0681fe25abeb..18b405e3cd93 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_account_user(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk); - -static inline void vtime_user_exit(struct task_struct *tsk) -{ - vtime_account_user(tsk); -} - +extern void vtime_user_exit(struct task_struct *tsk); extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 84a419bdf5aa..5adc896d0f64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -724,21 +724,21 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_account_user(struct task_struct *tsk) +void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); + __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_user_enter(struct task_struct *tsk) +void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); + tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; + account_user_time(tsk, get_vtime_delta(tsk)); write_seqcount_end(&tsk->vtime_seqcount); } -- cgit From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:08 +0200 Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime Even though it doesn't have functional consequences, setting the task's new context state after we actually accounted the pending vtime from the old context state makes more sense from a review perspective. vtime_user_exit() is the only function that doesn't follow that rule and that can bug the reviewer for a little while until he realizes there is no reason for this special case. 
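After the change, vtime_user_exit() follows the same account-then-switch order as the other transitions; this is the post-patch code from the diff below:

	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		account_user_time(tsk, get_vtime_delta(tsk));	/* charge the old (user) state */
	tsk->vtime_snap_whence = VTIME_SYS;			/* only then switch the state */
	write_seqcount_end(&tsk->vtime_seqcount);
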
Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5adc896d0f64..ab68927e8e94 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct *tsk) void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) account_user_time(tsk, get_vtime_delta(tsk)); + tsk->vtime_snap_whence = VTIME_SYS; write_seqcount_end(&tsk->vtime_seqcount); } -- cgit From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:09 +0200 Subject: sched/cputime: Rename vtime fields The current "snapshot"-based naming of the vtime fields suggests that we record some past event, but that is a low-level picture of their actual purpose, which comes out blurry. The real point of these fields is to run a basic state machine that tracks down cputime entry while switching between contexts. So let's reflect that with more meaningful names. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 ++-- include/linux/sched.h | 4 ++-- kernel/fork.c | 4 ++-- kernel/sched/cputime.c | 30 +++++++++++++++--------------- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e049526bc188..3d537331cd4e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -171,8 +171,8 @@ extern struct cred init_cred; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ - .vtime_snap = 0, \ - .vtime_snap_whence = VTIME_SYS, + .vtime_starttime = 0, \ + .vtime_state = VTIME_SYS, #else # define INIT_VTIME(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c4ca7433d9d..ff001646549e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -689,7 +689,7 @@ struct task_struct { struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_t vtime_seqcount; - unsigned long long vtime_snap; + unsigned long long vtime_starttime; enum { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, @@ -697,7 +697,7 @@ struct task_struct { VTIME_USER, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, - } vtime_snap_whence; + } vtime_state; #endif #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..83c4f9bf3e14 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1638,8 +1638,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime_seqcount); - p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_INACTIVE; + p->vtime_starttime = 0; + p->vtime_state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c
b/kernel/sched/cputime.c index ab68927e8e94..8c64753067c5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_snap)) + if (time_before(now, (unsigned long)tsk->vtime_starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_snap); + return jiffies_to_nsecs(now - tsk->vtime_starttime); } static u64 get_vtime_delta(struct task_struct *tsk) @@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_snap); + delta = jiffies_to_nsecs(now - tsk->vtime_starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap = now; + WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); + tsk->vtime_starttime = now; return delta - other; } @@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struct *tsk) { /* * The flags must be updated under the lock with - * the vtime_snap flush and update. + * the vtime_starttime flush and update. * That enforces a right ordering and update sequence * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. @@ -776,12 +776,12 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_snap_whence = VTIME_INACTIVE; + prev->vtime_state = VTIME_INACTIVE; write_seqcount_end(&prev->vtime_seqcount); write_seqcount_begin(&current->vtime_seqcount); - current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = jiffies; + current->vtime_state = VTIME_SYS; + current->vtime_starttime = jiffies; write_seqcount_end(&current->vtime_seqcount); } @@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); - t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = jiffies; + t->vtime_state = VTIME_SYS; + t->vtime_starttime = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } @@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) + if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); @@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) + if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * Task runs either in user or kernel space, add pending nohz time to * the right place.
*/ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_snap_whence == VTIME_SYS) + else if (t->vtime_state == VTIME_SYS) *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } -- cgit From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:10 +0200 Subject: sched/cputime: Move the vtime task fields to their own struct We are about to add vtime accumulation fields to the task struct. Let's avoid more bloatification and gather the vtime information into its own struct. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 +-- include/linux/sched.h | 26 ++++++----- kernel/fork.c | 6 +-- kernel/sched/cputime.c | 112 ++++++++++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3d537331cd4e..a2f6707e9fc0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -170,9 +170,9 @@ extern struct cred init_cred; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ - .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ - .vtime_starttime = 0, \ - .vtime_state = VTIME_SYS, + .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \ + .vtime.starttime = 0, \ + .vtime.state = VTIME_SYS, #else # define INIT_VTIME(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ff001646549e..eeff8a024f0c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -223,6 +223,21 @@ struct task_cputime { #define prof_exp stime #define sched_exp sum_exec_runtime +enum vtime_state { + /* Task is sleeping or running in a CPU with VTIME inactive: */ + VTIME_INACTIVE = 0, + /* Task runs in userspace in a CPU with VTIME active: */ + VTIME_USER, + /* Task runs in kernelspace in a CPU with VTIME active: */ + VTIME_SYS, +}; + +struct vtime { + seqcount_t seqcount; + unsigned long long starttime; + enum vtime_state state; +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ @@ -688,16 +703,7 @@ struct task_struct { u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_t vtime_seqcount; - unsigned long long vtime_starttime; - enum { - /* Task is sleeping or running in a CPU with VTIME inactive: */ - VTIME_INACTIVE = 0, - /* Task runs in userspace in a CPU with VTIME active: */ - VTIME_USER, - /* Task runs in kernelspace in a CPU with VTIME active: */ - VTIME_SYS, - } vtime_state; + struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/fork.c b/kernel/fork.c index 83c4f9bf3e14..d927ec11aa7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process( prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_init(&p->vtime_seqcount); - p->vtime_starttime = 0; - p->vtime_state = VTIME_INACTIVE; + seqcount_init(&p->vtime.seqcount); + p->vtime.starttime = 0; + p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8c64753067c5..9ee725edcbe0 100644 ---
a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static u64 vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_starttime)) + if (time_before(now, (unsigned long)vtime->starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_starttime); + return jiffies_to_nsecs(now - vtime->starttime); } -static u64 get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); u64 delta, other; @@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_starttime); + delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); - tsk->vtime_starttime = now; + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); + vtime->starttime = now; return delta - other; } static void __vtime_account_system(struct task_struct *tsk) { - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); + account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); } void vtime_account_system(struct task_struct *tsk) { - if (!vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + if (!vtime_delta(vtime)) return; - write_seqcount_begin(&tsk->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); + vtime->state = VTIME_USER; + write_seqcount_end(&vtime->seqcount); } void vtime_user_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); - tsk->vtime_snap_whence = VTIME_SYS; - write_seqcount_end(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) + account_user_time(tsk, get_vtime_delta(vtime)); + vtime->state = VTIME_SYS; + write_seqcount_end(&vtime->seqcount); } void vtime_guest_enter(struct task_struct *tsk) { + struct vtime *vtime = &tsk->vtime; /* * The flags must be updated under the lock with * the vtime_starttime flush and update. @@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. 
*/ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(get_vtime_delta(tsk)); + account_idle_time(get_vtime_delta(&tsk->vtime)); } void arch_vtime_task_switch(struct task_struct *prev) { - write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_state = VTIME_INACTIVE; - write_seqcount_end(&prev->vtime_seqcount); + struct vtime *vtime = &prev->vtime; - write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_state = VTIME_SYS; - current->vtime_starttime = jiffies; - write_seqcount_end(¤t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_INACTIVE; + write_seqcount_end(&vtime->seqcount); + + vtime = ¤t->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { + struct vtime *vtime = &t->vtime; unsigned long flags; local_irq_save(flags); - write_seqcount_begin(&t->vtime_seqcount); - t->vtime_state = VTIME_SYS; - t->vtime_starttime = jiffies; - write_seqcount_end(&t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } u64 task_gtime(struct task_struct *t) { + struct vtime *vtime = &t->vtime; unsigned int seq; u64 gtime; @@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t) return t->gtime; do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + gtime += vtime_delta(vtime); - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); return gtime; } @@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t) */ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - u64 delta; + struct vtime *vtime = &t->vtime; unsigned int seq; + u64 delta; if (!vtime_accounting_enabled()) { *utime = t->utime; @@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) + if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); + delta = vtime_delta(vtime); /* * Task runs either in user or kernel space, add pending nohz time to * the right place. 
*/ - if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_state == VTIME_SYS) + else if (vtime->state == VTIME_SYS) *stime += delta; - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 29 Jun 2017 19:15:11 +0200 Subject: sched/cputime: Accumulate vtime on top of nsec clocksource Currently the cputime source used by vtime is jiffies. When we cross a context boundary and jiffies have changed since the last snapshot, the pending cputime is accounted to the switching out context. This scheme works fine if the ticks are not aligned across CPUs. If they instead are aligned (i.e. all fire at the same time) and the CPUs run in userspace, the jiffies change is only observed on tick exit and therefore the user cputime is accounted as system cputime. This is because the CPU that maintains timekeeping fires its tick at the same time as the others. It updates jiffies in the middle of the tick and the other CPUs see that update on IRQ exit:

    CPU 0 (timekeeper)                 CPU 1
    -------------------                -------------
    jiffies = N
    ...                                run in userspace for a jiffy
    tick entry                         tick entry (sees jiffies = N)
    set jiffies = N + 1
    tick exit                          tick exit (sees jiffies = N + 1)
                                       account 1 jiffy as stime

Fix this by using a nanosecond clock source instead of jiffies. The cputime is then accumulated and flushed every time the pending delta reaches a jiffy, in order to mitigate the accounting overhead. [ fweisbec: changelog, rebase on struct vtime, field renames, add delta on cputime readers, keep idle vtime as-is (low overhead accounting), harmonize clock sources.
] Suggested-by: Thomas Gleixner Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Signed-off-by: Wanpeng Li Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index eeff8a024f0c..4818126c5153 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -236,6 +236,9 @@ struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; + u64 utime; + u64 stime; + u64 gtime; }; struct sched_info { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9ee725edcbe0..6e3ea4ac1bda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static u64 vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); + unsigned long long clock; - if (time_before(now, (unsigned long)vtime->starttime)) + clock = sched_clock_cpu(smp_processor_id()); + if (clock < vtime->starttime) return 0; - return jiffies_to_nsecs(now - vtime->starttime); + return clock - vtime->starttime; } static u64 get_vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); - u64 delta, other; + u64 delta = vtime_delta(vtime); + u64 other; /* * Unlike tick based timing, vtime based timing never has lost @@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. 
*/ - delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); - vtime->starttime = now; + vtime->starttime += delta; return delta - other; } -static void __vtime_account_system(struct task_struct *tsk) +static void __vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { - account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); + vtime->stime += get_vtime_delta(vtime); + if (vtime->stime >= TICK_NSEC) { + account_system_time(tsk, irq_count(), vtime->stime); + vtime->stime = 0; + } +} + +static void vtime_account_guest(struct task_struct *tsk, + struct vtime *vtime) +{ + vtime->gtime += get_vtime_delta(vtime); + if (vtime->gtime >= TICK_NSEC) { + account_guest_time(tsk, vtime->gtime); + vtime->gtime = 0; + } } void vtime_account_system(struct task_struct *tsk) @@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + /* We might have scheduled out from guest path */ + if (current->flags & PF_VCPU) + vtime_account_guest(tsk, vtime); + else + __vtime_account_system(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - account_user_time(tsk, get_vtime_delta(vtime)); + vtime->utime += get_vtime_delta(vtime); + if (vtime->utime >= TICK_NSEC) { + account_user_time(tsk, vtime->utime); + vtime->utime = 0; + } vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } @@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); current->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + vtime_account_guest(tsk, vtime); current->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(smp_processor_id()); write_seqcount_end(&vtime->seqcount); } @@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(cpu); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } @@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t) gtime = t->gtime; if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(vtime); + gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * the right place. 
*/ if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += delta; + *utime += vtime->utime + delta; else if (vtime->state == VTIME_SYS) - *stime += delta; + *stime += vtime->stime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit From a0c4acd2c220376b4e9690e75782d0c0afdaab9f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 16 Jun 2017 16:44:34 +0300 Subject: locking/rwsem-spinlock: Fix EINTR branch in __down_write_common() If a writer could have been woken up, the above branch if (sem->count == 0) break; would have moved us to taking the sem. So it's not the time to wake a writer now; only readers are allowed now. Thus, 0 must be passed to __rwsem_do_wake(). Next, __rwsem_do_wake() wakes readers unconditionally. But we mustn't do that if the sem is currently owned by a writer. Otherwise, a writer and a reader would own the sem at the same time, which leads to memory corruption in callers. rwsem-xadd.c does not need that, as: 1) a similar check is made lockless there, 2) in __rwsem_mark_wake::try_reader_grant we test that the sem is not owned by a writer. Signed-off-by: Kirill Tkhai Acked-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Niklas Cassel Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 17fcbd590d0c "locking/rwsem: Fix down_write_killable() for CONFIG_RWSEM_GENERIC_SPINLOCK=y" Link: http://lkml.kernel.org/r/149762063282.19811.9129615532201147826.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); - if (!list_empty(&sem->wait_list)) - __rwsem_do_wake(sem, 1); + if (!list_empty(&sem->wait_list) && sem->count >= 0) + __rwsem_do_wake(sem, 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return -EINTR; -- cgit From 69d71879d2cf67a381055f698a1d7def00dc4ed7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Jul 2017 09:45:43 -0400 Subject: ftrace: Test for NULL iter->tr in regex for stack_trace_filter changes When writing into stack_trace_filter, iter->tr is not set and is NULL. Check if it is NULL before dereferencing it in ftrace_regex_release().
Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f8c18f15b190..2953d558bbee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5043,7 +5043,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (!list_empty(&iter->tr->mod_trace)) + if (iter->tr && !list_empty(&iter->tr->mod_trace)) iter->hash->flags |= FTRACE_HASH_FL_MOD; } else orig_hash = &iter->ops->func_hash->notrace_hash; -- cgit From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 7 Jun 2017 13:18:57 -0600 Subject: sched/fair: Fix load_balance() affinity redo path If load_balance() fails to migrate any tasks because all tasks were affined, load_balance() removes the source CPU from consideration and attempts to redo and balance among the new subset of CPUs. There is a bug in this code path where the algorithm considers all active CPUs in the system (minus the source that was just masked out). This is not valid for two reasons: some active CPUs may not be in the current scheduling domain and one of the active CPUs is dst_cpu. These CPUs should not be considered, as we cannot pull load from them. Instead of failing out of load_balance(), we may end up redoing the search with no valid CPUs and incorrectly concluding the domain is balanced. Additionally, if the group_imbalance flag was just set, it may also be incorrectly unset, thus the flag will not be seen by other CPUs in future load_balance() runs as that algorithm intends. Fix the check by removing CPUs not in the current domain and the dst_cpu from consideration, thus limiting the evaluation to valid remaining CPUs from which load might be migrated. Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Tested-by: Tyler Baicar Signed-off-by: Jeffrey Hugo Acked-by: Peter Zijlstra Cc: Austin Christ Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Timur Tabi Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 008c514dc241..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Also avoid computing new_dst_cpu if we have already computed - * one in current iteration. + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have + * already computed one in current iteration.
*/ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ @@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .tasks = LIST_HEAD_INIT(env.tasks), }; - /* - * For NEWLY_IDLE load_balancing, we don't need to consider - * other cpus in our group - */ - if (idle == CPU_NEWLY_IDLE) - env.dst_grpmask = NULL; - - cpumask_copy(cpus, cpu_active_mask); + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); @@ -8151,7 +8144,15 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * Attempting to continue load balancing at the current + * sched_domain level only makes sense if there are + * active CPUs remaining as possible busiest CPUs to + * pull load from which are not contained within the + * destination group that is receiving any migrated + * load. + */ + if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, + /* + * can_migrate_task() doesn't need to compute new_dst_cpu + * for active balancing. Since we have CPU_IDLE, but no + * @dst_grpmask we need to make that test go away with lying + * about DST_PINNED. + */ + .flags = LBF_DST_PINNED, }; schedstat_inc(sd->alb_count); -- cgit From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jul 2017 16:24:49 +0100 Subject: bpf: Implement show_options Implement the show_options superblock op for bpf as part of a bid to get rid of s_options and generic_show_options() to make it easier to implement a context-based mount where the mount options can be passed individually over a file descriptor. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: netdev@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. 
+ */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; -- cgit From 9cd4f1a4e7a858849e889a081a99adff83e08e4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jul 2017 22:20:23 +0200 Subject: smp/hotplug: Move unparking of percpu threads to the control CPU Vikram reported the following backtrace:

    BUG: scheduling while atomic: swapper/7/0/0x00000002
    CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.32-perf+ #680
      schedule
      schedule_hrtimeout_range_clock
      schedule_hrtimeout
      wait_task_inactive
      __kthread_bind_mask
      __kthread_bind
      __kthread_unpark
      kthread_unpark
      cpuhp_online_idle
      cpu_startup_entry
      secondary_start_kernel

He analyzed correctly that a parked cpu hotplug thread of an offlined CPU was still on the runqueue when the CPU came back online and tried to unpark it. This causes the thread which invoked kthread_unpark() to call wait_task_inactive() and subsequently schedule() with preemption disabled. His proposed workaround was to "make sure" that a parked thread has scheduled out when the CPU goes offline, so the situation cannot happen. But that's still wrong because the root cause is not the fact that the percpu thread is still on the runqueue, nor that preemption is disabled, which could be simply solved by enabling preemption before calling kthread_unpark(). The real issue is that the calling thread is the idle task of the upcoming CPU, which is not supposed to call anything which might sleep. The moron, who wrote that code, missed completely that kthread_unpark() might end up in schedule(). The solution is simpler than expected. The thread which controls the hotplug operation is waiting for the CPU to call complete() on the hotplug state completion. So the idle task of the upcoming CPU can set its state to CPUHP_AP_ONLINE_IDLE and invoke complete(). This in turn wakes the control task on a different CPU, which then can safely do the unpark and kick the now unparked hotplug thread of the upcoming CPU to complete the bringup to the final target state.
    Control CPU                       AP

    bringup_cpu();
      __cpu_up()  ------------>
                                      bringup_ap();
      bringup_wait_for_ap()
        wait_for_completion();
                                      cpuhp_online_idle();
                  <------------       complete();
        unpark(AP->stopper);
        unpark(AP->hotplugthread);
        while(1)
                                      do_idle();
        kick(AP->hotplugthread);
        wait_for_completion();        hotplug_thread()
                                        run_online_callbacks();
                                        complete();

Fixes: 8df3e07e7f21 ("cpu/hotplug: Let upcoming cpu bring itself fully up") Reported-by: Vikram Mulukutla Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Sebastian Sewior Cc: Rusty Russell Cc: Tejun Heo Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707042218020.2131@nanos Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b03a32595cfe..ab860453841d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -271,11 +271,25 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); + static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); + BUG_ON(!cpu_online(cpu)); + + /* Unpark the stopper thread and the hotplug thread of the target cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) { + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + } return st->result; } @@ -296,9 +310,7 @@ static int bringup_cpu(unsigned int cpu) irq_unlock_sparse(); if (ret) return ret; - ret = bringup_wait_for_ap(cpu); - BUG_ON(!cpu_online(cpu)); - return ret; + return bringup_wait_for_ap(cpu); } /* @@ -767,31 +779,20 @@ void notify_cpu_starting(unsigned int cpu) } /* - * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread and unpark the smpboot threads. If the target state is - * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the - * cpu further. + * Called from the idle task. Wake up the controlling task which brings the + * stopper and the hotplug thread of the upcoming CPU up and then delegates + * the rest of the online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - unsigned int cpu = smp_processor_id(); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; st->state = CPUHP_AP_ONLINE_IDLE; - - /* Unpark the stopper thread and the hotplug thread of this cpu */ - stop_machine_unpark(cpu); - kthread_unpark(st->thread); - - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) - __cpuhp_kick_ap_work(st); - else - complete(&st->done); + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -- cgit From 99c621d704cf1c4eb74c3c42e674edf3df64f92d Mon Sep 17 00:00:00 2001 From: Michael Sartain Date: Wed, 5 Jul 2017 22:07:15 -0600 Subject: tracing: Add saved_tgids file to show cached pid to tgid mappings Export the cached pid / tgid mappings in the debugfs tracing saved_tgids file. This allows user apps to translate the pids from a trace to their respective thread group.
Example saved_tgids file with pid / tgid values separated by ' ':

    # cat saved_tgids
    1048 1048
    1047 1047
    7 7
    1049 1047
    1054 1047
    1053 1047

Link: http://lkml.kernel.org/r/20170630004023.064965233@goodmis.org Link: http://lkml.kernel.org/r/20170706040713.unwkumbta5menygi@mikesart-cos Reviewed-by: Joel Fernandes Signed-off-by: Michael Sartain Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00e2e4169b1e..f079a8ca1117 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4688,6 +4688,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -7920,6 +7990,9 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); + trace_eval_init(); trace_create_eval_file(d_tracer); -- cgit From c80081b9209713e0fe86d3def395a9fc66503c58 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 6 Jul 2017 14:29:04 +0200 Subject: genirq: Allow to pass the IRQF_TIMER flag with percpu irq request The irq timings infrastructure tracks when interrupts occur in order to statistically predict the next interrupt event. There is no point in tracking timer interrupts and trying to predict them because the next expiration time is already known. This can be avoided via the IRQF_TIMER flag which is passed by timer drivers in request_irq(). It marks the interrupt as timer based, which allows the timings code to ignore these interrupts. Per CPU interrupts which are requested via request_percpu_irq() have no flag argument, so marking per cpu timer interrupts is not possible and they get tracked pointlessly. Add __request_percpu_irq() as a variant of request_percpu_irq() with a flags argument and make request_percpu_irq() an inline wrapper passing flags = 0.
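For illustration, a per-CPU timer driver could then mark its interrupt as timer based like this (a sketch only; the IRQ number, handler, and per-CPU device id below are hypothetical, not taken from this patch):

    static DEFINE_PER_CPU(int, my_timer_dev);		/* hypothetical cookie */

    static irqreturn_t my_timer_handler(int irq, void *dev_id)
    {
    	/* dev_id is this CPU's instance of my_timer_dev */
    	return IRQ_HANDLED;
    }

    /* IRQF_TIMER makes the irq timings code skip this interrupt */
    err = __request_percpu_irq(timer_irq, my_timer_handler, IRQF_TIMER,
    			       "my-timer", &my_timer_dev);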
The flag parameter is restricted to IRQF_TIMER as all other IRQF_ flags make no sense for per cpu interrupts. The next step is to convert all existing users of request_percpu_irq() and then remove the wrapper and the underscores. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: nicolas.pitre@linaro.org Cc: vincent.guittot@linaro.org Cc: rafael@kernel.org Link: http://lkml.kernel.org/r/1499344144-3964-1-git-send-email-daniel.lezcano@linaro.org --- include/linux/interrupt.h | 11 ++++++++++- kernel/irq/manage.c | 15 ++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 37f8e354f564..5ac6e238555e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -152,8 +152,17 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check +__request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *percpu_dev_id); + +static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *percpu_dev_id); + const char *devname, void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, percpu_dev_id); +} extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91e1f2390752..5624b2dd6b58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1950,9 +1950,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * request_percpu_irq - allocate a percpu interrupt line + * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * @@ -1965,8 +1966,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * the handler gets called with the interrupted CPU's instance of * that variable. */ -int request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev_id) +int __request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -1980,12 +1982,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + if (flags && flags != IRQF_TIMER) + return -EINVAL; + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; @@ -2004,7 +2009,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } -EXPORT_SYMBOL_GPL(request_percpu_irq); +EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. 
-- cgit From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 6 Jul 2017 15:35:31 -0700 Subject: kernel/extable.c: mark core_kernel_text notrace core_kernel_text is used by MIPS in its function graph trace processing, so having this method traced leads to an infinite set of recursive calls such as:

    Call Trace:
      ftrace_return_to_handler+0x50/0x128
      core_kernel_text+0x10/0x1b8
      prepare_ftrace_return+0x6c/0x114
      ftrace_graph_caller+0x20/0x44
      return_to_handler+0x10/0x30
      return_to_handler+0x0/0x30
      return_to_handler+0x0/0x30
      ftrace_ops_no_ops+0x114/0x1bc
      core_kernel_text+0x10/0x1b8
      core_kernel_text+0x10/0x1b8
      core_kernel_text+0x10/0x1b8
      ftrace_ops_no_ops+0x114/0x1bc
      core_kernel_text+0x10/0x1b8
      prepare_ftrace_return+0x6c/0x114
      ftrace_graph_caller+0x20/0x44
      (...)

Mark the function notrace to avoid it being traced. Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Meyer Cc: Ingo Molnar Cc: Steven Rostedt Cc: Daniel Borkmann Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..223df4a328a4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) -- cgit From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:55 -0700 Subject: kernel/power/snapshot.c: use linux/set_memory.h This header always exists, so it doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7708e319941..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,15 +30,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif #include "power.h" -- cgit From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:58 -0700 Subject: kernel/module.c: use linux/set_memory.h This header always exists, so it doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. The usages of set_memory_xx() are still guarded by CONFIG_STRICT_MODULE_RWX.
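For illustration, the pattern both of these cleanups rely on looks like this (a sketch; the helper is hypothetical, and set_memory_ro() resolves to either the real arch implementation or an empty stub supplied by the header):

    #include <linux/set_memory.h>	/* no ifdef needed around this */

    static void protect_pages(unsigned long addr, int nr_pages)	/* hypothetical */
    {
    #ifdef CONFIG_STRICT_MODULE_RWX
    	set_memory_ro(addr, nr_pages);
    #endif
    }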
Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d7eb41d772c4..8f883d86cedc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,9 +49,7 @@ #include #include #include -#ifdef CONFIG_STRICT_MODULE_RWX -#include -#endif +#include #include #include #include -- cgit From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associated with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining in 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually:

    $ echo 0x100000000 > /sys/devices/system/memory/probe
    $ grep . /sys/devices/system/memory/memory32/valid_zones
    Normal Movable

    $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe
    $ grep . /sys/devices/system/memory/memory3?/valid_zones
    /sys/devices/system/memory/memory32/valid_zones:Normal
    /sys/devices/system/memory/memory33/valid_zones:Normal Movable

    $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe
    $ grep . /sys/devices/system/memory/memory3?/valid_zones
    /sys/devices/system/memory/memory32/valid_zones:Normal
    /sys/devices/system/memory/memory33/valid_zones:Normal
    /sys/devices/system/memory/memory34/valid_zones:Normal Movable

    $ echo online_movable > /sys/devices/system/memory/memory34/state
    $ grep . /sys/devices/system/memory/memory3?/valid_zones
    /sys/devices/system/memory/memory32/valid_zones:Normal
    /sys/devices/system/memory/memory33/valid_zones:Normal Movable
    /sys/devices/system/memory/memory34/valid_zones:Movable Normal

This is an awkward semantic because a udev event is sent as soon as the block is onlined and a udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements
	- existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap
	- ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses
the latter one is not an inherent requirement and can be changed in the future.
It preserves the current behavior and makes the code slightly simpler. This is subject to change in the future. This means that the same physical online steps as above will lead to the following state:

    Normal Movable

    /sys/devices/system/memory/memory32/valid_zones:Normal Movable
    /sys/devices/system/memory/memory33/valid_zones:Normal Movable

    /sys/devices/system/memory/memory32/valid_zones:Normal Movable
    /sys/devices/system/memory/memory33/valid_zones:Normal Movable
    /sys/devices/system/memory/memory34/valid_zones:Normal Movable

    /sys/devices/system/memory/memory32/valid_zones:Normal Movable
    /sys/devices/system/memory/memory33/valid_zones:Normal Movable
    /sys/devices/system/memory/memory34/valid_zones:Movable

Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed simplifying the arch_add_memory code (s390 could get rid of quite some code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it hooks into the memory hotplug machinery only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable), which used to allow changing its movable type. This will be handled later.
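For illustration, the resulting two-step flow for the ZONE_DEVICE case looks like this (condensed from the kernel/memremap.c hunk in the diff below):

    error = arch_add_memory(nid, align_start, align_size, true);
    if (!error)
    	/* pages are only marked reserved so far; now bind them to a zone */
    	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
    			       align_start >> PAGE_SHIFT,
    			       align_size >> PAGE_SHIFT);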
[richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +- arch/powerpc/mm/mem.c | 10 +- arch/s390/mm/init.c | 30 +----- arch/sh/mm/init.c | 8 +- arch/x86/mm/init_32.c | 5 +- arch/x86/mm/init_64.c | 9 +- drivers/base/memory.c | 52 ++++++----- include/linux/memory_hotplug.h | 13 +-- include/linux/mmzone.h | 16 ++++ kernel/memremap.c | 4 + mm/memory_hotplug.c | 201 +++++++++++++++++++++++++---------------- mm/sparse.c | 3 +- 12 files changed, 185 insertions(+), 175 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, 
nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. 
- */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1e884d82af6f..b86fda30ce62 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; - - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. 
Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf7e08c5a8..abc1641011f2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +/* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone) || + start_pfn + nr_pages <= zone->zone_start_pfn) + return false; + + return true; +} + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. 
A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2ebe9ad7f6c..9438ffe24cb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -433,25 +433,6 @@ out_fail: return -1; } -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn, bool want_memblock) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; + int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; + if (!pfn_valid(pfn)) + continue; - if (ret < 0) - return ret; + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); + } if (!want_memblock) return 0; @@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. 
*/ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, bool want_memblock) { unsigned long i; @@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. 
+ * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(normal_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; - } +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP inherits the current zone which is + * ZONE_NORMAL by default but we might be within ZONE_MOVABLE + * already. 
 */ + if (zone_intersects(movable_zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(pfn_to_nid(pfn))) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) + if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) diff --git a/mm/sparse.c b/mm/sparse.c index 9d7fd666015e..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; -- cgit From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets a for_device argument which then controls whether we want to create memblocks for the created memory sections. Simplify the logic by stating directly whether we want memblocks, rather than going through a pointless negation. This also makes the API easier to understand, because the call site says what we want instead of passing a for_device flag that can mean anything. This shouldn't introduce any functional change.
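To make the flipped polarity concrete, here is a minimal editorial sketch (not part of the patch; the two call sites are the ones changed in the diffs below):

	/* Device memory (kernel/memremap.c) never wants memblocks: */
	arch_add_memory(nid, align_start, align_size, false);

	/* Regular memory hotplug (mm/memory_hotplug.c) always does: */
	arch_add_memory(nid, start, size, true);

Previously each arch implementation had to negate for_device before handing it on to __add_pages().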
Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- kernel/memremap.c | 2 +- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have 
ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4263fa6f2ab4..9b04cf5ea813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; -- cgit From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:32 -0700 Subject: kernel/exit.c: don't include unused userfaultfd_k.h Commit dd0db88d8094 ("userfaultfd: 
non-cooperative: rollback userfaultfd_exit") removed the userfaultfd callback from exit(), which makes the include of userfaultfd_k.h unnecessary. Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b0cc86a2d00b..2bbc23273e2f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:11 -0700 Subject: mm: update callers to use HASH_ZERO flag Update dcache, inode, pid, mountpoint, and mount hash tables to use HASH_ZERO, and remove initialization after allocations. In places where HASH_EARLY was used, such as __pv_init_lock_hash(), a zeroed hash table was already assumed, because memblock zeroes the memory.
CPU: SPARC M6, Memory: 7T
Before fix:
Dentry cache hash table entries: 1073741824
Inode-cache hash table entries: 536870912
Mount-cache hash table entries: 16777216
Mountpoint-cache hash table entries: 16777216
ftrace: allocating 20414 entries in 40 pages
Total time: 11.798s
After fix:
Dentry cache hash table entries: 1073741824
Inode-cache hash table entries: 536870912
Mount-cache hash table entries: 16777216
Mountpoint-cache hash table entries: 16777216
ftrace: allocating 20414 entries in 40 pages
Total time: 3.198s
CPU: Intel Xeon E5-2630, Memory: 2.2T:
Before fix:
Dentry cache hash table entries: 536870912
Inode-cache hash table entries: 268435456
Mount-cache hash table entries: 8388608
Mountpoint-cache hash table entries: 8388608
CPU: Physical Processor ID: 0
Total time: 3.245s
After fix:
Dentry cache hash table entries: 536870912
Inode-cache hash table entries: 268435456
Mount-cache hash table entries: 8388608
Mountpoint-cache hash table entries: 8388608
CPU: Physical Processor ID: 0
Total time: 3.244s
Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 18 ++++-------------- fs/inode.c | 14 ++------------ fs/namespace.c | 10 ++-------- kernel/locking/qspinlock_paravirt.h | 3 ++- kernel/pid.c | 7 ++----- 5 files changed, 12 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index a9f995f6859e..a140fe1dbb1a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3546,8 +3546,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3559,24 +3557,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache.
*/ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3590,14 +3583,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/inode.c b/fs/inode.c index ab3b9a795c0b..5cbc8e6e9390 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1915,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1928,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1959,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/namespace.c b/fs/namespace.c index f70914a859a4..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3248,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); 
} void __init pidmap_init(void) -- cgit From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:06 -0700 Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +-- include/uapi/linux/mempolicy.h | 8 ---- kernel/cgroup/cpuset.c | 4 +- mm/mempolicy.c | 102 ++++++++--------------------------------- 4 files changed, 21 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ecb6cbeede5a..3a58b4be1b0c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, @@ -260,8 +259,7 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new, - enum mpol_rebind_step step) + const nodemask_t *new) { } diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 9cd8b21dddbe..2a4d89508fec 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,13 +24,6 @@ enum { MPOL_MAX, /* always last member of enum */ }; -enum mpol_rebind_step { - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ - MPOL_REBIND_NSTEP, -}; - /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -65,7 +58,6 @@ enum mpol_rebind_step { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ 
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..5fd1bdbaa381 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; if (need_loop) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c60807625fd5..047181452040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. 
*/ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; -- cgit From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:09 -0700 Subject: mm, cpuset: always use seqlock when changing task's nodemask When updating task's mems_allowed and rebinding its mempolicy due to cpuset's mems being changed, we currently only take the seqlock for writing when either the task has a mempolicy, or the new mems has no intersection with the old mems. This should be enough to prevent a parallel allocation seeing no available nodes, but the optimization is IMHO unnecessary (cpuset updates should not be frequent), and we still potentially risk issues if the intersection of new and old nodes has limited amount of free/reclaimable memory. Let's just use the seqlock for all tasks. Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5fd1bdbaa381..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. 
- */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } -- cgit From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:49 -0700 Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages The kmem-specific functions do the same thing. Switch and drop. Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 17 ----------------- kernel/fork.c | 8 ++++---- mm/slab.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa506ae61d66..5a72d8377942 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -929,19 +929,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/** - * memcg_kmem_update_page_stat - update kmem page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - */ -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ - if (memcg_kmem_enabled() && page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); -} - #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -964,10 +951,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ -} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..aa01b810c0bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } diff --git a/mm/slab.h b/mm/slab.h index 69f0579cb5aa..7b84e3839dfe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -285,10 +285,10 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) return ret; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << order); return 0; } @@ -298,10 +298,10 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, if (!memcg_kmem_enabled()) return; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit From f610c9d68b1a47f539b7764f4b5ce07d32fb9ae1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Jul 2017 08:57:57 +0200 Subject: genirq/debugfs: Remove redundant NULL pointer check debugfs_remove() can be called with a NULL pointer. Fixes: 087cdfb662ae5 ("genirq/debugfs: Add proper debugfs interface") Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 14fe862aa2e3..ed47688b8e79 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1667,8 +1667,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - if (d->debugfs_file) - debugfs_remove(d->debugfs_file); + debugfs_remove(d->debugfs_file); } void __init irq_domain_debugfs_init(struct dentry *root) -- cgit From c5c601c4295f89368f4a304cb3ae4aebdf80db22 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 7 Jul 2017 09:39:59 +0100 Subject: irqdomain: Allow ACPI device nodes to be used as irqdomain identifiers A number of irqchip implementations are (ab)using the irqdomain allocator by passing a fwnode that is neither a FWNODE_OF nor a FWNODE_IRQCHIP. This is pretty bad, but it also feels pretty crap to force these drivers to allocate their own irqchip_fwid when they already have a proper fwnode. Instead, let's teach the irqdomain allocator about ACPI device nodes, and add some lovely name generation code... Tested on an arm64 D05 system.
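As an editorial illustration of the driver-side effect (adev, nr_irqs, my_ops and priv are made-up names, not from the patch):

	/* An ACPI-probed irqchip can now hand its existing device node
	 * straight to the irqdomain allocator... */
	struct fwnode_handle *fwnode = acpi_fwnode_handle(adev);
	struct irq_domain *d = irq_domain_create_linear(fwnode, nr_irqs,
							&my_ops, priv);
	/* ...and __irq_domain_add() derives a readable domain name from
	 * the ACPI path via acpi_get_name(), e.g. "\_SB.MBI0". */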
Reported-and-tested-by: John Garry Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Agustin Vega-Frias Cc: Ma Jun Cc: Hanjun Guo Link: http://lkml.kernel.org/r/20170707083959.10349-1-marc.zyngier@arm.com --- kernel/irq/irqdomain.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ed47688b8e79..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "irq: " fmt +#include #include #include #include @@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } +#ifdef CONFIG_ACPI + } else if (is_acpi_device_node(fwnode)) { + struct acpi_buffer buf = { + .length = ACPI_ALLOCATE_BUFFER, + }; + acpi_handle handle; + + handle = acpi_device_handle(to_acpi_device_node(fwnode)); + if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { + domain->name = buf.pointer; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + domain->fwnode = fwnode; +#endif } else if (of_node) { char *name; -- cgit From eaf260ac04d9b4cf9f458d5c97555bfff2da526e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:21 -0700 Subject: tracing: Treat recording comm for idle task as a success Currently we stop recording comm for non-idle tasks when switching from/to the idle task, since we treat that as a record failure. Fix that by treating the recording of comm for the idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f079a8ca1117..6722d86f2af5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* -- cgit From bd45d34d25720a820021c8ea45de5cd607eace64 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:22 -0700 Subject: tracing: Treat recording tgid for idle task as a success Currently we stop recording tgid for non-idle tasks when switching from/to the idle task, since we treat that as a record failure. Fix that by treating the recording of tgid for the idle task as a success.
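For context, this is the caller pattern that turned one failed save into losing both records (excerpted from the lines removed by the following patch):

	/* A "failed" save for the idle task (pid 0) aborted the whole
	 * record, so the comm of the other task was lost as well: */
	if ((flags & TRACE_RECORD_CMDLINE) &&
	    (!trace_save_cmdline(prev) || !trace_save_cmdline(next)))
		return;

Returning 1 for pid 0 keeps that early return from firing on every switch to or from idle.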
Link: http://lkml.kernel.org/r/20170706230023.17942-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6722d86f2af5..aee11e3a394f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2006,7 +2006,11 @@ int trace_find_tgid(int pid) static int trace_save_tgid(struct task_struct *tsk) { - if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) return 0; tgid_map[tsk->pid] = tsk->tgid; -- cgit From 29b1a8ad7df4528b862a79e3d5fb0936f4d199c7 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:23 -0700 Subject: tracing: Attempt to record other information even if some fail In recent patches where we record comm and tgid at the same time, we skip continuing to record if any fail. Fix that by trying to record as many things as we can even if some couldn't be recorded. If any information isn't recorded, then we don't set trace_taskinfo_save as before. Link: http://lkml.kernel.org/r/20170706230023.17942-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aee11e3a394f..92af8fd1429b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags) */ void tracing_record_taskinfo(struct task_struct *task, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) - return; - if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -2058,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && - (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) - return; + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - if ((flags & TRACE_RECORD_TGID) && - (!trace_save_tgid(prev) || !trace_save_tgid(next))) + /* If recording any information failed, retry again soon. 
 */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); -- cgit From 5671360f29c68d9079914438f6a0109ef62f82a8 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 8 Jul 2017 04:56:58 +0900 Subject: locking/qspinlock: Explicitly include asm/prefetch.h In architectures that use qspinlock, like x86, prefetch is loaded indirectly via the asm/qspinlock.h include. On other architectures, like OpenRISC, which may want to use asm-generic/qspinlock.h, the build will fail without the asm/prefetch.h include. Fix this by including it directly. Signed-off-by: Stafford Horne Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170707195658.23840-1-shorne@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include -- cgit From 634a81609561f05266e1f625b6f2567c2e0b0419 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Jul 2017 11:26:39 -0400 Subject: fix waitid(2) breakage We lose the distinction between "found a PID" and "nothing, but that's not an error" a bit too early in waitid(). Easily fixed, fortunately... Reported-by: Markus Trippelsdorf Fixes: 67d7ddded322 ("waitid(2): leave copyout of siginfo to syscall itself") Tested-by: Markus Trippelsdorf Signed-off-by: Al Viro --- kernel/exit.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2bbc23273e2f..608c9775a37b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1590,9 +1590,6 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) - ret = 0; - put_pid(pid); return ret; } @@ -1603,6 +1600,11 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) @@ -1612,7 +1614,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); @@ -1714,6 +1716,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err && uru) { /* kernel_waitid() overwrites everything in ru */ @@ -1729,7 +1736,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); -- cgit From fca18a47cf3eb8425ec19c2dfc374f3d04f5219f Mon Sep 17 00:00:00 2001 From: "Naveen N.
Rao" Date: Sat, 8 Jul 2017 00:27:30 +0530 Subject: trace/kprobes: Sanitize derived event names When we derive event names, convert some expected symbols (such as ':' used to specify module:name and '.' present in some symbols) into underscores so that the event name is not rejected. Before this patch: # echo 'p kobject_example:foo_store' > kprobe_events trace_kprobe: Failed to allocate trace_probe.(-22) -sh: write error: Invalid argument After this patch: # echo 'p kobject_example:foo_store' > kprobe_events # cat kprobe_events p:kprobes/p_kobject_example_foo_store_0 kobject_example:foo_store Link: http://lkml.kernel.org/r/66c189e09e71361aba91dd4a5bd146a1b62a7a51.1499453040.git.naveen.n.rao@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..44fd819aa33d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* @@ -740,6 +748,7 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, -- cgit From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:02 -0700 Subject: mm: make PR_SET_THP_DISABLE immediately active PR_SET_THP_DISABLE has a rather subtle semantic. It doesn't affect any existing mapping because it only updated mm->def_flags which is a template for new mappings. The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE flag set. This can be quite surprising for all those applications which do not do prctl(); fork() & exec() and want to control their own THP behavior. Another usecase when the immediate semantic of the prctl might be useful is a combination of pre- and post-copy migration of containers with CRIU. In this case CRIU populates a part of a memory region with data that was saved during the pre-copy stage. Afterwards, the region is registered with userfaultfd and CRIU expects to get page faults for the parts of the region that were not yet populated. However, khugepaged collapses the pages and the expected page faults do not occur. In more general case, the prctl(PR_SET_THP_DISABLE) could be used as a temporary mechanism for enabling/disabling THP process wide. Implementation wise, a new MMF_DISABLE_THP flag is added. This flag is tested when decision whether to use huge pages is taken either during page fault of at the time of THP collapse. It should be noted, that the new implementation makes PR_SET_THP_DISABLE master override to any per-VMA setting, which was not the case previously. 
Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE") Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: "Kirill A. Shutemov" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 3 ++- include/linux/sched/coredump.h | 5 ++++- kernel/sys.c | 6 +++--- mm/khugepaged.c | 3 ++- mm/shmem.c | 8 +++++--- 6 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d3b3e8fcc717..40d7b7dd2653 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); (1<vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ !is_vma_temporary_stack(__vma)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 5d9a400af509..f0d7335336cd 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE)) + !(vm_flags & VM_NOHUGEPAGE) && + !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 69eedcef8f03..98ae0d05aa32 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..73fc0af147d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int 
node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/shmem.c b/mm/shmem.c index 9418f5a9bc46..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); -- cgit From 9dcdcea11491f6eee65bd1b352293ca01e4b7997 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 10 Jul 2017 15:51:14 -0700 Subject: kernel/ksysfs.c: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const.
File size before:
text data bss dec hex filename
1120 544 16 1680 690 kernel/ksysfs.o
File size after adding 'const':
text data bss dec hex filename
1160 480 16 1656 678 kernel/ksysfs.o
Link: http://lkml.kernel.org/r/aa224b3cc923fdbb3edd0c41b2c639c85408c9e8.1498737347.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Kees Cook Cc: Russell King Cc: Dave Young Cc: Hari Bathini Cc: Petr Tesarik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..df1a9aa602a0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { NULL }; -static struct attribute_group kernel_attr_group = { +static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; -- cgit From b7b2562f7252878e18de60c24f320052076f9de8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 10 Jul 2017 15:51:17 -0700 Subject: kernel/groups.c: use sort library function setgroups is not exactly a hot path, so we might as well use the library function instead of open-coding the sorting. Saves ~150 bytes.
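For reference, a minimal sketch of the lib/sort.h API being switched to (illustrative; values, nr_values and int_cmp are made-up names):

	#include <linux/sort.h>

	static int int_cmp(const void *a, const void *b)
	{
		int x = *(const int *)a, y = *(const int *)b;

		return (x > y) - (x < y); /* overflow-safe 3-way compare */
	}

	/* A NULL swap callback makes sort() use its generic swap. */
	sort(values, nr_values, sizeof(*values), int_cmp, NULL);

gid_cmp in the diff below uses the same (a > b) - (a < b) idiom via gid_gt()/gid_lt().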
Link: http://lkml.kernel.org/r/1497301378-22739-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple Shell sort */ +static int gid_cmp(const void *_a, const void *_b) +{ + kgid_t a = *(kgid_t *)_a; + kgid_t b = *(kgid_t *)_b; + + return gid_gt(a, b) - gid_lt(a, b); +} + static void groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - kgid_t tmp = group_info->gid[right]; - - while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { - group_info->gid[right] = group_info->gid[left]; - right = left; - left -= stride; - } - group_info->gid[right] = tmp; - } - stride /= 3; - } + sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), + gid_cmp, NULL); } /* a simple bsearch */ -- cgit From 63b23e2cbc8e80de3e40184ecb2c3bfb705776fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jul 2017 15:51:20 -0700 Subject: kernel/kallsyms.c: replace all_var with IS_ENABLED(CONFIG_KALLSYMS_ALL) 'all_var' looks like a variable, but is actually a macro. Use IS_ENABLED(CONFIG_KALLSYMS_ALL) for clarification. Link: http://lkml.kernel.org/r/1497577591-3434-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -28,12 +28,6 @@ #include -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - /* * These will be re-linked against their real values * during the second link stage. 
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) static int is_ksym_addr(unsigned long addr) { - if (all_var) + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; - else if (all_var) + else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; -- cgit From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/module.c | 3 ++- arch/mips/kernel/traps.c | 3 ++- arch/sh/mm/extable_64.c | 34 ++++++++++++++++++---------------- arch/sparc/mm/extable.c | 28 ++++++++++++++-------------- include/linux/extable.h | 5 +++-- kernel/extable.c | 3 ++- kernel/module.c | 2 +- lib/extable.c | 41 +++++++++++++++++++++-------------------- 8 files changed, 63 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 94627a3a6a0d..50c020c47e54 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr) spin_lock_irqsave(&dbe_lock, flags); list_for_each_entry(dbe, &dbe_list, dbe_list) { - e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr); + e = search_extable(dbe->dbe_start, + dbe->dbe_end - dbe->dbe_start, addr); if (e) break; } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 38dfa27730ff..b68b4d0726d3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr); + e = search_extable(__start___dbe_table, + __stop___dbe_table - __start___dbe_table, addr); if (!e) e = search_module_dbetables(addr); return e; diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index b90cdfad2c78..7a3b4d33d2e7 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -10,6 +10,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. 
*/ +#include #include #include #include @@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long return NULL; } +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > _elt->insn) + return 1; + if (_key < _elt->insn) + return -1; + return 0; +} + /* Simple binary search */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { const struct exception_table_entry *mid; @@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first, if (mid) return mid; - while (first <= last) { - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } int fixup_exception(struct pt_regs *regs) diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index db214e9931d9..2422511dc8c5 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start, /* Caller knows they are in a range if ret->fixup == 0 */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *start, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - const struct exception_table_entry *walk; + int i; /* Single insn entries are encoded as: * word 1: insn address @@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start, */ /* 1. Try to find an exact match. */ - for (walk = start; walk <= last; walk++) { - if (walk->fixup == 0) { + for (i = 0; i < num; i++) { + if (base[i].fixup == 0) { /* A range entry, skip both parts. */ - walk++; + i++; continue; } /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) + if (base[i].fixup == -1) continue; - if (walk->insn == value) - return walk; + if (base[i].insn == value) + return &base[i]; } /* 2. Try to find a range match. 
*/ - for (walk = start; walk <= (last - 1); walk++) { - if (walk->fixup) + for (i = 0; i < (num - 1); i++) { + if (base[i].fixup) continue; - if (walk[0].insn <= value && walk[1].insn > value) - return walk; + if (base[i].insn <= value && base[i + 1].insn > value) + return &base[i]; - walk++; + i++; } return NULL; diff --git a/include/linux/extable.h b/include/linux/extable.h index 7effea4b257d..28addad0dda7 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -2,13 +2,14 @@ #define _LINUX_EXTABLE_H #include /* for NULL */ +#include struct module; struct exception_table_entry; const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value); void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/lib/extable.c b/lib/extable.c index 62968daa66a9..f54996fdd0b8 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size) * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ -static int cmp_ex(const void *a, const void *b) +static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; @@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); + cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES @@ -93,6 +94,20 @@ void trim_init_extable(struct module *m) #endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE + +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > ex_to_insn(_elt)) + return 1; + if (_key < ex_to_insn(_elt)) + return -1; + return 0; +} + /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, @@ -101,25 +116,11 @@ void trim_init_extable(struct module *m) * already sorted. 
*/ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - while (first <= last) { - const struct exception_table_entry *mid; - - mid = ((last - first) >> 1) + first; - /* - * careful, the distance between value and insn - * can be larger than MAX_LONG: - */ - if (ex_to_insn(mid) < value) - first = mid + 1; - else if (ex_to_insn(mid) > value) - last = mid - 1; - else - return mid; - } - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } #endif -- cgit From 4ea77014af0d6205b05503d1c7aac6eace11d473 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:52:57 -0700 Subject: kernel/signal.c: avoid undefined behaviour in kill_something_info When running kill(72057458746458112, 0) in userspace I hit the following issue. UBSAN: Undefined behaviour in kernel/signal.c:1462:11 negation of -2147483648 cannot be represented in type 'int': CPU: 226 PID: 9849 Comm: test Tainted: G B ---- ------- 3.10.0-327.53.58.70.x86_64_ubsan+ #116 Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, BIOS BLHSV028 11/11/2014 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SYSC_kill+0x43e/0x4d0 SyS_kill+0xe/0x10 system_call_fastpath+0x16/0x1b Add code to avoid the UBSAN detection. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1496670008-59084-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 48a59eefd8ad..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, -- cgit From dd83c161fbcc5d8be637ab159c0de015cbff5ba4 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:53:01 -0700 Subject: kernel/exit.c: avoid undefined behaviour when calling wait4() wait4(-2147483648, 0x20, 0, 0xdd0000) triggers: UBSAN: Undefined behaviour in kernel/exit.c:1651:9 The related calltrace is as follows: negation of -2147483648 cannot be represented in type 'int': CPU: 9 PID: 16482 Comm: zj Tainted: G B ---- ------- 3.10.0-327.53.58.71.x86_64+ #66 Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 /BC11BTSA , BIOS CTSAV036 04/27/2011 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SyS_wait4+0x1cb/0x1e0 system_call_fastpath+0x16/0x1b Exclude the overflow to avoid the UBSAN warning. Link: http://lkml.kernel.org/r/1497264618-20212-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: David Rientjes Cc: Aneesh Kumar K.V Cc: Kirill A. 
Shutemov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 608c9775a37b..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { -- cgit From dea1d0f5f1284e3defee4b8484d9fc230686cd42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 22:06:24 +0200 Subject: smp/hotplug: Replace BUG_ON and react usefully The move of the unpark functions to the control thread moved the BUG_ON() there as well. While it made some sense in the idle thread of the upcoming CPU, it's bogus to crash the control thread on the already online CPU, especially as the function has a return value and the callsite is prepared to handle an error return. Replace it with a WARN_ON_ONCE() and return a proper error code. Fixes: 9cd4f1a4e7a8 ("smp/hotplug: Move unparking of percpu threads to the control CPU") Rightfully-ranted-at-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ab860453841d..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -279,7 +279,8 @@ static int bringup_wait_for_ap(unsigned int cpu) /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); - BUG_ON(!cpu_online(cpu)); + if (WARN_ON_ONCE((!cpu_online(cpu)))) + return -ECANCELED; /* Unpark the stopper thread and the hotplug thread of the target cpu */ stop_machine_unpark(cpu); -- cgit From b11fb73743fc406204e0749ead18560aeda8b136 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 15:43:24 -0400 Subject: tracing: Fixup trace file header alignment The addition of TGID to the tracing header added a check to see if TGID should be displayed or not, and updated the header accordingly. Unfortunately, it broke the default header. Also add constant strings to use for spacing. This does reduce the visibility of the header a bit, but it cuts the extended lines down from well over 80 characters. Before this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU#|||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.277830: migration_init <-do_one_initcall swapper/0-1 [002] d... 13.861967: Unknown type 1201 swapper/0-1 [002] d..1 13.861970: Unknown type 1202 After this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.278245: migration_init <-do_one_initcall swapper/0-1 [003] d... 
13.861189: Unknown type 1201 swapper/0-1 [003] d..1 13.861192: Unknown type 1202 Cc: Joel Fernandes Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92af8fd1429b..dabd810a10cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3358,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - - seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); - seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); - seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); - seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ? tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? " | " : space); } void -- cgit From bbd1d27d863d5c0acee65ecd0c2e34035e1df5ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 19:21:04 -0400 Subject: tracing: Do not expose stack_trace_filter without DYNAMIC_FTRACE The "stack_trace_filter" file only makes sense if DYNAMIC_FTRACE is configured in. If it is not, then the user cannot filter any functions. Not only that, the open function causes warnings when DYNAMIC_FTRACE is not set. 
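For illustration only, a minimal sketch of the guard pattern the diff below applies (all foo_* names are hypothetical, not kernel symbols): the file_operations and every registration of them sit under the same #ifdef, so nothing references a compiled-out symbol and no non-functional file is exposed.

	#ifdef CONFIG_FOO_FILTER
	static const struct file_operations foo_filter_fops = {
		/* .open/.read/.write handlers that only make sense with the feature */
	};
	#endif

	static int __init foo_init(void)
	{
	#ifdef CONFIG_FOO_FILTER
		/* Register the filter file only when it can actually do something. */
		trace_create_file("foo_filter", 0444, foo_dir, NULL, &foo_filter_fops);
	#endif
		return 0;
	}
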
Link: http://lkml.kernel.org/r/20170710110521.600806-1-arnd@arndb.de Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); -- cgit From 69449bbd65687e8e5fb968a5a0c46089f6af6001 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Jul 2017 10:44:03 +0200 Subject: ftrace: Hide cached module code for !CONFIG_MODULES When modules are disabled, we get a harmless build warning: kernel/trace/ftrace.c:4051:13: error: 'process_cached_mods' defined but not used [-Werror=unused-function] This adds the same #ifdef around the new code that exists around its caller. Link: http://lkml.kernel.org/r/20170710084413.1820568-1-arnd@arndb.de Fixes: d7fbf8df7ca0 ("ftrace: Implement cached modules tracing on module load") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2953d558bbee..4706f0ed193e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3978,6 +3978,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable); +#ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) { @@ -4068,6 +4069,7 @@ static void process_cached_mods(const char *mod_name) kfree(mod); } +#endif /* * We register the module command as a template to show others how -- cgit From 19d39a3810e7032f311ef83effdac40339b9d022 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 23:41:52 +0200 Subject: genirq: Keep chip buslock across irq_request/release_resources() Moving the irq_request/release_resources() callbacks out of the spinlocked, irq disabled and bus locked region unearthed an interesting abuse of the irq_bus_lock/irq_bus_sync_unlock() callbacks. The OMAP GPIO driver merely does power management inside of them. The irq_request_resources() callback of this GPIO irqchip calls a function which reads a GPIO register. That read now aborts because the clock of the GPIO block is not magically enabled via the irq_bus_lock() callback. Move the callbacks under the bus lock again to prevent this. In the free_irq() path this requires dropping the bus_lock before calling synchronize_irq() and reacquiring it before calling the irq_release_resources() callback. 
The bus lock can't be held because: 1) The data which has been changed between bus_lock/un_lock is cached in the irq chip driver private data and needs to go out to the irq chip via the slow bus (usually SPI or I2C) before calling synchronize_irq(). That's the reason why this bus_lock/unlock magic exists in the first place, as you cannot do SPI/I2C transactions while holding desc->lock with interrupts disabled. 2) synchronize_irq() will actually deadlock if there is a handler in flight. These chips use threaded handlers for obvious reasons, as they allow SPI/I2C communication. When the threaded handler returns then bus_lock needs to be taken in irq_finalize_oneshot() as we need to talk to the actual irq chip once more. After that the threaded handler is marked done, which makes synchronize_irq() return. So if we hold bus_lock across the synchronize_irq() call, the handler cannot mark itself done because it blocks on the bus lock. That in turn makes synchronize_irq() wait forever on the threaded handler to complete.... Add the missing unlock of desc->request_mutex in the error path of __free_irq() and add a bunch of comments to explain the locking and protection rules. Fixes: 46e48e257360 ("genirq: Move irq resource handling out of spinlocked region") Reported-and-tested-by: Sebastian Reichel Reported-and-tested-by: Tony Lindgren Reported-by: Pavel Machek Signed-off-by: Thomas Gleixner Not-longer-ranted-at-by: Linus Torvalds Cc: Linus Walleij Cc: Grygorii Strashko Cc: Marc Zyngier --- kernel/irq/manage.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5624b2dd6b58..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. + * + * Locking rules: + * + * desc->request_mutex Provides serialization against a concurrent free_irq() + * chip_bus_lock Provides serialization for slow bus operations + * desc->lock Provides serialization against hard interrupts + * + * chip_bus_lock and desc->lock are sufficient for all other management and + * interrupt related functions. desc->request_mutex solely serializes + * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) @@ -1167,20 +1177,35 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + /* + * Protects against a concurrent __free_irq() call which might wait + * for synchronize_irq() to complete without holding the optional + * chip bus lock and desc->lock. + */ mutex_lock(&desc->request_mutex); + + /* + * Acquire bus lock as the irq_request_resources() callback below + * might rely on the serialization or the magic power management + * functions which are abusing the irq_bus_lock() callback, + */ + chip_bus_lock(desc); + + /* First installed action requests resources. 
*/ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mutex; + goto out_bus_unlock; } } - chip_bus_lock(desc); - /* * The following block of code has to be executed atomically + * protected against a concurrent interrupt and any of the other + * management calls which are not serialized via + * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; @@ -1286,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - irq_release_resources(desc); + if (ret) goto out_unlock; - } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ @@ -1385,12 +1408,10 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); - if (!desc->action) irq_release_resources(desc); - -out_mutex: +out_bus_unlock: + chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: @@ -1472,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); return NULL; } @@ -1498,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + /* + * Drop bus_lock here so the changes which were done in the chip + * callbacks above are synced out to the irq chips which hang + * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * + * Aside of that the bus_lock can also be taken from the threaded + * handler in irq_finalize_oneshot() which results in a deadlock + * because synchronize_irq() would wait forever for the thread to + * complete, which is blocked on the bus lock. + * + * The still held desc->request_mutex() protects against a + * concurrent request_irq() of this irq so the release of resources + * and timing data is properly serialized. + */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1530,8 +1566,15 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + /* Last action releases resources */ if (!desc->action) { + /* + * Reaquire bus lock as irq_release_resources() might + * require it to deallocate resources over the slow bus. + */ + chip_bus_lock(desc); irq_release_resources(desc); + chip_bus_sync_unlock(desc); irq_remove_timings(desc); } -- cgit From ab2f7cf141aa6734c4ca7525132d8cc236efee77 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 6 Jul 2017 10:53:20 -0700 Subject: cpufreq: schedutil: Fix sugov_start() versus sugov_update_shared() race With a shared policy in place, when one of the CPUs in the policy is hotplugged out and then brought back online, sugov_stop() and sugov_start() are called in order. sugov_stop() removes utilization hooks for each CPU in the policy and does nothing else in the for_each_cpu() loop. sugov_start() on the other hand iterates through the CPUs in the policy and re-initializes the per-cpu structure _and_ adds the utilization hook. This implies that the scheduler is allowed to invoke a CPU's utilization update hook when the rest of the per-cpu structures have yet to be re-inited. 
Apart from some strange values in tracepoints this doesn't cause a problem, but if we do end up accessing a pointer from the per-cpu sugov_cpu structure somewhere in the sugov_update_shared() path, we will likely see crashes since the memset for another CPU in the policy is free to race with sugov_update_shared from the CPU that is ready to go. So let's fix this now to first init all per-cpu structures, and then add the per-cpu utilization update hooks all at once. Signed-off-by: Vikram Mulukutla Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? sugov_update_shared : -- cgit From 44925dfff05fd1a897992d278b15a6b6b55e79a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:33:40 +0300 Subject: ftrace: Remove an unneeded NULL check "func" can't be NULL and it doesn't make sense to check because we've already dereferenced it. Link: http://lkml.kernel.org/r/20170712073340.4enzeojeoupuds5a@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4706f0ed193e..5fb5b40b3ae8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3950,7 +3950,7 @@ static int cache_mod(struct trace_array *tr, continue; /* no func matches all */ - if (!func || strcmp(func, "*") == 0 || + if (strcmp(func, "*") == 0 || (ftrace_mod->func && strcmp(ftrace_mod->func, func) == 0)) { ret = 0; -- cgit From 2e028c4fe12907f226b8221815f16c2486ad3aa7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:35:57 +0300 Subject: ftrace: Fix uninitialized variable in match_records() My static checker complains that if "func" is NULL then "clear_filter" is uninitialized. This seems like it could be true, although it's possible something subtle is happening that I haven't seen. kernel/trace/ftrace.c:3844 match_records() error: uninitialized symbol 'clear_filter'. 
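The hazard the checker flags is easy to reproduce in a self-contained toy (hypothetical code, not the ftrace function): when the argument is NULL the only assignment is skipped, yet the value is still consumed, which is exactly the read of an indeterminate variable that the zero-initialization below cures.

	#include <stdio.h>

	static int filter_flag(const char *func)
	{
		int clear_filter;		/* the fix: int clear_filter = 0; */

		if (func)
			clear_filter = (func[0] == '!');

		/* With func == NULL this returns an indeterminate value. */
		return clear_filter;
	}

	int main(void)
	{
		printf("%d\n", filter_flag(NULL));	/* undefined result */
		return 0;
	}
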
Link: http://lkml.kernel.org/r/20170712073556.h6tkpjcdzjaozozs@mwanda Cc: stable@vger.kernel.org Fixes: f0a3b154bd7 ("ftrace: Clarify code for mod command") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5fb5b40b3ae8..53f6b6401cf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3816,7 +3816,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, -- cgit From 58c7ffc0747a3a9145629d4966291f0586703767 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jul 2017 04:59:45 +0100 Subject: fix a braino in compat_sys_getrlimit() Reported-and-tested-by: Meelis Roos Fixes: commit d9e968cb9f84 "getrlimit()/setrlimit(): move compat to native" Signed-off-by: Al Viro Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 73fc0af147d0..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, ret = do_prlimit(current, resource, NULL, &r); if (!ret) { - struct rlimit r32; + struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else -- cgit From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Jul 2017 14:33:11 -0700 Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts The reason to disable interrupts seems to be to avoid switching to a different processor while handling per cpu data using individual loads and stores. If we use per cpu RMV primitives we will not have to disable interrupts. 
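A userspace analogue of the per-cpu read-modify-write idea, assuming C11 atomics in place of this_cpu_xchg()/this_cpu_cmpxchg() (a sketch, not the kernel implementation): one atomic exchange claims a cached stack and one compare-and-swap returns it, so the claim and return cannot be torn by a concurrent user and no interrupt-disabled window is needed.

	#include <stdatomic.h>
	#include <stddef.h>

	#define NR_CACHED_STACKS 2

	static _Atomic(void *) cached_stacks[NR_CACHED_STACKS];

	static void *try_get_cached_stack(void)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			/* Claim the slot and empty it in a single atomic step. */
			void *s = atomic_exchange(&cached_stacks[i], NULL);
			if (s)
				return s;
		}
		return NULL;	/* caller falls back to a fresh allocation */
	}

	static int try_put_cached_stack(void *stack)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			void *expected = NULL;
			/* Install only into a slot that is still empty. */
			if (atomic_compare_exchange_strong(&cached_stacks[i],
							   &expected, stack))
				return 1;
		}
		return 0;	/* cache full, caller frees the stack */
	}
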
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org Signed-off-by: Christoph Lameter Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f69a3e5281e..d2b9d7c31eaf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; -- cgit From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:14 -0700 Subject: kexec: move vmcoreinfo out of the kernel's .bss section As Eric said, "what we need to do is move the variable vmcoreinfo_note out of the kernel's .bss section. And modify the code to regenerate and keep this information in something like the control page. Definitely something like this needs a page all to itself, and ideally far away from any other kernel data structures. I clearly was not watching closely the data someone decided to keep this silly thing in the kernel's .bss section." This patch allocates extra pages for these vmcoreinfo_XXX variables, one advantage is that it enhances some safety of vmcoreinfo, because vmcoreinfo now is kept far away from other kernel data structures. 
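In miniature, the move replaces a buffer the linker places in .bss with storage obtained at init time; a hedged sketch with calloc() standing in for get_zeroed_page()/alloc_pages_exact() (hypothetical names, simplified):

	#include <stdlib.h>

	#define INFO_BYTES 4096

	/* Before: static unsigned char info_data[INFO_BYTES]; sat in .bss,
	 * packed next to unrelated data that a stray write could also hit. */
	static unsigned char *info_data;	/* after: a page of its own */

	static int info_init(void)
	{
		info_data = calloc(1, INFO_BYTES);	/* zeroed, like get_zeroed_page() */
		return info_data ? 0 : -1;
	}
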
Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Reviewed-by: Juergen Gross Suggested-by: Eric Biederman Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Hari Bathini Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 5 ----- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/setup.c | 6 ------ arch/x86/kernel/crash.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- include/linux/crash_core.h | 4 ++-- kernel/crash_core.c | 26 ++++++++++++++++++++++---- kernel/ksysfs.c | 2 +- 8 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 599507bcec91..c14815dca747 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) #endif } -phys_addr_t paddr_vmcoreinfo_note(void) -{ - return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); -} - diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 49a6bd45957b..3d0b14afa232 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3ae756c0db3d..3d1d808ea8a9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); } -static void __init setup_vmcoreinfo(void) -{ - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - #ifdef CONFIG_CRASH_DUMP /* @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..cab28cf2cffb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 4090a42578a8..87506a02e914 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -19,7 +19,7 @@ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) -#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_BYTES PAGE_SIZE #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define 
VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) #define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ @@ -56,7 +56,7 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..2837d6164db8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,10 @@ #include /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; +u32 *vmcoreinfo_note; /* * parsing the "crashkernel" commandline @@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index df1a9aa602a0..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit From 5203f4995d9a87952a83c2ce7866adbbe8f97bb5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:17 -0700 Subject: powerpc/fadump: use the correct VMCOREINFO_NOTE_SIZE for phdr vmcoreinfo_max_size stands for the vmcoreinfo_data, the correct one we should use is vmcoreinfo_note whose total size is VMCOREINFO_NOTE_SIZE. Like explained in commit 77019967f06b ("kdump: fix exported size of vmcoreinfo note"), it should not affect the actual function, but we better fix it, also this change should be safe and backward compatible. After this, we can get rid of variable vmcoreinfo_max_size, let's use the corresponding macros directly, fewer variables means more safety for vmcoreinfo operation. 
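The distinction matters because what is exported to the dump tools is the assembled ELF note, not the raw data buffer: the note header words and the aligned name are part of its size. A sketch of the arithmetic, mirroring the macros above (simplified; treat the constants as assumptions):

	#include <stdio.h>

	#define ALIGN4(x)	(((x) + 3UL) & ~3UL)
	#define NOTE_HEAD	(3 * sizeof(unsigned int))	/* namesz, descsz, type */
	#define NAME_BYTES	ALIGN4(sizeof("VMCOREINFO"))
	#define DATA_BYTES	4096UL				/* VMCOREINFO_BYTES */

	int main(void)
	{
		/* One populated note plus the terminating empty note header. */
		unsigned long note_size = NOTE_HEAD * 2 + NAME_BYTES + DATA_BYTES;

		printf("p_memsz should be %lu, not %lu\n", note_size, DATA_BYTES);
		return 0;
	}
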
[xlpang@redhat.com: fix build warning] Link: http://lkml.kernel.org/r/1494830606-27736-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/1493281021-20737-2-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Reviewed-by: Mahesh Salgaonkar Reviewed-by: Dave Young Cc: Hari Bathini Cc: Benjamin Herrenschmidt Cc: Eric Biederman Cc: Juergen Gross Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/fadump.c | 3 +-- include/linux/crash_core.h | 1 - kernel/crash_core.c | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3079518f2245..dc0c49cfd90a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -999,8 +999,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 87506a02e914..e5df1b3cf072 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -58,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2837d6164db8..315adbf9cb68 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -16,7 +16,6 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; u32 *vmcoreinfo_note; /* @@ -343,7 +342,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); -- cgit From 1229384f5b856d83698c38f9dedfd836e26711cb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:21 -0700 Subject: kdump: protect vmcoreinfo data under the crash memory Currently vmcoreinfo data is updated at boot time subsys_initcall(), it has the risk of being modified by some wrong code during system is running. As a result, vmcore dumped may contain the wrong vmcoreinfo. Later on, when using "crash", "makedumpfile", etc utility to parse this vmcore, we probably will get "Segmentation fault" or other unexpected errors. E.g. 1) wrong code overwrites vmcoreinfo_data; 2) further crashes the system; 3) trigger kdump, then we obviously will fail to recognize the crash context correctly due to the corrupted vmcoreinfo. Now except for vmcoreinfo, all the crash data is well protected(including the cpu note which is fully updated in the crash path, thus its correctness is guaranteed). Given that vmcoreinfo data is a large chunk prepared for kdump, we better protect it as well. To solve this, we relocate and copy vmcoreinfo_data to the crash memory when kdump is loading via kexec syscalls. 
Because the whole crash memory will be protected by existing arch_kexec_protect_crashkres() mechanism, we naturally protect vmcoreinfo_data from write(even read) access under kernel direct mapping after kdump is loaded. Since kdump is usually loaded at the very early stage after boot, we can trust the correctness of the vmcoreinfo data copied. On the other hand, we still need to operate the vmcoreinfo safe copy when crash happens to generate vmcoreinfo_note again, we rely on vmap() to map out a new kernel virtual address and update to use this new one instead in the following crash_save_vmcoreinfo(). BTW, we do not touch vmcoreinfo_note, because it will be fully updated using the protected vmcoreinfo_data after crash which is surely correct just like the cpu crash note. Link: http://lkml.kernel.org/r/1493281021-20737-3-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Eric Biederman Cc: Hari Bathini Cc: Juergen Gross Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 +- include/linux/kexec.h | 2 ++ kernel/crash_core.c | 17 ++++++++++++++++- kernel/kexec.c | 8 ++++++++ kernel/kexec_core.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/kexec_file.c | 8 ++++++++ 6 files changed, 74 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index e5df1b3cf072..2df2118fbe13 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -28,6 +28,7 @@ typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +void crash_update_vmcoreinfo_safecopy(void *ptr); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) @@ -57,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern u32 *vmcoreinfo_note; -extern size_t vmcoreinfo_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 65888418fb69..dd056fab9e35 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -172,6 +172,7 @@ struct kimage { unsigned long start; struct page *control_code_page; struct page *swap_page; + void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; @@ -241,6 +242,7 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 315adbf9cb68..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,9 +15,12 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; +static size_t vmcoreinfo_size; u32 *vmcoreinfo_note; +/* trusted vmcoreinfo, e.g. 
we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + /* * parsing the "crashkernel" commandline * @@ -323,11 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 766e7e4d3ad9..c8f7f77e9fa9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -298,6 +298,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; -- cgit From a19ac3374995382a994653ff372b98ea7cbad548 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Wed, 12 Jul 2017 14:33:30 -0700 Subject: sysctl: kdoc'ify sysctl_writes_strict Document the different sysctl_writes_strict modes in code. Link: http://lkml.kernel.org/r/20170519033554.18592-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..02725178694a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. + */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit From d383d48470819e86fe30eb72f0e9494e1ee0e2af Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:33 -0700 Subject: sysctl: fold sysctl_writes_strict checks into helper The mode sysctl_writes_strict positional checks keep being copy and pasted as we add new proc handlers. Just add a helper to avoid code duplication. Link: http://lkml.kernel.org/r/20170519033554.18592-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 02725178694a..6f3bb1f099fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1970,6 +1970,32 @@ static void warn_sysctl_write(struct ctl_table *table) current->comm, table->procname); } +/** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. 
+ */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1990,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2193,17 +2219,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2468,17 +2485,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; -- cgit From 4f2fec00afa60aa8e5d1b7f2a8e0526900f55623 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:36 -0700 Subject: sysctl: simplify unsigned int support Commit e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") added proc_douintvec() to start help adding support for unsigned int, this however was only half the work needed. Two fixes have come in since then for the following issues: o Printing the values shows a negative value, this happens since do_proc_dointvec() and this uses proc_put_long() This was fixed by commit 5380e5644afbba9 ("sysctl: don't print negative flag for proc_douintvec"). o We can easily wrap around the int values: UINT_MAX is 4294967295, if we echo in 4294967295 + 1 we end up with 0, using 4294967295 + 2 we end up with 1. o We echo negative values in and they are accepted This was fixed by commit 425fffd886ba ("sysctl: report EINVAL if value is larger than UINT_MAX for proc_douintvec"). It still also failed to be added to sysctl_check_table()... instead of adding it with the current implementation just provide a proper and simplified unsigned int support without any array unsigned int support with no negative support at all. Historically sysctl proc helpers have supported arrays, due to the complexity this adds though we've taken a step back to evaluate array users to determine if its worth upkeeping for unsigned int. An evaluation using Coccinelle has been done to perform a grammatical search to ask ourselves: o How many sysctl proc_dointvec() (int) users exist which likely should be moved over to proc_douintvec() (unsigned int) ? Answer: about 8 - Of these how many are array users ? Answer: Probably only 1 o How many sysctl array users exist ? Answer: about 12 This last question gives us an idea just how popular arrays: they are not. 
Array support should probably just be kept for strings. The identified uint ports are: drivers/infiniband/core/ucma.c - max_backlog drivers/infiniband/core/iwcm.c - default_backlog net/core/sysctl_net_core.c - rps_sock_flow_sysctl() net/netfilter/nf_conntrack_timestamp.c - nf_conntrack_timestamp -- bool net/netfilter/nf_conntrack_acct.c nf_conntrack_acct -- bool net/netfilter/nf_conntrack_ecache.c - nf_conntrack_events -- bool net/netfilter/nf_conntrack_helper.c - nf_conntrack_helper -- bool net/phonet/sysctl.c proc_local_port_range() The only possible array users is proc_local_port_range() but it does not seem worth it to add array support just for this given the range support works just as well. Unsigned int support should be desirable more for when you *need* more than INT_MAX or using int min/max support then does not suffice for your ranges. If you forget and by mistake happen to register an unsigned int proc entry with an array, the driver will fail and you will get something as follows: sysctl table check failed: debug/test_sysctl//uint_0002 array now allowed CPU: 2 PID: 1342 Comm: modprobe Tainted: G W E Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Call Trace: dump_stack+0x63/0x81 __register_sysctl_table+0x350/0x650 ? kmem_cache_alloc_trace+0x107/0x240 __register_sysctl_paths+0x1b3/0x1e0 ? 0xffffffffc005f000 register_sysctl_table+0x1f/0x30 test_sysctl_init+0x10/0x1000 [test_sysctl] do_one_initcall+0x52/0x1a0 ? kmem_cache_alloc_trace+0x107/0x240 do_init_module+0x5f/0x200 load_module+0x1867/0x1bd0 ? __symbol_put+0x60/0x60 SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 entry_SYSCALL_64_fastpath+0x1e/0xad RIP: 0033:0x7f042b22d119 Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Link: http://lkml.kernel.org/r/20170519033554.18592-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Alexey Dobriyan Cc: Subash Abhinov Kasiviswanathan Cc: Liping Zhang Cc: Alexey Dobriyan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 14 +++++ kernel/sysctl.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 32c9c5630507..ee6feba8b6c0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1061,6 +1061,18 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) 
return -EINVAL; } +static int sysctl_check_table_array(const char *path, struct ctl_table *table) +{ + int err = 0; + + if (table->proc_handler == proc_douintvec) { + if (table->maxlen != sizeof(unsigned int)) + err |= sysctl_err(path, table, "array now allowed"); + } + + return err; +} + static int sysctl_check_table(const char *path, struct ctl_table *table) { int err = 0; @@ -1081,6 +1093,8 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "No data"); if (!table->maxlen) err |= sysctl_err(path, table, "No maxlen"); + else + err |= sysctl_check_table_array(path, table); } if (!table->proc_handler) err |= sysctl_err(path, table, "No proc_handler"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f3bb1f099fa..d12078fc215f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2175,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; } @@ -2287,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *)
tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2322,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* -- cgit From 61d9b56a89208d8cccd0b4cfec7e6959717e16e3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:40 -0700 Subject: sysctl: add unsigned int range support To keep parity with the regular int interfaces, provide an unsigned int proc_douintvec_minmax(), which allows you to specify a range of allowed valid numbers. Adding proc_douintvec_minmax_sysadmin() is easy but we can wait for an actual user for that. Link: http://lkml.kernel.org/r/20170519033554.18592-6-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook Cc: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W.
Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ee6feba8b6c0..8f9d564d0969 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1065,7 +1065,8 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) { int err = 0; - if (table->proc_handler == proc_douintvec) { + if ((table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array now allowed"); } @@ -1083,6 +1084,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..225001d437ae 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -47,6 +47,9 @@ extern int proc_douintvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d12078fc215f..df9f2a367882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2567,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. 
+ */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3066,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3108,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit From 9380fa60b10ebd6ee7c3fcdb2cf162f4d7cf9fc5 Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 12 Jul 2017 14:34:01 -0700 Subject: kernel/sysctl_binary.c: check name array length in deprecated_sysctl_warning() Prevent use of uninitialized memory (originating from the stack frame of do_sysctl()) by verifying that the name array is filled with sufficient input data before comparing its specific entries with integer constants. Through timing measurement or analyzing the kernel debug logs, a user-mode program could potentially infer the results of comparisons against the uninitialized memory, and acquire some (very limited) information about the state of the kernel stack. The change also eliminates possible future warnings by tools such as KMSAN and other code checkers / instrumentations. Link: http://lkml.kernel.org/r/20170524122139.21333-1-mjurczyk@google.com Signed-off-by: Mateusz Jurczyk Acked-by: Kees Cook Cc: "David S. Miller" Cc: Matthew Whitehead Cc: "Eric W. Biederman" Cc: Tetsuo Handa Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { -- cgit From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 12 Jul 2017 14:34:28 -0700 Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files With the current epoll architecture, target files are addressed with file_struct and file descriptor number, where the latter is not unique. Moreover, files can be transferred from another process via a unix socket, added into the queue and then closed, so we won't find this descriptor in the task's fdinfo list. Thus, to checkpoint and restore such processes, CRIU needs to find out where exactly the target file is present, in order to add it into the epoll queue.
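Concretely, once this interface is in place, a userspace check might look like the following minimal sketch (the pids and descriptor numbers are hypothetical example values, and since kcmp(2) has no glibc wrapper it is invoked via syscall()); the individual steps are spelled out right after the code:

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/kcmp.h>

int main(void)
{
	pid_t pid1 = 1234, pid2 = 5678;	/* hypothetical task pids */
	unsigned long fd = 3;		/* fd in pid1 to compare */

	/* describe which epoll target inside pid2 we are interested in */
	struct kcmp_epoll_slot slot = {
		.efd	= 4,	/* epoll file descriptor in pid2 */
		.tfd	= 3,	/* target file number inside that epoll */
		.toff	= 0,	/* first target carrying this number */
	};

	/* 0 - same file; 1/2 - ordering for sort; <0 - error */
	long ret = syscall(SYS_kcmp, pid1, pid2, KCMP_EPOLL_TFD,
			   fd, (unsigned long)&slot);
	printf("kcmp returned %ld\n", ret);
	return 0;
}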
For this sake, one can use the kcmp call, where some particular target file from the queue is compared with an arbitrary file passed as an argument. Because epoll target files can have the same file descriptor number but a different file_struct, a caller should explicitly specify the offset within. To test if some particular file matches an entry inside epoll, one has to - fill a kcmp_epoll_slot structure with the epoll file descriptor, target file number and target file offset (if only one target is present, it should be 0) - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot) - the kernel fetches the file pointer matching file descriptor @fd of pid1 - looks up the file struct in the epoll queue of pid2 and returns the traditional 0,1,2 result for sorting purposes Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com Signed-off-by: Cyrill Gorcunov Acked-by: Andrey Vagin Cc: Al Viro Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Jason Baron Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 42 ++++++++++++++++++++++++++++++++++ include/linux/eventpoll.h | 3 +++ include/uapi/linux/kcmp.h | 10 +++++++++ kernel/kcmp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 322904c3ebdf..e7e9901c3790 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1077,6 +1077,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if (epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism.
It is called by the stored file descriptors when they diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 6daf6d4971f6..d8625d214ea7 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,7 @@ #define _LINUX_EVENTPOLL_H #include +#include /* Forward declarations to avoid compiler errors */ @@ -22,6 +23,8 @@ struct file; #ifdef CONFIG_EPOLL +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h index 84df14b37360..481e103da78e 100644 --- a/include/uapi/linux/kcmp.h +++ b/include/uapi/linux/kcmp.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_KCMP_H #define _UAPI_LINUX_KCMP_H +#include + /* Comparison type */ enum kcmp_type { KCMP_FILE, @@ -10,8 +12,16 @@ enum kcmp_type { KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, + KCMP_EPOLL_TFD, KCMP_TYPES, }; +/* Slot for KCMP_EPOLL_TFD */ +struct kcmp_epoll_slot { + __u32 efd; /* epoll file descriptor */ + __u32 tfd; /* target file number */ + __u32 toff; /* target offset within same numbered sequence */ +}; + #endif /* _UAPI_LINUX_KCMP_H */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; -- cgit From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 12 Jul 2017 14:34:35 -0700 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: "Write to this file of integer N makes N-th call in the current task fail (N is 0-based). 
Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below" Why add a new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close, but it depends on attr->count, which is not reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulation of all of probability, interval, times, space, task-filter, and the unexposed count and per-task make-it-fail files. 4. Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by an unprivileged user, and aggressive stress testing is better done as an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as they would need to reopen at least the times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into the syzkaller fuzzer. A prototype has found 10 bugs in the kernel in the first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance I've made the current interface work with all types of our sandboxes. For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to make /proc entries non-root owned. So I am fine with the current version of the code. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) (limited to 'kernel') diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure.
# env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e1927ccd48..88b773f318cd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1355,6 +1355,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3311,6 +3358,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. 
+ */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 20814b7d7d70..3822d749fc9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -974,6 +974,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index d2b9d7c31eaf..ade237a96308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit From f2e0cff85ed111a3cf24d894c3fa11697dfae628 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:43 -0700 Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog() For architectures that define HAVE_NMI_WATCHDOG, instead of having them provide the complete touch_nmi_watchdog() function, just have them provide arch_touch_nmi_watchdog(). This gives the generic code more flexibility in implementing this function, and arch implementations don't miss out on touching the softlockup watchdog or other generic details. 
Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/nmi.h | 2 ++ arch/blackfin/kernel/nmi.c | 2 +- arch/mn10300/include/asm/nmi.h | 2 ++ arch/mn10300/kernel/mn10300-watchdog-low.S | 8 ++++---- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/sparc/include/asm/nmi.h | 1 + arch/sparc/kernel/nmi.c | 6 ++---- include/linux/nmi.h | 27 ++++++++++++++++----------- kernel/watchdog_hld.c | 5 ++--- 9 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/asm/nmi.h b/arch/blackfin/include/asm/nmi.h index b9caac4fcfd8..107d23705f46 100644 --- a/arch/blackfin/include/asm/nmi.h +++ b/arch/blackfin/include/asm/nmi.h @@ -9,4 +9,6 @@ #include +extern void arch_touch_nmi_watchdog(void); + #endif diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 633c37083e87..1e714329fe8a 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -190,7 +190,7 @@ static int __init init_nmi_wdt(void) } device_initcall(init_nmi_wdt); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { atomic_set(&nmi_touched[smp_processor_id()], 1); } diff --git a/arch/mn10300/include/asm/nmi.h b/arch/mn10300/include/asm/nmi.h index f3671cbbc117..b05627597b1b 100644 --- a/arch/mn10300/include/asm/nmi.h +++ b/arch/mn10300/include/asm/nmi.h @@ -11,4 +11,6 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +extern void arch_touch_nmi_watchdog(void); + #endif /* _ASM_NMI_H */ diff --git a/arch/mn10300/kernel/mn10300-watchdog-low.S b/arch/mn10300/kernel/mn10300-watchdog-low.S index f2f5c9cfaabd..34f8773de7d0 100644 --- a/arch/mn10300/kernel/mn10300-watchdog-low.S +++ b/arch/mn10300/kernel/mn10300-watchdog-low.S @@ -50,9 +50,9 @@ watchdog_handler: # we can't inline it) # ############################################################################### - .globl touch_nmi_watchdog - .type touch_nmi_watchdog,@function -touch_nmi_watchdog: + .globl arch_touch_nmi_watchdog + .type arch_touch_nmi_watchdog,@function +arch_touch_nmi_watchdog: clr d0 clr d1 mov watchdog_alert_counter, a0 @@ -63,4 +63,4 @@ touch_nmi_watchdog: lne ret [],0 - .size touch_nmi_watchdog,.-touch_nmi_watchdog + .size arch_touch_nmi_watchdog,.-arch_touch_nmi_watchdog diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index a2d8e6938d67..0d5641beadf5 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -31,7 +31,7 @@ static unsigned int watchdog; static unsigned int watchdog_hz = 1; unsigned int watchdog_alert_counter[NR_CPUS]; -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); /* * the best way to detect whether a CPU has a 'hard lockup' problem diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h index 26ad2b2607c6..284eac3ffaf2 100644 --- a/arch/sparc/include/asm/nmi.h +++ b/arch/sparc/include/asm/nmi.h @@ -7,6 +7,7 @@ void nmi_adjust_hz(unsigned int new_hz); extern atomic_t nmi_active; +void arch_touch_nmi_watchdog(void); void start_nmi_watchdog(void *unused); void stop_nmi_watchdog(void *unused); diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 95e73c63c99d..048ad783ea3f 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -51,7 
+51,7 @@ static DEFINE_PER_CPU(unsigned int, last_irq_sum); static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { if (atomic_read(&nmi_active)) { int cpu; @@ -61,10 +61,8 @@ void touch_nmi_watchdog(void) per_cpu(nmi_touch, cpu) = 1; } } - - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5e2e57536d98..bd387ef8bccd 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -6,6 +6,9 @@ #include #include +#if defined(CONFIG_HAVE_NMI_WATCHDOG) +#include +#endif #ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -58,6 +61,18 @@ static inline void reset_hung_task_detector(void) #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) +extern void hardlockup_detector_disable(void); +#else +static inline void hardlockup_detector_disable(void) {} +#endif + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +extern void arch_touch_nmi_watchdog(void); +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -65,21 +80,11 @@ static inline void reset_hung_task_detector(void) * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. */ -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -#include -extern void touch_nmi_watchdog(void); -#else static inline void touch_nmi_watchdog(void) { + arch_touch_nmi_watchdog(); touch_softlockup_watchdog(); } -#endif - -#if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void hardlockup_detector_disable(void); -#else -static inline void hardlockup_detector_disable(void) {} -#endif /* * Create trigger_all_cpu_backtrace() out of the arch-provided diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..90d688df6ce1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +66,8 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, -- cgit From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:46 -0700 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. 
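For the HAVE_HARDLOCKUP_DETECTOR_ARCH case, the arch is expected to override the __weak hooks that the generic watchdog code exposes (see the kernel/watchdog.c hunks below); a hypothetical arch-side sketch, with made-up function bodies:

/* hypothetical arch/myarch/kernel/watchdog.c */
#include <linux/nmi.h>

int watchdog_nmi_enable(unsigned int cpu)
{
	/* start this architecture's NMI watchdog on @cpu */
	return 0;
}

void watchdog_nmi_disable(unsigned int cpu)
{
	/* stop this architecture's NMI watchdog on @cpu */
}

A later patch in this series adds watchdog_nmi_reconfigure() as a third hook of the same kind.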
sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. [npiggin@gmail.com: fix] Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 25 ++++- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 2 +- include/linux/nmi.h | 29 +++-- kernel/Makefile | 2 +- kernel/sysctl.c | 31 +++--- kernel/watchdog.c | 243 +++++++++++++++++++++++++++-------------- kernel/watchdog_hld.c | 32 ------ lib/Kconfig.debug | 45 +++++--- 11 files changed, 251 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index cae0958a2298..fb9bd7d36b05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE config HAVE_NMI bool -config HAVE_NMI_WATCHDOG - depends on HAVE_NMI - bool # # An arch should select this if it provides all these things: # @@ -288,6 +285,28 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. +config HAVE_HARDLOCKUP_DETECTOR_PERF + bool + depends on HAVE_PERF_EVENTS_NMI + help + The arch chooses to use the generic perf-NMI-based hardlockup + detector. Must define HAVE_PERF_EVENTS_NMI. + +config HAVE_NMI_WATCHDOG + depends on HAVE_NMI + bool + help + The arch provides a low level NMI watchdog. It provides + asm/nmi.h, and defines its own arch_touch_nmi_watchdog(). + +config HAVE_HARDLOCKUP_DETECTOR_ARCH + bool + select HAVE_NMI_WATCHDOG + help + The arch chooses to provide its own hardlockup detector, which is + a superset of the HAVE_NMI_WATCHDOG. It also conforms to config + interfaces and parameters provided by hardlockup detector subsystem. 
+ config HAVE_PERF_REGS bool help diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7177a3f4f418..63ed758e1d20 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..074a075a9cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a18681353d..3d2b8ce54e00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include #include -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index bd387ef8bccd..8aa01fd859fb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -11,13 +11,21 @@ #endif #ifdef CONFIG_LOCKUP_DETECTOR +void lockup_detector_init(void); +#else +static inline void lockup_detector_init(void) +{ +} +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned int hardlockup_panic; -void lockup_detector_init(void); +extern int soft_watchdog_enabled; +extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { @@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } -static inline void lockup_detector_init(void) -{ -} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +extern unsigned int hardlockup_panic; #else static inline void hardlockup_detector_disable(void) {} #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); #else +#if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif +#endif /** * touch_nmi_watchdog - restart NMI watchdog timeout. 
@@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu) } #endif -#ifdef CONFIG_LOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh); +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern atomic_t watchdog_park_in_progress; +extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly 
nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. 
*/ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. 
- */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). 
- */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f28f4252e54a..b0d01c6d4e03 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -778,34 +778,45 @@ config DEBUG_SHIRQ menu "Debug Lockups and Hangs" config LOCKUP_DETECTOR - bool "Detect Hard and Soft Lockups" + bool + +config SOFTLOCKUP_DETECTOR + bool "Detect Soft Lockups" depends on DEBUG_KERNEL && !S390 + select LOCKUP_DETECTOR help Say Y here to enable the kernel to act as a watchdog to detect - hard and soft lockups. + soft lockups. Softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config HARDLOCKUP_DETECTOR_PERF + bool + select SOFTLOCKUP_DETECTOR + +# +# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard +# lockup detector rather than the perf based detector. +# +config HARDLOCKUP_DETECTOR + bool "Detect Hard Lockups" + depends on DEBUG_KERNEL && !S390 + depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH + select LOCKUP_DETECTOR + select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF + select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH + help + Say Y here to enable the kernel to act as a watchdog to detect + hard lockups. + Hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. - The overhead should be minimal. A periodic hrtimer runs to - generate interrupts and kick the watchdog task every 4 seconds. - An NMI is generated every 10 seconds or so to check for hardlockups. - - The frequency of hrtimer and NMI events and the soft and hard lockup - thresholds can be controlled through the sysctl watchdog_thresh. 
- -config HARDLOCKUP_DETECTOR - def_bool y - depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR @@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL - default LOCKUP_DETECTOR + default SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in -- cgit From a10a842ff81a7e3810817b3b04e4c432b6191e21 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:49 -0700 Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs After reconfiguring watchdog sysctls etc., architecture specific watchdogs may not get all their parameters updated. watchdog_nmi_reconfigure() can be implemented to pull the new values in and set the arch NMI watchdog. [npiggin@gmail.com: add code comments] Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com [arnd@arndb.de: hide unused function] Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Signed-off-by: Arnd Bergmann Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1fba9c3d66dc..cabe3e9fb620 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) { @@ -123,6 +127,22 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ +void __weak watchdog_nmi_reconfigure(void) +{ +} + + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Helper for online, unparked cpus. 
*/ @@ -600,6 +620,14 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} +#endif + #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) { @@ -619,6 +647,13 @@ static void watchdog_disable_all_cpus(void) { } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return 0; +} +#endif + static void set_sample_period(void) { } @@ -651,6 +686,8 @@ int lockup_detector_suspend(void) watchdog_enabled = 0; } + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); return ret; @@ -671,6 +708,8 @@ void lockup_detector_resume(void) if (watchdog_running && !watchdog_suspended) watchdog_unpark_threads(); + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); } @@ -696,6 +735,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -881,12 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); -#endif } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit From e2ae8ab4b571e2e4094a28acb60649bc2732c67f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jul 2017 14:35:58 -0700 Subject: kexec_file: adjust declaration of kexec_purgatory Defining kexec_purgatory as a zero-length char array upsets compile time size checking. Since this is built on a per-arch basis, define it as an unsized char array (like is done for other similar things, e.g. linker sections). This silences the warning generated by the future CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte" array. This drops the __weak and uses an extern instead, since both users define kexec_purgatory. Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: "Eric W. Biederman" Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 7 ------- kernel/kexec_internal.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c8f7f77e9fa9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. 
- */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ -- cgit From 7cd815bce828220deffd1654265f0ef891567774 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 12 Jul 2017 14:36:20 -0700 Subject: fork,random: use get_random_canary() to set tsk->stack_canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ade237a96308..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* -- cgit From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 13 Jul 2017 14:18:24 +0800 Subject: locking/rtmutex: Remove unnecessary priority adjustment We don't need to adjust priority before adding a new pi_waiter, the priority only needs to be updated after pi_waiter change or task priority change. Steven Rostedt pointed out: "Interesting, I did some git mining and this was added with the original entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I don't see the purpose of adjusting the task prio here. It is done before anything changes in the task." Signed-off-by: Alex Shi Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org [ Enhance the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; -- cgit From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Jul 2017 00:40:28 -0700 Subject: sched/cputime: Don't use smp_processor_id() in preemptible context Recent kernels trigger this warning: BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181 caller is debug_smp_processor_id+0x17/0x19 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1 Call Trace: dump_stack+0x82/0xb8 check_preemption_disabled() debug_smp_processor_id() vtime_delta() task_cputime() thread_group_cputime() thread_group_cputime_adjusted() wait_consider_task() do_wait() SYSC_wait4() do_syscall_64() entry_SYSCALL64_slow_path() As Frederic pointed out: | Although those sched_clock_cpu() things seem to only matter when the | sched_clock() is unstable. And that stability is a condition for nohz_full | to work anyway. So probably sched_clock() alone would be enough. This patch fixes it by replacing sched_clock_cpu() with sched_clock() to avoid calling smp_processor_id() in a preemptible context. Reported-by: Xiaolong Ye Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6e3ea4ac1bda..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime) { unsigned long long clock; - clock = sched_clock_cpu(smp_processor_id()); + clock = sched_clock(); if (clock < vtime->starttime) return 0; @@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(smp_processor_id()); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); } @@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(cpu); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:24:29 -0700 Subject: sched/deadline: Fix confusing comments about selection of top pi-waiter This comment in the code is incomplete, and I believe it begs a definition of dl_boosted to make sense of the condition that follows. Rewrite the comment and also rearrange the condition that follows to reflect the first condition "we have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a typo that follows. 
Signed-off-by: Joel Fernandes Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. -- cgit From 5f92a7b0fcd627fbd06ceb1cee3bbe5d08d13356 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jul 2017 14:49:46 -0700 Subject: kernel/watchdog.c: use better pr_fmt prefix After commit 73ce0511c436 ("kernel/watchdog.c: move hardlockup detector to separate file"), the 'NMI watchdog' prefix is inappropriate in kernel/watchdog.c, so use 'watchdog' only. Link: http://lkml.kernel.org/r/1499928642-48983-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Babu Moger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cabe3e9fb620..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,7 +9,7 @@ * to those contributors as well. */ -#define pr_fmt(fmt) "NMI watchdog: " fmt +#define pr_fmt(fmt) "watchdog: " fmt #include #include -- cgit From 6d7964a722afc8e4f880b947f174009063028c99 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 14 Jul 2017 14:50:11 -0700 Subject: kmod: throttle kmod thread limit If we reach the limit of modprobe_limit threads running, the next request_module() call will fail. The original reason for failing hard here was to guard against old circumstances that could create a recursive series of request_module() calls. We can do better than aggressively rejecting calls once we've reached the limit: simply make pending callers wait until the count drops below the threshold, and then throttle them in, one by one. This throttling enables requests over the kmod concurrent limit to be processed once a pending request completes. Only the first item queued up to wait is woken up.
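In essence this is a counting semaphore with a wake-one-after-release policy. A rough userspace analogue of the scheme (illustrative only; pthreads stand in for the kernel wait queue, and the comments map names onto the objects in the diff below):

	#include <pthread.h>
	#include <stdio.h>

	#define LIMIT 50	/* MAX_KMOD_CONCURRENT */

	static int slots = LIMIT;	/* kmod_concurrent_max */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;	/* kmod_wq */

	static void request_module_throttled(const char *name)
	{
		pthread_mutex_lock(&lock);
		while (slots <= 0)	/* wait_event_interruptible() */
			pthread_cond_wait(&wq, &lock);
		slots--;	/* atomic_dec_if_positive() */
		pthread_mutex_unlock(&lock);

		printf("loading %s\n", name);	/* call_modprobe() stand-in */

		pthread_mutex_lock(&lock);
		slots++;	/* atomic_inc() */
		pthread_cond_signal(&wq);	/* wake_up(): wake one waiter */
		pthread_mutex_unlock(&lock);
	}

A caller over the limit simply sleeps on the condition until a completed request releases a slot and signals it in.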
The assumption here is that once a task is woken it will have no option but to kick the queue again to check for more pending tasks -- regardless of whether or not its own request was successful. By throttling and processing at most the maximum number of concurrent kmod tasks, we avoid unexpected fatal request_module() calls and keep memory consumption on module loading to a minimum. With x86_64 qemu, 4 cores and 4 GiB of RAM, the two tests take the following run times: time ./kmod.sh -t 0008 real 0m16.366s user 0m0.883s sys 0m8.916s time ./kmod.sh -t 0009 real 0m50.803s user 0m0.791s sys 0m9.852s Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 16 +++++++--------- tools/testing/selftests/kmod/kmod.sh | 24 ++---------------------- 2 files changed, 9 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ff68198fe83b..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem); */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. @@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...) va_list args; char module_name[MODULE_NAME_LEN]; int ret; - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...) return ret; if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - return -ENOMEM; + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + wait_event_interruptible(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0); } trace_module_request(module_name, wait, _RET_IP_); @@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); return ret; } diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 10196a62ed09..8cecae9a8bca 100644 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -59,28 +59,8 @@ ALL_TESTS="$ALL_TESTS 0004:1:1" ALL_TESTS="$ALL_TESTS 0005:10:1" ALL_TESTS="$ALL_TESTS 0006:10:1" ALL_TESTS="$ALL_TESTS 0007:5:1" - -# Disabled tests: -# -# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" -# Current best-effort failure interpretation: -# Enough module requests get loaded in place fast enough to reach over the -# max_modprobes limit and trigger a failure -- before we're even able to -# start processing pending requests.
-ALL_TESTS="$ALL_TESTS 0008:150:0" - -# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" -# Current best-effort failure interpretation: -# -# get_fs_type() requests modules using aliases as such the optimization in -# place today to look for already loaded modules will not take effect and -# we end up requesting a new module to load, this bumps the kmod_concurrent, -# and in certain circumstances can lead to pushing the kmod_concurrent over -# the max_modprobe limit. -# -# This test fails much easier than test 0008 since the alias optimizations -# are not in place. -ALL_TESTS="$ALL_TESTS 0009:150:0" +ALL_TESTS="$ALL_TESTS 0008:150:1" +ALL_TESTS="$ALL_TESTS 0009:150:1" test_modprobe() { -- cgit From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 17 Jul 2017 19:47:02 +0200 Subject: genirq/PM: Properly pretend disabled state when force resuming interrupts Interrupts with the IRQF_FORCE_RESUME flag set have also the IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond the suspension of device irqs. Force resume works by pretending that the interrupt was disabled and then calling __irq_enable(). Incrementing the disabled depth counter was enough to do that, but with the recent changes which use state flags to avoid unnecessary hardware access, this is not longer sufficient. If the state flags are not set, then the hardware callbacks are not invoked and the interrupt line stays disabled in "hardware". Set the disabled and masked state when pretending that an interrupt got disabled by suspend. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Suggested-by: Thomas Gleixner Signed-off-by: Juergen Gross Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com --- kernel/irq/chip.c | 10 ---------- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/pm.c | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d171bc57e1e0..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dbfba9933ed2..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 
cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); -- cgit From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:14:16 -0700 Subject: tracing/ring_buffer: Try harder to allocate ftrace can fail to allocate the per-CPU ring buffer on systems with a large number of CPUs coupled with large amounts of memory held in the page cache. Currently the ring buffer allocation doesn't retry in the VM implementation even if direct-reclaim made some progress but still wasn't able to find a free page. On retrying I see that the allocations almost always succeed. The retry doesn't happen because __GFP_NORETRY is used in the tracer to prevent the case where we might OOM; however, if we drop __GFP_NORETRY, we risk destabilizing the system if the OOM killer is triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag introduced recently [1]. Tested that the following still succeeds without destabilizing a system with 1GB of memory. echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb [1] https://marc.info/?l=linux-mm&m=149820805124906&w=2 Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com Cc: Tim Murray Cc: Ingo Molnar Cc: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); -- cgit From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 18 Jul 2017 14:37:24 +0800 Subject: audit: fix memleak in auditd_send_unicast_skb. Found this issue via a kmemleak report: auditd_send_unicast_skb() did not free the skb if rcu_dereference(auditd_conn) returned NULL.
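The leak is the classic shape of an early-exit path dropping a buffer the function owns. A minimal standalone illustration of the ownership rule the fix enforces (hypothetical names, not the audit code):

	#include <stdlib.h>

	struct conn { int fd; };

	/* Stand-in for rcu_dereference(auditd_conn); may be NULL. */
	static struct conn *get_conn(void) { return NULL; }

	/* Takes ownership of buf: every return path must release it. */
	static int send_or_free(void *buf)
	{
		struct conn *c = get_conn();

		if (!c) {
			free(buf);	/* the missing cleanup */
			return -1;
		}
		/* ... transmit, then release ... */
		free(buf);
		return 0;
	}

	int main(void)
	{
		return send_or_free(malloc(256)) ? 1 : 0;
	}

The kmemleak report that exposed it: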
unreferenced object 0xffff88082568ce00 (size 256): comm "auditd", pid 1119, jiffies 4294708499 backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_node+0xcc/0x210 [] __alloc_skb+0x5d/0x290 [] audit_make_reply+0x54/0xd0 [] audit_receive_msg+0x967/0xd70 ---------------- (gdb) list *audit_receive_msg+0x967 0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133). 1132 skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); --------------- [] audit_receive+0x52/0xa0 [] netlink_unicast+0x181/0x240 [] netlink_sendmsg+0x2c2/0x3b0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x102/0x190 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x1a/0xa5 [] 0xffffffffffffffff Signed-off-by: Shu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7cad70214b81..07def5e49cc9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } -- cgit From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 20 Jul 2017 18:36:09 +0800 Subject: tracing: Fix kmemleak in instance_rmdir Hit the kmemleak when executing instance_rmdir, it forgot releasing mem of tracing_cpumask. With this fix, the warn does not appear any more. unreferenced object 0xffff93a8dfaa7c18 (size 8): comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s) hex dump (first 8 bytes): ff ff ff ff ff ff ff ff ........ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] alloc_cpumask_var_node+0x23/0x30 [] alloc_cpumask_var+0xe/0x10 [] instance_mkdir+0x90/0x240 [] tracefs_syscall_mkdir+0x40/0x70 [] vfs_mkdir+0x109/0x1b0 [] SyS_mkdir+0xd0/0x100 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d0ffcc49dba..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- include/linux/trace_events.h | 2 +- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 
5857390ac35a..6383115e9d2c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,8 +145,8 @@ enum { #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_hash __rcu *notrace_hash; + struct ftrace_hash __rcu *filter_hash; struct mutex regex_lock; }; @@ -168,7 +168,7 @@ static inline void ftrace_free_init_mem(void) { } */ struct ftrace_ops { ftrace_func_t func; - struct ftrace_ops *next; + struct ftrace_ops __rcu *next; unsigned long flags; void *private; ftrace_func_t saved_func; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f73cedfa2e0b..536c80ff7ad9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -338,7 +338,7 @@ enum { struct trace_event_file { struct list_head list; struct trace_event_call *event_call; - struct event_filter *filter; + struct event_filter __rcu *filter; struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b6401cf0..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. 
We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. 
a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. 
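The core confusion is reproducible in plain C: with -8 stored in a u64, the unsigned test against -1 and the signed test against 1 both take their fall-through branch, so naively merging the two hints yields the impossible range [0, 1]. A minimal standalone demonstration (ordinary userspace C, not verifier code):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t r1 = (uint64_t)-8;	/* *(u64 *)(r10 -16) = -8 */

		/* 10: (2d) if r1 > r2, with r2 = -1: unsigned compare,
		 * false, so the fall-through "learns" min_value = 0. */
		int jgt = r1 > (uint64_t)-1;

		/* 11: (65) if r1 s> 0x1: signed compare, also false, so
		 * the fall-through "learns" max_value = 1. */
		int jsgt = (int64_t)r1 > 1;

		printf("jgt=%d jsgt=%d r1 as s64=%lld\n",
		       jgt, jsgt, (long long)(int64_t)r1);
		/* prints: jgt=0 jsgt=0 r1 as s64=-8 */
		return 0;
	}

Each learned bound is sound within its own signedness domain; only their combination across domains is meaningless.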
However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) 
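Option i) amounts to tagging each learned range with the signedness of the test that produced it and wiping the range whenever a hint of the other flavour arrives. A simplified standalone model of that rule (assumed names; the real logic lives in reg_set_min_max()/reg_set_min_max_inv() in the diff below):

	#include <stdbool.h>
	#include <stdint.h>

	struct range {
		int64_t min, max;	/* start as INT64_MIN/INT64_MAX */
		bool from_signed;	/* mirrors reg->value_from_signed */
	};

	static void range_reset(struct range *r)
	{
		r->min = INT64_MIN;	/* BPF_REGISTER_MIN_RANGE */
		r->max = INT64_MAX;	/* BPF_REGISTER_MAX_RANGE */
	}

	/* Record one bound learned from a compare of given signedness. */
	static void range_learn(struct range *r, bool signed_test,
				int64_t min, int64_t max)
	{
		if (r->from_signed != signed_test)
			range_reset(r);	/* distrust mixed hints */
		if (min != INT64_MIN)
			r->min = min;
		if (max != INT64_MAX)
			r->max = max;
		r->from_signed = signed_test;
	}

Replaying the first example through this model, the signed test at insn 11 resets the min_value = 0 learned by the unsigned test at insn 10 instead of combining with it.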
There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 621076f56251..8e5d31f6faef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,7 @@ struct bpf_reg_state { u32 min_align; u32 aux_off; u32 aux_off_align; + bool value_from_signed; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. 
*/ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. 
*/ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Jul 2017 16:14:55 +0200 Subject: perf/core: Fix locking for children siblings group read We're missing ctx lock when iterating children siblings within the perf_read path for group reading. 
Following race and crash can happen: User space doing read syscall on event group leader: T1: perf_read lock event->ctx->mutex perf_read_group lock leader->child_mutex __perf_read_group_add(child) list_for_each_entry(sub, &leader->sibling_list, group_entry) ----> sub might be invalid at this point, because it could get removed via perf_event_exit_task_context in T2 Child exiting and cleaning up its events: T2: perf_event_exit_task_context lock ctx->mutex list_for_each_entry_safe(child_event, next, &child_ctx->event_list,... perf_event_exit_event(child) lock ctx->lock perf_group_detach(child) unlock ctx->lock ----> child is removed from sibling_list without any sync with T1 path above ... free_event(child) Before the child is removed from the leader's child_list, (and thus is omitted from perf_read_group processing), we need to ensure that perf_read_group touches child's siblings under its ctx->lock. Peter further notes: | One additional note; this bug got exposed by commit: | | ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") | | which made it possible to actually trigger this code-path. Tested-by: Andi Kleen Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd396770..c17c0881fd36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } -- cgit
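The rule behind the fix generalizes: a list that one path mutates under a spinlock may only be walked under that same lock. The shape of the corrected reader, condensed from the diff above (kernel context assumed, not a standalone program):

	#include <linux/perf_event.h>

	/* Safe against perf_group_detach(), since both paths now
	 * serialize on ctx->lock. */
	static int read_group_locked(struct perf_event *leader,
				     u64 *values, int n)
	{
		struct perf_event_context *ctx = leader->ctx;
		struct perf_event *sub;
		unsigned long flags;

		raw_spin_lock_irqsave(&ctx->lock, flags);
		list_for_each_entry(sub, &leader->sibling_list, group_entry)
			values[n++] += perf_event_count(sub);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);

		return n;
	}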