Diffstat (limited to 'kernel/cgroup/rstat.c')
-rw-r--r--  kernel/cgroup/rstat.c | 118
1 file changed, 98 insertions, 20 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb499..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 
+#include <trace/events/cgroup.h>
+
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+				     struct cgroup *cgrp, const bool fast_path)
+{
+	unsigned long flags;
+	bool contended;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+	if (contended) {
+		if (fast_path)
+			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+		else
+			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+		raw_spin_lock_irqsave(cpu_lock, flags);
+	}
+
+	if (fast_path)
+		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+	else
+		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+	return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+			      struct cgroup *cgrp, unsigned long flags,
+			      const bool fast_path)
+{
+	if (fast_path)
+		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+	else
+		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
 	while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		cgrp = parent;
 	}
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	struct cgroup *head = NULL, *parent, *child;
 	unsigned long flags;
 
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	if (child != root)
 		head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
 	return head;
 }
 
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 
 __bpf_hook_end();
 
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+	__acquires(&cgroup_rstat_lock)
+{
+	bool contended;
+
+	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	if (contended) {
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		spin_lock_irq(&cgroup_rstat_lock);
+	}
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+	__releases(&cgroup_rstat_lock)
+{
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-			spin_unlock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_unlock(cgrp, cpu);
 			if (!cond_resched())
 				cpu_relax();
-			spin_lock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_lock(cgrp, cpu);
 		}
 	}
 }
@@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 /**
@@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 	__acquires(&cgroup_rstat_lock)
 {
 	might_sleep();
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
 }
 
 /**
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * @cgrp: cgroup used by tracepoint
  */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock)
 {
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = cgrp->bstat.forceidle_sum;
 #endif
-		cgroup_rstat_flush_release();
+		cgroup_rstat_flush_release(cgrp);
 	} else {
 		root_cgroup_cputime(&bstat);
 		usage = bstat.cputime.sum_exec_runtime;
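One practical consequence of this patch is visible in the last hunk: cgroup_rstat_flush_release() now takes the cgroup that was flushed, so the unlock tracepoint can attribute the lock hold, and every flush-and-hold caller must pass it back. Below is a minimal sketch of what such a caller looks like after the change; example_read_bstat() is a hypothetical illustration (not part of the patch) that assumes it lives in kernel/cgroup/ with access to the internal headers, mirroring how cgroup_base_stat_cputime_show() reads cgrp->bstat while the lock is held.

/*
 * Hypothetical caller, for illustration only: flush a cgroup's rstat,
 * read the aggregated base stats, then drop cgroup_rstat_lock.
 */
static void example_read_bstat(struct cgroup *cgrp,
			       struct cgroup_base_stat *out)
{
	/* Flushes the subtree and returns with cgroup_rstat_lock held. */
	cgroup_rstat_flush_hold(cgrp);

	/* The aggregated counters are stable while the lock is held. */
	*out = cgrp->bstat;

	/*
	 * Previously cgroup_rstat_flush_release(void); it now takes @cgrp
	 * so the cgroup_rstat_unlock tracepoint can report which cgroup
	 * was being flushed.
	 */
	cgroup_rstat_flush_release(cgrp);
}

The cpu_in_loop argument follows the convention documented in the new helpers: the entry points pass -1, while the yield path inside cgroup_rstat_flush_locked() passes the CPU whose per-CPU updated list was just processed, letting a tracepoint consumer distinguish the initial acquisition from re-acquisitions during the flush loop.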