diff options
Diffstat (limited to 'fs/bcachefs/rcu_pending.c')
-rw-r--r-- | fs/bcachefs/rcu_pending.c | 603 |
1 files changed, 603 insertions, 0 deletions
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c new file mode 100644 index 000000000000..8f8d914d3998 --- /dev/null +++ b/fs/bcachefs/rcu_pending.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ + +#include <linux/generic-radix-tree.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/srcu.h> +#include <linux/vmalloc.h> + +#include "rcu_pending.h" +#include "darray.h" +#include "util.h" + +#define static_array_for_each(_a, _i) \ + for (typeof(&(_a)[0]) _i = _a; \ + _i < (_a) + ARRAY_SIZE(_a); \ + _i++) + +enum rcu_pending_special { + RCU_PENDING_KVFREE = 1, + RCU_PENDING_CALL_RCU = 2, +}; + +#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) +#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) + +static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? get_state_synchronize_srcu(ssp) + : get_state_synchronize_rcu(); +} + +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? start_poll_synchronize_srcu(ssp) + : start_poll_synchronize_rcu(); +} + +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +{ + return ssp + ? poll_state_synchronize_srcu(ssp, cookie) + : poll_state_synchronize_rcu(cookie); +} + +static inline void __rcu_barrier(struct srcu_struct *ssp) +{ + return ssp + ? srcu_barrier(ssp) + : rcu_barrier(); +} + +static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func) +{ + if (ssp) + call_srcu(ssp, rhp, func); + else + call_rcu(rhp, func); +} + +struct rcu_pending_seq { + /* + * We're using a radix tree like a vector - we're just pushing elements + * onto the end; we're using a radix tree instead of an actual vector to + * avoid reallocation overhead + */ + GENRADIX(struct rcu_head *) objs; + size_t nr; + struct rcu_head **cursor; + unsigned long seq; +}; + +struct rcu_pending_list { + struct rcu_head *head; + struct rcu_head *tail; + unsigned long seq; +}; + +struct rcu_pending_pcpu { + struct rcu_pending *parent; + spinlock_t lock; + int cpu; + + /* + * We can't bound the number of unprocessed gp sequence numbers, and we + * can't efficiently merge radix trees for expired grace periods, so we + * need darray/vector: + */ + DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; + + /* Third entry is for expired objects: */ + struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; + + struct rcu_head cb; + bool cb_armed; + struct work_struct work; +}; + +static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) +{ + if (p->objs.nr) + return true; + + static_array_for_each(p->lists, i) + if (i->head) + return true; + + return false; +} + +static void rcu_pending_list_merge(struct rcu_pending_list *l1, + struct rcu_pending_list *l2) +{ + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next = l2->head; + l1->tail = l2->tail; + + l2->head = l2->tail = NULL; +} + +static void rcu_pending_list_add(struct rcu_pending_list *l, + struct rcu_head *n) +{ + if (!l->head) + l->head = n; + else + l->tail->next = n; + l->tail = n; + n->next = NULL; +} + +static void merge_expired_lists(struct rcu_pending_pcpu *p) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + + for (struct rcu_pending_list *i = p->lists; i < expired; i++) + if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) + rcu_pending_list_merge(expired, i); +} + +static noinline void __process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + struct rcu_pending_seq objs = {}; + struct rcu_head *list = NULL; + + if (p->objs.nr && + __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { + objs = p->objs.data[0]; + darray_remove_item(&p->objs, p->objs.data); + } + + merge_expired_lists(p); + + list = expired->head; + expired->head = expired->tail = NULL; + + spin_unlock_irqrestore(&p->lock, flags); + + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + for (size_t i = 0; i < objs.nr; ) { + size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); + + kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); + i += nr_this_node; + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + + /* + * low bit of pointer indicates whether rcu_head needs + * to be freed - kvfree_rcu_mightsleep() + */ + BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); + + void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); + bool free_head = ((unsigned long) obj->func) & 1UL; + + kvfree(ptr); + if (free_head) + kfree(obj); + } + + break; + + case RCU_PENDING_CALL_RCU: + for (size_t i = 0; i < objs.nr; i++) { + struct rcu_head *obj = *genradix_ptr(&objs.objs, i); + obj->func(obj); + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + obj->func(obj); + } + break; + + default: + for (size_t i = 0; i < objs.nr; i++) + pending->process(pending, *genradix_ptr(&objs.objs, i)); + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + pending->process(pending, obj); + } + break; + } +} + +static bool process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + /* + * XXX: we should grab the gp seq once and avoid multiple function + * calls, this is called from __rcu_pending_enqueue() fastpath in + * may_sleep==true mode + */ + if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || + (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || + (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || + p->lists[2].head) { + __process_finished_items(pending, p, flags); + return true; + } + + return false; +} + +static void rcu_pending_work(struct work_struct *work) +{ + struct rcu_pending_pcpu *p = + container_of(work, struct rcu_pending_pcpu, work); + struct rcu_pending *pending = p->parent; + unsigned long flags; + + do { + spin_lock_irqsave(&p->lock, flags); + } while (process_finished_items(pending, p, flags)); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static void rcu_pending_rcu_cb(struct rcu_head *rcu) +{ + struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); + + schedule_work_on(p->cpu, &p->work); + + unsigned long flags; + spin_lock_irqsave(&p->lock, flags); + if (__rcu_pending_has_pending(p)) + __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); + else + p->cb_armed = false; + spin_unlock_irqrestore(&p->lock, flags); +} + +static __always_inline struct rcu_pending_seq * +get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +{ + darray_for_each_reverse(p->objs, objs) + if (objs->seq == seq) + return objs; + + if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) + return NULL; + + return &darray_last(p->objs); +} + +static noinline bool +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, + struct rcu_head *head, void *ptr, + unsigned long *flags) +{ + if (ptr) { + if (!head) { + /* + * kvfree_rcu_mightsleep(): we weren't passed an + * rcu_head, but we need one: use the low bit of the + * ponter to free to flag that the head needs to be + * freed as well: + */ + ptr = (void *)(((unsigned long) ptr)|1UL); + head = kmalloc(sizeof(*head), __GFP_NOWARN); + if (!head) { + spin_unlock_irqrestore(&p->lock, *flags); + head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); + /* + * dropped lock, did GFP_KERNEL allocation, + * check for gp expiration + */ + if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { + kvfree(--ptr); + kfree(head); + spin_lock_irqsave(&p->lock, *flags); + return false; + } + } + } + + head->func = ptr; + } +again: + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (i->seq == seq) { + rcu_pending_list_add(i, head); + return false; + } + } + + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (!i->head) { + i->seq = seq; + rcu_pending_list_add(i, head); + return true; + } + } + + merge_expired_lists(p); + goto again; +} + +/* + * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via + * pending->pracess) once grace period elapses. + * + * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall + * back to a linked list. + * + * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a + * process callback + * + * - If @ptr and @head are both not NULL, we're kvfree_rcu() + * + * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() + * + * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process + * expired items. + */ +static __always_inline void +__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + void *ptr, bool may_sleep) +{ + + struct rcu_pending_pcpu *p; + struct rcu_pending_seq *objs; + struct genradix_node *new_node = NULL; + unsigned long seq, flags; + bool start_gp = false; + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + seq = __get_state_synchronize_rcu(pending->srcu); +restart: + if (may_sleep && + unlikely(process_finished_items(pending, p, flags))) + goto check_expired; + + /* + * In kvfree_rcu() mode, the radix tree is only for slab pointers so + * that we can do kfree_bulk() - vmalloc pointers always use the linked + * list: + */ + if (ptr && unlikely(is_vmalloc_addr(ptr))) + goto list_add; + + objs = get_object_radix(p, seq); + if (unlikely(!objs)) + goto list_add; + + if (unlikely(!objs->cursor)) { + /* + * New radix tree nodes must be added under @p->lock because the + * tree root is in a darray that can be resized (typically, + * genradix supports concurrent unlocked allocation of new + * nodes) - hence preallocation and the retry loop: + */ + objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, + objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); + if (unlikely(!objs->cursor)) { + if (may_sleep) { + spin_unlock_irqrestore(&p->lock, flags); + + gfp_t gfp = GFP_KERNEL; + if (!head) + gfp |= __GFP_NOFAIL; + + new_node = genradix_alloc_node(gfp); + if (!new_node) + may_sleep = false; + goto check_expired; + } +list_add: + start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); + goto start_gp; + } + } + + *objs->cursor++ = ptr ?: head; + /* zero cursor if we hit the end of a radix tree node: */ + if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) + objs->cursor = NULL; + start_gp = !objs->nr; + objs->nr++; +start_gp: + if (unlikely(start_gp)) { + /* + * We only have one callback (ideally, we would have one for + * every outstanding graceperiod) - so if our callback is + * already in flight, we may still have to start a grace period + * (since we used get_state() above, not start_poll()) + */ + if (!p->cb_armed) { + p->cb_armed = true; + __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); + } else { + __start_poll_synchronize_rcu(pending->srcu); + } + } + spin_unlock_irqrestore(&p->lock, flags); +free_node: + if (new_node) + genradix_free_node(new_node); + return; +check_expired: + if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + kvfree(ptr); + break; + case RCU_PENDING_CALL_RCU: + head->func(head); + break; + default: + pending->process(pending, head); + break; + } + goto free_node; + } + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + goto restart; +} + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) +{ + __rcu_pending_enqueue(pending, obj, NULL, true); +} + +static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) +{ + struct rcu_head *ret = NULL; + + spin_lock_irq(&p->lock); + darray_for_each(p->objs, objs) + if (objs->nr) { + ret = *genradix_ptr(&objs->objs, --objs->nr); + objs->cursor = NULL; + if (!objs->nr) + genradix_free(&objs->objs); + goto out; + } + + static_array_for_each(p->lists, i) + if (i->head) { + ret = i->head; + i->head = ret->next; + if (!i->head) + i->tail = NULL; + goto out; + } +out: + spin_unlock_irq(&p->lock); + + return ret; +} + +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) +{ + return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); +} + +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) +{ + struct rcu_head *ret = rcu_pending_dequeue(pending); + + if (ret) + return ret; + + int cpu; + for_each_possible_cpu(cpu) { + ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); + if (ret) + break; + } + return ret; +} + +static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + spin_lock_irq(&p->lock); + if (__rcu_pending_has_pending(p) || p->cb_armed) { + spin_unlock_irq(&p->lock); + return true; + } + spin_unlock_irq(&p->lock); + } + + return false; +} + +void rcu_pending_exit(struct rcu_pending *pending) +{ + int cpu; + + if (!pending->p) + return; + + while (rcu_pending_has_pending_or_armed(pending)) { + __rcu_barrier(pending->srcu); + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + + static_array_for_each(p->lists, i) + WARN_ON(i->head); + WARN_ON(p->objs.nr); + darray_exit(&p->objs); + } + free_percpu(pending->p); +} + +/** + * rcu_pending_init: - initialize a rcu_pending + * + * @pending: Object to init + * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal + * RCU flavor + * @process: Callback function invoked on objects once their RCU barriers + * have completed; if NULL, kvfree() is used. + */ +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process) +{ + pending->p = alloc_percpu(struct rcu_pending_pcpu); + if (!pending->p) + return -ENOMEM; + + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + p->parent = pending; + p->cpu = cpu; + spin_lock_init(&p->lock); + darray_init(&p->objs); + INIT_WORK(&p->work, rcu_pending_work); + } + + pending->srcu = srcu; + pending->process = process; + + return 0; +} |