From f728c4a9e8405caae69d4bc1232c54ff57b5d20f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Jul 2021 11:03:52 +0800 Subject: [PATCH 01/10] workqueue: Fix possible memory leaks in wq_numa_init() In error handling branch "if (WARN_ON(node == NUMA_NO_NODE))", the previously allocated memories are not released. Doing this before allocating memory eliminates memory leaks. tj: Note that the condition only occurs when the arch code is pretty broken and the WARN_ON might as well be BUG_ON(). Signed-off-by: Zhen Lei Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f148eacda55a..542c2d03dab6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5902,6 +5902,13 @@ static void __init wq_numa_init(void) return; } + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); @@ -5919,11 +5926,6 @@ static void __init wq_numa_init(void) for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } cpumask_set_cpu(cpu, tbl[node]); } From 67dc8325370844ffce92aa59abe8b453aa6aa83c Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Sat, 31 Jul 2021 08:01:29 +0800 Subject: [PATCH 02/10] workqueue: Fix typo in comments Fix typo: *assing ==> assign *alloced ==> allocated *Retun ==> Return *excute ==> execute v1->v2: *reverse 'iff' *update changelog Signed-off-by: Cai Huoqing Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d15a7730ee18..5fcf3d048a5a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -324,7 +324,7 @@ enum { * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to - * excute the work item on that CPU breaking the idleness, which in + * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 542c2d03dab6..d3c35e47aa90 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -524,7 +524,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** - * worker_pool_assign_id - allocate ID and assing it to @pool + * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned @@ -3763,7 +3763,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } -/* initialize newly alloced @pwq which is associated with @wq and @pool */ +/* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { @@ -5331,7 +5331,7 @@ static int workqueue_apply_unbound_cpumask(void) * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * - * Retun: 0 - Success + * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ From e441b56fe438fd126b9eea7d30c57d3cd3f34e14 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Aug 2021 11:50:36 +0800 Subject: [PATCH 03/10] workqueue: Replace deprecated ida_simple_*() with ida_alloc()/ida_free() Replace ida_simple_get() with ida_alloc() and ida_simple_remove() with ida_free(), the latter is more concise and intuitive. In addition, if ida_alloc() fails, NULL is returned directly. This eliminates unnecessary initialization of two local variables and an 'if' judgment. Signed-off-by: Zhen Lei Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d3c35e47aa90..29f049463d88 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1912,14 +1912,14 @@ static void worker_detach_from_pool(struct worker *worker) */ static struct worker *create_worker(struct worker_pool *pool) { - struct worker *worker = NULL; - int id = -1; + struct worker *worker; + int id; char id_buf[16]; /* ID is needed to determine kthread name */ - id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) - goto fail; + return NULL; worker = alloc_worker(pool->node); if (!worker) @@ -1954,8 +1954,7 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: - if (id >= 0) - ida_simple_remove(&pool->worker_ida, id); + ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2378,7 +2377,7 @@ woke_up: set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); - ida_simple_remove(&pool->worker_ida, worker->id); + ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); kfree(worker); return 0; From ffd8bea81fbb5abe6518bce8d6297a149b935cf7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:20 +0200 Subject: [PATCH 04/10] workqueue: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Tejun Heo Reviewed-by: Lai Jiangshan Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29f049463d88..9bce39dba297 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3292,7 +3292,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -3304,7 +3304,7 @@ int schedule_on_each_cpu(work_func_t func) for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); + cpus_read_unlock(); free_percpu(works); return 0; } @@ -4015,14 +4015,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); - put_online_cpus(); + cpus_read_unlock(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, @@ -4067,7 +4067,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. */ @@ -4195,7 +4195,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - get_online_cpus(); + cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ @@ -4205,7 +4205,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -5167,10 +5167,10 @@ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { long ret = -ENODEV; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(cpu)) ret = work_on_cpu(cpu, fn, arg); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_on_cpu_safe); @@ -5442,7 +5442,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, @@ -5452,7 +5452,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return written; } From f97a4a1a3f8769e3452885967955e21c88f3f263 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:34 +0800 Subject: [PATCH 05/10] workqueue: Rename "delayed" (delayed by active management) to "inactive" There are two kinds of "delayed" work items in workqueue subsystem. One is for timer-delayed work items which are visible to workqueue users. The other kind is for work items delayed by active management which can not be directly visible to workqueue users. We mixed the word "delayed" for both kinds and caused somewhat ambiguity. This patch renames the later one (delayed by active management) to "inactive", because it is used for workqueue active management and most of its related symbols are named with "active" or "activate". All "delayed" and "DELAYED" are carefully checked and renamed one by one to avoid accidentally changing the name of the other kind for timer-delayed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 +-- kernel/workqueue.c | 58 +++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5fcf3d048a5a..3d4edd072a9b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -29,7 +29,7 @@ void delayed_work_timer_fn(struct timer_list *t); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ + WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -42,7 +42,7 @@ enum { WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, - WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, + WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9bce39dba297..9a00ba096032 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -207,7 +207,7 @@ struct pool_workqueue { /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ + struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -1136,7 +1136,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -1144,16 +1144,16 @@ static void pwq_activate_delayed_work(struct work_struct *work) if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } -static void pwq_activate_first_delayed(struct pool_workqueue *pwq) +static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_delayed_work(work); + pwq_activate_inactive_work(work); } /** @@ -1176,10 +1176,10 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) pwq->nr_in_flight[color]--; pwq->nr_active--; - if (!list_empty(&pwq->delayed_works)) { - /* one down, submit a delayed one */ + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ if (pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); } /* is flush in progress and are we at the flushing tip? */ @@ -1281,14 +1281,14 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* - * A delayed work item cannot be grabbed directly because + * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse pwq->nr_active + * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - pwq_activate_delayed_work(work); + if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) + pwq_activate_inactive_work(work); list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, get_work_color(work)); @@ -1490,8 +1490,8 @@ retry: if (list_empty(worklist)) pwq->pool->watchdog_ts = jiffies; } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &pwq->delayed_works; + work_flags |= WORK_STRUCT_INACTIVE; + worklist = &pwq->inactive_works; } debug_work_activate(work); @@ -2530,7 +2530,7 @@ repeat: /* * The above execution of rescued work items could * have created more to rescue through - * pwq_activate_first_delayed() or chained + * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't @@ -2956,7 +2956,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -3712,7 +3712,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items + * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) @@ -3741,9 +3741,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && + while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) { - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); kick = true; } @@ -3774,7 +3774,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); @@ -4361,7 +4361,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; @@ -4557,7 +4557,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - ret = !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); @@ -4753,11 +4753,11 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("\n"); } - if (!list_empty(&pwq->delayed_works)) { + if (!list_empty(&pwq->inactive_works)) { bool comma = false; - pr_info(" delayed:"); - list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_info(" inactive:"); + list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } @@ -4787,7 +4787,7 @@ void show_workqueue_state(void) bool idle = true; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { idle = false; break; } @@ -4799,7 +4799,7 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) show_pwq(pwq); raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* @@ -5182,7 +5182,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of + * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: From c4560c2c88a4c809800ba8e76faabaf80bf6ee89 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:35 +0800 Subject: [PATCH 06/10] workqueue: Change arguement of pwq_dec_nr_in_flight() Make pwq_dec_nr_in_flight() use work_data rather just work_color. Prepare for later patch to get WORK_STRUCT_INACTIVE bit from work_data in pwq_dec_nr_in_flight(). No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a00ba096032..41796000f3eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -579,9 +579,9 @@ static unsigned int work_color_to_flags(int color) return color << WORK_STRUCT_COLOR_SHIFT; } -static int get_work_color(struct work_struct *work) +static int get_work_color(unsigned long work_data) { - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } @@ -1159,7 +1159,7 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest - * @color: color of work which left the queue + * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. @@ -1167,8 +1167,10 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { + int color = get_work_color(work_data); + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) goto out_put; @@ -1291,7 +1293,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_inactive_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, get_work_color(work)); + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -2172,7 +2174,7 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; - int work_color; + unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP /* @@ -2208,7 +2210,7 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - work_color = get_work_color(work); + work_data = *work_data_bits(work); /* * Record wq name for cmdline and debug reporting, may get @@ -2314,7 +2316,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - pwq_dec_nr_in_flight(pwq, work_color); + pwq_dec_nr_in_flight(pwq, work_data); } /** From d21cece0dbb424ad3ff9e49bde6954632b8efede Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:36 +0800 Subject: [PATCH 07/10] workqueue: Change the code of calculating work_flags in insert_wq_barrier() Add a local var @work_flags to calculate work_flags step by step, so that we don't need to squeeze several flags in only the last line of code. Parepare for next patch to add a bit to barrier work item's flag. Not squshing this to next patch makes it clear that what it will have changed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41796000f3eb..84fd2a8f56aa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2659,8 +2659,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); struct list_head *head; - unsigned int linked = 0; /* * debugobject calls are safe here even with pool->lock locked @@ -2686,13 +2686,12 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, head = target->entry.next; /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; + work_flags |= *bits & WORK_STRUCT_LINKED; __set_bit(WORK_STRUCT_LINKED_BIT, bits); } debug_work_activate(&barr->work); - insert_work(pwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); + insert_work(pwq, &barr->work, head, work_flags); } /** From 018f3a13dd6300701103f268b6bfec0a56beea57 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:37 +0800 Subject: [PATCH 08/10] workqueue: Mark barrier work with WORK_STRUCT_INACTIVE Currently, WORK_NO_COLOR has two meanings: Not participate in flushing Not participate in nr_active And only non-barrier work items are marked with WORK_STRUCT_INACTIVE when they are in inactive_works list. The barrier work items are not marked INACTIVE even linked in inactive_works list since these tail items are always moved together with the head work item. These definitions are simple, clean and practical. (Except a small blemish that only the first meaning of WORK_NO_COLOR is documented in include/linux/workqueue.h while both meanings are in workqueue.c) But dual-purpose WORK_NO_COLOR used for barrier work items has proven to be problematical[1]. Only the second purpose is obligatory. So we plan to make barrier work items participate in flushing but keep them still not participating in nr_active. So the plan is to mark barrier work items inactive without using WORK_NO_COLOR in this patch so that we can assign a flushing color to them in next patch. The reasonable way is to add or reuse a bit in work data of the work item. But adding a bit will double the size of pool_workqueue. Currently, WORK_STRUCT_INACTIVE is only used in try_to_grab_pending() for user-queued work items and try_to_grab_pending() can't work for barrier work items. So we extend WORK_STRUCT_INACTIVE to also mark barrier work items no matter which list they are in because we don't need to determind which list a barrier work item is in. So the meaning of WORK_STRUCT_INACTIVE becomes just "the work items don't participate in nr_active" (no matter whether it is a barrier work item or a user-queued work item). And WORK_STRUCT_INACTIVE for user-queued work items means they are in inactive_works list. This patch does it by setting WORK_STRUCT_INACTIVE for barrier work items in insert_wq_barrier() and checking WORK_STRUCT_INACTIVE first in pwq_dec_nr_in_flight(). And the meaning of WORK_NO_COLOR is reduced to only "not participating in flushing". There is no functionality change intended in this patch. Because WORK_NO_COLOR+WORK_STRUCT_INACTIVE represents the previous WORK_NO_COLOR in meaning and try_to_grab_pending() doesn't use for barrier work items and avoids being confused by this extended WORK_STRUCT_INACTIVE. A bunch of comment for nr_active & WORK_STRUCT_INACTIVE is also added for documenting how WORK_STRUCT_INACTIVE works in nr_active management. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 84fd2a8f56aa..6657bcbd2064 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -205,6 +205,23 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + + /* + * nr_active management and WORK_STRUCT_INACTIVE: + * + * When pwq->nr_active >= max_active, new work item is queued to + * pwq->inactive_works instead of pool->worklist and marked with + * WORK_STRUCT_INACTIVE. + * + * All work items marked with WORK_STRUCT_INACTIVE do not participate + * in pwq->nr_active and all work items in pwq->inactive_works are + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE + * work items are in pwq->inactive_works. Some of them are ready to + * run in pool->worklist or worker->scheduled. Those work itmes are + * only struct wq_barrier which is used for flush_work() and should + * not participate in pwq->nr_active. For non-barrier work item, it + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ @@ -1171,19 +1188,21 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - /* uncolored work items don't participate in flushing or nr_active */ + if (!(work_data & WORK_STRUCT_INACTIVE)) { + pwq->nr_active--; + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); + } + } + + /* uncolored work items don't participate in flushing */ if (color == WORK_NO_COLOR) goto out_put; pwq->nr_in_flight[color]--; - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - } - /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; @@ -1283,6 +1302,10 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* + * A cancelable inactive work item must be in the + * pwq->inactive_works since a queued barrier can't be + * canceled (see the comments in insert_wq_barrier()). + * * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left * on the inactive_works list, will confuse pwq->nr_active @@ -2675,6 +2698,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; + /* The barrier work item does not participate in pwq->nr_active. */ + work_flags |= WORK_STRUCT_INACTIVE; + /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. From d812796eb3906424cd3c0eea530983961e2f88bd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:38 +0800 Subject: [PATCH 09/10] workqueue: Assign a color to barrier work items There was no strong reason to or not to flush barrier work items in flush_workqueue(). And we have to make barrier work items not participate in nr_active so we had been using WORK_NO_COLOR for them which also makes them can't be flushed by flush_workqueue(). And the users of flush_workqueue() often do not intend to wait barrier work items issued by flush_work(). That made the choice sound perfect. But barrier work items have reference to internal structure (pool_workqueue) and the worker thread[s] is/are still busy for the workqueue user when the barrrier work items are not done. So it is reasonable to make flush_workqueue() also watch for flush_work() to make it more robust. And a problem[1] reported by Li Zhe shows that we need such robustness. The warning logs are listed below: WARNING: CPU: 0 PID: 19336 at kernel/workqueue.c:4430 destroy_workqueue+0x11a/0x2f0 ***** destroy_workqueue: test_workqueue9 has the following busy pwq pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=0/1 refcnt=2 in-flight: 5658:wq_barrier_func Showing busy workqueues and worker pools: ***** It shows that even after drain_workqueue() returns, the barrier work item is still in flight and the pwq (and a worker) is still busy on it. The problem is caused by flush_workqueue() not watching flush_work(): Thread A Worker /* normal work item with linked */ process_scheduled_works() destroy_workqueue() process_one_work() drain_workqueue() /* run normal work item */ /-- pwq_dec_nr_in_flight() flush_workqueue() <---/ /* the last normal work item is done */ sanity_check process_one_work() /-- raw_spin_unlock_irq(&pool->lock) raw_spin_lock_irq(&pool->lock) <-/ /* maybe preempt */ *WARNING* wq_barrier_func() /* maybe preempt by cond_resched() */ Thread A can get the pool lock after the Worker unlocks the pool lock before running wq_barrier_func(). And if there is any preemption happen around wq_barrier_func(), destroy_workqueue()'s sanity check is more likely to get the lock and catch it. (Note: preemption is not necessary to cause the bug, the unlocking is enough to possibly trigger the WARNING.) A simple solution might be just executing all linked barrier work items once without releasing pool lock after the head work item's pwq_dec_nr_in_flight(). But this solution has two problems: 1) the head work item might also be barrier work item when the user-queued work item is cancelled. For example: thread 1: thread 2: queue_work(wq, &my_work) flush_work(&my_work) cancel_work_sync(&my_work); /* Neiter my_work nor the barrier work is scheduled. */ destroy_workqueue(wq); /* This is an easier way to catch the WARNING. */ 2) there might be too much linked barrier work items and running them all once without releasing pool lock just causes trouble. The only solution is to make flush_workqueue() aslo watch barrier work items. So we have to assign a color to these barrier work items which is the color of the head (user-queued) work item. Assigning a color doesn't cause any problem in ative management, because the prvious patch made barrier work items not participate in nr_active via WORK_STRUCT_INACTIVE rather than reliance on the (old) WORK_NO_COLOR. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Reported-by: Li Zhe Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++++++-------- kernel/workqueue_internal.h | 3 ++- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6657bcbd2064..33a6b4a2443d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1197,10 +1197,6 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ } } - /* uncolored work items don't participate in flushing */ - if (color == WORK_NO_COLOR) - goto out_put; - pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ @@ -1307,7 +1303,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be grabbed directly because - * it might have linked NO_COLOR work items which, if left + * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. @@ -2234,6 +2230,7 @@ __acquires(&pool->lock) worker->current_func = work->func; worker->current_pwq = pwq; work_data = *work_data_bits(work); + worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get @@ -2339,6 +2336,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; + worker->current_color = INT_MAX; pwq_dec_nr_in_flight(pwq, work_data); } @@ -2682,7 +2680,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { - unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); + unsigned int work_flags = 0; + unsigned int work_color; struct list_head *head; /* @@ -2705,17 +2704,22 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ - if (worker) + if (worker) { head = worker->scheduled.next; - else { + work_color = worker->current_color; + } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; + work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } + pwq->nr_in_flight[work_color]++; + work_flags |= work_color_to_flags(work_color); + debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 498de0e909a4..e00b1204a8e9 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -30,7 +30,8 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + unsigned int current_color; /* L: current_work's color */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ From bdb0a6548d2233dada752109a71bcf4c9b8ae6d6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:39 +0800 Subject: [PATCH 10/10] workqueue: Remove unused WORK_NO_COLOR WORK_NO_COLOR has no user now, just remove it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3d4edd072a9b..2ebef6b1a3d6 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -51,19 +51,14 @@ enum { WORK_STRUCT_STATIC = 0, #endif - /* - * The last color is no color used for works which don't - * participate in workqueue flushing. - */ - WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, - WORK_NO_COLOR = WORK_NR_COLORS, + WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. - * This makes pwqs aligned to 256 bytes and allows 15 workqueue + * This makes pwqs aligned to 256 bytes and allows 16 workqueue * flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT +