From 61d0fbb4b6f707d57555c9dc6d8a69144e0e8641 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:31:45 +0800
Subject: [PATCH 01/20] workqueue: use "pool->cpu < 0" to stand for an unbound pool

put_unbound_pool() contains a piece of sanity-check code whose meaning,
IIUC, is "if this is not an unbound pool, complain and return".  But the
code tests "pool->flags & POOL_DISASSOCIATED", which is imprecise because
a bound (per-cpu) pool may also have this flag set.  "pool->cpu < 0" is
what actually stands for an unbound pool, so convert the check to it.

Keeping "pool->flags & POOL_DISASSOCIATED" here would not be strictly
wrong, but it is just noise:

1) the check is about "unbound" here, not "[dis]association".
2) "pool->cpu < 0" already implies "pool->flags & POOL_DISASSOCIATED".

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6203d2900877..ea2c5de502aa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3457,7 +3457,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 		return;
 
 	/* sanity checks */
-	if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+	if (WARN_ON(!(pool->cpu < 0)) ||
 	    WARN_ON(!list_empty(&pool->worklist)))
 		return;

From e6a9a7712331b9d2f551cad8630f6a6c6428c721 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:32:07 +0800
Subject: [PATCH 02/20] workqueue: remove the empty check in too_many_workers()

Commit ea1abd6197d5 ("workqueue: reimplement idle worker rebinding") used
a trick which simply removed all to-be-bound idle workers from the idle
list and let them add themselves back after completing rebinding.  As a
result, @worker_pool->nr_idle could deviate from the actual number of
idle workers on @worker_pool->idle_list; more specifically, nr_idle could
be non-zero while ->idle_list was empty.  All users of ->nr_idle and
->idle_list were audited; the only affected one was too_many_workers(),
which was updated to return %false if ->idle_list is empty regardless of
->nr_idle.

That commit/trick was complicated because it only tried to simplify an
even more complicated problem (workers had to rebind themselves).  Commit
a9ab775bcadf ("workqueue: directly restore CPU affinity of workers from
CPU_ONLINE") fixed all of these problems, so the trick became useless and
is gone.  Now @worker_pool->nr_idle is exactly the number of workers on
@worker_pool->idle_list, and too_many_workers() can be restored to what
it was before the trick.  Remove the empty check.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ea2c5de502aa..170b724b2d69 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -758,13 +758,6 @@ static bool too_many_workers(struct worker_pool *pool)
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
-	/*
-	 * nr_idle and idle_list may disagree if idle rebinding is in
-	 * progress.  Never return %true if idle_list is empty.
-	 */
-	if (list_empty(&pool->idle_list))
-		return false;
-
 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }

From e212f361fb5a82b91f0ba2fc9690a47a57e414d1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:32:17 +0800
Subject: [PATCH 03/20] workqueue: use schedule_timeout_interruptible() instead of open code

schedule_timeout_interruptible(CREATE_COOLDOWN) behaves exactly the same
as the open-coded original.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 170b724b2d69..fcdf0430595a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1917,8 +1917,7 @@ restart:
 		if (!need_to_create_worker(pool))
 			break;
 
-		__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(CREATE_COOLDOWN);
+		schedule_timeout_interruptible(CREATE_COOLDOWN);
 
 		if (!need_to_create_worker(pool))
 			break;

From 25ef09586df2b84a595c00ed058916122ff03297 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:32:41 +0800
Subject: [PATCH 04/20] workqueue: remove useless WARN_ON_ONCE()

@cpu is fetched via smp_processor_id() in this function, so the check is
useless.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fcdf0430595a..4cfcd027e4bf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4461,8 +4461,6 @@ static void wq_unbind_fn(struct work_struct *work)
 	struct worker *worker;
 
 	for_each_cpu_worker_pool(pool, cpu) {
-		WARN_ON_ONCE(cpu != smp_processor_id());
-
 		mutex_lock(&pool->attach_mutex);
 		spin_lock_irq(&pool->lock);

From b62c075194f409ed887670af897ad22d44ed8ac8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:32:52 +0800
Subject: [PATCH 05/20] workqueue: clear leftover flags when detached

When a worker is detached, worker->flags may still contain WORKER_UNBOUND
or WORKER_REBOUND.  That is harmless in all cases:

1) if it is a normal worker, it is about to die anyway, so it is OK.
2) if it is a rescuer, it may re-attach to a pool with the leftover
   flag(s); the behaviour is still correct except that it may cause an
   unneeded wakeup.

Correct but not clean, so just clear the leftover flags.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4cfcd027e4bf..4f8d72dae0e8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1663,6 +1663,9 @@ static void worker_detach_from_pool(struct worker *worker,
 	detach_completion = pool->detach_completion;
 	mutex_unlock(&pool->attach_mutex);
 
+	/* clear leftover flags without pool->lock after it is detached */
+	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
 	if (detach_completion)
 		complete(detach_completion);
 }

From 92b69f509196fc5afc6a357511b864d372f42419 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:33:08 +0800
Subject: [PATCH 06/20] workqueue: sanity check pool->cpu in wq_worker_sleeping()

In theory, pool->cpu equals @cpu in wq_worker_sleeping() once
worker->flags has been checked.  A "pool->cpu != cpu" sanity check will
help catch it if something goes wrong.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4f8d72dae0e8..8474e5752f1f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -843,7 +843,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
 	pool = worker->pool;
 
 	/* this can only happen on the local cpu */
-	if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+	if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
 		return NULL;
 
 	/*

From 3de5e88485b22f30403045bd83d4815ae2207b19 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:33:27 +0800
Subject: [PATCH 07/20] workqueue: clear POOL_DISASSOCIATED in rebind_workers()

a9ab775bcadf ("workqueue: directly restore CPU affinity of workers from
CPU_ONLINE") moved pool locking into rebind_workers() but left
"pool->flags &= ~POOL_DISASSOCIATED" in workqueue_cpu_up_callback().
There is nothing necessarily wrong with that, but there is no benefit
either.  Let's move it into rebind_workers() and gain the following
benefits:

1) better readability: POOL_DISASSOCIATED is cleared in rebind_workers()
   as expected.
2) we can guarantee that, when POOL_DISASSOCIATED is clear, the running
   workers of the pool are on the local CPU (pool->cpu).

tj: Minor description update.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8474e5752f1f..68461b8d9b39 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4535,6 +4535,7 @@ static void rebind_workers(struct worker_pool *pool)
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
+	pool->flags &= ~POOL_DISASSOCIATED;
 
 	for_each_pool_worker(worker, pool) {
 		unsigned int worker_flags = worker->flags;
@@ -4637,10 +4638,6 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 			mutex_lock(&pool->attach_mutex);
 
 			if (pool->cpu == cpu) {
-				spin_lock_irq(&pool->lock);
-				pool->flags &= ~POOL_DISASSOCIATED;
-				spin_unlock_irq(&pool->lock);
-
 				rebind_workers(pool);
 			} else if (pool->cpu < 0) {
 				restore_unbound_workers_cpumask(pool, cpu);

From 85327af61d4cb1700d4c71e8080f7c7df9bccafb Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 3 Jun 2014 15:33:28 +0800
Subject: [PATCH 08/20] workqueue: stronger test in process_one_work()

When POOL_DISASSOCIATED is cleared, the running worker's local CPU should
be the same as pool->cpu without any exception, even during CPU hotplug.

This patch changes "(proposition_A && proposition_B && proposition_C)" to
"(proposition_B && proposition_C)", so whenever the old compound
proposition is true, the new one must be true too.  The stronger test
therefore won't hide any possible bug which could be hit by the old test.

tj: Minor description update and dropped the obvious comment.

CC: Jason J. Herne
CC: Sasha Levin
Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 68461b8d9b39..f344334f5690 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2015,13 +2015,7 @@ __acquires(&pool->lock)
 	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
-	/*
-	 * Ensure we're on the correct CPU.  DISASSOCIATED test is
-	 * necessary to avoid spurious warnings from rescuers servicing the
-	 * unbound or a disassociated pool.
-	 */
-	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
-		     !(pool->flags & POOL_DISASSOCIATED) &&
+	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
 		     raw_smp_processor_id() != pool->cpu);
 
 	/*

From 9c34a7042e6fe79b900bd44db03dde8a4ecdf6f6 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 11 Jul 2014 00:11:13 +0800
Subject: [PATCH 09/20] workqueue: reuse the already calculated pwq in try_to_grab_pending()

try_to_grab_pending() was re-calculating the associated pwq using
get_work_pwq() when it already had it cached in a local variable and the
association can't change.  Reuse the local variable instead.

This doesn't introduce any functional changes.

tj: Updated description.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f344334f5690..338d418ba6aa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1225,7 +1225,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 			pwq_activate_delayed_work(work);
 
 		list_del_init(&work->entry);
-		pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+		pwq_dec_nr_in_flight(pwq, get_work_color(work));
 
 		/* work->data points to pwq iff queued, point to pool */
 		set_work_pool_and_keep_pending(work, pool->id);

From f7537df5206929c6a6f878da7c8ecd4143793376 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 15 Jul 2014 17:24:15 +0800
Subject: [PATCH 10/20] workqueue: alloc struct worker on its local node

When create_worker() is called by a non-manager, the struct worker is
allocated on the node of the caller, which may be different from
pool->node.  Add a node ID argument to alloc_worker() to ensure the
struct worker is allocated on the preferred node.

tj: @nid renamed to @node for consistency.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 338d418ba6aa..a791a8c32b4f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1595,11 +1595,11 @@ static void worker_leave_idle(struct worker *worker)
 	list_del_init(&worker->entry);
 }
 
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
 {
 	struct worker *worker;
 
-	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
 	if (worker) {
 		INIT_LIST_HEAD(&worker->entry);
 		INIT_LIST_HEAD(&worker->scheduled);
@@ -1694,7 +1694,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 	if (id < 0)
 		goto fail;
 
-	worker = alloc_worker();
+	worker = alloc_worker(pool->node);
 	if (!worker)
 		goto fail;
 
@@ -4110,7 +4110,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	if (flags & WQ_MEM_RECLAIM) {
 		struct worker *rescuer;
 
-		rescuer = alloc_worker();
+		rescuer = alloc_worker(NUMA_NO_NODE);
 		if (!rescuer)
 			goto err_destroy;

From d8ca83e68c8c7efd8e5920a046d5c576b08609f0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 16 Jul 2014 14:56:36 +0800
Subject: [PATCH 11/20] workqueue: wake regular worker if need_more_worker() when rescuer leaves the pool

We don't need to wake up a regular worker when nr_running==1, so
need_more_worker() is sufficient here.  It also reads better: the name
"keep_working()" implies the rescuer should keep working now, while the
rescuer is actually leaving the pool.

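For reference, a small standalone model of the two conditions (illustrative
only: pool_model and its fields are made-up stand-ins for the kernel's pool
state, not workqueue code; the logic mirrors keep_working() and
need_more_worker()):

	#include <stdbool.h>
	#include <stdio.h>

	struct pool_model {
		int nr_running;		/* running, concurrency-managed workers */
		bool work_pending;	/* pool->worklist is non-empty */
	};

	/* keep_working(): a running worker should keep processing items */
	static bool keep_working(const struct pool_model *p)
	{
		return p->work_pending && p->nr_running <= 1;
	}

	/* need_more_worker(): an idle worker actually needs to be woken up */
	static bool need_more_worker(const struct pool_model *p)
	{
		return p->work_pending && p->nr_running == 0;
	}

	int main(void)
	{
		/* work pending and one regular worker already running */
		struct pool_model p = { .nr_running = 1, .work_pending = true };

		/* keep_working() is true, yet no wakeup is needed */
		printf("keep_working=%d need_more_worker=%d\n",
		       keep_working(&p), need_more_worker(&p));
		return 0;
	}

With nr_running==1 the model prints keep_working=1 need_more_worker=0,
which is exactly the case where waking a regular worker would be useless.
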
Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a791a8c32b4f..d3444169e261 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2313,11 +2313,11 @@ repeat:
 		put_pwq(pwq);
 
 		/*
-		 * Leave this pool.  If keep_working() is %true, notify a
+		 * Leave this pool.  If need_more_worker() is %true, notify a
 		 * regular worker; otherwise, we end up with 0 concurrency
 		 * and stalling the execution.
 		 */
-		if (keep_working(pool))
+		if (need_more_worker(pool))
 			wake_up_worker(pool);
 
 		rescuer->pool = NULL;

From a489a03eca74cd1d5ac771f4b2ae2c826aab0b30 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:01:59 +0800
Subject: [PATCH 12/20] workqueue: remove an unneeded UNBOUND test before waking up the next worker

In process_one_work():

	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
		wake_up_worker(pool);

the first test is unneeded.  Even if the first test is removed, it
doesn't affect the wake-up logic for WORKER_UNBOUND, and it will not
introduce any useless wake-ups for normal per-cpu workers since
nr_running is always >= 1.  It will introduce useless/redundant wake-ups
for CPU_INTENSIVE, but this case is rare and the next patch will also
remove this redundant wake-up.

tj: Minor updates to the description and comment.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d3444169e261..c20cfbe89194 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2048,10 +2048,13 @@ __acquires(&pool->lock)
 		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
 
 	/*
-	 * Unbound pool isn't concurrency managed and work items should be
-	 * executed ASAP.  Wake up another worker if necessary.
+	 * Wake up another worker if necessary.  The condition is always
+	 * false for normal per-cpu workers since nr_running would always
+	 * be >= 1 at this point.  This is used to chain execution of the
+	 * pending work items for WORKER_NOT_RUNNING workers such as the
+	 * UNBOUND ones.
 	 */
-	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+	if (need_more_worker(pool))
 		wake_up_worker(pool);
 
 	/*

From 228f1d0018ba6b24c9f718a97a5bc35b24f1e1e3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:02:00 +0800
Subject: [PATCH 13/20] workqueue: remove @wakeup from worker_set_flags()

worker_set_flags() has only two callers, each specifying %true and %false
for @wakeup.  Let's push the wake-up to the callers and remove @wakeup
from worker_set_flags().  A caller can use the following instead if a
wakeup is necessary:

	worker_set_flags();
	if (need_more_worker(pool))
		wake_up_worker(pool);

This makes the code simpler.  This patch doesn't introduce behavior
changes.

tj: Updated description and comments.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c20cfbe89194..54efc68f656e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -867,35 +867,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
  * worker_set_flags - set worker flags and adjust nr_running accordingly
  * @worker: self
  * @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
  *
- * Set @flags in @worker->flags and adjust nr_running accordingly.  If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
  *
  * CONTEXT:
  * spin_lock_irq(pool->lock)
  */
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
-				    bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
 {
 	struct worker_pool *pool = worker->pool;
 
 	WARN_ON_ONCE(worker->task != current);
 
-	/*
-	 * If transitioning into NOT_RUNNING, adjust nr_running and
-	 * wake up an idle worker as necessary if requested by
-	 * @wakeup.
-	 */
+	/* If transitioning into NOT_RUNNING, adjust nr_running. */
 	if ((flags & WORKER_NOT_RUNNING) &&
 	    !(worker->flags & WORKER_NOT_RUNNING)) {
-		if (wakeup) {
-			if (atomic_dec_and_test(&pool->nr_running) &&
-			    !list_empty(&pool->worklist))
-				wake_up_worker(pool);
-		} else
-			atomic_dec(&pool->nr_running);
+		atomic_dec(&pool->nr_running);
 	}
 
 	worker->flags |= flags;
@@ -2041,18 +2028,20 @@ __acquires(&pool->lock)
 	list_del_init(&work->entry);
 
 	/*
-	 * CPU intensive works don't participate in concurrency
-	 * management.  They're the scheduler's responsibility.
+	 * CPU intensive works don't participate in concurrency management.
+	 * They're the scheduler's responsibility.  This takes @worker out
+	 * of concurrency management and the next code block will chain
+	 * execution of the pending work items.
 	 */
 	if (unlikely(cpu_intensive))
-		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
 	/*
 	 * Wake up another worker if necessary.  The condition is always
 	 * false for normal per-cpu workers since nr_running would always
 	 * be >= 1 at this point.  This is used to chain execution of the
 	 * pending work items for WORKER_NOT_RUNNING workers such as the
-	 * UNBOUND ones.
+	 * UNBOUND and CPU_INTENSIVE ones.
 	 */
 	if (need_more_worker(pool))
 		wake_up_worker(pool);
@@ -2210,7 +2199,7 @@ recheck:
 		}
 	} while (keep_working(pool));
 
-	worker_set_flags(worker, WORKER_PREP, false);
+	worker_set_flags(worker, WORKER_PREP);
 sleep:
 	/*
 	 * pool->lock is held and there's no work to process and no need to

From 051e1850106687896d4c4eeaf6ae4d61c4862e85 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:03:02 +0800
Subject: [PATCH 14/20] workqueue: unfold start_worker() into create_worker()

Simply unfold the code of start_worker() into create_worker() and remove
the original start_worker() and create_and_start_worker().

The only trade-off is the added overhead of releasing and re-grabbing
pool->lock after the new worker is started.  The overhead is acceptable
since the manager is a slow path.

Because of this new locking behaviour, the newly created worker may grab
the lock before the manager and start processing work items.  In that
case, the need_to_create_worker() recheck may be true as expected and the
manager goes back to restart, which is the correct behaviour.

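The recheck can be modelled with a standalone pthread sketch (illustrative
only: pool_model, need_new_worker() and worker_fn() are made-up stand-ins
for pool->lock, need_to_create_worker() and a freshly created worker, not
kernel code):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct pool_model {
		pthread_mutex_t lock;
		int pending;		/* queued work items */
		int idle_workers;	/* workers waiting for work */
	};

	static bool need_new_worker(struct pool_model *p)
	{
		return p->pending > 0 && p->idle_workers == 0;
	}

	/* a newly created worker may pick up work before the manager relocks */
	static void *worker_fn(void *arg)
	{
		struct pool_model *p = arg;

		pthread_mutex_lock(&p->lock);
		if (p->pending > 0)
			p->pending--;		/* starts processing right away */
		else
			p->idle_workers++;
		pthread_mutex_unlock(&p->lock);
		return NULL;
	}

	int main(void)
	{
		struct pool_model p = {
			.lock = PTHREAD_MUTEX_INITIALIZER, .pending = 1,
		};
		pthread_t t;

		pthread_mutex_lock(&p.lock);
		while (need_new_worker(&p)) {
			/* like create_worker(): the lock must be dropped */
			pthread_mutex_unlock(&p.lock);
			pthread_create(&t, NULL, worker_fn, &p);
			pthread_join(t, NULL);
			pthread_mutex_lock(&p.lock);
			/* recheck: the new worker may already be busy */
		}
		pthread_mutex_unlock(&p.lock);
		printf("pending=%d idle=%d\n", p.pending, p.idle_workers);
		return 0;
	}

Because the lock is dropped, the manager cannot assume anything about the
pool's state once it relocks; it has to re-evaluate the condition, which
is why the kernel code rechecks need_to_create_worker() and restarts when
it is still true.
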
tj: Minor updates to description and comments.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 75 +++++++++++-----------------------------------
 1 file changed, 18 insertions(+), 57 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 54efc68f656e..4cb8527a5783 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1540,7 +1540,7 @@ static void worker_enter_idle(struct worker *worker)
 			 (worker->hentry.next || worker->hentry.pprev)))
 		return;
 
-	/* can't use worker_set_flags(), also called from start_worker() */
+	/* can't use worker_set_flags(), also called from create_worker() */
 	worker->flags |= WORKER_IDLE;
 	pool->nr_idle++;
 	worker->last_active = jiffies;
@@ -1661,8 +1661,7 @@ static void worker_detach_from_pool(struct worker *worker,
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
  *
- * Create a new worker which is attached to @pool.  The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
@@ -1707,6 +1706,13 @@ static struct worker *create_worker(struct worker_pool *pool)
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
 
+	/* start the newly created worker */
+	spin_lock_irq(&pool->lock);
+	worker->pool->nr_workers++;
+	worker_enter_idle(worker);
+	wake_up_process(worker->task);
+	spin_unlock_irq(&pool->lock);
+
 	return worker;
 
 fail:
@@ -1716,44 +1722,6 @@ fail:
 	return NULL;
 }
 
-/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
-	worker->pool->nr_workers++;
-	worker_enter_idle(worker);
-	wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
-	struct worker *worker;
-
-	worker = create_worker(pool);
-	if (worker) {
-		spin_lock_irq(&pool->lock);
-		start_worker(worker);
-		spin_unlock_irq(&pool->lock);
-	}
-
-	return worker ? 0 : -ENOMEM;
-}
-
 /**
  * destroy_worker - destroy a workqueue worker
  * @worker: worker to be destroyed
@@ -1892,19 +1860,7 @@ restart:
 	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
 	while (true) {
-		struct worker *worker;
-
-		worker = create_worker(pool);
-		if (worker) {
-			del_timer_sync(&pool->mayday_timer);
-			spin_lock_irq(&pool->lock);
-			start_worker(worker);
-			if (WARN_ON_ONCE(need_to_create_worker(pool)))
-				goto restart;
-			return true;
-		}
-
-		if (!need_to_create_worker(pool))
+		if (create_worker(pool) || !need_to_create_worker(pool))
 			break;
 
 		schedule_timeout_interruptible(CREATE_COOLDOWN);
@@ -1915,6 +1871,11 @@ restart:
 	del_timer_sync(&pool->mayday_timer);
 	spin_lock_irq(&pool->lock);
+	/*
+	 * This is necessary even after a new worker was just successfully
+	 * created as @pool->lock was dropped and the new worker might have
+	 * already become busy.
+	 */
 	if (need_to_create_worker(pool))
 		goto restart;
 	return true;
 }
@@ -3537,7 +3498,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 		goto fail;
 
 	/* create and start the initial worker */
-	if (create_and_start_worker(pool) < 0)
+	if (!create_worker(pool))
 		goto fail;
 
 	/* install */
@@ -4611,7 +4572,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 		for_each_cpu_worker_pool(pool, cpu) {
 			if (pool->nr_workers)
 				continue;
-			if (create_and_start_worker(pool) < 0)
+			if (!create_worker(pool))
 				return NOTIFY_BAD;
 		}
 		break;
@@ -4911,7 +4872,7 @@ static int __init init_workqueues(void)
 
 		for_each_cpu_worker_pool(pool, cpu) {
 			pool->flags &= ~POOL_DISASSOCIATED;
-			BUG_ON(create_and_start_worker(pool) < 0);
+			BUG_ON(!create_worker(pool));
 		}
 	}

From 13b1d625ef430ab7af059099828e1b9fba622369 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:03:47 +0800
Subject: [PATCH 15/20] workqueue: move rescuer pool detachment to the end

In 51697d393922 ("workqueue: use generic attach/detach routine for
rescuers"), the rescuer detaches itself from the pool before put_pwq() so
that put_unbound_pool() will not destroy the rescuer-attached pool.  This
is unnecessary.  worker_detach_from_pool() can be the last statement that
accesses the pool, just as for the regular workers; put_unbound_pool()
will wait for the rescuer to detach and only then free the pool.

So move worker_detach_from_pool() down to coincide with the regular
workers.

tj: Minor description update.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4cb8527a5783..4d9600faa400 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2253,15 +2253,10 @@ repeat:
 			move_linked_works(work, scheduled, &n);
 
 		process_scheduled_works(rescuer);
-		spin_unlock_irq(&pool->lock);
-
-		worker_detach_from_pool(rescuer, pool);
-
-		spin_lock_irq(&pool->lock);
 
 		/*
 		 * Put the reference grabbed by send_mayday().  @pool won't
-		 * go away while we're holding its lock.
+		 * go away while we're still attached to it.
 		 */
 		put_pwq(pwq);
 
@@ -2274,8 +2269,11 @@ repeat:
 			wake_up_worker(pool);
 
 		rescuer->pool = NULL;
-		spin_unlock(&pool->lock);
-		spin_lock(&wq_mayday_lock);
+		spin_unlock_irq(&pool->lock);
+
+		worker_detach_from_pool(rescuer, pool);
+
+		spin_lock_irq(&wq_mayday_lock);
 	}
 
 	spin_unlock_irq(&wq_mayday_lock);

From 29b1cb416a2920fbc70041e4382920ae2d86f426 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:04:27 +0800
Subject: [PATCH 16/20] workqueue: remove the stale comment in pwq_unbound_release_workfn()

In 75ccf5950f82 ("workqueue: prepare flush_workqueue() for dynamic
creation and destrucion of unbound pool_workqueues"), a comment about the
synchronization of the pwq in pwq_unbound_release_workfn() was added.
The comment claimed that the flush_mutex wasn't strictly necessary; that
was correct at the time, because the pwq was protected by workqueue_lock.

It is incorrect now: wq->flush_mutex has been renamed to wq->mutex,
workqueue_lock has been removed, and wq->mutex is strictly needed.  The
comment was simply not updated when the synchronization changed.

This patch removes the incorrect comment and doesn't add a new one to
explain why wq->mutex is needed here; that is obvious, and the "WQ"
annotation on wq->pwqs_node in its definition is a better comment.

The old commit mentioned above also introduced a comment in link_pwq()
about the synchronization.  This comment is also removed in this patch
since the whole of link_pwq() is protected by wq->mutex.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4d9600faa400..0732d33d6dd5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3530,11 +3530,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
 	if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
 		return;
 
-	/*
-	 * Unlink @pwq.  Synchronization against wq->mutex isn't strictly
-	 * necessary on release but do it anyway.  It's easier to verify
-	 * and consistent with the linking path.
-	 */
 	mutex_lock(&wq->mutex);
 	list_del_rcu(&pwq->pwqs_node);
 	is_last = list_empty(&wq->pwqs);
@@ -3631,10 +3626,7 @@ static void link_pwq(struct pool_workqueue *pwq)
 	if (!list_empty(&pwq->pwqs_node))
 		return;
 
-	/*
-	 * Set the matching work_color.  This is synchronized with
-	 * wq->mutex to avoid confusing flush_workqueue().
-	 */
+	/* set the matching work_color */
 	pwq->work_color = wq->work_color;
 
 	/* sync max_active to the current setting */

From 3fb1823c093ebe1869d34005837f64df64713780 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:04:49 +0800
Subject: [PATCH 17/20] workqueue: remove the misnamed out_unlock label in get_unbound_pool()

After the locking was moved up to the caller of get_unbound_pool(), the
out_unlock label no longer performs any unlock and its name became
misleading.  Remove the label and substitute its only user,
"goto out_unlock", with "return pool".

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0732d33d6dd5..a71cf176dce0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3463,7 +3463,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
 		if (wqattrs_equal(pool->attrs, attrs)) {
 			pool->refcnt++;
-			goto out_unlock;
+			return pool;
 		}
 	}
 
@@ -3501,7 +3501,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 
 	/* install */
 	hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
+
 	return pool;
 fail:
 	if (pool)

From ddcb57e2ed0a4d0de5aef06735dd9df98894f818 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 22 Jul 2014 13:05:40 +0800
Subject: [PATCH 18/20] workqueue: use nr_node_ids instead of wq_numa_tbl_len

wq_numa_tbl_len and nr_node_ids are the same thing, and nr_node_ids is
already provided by the memory subsystem.

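To illustrate why the two values are interchangeable, here is a tiny
standalone sketch (MAX_NUMNODES, node_possible[] and the local variables
are made-up stand-ins, not kernel definitions; in the kernel, nr_node_ids
is set up by the memory subsystem as the highest possible node id + 1,
which is exactly what the removed wq_numa_tbl_len loop computed):

	#include <stdio.h>

	#define MAX_NUMNODES 8

	/* made-up possible-node map: nodes 0, 1 and 3 are possible */
	static const int node_possible[MAX_NUMNODES] = { 1, 1, 0, 1 };

	int main(void)
	{
		int node, tbl_len = 0, nr_node_ids = 0;

		for (node = 0; node < MAX_NUMNODES; node++) {
			if (!node_possible[node])
				continue;
			/* what the removed wq_numa_tbl_len loop did */
			if (node + 1 > tbl_len)
				tbl_len = node + 1;
			/* what the memory subsystem computes once at boot */
			nr_node_ids = node + 1;
		}

		/* both end up as highest possible node id + 1, i.e. 4 here */
		printf("tbl_len=%d nr_node_ids=%d\n", tbl_len, nr_node_ids);
		return 0;
	}
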
Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a71cf176dce0..370f9476fc95 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
 
 static struct kmem_cache *pwq_cache;
 
-static int wq_numa_tbl_len;		/* highest possible NUMA node id + 1 */
 static cpumask_var_t *wq_numa_possible_cpumask;
 					/* possible CPUs of each node */
 
@@ -3763,7 +3762,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
 		return -EINVAL;
 
-	pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4011,7 +4010,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 
 	/* allocate wq and format name */
 	if (flags & WQ_UNBOUND)
-		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+		tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
 
 	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
 	if (!wq)
@@ -4782,10 +4781,6 @@ static void __init wq_numa_init(void)
 	cpumask_var_t *tbl;
 	int node, cpu;
 
-	/* determine NUMA pwq table len - highest node id + 1 */
-	for_each_node(node)
-		wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
 	if (num_possible_nodes() <= 1)
 		return;
 
@@ -4802,7 +4797,7 @@ static void __init wq_numa_init(void)
 	 * available.  Build one from cpu_to_node() which should have been
 	 * fully initialized by now.
 	 */
-	tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
 	BUG_ON(!tbl);
 
 	for_each_node(node)

From ed1403ec2bd463050e481cd29be40c504021676c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Sat, 26 Jul 2014 12:03:59 +0800
Subject: [PATCH 19/20] kthread_work: wake up worker only when the worker is idle

If the worker is already executing a work item when another is queued,
we can safely skip the wakeup without worrying about stalling the queue,
thus avoiding spuriously waking up a busy worker.  Spurious wakeups
should be fine but still aren't nice, and avoiding them is trivial here.

tj: Updated description.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/kthread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
 
 	list_add_tail(&work->node, pos);
 	work->worker = worker;
-	if (likely(worker->task))
+	if (!worker->current_work && likely(worker->task))
 		wake_up_process(worker->task);
 }

From 95847e1bd34c0de86039408b24a05f07e788061d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Sat, 26 Jul 2014 12:04:00 +0800
Subject: [PATCH 20/20] kthread_work: remove the unused wait_queue_head

The wait_queue_head_t kthread_work->done has been unused since
flush_kthread_work() was re-implemented.  Remove it along with its
initialization code.  This also makes DEFINE_KTHREAD_WORK_ONSTACK()
unnecessary; it is removed as well.

tj: Updated description.  Removed DEFINE_KTHREAD_WORK_ONSTACK().

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 include/linux/kthread.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 7dcef3317689..13d55206ccf6 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -73,7 +73,6 @@ struct kthread_worker {
 struct kthread_work {
 	struct list_head	node;
 	kthread_work_func_t	func;
-	wait_queue_head_t	done;
 	struct kthread_worker	*worker;
 };
 
@@ -85,7 +84,6 @@ struct kthread_work {
 #define KTHREAD_WORK_INIT(work, fn)	{				\
 	.node = LIST_HEAD_INIT((work).node),				\
 	.func = (fn),							\
-	.done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done),		\
 	}
 
 #define DEFINE_KTHREAD_WORKER(worker)					\
@@ -95,22 +93,16 @@ struct kthread_work {
 	struct kthread_work work = KTHREAD_WORK_INIT(work, fn)
 
 /*
- * kthread_worker.lock and kthread_work.done need their own lockdep class
- * keys if they are defined on stack with lockdep enabled.  Use the
- * following macros when defining them on stack.
+ * kthread_worker.lock needs its own lockdep class key when defined on
+ * stack with lockdep enabled.  Use the following macros in such cases.
 */
 #ifdef CONFIG_LOCKDEP
 # define KTHREAD_WORKER_INIT_ONSTACK(worker)				\
 	({ init_kthread_worker(&worker); worker; })
 # define DEFINE_KTHREAD_WORKER_ONSTACK(worker)				\
 	struct kthread_worker worker = KTHREAD_WORKER_INIT_ONSTACK(worker)
-# define KTHREAD_WORK_INIT_ONSTACK(work, fn)				\
-	({ init_kthread_work((&work), fn); work; })
-# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn)				\
-	struct kthread_work work = KTHREAD_WORK_INIT_ONSTACK(work, fn)
 #else
 # define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker)
-# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) DEFINE_KTHREAD_WORK(work, fn)
 #endif
 
 extern void __init_kthread_worker(struct kthread_worker *worker,
@@ -127,7 +119,6 @@ extern void __init_kthread_worker(struct kthread_worker *worker,
 	memset((work), 0, sizeof(struct kthread_work));			\
 	INIT_LIST_HEAD(&(work)->node);					\
 	(work)->func = (fn);						\
-	init_waitqueue_head(&(work)->done);				\
 	} while (0)
 
 int kthread_worker_fn(void *worker_ptr);