blk-mq: use static mapping

The blk-mq layer remaps s/w contexts to h/w contexts, and h/w contexts
to CPUs, whenever a CPU hotplug event happens. This remapping has to
wait for queue freezing, which may take tens of milliseconds, resulting
in high latency on the CPU hotplug path.
This patch makes the above-mentioned mappings static, so that remapping
can be avoided when a CPU hotplug event happens. This improves CPU
hotplug latency by up to 90 percent on the CPU up path and by up to
50 percent on the CPU down path.

Change-Id: Idf38cb6c4e78c91fda3c86608c6d0441f01ab435
Signed-off-by: Imran Khan <kimran@codeaurora.org>
Imran Khan 2016-09-15 16:44:02 +05:30
parent 4b91c1a0f7
commit 6b520c7ac7
3 changed files with 39 additions and 85 deletions
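
As a rough illustration of the approach described in the commit message, here is a
minimal standalone C sketch (not kernel code; NR_CPUS, NR_HW_QUEUES and the
simplified cpu_to_queue_index() are illustrative assumptions): the map is computed
once over all possible CPUs, so an online/offline transition never forces a remap
or a queue freeze.

/*
 * Standalone model of the idea (not kernel code): build the CPU -> hw-queue
 * map once over all possible CPUs, so a hotplug event never has to rebuild
 * it. NR_CPUS and NR_HW_QUEUES are made-up values; cpu_to_queue_index()
 * mimics an even-spreading rule.
 */
#include <stdio.h>

#define NR_CPUS       8	/* stand-in for nr_cpu_ids */
#define NR_HW_QUEUES  3

static unsigned int cpu_to_queue_index(unsigned int nr_cpus,
				       unsigned int nr_queues,
				       unsigned int cpu)
{
	/* Spread CPUs evenly over the available hardware queues. */
	return cpu * nr_queues / nr_cpus;
}

int main(void)
{
	unsigned int map[NR_CPUS];
	unsigned int cpu;

	/* Built once over every *possible* CPU; never rebuilt on hotplug. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		map[cpu] = cpu_to_queue_index(NR_CPUS, NR_HW_QUEUES, cpu);

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%u -> hw queue %u\n", cpu, map[cpu]);

	return 0;
}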

block/blk-mq-cpumap.c

@@ -31,8 +31,8 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
const struct cpumask *online_mask)
static int blk_mq_update_queue_map(unsigned int *map,
unsigned int nr_queues, const struct cpumask *online_mask)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
@@ -52,18 +52,14 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
queue = 0;
for_each_possible_cpu(i) {
if (!cpumask_test_cpu(i, online_mask)) {
map[i] = 0;
continue;
}
/*
* Easy case - we have equal or more hardware queues. Or
* there are no thread siblings to take into account. Do
* 1:1 if enough, or sequential mapping if less.
*/
if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
if (nr_queues >= nr_cpu_ids) {
map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues,
queue);
queue++;
continue;
}
@@ -75,7 +71,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
*/
first_sibling = get_first_sibling(i);
if (first_sibling == i) {
map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues,
queue);
queue++;
} else
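
For the sibling handling in the hunk above, a loose standalone sketch (again not
kernel code; the topology table and the modulo assignment are assumptions) of how
only the first thread of each core takes a fresh queue slot while its siblings
inherit that assignment:

/*
 * Loose model of the thread-sibling handling: only the first CPU of each
 * sibling pair consumes a fresh queue slot; its siblings reuse it.
 */
#include <stdio.h>

#define NR_CPUS      8
#define NR_HW_QUEUES 3

/* first_sibling[i] == i marks the leading thread of a core (made-up topology). */
static const unsigned int first_sibling[NR_CPUS] = { 0, 0, 2, 2, 4, 4, 6, 6 };

int main(void)
{
	unsigned int map[NR_CPUS];
	unsigned int cpu, queue = 0;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (first_sibling[cpu] == cpu)
			map[cpu] = queue++ % NR_HW_QUEUES;	/* fresh slot */
		else
			map[cpu] = map[first_sibling[cpu]];	/* share the sibling's queue */
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%u -> hw queue %u\n", cpu, map[cpu]);

	return 0;
}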

block/blk-mq.c

@@ -1783,10 +1783,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
continue;
hctx = q->mq_ops->map_queue(q, i);
/*
@@ -1820,12 +1816,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* Map software to hardware queues
*/
queue_for_each_ctx(q, ctx, i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
if (cpumask_test_cpu(i, online_mask))
cpumask_set_cpu(i, hctx->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
@@ -1863,17 +1856,22 @@ static void blk_mq_map_swqueue(struct request_queue *q,
/*
* Initialize batch roundrobin counts
* Set next_cpu only for those hctxs that have an online CPU
* in their cpumask. For an hctx whose cpumask holds both
* online and offline CPUs, this will always provide an
* online one. For hctxs whose CPUs are all offline, the
* cpumask will be updated in reinit_notify.
*/
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
if (cpumask_first(hctx->cpumask) < nr_cpu_ids) {
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
}
queue_for_each_ctx(q, ctx, i) {
if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->tags->cpumask);
if (cpumask_test_cpu(i, online_mask))
cpumask_set_cpu(i, hctx->tags->cpumask);
}
}
@@ -2101,38 +2099,13 @@ void blk_mq_free_queue(struct request_queue *q)
blk_mq_free_hw_queues(q, set);
}
/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q,
const struct cpumask *online_mask)
{
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
* we should change hctx numa_node according to new topology (this
* involves free and re-allocate memory, worthy doing?)
*/
blk_mq_map_swqueue(q, online_mask);
blk_mq_sysfs_register(q);
}
static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
struct request_queue *q;
struct blk_mq_hw_ctx *hctx;
int i;
int cpu = (unsigned long)hcpu;
/*
* New online cpumask which is going to be set in this hotplug event.
* Declare this cpumasks as global as cpu-hotplug operation is invoked
* one-by-one and dynamically allocating this could result in a failure.
*/
static struct cpumask online_new;
/*
* Before hotadded cpu starts handling requests, new mappings must
@@ -2154,44 +2127,31 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DEAD:
case CPU_UP_CANCELED:
cpumask_copy(&online_new, cpu_online_mask);
mutex_lock(&all_q_mutex);
list_for_each_entry(q, &all_q_list, all_q_node) {
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear_cpu(cpu, hctx->cpumask);
cpumask_clear_cpu(cpu, hctx->tags->cpumask);
}
}
mutex_unlock(&all_q_mutex);
break;
case CPU_UP_PREPARE:
cpumask_copy(&online_new, cpu_online_mask);
cpumask_set_cpu(cpu, &online_new);
/* Update hctx->cpumask for newly onlined CPUs */
mutex_lock(&all_q_mutex);
list_for_each_entry(q, &all_q_list, all_q_node) {
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_set_cpu(cpu, hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
cpumask_set_cpu(cpu, hctx->tags->cpumask);
}
}
mutex_unlock(&all_q_mutex);
break;
default:
return NOTIFY_OK;
}
mutex_lock(&all_q_mutex);
/*
* We need to freeze and reinit all existing queues. Freezing
* involves synchronous wait for an RCU grace period and doing it
* one by one may take a long time. Start freezing all queues in
* one swoop and then wait for the completions so that freezing can
* take place in parallel.
*/
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node) {
blk_mq_freeze_queue_wait(q);
/*
* timeout handler can't touch hw queue during the
* reinitialization
*/
del_timer_sync(&q->timeout);
}
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_queue_reinit(q, &online_new);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_unfreeze_queue(q);
mutex_unlock(&all_q_mutex);
return NOTIFY_OK;
}
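
To make the effect of the rewritten notifier concrete, here is a standalone sketch
(not the kernel code path; the function names, the 64-CPU limit and the map[]
contents are assumptions): a hotplug handler now only sets or clears the CPU's bit
in the relevant queue mask, and the freeze/remap sequence that used to follow is
gone. The patch itself walks all queues under all_q_mutex and updates both
hctx->cpumask and hctx->tags->cpumask.

/*
 * Standalone model of the trimmed-down hotplug handling: toggle one bit,
 * leave the static cpu -> queue map alone.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_HW_QUEUES 3
#define NR_CPUS      64

static uint64_t hctx_mask[NR_HW_QUEUES];	/* stand-in for hctx->cpumask */
static unsigned int map[NR_CPUS];		/* static cpu -> queue map */

static void cpu_up_prepare(unsigned int cpu)
{
	/* CPU coming online: mark it in the queue it is statically mapped to. */
	hctx_mask[map[cpu]] |= 1ULL << cpu;
}

static void cpu_dead(unsigned int cpu)
{
	/* CPU going away: clear its bit; the map itself stays untouched. */
	hctx_mask[map[cpu]] &= ~(1ULL << cpu);
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		map[cpu] = cpu % NR_HW_QUEUES;	/* some fixed assignment */

	cpu_up_prepare(5);
	printf("queue %u mask after cpu5 online:  %#llx\n",
	       map[5], (unsigned long long)hctx_mask[map[5]]);

	cpu_dead(5);
	printf("queue %u mask after cpu5 offline: %#llx\n",
	       map[5], (unsigned long long)hctx_mask[map[5]]);

	return 0;
}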

block/blk-mq.h

@@ -48,8 +48,6 @@ void blk_mq_disable_hotplug(void);
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
const struct cpumask *online_mask);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*