From: "Juergen Gross" Date: Thu, 18 Feb 2021 09:22:54 +0100 Subject: [PATCH] kernel/smp: Provide CSD lock timeout diagnostics This commit causes csd_lock_wait() to emit diagnostics when a CPU fails to respond quickly enough to one of the smp_call_function() family of function calls. In case such a stall is detected the cpu which ought to execute the function will be pinged again in case the IPI somehow got lost. This commit is based on an upstream patch by Paul E. McKenney. Signed-off-by: Juergen Gross --- diff --git a/kernel/smp.c b/kernel/smp.c --- a/kernel/smp.c +++ b/kernel/smp.c @@ -19,6 +19,9 @@ #include #include #include +#include +#include +#include #include "smpboot.h" @@ -96,6 +99,79 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. */ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +/* Complain if too much time spent waiting. */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned int cpu) +{ + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpu)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpu)), + READ_ONCE(per_cpu(cur_csd_info, cpu))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -103,14 +179,23 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
  */
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd, unsigned int cpu)
 {
-	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
+	int bug_id = 0;
+	u64 ts0, ts1;
+
+	ts1 = ts0 = sched_clock();
+	for (;;) {
+		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, cpu))
+			break;
+		cpu_relax();
+	}
+	smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock(call_single_data_t *csd)
+static __always_inline void csd_lock(call_single_data_t *csd, unsigned int cpu)
 {
-	csd_lock_wait(csd);
+	csd_lock_wait(csd, cpu);
 	csd->flags |= CSD_FLAG_LOCK;
 
 	/*
@@ -148,9 +233,11 @@ static int generic_exec_single(int cpu,
		 * We can unlock early even for the synchronous on-stack case,
		 * since we're doing this from the same CPU..
		 */
+		csd_lock_record(csd);
 		csd_unlock(csd);
 		local_irq_save(flags);
 		func(info);
+		csd_lock_record(NULL);
 		local_irq_restore(flags);
 		return 0;
 	}
 
@@ -238,6 +325,7 @@ static void flush_smp_call_function_queu
 		smp_call_func_t func = csd->func;
 		void *info = csd->info;
 
+		csd_lock_record(csd);
 		/* Do we wait until *after* callback? */
 		if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
 			func(info);
@@ -246,6 +334,7 @@ static void flush_smp_call_function_queu
 			csd_unlock(csd);
 			func(info);
 		}
+		csd_lock_record(NULL);
 	}
 
 	/*
@@ -293,13 +382,13 @@ int smp_call_function_single(int cpu, sm
 	csd = &csd_stack;
 	if (!wait) {
 		csd = this_cpu_ptr(&csd_data);
-		csd_lock(csd);
+		csd_lock(csd, cpu);
 	}
 
 	err = generic_exec_single(cpu, csd, func, info);
 
 	if (wait)
-		csd_lock_wait(csd);
+		csd_lock_wait(csd, cpu);
 
 	put_cpu();
 
@@ -331,7 +420,7 @@ int smp_call_function_single_async(int c
 	/* We could deadlock if we have to wait here with interrupts disabled! */
 	if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
-		csd_lock_wait(csd);
+		csd_lock_wait(csd, cpu);
 
 	csd->flags = CSD_FLAG_LOCK;
 	smp_wmb();
@@ -448,7 +537,7 @@ void smp_call_function_many(const struct
 	for_each_cpu(cpu, cfd->cpumask) {
 		call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
 
-		csd_lock(csd);
+		csd_lock(csd, cpu);
 		if (wait)
 			csd->flags |= CSD_FLAG_SYNCHRONOUS;
 		csd->func = func;
@@ -465,7 +554,7 @@ void smp_call_function_many(const struct
 			call_single_data_t *csd;
 
 			csd = per_cpu_ptr(cfd->csd, cpu);
-			csd_lock_wait(csd);
+			csd_lock_wait(csd, cpu);
 		}
 	}
 }
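---
For reference only (not part of the patch): a minimal sketch of a test module that
exercises the synchronous smp_call_function_single() path the new diagnostics cover.
All csd_diag_demo_* names are hypothetical and chosen purely for illustration. If the
target CPU stalled in the handler for longer than CSD_LOCK_TIMEOUT (5 s), the waiting
CPU should emit the "csd: Detected non-responsive CSD lock" alerts added above.

/* Illustrative sketch only -- module and function names are hypothetical. */
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

/* Runs on the target CPU from the IPI handler; must not sleep. */
static void csd_diag_demo_func(void *info)
{
	pr_info("csd_diag_demo: ran on CPU%d\n", smp_processor_id());
}

static int __init csd_diag_demo_init(void)
{
	unsigned int this_cpu = get_cpu();
	unsigned int target = cpumask_any_but(cpu_online_mask, this_cpu);
	int ret = -ENODEV;	/* needs at least two online CPUs */

	/*
	 * wait=1: the caller spins in csd_lock_wait() until the target CPU
	 * has run the function, which is where the stall diagnostics fire.
	 */
	if (target < nr_cpu_ids)
		ret = smp_call_function_single(target, csd_diag_demo_func, NULL, 1);
	put_cpu();
	return ret;
}

static void __exit csd_diag_demo_exit(void)
{
}

module_init(csd_diag_demo_init);
module_exit(csd_diag_demo_exit);
MODULE_LICENSE("GPL");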