// SPDX-License-Identifier: GPL-2.0
/*
* Detect hard lockups on a system
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
* Note: Most of this code is borrowed heavily from the original softlockup
* detector, so thanks to Ingo for the initial implementation.
* Some chunks also taken from the old x86-specific nmi watchdog code, thanks
* to those contributors as well.
*/
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/perf_event.h>
#include <asm/irq_regs.h>
#include "lockup.h"
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static cpumask_t __read_mostly watchdog_cpus;
static u64 __read_mostly sample_period;
static int hardlockup_thresh = 10; /* stalled samples, one per second */
static int hardlockup_panic = 1;
module_param(hardlockup_panic, int, 0644);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_lock_cnt);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
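/* One sample per second; more than hardlockup_thresh stalled samples trips the detector */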
static void set_sample_period(void)
{
	sample_period = 1 * (u64)NSEC_PER_SEC;
}
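/*
 * Return the cpu after @cpu in watchdog_cpus, wrapping around, or
 * nr_cpu_ids if @cpu is the only cpu in the mask.
 */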
static unsigned int watchdog_next_cpu(unsigned int cpu)
{
	cpumask_t cpus = watchdog_cpus;
	unsigned int next_cpu;
	next_cpu = cpumask_next(cpu, &cpus);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(&cpus);
	if (next_cpu == cpu)
		return nr_cpu_ids;
	return next_cpu;
}
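/*
 * A cpu whose hrtimer_interrupts counter has not moved since the last check
 * is stalled; after more than hardlockup_thresh consecutive stalled samples
 * it is considered hard locked up.
 */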
static int is_hardlockup_other_cpu(unsigned int cpu)
{
	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
	unsigned long lock_cnt = per_cpu(hrtimer_interrupts_lock_cnt, cpu);
	if (hrint == per_cpu(hrtimer_interrupts_saved, cpu)) {
		per_cpu(hrtimer_interrupts_lock_cnt, cpu) = ++lock_cnt;
		if (lock_cnt > hardlockup_thresh)
			return 1;
	} else {
		per_cpu(hrtimer_interrupts_lock_cnt, cpu) = 0;
	}
	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
	return 0;
}
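/* Called from this cpu's hrtimer to check the next cpu in the ring */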
static void watchdog_check_hardlockup_other_cpu(void)
{
	unsigned int next_cpu;
	/* check for a hardlockup on the next cpu */
	next_cpu = watchdog_next_cpu(smp_processor_id());
	if (next_cpu >= nr_cpu_ids)
		return;
	/*
	 * Pairs with the smp_wmb() in aml_watchdog_nmi_enable()/_disable():
	 * if we can see next_cpu in watchdog_cpus, we must also see its
	 * watchdog_nmi_touch flag.
	 */
	smp_rmb();
	if (per_cpu(watchdog_nmi_touch, next_cpu)) {
		per_cpu(watchdog_nmi_touch, next_cpu) = false;
		return;
	}
	if (is_hardlockup_other_cpu(next_cpu)) {
		/* only warn once */
		if (per_cpu(hard_watchdog_warn, next_cpu))
			return;
		pr_lockup_info(next_cpu);
		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %u",
			      next_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
			     next_cpu);
		per_cpu(hard_watchdog_warn, next_cpu) = true;
	} else {
		per_cpu(hard_watchdog_warn, next_cpu) = false;
	}
}
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	/* kick the hardlockup detector */
	__this_cpu_inc(hrtimer_interrupts);
	/* test for hardlockups on the next cpu */
	watchdog_check_hardlockup_other_cpu();
	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
	return HRTIMER_RESTART;
}
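/* Add @cpu to the ring of mutually watching cpus */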
static int aml_watchdog_nmi_enable(unsigned int cpu)
{
	/*
	 * The new cpu will be marked online before the first hrtimer interrupt
	 * runs on it. If another cpu tests for a hardlockup on the new cpu
	 * before it has run its first hrtimer, it will get a false positive.
	 * Touch the watchdog on the new cpu to delay the first check for at
	 * least 3 sampling periods to guarantee one hrtimer has run on the new
	 * cpu.
	 */
	per_cpu(watchdog_nmi_touch, cpu) = true;
	/*
	 * Pairs with the smp_rmb() in watchdog_check_hardlockup_other_cpu():
	 * publish the touch flag before the cpu appears in watchdog_cpus.
	 */
	smp_wmb();
	cpumask_set_cpu(cpu, &watchdog_cpus);
	return 0;
}
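/* Remove @cpu from the ring of mutually watching cpus */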
static void aml_watchdog_nmi_disable(unsigned int cpu)
{
	unsigned int next_cpu = watchdog_next_cpu(cpu);
	/*
	 * Offlining this cpu will cause the cpu before this one to start
	 * checking the one after this one. If this cpu just finished checking
	 * the next cpu and updating hrtimer_interrupts_saved, and then the
	 * previous cpu checks it within one sample period, it will trigger a
	 * false positive. Touch the watchdog on the next cpu to prevent it.
	 */
	if (next_cpu < nr_cpu_ids)
		per_cpu(watchdog_nmi_touch, next_cpu) = true;
	/*
	 * Pairs with the smp_rmb() in watchdog_check_hardlockup_other_cpu():
	 * publish the touch flag before this cpu leaves watchdog_cpus.
	 */
	smp_wmb();
	cpumask_clear_cpu(cpu, &watchdog_cpus);
}
static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;
	/* join the ring of cpus watched by their neighbours */
	aml_watchdog_nmi_enable(cpu);
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);
}
static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
	hrtimer_cancel(hrtimer);
	/* leave the ring of watched cpus */
	aml_watchdog_nmi_disable(cpu);
}
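/*
 * The smpboot thread itself never runs (thread_should_run always says no);
 * it exists only so the setup/park/unpark/cleanup hooks start and stop the
 * watchdog hrtimer on each cpu across hotplug.
 */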
static int hld_should_run(unsigned int cpu)
{
	return 0;
}
static void hld_run(unsigned int cpu)
{
}
static void hld_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}
static void hld_setup(unsigned int cpu)
{
	watchdog_enable(cpu);
}
static void hld_park(unsigned int cpu)
{
	watchdog_disable(cpu);
}
static void hld_unpark(unsigned int cpu)
{
	watchdog_enable(cpu);
}
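/* per-cpu handle for the (never scheduled) smpboot thread */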
static DEFINE_PER_CPU(struct task_struct *, aml_hld);
static struct smp_hotplug_thread hld_threads = {
	.store = &aml_hld,
	.thread_should_run = hld_should_run,
	.thread_fn = hld_run,
	.thread_comm = "aml_hld/%u",
	.setup = hld_setup,
	.cleanup = hld_cleanup,
	.park = hld_park,
	.unpark = hld_unpark,
};
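/* Register the per-cpu threads; their hooks arm the watchdog on each cpu */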
int aml_hld_init(void)
{
	int ret;
	set_sample_period();
	ret = smpboot_register_percpu_thread(&hld_threads);
	if (ret)
		pr_err("failed to register hld_threads\n");
	return ret;
}
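/*
 * Example (illustrative only, not part of this file): a caller would
 * typically run aml_hld_init() from its module init path. The init
 * function name below is hypothetical.
 *
 *	static int __init aml_hld_module_init(void)
 *	{
 *		return aml_hld_init();
 *	}
 *	module_init(aml_hld_module_init);
 */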