// SPDX-License-Identifier: GPL-2.0
/*
* Detect hard lockups on a system
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
* Note: Most of this code is borrowed heavily from the original softlockup
* detector, so thanks to Ingo for the initial implementation.
* Some chunks also taken from the old x86-specific nmi watchdog code, thanks
* to those contributors as well.
*/
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/perf_event.h>
#include <asm/irq_regs.h>
#include "lockup.h"
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static cpumask_t __read_mostly watchdog_cpus;
static u64 __read_mostly sample_period;
static int hardlockup_thresh = 10; /* stalled samples, one per second */
static int hardlockup_panic = 1;
module_param(hardlockup_panic, int, 0644);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_lock_cnt);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
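/* One sample per second; more than hardlockup_thresh stalled samples trips the detector */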
static void set_sample_period(void)
{
	sample_period = 1 * (u64)NSEC_PER_SEC;
}
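/*
 * Return the cpu after @cpu in watchdog_cpus, wrapping around, or
 * nr_cpu_ids if @cpu is the only cpu in the mask.
 */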
static unsigned int watchdog_next_cpu(unsigned int cpu)
{
	cpumask_t cpus = watchdog_cpus;
	unsigned int next_cpu;
	next_cpu = cpumask_next(cpu, &cpus);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(&cpus);
	if (next_cpu == cpu)
		return nr_cpu_ids;
	return next_cpu;
}
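/*
 * A cpu whose hrtimer_interrupts counter has not moved since the last check
 * is stalled; after more than hardlockup_thresh consecutive stalled samples
 * it is considered hard locked up.
 */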
static int is_hardlockup_other_cpu(unsigned int cpu)
{
	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
	unsigned long lock_cnt = per_cpu(hrtimer_interrupts_lock_cnt, cpu);
	if (hrint == per_cpu(hrtimer_interrupts_saved, cpu)) {
		per_cpu(hrtimer_interrupts_lock_cnt, cpu) = ++lock_cnt;
		if (lock_cnt > hardlockup_thresh)
			return 1;
	} else {
		per_cpu(hrtimer_interrupts_lock_cnt, cpu) = 0;
	}
	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
	return 0;
}
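/* Called from this cpu's hrtimer to check the next cpu in the ring */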
static void watchdog_check_hardlockup_other_cpu(void)
{
	unsigned int next_cpu;
	/* check for a hardlockup on the next cpu */
	next_cpu = watchdog_next_cpu(smp_processor_id());
	if (next_cpu >= nr_cpu_ids)
		return;
	/*
	 * Pairs with the smp_wmb() in aml_watchdog_nmi_enable()/_disable():
	 * if we can see next_cpu in watchdog_cpus, we must also see its
	 * watchdog_nmi_touch flag.
	 */
	smp_rmb();
	if (per_cpu(watchdog_nmi_touch, next_cpu)) {
		per_cpu(watchdog_nmi_touch, next_cpu) = false;
		return;
	}
	if (is_hardlockup_other_cpu(next_cpu)) {
		/* only warn once */
		if (per_cpu(hard_watchdog_warn, next_cpu))
			return;
		pr_lockup_info(next_cpu);
		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %u",
			      next_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
			     next_cpu);
		per_cpu(hard_watchdog_warn, next_cpu) = true;
	} else {
		per_cpu(hard_watchdog_warn, next_cpu) = false;
	}
}
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	/* kick the hardlockup detector */
	__this_cpu_inc(hrtimer_interrupts);
	/* test for hardlockups on the next cpu */
	watchdog_check_hardlockup_other_cpu();
	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
	return HRTIMER_RESTART;
}
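/* Add @cpu to the ring of mutually watching cpus */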
static int aml_watchdog_nmi_enable(unsigned int cpu)
{
	/*
	 * The new cpu will be marked online before the first hrtimer interrupt
	 * runs on it. If another cpu tests for a hardlockup on the new cpu
	 * before it has run its first hrtimer, it will get a false positive.
	 * Touch the watchdog on the new cpu to delay the first check for at
	 * least 3 sampling periods to guarantee one hrtimer has run on the new
	 * cpu.
	 */
	per_cpu(watchdog_nmi_touch, cpu) = true;
	/*
	 * Pairs with the smp_rmb() in watchdog_check_hardlockup_other_cpu():
	 * publish the touch flag before the cpu appears in watchdog_cpus.
	 */
	smp_wmb();
	cpumask_set_cpu(cpu, &watchdog_cpus);
	return 0;
}
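/* Remove @cpu from the ring of mutually watching cpus */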
static void aml_watchdog_nmi_disable(unsigned int cpu)
{
	unsigned int next_cpu = watchdog_next_cpu(cpu);
	/*
	 * Offlining this cpu will cause the cpu before this one to start
	 * checking the one after this one. If this cpu just finished checking
	 * the next cpu and updating hrtimer_interrupts_saved, and then the
	 * previous cpu checks it within one sample period, it will trigger a
	 * false positive. Touch the watchdog on the next cpu to prevent it.
	 */
	if (next_cpu < nr_cpu_ids)
		per_cpu(watchdog_nmi_touch, next_cpu) = true;
	/*
	 * Pairs with the smp_rmb() in watchdog_check_hardlockup_other_cpu():
	 * publish the touch flag before this cpu leaves watchdog_cpus.
	 */
	smp_wmb();
	cpumask_clear_cpu(cpu, &watchdog_cpus);
}
static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;
	/* join the ring of cpus watched by their neighbours */
	aml_watchdog_nmi_enable(cpu);
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);
}
static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
	hrtimer_cancel(hrtimer);
	/* leave the ring of watched cpus */
	aml_watchdog_nmi_disable(cpu);
}
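/*
 * The smpboot thread itself never runs (thread_should_run always says no);
 * it exists only so the setup/park/unpark/cleanup hooks start and stop the
 * watchdog hrtimer on each cpu across hotplug.
 */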
static int hld_should_run(unsigned int cpu)
{
	return 0;
}
static void hld_run(unsigned int cpu)
{
}
static void hld_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}
static void hld_setup(unsigned int cpu)
{
	watchdog_enable(cpu);
}
static void hld_park(unsigned int cpu)
{
	watchdog_disable(cpu);
}
static void hld_unpark(unsigned int cpu)
{
	watchdog_enable(cpu);
}
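/* per-cpu handle for the (never scheduled) smpboot thread */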
static DEFINE_PER_CPU(struct task_struct *, aml_hld);
static struct smp_hotplug_thread hld_threads = {
	.store = &aml_hld,
	.thread_should_run = hld_should_run,
	.thread_fn = hld_run,
	.thread_comm = "aml_hld/%u",
	.setup = hld_setup,
	.cleanup = hld_cleanup,
	.park = hld_park,
	.unpark = hld_unpark,
};
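/* Register the per-cpu threads; their hooks arm the watchdog on each cpu */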
int aml_hld_init(void)
{
	int ret;
	set_sample_period();
	ret = smpboot_register_percpu_thread(&hld_threads);
	if (ret)
		pr_err("failed to register hld_threads\n");
	return ret;
}
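/*
 * Example (illustrative only, not part of this file): a caller would
 * typically run aml_hld_init() from its module init path. The init
 * function name below is hypothetical.
 *
 *	static int __init aml_hld_module_init(void)
 *	{
 *		return aml_hld_init();
 *	}
 *	module_init(aml_hld_module_init);
 */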