// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2019 Amlogic, Inc. All rights reserved.
*/
#if defined(CONFIG_ANDROID_VENDOR_HOOKS) && defined(CONFIG_FAIR_GROUP_SCHED)
#include <linux/stacktrace.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/arm-smccc.h>
#include <linux/kprobes.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <sched.h>
#include <trace/hooks/sched.h>
#include <trace/events/meson_atrace.h>
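/*
 * Tunables for the vendor scheduler hooks below. Priority values are kernel
 * priorities (120 == nice 0), time values are in nanoseconds, and group
 * weights are expressed in multiples of NICE_0_LOAD.
 */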
static int sched_big_weight = 10; // * NICE_0_LOAD
module_param(sched_big_weight, int, 0644);
static int sched_interactive_task_util = 150;
module_param(sched_interactive_task_util, int, 0644);
static int sched_task_low_prio = 125;
module_param(sched_task_low_prio, int, 0644);
static int sched_task_high_prio = 110;
module_param(sched_task_high_prio, int, 0644);
static int sched_rt_nice_enable;
module_param(sched_rt_nice_enable, int, 0644);
static int sched_rt_nice_debug;
module_param(sched_rt_nice_debug, int, 0644);
static int sched_rt_nice_prio = 110;
module_param(sched_rt_nice_prio, int, 0644);
static unsigned long sched_rt_nice_gran = 4000000; //4ms
module_param(sched_rt_nice_gran, ulong, 0644);
static int sched_check_preempt_wakeup_enable = 1;
module_param(sched_check_preempt_wakeup_enable, int, 0644);
static int sched_check_preempt_wakeup_debug;
module_param(sched_check_preempt_wakeup_debug, int, 0644);
/* default 3 ms, same as wakeup_granularity_ns on a 4-core SMP system */
static unsigned long sched_check_preempt_wakeup_gran = 3000000;
module_param(sched_check_preempt_wakeup_gran, ulong, 0644);
static int sched_pick_next_task_enable = 1;
module_param(sched_pick_next_task_enable, int, 0644);
static int sched_pick_next_task_debug;
module_param(sched_pick_next_task_debug, int, 0644);
static int sched_pick_next_task_wait_score = 10; /* score 10 ~= 1 ms of waiting */
module_param(sched_pick_next_task_wait_score, int, 0644);
static int sched_pick_next_task_util_score = 80; //load.util_avg <= 200
module_param(sched_pick_next_task_util_score, int, 0644);
static int sched_pick_next_task_ignore_wait_prio = 120;
module_param(sched_pick_next_task_ignore_wait_prio, int, 0644);
static int sched_place_entity_enable = 1;
module_param(sched_place_entity_enable, int, 0644);
static int sched_place_entity_debug;
module_param(sched_place_entity_debug, int, 0644);
static int sched_place_entity_factor = 3;
module_param(sched_place_entity_factor, int, 0644);
static int sched_check_preempt_tick_enable = 1;
module_param(sched_check_preempt_tick_enable, int, 0644);
static int sched_check_preempt_tick_debug;
module_param(sched_check_preempt_tick_debug, int, 0644);
#ifdef CONFIG_SMP
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
bool sync)
{
/*
* If the waker is CFS, then an RT sync wakeup would preempt the waker
* and force it to run for a likely small time after the RT wakee is
* done. So, only honor RT sync wakeups from RT wakers.
*/
return sync && task_has_rt_policy(rq->curr) &&
p->prio <= rq->rt.highest_prio.next &&
rq->rt.rt_nr_running <= 2;
}
#else
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
bool sync)
{
return false;
}
#endif
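/*
 * android_rvh_select_task_rq_rt hook: if the default CPU for a waking RT
 * task is running a non-preemptible or RT task, or a high-priority,
 * low-utilization CFS task in a big-weight group that has only run briefly,
 * steer the RT task away. Preference order: honor a sync wakeup from an RT
 * waker, then an idle CPU, then a CPU running a low-share group task, then
 * the CPU running the lowest-priority task in p->cpus_ptr.
 */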
static void aml_select_rt_nice(void *data, struct task_struct *p,
int prev_cpu, int sd_flag,
int wake_flags, int *new_cpu)
{
int test = 0;
struct rq *rq;
struct task_struct *curr;
int this_cpu;
struct rq *this_cpu_rq;
unsigned long rtime = 0;
int lowest_prio_cpu = -1;
int lowest_prio = -1;
int tmp_cpu;
bool sync = !!(wake_flags & WF_SYNC);
if (!sched_rt_nice_enable)
return;
rcu_read_lock();
rq = cpu_rq(prev_cpu);
/* coverity[overrun-local] prev_cpu is safe */
curr = READ_ONCE(rq->curr);
this_cpu = smp_processor_id();
this_cpu_rq = cpu_rq(this_cpu);
if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
*new_cpu = this_cpu;
goto out_unlock;
}
if (!curr)
goto out_unlock;
if (task_may_not_preempt(curr, prev_cpu) || rt_task(curr)) {
test = 1;
} else if (curr->prio <= sched_rt_nice_prio) {
#ifdef CONFIG_FAIR_GROUP_SCHED
if (curr->se.depth == 1 &&
curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD)
goto out_unlock;
#endif
/* high-priority SCHED_NORMAL interactive task */
if (curr->se.avg.util_avg >= sched_interactive_task_util)
goto out_unlock;
update_rq_clock(rq);
rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
rtime += (rq_clock_task(rq) - curr->se.exec_start);
if (rtime >= sched_rt_nice_gran)
goto out_unlock;
test = 1;
}
if (!test)
goto out_unlock;
for_each_cpu(tmp_cpu, p->cpus_ptr) {
/* coverity[overrun-local] for_each_cpu() is safe */
struct task_struct *task = READ_ONCE(cpu_rq(tmp_cpu)->curr);
if (task && task->pid == 0) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu idle_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
tmp_cpu);
*new_cpu = tmp_cpu;
goto out_unlock;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
if (task && task->se.depth == 1 &&
task->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu low_share_group_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
tmp_cpu);
*new_cpu = tmp_cpu;
goto out_unlock;
}
#endif
if (task && task->prio > lowest_prio) {
lowest_prio = task->prio;
lowest_prio_cpu = tmp_cpu;
}
}
if (lowest_prio_cpu != -1) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu lowest_prio_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
lowest_prio_cpu);
*new_cpu = lowest_prio_cpu;
}
out_unlock:
rcu_read_unlock();
}
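/*
 * android_rvh_check_preempt_wakeup hook: tasks in low-share groups and
 * low-priority tasks never preempt, while a low-share or low-priority
 * current task is preempted immediately. A high-priority, low-utilization
 * current task is protected until it has run for
 * sched_check_preempt_wakeup_gran, and a high-priority, low-utilization
 * waking task preempts right away.
 */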
static void aml_check_preempt_wakeup(void *data, struct rq *rq, struct task_struct *p, bool *preempt, bool *nopreempt,
int wake_flags, struct sched_entity *se, struct sched_entity *pse,
int next_buddy_marked, unsigned int granularity)
{
struct task_struct *curr = rq->curr;
unsigned long delta_exec = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
int cpu = cpu_of(rq);
if (!sched_check_preempt_wakeup_enable)
return;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (p->se.depth == 1 &&
p->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d low-share group:%s share=%lu\n",
cpu, p->sched_task_group->css.cgroup->kn->name,
p->se.parent->my_q->tg->shares);
*nopreempt = 1;
return;
}
if (curr->se.depth == 1 &&
curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d current low-share group:%s share=%lu\n",
cpu, curr->sched_task_group->css.cgroup->kn->name,
curr->se.parent->my_q->tg->shares);
*preempt = 1;
return;
}
#endif
if (p->prio >= sched_task_low_prio) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d low-prio task: prio=%d\n", cpu, p->prio);
*nopreempt = 1;
return;
}
if (curr->prio >= sched_task_low_prio) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d low-prio current task: prio=%d\n", cpu, p->prio);
*preempt = 1;
return;
}
if (curr->prio <= sched_task_high_prio && curr->se.avg.util_avg < sched_interactive_task_util &&
delta_exec <= sched_check_preempt_wakeup_gran) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d current interactive min_gran: delta_exec=%lu\n", cpu, delta_exec);
*nopreempt = 1;
return;
}
if (p->prio <= sched_task_high_prio && p->se.avg.util_avg < sched_interactive_task_util) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d new interactive\n", cpu);
*preempt = 1;
return;
}
}
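/*
 * Local copies of the CFS rbtree helpers from kernel/sched/fair.c and a
 * local prototype for set_next_entity().
 */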
void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se);
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
static struct sched_entity *___pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
return __node_2_se(left);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return __node_2_se(next);
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
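/*
 * Rate how "interactive" @p looks. Only tasks in big-weight groups with
 * prio <= sched_task_high_prio and util_avg below sched_interactive_task_util
 * qualify. The score sums a group-weight score, a priority score, an optional
 * wait-time score (from the wakeup timestamp stashed in
 * android_kabi_reserved1) and a utilization score; 0 means not interactive.
 */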
static int task_interactive_score(struct task_struct *p, unsigned long weight, int ignore_wait)
{
int score, weight_score, prio_score, wait_score, util_score;
unsigned long delta;
wait_score = 0;
if (weight < sched_big_weight * NICE_0_LOAD ||
p->prio > sched_task_high_prio ||
p->se.avg.util_avg >= sched_interactive_task_util)
return 0;
weight_score = (weight / NICE_0_LOAD - 10) * 5; /* shares 10240 -> 0, 20480 -> 50, 30720+ -> 100 (capped) */
if (weight_score > 100)
weight_score = 100;
prio_score = (sched_task_high_prio - p->prio) * 10;
if (!ignore_wait && sched_place_entity_enable) {
delta = rq_clock(rq_of(p->se.cfs_rq)) - p->android_kabi_reserved1;
delta = delta >> 20;
wait_score = delta * 10; //wait 1ms = 10, 10ms = 100, 20ms = 200;
if (wait_score < sched_pick_next_task_wait_score)
return 0;
}
util_score = sched_interactive_task_util - p->se.avg.util_avg;
score = weight_score + prio_score + wait_score + util_score;
if (sched_pick_next_task_debug)
aml_trace_printk("interactive_task: %s/%d score:%d/%d,%d,%d,%d, wait:%llu util=%lu\n",
p->comm, p->pid, score, weight_score, prio_score, wait_score, util_score,
p->android_kabi_reserved1, p->se.avg.util_avg);
return score;
}
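/*
 * Walk every entity on @cfs_rq (expected to hold tasks only) and return the
 * one with the highest interactive score, or NULL if nothing scores above 0.
 */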
static struct sched_entity *__aml_pick_next_task(struct cfs_rq *cfs_rq, unsigned long weight, int *score, int ignore_wait)
{
struct sched_entity *se, *ret;
int max_score = 0;
int tmp_score;
*score = 0;
ret = NULL;
se = ___pick_first_entity(cfs_rq);
while (se) {
if (!entity_is_task(se))
WARN(1, "not support 2+ level cgroups");
tmp_score = task_interactive_score(task_of(se), weight, ignore_wait);
if (tmp_score > max_score) {
ret = se;
max_score = tmp_score;
*score = max_score;
}
se = __pick_next_entity(se);
}
return ret;
}
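/*
 * android_rvh_replace_next_task_fair hook: instead of the default CFS pick,
 * choose the highest-scoring interactive task found in any big-weight task
 * group on this rq. If the previous task is itself a runnable big-group
 * interactive task, keep it. The wait-time requirement is dropped when the
 * current task is RT/DL, in a low-share group, or low priority, so an
 * interactive task can take over without having waited first.
 */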
static void aml_pick_next_task(void *data, struct rq *rq, struct task_struct **p_new, struct sched_entity **se_new,
bool *repick, bool simple, struct task_struct *prev)
{
struct sched_entity *ret, *p;
struct sched_entity *se;
int score, max_score;
struct task_struct *aml_p = NULL;
struct sched_entity *aml_se = NULL;
struct task_struct *curr = rq->curr;
int ignore_wait = 0;
if (!sched_pick_next_task_enable)
return;
ret = NULL;
max_score = 0;
/* if the previously running task is a big-group interactive task, pick it again */
if (!simple && prev->on_rq && prev->se.depth == 1 && prev->se.parent->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD &&
task_interactive_score(prev, prev->se.parent->my_q->tg->shares, 1)) {
if (sched_pick_next_task_debug)
aml_trace_printk("try_again:%s/%d -> %s/%d\n", (*p_new)->comm, (*p_new)->pid, prev->comm, prev->pid);
*p_new = prev;
return;
}
if (task_has_dl_policy(curr) || task_has_rt_policy(curr)) {
ignore_wait = 1;
} else if (fair_policy(curr->policy)) {
if ((curr->se.depth == 1 && curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) ||
curr->prio >= sched_pick_next_task_ignore_wait_prio)
ignore_wait = 1;
}
se = ___pick_first_entity(&rq->cfs);
while (se) {
if (!entity_is_task(se) && se->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD) {
p = __aml_pick_next_task(group_cfs_rq(se), se->my_q->tg->shares, &score, ignore_wait);
if (p && score > max_score) {
ret = p;
max_score = score;
}
}
se = __pick_next_entity(se);
}
if (!ret)
return;
if (simple) {
aml_se = ret;
aml_p = task_of(aml_se);
*p_new = aml_p;
if (sched_pick_next_task_debug)
aml_trace_printk("select_simple: %s/%d\n", aml_p->comm, aml_p->pid);
while (aml_se) {
set_next_entity(cfs_rq_of(aml_se), aml_se);
aml_se = parent_entity(aml_se);
}
*repick = 1;
} else {
aml_se = ret;
aml_p = task_of(aml_se);
if (sched_pick_next_task_debug)
aml_trace_printk("select: %s/%d\n", aml_p->comm, aml_p->pid);
*p_new = aml_p;
*se_new = aml_se;
}
}
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
max_vruntime = vruntime;
return max_vruntime;
}
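/*
 * android_rvh_place_entity hook: on wakeup (not fork) raise se->vruntime to
 * at least min_vruntime - sysctl_sched_latency / sched_place_entity_factor,
 * limiting the sleeper credit, and record the wakeup time in
 * task_struct.android_kabi_reserved1 for task_interactive_score().
 */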
static void aml_place_entity(void *data, struct cfs_rq *cfs_rq, struct sched_entity *se,
int initial, u64 *vruntime)
{
u64 vruntime_new = cfs_rq->min_vruntime;
unsigned long thresh;
if (!sched_place_entity_enable)
return;
if (initial)
return;
if (sched_place_entity_factor) {
thresh = sysctl_sched_latency / sched_place_entity_factor;
vruntime_new -= thresh;
se->vruntime = max_vruntime(se->vruntime, vruntime_new);
if (sched_place_entity_debug && entity_is_task(se))
aml_trace_printk("cpu:%d task:%s/%d(%s) vrutime:%llu(%llu->%llu)\n",
cpu_of(rq_of(cfs_rq)),
task_of(se)->comm, task_of(se)->pid,
task_of(se)->sched_task_group->css.cgroup->kn->name,
se->vruntime, cfs_rq->min_vruntime, vruntime_new);
}
//task_struct.android_kabi_reserved1: last wakeup time
if (entity_is_task(se))
task_of(se)->android_kabi_reserved1 = rq_clock(rq_of(cfs_rq));
}
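/*
 * android_rvh_check_preempt_tick hook: force a resched when the current task
 * is low priority or in a low-share group, protect an interactive current
 * task that has run for less than sched_check_preempt_wakeup_gran, and
 * resched if any big-weight group contains a long-waiting interactive task.
 */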
static void aml_check_preempt_tick(void *data, struct task_struct *p, unsigned long *ideal_runtime,
bool *skip_preempt, unsigned long delta_exec, struct cfs_rq *cfs_rq,
struct sched_entity *curr, unsigned int granularity)
{
struct sched_entity *se, *se_long_wait;
int score;
if (!sched_check_preempt_tick_enable)
return;
if (p->prio >= sched_task_low_prio) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched: low-prio task_prio=%d\n", p->prio);
*ideal_runtime = 0; //resched
return;
}
if (p->se.depth == 1 &&
p->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched: low-share group:%s share=%lu\n",
p->sched_task_group->css.cgroup->kn->name,
p->se.parent->my_q->tg->shares);
*ideal_runtime = 0; //resched
return;
}
if (p->prio <= sched_task_high_prio && p->se.avg.util_avg < sched_interactive_task_util &&
delta_exec <= sched_check_preempt_wakeup_gran) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("ignore: current interactive min_gran: delta_exec=%lu\n", delta_exec);
*skip_preempt = 1;
return;
}
/* resched if any long-waiting big-group task exists */
se = ___pick_first_entity(cfs_rq);
while (se) {
if (!entity_is_task(se) && se->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD) {
se_long_wait = __aml_pick_next_task(group_cfs_rq(se), se->my_q->tg->shares, &score, 0);
if (se_long_wait) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched long_wait task:%s/%d score=%d\n",
task_of(se_long_wait)->comm,
task_of(se_long_wait)->pid,
score);
*ideal_runtime = 0; //resched
return;
}
}
se = __pick_next_entity(se);
}
}
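/*
 * kretprobe return handler: forces cpupri_check_rt() to return 0, so its
 * callers always see a zero result. Planted from aml_sched_init() below.
 */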
static int cpupri_check_rt_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
/* overwrite the cpupri_check_rt() return value with 0 (arm64: regs[0] is x0) */
regs->regs[0] = 0;
return 0;
}
NOKPROBE_SYMBOL(cpupri_check_rt_ret_handler);
static struct kretprobe cpupri_check_rt_kretprobe = {
.handler = cpupri_check_rt_ret_handler,
/* Probe up to 20 instances concurrently. */
.maxactive = 20,
};
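/*
 * Register the restricted vendor hooks above and plant the cpupri_check_rt()
 * kretprobe. Restricted vendor hooks cannot be unregistered, so this is a
 * one-shot init.
 */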
int aml_sched_init(void)
{
int ret;
register_trace_android_rvh_select_task_rq_rt(aml_select_rt_nice, NULL);
register_trace_android_rvh_check_preempt_wakeup(aml_check_preempt_wakeup, NULL);
register_trace_android_rvh_replace_next_task_fair(aml_pick_next_task, NULL);
register_trace_android_rvh_place_entity(aml_place_entity, NULL);
register_trace_android_rvh_check_preempt_tick(aml_check_preempt_tick, NULL);
cpupri_check_rt_kretprobe.kp.symbol_name = "cpupri_check_rt";
ret = register_kretprobe(&cpupri_check_rt_kretprobe);
if (ret < 0)
pr_err("register_kretprobe failed, returned %d\n", ret);
pr_debug("Planted return probe at %s: %px\n",
cpupri_check_rt_kretprobe.kp.symbol_name, cpupri_check_rt_kretprobe.kp.addr);
return 0;
}
#else
int aml_sched_init(void)
{
return 0;
}
#endif