// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2019 Amlogic, Inc. All rights reserved.
*/
#if defined(CONFIG_ANDROID_VENDOR_HOOKS) && defined(CONFIG_FAIR_GROUP_SCHED)
#include <linux/stacktrace.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/arm-smccc.h>
#include <linux/kprobes.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <sched.h>
#include <trace/hooks/sched.h>
#include <trace/events/meson_atrace.h>
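/*
 * Tunables for the vendor scheduler hooks below. Priority values are kernel
 * priorities (120 == nice 0), time values are in nanoseconds, and group
 * weights are expressed in multiples of NICE_0_LOAD.
 */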
static int sched_big_weight = 10; // * NICE_0_LOAD
module_param(sched_big_weight, int, 0644);
static int sched_interactive_task_util = 150;
module_param(sched_interactive_task_util, int, 0644);
static int sched_task_low_prio = 125;
module_param(sched_task_low_prio, int, 0644);
static int sched_task_high_prio = 110;
module_param(sched_task_high_prio, int, 0644);
static int sched_rt_nice_enable;
module_param(sched_rt_nice_enable, int, 0644);
static int sched_rt_nice_debug;
module_param(sched_rt_nice_debug, int, 0644);
static int sched_rt_nice_prio = 110;
module_param(sched_rt_nice_prio, int, 0644);
static unsigned long sched_rt_nice_gran = 4000000; //4ms
module_param(sched_rt_nice_gran, ulong, 0644);
static int sched_check_preempt_wakeup_enable = 1;
module_param(sched_check_preempt_wakeup_enable, int, 0644);
static int sched_check_preempt_wakeup_debug;
module_param(sched_check_preempt_wakeup_debug, int, 0644);
/* default 3 ms, same as wakeup_granularity_ns on a 4-core SMP system */
static unsigned long sched_check_preempt_wakeup_gran = 3000000;
module_param(sched_check_preempt_wakeup_gran, ulong, 0644);
static int sched_pick_next_task_enable = 1;
module_param(sched_pick_next_task_enable, int, 0644);
static int sched_pick_next_task_debug;
module_param(sched_pick_next_task_debug, int, 0644);
static int sched_pick_next_task_wait_score = 10; /* score 10 ~= 1 ms of waiting */
module_param(sched_pick_next_task_wait_score, int, 0644);
static int sched_pick_next_task_util_score = 80; //load.util_avg <= 200
module_param(sched_pick_next_task_util_score, int, 0644);
static int sched_pick_next_task_ignore_wait_prio = 120;
module_param(sched_pick_next_task_ignore_wait_prio, int, 0644);
static int sched_place_entity_enable = 1;
module_param(sched_place_entity_enable, int, 0644);
static int sched_place_entity_debug;
module_param(sched_place_entity_debug, int, 0644);
static int sched_place_entity_factor = 3;
module_param(sched_place_entity_factor, int, 0644);
static int sched_check_preempt_tick_enable = 1;
module_param(sched_check_preempt_tick_enable, int, 0644);
static int sched_check_preempt_tick_debug;
module_param(sched_check_preempt_tick_debug, int, 0644);
#ifdef CONFIG_SMP
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
bool sync)
{
/*
* If the waker is CFS, then an RT sync wakeup would preempt the waker
* and force it to run for a likely small time after the RT wakee is
* done. So, only honor RT sync wakeups from RT wakers.
*/
return sync && task_has_rt_policy(rq->curr) &&
p->prio <= rq->rt.highest_prio.next &&
rq->rt.rt_nr_running <= 2;
}
#else
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
bool sync)
{
return false;
}
#endif
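/*
 * android_rvh_select_task_rq_rt hook: if the default CPU for a waking RT
 * task is running a non-preemptible or RT task, or a high-priority,
 * low-utilization CFS task in a big-weight group that has only run briefly,
 * steer the RT task away. Preference order: honor a sync wakeup from an RT
 * waker, then an idle CPU, then a CPU running a low-share group task, then
 * the CPU running the lowest-priority task in p->cpus_ptr.
 */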
static void aml_select_rt_nice(void *data, struct task_struct *p,
int prev_cpu, int sd_flag,
int wake_flags, int *new_cpu)
{
int test = 0;
struct rq *rq;
struct task_struct *curr;
int this_cpu;
struct rq *this_cpu_rq;
unsigned long rtime = 0;
int lowest_prio_cpu = -1;
int lowest_prio = -1;
int tmp_cpu;
bool sync = !!(wake_flags & WF_SYNC);
if (!sched_rt_nice_enable)
return;
rcu_read_lock();
rq = cpu_rq(prev_cpu);
/* coverity[overrun-local] prev_cpu is safe */
curr = READ_ONCE(rq->curr);
this_cpu = smp_processor_id();
this_cpu_rq = cpu_rq(this_cpu);
if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
*new_cpu = this_cpu;
goto out_unlock;
}
if (!curr)
goto out_unlock;
if (task_may_not_preempt(curr, prev_cpu) || rt_task(curr)) {
test = 1;
} else if (curr->prio <= sched_rt_nice_prio) {
#ifdef CONFIG_FAIR_GROUP_SCHED
if (curr->se.depth == 1 &&
curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD)
goto out_unlock;
#endif
/* high-priority SCHED_NORMAL interactive task */
if (curr->se.avg.util_avg >= sched_interactive_task_util)
goto out_unlock;
update_rq_clock(rq);
rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
rtime += (rq_clock_task(rq) - curr->se.exec_start);
if (rtime >= sched_rt_nice_gran)
goto out_unlock;
test = 1;
}
if (!test)
goto out_unlock;
for_each_cpu(tmp_cpu, p->cpus_ptr) {
/* coverity[overrun-local] for_each_cpu() is safe */
struct task_struct *task = READ_ONCE(cpu_rq(tmp_cpu)->curr);
if (task && task->pid == 0) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu idle_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
tmp_cpu);
*new_cpu = tmp_cpu;
goto out_unlock;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
if (task && task->se.depth == 1 &&
task->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu low_share_group_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
tmp_cpu);
*new_cpu = tmp_cpu;
goto out_unlock;
}
#endif
if (task && task->prio > lowest_prio) {
lowest_prio = task->prio;
lowest_prio_cpu = tmp_cpu;
}
}
if (lowest_prio_cpu != -1) {
if (sched_rt_nice_debug)
aml_trace_printk("wake:%s/%d curr:%s/%d prio=%d util=%lu rtime=%lu lowest_prio_cpu:%d\n",
p->comm, p->pid, curr->comm, curr->pid,
curr->prio, curr->se.avg.util_avg, rtime,
lowest_prio_cpu);
*new_cpu = lowest_prio_cpu;
}
out_unlock:
rcu_read_unlock();
}
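/*
 * android_rvh_check_preempt_wakeup hook: tasks in low-share groups and
 * low-priority tasks never preempt, while a low-share or low-priority
 * current task is preempted immediately. A high-priority, low-utilization
 * current task is protected until it has run for
 * sched_check_preempt_wakeup_gran, and a high-priority, low-utilization
 * waking task preempts right away.
 */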
static void aml_check_preempt_wakeup(void *data, struct rq *rq, struct task_struct *p, bool *preempt, bool *nopreempt,
int wake_flags, struct sched_entity *se, struct sched_entity *pse,
int next_buddy_marked, unsigned int granularity)
{
struct task_struct *curr = rq->curr;
unsigned long delta_exec = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
int cpu = cpu_of(rq);
if (!sched_check_preempt_wakeup_enable)
return;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (p->se.depth == 1 &&
p->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d low-share group:%s share=%lu\n",
cpu, p->sched_task_group->css.cgroup->kn->name,
p->se.parent->my_q->tg->shares);
*nopreempt = 1;
return;
}
if (curr->se.depth == 1 &&
curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d current low-share group:%s share=%lu\n",
cpu, curr->sched_task_group->css.cgroup->kn->name,
curr->se.parent->my_q->tg->shares);
*preempt = 1;
return;
}
#endif
if (p->prio >= sched_task_low_prio) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d low-prio task: prio=%d\n", cpu, p->prio);
*nopreempt = 1;
return;
}
if (curr->prio >= sched_task_low_prio) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d low-prio current task: prio=%d\n", cpu, p->prio);
*preempt = 1;
return;
}
if (curr->prio <= sched_task_high_prio && curr->se.avg.util_avg < sched_interactive_task_util &&
delta_exec <= sched_check_preempt_wakeup_gran) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("ignore:%d current interactive min_gran: delta_exec=%lu\n", cpu, delta_exec);
*nopreempt = 1;
return;
}
if (p->prio <= sched_task_high_prio && p->se.avg.util_avg < sched_interactive_task_util) {
if (sched_check_preempt_wakeup_debug)
aml_trace_printk("resched:%d new interactive\n", cpu);
*preempt = 1;
return;
}
}
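/*
 * Local copies of the CFS rbtree helpers from kernel/sched/fair.c and a
 * local prototype for set_next_entity().
 */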
void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se);
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
static struct sched_entity *___pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
return __node_2_se(left);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return __node_2_se(next);
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
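/*
 * Rate how "interactive" @p looks. Only tasks in big-weight groups with
 * prio <= sched_task_high_prio and util_avg below sched_interactive_task_util
 * qualify. The score sums a group-weight score, a priority score, an optional
 * wait-time score (from the wakeup timestamp stashed in
 * android_kabi_reserved1) and a utilization score; 0 means not interactive.
 */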
static int task_interactive_score(struct task_struct *p, unsigned long weight, int ignore_wait)
{
int score, weight_score, prio_score, wait_score, util_score;
unsigned long delta;
wait_score = 0;
if (weight < sched_big_weight * NICE_0_LOAD ||
p->prio > sched_task_high_prio ||
p->se.avg.util_avg >= sched_interactive_task_util)
return 0;
weight_score = (weight / NICE_0_LOAD - 10) * 5; /* shares 10240 -> 0, 20480 -> 50, 30720+ -> 100 (capped) */
if (weight_score > 100)
weight_score = 100;
prio_score = (sched_task_high_prio - p->prio) * 10;
if (!ignore_wait && sched_place_entity_enable) {
delta = rq_clock(rq_of(p->se.cfs_rq)) - p->android_kabi_reserved1;
delta = delta >> 20;
wait_score = delta * 10; //wait 1ms = 10, 10ms = 100, 20ms = 200;
if (wait_score < sched_pick_next_task_wait_score)
return 0;
}
util_score = sched_interactive_task_util - p->se.avg.util_avg;
score = weight_score + prio_score + wait_score + util_score;
if (sched_pick_next_task_debug)
aml_trace_printk("interactive_task: %s/%d score:%d/%d,%d,%d,%d, wait:%llu util=%lu\n",
p->comm, p->pid, score, weight_score, prio_score, wait_score, util_score,
p->android_kabi_reserved1, p->se.avg.util_avg);
return score;
}
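/*
 * Walk every entity on @cfs_rq (expected to hold tasks only) and return the
 * one with the highest interactive score, or NULL if nothing scores above 0.
 */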
static struct sched_entity *__aml_pick_next_task(struct cfs_rq *cfs_rq, unsigned long weight, int *score, int ignore_wait)
{
struct sched_entity *se, *ret;
int max_score = 0;
int tmp_score;
*score = 0;
ret = NULL;
se = ___pick_first_entity(cfs_rq);
while (se) {
if (!entity_is_task(se))
WARN(1, "not support 2+ level cgroups");
tmp_score = task_interactive_score(task_of(se), weight, ignore_wait);
if (tmp_score > max_score) {
ret = se;
max_score = tmp_score;
*score = max_score;
}
se = __pick_next_entity(se);
}
return ret;
}
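/*
 * android_rvh_replace_next_task_fair hook: instead of the default CFS pick,
 * choose the highest-scoring interactive task found in any big-weight task
 * group on this rq. If the previous task is itself a runnable big-group
 * interactive task, keep it. The wait-time requirement is dropped when the
 * current task is RT/DL, in a low-share group, or low priority, so an
 * interactive task can take over without having waited first.
 */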
static void aml_pick_next_task(void *data, struct rq *rq, struct task_struct **p_new, struct sched_entity **se_new,
bool *repick, bool simple, struct task_struct *prev)
{
struct sched_entity *ret, *p;
struct sched_entity *se;
int score, max_score;
struct task_struct *aml_p = NULL;
struct sched_entity *aml_se = NULL;
struct task_struct *curr = rq->curr;
int ignore_wait = 0;
if (!sched_pick_next_task_enable)
return;
ret = NULL;
max_score = 0;
/* if the previously running task is a big-group interactive task, pick it again */
if (!simple && prev->on_rq && prev->se.depth == 1 && prev->se.parent->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD &&
task_interactive_score(prev, prev->se.parent->my_q->tg->shares, 1)) {
if (sched_pick_next_task_debug)
aml_trace_printk("try_again:%s/%d -> %s/%d\n", (*p_new)->comm, (*p_new)->pid, prev->comm, prev->pid);
*p_new = prev;
return;
}
if (task_has_dl_policy(curr) || task_has_rt_policy(curr)) {
ignore_wait = 1;
} else if (fair_policy(curr->policy)) {
if ((curr->se.depth == 1 && curr->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) ||
curr->prio >= sched_pick_next_task_ignore_wait_prio)
ignore_wait = 1;
}
se = ___pick_first_entity(&rq->cfs);
while (se) {
if (!entity_is_task(se) && se->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD) {
p = __aml_pick_next_task(group_cfs_rq(se), se->my_q->tg->shares, &score, ignore_wait);
if (p && score > max_score) {
ret = p;
max_score = score;
}
}
se = __pick_next_entity(se);
}
if (!ret)
return;
if (simple) {
aml_se = ret;
aml_p = task_of(aml_se);
*p_new = aml_p;
if (sched_pick_next_task_debug)
aml_trace_printk("select_simple: %s/%d\n", aml_p->comm, aml_p->pid);
while (aml_se) {
set_next_entity(cfs_rq_of(aml_se), aml_se);
aml_se = parent_entity(aml_se);
}
*repick = 1;
} else {
aml_se = ret;
aml_p = task_of(aml_se);
if (sched_pick_next_task_debug)
aml_trace_printk("select: %s/%d\n", aml_p->comm, aml_p->pid);
*p_new = aml_p;
*se_new = aml_se;
}
}
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
max_vruntime = vruntime;
return max_vruntime;
}
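/*
 * android_rvh_place_entity hook: on wakeup (not fork) raise se->vruntime to
 * at least min_vruntime - sysctl_sched_latency / sched_place_entity_factor,
 * limiting the sleeper credit, and record the wakeup time in
 * task_struct.android_kabi_reserved1 for task_interactive_score().
 */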
static void aml_place_entity(void *data, struct cfs_rq *cfs_rq, struct sched_entity *se,
int initial, u64 *vruntime)
{
u64 vruntime_new = cfs_rq->min_vruntime;
unsigned long thresh;
if (!sched_place_entity_enable)
return;
if (initial)
return;
if (sched_place_entity_factor) {
thresh = sysctl_sched_latency / sched_place_entity_factor;
vruntime_new -= thresh;
se->vruntime = max_vruntime(se->vruntime, vruntime_new);
if (sched_place_entity_debug && entity_is_task(se))
aml_trace_printk("cpu:%d task:%s/%d(%s) vrutime:%llu(%llu->%llu)\n",
cpu_of(rq_of(cfs_rq)),
task_of(se)->comm, task_of(se)->pid,
task_of(se)->sched_task_group->css.cgroup->kn->name,
se->vruntime, cfs_rq->min_vruntime, vruntime_new);
}
//task_struct.android_kabi_reserved1: last wakeup time
if (entity_is_task(se))
task_of(se)->android_kabi_reserved1 = rq_clock(rq_of(cfs_rq));
}
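/*
 * android_rvh_check_preempt_tick hook: force a resched when the current task
 * is low priority or in a low-share group, protect an interactive current
 * task that has run for less than sched_check_preempt_wakeup_gran, and
 * resched if any big-weight group contains a long-waiting interactive task.
 */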
static void aml_check_preempt_tick(void *data, struct task_struct *p, unsigned long *ideal_runtime,
bool *skip_preempt, unsigned long delta_exec, struct cfs_rq *cfs_rq,
struct sched_entity *curr, unsigned int granularity)
{
struct sched_entity *se, *se_long_wait;
int score;
if (!sched_check_preempt_tick_enable)
return;
if (p->prio >= sched_task_low_prio) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched: low-prio task_prio=%d\n", p->prio);
*ideal_runtime = 0; //resched
return;
}
if (p->se.depth == 1 &&
p->se.parent->my_q->tg->shares < sched_big_weight * NICE_0_LOAD) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched: low-share group:%s share=%lu\n",
p->sched_task_group->css.cgroup->kn->name,
p->se.parent->my_q->tg->shares);
*ideal_runtime = 0; //resched
return;
}
if (p->prio <= sched_task_high_prio && p->se.avg.util_avg < sched_interactive_task_util &&
delta_exec <= sched_check_preempt_wakeup_gran) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("ignore: current interactive min_gran: delta_exec=%lu\n", delta_exec);
*skip_preempt = 1;
return;
}
/* resched if any long-waiting big-group task exists */
se = ___pick_first_entity(cfs_rq);
while (se) {
if (!entity_is_task(se) && se->my_q->tg->shares >= sched_big_weight * NICE_0_LOAD) {
se_long_wait = __aml_pick_next_task(group_cfs_rq(se), se->my_q->tg->shares, &score, 0);
if (se_long_wait) {
if (sched_check_preempt_tick_debug)
aml_trace_printk("resched long_wait task:%s/%d score=%d\n",
task_of(se_long_wait)->comm,
task_of(se_long_wait)->pid,
score);
*ideal_runtime = 0; //resched
return;
}
}
se = __pick_next_entity(se);
}
}
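/*
 * kretprobe return handler: forces cpupri_check_rt() to return 0, so its
 * callers always see a zero result. Planted from aml_sched_init() below.
 */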
static int cpupri_check_rt_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
/* overwrite the cpupri_check_rt() return value with 0 (arm64: regs[0] is x0) */
regs->regs[0] = 0;
return 0;
}
NOKPROBE_SYMBOL(cpupri_check_rt_ret_handler);
static struct kretprobe cpupri_check_rt_kretprobe = {
.handler = cpupri_check_rt_ret_handler,
/* Probe up to 20 instances concurrently. */
.maxactive = 20,
};
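/*
 * Register the restricted vendor hooks above and plant the cpupri_check_rt()
 * kretprobe. Restricted vendor hooks cannot be unregistered, so this is a
 * one-shot init.
 */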
int aml_sched_init(void)
{
int ret;
register_trace_android_rvh_select_task_rq_rt(aml_select_rt_nice, NULL);
register_trace_android_rvh_check_preempt_wakeup(aml_check_preempt_wakeup, NULL);
register_trace_android_rvh_replace_next_task_fair(aml_pick_next_task, NULL);
register_trace_android_rvh_place_entity(aml_place_entity, NULL);
register_trace_android_rvh_check_preempt_tick(aml_check_preempt_tick, NULL);
cpupri_check_rt_kretprobe.kp.symbol_name = "cpupri_check_rt";
ret = register_kretprobe(&cpupri_check_rt_kretprobe);
if (ret < 0)
pr_err("register_kretprobe failed, returned %d\n", ret);
pr_debug("Planted return probe at %s: %px\n",
cpupri_check_rt_kretprobe.kp.symbol_name, cpupri_check_rt_kretprobe.kp.addr);
return 0;
}
#else
int aml_sched_init(void)
{
return 0;
}
#endif