author      Kyle De'Vir    2020-10-31 16:42:00 +1000
committer   Kyle De'Vir    2020-10-31 16:42:00 +1000
commit      6f40efa306743e2b073701ddf0d57b0be572431f (patch)
tree        3498dededb2ddba3d58e8f0c6d80edc9195844d6
parent      a1a105f02721459d7ab47a5abd2684937fed2eab (diff)
download    aur-6f40efa306743e2b073701ddf0d57b0be572431f.tar.gz
5.9.2.arch1
-rw-r--r--   .SRCINFO                                                                |   16
-rw-r--r--   0005-undead-glitched-pds.patch (renamed from 0005-glitched-pds.patch)   |   75
-rw-r--r--   0005-v5.9_undead-pds099o.patch (renamed from 0009-prjc_v5.9-r0.patch)   | 5046
-rw-r--r--   PKGBUILD                                                                |   18
-rw-r--r--   config                                                                  |    9
5 files changed, 2573 insertions, 2591 deletions
@@ -1,8 +1,8 @@ pkgbase = linux-pds pkgdesc = Linux - pkgver = 5.9.1.arch1 + pkgver = 5.9.2.arch1 pkgrel = 1 - url = https://git.archlinux.org/linux.git/log/?h=v5.9.1-arch1 + url = https://git.archlinux.org/linux.git/log/?h=v5.9.2-arch1 arch = x86_64 license = GPL2 makedepends = bc @@ -16,21 +16,21 @@ pkgbase = linux-pds makedepends = imagemagick makedepends = git options = !strip - source = git+https://git.archlinux.org/linux?signed#tag=v5.9.1-arch1 + source = git+https://git.archlinux.org/linux?signed#tag=v5.9.2-arch1 source = git+https://github.com/graysky2/kernel_gcc_patch source = config source = sphinx-workaround.patch - source = 0009-prjc_v5.9-r0.patch - source = 0005-glitched-pds.patch + source = 0005-v5.9_undead-pds099o.patch + source = 0005-undead-glitched-pds.patch validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886 validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E validpgpkeys = A2FF3A36AAA56654109064AB19802F8B0D70FC30 sha512sums = SKIP sha512sums = SKIP - sha512sums = 29e6b6b45fec5a93cfdd41d2286c406ed94aaee0148df0e452ace250eeff9287cf87d9a339af34b9beec690db5a3b439a2c7c441313f05f577a4e11b056b1610 + sha512sums = cefb516ae87c748f8fa6c5f227d932938be06e32774305cbea4d29c342359ffcd4eed21b80cb560d0a3e0a016c801a1446034b5aec521808f0e27d5897e155d9 sha512sums = 98e97155f86bbe837d43f27ec1018b5b6fdc6c372d6f7f2a0fe29da117d53979d9f9c262f886850d92002898682781029b80d4ee923633fc068f979e6c8254be - sha512sums = afc135ec7c147ab6dc22e34f1f3373bde30a3a5fb77032832470ededf97a0a1a3e1fd4294bd0a03ef3edc51a10331ba7e37e63d5f6d6d603111600693bac9755 - sha512sums = 889f0a49f326de3f119290256393b09a9e9241c2a297ca0b7967a2884e4e35d71388d2a559e4c206f55f67228b65e8f2013a1ec61f6ff8f1de3b6a725fd5fa57 + sha512sums = e41d0f8a3ace142947fc5497f7377cf5a497ce1764ca96fdc6dc4915b027ac99a15296ad22c4ef99a3a5eb812614b5b280480249747a5c318452543cd85ce620 + sha512sums = 2cf83af1322f0fe5b9751e2b77fa1c890c7c22d9213b1cdfb57ca7f7a89a2cb263c213e178417ae1b7e947b386796b4b71507b127ec698cba661799346b33bbd pkgname = linux-pds pkgdesc = The Linux kernel and modules ~ featuring Alfred Chen's PDS CPU scheduler, rebased by TkG diff --git a/0005-glitched-pds.patch b/0005-undead-glitched-pds.patch index 08c9ef32e880..1e015a41a9db 100644 --- a/0005-glitched-pds.patch +++ b/0005-undead-glitched-pds.patch @@ -88,3 +88,78 @@ index 9270a4370d54..30d01e647417 100644 static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -715,6 +715,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_PDS + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/0009-prjc_v5.9-r0.patch b/0005-v5.9_undead-pds099o.patch index 550d29c8fa37..69c84d7f5f93 100644 --- a/0009-prjc_v5.9-r0.patch +++ b/0005-v5.9_undead-pds099o.patch @@ -1,154 +1,151 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a1068742a6df..b97a9697fde4 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4611,6 +4611,12 @@ - - sbni= [NET] Granch SBNI12 leased line adapter - -+ sched_timeslice= -+ [KNL] Time slice in us for BMQ/PDS scheduler. -+ Format: <int> (must be >= 1000) -+ Default: 4000 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_debug [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index d4b32cc32bb7..14118e5168ef 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1515,3 +1515,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. 
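For context on the hunk above that adds SEQ_printf() to kernel/sched/pds.c: the macro writes through seq_printf() when a seq_file is supplied (the /proc/sched_debug path used by proc_sched_show_task()) and falls back to pr_cont() on the console otherwise. A minimal userspace sketch of that pattern, with the hypothetical seq_printf_demo() and plain stdio standing in for the kernel APIs:

#include <stdarg.h>
#include <stdio.h>

/* Write to the supplied stream when there is one, otherwise to the console. */
static void seq_printf_demo(FILE *m, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(m ? m : stderr, fmt, ap);
	va_end(ap);
}

int main(void)
{
	seq_printf_demo(stdout, "%s (%d, #threads: %d)\n", "bash", 1234, 1);
	seq_printf_demo(NULL, "no seq_file given, printed to the console\n");
	return 0;
}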
-diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +From abe64ed9851070719c21d76f348f638d0803e2f9 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch <ti3nou@gmail.com> +Date: Thu, 29 Oct 2020 21:28:03 +0100 +Subject: PDS 099o, 5.9 rebase + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt new file mode 100644 -index 000000000000..05c84eec0f31 +index 000000000000..709e86f6487e --- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- + +CONTENT +======== + -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. 
-+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. Each cpu has its own ++run queue. A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). 
In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 7101ac64bb20..1072a32fbca2 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1005,6 +1005,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ help ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index aa39ff31ec9f..eb72535ba99a 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index ac361a8b1d3b..cbf7ed716f20 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..f85926764f9a 100644 --- a/fs/proc/base.c @@ -162,21 +159,37 @@ index 617db4e0faa0..f85926764f9a 100644 (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index fed6ba96c527..f03a5ee419a1 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -169,7 +169,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..8918609cb9f0 100644 +index afe01e232935..192c955964d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ @@ -187,57 +200,54 @@ index afe01e232935..8918609cb9f0 100644 /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; -@@ -652,12 +653,18 @@ struct task_struct { +@@ -651,9 +652,13 @@ struct task_struct { + unsigned int flags; unsigned int ptrace; - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; +#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct __call_single_node wake_entry; +#endif -+ +#ifdef CONFIG_SMP #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; +@@ -662,6 +667,7 @@ struct task_struct { unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; -@@ -671,6 +678,7 @@ struct task_struct { + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -670,6 +676,7 @@ struct task_struct { + * used CPU that may be idle. 
*/ int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ #endif int on_rq; - -@@ -679,13 +687,33 @@ struct task_struct { +@@ -679,13 +686,27 @@ struct task_struct { int normal_prio; unsigned int rt_priority; -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+ int bmq_idx; -+ struct list_head bmq_node; -+#endif /* CONFIG_SCHED_BMQ */ +#ifdef CONFIG_SCHED_PDS ++ int time_slice; + u64 deadline; -+ u64 priodl; + /* skip list level */ + int sl_level; + /* skip list node */ + struct skiplist_node sl_node; -+#endif /* CONFIG_SCHED_PDS */ ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; + /* sched_clock time spent running */ + u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ ++#else /* CONFIG_SCHED_PDS */ const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -250,44 +260,56 @@ index afe01e232935..8918609cb9f0 100644 #ifdef CONFIG_UCLAMP_TASK /* -@@ -1332,6 +1360,15 @@ struct task_struct { +@@ -1332,6 +1353,29 @@ struct task_struct { */ }; -+#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); +#define tsk_seruntime(t) ((t)->sched_time) +/* replace the uncertian rt_timeout with 0UL */ +#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) +#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} +#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) +#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ + static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..179d77c8360e 100644 +index 1aff00b65f3c..a5e5fc2c9170 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ +@@ -1,5 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0 */ -+#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_PDS + -+static inline int dl_task(struct task_struct *p) ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) +{ -+ return 0; ++ return 1; +} + -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((p)->priodl) -+#endif -+ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} +#else + +#define __tsk_deadline(p) ((p)->dl.deadline) @@ -295,107 +317,118 @@ index 1aff00b65f3c..179d77c8360e 100644 /* * SCHED_DEADLINE tasks has negative priorities, reflecting * the fact that any of them has higher prio than RT and -@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p) +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) { return dl_prio(p->prio); } -+#endif /* CONFIG_SCHED_ALT */ ++#endif /* CONFIG_SCHED_PDS */ static inline bool dl_time_before(u64 a, u64 b) { diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..42730d27ceb5 100644 +index 7d64feafc408..fba04bb91492 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h -@@ -20,11 +20,20 @@ +@@ -20,7 +20,18 @@ */ #define MAX_USER_RT_PRIO 100 + ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 
1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ 7 -+#endif -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ 0 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 +index e5af028c08b4..a96012e6f15e 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) if (policy == SCHED_FIFO || policy == SCHED_RR) return true; -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS if (policy == SCHED_DEADLINE) return true; +#endif return false; } +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index a98965007eef..c68b76cc01dc 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h new file mode 100644 -index 000000000000..47ca955a451d +index 000000000000..713fedd8034f --- /dev/null +++ b/include/linux/skip_list.h @@ -0,0 +1,177 @@ +/* -+ * Copyright (C) 2016 Alfred Chen. -+ * -+ * Code based on Con Kolivas's skip list implementation for BFS, and -+ * which is based on example originally by William Pugh. -+ * -+ * Skip Lists are a probabilistic alternative to balanced trees, as -+ * described in the June 1990 issue of CACM and were invented by -+ * William Pugh in 1987. -+ * -+ * A couple of comments about this implementation: -+ * -+ * This file only provides a infrastructure of skip list. -+ * -+ * skiplist_node is embedded into container data structure, to get rid -+ * the dependency of kmalloc/kfree operation in scheduler code. -+ * -+ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+ * macro and be used for skip list insert operation. -+ * -+ * Random Level is also not defined in this file, instead, it should be -+ * customized implemented and set to node->level then pass to the customized -+ * skiplist_insert function. -+ * -+ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ * -+ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+ * considering that there will be 256 entries to enable the top level when using -+ * random level p=0.5, and that number is more than enough for a run queue usage -+ * in a scheduler usage. And it also help to reduce the memory usage of the -+ * embedded skip list node in task_struct to about 50%. -+ * -+ * The insertion routine has been implemented so as to use the -+ * dirty hack described in the CACM paper: if a random level is -+ * generated that is more than the current maximum level, the -+ * current maximum level plus one is used instead. 
-+ * -+ * BFS Notes: In this implementation of skiplists, there are bidirectional -+ * next/prev pointers and the insert function returns a pointer to the actual -+ * node the value is stored. The key here is chosen by the scheduler so as to -+ * sort tasks according to the priority list requirements and is no longer used -+ * by the scheduler after insertion. The scheduler lookup, however, occurs in -+ * O(1) time because it is always the first item in the level 0 linked list. -+ * Since the task struct stores a copy of the node pointer upon skiplist_insert, -+ * it can also remove it much faster than the original implementation with the -+ * aid of prev<->next pointer manipulation and no searching. -+ */ ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++*/ +#ifndef _LINUX_SKIP_LIST_H +#define _LINUX_SKIP_LIST_H + @@ -418,7 +451,7 @@ index 000000000000..47ca955a451d + +static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) +{ -+ /* only level 0 ->next matters in skiplist_empty() */ ++ /* only level 0 ->next matters in skiplist_empty()*/ + WRITE_ONCE(node->next[0], node); +} + @@ -526,125 +559,223 @@ index 000000000000..47ca955a451d + return (node->prev[0] == head); +} +#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..f692642cf2da 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,7 +115,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++ ++#define SCHED_ISO 4 ++ + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..2122dba5596f 100644 +index d6a0b31b13dc..d4fcda3add24 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -770,9 +770,39 @@ config GENERIC_SCHED_CLOCK +@@ -92,6 +92,21 @@ config THREAD_INFO_IN_TASK - menu "Scheduler features" + menu "General setup" -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ +config SCHED_PDS -+ bool "PDS CPU scheduler" ++ bool "PDS-mq cpu scheduler" + help + The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. + -+endchoice ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. + -+endif ++ Say Y here. ++ default y + - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -858,6 +888,7 @@ config NUMA_BALANCING ++ + config BROKEN + bool + +@@ -858,6 +873,7 @@ config NUMA_BALANCING depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_ALT ++ depends on !SCHED_PDS help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when -@@ -944,7 +975,7 @@ menuconfig CGROUP_SCHED +@@ -944,7 +960,7 @@ menuconfig CGROUP_SCHED bandwidth allocation to such task groups. It uses cgroups to group tasks. -if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_ALT ++if CGROUP_SCHED && !SCHED_PDS config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED -@@ -1200,6 +1231,7 @@ config CHECKPOINT_RESTORE +@@ -1073,6 +1089,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+@@ -1200,6 +1217,7 @@ config CHECKPOINT_RESTORE config SCHED_AUTOGROUP bool "Automatic process group scheduling" -+ depends on !SCHED_ALT ++ depends on !SCHED_PDS select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..5a23122f3d2c 100644 +index f6889fce64af..519552456bb5 100644 --- a/init/init_task.c +++ b/init/init_task.c -@@ -75,9 +75,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -87,6 +93,19 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+#endif +@@ -67,6 +67,127 @@ struct task_struct init_task + #endif + __aligned(L1_CACHE_BYTES) + = { +#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+ .sl_level = 0, -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -94,6 +113,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = 
LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, ++ &init_task.alloc_lock), +#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ .numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -209,6 +329,7 @@ struct task_struct init_task + #ifdef CONFIG_SECCOMP + .seccomp = { .filter_count = ATOMIC_INIT(0) }, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 642415b8c3c9..7e0e1fe18035 100644 +index 642415b8c3c9..952fe6cf948d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) @@ -652,7 +783,7 @@ index 642415b8c3c9..7e0e1fe18035 100644 } -#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) /* * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping effective cpus_allowed masks? 
@@ -661,7 +792,7 @@ index 642415b8c3c9..7e0e1fe18035 100644 partition_and_rebuild_sched_domains(ndoms, doms, attr); } -#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ static void rebuild_sched_domains_locked(void) { } @@ -701,14 +832,14 @@ index 733e80f334e7..3f3506c851fd 100644 __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..4176ad070bc9 100644 +index f6310f848f34..b5de980c7d4e 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) */ rq = task_rq_lock(task, &flags); -+#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_PDS + if (task_running(task) && task != current) { +#else if (task_running(rq, task) && task != current) { @@ -717,10 +848,10 @@ index f6310f848f34..4176ad070bc9 100644 "%s: %s:%d is running\n", __func__, task->comm, task->pid); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index cfdd5b93264d..84c284eb544a 100644 +index cfdd5b93264d..7577266d1c0c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, +@@ -227,7 +227,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, * Only use with rt_mutex_waiter_{less,equal}() */ #define task_to_waiter(p) \ @@ -729,54 +860,7 @@ index cfdd5b93264d..84c284eb544a 100644 static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static inline int - rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static void -@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, +@@ -678,7 +678,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * the values of the node being removed. 
*/ waiter->prio = task->prio; @@ -785,7 +869,7 @@ index cfdd5b93264d..84c284eb544a 100644 rt_mutex_enqueue(lock, waiter); -@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, +@@ -951,7 +951,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; @@ -795,10 +879,10 @@ index cfdd5b93264d..84c284eb544a 100644 /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..eb6d7d87779f 100644 +index 5fc9c9b70862..1b5bc273ec4b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile -@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +@@ -22,15 +22,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif @@ -807,34 +891,171 @@ index 5fc9c9b70862..eb6d7d87779f 100644 -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o alt_debug.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o +else +obj-y += core.o +obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +endif +obj-y += loadavg.o clock.o cputime.o +obj-y += idle.o +obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o +obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index e39008242cf4..815be262eb90 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
+@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -912,6 +923,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) + cpufreq_governor_init(schedutil_gov); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -942,4 +954,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5a55d2300452..76b956661488 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -614,7 +623,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f324dc36fc43..80d841a6565e 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c new file mode 100644 -index 000000000000..f36264fea75c +index 000000000000..6e3920b03756 --- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,6360 @@ ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6803 @@ +/* -+ * kernel/sched/alt_core.c ++ * kernel/sched/pds.c, was kernel/sched.c + * -+ * Core alternative kernel scheduler code and related syscalls ++ * PDS-mq Core kernel scheduler code and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * @@ -842,9 +1063,12 @@ index 000000000000..f36264fea75c + * a whole lot of those previous things. + * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel + * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. + */ -+#include "sched.h" ++#define CREATE_TRACE_POINTS ++#include <trace/events/sched.h> ++#undef CREATE_TRACE_POINTS ++ ++#include "pds_sched.h" + +#include <linux/sched/rt.h> + @@ -864,7 +1088,6 @@ index 000000000000..f36264fea75c +#include <linux/wait_bit.h> + +#include <linux/kcov.h> -+#include <linux/scs.h> + +#include <asm/switch_to.h> + @@ -875,62 +1098,182 @@ index 000000000000..f36264fea75c +#include "pelt.h" +#include "smp.h" + -+#define CREATE_TRACE_POINTS -+#include <trace/events/sched.h> ++/* ++ * Export tracepoints that act as a bare tracehook (ie: have no trace event ++ * associated with them) to allow external modules to probe them. ++ */ ++EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); + -+#define ALT_SCHED_VERSION "v5.9-r0" + -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) +#define task_has_rt_policy(p) (rt_policy((p)->policy)) + ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ +#define STOP_PRIO (MAX_RT_PRIO - 1) + -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. 
++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; + -+static int __init sched_timeslice(char *str) ++static inline void print_scheduler_version(void) +{ -+ int timeslice_us; ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} + -+ get_option(&str, ×lice_us); -+ if (timeslice_us >= 1000) -+ sched_timeslice_ns = timeslice_us * 1000; ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; + -+ return 0; ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; +} -+early_param("sched_timeslice", sched_timeslice); ++__setup("rr_interval=", rr_interval_set); ++ + -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; + +/** + * sched_yield_type - Choose what sort of yield sched_yield will perform. + * 0: No yield. -+ * 1: Deboost and requeue task. (default) -+ * 2: Set rq skip task. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. + */ +int sched_yield_type __read_mostly = 1; + ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. 
++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ +#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; + -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); + +#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); +DEFINE_STATIC_KEY_FALSE(sched_smt_present); +EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ +#endif + ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ +/* + * Keep a unique ID per domain (we use the first CPUs number in the cpumask of + * the domain), this allows us to quickly tell if two cpus are in the same cache + * domain, see cpus_share_cache(). 
+ */ +DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; +#endif /* CONFIG_SMP */ + +static DEFINE_MUTEX(sched_hotcpu_mutex); @@ -944,73 +1287,6 @@ index 000000000000..f36264fea75c +# define finish_arch_post_lock_switch() do { } while (0) +#endif + -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq_imp.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds_imp.h" -+#endif -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = sched_queue_watermark(rq); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", -+ cpu_of(rq), watermark, last_wm);*/ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ +/* + * Serialization rules: + * @@ -1240,20 +1516,6 @@ index 000000000000..f36264fea75c + } +} + -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ +/* + * RQ-clock updating methods: + */ @@ -1299,6 +1561,7 @@ index 000000000000..f36264fea75c + steal = delta; + + rq->prev_steal_time_rq += steal; ++ + delta -= steal; + } +#endif @@ -1321,85 +1584,82 @@ index 000000000000..f36264fea75c + update_rq_clock_task(rq, delta); +} + -+#ifdef CONFIG_NO_HZ_FULL ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ +/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. 
It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. + */ -+static inline void sched_update_tick_dependency(struct rq *rq) ++static inline u64 task_deadline_diff(const struct task_struct *p) +{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} + -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; +} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif + +/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. + */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) +{ -+ lockdep_assert_held(&rq->lock); ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); + -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); ++ update_task_priodl(p); ++} + -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; + -+ sched_update_tick_dependency(rq); ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); +} + -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) +{ -+ lockdep_assert_held(&rq->lock); ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; + -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif ++ if (node == &rq->sl_header) ++ return rq->idle; + -+ sched_update_tick_dependency(rq); ++ return skiplist_entry(node, struct task_struct, sl_node); ++} + -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. 
-+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); +} + -+static inline void requeue_task(struct task_struct *p, struct rq *rq) ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) +{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; + -+ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; +} + +/* @@ -1436,7 +1696,7 @@ index 000000000000..f36264fea75c + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. ++ * flush_smp_call_function_from_idle() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ @@ -1471,257 +1731,374 @@ index 000000000000..f36264fea75c +#endif +#endif + -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) +{ -+ struct wake_q_node *node = &task->wake_q; ++ struct rq *rq = cpu_rq(cpu); + -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; ++ rcu_read_lock(); + -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); +} ++#endif /* CONFIG_SMT_NICE */ + -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) +{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; +} + -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) +{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; +} + -+void wake_up_q(struct wake_q_head *head) ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) +{ -+ struct wake_q_node *node = head->first; ++ struct task_struct *p = rq_first_queued_task(rq); + -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; ++ if (p->prio != NORMAL_PRIO) ++ return; + -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} + -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); +} ++#endif + -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) ++static inline void update_sched_rq_queued_masks(struct rq *rq) +{ -+ struct task_struct *curr = rq->curr; -+ int cpu; ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif + -+ lockdep_assert_held(&rq->lock); ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; + -+ if (test_tsk_need_resched(curr)) ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) + return; + -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == per_cpu(sched_sibling_cpu, cpu)) + return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); + } + -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif +} + -+void resched_cpu(int cpu) ++static inline void update_sched_rq_pending_masks(struct rq *rq) +{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); + -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); +} + -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); + -+void select_nohz_load_balancer(int stop_tick) {} ++ if (!tick_nohz_full_cpu(cpu)) ++ return; + -+void set_cpu_sd_state_idle(void) {} ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif + +/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. + * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ * Context: rq->lock + */ -+int get_nohz_timer_target(void) ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) +{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } ++ lockdep_assert_held(&rq->lock); + -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; + -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); + -+ return cpu; ++ sched_info_dequeued(rq, p); +} + +/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
+ */ -+static inline void wake_up_idle_cpu(int cpu) ++static inline bool idleprio_suitable(struct task_struct *p) +{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); +} + -+static inline bool wake_up_full_nohz_cpu(int cpu) ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) +{ ++ long unsigned int randseed; ++ + /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. + */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; + -+ return false; ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); +} + -+void wake_up_nohz_cpu(int cpu) ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) +{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); +} + -+static void nohz_csd_func(void *info) ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. 
++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) +{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); + + /* -+ * Release the rq::nohz_csd. ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. + */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); +} + -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; + -+static inline void check_preempt_curr(struct rq *rq) ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. 
++ */ ++void resched_curr(struct rq *rq) +{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); +} + -+static inline void -+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +{ -+ csd->flags = 0; -+ csd->func = func; -+ csd->info = rq; ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); +} + +#ifdef CONFIG_SCHED_HRTICK @@ -1763,7 +2140,7 @@ index 000000000000..f36264fea75c +static inline int hrtick_enabled(struct rq *rq) +{ + /** -+ * Alt schedule FW doesn't support sched_feat yet ++ * PDS doesn't support sched_feat yet + if (!sched_feat(HRTICK)) + return 0; + */ @@ -1840,12 +2217,23 @@ index 000000000000..f36264fea75c +static void hrtick_rq_init(struct rq *rq) +{ +#ifdef CONFIG_SMP -+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + rq->hrtick_timer.function = hrtick; +} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ +#else /* CONFIG_SCHED_HRTICK */ +static inline int hrtick_enabled(struct rq *rq) +{ @@ -1859,14 +2247,27 @@ index 000000000000..f36264fea75c +static inline void hrtick_rq_init(struct rq *rq) +{ +} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 
0:HALF_JIFFY_NS; ++} +#endif /* CONFIG_SCHED_HRTICK */ + +static inline int normal_prio(struct task_struct *p) +{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ + if (task_has_rt_policy(p)) + return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; ++ return policy_to_prio[p->policy]; +} + +/* @@ -1895,9 +2296,11 @@ index 000000000000..f36264fea75c + */ +static void activate_task(struct task_struct *p, struct rq *rq) +{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; + enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); +} + +/* @@ -1907,9 +2310,11 @@ index 000000000000..f36264fea75c + */ +static inline void deactivate_task(struct task_struct *p, struct rq *rq) +{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; + dequeue_task(p, rq, DEQUEUE_SLEEP); + p->on_rq = 0; -+ cpufreq_update_util(rq, 0); ++ cpufreq_update_this_cpu(rq, 0); +} + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1976,7 +2381,7 @@ index 000000000000..f36264fea75c + */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) + return false; + + if (is_per_cpu_kthread(p)) @@ -2009,7 +2414,7 @@ index 000000000000..f36264fea75c +{ + lockdep_assert_held(&rq->lock); + -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(p, rq, 0); + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); @@ -2020,7 +2425,7 @@ index 000000000000..f36264fea75c + BUG_ON(task_cpu(p) != new_cpu); + enqueue_task(p, rq, 0); + p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); ++ check_preempt_curr(rq, p); + + return rq; +} @@ -2066,12 +2471,6 @@ index 000000000000..f36264fea75c + * be on another CPU but it doesn't matter. + */ + local_irq_disable(); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_from_idle(); + + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); @@ -2080,8 +2479,9 @@ index 000000000000..f36264fea75c + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); + @@ -2102,6 +2502,13 @@ index 000000000000..f36264fea75c +} +#endif + ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. 
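
Editor's aside before the next hunk, on the ordering key used by check_preempt_curr() and the skip-list comparisons above: update_task_priodl() earlier in this file packs the 8-bit prio into the top byte of a u64 and the deadline (shifted right by 8) into the remaining 56 bits, so a single unsigned compare ranks tasks first by priority class and then by earliest virtual deadline. The stand-alone sketch below is illustrative only, is not part of the patch, and the prio/deadline numbers in it are made up for the demonstration.

#include <stdint.h>
#include <stdio.h>

/* Same packing as update_task_priodl(): prio in bits 63..56,
 * deadline >> 8 in bits 55..0. A lower value runs earlier. */
static uint64_t pack_priodl(uint8_t prio, uint64_t deadline_ns)
{
	return ((uint64_t)prio << 56) | (deadline_ns >> 8);
}

int main(void)
{
	/* Hypothetical tasks: one of higher priority, and two peers that
	 * differ only in virtual deadline. */
	uint64_t high_prio = pack_priodl(32, 8000000);
	uint64_t peer_soon = pack_priodl(33, 4000000);
	uint64_t peer_late = pack_priodl(33, 9000000);

	/* Priority dominates; within one priority the earlier deadline wins,
	 * which is exactly the p->priodl < curr->priodl test above. */
	printf("%d %d\n", high_prio < peer_soon, peer_soon < peer_late); /* 1 1 */
	return 0;
}

This single-key layout is also why time_slice_expired() only needs to call update_task_priodl() after it pushes the deadline forward: the packed key then re-sorts the task in one comparison.
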
@@ -2243,7 +2650,7 @@ index 000000000000..f36264fea75c +EXPORT_SYMBOL_GPL(kick_process); + +/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * @@ -2283,14 +2690,14 @@ index 000000000000..f36264fea75c + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_active(dest_cpu)) + continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) + return dest_cpu; + } + } + + for (;;) { + /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ for_each_cpu(dest_cpu, &p->cpus_mask) { + if (!is_cpu_allowed(p, dest_cpu)) + continue; + goto out; @@ -2332,163 +2739,119 @@ index 000000000000..f36264fea75c + return dest_cpu; +} + -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) +{ -+ cpumask_t chk_mask, tmp; ++ cpumask_t *mask; + -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; + -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; + -+ return best_mask_cpu(task_cpu(p), &chk_mask); ++ return cpu; +} + -+void sched_set_stop_task(int cpu, struct task_struct *stop) ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) +{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } ++ cpumask_t tmp; + -+ cpu_rq(cpu)->stop = stop; ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif + -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); + } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); +} + -+/* -+ * Change a given task's CPU affinity. 
Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) +{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); ++ cpumask_t tmp; ++ int level; + -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif + -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); + -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); + -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); + } + -+ do_set_cpus_allowed(p, new_mask); ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; + -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; + } + -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} + -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. 
-+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; + -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); + -+ return ret; -+} ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } + -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); +} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ +#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++static inline int select_task_rq(struct task_struct *p) +{ + return 0; +} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ +#endif /* CONFIG_SMP */ + +static void @@ -2505,7 +2868,7 @@ index 000000000000..f36264fea75c + if (cpu == rq->cpu) + __schedstat_inc(rq->ttwu_local); + else { -+ /** Alt schedule FW ToDo: ++ /** PDS ToDo: + * How to do ttwu_wake_remote + */ + } @@ -2520,7 +2883,6 @@ index 000000000000..f36264fea75c +static inline void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ -+ check_preempt_curr(rq); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); +} @@ -2528,8 +2890,10 @@ index 000000000000..f36264fea75c +static inline void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ ++#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; ++#endif + + activate_task(p, rq); + ttwu_do_wakeup(rq, p, 0); @@ -2568,8 +2932,6 @@ index 000000000000..f36264fea75c + + rq = __task_access_lock(p, &lock); + if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } @@ -2578,151 +2940,6 @@ index 000000000000..f36264fea75c + return ret; +} + -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. -+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#else /* !CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ +/* + * Notes on Program-Order guarantees on SMP systems. + * @@ -2855,31 +3072,9 @@ index 000000000000..f36264fea75c + int wake_flags) +{ + unsigned long flags; ++ struct rq *rq; + int cpu, success = 0; + -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. 
-+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be @@ -2889,19 +3084,20 @@ index 000000000000..f36264fea75c + raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); + if (!(p->state & state)) -+ goto unlock; ++ goto out; + + trace_sched_waking(p); + + /* We're going to change ->state: */ + success = 1; ++ cpu = task_cpu(p); + + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * -+ * sched_ttwu_pending() try_to_wake_up() ++ * flush_smp_call_function_from_idle() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * @@ -2915,17 +3111,10 @@ index 000000000000..f36264fea75c + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + */ + smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } ++ if (p->on_rq && ttwu_runnable(p, wake_flags)) ++ goto stat; + +#ifdef CONFIG_SMP + /* @@ -2946,43 +3135,8 @@ index 000000000000..f36264fea75c + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. -+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. 
+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; ++ smp_rmb(); + + /* + * If the owning (remote) CPU is still in the middle of schedule() with @@ -2995,26 +3149,47 @@ index 000000000000..f36264fea75c + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + -+ sched_task_ttwu(p); ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } + -+ cpu = select_task_rq(p, this_rq()); ++ cpu = select_task_rq(p); + + if (cpu != task_cpu(p)) { + wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); + set_task_cpu(p, cpu); + } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif + -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); +out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return success; +} @@ -3049,7 +3224,7 @@ index 000000000000..f36264fea75c + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); -+ __task_rq_unlock(rq, &rf); ++ rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: @@ -3090,38 +3265,30 @@ index 000000000000..f36264fea75c +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. -+ * -+ * __sched_fork() is basic setup used by init_idle() too: + */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) +{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); + +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} + -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ __sched_fork(clone_flags, p); + /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external @@ -3155,40 +3322,38 @@ index 000000000000..f36264fea75c + } + + /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ ++ raw_spin_lock_irqsave(&rq->lock, flags); + rq->curr->time_slice /= 2; + p->time_slice = rq->curr->time_slice; +#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); +#endif + -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); + resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); + -+ rseq_migrate(p); ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). + */ -+ __set_task_cpu(p, cpu_of(rq)); ++ __set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#ifdef CONFIG_SCHED_INFO @@ -3197,6 +3362,7 @@ index 000000000000..f36264fea75c +#endif + init_task_preempt_count(p); + ++ put_cpu(); + return 0; +} + @@ -3295,12 +3461,11 @@ index 000000000000..f36264fea75c + + p->state = TASK_RUNNING; + -+ rq = cpu_rq(select_task_rq(p, this_rq())); ++ rq = cpu_rq(select_task_rq(p)); +#ifdef CONFIG_SMP -+ rseq_migrate(p); + /* + * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path ++ * - cpus_mask can change in the fork path + * - any previously selected CPU might disappear through hotplug + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. @@ -3313,7 +3478,7 @@ index 000000000000..f36264fea75c + update_rq_clock(rq); + activate_task(p, rq); + trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); ++ check_preempt_curr(rq, p); + + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3771,57 +3936,21 @@ index 000000000000..f36264fea75c + return sum; +} + -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+ struct task_struct *p = current; -+ unsigned long flags; -+ int dest_cpu; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = this_rq(); -+ -+ if (rq != task_rq(p) || rq->nr_running < 2) -+ goto unlock; -+ -+ dest_cpu = select_task_rq(p, task_rq(p)); -+ if (dest_cpu == smp_processor_id()) -+ goto unlock; -+ -+ if (likely(cpu_active(dest_cpu))) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); -+ return; -+ } -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#endif -+ +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + -+static inline void update_curr(struct rq *rq, struct task_struct *p) ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) +{ + s64 ns = rq->clock_task - p->last_ran; + + p->sched_time += ns; + account_group_exec_runtime(p, ns); + -+ p->time_slice -= ns; ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); + p->last_ran = rq->clock_task; +} + @@ -3861,7 +3990,7 @@ index 000000000000..f36264fea75c + */ + if (p == rq->curr && task_on_rq_queued(p)) { + update_rq_clock(rq); -+ update_curr(rq, p); ++ pds_update_curr(rq, p); + } + ns = tsk_seruntime(p); + task_access_unlock_irqrestore(p, lock, &flags); @@ -3870,57 +3999,41 @@ index 000000000000..f36264fea75c +} + +/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) ++static inline void pds_scheduler_task_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (is_idle_task(p)) + return; + -+ update_curr(rq, p); ++ pds_update_curr(rq, p); ++ + cpufreq_update_util(rq, 0); + + /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. + */ -+ if (p->time_slice >= RESCHED_NS) ++ if (p->time_slice - rq->dither >= RESCHED_US) + return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); + -+ perf_event_task_tick(); ++ /** ++ * p->time_slice < RESCHED_US. 
We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); +} + ++#ifdef CONFIG_SMP ++ +#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) ++static int active_load_balance_cpu_stop(void *data) +{ + struct rq *rq = this_rq(); + struct task_struct *p = data; -+ cpumask_t tmp; ++ int cpu; + unsigned long flags; + + local_irq_save(flags); @@ -3929,14 +4042,12 @@ index 000000000000..f36264fea75c + raw_spin_lock(&rq->lock); + + rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ ++ /* ++ * _something_ may have changed the task, double check again ++ */ + if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(cpu, &tmp, -+ per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); + + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); @@ -3946,69 +4057,94 @@ index 000000000000..f36264fea75c + return 0; +} + -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) +{ -+ struct rq *rq= cpu_rq(cpu); ++ struct rq *rq = cpu_rq(cpu); + unsigned long flags; + struct task_struct *curr; -+ int res; + + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; ++ return; + curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; + -+ if (res) -+ rq->active_balance = 1; ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } + -+ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); + -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* -+ * sg_balance_check - slibing group balance check for run queue @rq ++ * pds_sg_balance_check - slibing group balance check for run queue @rq + */ -+static inline void sg_balance_check(struct rq *rq) ++static inline void pds_sg_balance_check(const struct rq *rq) +{ + cpumask_t chk; -+ int cpu; ++ int i; + -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) + return; + -+ cpu = cpu_of(rq); -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk)) { -+ if (sg_balance_trigger(i)) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } ++ /* Only cpu in slibing idle group will do the checking 
*/ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); + } +} ++ +#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ + -+#ifdef CONFIG_NO_HZ_FULL ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} + ++#ifdef CONFIG_NO_HZ_FULL +struct tick_work { + int cpu; + atomic_t state; @@ -4079,9 +4215,10 @@ index 000000000000..f36264fea75c + delta = rq_clock_task(rq) - curr->last_ran; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } -+ scheduler_task_tick(rq); -+ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); + calc_load_nohz_remote(rq); ++ +out_unlock: + raw_spin_unlock_irqrestore(&rq->lock, flags); + @@ -4221,6 +4358,172 @@ index 000000000000..f36264fea75c +static inline void preempt_latency_stop(int val) { } +#endif + ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. 
++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) ++ cpufreq_update_this_cpu(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} 
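
The balance path added above (migrate_pending_tasks() together with take_queued_task_cpumask() and take_other_rq_task()) walks the level-0 links of the source run queue's skiplist in priority order, skips the task that is currently running, stops at the first task whose priority does not pass the filter, and steals at most min((nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION) tasks that are allowed on the destination CPU. The following is a minimal standalone sketch of that walk, not part of the patch: a plain singly linked list and integer bitmasks stand in for the kernel skiplist and cpumask types, and every name here (toy_task, toy_migrate_pending, ...) is invented for the illustration.

/*
 * Illustrative only - mirrors the shape of migrate_pending_tasks()
 * above with simplified, made-up types.
 */
#include <stdio.h>

struct toy_task {
	int prio;               /* lower value = higher priority           */
	int running;            /* 1 if currently on a CPU: never migrated */
	unsigned cpus_mask;     /* bit n set = allowed to run on CPU n     */
	struct toy_task *next;  /* level-0 "skiplist" link, prio ordered   */
};

#define TOY_RQ_NR_MIGRATION 32

static int toy_migrate_pending(struct toy_task *head, int nr_running,
			       int dest_cpu, int filter_prio)
{
	int nr_migrated = 0;
	int nr_tries = (nr_running + 1) / 2;

	if (nr_tries > TOY_RQ_NR_MIGRATION)
		nr_tries = TOY_RQ_NR_MIGRATION;

	for (struct toy_task *p = head; p && nr_tries; p = p->next, nr_tries--) {
		if (p->running)
			continue;               /* cannot steal the running task  */
		if (p->prio >= filter_prio)
			break;                  /* list is prio ordered: give up  */
		if (p->cpus_mask & (1u << dest_cpu))
			nr_migrated++;          /* real code re-queues on dest rq */
	}
	return nr_migrated;
}

int main(void)
{
	struct toy_task c = { .prio = 20, .cpus_mask = 0x1, .next = NULL };
	struct toy_task b = { .prio = 10, .cpus_mask = 0x3, .next = &c };
	struct toy_task a = { .prio =  5, .running = 1, .cpus_mask = 0x3, .next = &b };

	/* steal for CPU 1; only tasks better than prio 15 qualify */
	printf("migrated %d task(s)\n", toy_migrate_pending(&a, 3, 1, 15));
	return 0;
}

Run against the three-task list above this prints "migrated 1 task(s)": the running task is skipped, the prio-10 task qualifies, and the walk stops at the prio-20 task because it fails the filter.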
++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT @@ -4268,9 +4571,6 @@ index 000000000000..f36264fea75c +#ifdef CONFIG_SCHED_STACK_END_CHECK + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); +#endif + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -4293,166 +4593,16 @@ index 000000000000..f36264fea75c + schedstat_inc(this_rq()->sched_count); +} + -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[IDLE_WM].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0, ); -+ set_task_cpu(p, dest_cpu); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+#ifdef CONFIG_SMP -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+#endif -+ rq->nr_running += nr_migrated; -+#ifdef CONFIG_SMP -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+#endif -+ update_sched_rq_watermark(rq); -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; 
-+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. -+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) +{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } ++ p->last_ran = rq->clock_task; + -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } +#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); +#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); +} + +/* @@ -4484,7 +4634,7 @@ index 000000000000..f36264fea75c + * - in IRQ context, return from interrupt-handler to + * preemptible context + * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call @@ -4498,7 +4648,6 @@ index 000000000000..f36264fea75c +{ + struct task_struct *prev, *next; + unsigned long *switch_count; -+ unsigned long prev_state; + struct rq *rq; + int cpu; + @@ -4508,7 +4657,7 @@ index 000000000000..f36264fea75c + + schedule_debug(prev, preempt); + -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ + hrtick_clear(rq); + + local_irq_disable(); @@ -4517,16 +4666,9 @@ index 000000000000..f36264fea75c + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) ++ * done by the caller to avoid the race with signal_wake_up(). + * -+ * Also, the membarrier system call requires a full memory barrier ++ * The membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. 
+ */ + raw_spin_lock(&rq->lock); @@ -4535,38 +4677,10 @@ index 000000000000..f36264fea75c + update_rq_clock(rq); + + switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state && prev_state == prev->state) { -+ if (signal_pending_state(prev_state, prev)) { ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { + prev->state = TASK_RUNNING; + } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ sched_task_deactivate(prev, rq); + deactivate_task(prev, rq); + + if (prev->in_iowait) { @@ -4577,18 +4691,18 @@ index 000000000000..f36264fea75c + switch_count = &prev->nvcsw; + } + -+ check_curr(prev, rq); ++ check_deadline(prev, rq); + + next = choose_next_task(rq, cpu, prev); + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + ++ set_rq_task(rq, next); + -+ if (likely(prev != next)) { -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); + -+ rq->nr_switches++; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). @@ -4609,6 +4723,7 @@ index 000000000000..f36264fea75c + * is a RELEASE barrier), + */ + ++*switch_count; ++ rq->nr_switches++; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + @@ -4616,12 +4731,11 @@ index 000000000000..f36264fea75c + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next); -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+ +#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); ++ pds_sg_balance_check(rq); +#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); +} + +void __noreturn do_task_dead(void) @@ -4631,8 +4745,8 @@ index 000000000000..f36264fea75c + + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; -+ + __schedule(false); ++ + BUG(); + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -4642,7 +4756,8 @@ index 000000000000..f36264fea75c + +static inline void sched_submit_work(struct task_struct *tsk) +{ -+ if (!tsk->state) ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) + return; + + /* @@ -4650,8 +4765,7 @@ index 000000000000..f36264fea75c + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. ++ * in the possible wakeup of a kworker. 
+ */ + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + preempt_disable(); @@ -4662,9 +4776,6 @@ index 000000000000..f36264fea75c + preempt_enable_no_resched(); + } + -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. @@ -4693,7 +4804,7 @@ index 000000000000..f36264fea75c + __schedule(false); + sched_preempt_enable_no_resched(); + } while (need_resched()); -+ sched_update_worker(tsk); ++ sched_update_worker(tsk); +} +EXPORT_SYMBOL(schedule); + @@ -4884,17 +4995,25 @@ index 000000000000..f36264fea75c +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, + void *key) +{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) +{ -+ /* Trigger resched if task sched_prio has been modified. */ -+ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ + requeue_task(p, rq); -+ check_preempt_curr(rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); + } +} + @@ -4983,6 +5102,7 @@ index 000000000000..f36264fea75c + update_task_priodl(p); + + check_task_changed(rq, p); ++ +out_unlock: + __task_access_unlock(p, lock); +} @@ -4995,12 +5115,14 @@ index 000000000000..f36264fea75c + +void set_user_nice(struct task_struct *p, long nice) +{ ++ int new_static; + unsigned long flags; + struct rq *rq; + raw_spinlock_t *lock; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; ++ new_static = NICE_TO_PRIO(nice); + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. @@ -5008,7 +5130,10 @@ index 000000000000..f36264fea75c + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_access_lock(p, &lock); + -+ p->static_prio = NICE_TO_PRIO(nice); ++ /* rq lock may not held!! */ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected @@ -5018,6 +5143,8 @@ index 000000000000..f36264fea75c + if (task_has_rt_policy(p)) + goto out_unlock; + ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); + p->prio = effective_prio(p); + update_task_priodl(p); + @@ -5079,6 +5206,33 @@ index 000000000000..f36264fea75c +#endif + +/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** + * idle_cpu - is a given CPU idle currently? + * @cpu: the processor in question. + * @@ -5086,20 +5240,7 @@ index 000000000000..f36264fea75c + */ +int idle_cpu(int cpu) +{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** @@ -5124,6 +5265,154 @@ index 000000000000..f36264fea75c + return pid ? find_task_by_vpid(pid) : current; +} + ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? 
If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. @@ -5133,6 +5422,7 @@ index 000000000000..f36264fea75c +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ ++ int old_policy = p->policy; + int policy = attr->sched_policy; + + if (policy == SETPARAM_POLICY) @@ -5154,6 +5444,10 @@ index 000000000000..f36264fea75c + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; +} + +/* Actually do priority change: must hold rq lock. 
*/ @@ -5188,9 +5482,9 @@ index 000000000000..f36264fea75c + return match; +} + -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) +{ + const struct sched_attr dl_squash_attr = { + .size = sizeof(struct sched_attr), @@ -5210,7 +5504,7 @@ index 000000000000..f36264fea75c + BUG_ON(pi && in_interrupt()); + + /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO + */ + if (unlikely(SCHED_DEADLINE == policy)) { + attr = &dl_squash_attr; @@ -5756,9 +6050,7 @@ index 000000000000..f36264fea75c + goto out_unlock; + + kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) ++ if (rt_task(p)) + kattr.sched_priority = p->rt_priority; + else + kattr.sched_nice = task_nice(p); @@ -5779,7 +6071,7 @@ index 000000000000..f36264fea75c + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ -+ cpumask_var_t cpus_allowed, new_mask; ++ cpumask_var_t cpus_mask, new_mask; + struct task_struct *p; + int retval; + @@ -5801,7 +6093,7 @@ index 000000000000..f36264fea75c + retval = -EINVAL; + goto out_put_task; + } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } @@ -5823,27 +6115,27 @@ index 000000000000..f36264fea75c + if (retval) + goto out_unlock; + -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); +again: + retval = __set_cpus_allowed_ptr(p, new_mask, true); + + if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { + /* + * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask + */ -+ cpumask_copy(new_mask, cpus_allowed); ++ cpumask_copy(new_mask, cpus_mask); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); ++ free_cpumask_var(cpus_mask); +out_put_task: + put_task_struct(p); + put_online_cpus(); @@ -5969,15 +6261,11 @@ index 000000000000..f36264fea75c + + rq = this_rq_lock_irq(&rf); + -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); + } ++ schedstat_inc(rq->yld_count); + + /* + * Since we are going to call schedule() anyway, there's @@ -6076,7 +6364,7 @@ index 000000000000..f36264fea75c + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * -+ * In Alt schedule FW, yield_to is not supported. ++ * In PDS, yield_to is not supported. + * + * Return: + * true (>0) if we indeed boosted the target task. 
@@ -6125,7 +6413,7 @@ index 000000000000..f36264fea75c +} +EXPORT_SYMBOL(io_schedule_timeout); + -+void __sched io_schedule(void) ++void io_schedule(void) +{ + int token; + @@ -6154,6 +6442,7 @@ index 000000000000..f36264fea75c + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: + case SCHED_IDLE: + ret = 0; + break; @@ -6180,6 +6469,7 @@ index 000000000000..f36264fea75c + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: + case SCHED_IDLE: + ret = 0; + break; @@ -6192,8 +6482,6 @@ index 000000000000..f36264fea75c + struct task_struct *p; + int retval; + -+ alt_sched_debug(); -+ + if (pid < 0) + return -EINVAL; + @@ -6208,7 +6496,7 @@ index 000000000000..f36264fea75c + goto out_unlock; + rcu_read_unlock(); + -+ *t = ns_to_timespec64(sched_timeslice_ns); ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); + return 0; + +out_unlock: @@ -6322,7 +6610,7 @@ index 000000000000..f36264fea75c + } + +#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this ++ /* PDS TODO: should support this + if (!state_filter) + sysrq_sched_debug_show(); + */ @@ -6344,7 +6632,7 @@ index 000000000000..f36264fea75c +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question -+ * @cpu: CPU the idle task belongs to ++ * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. @@ -6354,8 +6642,6 @@ index 000000000000..f36264fea75c + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + -+ __sched_fork(0, idle); -+ + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); + update_rq_clock(rq); @@ -6363,9 +6649,11 @@ index 000000000000..f36264fea75c + idle->last_ran = rq->clock_task; + idle->state = TASK_RUNNING; + idle->flags |= PF_IDLE; -+ sched_queue_init_idle(rq, idle); ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); + -+ scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + +#ifdef CONFIG_SMP @@ -6400,6 +6688,104 @@ index 000000000000..f36264fea75c +#endif +} + ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. 
++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ +#ifdef CONFIG_SMP + +int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, @@ -6428,7 +6814,79 @@ index 000000000000..f36264fea75c + return ret; +} + -+bool sched_smp_initialized __read_mostly; ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. 
wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(rq->idle)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_HOTPLUG_CPU +/* @@ -6461,6 +6919,7 @@ index 000000000000..f36264fea75c +{ + struct rq *rq = dead_rq; + struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; + int count = 0; + + /* @@ -6474,18 +6933,18 @@ index 000000000000..f36264fea75c + */ + rq->stop = NULL; + -+ p = sched_rq_first_task(rq); -+ while (p != rq->idle) { ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { + int dest_cpu; + ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ + /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = sched_rq_next_task(p, rq); ++ if (task_running(p)) + continue; -+ } + + /* -+ * Rules for changing task_struct::cpus_allowed are holding ++ * Rules for changing task_struct::cpus_mask are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * @@ -6504,13 +6963,13 @@ index 000000000000..f36264fea75c + */ + if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { + raw_spin_unlock(&p->pi_lock); -+ p = sched_rq_next_task(p, rq); + continue; + } + + count++; + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ + rq = __migrate_task(rq, p, dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); @@ -6518,7 +6977,7 @@ index 000000000000..f36264fea75c + rq = dead_rq; + raw_spin_lock(&rq->lock); + /* Check queued task all over from the header again */ -+ p = sched_rq_first_task(rq); ++ node = &rq->sl_header; + } + + rq->stop = stop; @@ -6537,11 +6996,194 @@ index 000000000000..f36264fea75c + rq->online = true; +} + ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ /*llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0, &rf);*/ ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#else /* !CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ +/* + * used to mark begin/end of suspend/resume: + */ +static int num_cpus_frozen; + ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. 
++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ +/* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper @@ -6635,11 +7277,8 @@ index 000000000000..f36264fea75c + /* + * When going down, decrement the number of cores with SMT present. + */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } +#endif + + if (!sched_smp_initialized) @@ -6673,9 +7312,7 @@ index 000000000000..f36264fea75c + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + -+ /* Handle pending wakeups and then migrate everything off */ + sched_tick_stop(cpu); -+ + raw_spin_lock_irqsave(&rq->lock, flags); + set_rq_offline(rq); + migrate_tasks(rq); @@ -6694,53 +7331,68 @@ index 000000000000..f36264fea75c + + for_each_possible_cpu(cpu) { + for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); + cpumask_copy(tmp, cpu_possible_mask); + cpumask_clear_cpu(cpu, tmp); + } -+ per_cpu(sched_cpu_llc_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); + } +} + -+#define TOPOLOGY_CPUMASK(name, mask, last) \ -+ if (cpumask_and(chk, chk, mask)) \ -+ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ -+ cpu, (chk++)->bits[0]); \ -+ if (!last) \ -+ cpumask_complement(chk, mask) -+ +static void sched_init_topology_cpumask(void) +{ + int cpu; + cpumask_t *chk; + + for_each_online_cpu(cpu) { -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); + -+ cpumask_complement(chk, cpumask_of(cpu)); +#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } +#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = chk; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); + -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = 
++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; + -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); + -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; + } +} +#endif @@ -6751,6 +7403,8 @@ index 000000000000..f36264fea75c + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + BUG(); + ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ + sched_init_topology_cpumask(); + + sched_smp_initialized = true; @@ -6758,7 +7412,6 @@ index 000000000000..f36264fea75c +#else +void __init sched_init_smp(void) +{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; +} +#endif /* CONFIG_SMP */ + @@ -6798,13 +7451,20 @@ index 000000000000..f36264fea75c + int i; + struct rq *rq; + -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); ++ print_scheduler_version(); + + wait_bit_init(); + +#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); +#endif + +#ifdef CONFIG_CGROUP_SCHED @@ -6816,12 +7476,9 @@ index 000000000000..f36264fea75c +#endif /* CONFIG_CGROUP_SCHED */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); -+ -+ sched_queue_init(rq); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); + raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; + rq->nr_running = rq->nr_uninterruptible = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; @@ -6829,23 +7486,22 @@ index 000000000000..f36264fea75c + rq->online = false; + rq->cpu = i; + ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; +#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; + rq->active_balance = 0; +#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); +#endif -+#endif /* CONFIG_SMP */ + rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); + } +#ifdef CONFIG_SMP + /* Set rq->online for cpu 0 */ + cpu_rq(0)->online = true; +#endif ++ + /* + * The boot idle thread does lazy MMU switching as well: + */ @@ -7056,6 +7712,15 @@ index 000000000000..f36264fea75c + +#endif + ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ 
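
sched_init_topology_cpumask() above fills each CPU's affinity check masks so that successive levels cover ever more distant CPUs - SMT siblings first, then the rest of the shared-LLC core group, then the rest of the package, and finally all other online CPUs - with each level excluding whatever the closer levels already covered; callers then walk a range of these masks (for example from sched_cpu_llc_start_mask up to sched_cpu_affinity_chk_end_masks in take_other_rq_task()) to find the nearest suitable CPU. The sketch below is not part of the patch; it uses invented names and plain 32-bit masks in place of cpumask_t to show the same construction and a lookup over it.

/*
 * Illustrative only - a toy version of the "closest first" level masks
 * built per CPU by sched_init_topology_cpumask() above.
 */
#include <stdio.h>

#define NR_LEVELS 4

/* Build the levels for one cpu: smt -> coregroup -> core -> online. */
static int toy_build_levels(unsigned levels[NR_LEVELS], int cpu,
			    unsigned smt, unsigned coregroup,
			    unsigned core, unsigned online)
{
	const unsigned topo[NR_LEVELS] = { smt, coregroup, core, online };
	unsigned covered = 1u << cpu;   /* never check the cpu itself */
	int n = 0;

	for (int i = 0; i < NR_LEVELS; i++) {
		unsigned m = topo[i] & ~covered;
		if (m) {
			levels[n++] = m;    /* only CPUs not seen at a closer level */
			covered |= topo[i];
		}
	}
	return n;                           /* number of non-empty levels */
}

/* Return the closest cpu (lowest level first) present in @candidates. */
static int toy_best_cpu(const unsigned levels[], int n, unsigned candidates)
{
	for (int i = 0; i < n; i++) {
		unsigned hit = levels[i] & candidates;
		if (hit)
			return __builtin_ctz(hit);
	}
	return -1;
}

int main(void)
{
	unsigned levels[NR_LEVELS];
	/* cpu0 on a 4-core/8-thread, single-LLC package: its sibling is cpu4 */
	int n = toy_build_levels(levels, 0, 0x11, 0xff, 0xff, 0xff);

	printf("non-empty levels: %d\n", n);
	printf("closest of {cpu2, cpu4}: cpu%d\n",
	       toy_best_cpu(levels, n, (1u << 2) | (1u << 4)));
	return 0;
}

With the masks above the sibling cpu4 wins over cpu2 even though both are candidates, which is the behaviour the per-level ordering is there to guarantee.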
+#ifdef CONFIG_CGROUP_SCHED +static void sched_free_group(struct task_group *tg) +{ @@ -7162,7 +7827,6 @@ index 000000000000..f36264fea75c + { } /* Terminate */ +}; + -+ +static struct cftype cpu_files[] = { + { } /* terminate */ +}; @@ -7191,51 +7855,14 @@ index 000000000000..f36264fea75c +#endif /* CONFIG_CGROUP_SCHED */ + +#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h new file mode 100644 -index 000000000000..1212a031700e +index 000000000000..0a2e8b145ae1 --- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) \ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..99be2c51c88d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,555 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,581 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H + +#include <linux/sched.h> + @@ -7282,12 +7909,7 @@ index 000000000000..99be2c51c88d + +#include "cpupri.h" + -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif ++#include <trace/events/sched.h> + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 @@ -7309,7 +7931,13 @@ index 000000000000..99be2c51c88d +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */ ++ ++/* ++ * rq::clock_update_flags bits ++ */ ++#define RQCF_REQ_SKIP 0x01 ++#define RQCF_ACT_SKIP 0x02 ++#define RQCF_UPDATED 0x04 + +/* + * This is the main, per-CPU runqueue data structure. 
@@ -7320,16 +7948,10 @@ index 000000000000..99be2c51c88d + raw_spinlock_t lock; + + struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; ++ struct task_struct *idle, *stop; + struct mm_struct *prev_mm; + -+#ifdef CONFIG_SCHED_BMQ -+ struct bmq queue; -+#endif -+#ifdef CONFIG_SCHED_PDS + struct skiplist_node sl_header; -+#endif -+ unsigned long watermark; + + /* switch count */ + u64 nr_switches; @@ -7343,14 +7965,18 @@ index 000000000000..99be2c51c88d +#ifdef CONFIG_SMP + int cpu; /* cpu of this runqueue */ + bool online; -+ + unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; ++ unsigned int clock_update_flags; + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; + +#ifdef CONFIG_SCHED_SMT + int active_balance; @@ -7372,8 +7998,8 @@ index 000000000000..99be2c51c88d + long calc_load_active; + + u64 clock, last_tick; -+ u64 last_ts_switch; + u64 clock_task; ++ int dither; + + unsigned long nr_running; + unsigned long nr_uninterruptible; @@ -7404,34 +8030,37 @@ index 000000000000..99be2c51c88d + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif /* CONFIG_SCHEDSTATS */ -+ +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ +}; + ++#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ ++ (task->flags & PF_FROZEN) == 0 && \ ++ (task->state & TASK_NOLOAD) == 0) ++ +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) + -+#ifdef CONFIG_SMP +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); @@ -7444,40 +8073,7 @@ index 000000000000..99be2c51c88d +} +#endif + -+extern bool sched_smp_initialized; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, -+ const cpumask_t *mask) -+{ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? 
cpu : -+ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); -+} -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_from_idle(void) { } -+#endif ++#endif /* CONFIG_SMP */ + +#ifndef arch_scale_freq_tick +static __always_inline @@ -7490,7 +8086,7 @@ index 000000000000..99be2c51c88d +static __always_inline +unsigned long arch_scale_freq_capacity(int cpu) +{ -+ return SCHED_CAPACITY_SCALE; ++ return SCHED_CAPACITY_SCALE; +} +#endif + @@ -7519,6 +8115,24 @@ index 000000000000..99be2c51c88d + return rq->clock_task; +} + ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ +/* + * {de,en}queue flags: + * @@ -7535,11 +8149,13 @@ index 000000000000..99be2c51c88d +/* + * Below are scheduler API which using in other kernel code + * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * ToDo : PDS need to support these APIs for compatibility with mainline + * scheduler code. + */ +struct rq_flags { + unsigned long flags; ++ struct pin_cookie cookie; ++ unsigned int clock_update_flags; +}; + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -7555,6 +8171,26 @@ index 000000000000..99be2c51c88d + raw_spin_unlock(&rq->lock); +} + ++static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) ++{ ++ rf->cookie = lockdep_pin_lock(&rq->lock); ++ ++#ifdef CONFIG_SCHED_DEBUG ++ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); ++ rf->clock_update_flags = 0; ++#endif ++} ++ ++static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ if (rq->clock_update_flags > RQCF_ACT_SKIP) ++ rf->clock_update_flags = RQCF_UPDATED; ++#endif ++ ++ lockdep_unpin_lock(&rq->lock, rf->cookie); ++} ++ +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) @@ -7565,12 +8201,36 @@ index 000000000000..99be2c51c88d +} + +static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++ rq_pin_lock(rq, rf); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ rq_unpin_lock(rq, rf); ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + raw_spin_unlock_irq(&rq->lock); +} + ++static inline void ++rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ rq_unpin_lock(rq, rf); ++ raw_spin_unlock(&rq->lock); ++} ++ +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) @@ -7596,6 +8256,8 @@ index 000000000000..99be2c51c88d + +extern struct static_key_false sched_schedstats; + ++extern void flush_smp_call_function_from_idle(void); ++ +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) @@ -7631,24 +8293,6 @@ index 000000000000..99be2c51c88d + +#include "stats.h" + -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 -+#define 
NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? -+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 total; @@ -7712,8 +8356,15 @@ index 000000000000..99be2c51c88d + if (data) + data->func(data, rq_clock(rq), flags); +} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef CONFIG_NO_HZ_FULL @@ -7732,6 +8383,8 @@ index 000000000000..99be2c51c88d + +extern void schedule_idle(void); + ++#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) ++ +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * @@ -7788,668 +8441,36 @@ index 000000000000..99be2c51c88d +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..aff0bb30a884 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,20 @@ -+#ifndef BMQ_H -+#define BMQ_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, SCHED_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ -+#endif -diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h -new file mode 100644 -index 000000000000..ad9a7c448da7 ---- /dev/null -+++ b/kernel/sched/bmq_imp.h -@@ -0,0 +1,185 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline int task_sched_prio(struct task_struct *p, struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+} -+ -+static inline void update_task_priodl(struct task_struct *p) {} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return find_first_bit(rq->queue.bitmap, SCHED_BITS); -+} -+ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ struct bmq *q = &rq->queue; -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ struct bmq *q = &rq->queue; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); -+ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); -+ set_bit(idle->bmq_idx, q->bitmap); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ list_del(&p->bmq_node); \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap);\ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->bmq_idx = task_sched_prio(p, rq); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ -+ set_bit(p->bmq_idx, rq->queue.bitmap) -+ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{ \ -+ int idx = task_sched_prio(p, rq); \ -+\ -+ list_del(&p->bmq_node); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ -+ if (idx != p->bmq_idx) { \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap); \ -+ p->bmq_idx = idx; \ -+ set_bit(p->bmq_idx, rq->queue.bitmap); \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ return (task_sched_prio(p, rq) != p->bmq_idx); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+ -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index e39008242cf4..5963716fe391 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_ALT */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. -@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -912,6 +923,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) - cpufreq_governor_init(schedutil_gov); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_ALT - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -942,4 +954,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_ALT */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 5a55d2300452..66a0ab7165f0 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -614,7 +614,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f324dc36fc43..a6b566bda65b 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. - */ -@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..7fdeace7e8a5 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,14 @@ -+#ifndef PDS_H -+#define PDS_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+#endif -diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h -new file mode 100644 -index 000000000000..6baee5e961b9 ---- /dev/null -+++ b/kernel/sched/pds_imp.h -@@ -0,0 +1,257 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static const u64 user_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, -+/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, -+/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, -+/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, -+/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, -+/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, -+/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, -+/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 -+}; -+ -+static const unsigned char dl_level_map[] = { -+/* 0 4 8 12 */ -+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, -+/* 16 20 24 28 */ -+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, -+/* 32 36 40 44 */ -+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, -+/* 48 52 56 60 */ -+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, -+/* 64 68 72 76 */ -+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, -+/* 80 84 88 92 */ -+ 1, 0 -+}; -+ -+static inline int -+task_sched_prio(const struct task_struct *p, const struct rq *rq) -+{ -+ size_t delta; -+ -+ if (p == rq->idle) -+ return IDLE_TASK_SCHED_PRIO; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return p->prio; -+ -+ 
delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; -+ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); -+ -+ return MAX_RT_PRIO + dl_level_map[delta]; -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+} -+ -+/* -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. -+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Init the queue structure in rq -+ */ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ -+ int default_prio = idle->prio; -+ -+ idle->prio = MAX_PRIO; -+ idle->deadline = 0ULL; -+ update_task_priodl(idle); -+ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ -+ idle->sl_node.level = idle->sl_level; -+ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); -+ -+ idle->prio = default_prio; -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ BUG_ON(node == &rq->sl_header); -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *next = p->sl_node.next[0]; -+ -+ BUG_ON(next == &rq->sl_header); -+ return skiplist_entry(next, struct task_struct, sl_node); -+} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return task_sched_prio(sched_rq_first_task(rq), rq); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sl_node.level = p->sl_level; \ -+ pds_skiplist_insert(&rq->sl_header, &p->sl_node) -+ -+/* -+ * Requeue a task @p to @rq -+ */ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{\ -+ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); \ 
-+\ -+ p->sl_node.level = p->sl_level; \ -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *node = p->sl_node.prev[0]; -+ -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl > p->priodl) -+ return true; -+ } -+ -+ node = p->sl_node.next[0]; -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl < p->priodl) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. -+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. -+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->sl_level = pds_skiplist_random_level(p); -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int ret; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ -+ preempt_disable(); -+ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+static void sched_task_ttwu(struct task_struct *p) {} -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} ++#endif /* PDS_SCHED_H */ diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index 2c613e1cff3a..0103b2a7201d 100644 +index 2c613e1cff3a..02bef8978060 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS /* * sched_entity: * -@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +@@ -387,6 +388,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#endif --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + #ifdef CONFIG_SCHED_THERMAL_PRESSURE /* - * thermal: - * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index 795e43e02afc..856163dac896 100644 +index 795e43e02afc..d1fc38858d7f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ +@@ -1,11 +1,13 @@ #ifdef CONFIG_SMP #include "sched-pelt.h" -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); @@ -8457,16 +8478,13 @@ index 795e43e02afc..856163dac896 100644 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#endif --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + #ifdef CONFIG_SCHED_THERMAL_PRESSURE int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); - - static inline u64 thermal_load_avg(struct rq *rq) @@ -42,6 +44,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) return LOAD_AVG_MAX - 1024 + avg->period_contrib; } -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. 
@@ -8474,56 +8492,52 @@ index 795e43e02afc..856163dac896 100644 return rq_clock_pelt(rq_of(cfs_rq)); } #endif -+#endif /* CONFIG_SCHED_ALT */ ++#endif /* CONFIG_SCHED_PDS */ #else -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { -@@ -182,6 +187,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +@@ -193,6 +198,7 @@ static inline u64 thermal_load_avg(struct rq *rq) { return 0; } +#endif static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) + update_irq_load_avg(struct rq *rq, u64 running) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 28709f6b0975..6bc68bacbac8 100644 +index 28709f6b0975..21a6c761703a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,10 @@ /* * Scheduler internal types and methods: */ -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" +#else + #include <linux/sched.h> #include <linux/sched/autogroup.h> -@@ -2626,3 +2630,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) +@@ -2626,3 +2630,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ ++#endif /* !CONFIG_SCHED_PDS */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..108422ebc7bf 100644 +index 750fb3c67eed..45bd43942575 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) } else { struct rq *rq; #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS struct sched_domain *sd; int dcount = 0; +#endif @@ -8534,7 +8548,7 @@ index 750fb3c67eed..108422ebc7bf 100644 seq_printf(seq, "\n"); #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -8546,73 +8560,22 @@ index 750fb3c67eed..108422ebc7bf 100644 #endif } return 0; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..cc946a9bd550 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -4,6 +4,7 @@ - */ - #include "sched.h" - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. 
-@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology = tl; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index afad085960b8..e91b4cb3042b 100644 +index afad085960b8..61b25c6470d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c -@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_ALT -+static int __maybe_unused zero = 0; +@@ -117,9 +117,13 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int two_hundred = 200; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly two_hundred = 200; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; +extern int sched_yield_type; +#endif #ifdef CONFIG_PRINTK @@ -8623,7 +8586,7 @@ index afad085960b8..e91b4cb3042b 100644 #endif -#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ @@ -8631,7 +8594,7 @@ index afad085960b8..e91b4cb3042b 100644 } static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, @@ -8639,45 +8602,39 @@ index afad085960b8..e91b4cb3042b 100644 .extra2 = SYSCTL_ONE, }, #endif -+#endif /* !CONFIG_SCHED_ALT */ ++#endif /* !CONFIG_SCHED_PDS */ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -@@ -2430,6 +2436,17 @@ static struct ctl_table kern_table[] = { +@@ -2430,6 +2436,26 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -+#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, + { + .procname = "yield_type", + .data = &sched_yield_type, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, ++ .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..81f2ee62c807 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1927,8 +1927,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum 
hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index a71758e34e45..d20c347df861 100644 +index a71758e34e45..fd62616c45ad 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) @@ -8693,7 +8650,7 @@ index a71758e34e45..d20c347df861 100644 } } -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { @@ -8709,7 +8666,7 @@ index a71758e34e45..d20c347df861 100644 u64 samples[CPUCLOCK_MAX]; unsigned long soft; -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS if (dl_task(tsk)) check_dl_overrun(tsk); +#endif @@ -8729,7 +8686,7 @@ index a71758e34e45..d20c347df861 100644 return true; } -+#ifndef CONFIG_SCHED_ALT ++#ifndef CONFIG_SCHED_PDS if (dl_task(tsk) && tsk->dl.dl_overrun) return true; +#endif @@ -8737,15 +8694,15 @@ index a71758e34e45..d20c347df861 100644 return false; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..65f60c77bc50 100644 +index b5e3496cf803..0816db0b9c16 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ + .sched_policy = SCHED_RR, +#else .sched_policy = SCHED_DEADLINE, @@ -8756,54 +8713,3 @@ index b5e3496cf803..65f60c77bc50 100644 }; struct wakeup_test_data *x = data; -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -index f36264fea75c6ca7c34eaa259c0bff829cbf6ac0..d43ca62fd00fe442bda9b4ad548fae432a7436de 100644 ---- a/kernel/sched/alt_core.c -+++ b/kernel/sched/alt_core.c -@@ -11,6 +11,10 @@ - * scheduler by Alfred Chen. - * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. - */ -+#define CREATE_TRACE_POINTS -+#include <trace/events/sched.h> -+#undef CREATE_TRACE_POINTS -+ - #include "sched.h" - - #include <linux/sched/rt.h> -@@ -42,8 +46,11 @@ - #include "pelt.h" - #include "smp.h" - --#define CREATE_TRACE_POINTS --#include <trace/events/sched.h> -+/* -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event -+ * associated with them) to allow external modules to probe them. -+ */ -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); - - #define ALT_SCHED_VERSION "v5.9-r0" - -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -index 99be2c51c88d0406cced20b36d7230da12930a5c..03f8b8b1aa27eeb15989af25b4050c767da12aad 100644 ---- a/kernel/sched/alt_sched.h -+++ b/kernel/sched/alt_sched.h -@@ -46,6 +46,8 @@ - - #include "cpupri.h" - -+#include <trace/events/sched.h> -+ - #ifdef CONFIG_SCHED_BMQ - #include "bmq.h" - #endif -@@ -496,6 +498,8 @@ static inline int sched_tick_offload_init(void) { return 0; } - - extern void schedule_idle(void); - -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! 
- * @@ -59,7 +59,7 @@ _subarch= _localmodcfg= pkgbase=linux-pds -pkgver=5.9.1.arch1 +pkgver=5.9.2.arch1 pkgrel=1 pkgdesc="Linux" _srcver_tag=v${pkgver%.*}-${pkgver##*.} @@ -94,8 +94,8 @@ source=( "git+$_repo_url_gcc_patch" config # the main kernel config file sphinx-workaround.patch - 0009-prjc_v5.9-r0.patch - 0005-glitched-pds.patch + 0005-v5.9_undead-pds099o.patch + 0005-undead-glitched-pds.patch ) validpgpkeys=( "ABAF11C65A2970B130ABE3C479BE3E4300411886" # Linus Torvalds @@ -104,10 +104,10 @@ validpgpkeys=( ) sha512sums=('SKIP' 'SKIP' - '29e6b6b45fec5a93cfdd41d2286c406ed94aaee0148df0e452ace250eeff9287cf87d9a339af34b9beec690db5a3b439a2c7c441313f05f577a4e11b056b1610' + 'cefb516ae87c748f8fa6c5f227d932938be06e32774305cbea4d29c342359ffcd4eed21b80cb560d0a3e0a016c801a1446034b5aec521808f0e27d5897e155d9' '98e97155f86bbe837d43f27ec1018b5b6fdc6c372d6f7f2a0fe29da117d53979d9f9c262f886850d92002898682781029b80d4ee923633fc068f979e6c8254be' - 'afc135ec7c147ab6dc22e34f1f3373bde30a3a5fb77032832470ededf97a0a1a3e1fd4294bd0a03ef3edc51a10331ba7e37e63d5f6d6d603111600693bac9755' - '889f0a49f326de3f119290256393b09a9e9241c2a297ca0b7967a2884e4e35d71388d2a559e4c206f55f67228b65e8f2013a1ec61f6ff8f1de3b6a725fd5fa57') + 'e41d0f8a3ace142947fc5497f7377cf5a497ce1764ca96fdc6dc4915b027ac99a15296ad22c4ef99a3a5eb812614b5b280480249747a5c318452543cd85ce620' + '2cf83af1322f0fe5b9751e2b77fa1c890c7c22d9213b1cdfb57ca7f7a89a2cb263c213e178417ae1b7e947b386796b4b71507b127ec698cba661799346b33bbd') export KBUILD_BUILD_HOST=archlinux export KBUILD_BUILD_USER=$pkgbase @@ -124,8 +124,8 @@ prepare() { PatchesArray=( sphinx-workaround.patch $_reponame_gcc_patch/$_gcc_patch_name - 0009-prjc_v5.9-r0.patch - 0005-glitched-pds.patch + 0005-v5.9_undead-pds099o.patch + 0005-undead-glitched-pds.patch ) for MyPatch in "${PatchesArray[@]}" do @@ -155,7 +155,7 @@ prepare() { fi # Set yield_type to 0 - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/pds.c # do not run 'make olddefconfig' as it sets default options yes "" | make config >/dev/null @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 5.9.1-arch1 Kernel Configuration +# Linux/x86 5.9.2-arch1 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 10.2.0" CONFIG_CC_IS_GCC=y @@ -18,6 +18,7 @@ CONFIG_THREAD_INFO_IN_TASK=y # # General setup # +CONFIG_SCHED_PDS=y CONFIG_INIT_ENV_ARG_LIMIT=32 # CONFIG_COMPILE_TEST is not set CONFIG_LOCALVERSION="" @@ -156,9 +157,6 @@ CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y # # Scheduler features # -CONFIG_SCHED_ALT=y -# CONFIG_SCHED_BMQ is not set -CONFIG_SCHED_PDS=y # end of Scheduler features CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y @@ -387,6 +385,7 @@ CONFIG_NR_CPUS_RANGE_END=512 CONFIG_NR_CPUS_DEFAULT=64 CONFIG_NR_CPUS=320 CONFIG_SCHED_SMT=y +CONFIG_SMT_NICE=y CONFIG_SCHED_MC=y CONFIG_SCHED_MC_PRIO=y CONFIG_X86_LOCAL_APIC=y @@ -10141,6 +10140,7 @@ CONFIG_NFSD_PNFS=y CONFIG_NFSD_BLOCKLAYOUT=y CONFIG_NFSD_SCSILAYOUT=y # CONFIG_NFSD_FLEXFILELAYOUT is not set +# CONFIG_NFSD_V4_2_INTER_SSC is not set CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_GRACE_PERIOD=m CONFIG_LOCKD=m @@ -11068,6 +11068,7 @@ CONFIG_LKDTM=m # CONFIG_TEST_STACKINIT is not set # CONFIG_TEST_MEMINIT is not set # CONFIG_TEST_HMM is not set +# CONFIG_TEST_FREE_PAGES is not set # CONFIG_TEST_FPU is not set # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set |
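
After booting the kernel built from this PKGBUILD, the PDS sysctls that the renamed patch adds to kern_table in kernel/sysctl.c should appear under /proc/sys/kernel. The lines below are a minimal verification sketch, not part of the package: they assume the config shipped above is used unchanged, and the /proc/config.gz check additionally assumes IKCONFIG_PROC is enabled in that config.

    # confirm the running kernel was built with the PDS scheduler
    zgrep 'CONFIG_SCHED_PDS=' /proc/config.gz    # expect CONFIG_SCHED_PDS=y

    # yield_type: the sed in prepare() flips the compiled-in default to 0;
    # the sysctl table above clamps it to the range 0-2
    cat /proc/sys/kernel/yield_type
    sudo sysctl -w kernel.yield_type=1

    # rr_interval: PDS timeslice knob, clamped to 1-1000 by the same table
    cat /proc/sys/kernel/rr_interval

If the yield_type file is missing, the patch most likely did not apply or the booted kernel is not the one built here; both knobs revert to their compiled-in defaults on reboot unless persisted via sysctl.d.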