Diffstat (limited to 'intel-pstate-backport.patch')
-rw-r--r-- | intel-pstate-backport.patch | 775
1 files changed, 775 insertions, 0 deletions
diff --git a/intel-pstate-backport.patch b/intel-pstate-backport.patch
new file mode 100644
index 000000000000..8b6146401240
--- /dev/null
+++ b/intel-pstate-backport.patch
@@ -0,0 +1,775 @@
+--- linux-4.6/drivers/cpufreq/intel_pstate.c.orig	2016-05-15 18:43:13.000000000 -0400
++++ linux-4.6/drivers/cpufreq/intel_pstate.c	2016-06-24 17:36:23.064118833 -0400
+@@ -10,6 +10,8 @@
+  * of the License.
+  */
+ 
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
+ #include <linux/kernel.h>
+ #include <linux/kernel_stat.h>
+ #include <linux/module.h>
+@@ -39,10 +41,17 @@
+ #define ATOM_TURBO_RATIOS	0x66c
+ #define ATOM_TURBO_VIDS		0x66d
+ 
++#ifdef CONFIG_ACPI
++#include <acpi/processor.h>
++#endif
++
+ #define FRAC_BITS 8
+ #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
+ #define fp_toint(X) ((X) >> FRAC_BITS)
+ 
++#define EXT_BITS 6
++#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
++
+ static inline int32_t mul_fp(int32_t x, int32_t y)
+ {
+ 	return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
+@@ -64,12 +73,22 @@
+ 	return ret;
+ }
+ 
++static inline u64 mul_ext_fp(u64 x, u64 y)
++{
++	return (x * y) >> EXT_FRAC_BITS;
++}
++
++static inline u64 div_ext_fp(u64 x, u64 y)
++{
++	return div64_u64(x << EXT_FRAC_BITS, y);
++}
++
+ /**
+  * struct sample -	Store performance sample
+- * @core_pct_busy:	Ratio of APERF/MPERF in percent, which is actual
++ * @core_avg_perf:	Ratio of APERF/MPERF which is the actual average
+  *			performance during last sample period
+  * @busy_scaled:	Scaled busy value which is used to calculate next
+- *			P state. This can be different than core_pct_busy
++ *			P state. This can be different than core_avg_perf
+  *			to account for cpu idle period
+  * @aperf:		Difference of actual performance frequency clock count
+  *			read from APERF MSR between last and current sample
+@@ -84,7 +103,7 @@
+  * data for choosing next P State.
+  */
+ struct sample {
+-	int32_t core_pct_busy;
++	int32_t core_avg_perf;
+ 	int32_t busy_scaled;
+ 	u64 aperf;
+ 	u64 mperf;
+@@ -162,6 +181,7 @@
+  * struct cpudata -	Per CPU instance data storage
+  * @cpu:		CPU number for this instance data
+  * @update_util:	CPUFreq utility callback information
++ * @update_util_set:	CPUFreq utility callback is set
+  * @pstate:		Stores P state limits for this CPU
+  * @vid:		Stores VID limits for this CPU
+  * @pid:		Stores PID parameters for this CPU
+@@ -172,6 +192,8 @@
+  * @prev_cummulative_iowait: IO Wait time difference from last and
+  *			current sample
+  * @sample:		Storage for storing last Sample data
++ * @acpi_perf_data:	Stores ACPI perf information read from _PSS
++ * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
+  *
+  * This structure stores per CPU instance data for all CPUs.
+  */
+@@ -179,6 +201,7 @@
+ 	int cpu;
+ 
+ 	struct update_util_data update_util;
++	bool update_util_set;
+ 
+ 	struct pstate_data pstate;
+ 	struct vid_data vid;
+@@ -190,6 +213,10 @@
+ 	u64	prev_tsc;
+ 	u64	prev_cummulative_iowait;
+ 	struct sample sample;
++#ifdef CONFIG_ACPI
++	struct acpi_processor_performance acpi_perf_data;
++	bool valid_pss_table;
++#endif
+ };
+ 
+ static struct cpudata **all_cpu_data;
+@@ -258,6 +285,9 @@
+ static struct pstate_funcs pstate_funcs;
+ static int hwp_active;
+ 
++#ifdef CONFIG_ACPI
++static bool acpi_ppc;
++#endif
+ 
+ /**
+  * struct perf_limits - Store user and policy limits
+@@ -331,6 +361,124 @@
+ static struct perf_limits *limits = &powersave_limits;
+ #endif
+ 
++#ifdef CONFIG_ACPI
++
++static bool intel_pstate_get_ppc_enable_status(void)
++{
++	if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
++	    acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
++		return true;
++
++	return acpi_ppc;
++}
++
++/*
++ * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
++ * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
++ * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
++ * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
++ * target ratio 0x17. The _PSS control value stores in a format which can be
++ * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
++ * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
++ * This function converts the _PSS control value to intel pstate driver format
++ * for comparison and assignment.
++ */
++static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
++{
++	return cpu->acpi_perf_data.states[index].control >> 8;
++}
++
++static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
++{
++	struct cpudata *cpu;
++	int turbo_pss_ctl;
++	int ret;
++	int i;
++
++	if (hwp_active)
++		return;
++
++	if (!intel_pstate_get_ppc_enable_status())
++		return;
++
++	cpu = all_cpu_data[policy->cpu];
++
++	ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
++						  policy->cpu);
++	if (ret)
++		return;
++
++	/*
++	 * Check if the control value in _PSS is for PERF_CTL MSR, which should
++	 * guarantee that the states returned by it map to the states in our
++	 * list directly.
++	 */
++	if (cpu->acpi_perf_data.control_register.space_id !=
++						ACPI_ADR_SPACE_FIXED_HARDWARE)
++		goto err;
++
++	/*
++	 * If there is only one entry _PSS, simply ignore _PSS and continue as
++	 * usual without taking _PSS into account
++	 */
++	if (cpu->acpi_perf_data.state_count < 2)
++		goto err;
++
++	pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
++	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
++		pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
++			 (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
++			 (u32) cpu->acpi_perf_data.states[i].core_frequency,
++			 (u32) cpu->acpi_perf_data.states[i].power,
++			 (u32) cpu->acpi_perf_data.states[i].control);
++	}
++
++	/*
++	 * The _PSS table doesn't contain whole turbo frequency range.
++	 * This just contains +1 MHZ above the max non turbo frequency,
++	 * with control value corresponding to max turbo ratio. But
++	 * when cpufreq set policy is called, it will call with this
++	 * max frequency, which will cause a reduced performance as
++	 * this driver uses real max turbo frequency as the max
++	 * frequency. So correct this frequency in _PSS table to
++	 * correct max turbo frequency based on the turbo ratio.
++	 * Also need to convert to MHz as _PSS freq is in MHz.
++	 */
++	turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
++	if (turbo_pss_ctl > cpu->pstate.max_pstate)
++		cpu->acpi_perf_data.states[0].core_frequency =
++					policy->cpuinfo.max_freq / 1000;
++	cpu->valid_pss_table = true;
++	pr_info("_PPC limits will be enforced\n");
++
++	return;
++
++ err:
++	cpu->valid_pss_table = false;
++	acpi_processor_unregister_performance(policy->cpu);
++}
++
++static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
++{
++	struct cpudata *cpu;
++
++	cpu = all_cpu_data[policy->cpu];
++	if (!cpu->valid_pss_table)
++		return;
++
++	acpi_processor_unregister_performance(policy->cpu);
++}
++
++#else
++static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
++{
++}
++
++static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
++{
++}
++#endif
++
+ static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
+ 			     int deadband, int integral) {
+ 	pid->setpoint = int_tofp(setpoint);
+@@ -341,17 +489,17 @@
+ 
+ static inline void pid_p_gain_set(struct _pid *pid, int percent)
+ {
+-	pid->p_gain = div_fp(int_tofp(percent), int_tofp(100));
++	pid->p_gain = div_fp(percent, 100);
+ }
+ 
+ static inline void pid_i_gain_set(struct _pid *pid, int percent)
+ {
+-	pid->i_gain = div_fp(int_tofp(percent), int_tofp(100));
++	pid->i_gain = div_fp(percent, 100);
+ }
+ 
+ static inline void pid_d_gain_set(struct _pid *pid, int percent)
+ {
+-	pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
++	pid->d_gain = div_fp(percent, 100);
+ }
+ 
+ static signed int pid_calc(struct _pid *pid, int32_t busy)
+@@ -537,7 +685,7 @@
+ 
+ 	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
+ 	no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+-	turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total));
++	turbo_fp = div_fp(no_turbo, total);
+ 	turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
+ 	return sprintf(buf, "%u\n", turbo_pct);
+ }
+@@ -579,7 +727,7 @@
+ 
+ 	update_turbo_state();
+ 	if (limits->turbo_disabled) {
+-		pr_warn("intel_pstate: Turbo disabled by BIOS or unavailable on processor\n");
++		pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
+ 		return -EPERM;
+ 	}
+ 
+@@ -608,8 +756,7 @@
+ 				   limits->max_perf_pct);
+ 	limits->max_perf_pct = max(limits->min_perf_pct,
+ 				   limits->max_perf_pct);
+-	limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+-				  int_tofp(100));
++	limits->max_perf = div_fp(limits->max_perf_pct, 100);
+ 
+ 	if (hwp_active)
+ 		intel_pstate_hwp_set_online_cpus();
+@@ -633,8 +780,7 @@
+ 				   limits->min_perf_pct);
+ 	limits->min_perf_pct = min(limits->max_perf_pct,
+ 				   limits->min_perf_pct);
+-	limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
+-				  int_tofp(100));
++	limits->min_perf = div_fp(limits->min_perf_pct, 100);
+ 
+ 	if (hwp_active)
+ 		intel_pstate_hwp_set_online_cpus();
+@@ -1019,15 +1165,11 @@
+ 	intel_pstate_set_min_pstate(cpu);
+ }
+ 
+-static inline void intel_pstate_calc_busy(struct cpudata *cpu)
++static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
+ {
+ 	struct sample *sample = &cpu->sample;
+-	int64_t core_pct;
+-
+-	core_pct = int_tofp(sample->aperf) * int_tofp(100);
+-	core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
+ 
+-	sample->core_pct_busy = (int32_t)core_pct;
++	sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
+ }
+ 
+ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
+@@ -1070,9 +1212,14 @@
+ 
+ static inline int32_t get_avg_frequency(struct cpudata *cpu)
+ {
+-	return fp_toint(mul_fp(cpu->sample.core_pct_busy,
+-			       int_tofp(cpu->pstate.max_pstate_physical *
+-						cpu->pstate.scaling / 100)));
++	return mul_ext_fp(cpu->sample.core_avg_perf,
++			  cpu->pstate.max_pstate_physical * cpu->pstate.scaling);
++}
++
++static inline int32_t get_avg_pstate(struct cpudata *cpu)
++{
++	return mul_ext_fp(cpu->pstate.max_pstate_physical,
++			  cpu->sample.core_avg_perf);
+ }
+ 
+ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
+@@ -1107,49 +1254,43 @@
+ 	cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
+ 	cpu->sample.busy_scaled = cpu_load;
+ 
+-	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
++	return get_avg_pstate(cpu) - pid_calc(&cpu->pid, cpu_load);
+ }
+ 
+ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
+ {
+-	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
++	int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
+ 	u64 duration_ns;
+ 
+ 	/*
+-	 * core_busy is the ratio of actual performance to max
+-	 * max_pstate is the max non turbo pstate available
+-	 * current_pstate was the pstate that was requested during
+-	 *	the last sample period.
+-	 *
+-	 * We normalize core_busy, which was our actual percent
+-	 * performance to what we requested during the last sample
+-	 * period. The result will be a percentage of busy at a
+-	 * specified pstate.
++	 * perf_scaled is the average performance during the last sampling
++	 * period scaled by the ratio of the maximum P-state to the P-state
++	 * requested last time (in percent). That measures the system's
++	 * response to the previous P-state selection.
+ 	 */
+-	core_busy = cpu->sample.core_pct_busy;
+-	max_pstate = int_tofp(cpu->pstate.max_pstate_physical);
+-	current_pstate = int_tofp(cpu->pstate.current_pstate);
+-	core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
++	max_pstate = cpu->pstate.max_pstate_physical;
++	current_pstate = cpu->pstate.current_pstate;
++	perf_scaled = mul_ext_fp(cpu->sample.core_avg_perf,
++				 div_fp(100 * max_pstate, current_pstate));
+ 
+ 	/*
+ 	 * Since our utilization update callback will not run unless we are
+ 	 * in C0, check if the actual elapsed time is significantly greater (3x)
+ 	 * than our sample interval. If it is, then we were idle for a long
+-	 * enough period of time to adjust our busyness.
++	 * enough period of time to adjust our performance metric.
+ 	 */
+ 	duration_ns = cpu->sample.time - cpu->last_sample_time;
+ 	if ((s64)duration_ns > pid_params.sample_rate_ns * 3) {
+-		sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
+-				      int_tofp(duration_ns));
+-		core_busy = mul_fp(core_busy, sample_ratio);
++		sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns);
++		perf_scaled = mul_fp(perf_scaled, sample_ratio);
+ 	} else {
+ 		sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc);
+ 		if (sample_ratio < int_tofp(1))
+-			core_busy = 0;
++			perf_scaled = 0;
+ 	}
+ 
+-	cpu->sample.busy_scaled = core_busy;
+-	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
++	cpu->sample.busy_scaled = perf_scaled;
++	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
+ }
+ 
+ static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+@@ -1179,7 +1320,7 @@
+ 	intel_pstate_update_pstate(cpu, target_pstate);
+ 
+ 	sample = &cpu->sample;
+-	trace_pstate_sample(fp_toint(sample->core_pct_busy),
++	trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
+ 		fp_toint(sample->busy_scaled),
+ 		from,
+ 		cpu->pstate.current_pstate,
+@@ -1199,7 +1340,7 @@
+ 	bool sample_taken = intel_pstate_sample(cpu, time);
+ 
+ 	if (sample_taken) {
+-		intel_pstate_calc_busy(cpu);
++		intel_pstate_calc_avg_perf(cpu);
+ 		if (!hwp_active)
+ 			intel_pstate_adjust_busy_pstate(cpu);
+ 	}
+@@ -1261,23 +1402,16 @@
+ 
+ 	intel_pstate_busy_pid_reset(cpu);
+ 
+-	cpu->update_util.func = intel_pstate_update_util;
+-
+-	pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
++	pr_debug("controlling: cpu %d\n", cpunum);
+ 
+ 	return 0;
+ }
+ 
+ static unsigned int intel_pstate_get(unsigned int cpu_num)
+ {
+-	struct sample *sample;
+-	struct cpudata *cpu;
++	struct cpudata *cpu = all_cpu_data[cpu_num];
+ 
+-	cpu = all_cpu_data[cpu_num];
+-	if (!cpu)
+-		return 0;
+-	sample = &cpu->sample;
+-	return get_avg_frequency(cpu);
++	return cpu ? get_avg_frequency(cpu) : 0;
+ }
+ 
+ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
+@@ -1286,12 +1420,20 @@
+ 
+ 	/* Prevent intel_pstate_update_util() from using stale data. */
+ 	cpu->sample.time = 0;
+-	cpufreq_set_update_util_data(cpu_num, &cpu->update_util);
++	cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
++				     intel_pstate_update_util);
++	cpu->update_util_set = true;
+ }
+ 
+ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
+ {
+-	cpufreq_set_update_util_data(cpu, NULL);
++	struct cpudata *cpu_data = all_cpu_data[cpu];
++
++	if (!cpu_data->update_util_set)
++		return;
++
++	cpufreq_remove_update_util_hook(cpu);
++	cpu_data->update_util_set = false;
+ 	synchronize_sched();
+ }
+ 
+@@ -1311,20 +1453,31 @@
+ 
+ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
+ {
++	struct cpudata *cpu;
++
+ 	if (!policy->cpuinfo.max_freq)
+ 		return -ENODEV;
+ 
+ 	intel_pstate_clear_update_util_hook(policy->cpu);
+ 
++	cpu = all_cpu_data[0];
++	if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
++		if (policy->max < policy->cpuinfo.max_freq &&
++		    policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
++			pr_debug("policy->max > max non turbo frequency\n");
++			policy->max = policy->cpuinfo.max_freq;
++		}
++	}
++
+ 	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+ 		limits = &performance_limits;
+ 		if (policy->max >= policy->cpuinfo.max_freq) {
+-			pr_debug("intel_pstate: set performance\n");
++			pr_debug("set performance\n");
+ 			intel_pstate_set_performance_limits(limits);
+ 			goto out;
+ 		}
+ 	} else {
+-		pr_debug("intel_pstate: set powersave\n");
++		pr_debug("set powersave\n");
+ 		limits = &powersave_limits;
+ 	}
+ 
+@@ -1348,10 +1501,8 @@
+ 	/* Make sure min_perf_pct <= max_perf_pct */
+ 	limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+ 
+-	limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
+-				  int_tofp(100));
+-	limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+-				  int_tofp(100));
++	limits->min_perf = div_fp(limits->min_perf_pct, 100);
++	limits->max_perf = div_fp(limits->max_perf_pct, 100);
+ 
+  out:
+ 	intel_pstate_set_update_util_hook(policy->cpu);
+@@ -1377,7 +1528,7 @@
+ 	int cpu_num = policy->cpu;
+ 	struct cpudata *cpu = all_cpu_data[cpu_num];
+ 
+-	pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
++	pr_debug("CPU %d exiting\n", cpu_num);
+ 
+ 	intel_pstate_clear_update_util_hook(cpu_num);
+ 
+@@ -1410,12 +1561,20 @@
+ 	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
+ 	policy->cpuinfo.max_freq =
+ 		cpu->pstate.turbo_pstate * cpu->pstate.scaling;
++	intel_pstate_init_acpi_perf_limits(policy);
+ 	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ 	cpumask_set_cpu(policy->cpu, policy->cpus);
+ 
+ 	return 0;
+ }
+ 
++static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
++{
++	intel_pstate_exit_perf_limits(policy);
++
++	return 0;
++}
++
+ static struct cpufreq_driver intel_pstate_driver = {
+ 	.flags		= CPUFREQ_CONST_LOOPS,
+ 	.verify		= intel_pstate_verify_policy,
+@@ -1423,6 +1582,7 @@
+ 	.resume		= intel_pstate_hwp_set_policy,
+ 	.get		= intel_pstate_get,
+ 	.init		= intel_pstate_cpu_init,
++	.exit		= intel_pstate_cpu_exit,
+ 	.stop_cpu	= intel_pstate_stop_cpu,
+ 	.name		= "intel_pstate",
+ };
+@@ -1466,8 +1626,7 @@
+ 
+ }
+ 
+-#if IS_ENABLED(CONFIG_ACPI)
+-#include <acpi/processor.h>
++#ifdef CONFIG_ACPI
+ 
+ static bool intel_pstate_no_acpi_pss(void)
+ {
+@@ -1623,7 +1782,7 @@
+ 	if (intel_pstate_platform_pwr_mgmt_exists())
+ 		return -ENODEV;
+ 
+-	pr_info("Intel P-state driver initializing.\n");
++	pr_info("Intel P-state driver initializing\n");
+ 
+ 	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
+ 	if (!all_cpu_data)
+@@ -1640,7 +1799,7 @@
+ 	intel_pstate_sysfs_expose_params();
+ 
+ 	if (hwp_active)
+-		pr_info("intel_pstate: HWP enabled\n");
++		pr_info("HWP enabled\n");
+ 
+ 	return rc;
+ out:
+@@ -1666,13 +1825,19 @@
+ 	if (!strcmp(str, "disable"))
+ 		no_load = 1;
+ 	if (!strcmp(str, "no_hwp")) {
+-		pr_info("intel_pstate: HWP disabled\n");
++		pr_info("HWP disabled\n");
+ 		no_hwp = 1;
+ 	}
+ 	if (!strcmp(str, "force"))
+ 		force_load = 1;
+ 	if (!strcmp(str, "hwp_only"))
+ 		hwp_only = 1;
++
++#ifdef CONFIG_ACPI
++	if (!strcmp(str, "support_acpi_ppc"))
++		acpi_ppc = true;
++#endif
++
+ 	return 0;
+ }
+ early_param("intel_pstate", intel_pstate_setup);
+--- linux-4.6/kernel/sched/cpufreq.c.orig	2016-06-24 15:32:20.064495916 -0400
++++ linux-4.6/kernel/sched/cpufreq.c	2016-06-24 15:33:47.717298423 -0400
+@@ -35,3 +35,52 @@
+ 	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+ }
+ EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
++
++/**
++ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
++ * @cpu: The CPU to set the pointer for.
++ * @data: New pointer value.
++ * @func: Callback function to set for the CPU.
++ *
++ * Set and publish the update_util_data pointer for the given CPU.
++ *
++ * The update_util_data pointer of @cpu is set to @data and the callback
++ * function pointer in the target struct update_util_data is set to @func.
++ * That function will be called by cpufreq_update_util() from RCU-sched
++ * read-side critical sections, so it must not sleep. @data will always be
++ * passed to it as the first argument which allows the function to get to the
++ * target update_util_data structure and its container.
++ *
++ * The update_util_data pointer of @cpu must be NULL when this function is
++ * called or it will WARN() and return with no effect.
++ */
++void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
++			void (*func)(struct update_util_data *data, u64 time,
++				     unsigned long util, unsigned long max))
++{
++	if (WARN_ON(!data || !func))
++		return;
++
++	if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
++		return;
++
++	data->func = func;
++	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
++}
++EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
++
++/**
++ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
++ * @cpu: The CPU to clear the pointer for.
++ *
++ * Clear the update_util_data pointer for the given CPU.
++ *
++ * Callers must use RCU-sched callbacks to free any memory that might be
++ * accessed via the old update_util_data pointer or invoke synchronize_sched()
++ * right after this function to avoid use-after-free.
++ */
++void cpufreq_remove_update_util_hook(int cpu)
++{
++	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
++}
++EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+--- linux-4.6/include/linux/sched.h.dist	2016-06-24 19:19:15.391657951 -0400
++++ linux-4.6/include/linux/sched.h	2016-06-24 19:21:46.863939933 -0400
+@@ -3241,6 +3241,10 @@
+ };
+ 
+ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
++void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
++		       void (*func)(struct update_util_data *data, u64 time,
++				    unsigned long util, unsigned long max));
++void cpufreq_remove_update_util_hook(int cpu);
+ #endif /* CONFIG_CPU_FREQ */
+ 
+ #endif
+--- linux-4.6/drivers/cpufreq/intel_pstate.c.orig	2016-07-03 10:37:53.324091642 -0400
++++ linux-4.6/drivers/cpufreq/intel_pstate.c	2016-07-03 10:38:50.450757945 -0400
+@@ -372,26 +372,9 @@
+ 	return acpi_ppc;
+ }
+ 
+-/*
+- * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
+- * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
+- * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
+- * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
+- * target ratio 0x17. The _PSS control value stores in a format which can be
+- * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
+- * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
+- * This function converts the _PSS control value to intel pstate driver format
+- * for comparison and assignment.
+- */
+-static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
+-{
+-	return cpu->acpi_perf_data.states[index].control >> 8;
+-}
+-
+ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+ {
+ 	struct cpudata *cpu;
+-	int turbo_pss_ctl;
+ 	int ret;
+ 	int i;
+ 
+@@ -441,15 +424,14 @@
+ 	 * max frequency, which will cause a reduced performance as
+ 	 * this driver uses real max turbo frequency as the max
+ 	 * frequency. So correct this frequency in _PSS table to
+-	 * correct max turbo frequency based on the turbo ratio.
++	 * correct max turbo frequency based on the turbo state.
+ 	 * Also need to convert to MHz as _PSS freq is in MHz.
+ 	 */
+-	turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
+-	if (turbo_pss_ctl > cpu->pstate.max_pstate)
++	if (!limits->turbo_disabled)
+ 		cpu->acpi_perf_data.states[0].core_frequency =
+ 					policy->cpuinfo.max_freq / 1000;
+ 	cpu->valid_pss_table = true;
+-	pr_info("_PPC limits will be enforced\n");
++	pr_debug("_PPC limits will be enforced\n");
+ 
+ 	return;
+ 
+@@ -1418,6 +1400,9 @@
+ {
+ 	struct cpudata *cpu = all_cpu_data[cpu_num];
+ 
++	if (cpu->update_util_set)
++		return;
++
+ 	/* Prevent intel_pstate_update_util() from using stale data. */
+ 	cpu->sample.time = 0;
+ 	cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
+@@ -1458,15 +1443,15 @@
+ 	if (!policy->cpuinfo.max_freq)
+ 		return -ENODEV;
+ 
+-	intel_pstate_clear_update_util_hook(policy->cpu);
++	pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
++		 policy->cpuinfo.max_freq, policy->max);
+ 
+ 	cpu = all_cpu_data[0];
+-	if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
+-		if (policy->max < policy->cpuinfo.max_freq &&
+-		    policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
+-			pr_debug("policy->max > max non turbo frequency\n");
+-			policy->max = policy->cpuinfo.max_freq;
+-		}
++	if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
++	    policy->max < policy->cpuinfo.max_freq &&
++	    policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
++		pr_debug("policy->max > max non turbo frequency\n");
++		policy->max = policy->cpuinfo.max_freq;
+ 	}
+ 
+ 	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+@@ -1496,13 +1481,13 @@
+ 				   limits->max_sysfs_pct);
+ 	limits->max_perf_pct = max(limits->min_policy_pct,
+ 				   limits->max_perf_pct);
+-	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+ 
+ 	/* Make sure min_perf_pct <= max_perf_pct */
+ 	limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+ 
+ 	limits->min_perf = div_fp(limits->min_perf_pct, 100);
+ 	limits->max_perf = div_fp(limits->max_perf_pct, 100);
++	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+ 
+  out:
+ 	intel_pstate_set_update_util_hook(policy->cpu);
+@@ -1559,8 +1544,11 @@
+ 
+ 	/* cpuinfo and default policy values */
+ 	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
+-	policy->cpuinfo.max_freq =
+-		cpu->pstate.turbo_pstate * cpu->pstate.scaling;
++	update_turbo_state();
++	policy->cpuinfo.max_freq = limits->turbo_disabled ?
++			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
++	policy->cpuinfo.max_freq *= cpu->pstate.scaling;
++
+ 	intel_pstate_init_acpi_perf_limits(policy);
+ 	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ 	cpumask_set_cpu(policy->cpu, policy->cpus);
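
Reviewer notes (not part of the patch itself).

The heart of the first hunk is the fixed-point change: core_avg_perf now stores APERF/MPERF as a fraction with EXT_FRAC_BITS (6 + 8 = 14) fractional bits instead of a percentage with 8. The arithmetic is easy to sanity-check outside the kernel; here is a minimal userspace sketch, where the MSR deltas and the platform numbers (ratio 24, 100000 kHz per ratio unit) are invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define FRAC_BITS 8
#define EXT_BITS 6
#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)    /* 14 fractional bits */

/* Same shape as the helpers added by the patch. */
static inline uint64_t mul_ext_fp(uint64_t x, uint64_t y)
{
        return (x * y) >> EXT_FRAC_BITS;
}

static inline uint64_t div_ext_fp(uint64_t x, uint64_t y)
{
        return (x << EXT_FRAC_BITS) / y;        /* div64_u64() in the kernel */
}

int main(void)
{
        /* Hypothetical deltas between two reads of APERF and MPERF. */
        uint64_t aperf = 1800000, mperf = 2400000;

        /* core_avg_perf = APERF/MPERF in 14-bit fixed point (0.75 here). */
        uint64_t core_avg_perf = div_ext_fp(aperf, mperf);

        /* get_avg_frequency(): scale the max physical P-state by the
         * average performance; result lands directly in kHz. */
        unsigned int max_pstate_physical = 24, scaling = 100000;
        uint64_t avg_khz = mul_ext_fp(core_avg_perf,
                                      max_pstate_physical * scaling);

        printf("core_avg_perf = %" PRIu64 "/16384, avg freq = %" PRIu64 " kHz\n",
               core_avg_perf, avg_khz);
        return 0;
}

With the values above this prints 12288/16384 (0.75) and 1800000 kHz, matching the intuition that running at 75% average performance on a 2.4 GHz part means 1.8 GHz.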
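The retuned PID input in get_target_pstate_use_performance() can be checked the same way: perf_scaled is the average performance expressed as a percentage of the P-state requested last time, then attenuated when the utilization callback was starved by idle. A numeric sketch with invented sample values (the helpers mirror the driver's, widened to 64 bits for simplicity):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 8
#define EXT_FRAC_BITS 14
#define fp_toint(x) ((x) >> FRAC_BITS)

static int64_t mul_fp(int64_t x, int64_t y) { return (x * y) >> FRAC_BITS; }
static int64_t div_fp(int64_t x, int64_t y) { return (x << FRAC_BITS) / y; }
static uint64_t mul_ext_fp(uint64_t x, uint64_t y) { return (x * y) >> EXT_FRAC_BITS; }

int main(void)
{
        /* Invented sample: 75% average perf, P-state 20 requested, max 24. */
        uint64_t core_avg_perf = (3 << EXT_FRAC_BITS) / 4;
        int max_pstate = 24, current_pstate = 20;

        /* perf_scaled = core_avg_perf * (100 * max / current), in 8.8 fp:
         * 0.75 * 120% = 90% of the requested P-state was delivered. */
        int64_t perf_scaled = mul_ext_fp(core_avg_perf,
                        div_fp(100 * max_pstate, current_pstate));

        /* Idle case from the patch: an elapsed time of 4x the 10 ms sample
         * rate scales the metric down proportionally. */
        uint64_t sample_rate_ns = 10000000, duration_ns = 40000000;
        int64_t sample_ratio = div_fp(sample_rate_ns, duration_ns);
        int64_t attenuated = mul_fp(perf_scaled, sample_ratio);

        printf("perf_scaled = %lld%%, after idle window = %lld%%\n",
               (long long)fp_toint(perf_scaled),
               (long long)fp_toint(attenuated));
        return 0;
}

This prints 90% and 22%, i.e. a long idle gap drags the PID input down so the controller does not hold an inflated P-state after wakeup.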
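Finally, the kernel/sched/cpufreq.c hunk replaces the raw cpufreq_set_update_util_data() setter with an add/remove pair that also installs the callback and refuses to overwrite a live hook. Below is a rough userspace analogue of that publish/unpublish discipline, using C11 atomics as a stand-in for rcu_assign_pointer(); the per-CPU array, callback signature, and names are simplified inventions, not the kernel API:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct update_util_data {
        void (*func)(struct update_util_data *data, uint64_t time);
};

/* Stand-in for the per-CPU pointer published with rcu_assign_pointer(). */
static _Atomic(struct update_util_data *) cpu_hook[NR_CPUS];

static int add_update_util_hook(int cpu, struct update_util_data *data,
                void (*func)(struct update_util_data *, uint64_t))
{
        if (!data || !func)
                return -1;
        /* Like the WARN_ON() in the patch: never overwrite a live hook. */
        if (atomic_load(&cpu_hook[cpu]))
                return -1;
        data->func = func;
        /* Release ordering plays the role of rcu_assign_pointer(): the
         * callback pointer is visible before the data pointer is. */
        atomic_store_explicit(&cpu_hook[cpu], data, memory_order_release);
        return 0;
}

static void remove_update_util_hook(int cpu)
{
        /* Callers must still wait for readers to drain (synchronize_sched()
         * in the kernel) before freeing *data. */
        atomic_store_explicit(&cpu_hook[cpu], NULL, memory_order_release);
}

static void my_cb(struct update_util_data *data, uint64_t time)
{
        (void)data;
        printf("tick at %llu\n", (unsigned long long)time);
}

int main(void)
{
        struct update_util_data d;
        add_update_util_hook(0, &d, my_cb);

        struct update_util_data *p =
                atomic_load_explicit(&cpu_hook[0], memory_order_acquire);
        if (p)
                p->func(p, 42);

        remove_update_util_hook(0);
        return 0;
}

The guard against double-registration is what makes the driver's update_util_set bookkeeping (and the early return added to intel_pstate_set_update_util_hook() in the second diff) safe.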