path: root/intel-pstate-backport.patch
Diffstat (limited to 'intel-pstate-backport.patch')
-rw-r--r--   intel-pstate-backport.patch   775
1 files changed, 775 insertions, 0 deletions
diff --git a/intel-pstate-backport.patch b/intel-pstate-backport.patch
new file mode 100644
index 000000000000..8b6146401240
--- /dev/null
+++ b/intel-pstate-backport.patch
@@ -0,0 +1,775 @@
+--- linux-4.6/drivers/cpufreq/intel_pstate.c.orig 2016-05-15 18:43:13.000000000 -0400
++++ linux-4.6/drivers/cpufreq/intel_pstate.c 2016-06-24 17:36:23.064118833 -0400
+@@ -10,6 +10,8 @@
+ * of the License.
+ */
+
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
+ #include <linux/kernel.h>
+ #include <linux/kernel_stat.h>
+ #include <linux/module.h>
+@@ -39,10 +41,17 @@
+ #define ATOM_TURBO_RATIOS 0x66c
+ #define ATOM_TURBO_VIDS 0x66d
+
++#ifdef CONFIG_ACPI
++#include <acpi/processor.h>
++#endif
++
+ #define FRAC_BITS 8
+ #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
+ #define fp_toint(X) ((X) >> FRAC_BITS)
+
++#define EXT_BITS 6
++#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
++
+ static inline int32_t mul_fp(int32_t x, int32_t y)
+ {
+ return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
+@@ -64,12 +73,22 @@
+ return ret;
+ }
+
++static inline u64 mul_ext_fp(u64 x, u64 y)
++{
++ return (x * y) >> EXT_FRAC_BITS;
++}
++
++static inline u64 div_ext_fp(u64 x, u64 y)
++{
++ return div64_u64(x << EXT_FRAC_BITS, y);
++}
++
+ /**
+ * struct sample - Store performance sample
+- * @core_pct_busy: Ratio of APERF/MPERF in percent, which is actual
++ * @core_avg_perf: Ratio of APERF/MPERF which is the actual average
+ * performance during last sample period
+ * @busy_scaled: Scaled busy value which is used to calculate next
+- * P state. This can be different than core_pct_busy
++ * P state. This can be different than core_avg_perf
+ * to account for cpu idle period
+ * @aperf: Difference of actual performance frequency clock count
+ * read from APERF MSR between last and current sample
+@@ -84,7 +103,7 @@
+ * data for choosing next P State.
+ */
+ struct sample {
+- int32_t core_pct_busy;
++ int32_t core_avg_perf;
+ int32_t busy_scaled;
+ u64 aperf;
+ u64 mperf;
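The helpers added above extend the driver's 8-bit fixed-point scheme (FRAC_BITS) by another EXT_BITS = 6 bits, so the new core_avg_perf field carries the raw APERF/MPERF ratio with 14 fractional bits instead of a percentage. A minimal standalone sketch of that arithmetic follows; the counter deltas and the 2.7 GHz ceiling are made-up illustration values, not taken from the patch.

/* Standalone sketch of the extended fixed-point helpers added above.
 * The APERF/MPERF deltas and the kHz ceiling are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS     8
#define EXT_BITS      6
#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)   /* 14 fractional bits */

static uint64_t mul_ext_fp(uint64_t x, uint64_t y)
{
    return (x * y) >> EXT_FRAC_BITS;
}

static uint64_t div_ext_fp(uint64_t x, uint64_t y)
{
    return (x << EXT_FRAC_BITS) / y;           /* div64_u64() in the kernel */
}

int main(void)
{
    uint64_t aperf = 180000, mperf = 240000;   /* hypothetical counter deltas */
    uint64_t max_khz = 2700000;                /* hypothetical max frequency  */

    /* core_avg_perf = APERF/MPERF in extended fixed point, here 0.750 */
    uint64_t core_avg_perf = div_ext_fp(aperf, mperf);

    printf("avg perf = %.3f\n", (double)core_avg_perf / (1 << EXT_FRAC_BITS));
    printf("avg freq = %llu kHz\n",
           (unsigned long long)mul_ext_fp(core_avg_perf, max_khz));  /* 2025000 */
    return 0;
}

The second step is what get_avg_frequency() further down does, with max_pstate_physical * scaling in place of the literal kHz value.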
+@@ -162,6 +181,7 @@
+ * struct cpudata - Per CPU instance data storage
+ * @cpu: CPU number for this instance data
+ * @update_util: CPUFreq utility callback information
++ * @update_util_set: CPUFreq utility callback is set
+ * @pstate: Stores P state limits for this CPU
+ * @vid: Stores VID limits for this CPU
+ * @pid: Stores PID parameters for this CPU
+@@ -172,6 +192,8 @@
+ * @prev_cummulative_iowait: IO Wait time difference from last and
+ * current sample
+ * @sample: Storage for storing last Sample data
++ * @acpi_perf_data: Stores ACPI perf information read from _PSS
++ * @valid_pss_table: Set to true for valid ACPI _PSS entries found
+ *
+ * This structure stores per CPU instance data for all CPUs.
+ */
+@@ -179,6 +201,7 @@
+ int cpu;
+
+ struct update_util_data update_util;
++ bool update_util_set;
+
+ struct pstate_data pstate;
+ struct vid_data vid;
+@@ -190,6 +213,10 @@
+ u64 prev_tsc;
+ u64 prev_cummulative_iowait;
+ struct sample sample;
++#ifdef CONFIG_ACPI
++ struct acpi_processor_performance acpi_perf_data;
++ bool valid_pss_table;
++#endif
+ };
+
+ static struct cpudata **all_cpu_data;
+@@ -258,6 +285,9 @@
+ static struct pstate_funcs pstate_funcs;
+ static int hwp_active;
+
++#ifdef CONFIG_ACPI
++static bool acpi_ppc;
++#endif
+
+ /**
+ * struct perf_limits - Store user and policy limits
+@@ -331,6 +361,124 @@
+ static struct perf_limits *limits = &powersave_limits;
+ #endif
+
++#ifdef CONFIG_ACPI
++
++static bool intel_pstate_get_ppc_enable_status(void)
++{
++ if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
++ acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
++ return true;
++
++ return acpi_ppc;
++}
++
++/*
++ * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
++ * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
++ * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
++ * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
++ * target ratio 0x17. The _PSS control value stores in a format which can be
++ * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
++ * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
++ * This function converts the _PSS control value to intel pstate driver format
++ * for comparison and assignment.
++ */
++static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
++{
++ return cpu->acpi_perf_data.states[index].control >> 8;
++}
++
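As the comment above spells out, only bits 15:8 of a PERF_CTL write carry the target ratio, so the _PSS control value is converted by a plain right shift. A trivial standalone check, reusing the 0x1700 example from the comment itself:

/* Standalone illustration of the _PSS control -> pstate ratio conversion:
 * only bits 15:8 carry the target ratio, so 0x1700 requests ratio 0x17. */
#include <assert.h>
#include <stdio.h>

static int pss_control_to_pstate(unsigned int control)
{
    return control >> 8;    /* what convert_to_native_pstate_format() does */
}

int main(void)
{
    assert(pss_control_to_pstate(0x1700) == 0x17);
    printf("control 0x1700 -> pstate ratio %d\n", pss_control_to_pstate(0x1700));
    return 0;
}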
++static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
++{
++ struct cpudata *cpu;
++ int turbo_pss_ctl;
++ int ret;
++ int i;
++
++ if (hwp_active)
++ return;
++
++ if (!intel_pstate_get_ppc_enable_status())
++ return;
++
++ cpu = all_cpu_data[policy->cpu];
++
++ ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
++ policy->cpu);
++ if (ret)
++ return;
++
++ /*
++ * Check if the control value in _PSS is for PERF_CTL MSR, which should
++ * guarantee that the states returned by it map to the states in our
++ * list directly.
++ */
++ if (cpu->acpi_perf_data.control_register.space_id !=
++ ACPI_ADR_SPACE_FIXED_HARDWARE)
++ goto err;
++
++ /*
++ * If there is only one entry _PSS, simply ignore _PSS and continue as
++ * usual without taking _PSS into account
++ */
++ if (cpu->acpi_perf_data.state_count < 2)
++ goto err;
++
++ pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
++ for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
++ pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n",
++ (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
++ (u32) cpu->acpi_perf_data.states[i].core_frequency,
++ (u32) cpu->acpi_perf_data.states[i].power,
++ (u32) cpu->acpi_perf_data.states[i].control);
++ }
++
++ /*
++ * The _PSS table doesn't contain whole turbo frequency range.
++ * This just contains +1 MHZ above the max non turbo frequency,
++ * with control value corresponding to max turbo ratio. But
++ * when cpufreq set policy is called, it will call with this
++ * max frequency, which will cause a reduced performance as
++ * this driver uses real max turbo frequency as the max
++ * frequency. So correct this frequency in _PSS table to
++ * correct max turbo frequency based on the turbo ratio.
++ * Also need to convert to MHz as _PSS freq is in MHz.
++ */
++ turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
++ if (turbo_pss_ctl > cpu->pstate.max_pstate)
++ cpu->acpi_perf_data.states[0].core_frequency =
++ policy->cpuinfo.max_freq / 1000;
++ cpu->valid_pss_table = true;
++ pr_info("_PPC limits will be enforced\n");
++
++ return;
++
++ err:
++ cpu->valid_pss_table = false;
++ acpi_processor_unregister_performance(policy->cpu);
++}
++
++static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
++{
++ struct cpudata *cpu;
++
++ cpu = all_cpu_data[policy->cpu];
++ if (!cpu->valid_pss_table)
++ return;
++
++ acpi_processor_unregister_performance(policy->cpu);
++}
++
++#else
++static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
++{
++}
++
++static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
++{
++}
++#endif
++
+ static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
+ int deadband, int integral) {
+ pid->setpoint = int_tofp(setpoint);
+@@ -341,17 +489,17 @@
+
+ static inline void pid_p_gain_set(struct _pid *pid, int percent)
+ {
+- pid->p_gain = div_fp(int_tofp(percent), int_tofp(100));
++ pid->p_gain = div_fp(percent, 100);
+ }
+
+ static inline void pid_i_gain_set(struct _pid *pid, int percent)
+ {
+- pid->i_gain = div_fp(int_tofp(percent), int_tofp(100));
++ pid->i_gain = div_fp(percent, 100);
+ }
+
+ static inline void pid_d_gain_set(struct _pid *pid, int percent)
+ {
+- pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
++ pid->d_gain = div_fp(percent, 100);
+ }
+
+ static signed int pid_calc(struct _pid *pid, int32_t busy)
+@@ -537,7 +685,7 @@
+
+ total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
+ no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+- turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total));
++ turbo_fp = div_fp(no_turbo, total);
+ turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
+ return sprintf(buf, "%u\n", turbo_pct);
+ }
+@@ -579,7 +727,7 @@
+
+ update_turbo_state();
+ if (limits->turbo_disabled) {
+- pr_warn("intel_pstate: Turbo disabled by BIOS or unavailable on processor\n");
++ pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
+ return -EPERM;
+ }
+
+@@ -608,8 +756,7 @@
+ limits->max_perf_pct);
+ limits->max_perf_pct = max(limits->min_perf_pct,
+ limits->max_perf_pct);
+- limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+- int_tofp(100));
++ limits->max_perf = div_fp(limits->max_perf_pct, 100);
+
+ if (hwp_active)
+ intel_pstate_hwp_set_online_cpus();
+@@ -633,8 +780,7 @@
+ limits->min_perf_pct);
+ limits->min_perf_pct = min(limits->max_perf_pct,
+ limits->min_perf_pct);
+- limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
+- int_tofp(100));
++ limits->min_perf = div_fp(limits->min_perf_pct, 100);
+
+ if (hwp_active)
+ intel_pstate_hwp_set_online_cpus();
+@@ -1019,15 +1165,11 @@
+ intel_pstate_set_min_pstate(cpu);
+ }
+
+-static inline void intel_pstate_calc_busy(struct cpudata *cpu)
++static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
+ {
+ struct sample *sample = &cpu->sample;
+- int64_t core_pct;
+-
+- core_pct = int_tofp(sample->aperf) * int_tofp(100);
+- core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
+
+- sample->core_pct_busy = (int32_t)core_pct;
++ sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
+ }
+
+ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
+@@ -1070,9 +1212,14 @@
+
+ static inline int32_t get_avg_frequency(struct cpudata *cpu)
+ {
+- return fp_toint(mul_fp(cpu->sample.core_pct_busy,
+- int_tofp(cpu->pstate.max_pstate_physical *
+- cpu->pstate.scaling / 100)));
++ return mul_ext_fp(cpu->sample.core_avg_perf,
++ cpu->pstate.max_pstate_physical * cpu->pstate.scaling);
++}
++
++static inline int32_t get_avg_pstate(struct cpudata *cpu)
++{
++ return mul_ext_fp(cpu->pstate.max_pstate_physical,
++ cpu->sample.core_avg_perf);
+ }
+
+ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
+@@ -1107,49 +1254,43 @@
+ cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
+ cpu->sample.busy_scaled = cpu_load;
+
+- return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
++ return get_avg_pstate(cpu) - pid_calc(&cpu->pid, cpu_load);
+ }
+
+ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
+ {
+- int32_t core_busy, max_pstate, current_pstate, sample_ratio;
++ int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
+ u64 duration_ns;
+
+ /*
+- * core_busy is the ratio of actual performance to max
+- * max_pstate is the max non turbo pstate available
+- * current_pstate was the pstate that was requested during
+- * the last sample period.
+- *
+- * We normalize core_busy, which was our actual percent
+- * performance to what we requested during the last sample
+- * period. The result will be a percentage of busy at a
+- * specified pstate.
++ * perf_scaled is the average performance during the last sampling
++ * period scaled by the ratio of the maximum P-state to the P-state
++ * requested last time (in percent). That measures the system's
++ * response to the previous P-state selection.
+ */
+- core_busy = cpu->sample.core_pct_busy;
+- max_pstate = int_tofp(cpu->pstate.max_pstate_physical);
+- current_pstate = int_tofp(cpu->pstate.current_pstate);
+- core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
++ max_pstate = cpu->pstate.max_pstate_physical;
++ current_pstate = cpu->pstate.current_pstate;
++ perf_scaled = mul_ext_fp(cpu->sample.core_avg_perf,
++ div_fp(100 * max_pstate, current_pstate));
+
+ /*
+ * Since our utilization update callback will not run unless we are
+ * in C0, check if the actual elapsed time is significantly greater (3x)
+ * than our sample interval. If it is, then we were idle for a long
+- * enough period of time to adjust our busyness.
++ * enough period of time to adjust our performance metric.
+ */
+ duration_ns = cpu->sample.time - cpu->last_sample_time;
+ if ((s64)duration_ns > pid_params.sample_rate_ns * 3) {
+- sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
+- int_tofp(duration_ns));
+- core_busy = mul_fp(core_busy, sample_ratio);
++ sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns);
++ perf_scaled = mul_fp(perf_scaled, sample_ratio);
+ } else {
+ sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc);
+ if (sample_ratio < int_tofp(1))
+- core_busy = 0;
++ perf_scaled = 0;
+ }
+
+- cpu->sample.busy_scaled = core_busy;
+- return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
++ cpu->sample.busy_scaled = perf_scaled;
++ return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
+ }
+
+ static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
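In the rewritten get_target_pstate_use_performance(), perf_scaled is the measured average performance re-expressed relative to the P-state requested last time, in percent. A standalone walk-through with made-up numbers (APERF/MPERF = 0.6, physical max ratio 24, last requested ratio 16), mirroring the fixed-point helpers from the patch:

/* Standalone walk-through of the perf_scaled computation above.
 * All sample values are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS     8
#define EXT_FRAC_BITS 14

static int64_t div_fp(int64_t x, int64_t y)        { return (x << FRAC_BITS) / y; }
static uint64_t mul_ext_fp(uint64_t x, uint64_t y) { return (x * y) >> EXT_FRAC_BITS; }
static uint64_t div_ext_fp(uint64_t x, uint64_t y) { return (x << EXT_FRAC_BITS) / y; }

int main(void)
{
    uint64_t aperf = 60000, mperf = 100000;     /* hypothetical counter deltas */
    int max_pstate = 24, current_pstate = 16;   /* hypothetical ratios         */

    uint64_t core_avg_perf = div_ext_fp(aperf, mperf);              /* 0.60 */
    uint64_t perf_scaled = mul_ext_fp(core_avg_perf,
                               div_fp(100 * max_pstate, current_pstate));

    /* 0.60 * 24/16 * 100 = 90%: busy relative to what was requested */
    printf("perf_scaled = %.1f%%\n", (double)perf_scaled / (1 << FRAC_BITS));
    return 0;
}

That value, in 24.8 fixed point, is what feeds pid_calc() as the busy input.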
+@@ -1179,7 +1320,7 @@
+ intel_pstate_update_pstate(cpu, target_pstate);
+
+ sample = &cpu->sample;
+- trace_pstate_sample(fp_toint(sample->core_pct_busy),
++ trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
+ fp_toint(sample->busy_scaled),
+ from,
+ cpu->pstate.current_pstate,
+@@ -1199,7 +1340,7 @@
+ bool sample_taken = intel_pstate_sample(cpu, time);
+
+ if (sample_taken) {
+- intel_pstate_calc_busy(cpu);
++ intel_pstate_calc_avg_perf(cpu);
+ if (!hwp_active)
+ intel_pstate_adjust_busy_pstate(cpu);
+ }
+@@ -1261,23 +1402,16 @@
+
+ intel_pstate_busy_pid_reset(cpu);
+
+- cpu->update_util.func = intel_pstate_update_util;
+-
+- pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
++ pr_debug("controlling: cpu %d\n", cpunum);
+
+ return 0;
+ }
+
+ static unsigned int intel_pstate_get(unsigned int cpu_num)
+ {
+- struct sample *sample;
+- struct cpudata *cpu;
++ struct cpudata *cpu = all_cpu_data[cpu_num];
+
+- cpu = all_cpu_data[cpu_num];
+- if (!cpu)
+- return 0;
+- sample = &cpu->sample;
+- return get_avg_frequency(cpu);
++ return cpu ? get_avg_frequency(cpu) : 0;
+ }
+
+ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
+@@ -1286,12 +1420,20 @@
+
+ /* Prevent intel_pstate_update_util() from using stale data. */
+ cpu->sample.time = 0;
+- cpufreq_set_update_util_data(cpu_num, &cpu->update_util);
++ cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
++ intel_pstate_update_util);
++ cpu->update_util_set = true;
+ }
+
+ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
+ {
+- cpufreq_set_update_util_data(cpu, NULL);
++ struct cpudata *cpu_data = all_cpu_data[cpu];
++
++ if (!cpu_data->update_util_set)
++ return;
++
++ cpufreq_remove_update_util_hook(cpu);
++ cpu_data->update_util_set = false;
+ synchronize_sched();
+ }
+
+@@ -1311,20 +1453,31 @@
+
+ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
+ {
++ struct cpudata *cpu;
++
+ if (!policy->cpuinfo.max_freq)
+ return -ENODEV;
+
+ intel_pstate_clear_update_util_hook(policy->cpu);
+
++ cpu = all_cpu_data[0];
++ if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
++ if (policy->max < policy->cpuinfo.max_freq &&
++ policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
++ pr_debug("policy->max > max non turbo frequency\n");
++ policy->max = policy->cpuinfo.max_freq;
++ }
++ }
++
+ if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+ limits = &performance_limits;
+ if (policy->max >= policy->cpuinfo.max_freq) {
+- pr_debug("intel_pstate: set performance\n");
++ pr_debug("set performance\n");
+ intel_pstate_set_performance_limits(limits);
+ goto out;
+ }
+ } else {
+- pr_debug("intel_pstate: set powersave\n");
++ pr_debug("set powersave\n");
+ limits = &powersave_limits;
+ }
+
+@@ -1348,10 +1501,8 @@
+ /* Make sure min_perf_pct <= max_perf_pct */
+ limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+
+- limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
+- int_tofp(100));
+- limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
+- int_tofp(100));
++ limits->min_perf = div_fp(limits->min_perf_pct, 100);
++ limits->max_perf = div_fp(limits->max_perf_pct, 100);
+
+ out:
+ intel_pstate_set_update_util_hook(policy->cpu);
+@@ -1377,7 +1528,7 @@
+ int cpu_num = policy->cpu;
+ struct cpudata *cpu = all_cpu_data[cpu_num];
+
+- pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
++ pr_debug("CPU %d exiting\n", cpu_num);
+
+ intel_pstate_clear_update_util_hook(cpu_num);
+
+@@ -1410,12 +1561,20 @@
+ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
+ policy->cpuinfo.max_freq =
+ cpu->pstate.turbo_pstate * cpu->pstate.scaling;
++ intel_pstate_init_acpi_perf_limits(policy);
+ policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ cpumask_set_cpu(policy->cpu, policy->cpus);
+
+ return 0;
+ }
+
++static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
++{
++ intel_pstate_exit_perf_limits(policy);
++
++ return 0;
++}
++
+ static struct cpufreq_driver intel_pstate_driver = {
+ .flags = CPUFREQ_CONST_LOOPS,
+ .verify = intel_pstate_verify_policy,
+@@ -1423,6 +1582,7 @@
+ .resume = intel_pstate_hwp_set_policy,
+ .get = intel_pstate_get,
+ .init = intel_pstate_cpu_init,
++ .exit = intel_pstate_cpu_exit,
+ .stop_cpu = intel_pstate_stop_cpu,
+ .name = "intel_pstate",
+ };
+@@ -1466,8 +1626,7 @@
+
+ }
+
+-#if IS_ENABLED(CONFIG_ACPI)
+-#include <acpi/processor.h>
++#ifdef CONFIG_ACPI
+
+ static bool intel_pstate_no_acpi_pss(void)
+ {
+@@ -1623,7 +1782,7 @@
+ if (intel_pstate_platform_pwr_mgmt_exists())
+ return -ENODEV;
+
+- pr_info("Intel P-state driver initializing.\n");
++ pr_info("Intel P-state driver initializing\n");
+
+ all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
+ if (!all_cpu_data)
+@@ -1640,7 +1799,7 @@
+ intel_pstate_sysfs_expose_params();
+
+ if (hwp_active)
+- pr_info("intel_pstate: HWP enabled\n");
++ pr_info("HWP enabled\n");
+
+ return rc;
+ out:
+@@ -1666,13 +1825,19 @@
+ if (!strcmp(str, "disable"))
+ no_load = 1;
+ if (!strcmp(str, "no_hwp")) {
+- pr_info("intel_pstate: HWP disabled\n");
++ pr_info("HWP disabled\n");
+ no_hwp = 1;
+ }
+ if (!strcmp(str, "force"))
+ force_load = 1;
+ if (!strcmp(str, "hwp_only"))
+ hwp_only = 1;
++
++#ifdef CONFIG_ACPI
++ if (!strcmp(str, "support_acpi_ppc"))
++ acpi_ppc = true;
++#endif
++
+ return 0;
+ }
+ early_param("intel_pstate", intel_pstate_setup);
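The new support_acpi_ppc token joins the existing disable, no_hwp, force and hwp_only options, all parsed from the same intel_pstate= early parameter. For example, to opt in to _PPC limits on a non-server platform you would boot with a kernel command line containing something like:

    intel_pstate=support_acpi_ppc

On FADT profiles declared as enterprise or performance servers, intel_pstate_get_ppc_enable_status() above enforces the limits without this option.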
+--- linux-4.6/kernel/sched/cpufreq.c.orig 2016-06-24 15:32:20.064495916 -0400
++++ linux-4.6/kernel/sched/cpufreq.c 2016-06-24 15:33:47.717298423 -0400
+@@ -35,3 +35,52 @@
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+ }
+ EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
++
++/**
++ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
++ * @cpu: The CPU to set the pointer for.
++ * @data: New pointer value.
++ * @func: Callback function to set for the CPU.
++ *
++ * Set and publish the update_util_data pointer for the given CPU.
++ *
++ * The update_util_data pointer of @cpu is set to @data and the callback
++ * function pointer in the target struct update_util_data is set to @func.
++ * That function will be called by cpufreq_update_util() from RCU-sched
++ * read-side critical sections, so it must not sleep. @data will always be
++ * passed to it as the first argument which allows the function to get to the
++ * target update_util_data structure and its container.
++ *
++ * The update_util_data pointer of @cpu must be NULL when this function is
++ * called or it will WARN() and return with no effect.
++ */
++void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
++ void (*func)(struct update_util_data *data, u64 time,
++ unsigned long util, unsigned long max))
++{
++ if (WARN_ON(!data || !func))
++ return;
++
++ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
++ return;
++
++ data->func = func;
++ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
++}
++EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
++
++/**
++ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
++ * @cpu: The CPU to clear the pointer for.
++ *
++ * Clear the update_util_data pointer for the given CPU.
++ *
++ * Callers must use RCU-sched callbacks to free any memory that might be
++ * accessed via the old update_util_data pointer or invoke synchronize_sched()
++ * right after this function to avoid use-after-free.
++ */
++void cpufreq_remove_update_util_hook(int cpu)
++{
++ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
++}
++EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
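A hedged sketch (not part of the patch) of how a governor or driver would use the two new hooks, mirroring intel_pstate_set/clear_update_util_hook() above; the "mydrv" names are hypothetical.

#include <linux/kernel.h>
#include <linux/sched.h>

struct mydrv_cpu {
	struct update_util_data update_util;
	bool update_util_set;
};

static void mydrv_update_util(struct update_util_data *data, u64 time,
			      unsigned long util, unsigned long max)
{
	struct mydrv_cpu *c = container_of(data, struct mydrv_cpu, update_util);

	/* Runs in RCU-sched read-side context from the scheduler: must not sleep. */
	(void)c;
}

static void mydrv_start(struct mydrv_cpu *c, int cpu)
{
	cpufreq_add_update_util_hook(cpu, &c->update_util, mydrv_update_util);
	c->update_util_set = true;
}

static void mydrv_stop(struct mydrv_cpu *c, int cpu)
{
	if (!c->update_util_set)
		return;

	cpufreq_remove_update_util_hook(cpu);
	c->update_util_set = false;
	synchronize_sched();	/* wait for in-flight callbacks before freeing */
}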
+--- linux-4.6/include/linux/sched.h.dist 2016-06-24 19:19:15.391657951 -0400
++++ linux-4.6/include/linux/sched.h 2016-06-24 19:21:46.863939933 -0400
+@@ -3241,6 +3241,10 @@
+ };
+
+ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
++void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
++ void (*func)(struct update_util_data *data, u64 time,
++ unsigned long util, unsigned long max));
++void cpufreq_remove_update_util_hook(int cpu);
+ #endif /* CONFIG_CPU_FREQ */
+
+ #endif
+--- linux-4.6/drivers/cpufreq/intel_pstate.c.orig 2016-07-03 10:37:53.324091642 -0400
++++ linux-4.6/drivers/cpufreq/intel_pstate.c 2016-07-03 10:38:50.450757945 -0400
+@@ -372,26 +372,9 @@
+ return acpi_ppc;
+ }
+
+-/*
+- * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
+- * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
+- * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
+- * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
+- * target ratio 0x17. The _PSS control value stores in a format which can be
+- * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
+- * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
+- * This function converts the _PSS control value to intel pstate driver format
+- * for comparison and assignment.
+- */
+-static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
+-{
+- return cpu->acpi_perf_data.states[index].control >> 8;
+-}
+-
+ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+ {
+ struct cpudata *cpu;
+- int turbo_pss_ctl;
+ int ret;
+ int i;
+
+@@ -441,15 +424,14 @@
+ * max frequency, which will cause a reduced performance as
+ * this driver uses real max turbo frequency as the max
+ * frequency. So correct this frequency in _PSS table to
+- * correct max turbo frequency based on the turbo ratio.
++ * correct max turbo frequency based on the turbo state.
+ * Also need to convert to MHz as _PSS freq is in MHz.
+ */
+- turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
+- if (turbo_pss_ctl > cpu->pstate.max_pstate)
++ if (!limits->turbo_disabled)
+ cpu->acpi_perf_data.states[0].core_frequency =
+ policy->cpuinfo.max_freq / 1000;
+ cpu->valid_pss_table = true;
+- pr_info("_PPC limits will be enforced\n");
++ pr_debug("_PPC limits will be enforced\n");
+
+ return;
+
+@@ -1418,6 +1400,9 @@
+ {
+ struct cpudata *cpu = all_cpu_data[cpu_num];
+
++ if (cpu->update_util_set)
++ return;
++
+ /* Prevent intel_pstate_update_util() from using stale data. */
+ cpu->sample.time = 0;
+ cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
+@@ -1458,15 +1443,15 @@
+ if (!policy->cpuinfo.max_freq)
+ return -ENODEV;
+
+- intel_pstate_clear_update_util_hook(policy->cpu);
++ pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
++ policy->cpuinfo.max_freq, policy->max);
+
+ cpu = all_cpu_data[0];
+- if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
+- if (policy->max < policy->cpuinfo.max_freq &&
+- policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
+- pr_debug("policy->max > max non turbo frequency\n");
+- policy->max = policy->cpuinfo.max_freq;
+- }
++ if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
++ policy->max < policy->cpuinfo.max_freq &&
++ policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
++ pr_debug("policy->max > max non turbo frequency\n");
++ policy->max = policy->cpuinfo.max_freq;
+ }
+
+ if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+@@ -1496,13 +1481,13 @@
+ limits->max_sysfs_pct);
+ limits->max_perf_pct = max(limits->min_policy_pct,
+ limits->max_perf_pct);
+- limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+
+ /* Make sure min_perf_pct <= max_perf_pct */
+ limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+
+ limits->min_perf = div_fp(limits->min_perf_pct, 100);
+ limits->max_perf = div_fp(limits->max_perf_pct, 100);
++ limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+
+ out:
+ intel_pstate_set_update_util_hook(policy->cpu);
+@@ -1559,8 +1544,11 @@
+
+ /* cpuinfo and default policy values */
+ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
+- policy->cpuinfo.max_freq =
+- cpu->pstate.turbo_pstate * cpu->pstate.scaling;
++ update_turbo_state();
++ policy->cpuinfo.max_freq = limits->turbo_disabled ?
++ cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
++ policy->cpuinfo.max_freq *= cpu->pstate.scaling;
++
+ intel_pstate_init_acpi_perf_limits(policy);
+ policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ cpumask_set_cpu(policy->cpu, policy->cpus);
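The last hunk makes cpuinfo.max_freq honor the turbo state at init time. A tiny standalone illustration with hypothetical ratios (non-turbo 24, turbo 32, scaling 100000), not values from the patch:

/* Hypothetical numbers: with turbo disabled in the BIOS the hunk above
 * now reports 2400000 kHz instead of 3200000 kHz. */
#include <stdio.h>

int main(void)
{
    int scaling = 100000, max_pstate = 24, turbo_pstate = 32;
    int turbo_disabled = 1;    /* as refreshed by update_turbo_state() */
    int max_freq = (turbo_disabled ? max_pstate : turbo_pstate) * scaling;

    printf("cpuinfo.max_freq = %d kHz\n", max_freq);   /* 2400000 */
    return 0;
}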