diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab9..322764e0405 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0..da26f46d50d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + if (sysctl_sched_granularity > limit) + sysctl_sched_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2a..0ba1e60f08d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,23 +15,32 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ /* - * Preemption granularity: - * (default: 10 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -212,6 +221,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +/* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) + */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_granularity); + } + + return gran; +} + /* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085..9e3d2960faf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -231,6 +231,17 @@ static ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns",