This article assumes the kernel is configured with CONFIG_PREEMPT_NONE.
Assume process H has a higher priority than process L.
H enters the kernel via sys_read() to perform IO. Because the IO is slow, after submit_bio() it voluntarily sleeps: it puts itself on a wait queue, sets its state to TASK_INTERRUPTIBLE, and calls schedule() to give up the CPU. Suppose L is scheduled next and becomes current, and L has also entered the kernel (via a system call or some other path). While L is running, the IO completes and raises an interrupt, so the CPU runs the ISR: it wakes H, compares priorities, and because H's priority is higher than current's (L's), it sets TIF_NEED_RESCHED = 1 in current's thread_info and sets H's state to TASK_RUNNING. When the ISR returns, it returns to kernel mode, so no scheduling takes place and L keeps running in the kernel. Later a timer interrupt fires, and L's runtime accounting happens to meet CFS's criterion for requesting a reschedule, so TIF_NEED_RESCHED = 1 is set again. Returning from that ISR also lands back in kernel mode, so still no scheduling. Only when L finally returns to user mode is TIF_NEED_RESCHED checked, and only then does H become current. H first removes the wait descriptor it inserted into the wait queue before sleeping, then continues running.
So this preemption is a kernel-mode concept: being switched out on the way back to user mode is not called preemption.
The kernel sets TIF_NEED_RESCHED in the following two cases:
1. try_to_wake_up() wakes a process p; if p's priority is higher than current's, it sets TIF_NEED_RESCHED = 1.
2. The timer interrupt handler updates the current process's time accounting; if the scheduler's criterion is met, it sets TIF_NEED_RESCHED = 1.
It used to be "time slice == 0" that triggered this; CFS has its own criterion, but the flag is still set in the same place. A simplified sketch of both paths follows.
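To make the ordering concrete, here is a small userspace simulation of the two set points and the single check point under CONFIG_PREEMPT_NONE. None of these names are kernel APIs; it is only an illustration of the flow described above.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative simulation of TIF_NEED_RESCHED handling under
 * CONFIG_PREEMPT_NONE; none of these names are real kernel APIs. */
struct task {
	const char *name;
	int prio;               /* lower value = higher priority */
	bool need_resched;      /* stands in for TIF_NEED_RESCHED */
};

static struct task L = { "L", 120, false };
static struct task H = { "H", 100, false };
static struct task *cur = &L;

/* Case 1: the try_to_wake_up() path, the woken task beats current. */
static void wake_up_task(struct task *p)
{
	if (p->prio < cur->prio)
		cur->need_resched = true;
}

/* Case 2: the scheduler_tick() path, current ran past its slice. */
static void timer_tick(unsigned long ran_ns, unsigned long slice_ns)
{
	if (ran_ns > slice_ns)
		cur->need_resched = true;
}

/* With CONFIG_PREEMPT_NONE the flag is only honoured here, on the way
 * back to user space, not on return from an interrupt to kernel mode. */
static void return_to_user(void)
{
	if (cur->need_resched) {
		cur->need_resched = false;
		cur = &H;               /* schedule() would pick H */
	}
}

int main(void)
{
	wake_up_task(&H);               /* ISR for the finished IO wakes H */
	timer_tick(6000000, 4000000);   /* a later tick: L exceeded its slice */
	printf("still current: %s\n", cur->name);               /* L */
	return_to_user();
	printf("after return to user: %s\n", cur->name);        /* H */
	return 0;
}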
Let's walk through the chain of functions pulled in by wait_event_interruptible():
267 /**
268 * wait_event_interruptible - sleep until a condition gets true
269 * @wq: the waitqueue to wait on
270 * @condition: a C expression for the event to wait for
271 *
272 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
273 * @condition evaluates to true or a signal is received.
274 * The @condition is checked each time the waitqueue @wq is woken up.
275 *
276 * wake_up() has to be called after changing any variable that could
277 * change the result of the wait condition.
278 *
279 * The function will return -ERESTARTSYS if it was interrupted by a
280 * signal and 0 if @condition evaluated to true.
281 */
282 #define wait_event_interruptible(wq, condition) \
283 ({ \
284 int __ret = 0; \
285 if (!(condition)) \
286 __wait_event_interruptible(wq, condition, __ret); \
287 __ret; \
288 })
249 #define __wait_event_interruptible(wq, condition, ret) \
250 do { \
251 DEFINE_WAIT(__wait); \
252 \
253 for (;;) { \
254 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
255 if (condition) \
256 break; \
257 if (!signal_pending(current)) { \
258 schedule(); \
259 continue; \
260 } \
261 ret = -ERESTARTSYS; \
262 break; \
263 } \
264 finish_wait(&wq, &__wait); \
265 } while (0)
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
448 #define DEFINE_WAIT_FUNC(name, function) \
449 wait_queue_t name = { \
450 .private = current, \
451 .func = function, \
452 .task_list = LIST_HEAD_INIT((name).task_list), \
453 }
typedef struct __wait_queue wait_queue_t;
32 struct __wait_queue {
33 unsigned int flags;
34 #define WQ_FLAG_EXCLUSIVE 0x01
35 void *private;
36 wait_queue_func_t func;
37 struct list_head task_list;
38 };
This initializes the wait queue entry; note the wakeup function it installs, autoremove_wake_function. prepare_to_wait() then adds the entry to the wait queue and sets the task state to whatever was passed in (TASK_INTERRUPTIBLE here):
55 /*
56 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active
59 * will be guaranteed to see waitqueue addition _or_ subsequent
60 * tests in this thread will see the wakeup having taken place.
61 *
62 * The spin_unlock() itself is semi-permeable and only protects
63 * one way (it only protects stuff inside the critical region and
64 * stops them from bleeding out - it would still allow subsequent
65 * loads to move into the critical region).
66 */
67 void
68 prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
69 {
70 unsigned long flags;
71
72 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
73 spin_lock_irqsave(&q->lock, flags);
74 if (list_empty(&wait->task_list))
75 __add_wait_queue(q, wait);
76 set_current_state(state);
77 spin_unlock_irqrestore(&q->lock, flags);
78 }
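As a side note, here is a minimal sketch of how this pair of APIs is typically used from a driver. It is only a fragment (the usual module boilerplate is assumed around it, and the demo_* names are made up): the sleeper calls wait_event_interruptible(), and the completion path changes the condition first and then calls wake_up, exactly as the comment above wait_event_interruptible() demands.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_data_ready;

/* Sleeper side: what a ->read() handler would do while no data exists. */
static int demo_wait_for_data(void)
{
	int ret = wait_event_interruptible(demo_wq, demo_data_ready != 0);
	if (ret)                        /* -ERESTARTSYS: a signal arrived first */
		return ret;
	demo_data_ready = 0;
	return 0;
}

/* Waker side: what the ISR or bottom half does when the data shows up.
 * Update the condition first, then wake the sleepers. */
static void demo_data_arrived(void)
{
	demo_data_ready = 1;
	wake_up_interruptible(&demo_wq);
}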
Now let's see how the wake_up path sets TIF_NEED_RESCHED:
163 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
164 {
165 int ret = default_wake_function(wait, mode, sync, key);
166
167 if (ret)
168 list_del_init(&wait->task_list);
169 return ret;
170 }
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
}
2353 /***
2354 * try_to_wake_up - wake up a thread
2355 * @p: the to-be-woken-up thread
2356 * @state: the mask of task states that can be woken
2357 * @sync: do a synchronous wakeup?
2358 *
2359 * Put it on the run-queue if it's not already there. The "current"
2360 * thread is always on the run-queue (except when the actual
2361 * re-schedule is in progress), and as such you're allowed to do
2362 * the simpler "current->state = TASK_RUNNING" to mark yourself
2363 * runnable without the overhead of this.
2364 *
2365 * returns failure only if the task is already active.
2366 */
2367 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2368 int wake_flags)
2369 {
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags;
2372 struct rq *rq, *orig_rq;
2373
2374 if (!sched_feat(SYNC_WAKEUPS))
2375 wake_flags &= ~WF_SYNC;
2376
2377 this_cpu = get_cpu();
2378
2379 smp_wmb();
2380 rq = orig_rq = task_rq_lock(p, &flags);
2381 update_rq_clock(rq);
2382 if (!(p->state & state))
2383 goto out;
2384
2385 if (p->se.on_rq)
2386 goto out_running;
2387
2388 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390
2391 #ifdef CONFIG_SMP
2392 if (unlikely(task_running(rq, p)))
2393 goto out_activate;
2394
2395 /*
2396 * In order to handle concurrent wakeups and release the rq->lock
2397 * we put the task in TASK_WAKING state.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */
2401 if (task_contributes_to_load(p))
2402 rq->nr_uninterruptible--;
2403 p->state = TASK_WAKING;
2404
2405 if (p->sched_class->task_waking)
2406 p->sched_class->task_waking(rq, p);
2407
2408 __task_rq_unlock(rq);
2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu)
2412 set_task_cpu(p, cpu);
2413
2414 rq = __task_rq_lock(p);
2415 update_rq_clock(rq);
2416
2417 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419
2420 #ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count);
2422 if (cpu == this_cpu)
2423 schedstat_inc(rq, ttwu_local);
2424 else {
2425 struct sched_domain *sd;
2426 for_each_domain(this_cpu, sd) {
2427 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2428 schedstat_inc(sd, ttwu_wake_remote);
2429 break;
2430 }
2431 }
2432 }
2433 #endif /* CONFIG_SCHEDSTATS */
2434
2435 out_activate:
2436 #endif /* CONFIG_SMP */
2437 schedstat_inc(p, se.nr_wakeups);
2438 if (wake_flags & WF_SYNC)
2439 schedstat_inc(p, se.nr_wakeups_sync);
2440 if (orig_cpu != cpu)
2441 schedstat_inc(p, se.nr_wakeups_migrate);
2442 if (cpu == this_cpu)
2443 schedstat_inc(p, se.nr_wakeups_local);
2444 else
2445 schedstat_inc(p, se.nr_wakeups_remote);
2446 activate_task(rq, p, 1);
2447 success = 1;
2448
2449 /*
2450 * Only attribute actual wakeups done by this task.
2451 */
2452 if (!in_interrupt()) {
2453 struct sched_entity *se = &current->se;
2454 u64 sample = se->sum_exec_runtime;
2455
2456 if (se->last_wakeup)
2457 sample -= se->last_wakeup;
2458 else
2459 sample -= se->start_runtime;
2460 update_avg(&se->avg_wakeup, sample);
2461
2462 se->last_wakeup = se->sum_exec_runtime;
2463 }
2464
2465 out_running:
2466 trace_sched_wakeup(rq, p, success);
2467 check_preempt_curr(rq, p, wake_flags);
2468
2469 p->state = TASK_RUNNING;
2470 #ifdef CONFIG_SMP
2471 if (p->sched_class->task_woken)
2472 p->sched_class->task_woken(rq, p);
2473
2474 if (unlikely(rq->idle_stamp)) {
2475 u64 delta = rq->clock - rq->idle_stamp;
2476 u64 max = 2*sysctl_sched_migration_cost;
2477
2478 if (delta > max)
2479 rq->avg_idle = max;
2480 else
2481 update_avg(&rq->avg_idle, delta);
2482 rq->idle_stamp = 0;
2483 }
2484 #endif
2485 out:
2486 task_rq_unlock(rq, &flags);
2487 put_cpu();
2488
2489 return success;
2490 }
The parameter p is the thread to be woken up (@p in the comment above).
TIF_NEED_RESCHED is set inside check_preempt_curr().
Note put_cpu(): only when CONFIG_PREEMPT is set does put_cpu() (that is, preempt_enable()) go on to call schedule(); with CONFIG_PREEMPT_NONE it does not reschedule and simply returns. So as long as check_preempt_curr() never calls schedule(), the behavior matches the description at the beginning.
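For reference, the definitions behind put_cpu() look roughly like the following (paraphrased from include/linux/smp.h and include/linux/preempt.h of this kernel generation; check your own tree for the exact text). Under CONFIG_PREEMPT_NONE, preempt_enable() compiles away to nothing, which is why put_cpu() cannot trigger a reschedule here.

/* include/linux/smp.h (roughly) */
#define get_cpu()	({ preempt_disable(); smp_processor_id(); })
#define put_cpu()	preempt_enable()

/* include/linux/preempt.h (roughly) */
#ifdef CONFIG_PREEMPT
#define preempt_enable() \
do { \
	preempt_enable_no_resched(); \
	barrier(); \
	preempt_check_resched(); \
} while (0)

#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)
#else
#define preempt_disable()	do { } while (0)
#define preempt_enable()	do { } while (0)
#endif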
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}
Next, find where the check_preempt_curr method is assigned.
# grep -r check_preempt_curr kernel/ | less    turns up three places
1. # vi kernel/sched_rt.c    the comment at the top of the file says:
/*
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
This class serves SCHED_FIFO and SCHED_RR.
2. # vi kernel/sched_idletask.c    the idle-task scheduling class. Tracing back to the policy definitions in include/linux/sched.h, note that SCHED_ISO (value 4) is still only reserved:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
3. # vi kernel/sched_fair.c    here is the assignment we are after:
.check_preempt_curr = check_preempt_wakeup,
Now look at check_preempt_wakeup():
1699 /*
1700 * Preempt the current task with a newly woken task if needed:
1701 */
1702 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1703 {
1704 struct task_struct *curr = rq->curr;
1705 struct sched_entity *se = &curr->se, *pse = &p->se;
1706 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1707 int sync = wake_flags & WF_SYNC;
1708 int scale = cfs_rq->nr_running >= sched_nr_latency;
1709
1710 if (unlikely(rt_prio(p->prio)))
1711 goto preempt;
1712
1713 if (unlikely(p->sched_class != &fair_sched_class))
1714 return;
1715
1716 if (unlikely(se == pse))
1717 return;
1718
1719 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1720 set_next_buddy(pse);
1721
1722 /*
1723 * We can come here with TIF_NEED_RESCHED already set from new task
1724 * wake up path.
1725 */
1726 if (test_tsk_need_resched(curr))
1727 return;
1728
1729 /*
1730 * Batch and idle tasks do not preempt (their preemption is driven by
1731 * the tick):
1732 */
1733 if (unlikely(p->policy != SCHED_NORMAL))
1734 return;
1735
1736 /* Idle tasks are by definition preempted by everybody. */
1737 if (unlikely(curr->policy == SCHED_IDLE))
1738 goto preempt;
1739
1740 if (sched_feat(WAKEUP_SYNC) && sync)
1741 goto preempt;
1742
1743 if (sched_feat(WAKEUP_OVERLAP) &&
1744 se->avg_overlap < sysctl_sched_migration_cost &&
1745 pse->avg_overlap < sysctl_sched_migration_cost)
1746 goto preempt;
1747
1748 if (!sched_feat(WAKEUP_PREEMPT))
1749 return;
1750
1751 update_curr(cfs_rq);
1752 find_matching_se(&se, &pse);
1753 BUG_ON(!pse);
1754 if (wakeup_preempt_entity(se, pse) == 1)
1755 goto preempt;
1756
1757 return;
1758
1759 preempt:
1760 resched_task(curr);
1761 /*
1762 * Only set the backward buddy when the current task is still
1763 * on the rq. This can happen when a wakeup gets interleaved
1764 * with schedule on the ->pre_schedule() or idle_balance()
1765 * point, either of which can * drop the rq lock.
1766 *
1767 * Also, during early boot the idle thread is in the fair class,
1768 * for obvious reasons its a bad idea to schedule back to it.
1769 */
1770 if (unlikely(!se->on_rq || curr == rq->idle))
1771 return;
1772
1773 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1774 set_last_buddy(se);
1775 }
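The wakeup_preempt_entity() call near the end is not listed here; the rule it implements is roughly: the woken entity preempts only when its vruntime lags current's by more than a wakeup granularity. A small userspace sketch of that rule with made-up numbers (the real granularity comes from wakeup_gran(), and the real function returns -1/0/1):

#include <stdio.h>

typedef long long s64;

/* Illustrative version of the wakeup preemption rule; values are made up. */
static int should_preempt(s64 curr_vruntime, s64 woken_vruntime, s64 gran)
{
	s64 vdiff = curr_vruntime - woken_vruntime;

	if (vdiff <= 0)
		return 0;       /* woken task is not behind: no preemption */
	if (vdiff > gran)
		return 1;       /* far enough behind: goto preempt, resched_task() */
	return 0;               /* within the granularity: let current keep running */
}

int main(void)
{
	s64 gran = 1000000;     /* pretend 1 ms of virtual runtime */

	printf("%d\n", should_preempt(5000000, 2000000, gran));  /* 1: preempt */
	printf("%d\n", should_preempt(5000000, 4500000, gran));  /* 0: too close */
	return 0;
}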
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
It also contains a key function, resched_task():
1178 static void resched_task(struct task_struct *p)
1179 {
1180 int cpu;
1181
1182 assert_raw_spin_locked(&task_rq(p)->lock);
1183
1184 if (test_tsk_need_resched(p))
1185 return;
1186
1187 set_tsk_need_resched(p);
1188
1189 cpu = task_cpu(p);
1190 if (cpu == smp_processor_id())
1191 return;
1192
1193 /* NEED_RESCHED must be visible before we test polling */
1194 smp_mb();
1195 if (!tsk_is_polling(p))
1196 smp_send_reschedule(cpu);
1197 }
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
There is indeed no call to schedule() anywhere on this path. Now that p (H in our scenario) holds the CPU again, it cleans up its entry in the wait queue:
wait_event_interruptible() -> __wait_event_interruptible() -> finish_wait()
95 /*
96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on
98 * @wait: wait descriptor
99 *
100 * Sets current thread back to running state and removes
101 * the wait descriptor from the given waitqueue if still
102 * queued.
103 */
104 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
105 {
106 unsigned long flags;
107
108 __set_current_state(TASK_RUNNING);
109 /*
110 * We can check for list emptiness outside the lock
111 * IFF:
112 * - we use the "careful" check that verifies both
113 * the next and prev pointers, so that there cannot
114 * be any half-pending updates in progress on other
115 * CPU's that we haven't seen yet (and that might
116 * still change the stack area.
117 * and
118 * - all other users take the lock (ie we can only
119 * have _one_ other CPU that looks at or modifies
120 * the list).
121 */
122 if (!list_empty_careful(&wait->task_list)) {
123 spin_lock_irqsave(&q->lock, flags);
124 list_del_init(&wait->task_list);
125 spin_unlock_irqrestore(&q->lock, flags);
126 }
127 }
Now let's see where the timer interrupt handler sets TIF_NEED_RESCHED.
The timekeeping hardware here is the PIT (the "timer" line in the output below), which produces the classic timer interrupt; only the PIT case is discussed here.
[root@gridserver ~]# cat /proc/interrupts
CPU0 CPU1
0: 75698257 75667103 IO-APIC-edge timer
1: 161 57 IO-APIC-edge i8042
8: 1 0 IO-APIC-edge rtc
9: 0 0 IO-APIC-level acpi
12: 92 0 IO-APIC-edge i8042
14: 19 0 IO-APIC-edge ide0
169: 261 0 IO-APIC-level HDA Intel, uhci_hcd
177: 1 0 IO-APIC-level ehci_hcd, uhci_hcd
185: 103842 101556 IO-APIC-level libata, uhci_hcd
193: 0 0 IO-APIC-level uhci_hcd
201: 48 1597890 IO-APIC-level eth1
209: 321688 0 IO-APIC-level eth0
NMI: 0 0
LOC: 151375315 151375347
ERR: 0
MIS: 0
Here the timer is IRQ 0 and the rtc is IRQ 8. You can also see that interrupt balancing is working well: the interrupts are spread evenly across the two cores.
To find the callback function, look at how it is initialized during system boot:
start_kernel() -> time_init()
/*
* Initialize TSC and delay the periodic timer init to
* late x86_late_time_init() so ioremap works.
*/
void __init time_init(void)
{
late_time_init = x86_late_time_init;
}
static __init void x86_late_time_init(void)
{
x86_init.timers.timer_init();
tsc_init();
}
/* Default timer init function */
void __init hpet_time_init(void)
{
if (!hpet_enable())
setup_pit_timer();
setup_default_timer_irq();
}
void __init setup_default_timer_irq(void)
{
setup_irq(0, &irq0);
}
static struct irqaction irq0 = {
.handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
.name = "timer"
};
So the timer interrupt handler is timer_interrupt():
58 /*
59 * Default timer interrupt handler for PIT/HPET
60 */
61 static irqreturn_t timer_interrupt(int irq, void *dev_id)
62 {
63 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs);
65
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event);
81
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
83 if (MCA_bus)
84 outb_p(inb_p(0x61)| 0x80, 0x61);
85
86 return IRQ_HANDLED;
87 }
Another callback; find where it is initialized:
# grep -r event_handler kernel/time/ | less    it is set here, in kernel/time/tick-broadcast.c:
/*
* Set the periodic handler depending on broadcast on/off
*/
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
if (!broadcast)
dev->event_handler = tick_handle_periodic;
else
dev->event_handler = tick_handle_periodic_broadcast;
}
76 /*
77 * Event handler for periodic ticks
78 */
79 void tick_handle_periodic(struct clock_event_device *dev)
80 {
81 int cpu = smp_processor_id();
82 ktime_t next;
83
84 tick_periodic(cpu);
85
86 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
87 return;
88 /*
89 * Setup the next period for devices, which do not have
90 * periodic mode:
91 */
92 next = ktime_add(dev->next_event, tick_period);
93 for (;;) {
94 if (!clockevents_program_event(dev, next, ktime_get()))
95 return;
96 /*
97 * Have to be careful here. If we're in oneshot mode,
98 * before we call tick_periodic() in a loop, we need
99 * to be sure we're using a real hardware clocksource.
100 * Otherwise we could get trapped in an infinite
101 * loop, as the tick_periodic() increments jiffies,
102 * when then will increment time, posibly causing
103 * the loop to trigger again and again.
104 */
105 if (timekeeping_valid_for_hres())
106 tick_periodic(cpu);
107 next = ktime_add(next, tick_period);
108 }
109 }
tick_handle_periodic() -> tick_periodic()
57 /*
58 * Periodic tick
59 */
60 static void tick_periodic(int cpu)
61 {
62 if (tick_do_timer_cpu == cpu) {
63 write_seqlock(&xtime_lock);
64
65 /* Keep track of the next tick event */
66 tick_next_period = ktime_add(tick_next_period, tick_period);
67
68 do_timer(1);
69 write_sequnlock(&xtime_lock);
70 }
71
72 update_process_times(user_mode(get_irq_regs()));
73 profile_tick(CPU_PROFILING);
74 }
do_timer() mainly does update_wall_time() and the load calculation; we will come back to it at the end.
update_process_times() updates the per-process time accounting that the scheduler relies on:
tick_handle_periodic() -> tick_periodic() -> update_process_times()
1187 /*
1188 * Called from the timer interrupt handler to charge one tick to the current
1189 * process. user_tick is 1 if the tick is user time, 0 for system.
1190 */
1191 void update_process_times(int user_tick)
1192 {
1193 struct task_struct *p = current;
1194 int cpu = smp_processor_id();
1195
1196 /* Note: this timer irq context must be accounted for as well. */
1197 account_process_tick(p, user_tick);
1198 run_local_timers();
1199 rcu_check_callbacks(cpu, user_tick);
1200 printk_tick();
1201 perf_event_do_pending();
1202 scheduler_tick();
1203 run_posix_cpu_timers(p);
1204 }
account_process_tick() charges the tick to the process's system time or user time,
run_local_timers() belongs to the timer subsystem: it does raise_softirq(TIMER_SOFTIRQ),
and scheduler_tick() is where the reschedule flag may be set.
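Before following scheduler_tick(), here is a minimal userspace sketch of the decision account_process_tick() makes. It is only an illustration: the real function charges one full jiffy through account_user_time()/account_system_time()/account_idle_time(); the names below are made up.

#include <stdio.h>

/* Which bucket one tick is charged to, depending on whether the timer
 * interrupt arrived while the CPU was executing user code. Illustrative only. */
enum bucket { USER_TIME, SYSTEM_TIME, IDLE_TIME };

static enum bucket charge_tick(int user_tick, int current_is_idle)
{
	if (user_tick)
		return USER_TIME;       /* tick landed in user mode */
	if (!current_is_idle)
		return SYSTEM_TIME;     /* tick landed in kernel mode */
	return IDLE_TIME;               /* tick hit the idle task */
}

int main(void)
{
	printf("%d %d %d\n",
	       charge_tick(1, 0),       /* 0: USER_TIME */
	       charge_tick(0, 0),       /* 1: SYSTEM_TIME */
	       charge_tick(0, 1));      /* 2: IDLE_TIME */
	return 0;
}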
tick_handle_periodic() -> tick_periodic() -> update_process_times() -> scheduler_tick()
5291 /*
5292 * This function gets called by the timer code, with HZ frequency.
5293 * We call it with interrupts disabled.
5294 *
5295 * It also gets called by the fork code, when changing the parent's
5296 * timeslices.
5297 */
5298 void scheduler_tick(void)
5299 {
5300 int cpu = smp_processor_id();
5301 struct rq *rq = cpu_rq(cpu);
5302 struct task_struct *curr = rq->curr;
5303
5304 sched_clock_tick();
5305
5306 raw_spin_lock(&rq->lock);
5307 update_rq_clock(rq);
5308 update_cpu_load(rq);
5309 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock);
5311
5312 perf_event_task_tick(curr, cpu);
5313
5314 #ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu);
5316 trigger_load_balance(rq, cpu);
5317 #endif
5318 }
To find where task_tick is assigned: # grep -r task_tick kernel/ | less turns up three hits, the same classes we saw above,
so go straight to kernel/sched_fair.c:
# vi kernel/sched_fair.c
.task_tick = task_tick_fair,
task_tick_fair() is the one we want. CFS no longer relies on a fixed time slice, but that is only a matter of policy: whenever a timer interrupt fires, the tick handler still takes its chance to set the reschedule flag.
1967 /*
1968 * scheduler tick hitting a task of our scheduling class:
1969 */
1970 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1971 {
1972 struct cfs_rq *cfs_rq;
1973 struct sched_entity *se = &curr->se;
1974
1975 for_each_sched_entity(se) {
1976 cfs_rq = cfs_rq_of(se);
1977 entity_tick(cfs_rq, se, queued);
1978 }
1979 }
task_tick_fair() -> entity_tick()
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
check_preempt_tick() below is where the decision to set the reschedule flag is actually made:
task_tick_fair() -> entity_tick() -> check_preempt_tick()
852 /*
853 * Preempt the current task with a newly woken task if needed:
854 */
855 static void
856 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
857 {
858 unsigned long ideal_runtime, delta_exec;
859
860 ideal_runtime = sched_slice(cfs_rq, curr);
861 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
862 if (delta_exec > ideal_runtime) {
863 resched_task(rq_of(cfs_rq)->curr);
864 /*
865 * The current task ran long enough, ensure it doesn't get
866 * re-elected due to buddy favours.
867 */
868 clear_buddies(cfs_rq, curr);
869 return;
870 }
871
872 /*
873 * Ensure that a task that missed wakeup preemption by a
874 * narrow margin doesn't have to wait for a full slice.
875 * This also mitigates buddy induced latencies under load.
876 */
877 if (!sched_feat(WAKEUP_PREEMPT))
878 return;
879
880 if (delta_exec < sysctl_sched_min_granularity)
881 return;
882
883 if (cfs_rq->nr_running > 1) {
884 struct sched_entity *se = __pick_next_entity(cfs_rq);
885 s64 delta = curr->vruntime - se->vruntime;
886
887 if (delta > ideal_runtime)
888 resched_task(rq_of(cfs_rq)->curr);
889 }
890 }
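A worked numeric sketch of the first test above: suppose sched_slice() hands the task an ideal slice of 4 ms; once the runtime accumulated since it was last picked (delta_exec) exceeds that, resched_task() is called. The numbers below are made up; everything is in nanoseconds, matching the sched_entity fields.

#include <stdio.h>

/* Mirrors the delta_exec > ideal_runtime test in check_preempt_tick();
 * values are invented for illustration. */
int main(void)
{
	unsigned long long ideal_runtime = 4000000;     /* what sched_slice() returned */
	unsigned long long sum_exec      = 97000000;    /* curr->sum_exec_runtime */
	unsigned long long prev_sum_exec = 92500000;    /* curr->prev_sum_exec_runtime */
	unsigned long long delta_exec    = sum_exec - prev_sum_exec;   /* 4.5 ms */

	if (delta_exec > ideal_runtime)
		printf("resched_task(): set TIF_NEED_RESCHED\n");
	else
		printf("keep running\n");
	return 0;
}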
1165 /*
1166 * resched_task - mark a task 'to be rescheduled now'.
1167 *
1168 * On UP this means the setting of the need_resched flag, on SMP it
1169 * might also involve a cross-CPU call to trigger the scheduler on
1170 * the target CPU.
1171 */
1172 #ifdef CONFIG_SMP
1173
1174 #ifndef tsk_is_polling
1175 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1176 #endif
1177
1178 static void resched_task(struct task_struct *p)
1179 {
1180 int cpu;
1181
1182 assert_raw_spin_locked(&task_rq(p)->lock);
1183
1184 if (test_tsk_need_resched(p))
1185 return;
1186
1187 set_tsk_need_resched(p);
1188
1189 cpu = task_cpu(p);
1190 if (cpu == smp_processor_id())
1191 return;
1192
1193 /* NEED_RESCHED must be visible before we test polling */
1194 smp_mb();
1195 if (!tsk_is_polling(p))
1196 smp_send_reschedule(cpu);
1197 }
As the comment says, this only marks a task 'to be rescheduled now'; it does not immediately force a new process onto the CPU.
set_tsk_need_resched() sets the TIF_NEED_RESCHED flag.
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
Finally, let's look at do_timer(), which is mostly about update_wall_time().
1229 /*
1230 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1231 * without sampling the sequence number in xtime_lock.
1232 * jiffies is defined in the linker script...
1233 */
1234
1235 void do_timer(unsigned long ticks)
1236 {
1237 jiffies_64 += ticks;
1238 update_wall_time();
1239 calc_global_load();
1240 }
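As a quick sanity check on what do_timer(1) means: each call advances jiffies_64 by one tick, and a tick is 1/HZ seconds. A tiny sketch of that arithmetic (HZ = 1000 is just an assumed configuration):

#include <stdio.h>

#define HZ 1000                         /* assumed kernel configuration */

int main(void)
{
	unsigned long long jiffies_64 = 0;
	unsigned long ticks = 250;      /* pretend 250 timer interrupts fired */

	jiffies_64 += ticks;            /* what do_timer(ticks) does first */
	printf("elapsed: %llu ms\n", jiffies_64 * 1000ULL / HZ);  /* 250 ms */
	return 0;
}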
do_timer() -> update_wall_time():
781 /**
782 * update_wall_time - Uses the current clocksource to increment the wall time
783 *
784 * Called from the timer interrupt, must hold a write on xtime_lock.
785 */
786 void update_wall_time(void)
787 {
788 struct clocksource *clock;
789 cycle_t offset;
790 u64 nsecs;
791 int shift = 0, maxshift;
792
793 /* Make sure we're fully resumed: */
794 if (unlikely(timekeeping_suspended))
795 return;
796
797 clock = timekeeper.clock;
798 #ifdef CONFIG_GENERIC_TIME
799 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
800 #else
801 offset = timekeeper.cycle_interval;
802 #endif
803 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
804
805 /*
806 * With NO_HZ we may have to accumulate many cycle_intervals
807 * (think "ticks") worth of time at once. To do this efficiently,
808 * we calculate the largest doubling multiple of cycle_intervals
809 * that is smaller then the offset. We then accumulate that
810 * chunk in one go, and then try to consume the next smaller
811 * doubled multiple.
812 */
813 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
814 shift = max(0, shift);
815 /* Bound shift to one less then what overflows tick_length */
816 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
817 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift);
820 shift--;
821 }
822
823 /* correct the clock when NTP error is too big */
824 timekeeping_adjust(offset);
825
826 /*
827 * Since in the loop above, we accumulate any amount of time
828 * in xtime_nsec over a second into xtime.tv_sec, its possible for
829 * xtime_nsec to be fairly small after the loop. Further, if we're
830 * slightly speeding the clocksource up in timekeeping_adjust(),
831 * its possible the required corrective factor to xtime_nsec could
832 * cause it to underflow.
833 *
834 * Now, we cannot simply roll the accumulated second back, since
835 * the NTP subsystem has been notified via second_overflow. So
836 * instead we push xtime_nsec forward by the amount we underflowed,
837 * and add that amount into the error.
838 *
839 * We'll correct this error next time through this function, when
840 * xtime_nsec is not as small.
841 */
842 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
843 s64 neg = -(s64)timekeeper.xtime_nsec;
844 timekeeper.xtime_nsec = 0;
845 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
846 }
847
848 /* store full nanoseconds into xtime after rounding it up and
849 * add the remainder to the error difference.
850 */
851 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
852 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
853 timekeeper.ntp_error += timekeeper.xtime_nsec <<
854 timekeeper.ntp_error_shift;
855
856 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
857 update_xtime_cache(nsecs);
858
859 /* check to see if there is a new clocksource to use */
860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
861 }
A lot of the NTP work revolves around wall time. For example, the first ntpdate run may report a large offset; run it a couple more times and it settles:
[root@cn122 linux-2.6.33]# ntpdate 210.72.145.44
21 Mar 17:15:53 ntpdate[14157]: step time server 210.72.145.44 offset 133.343336 sec
[root@cn122 linux-2.6.33]# ntpdate 210.72.145.44
21 Mar 17:15:56 ntpdate[14158]: adjust time server 210.72.145.44 offset 0.007530 sec
The comments above mention correcting the error; that is implemented by timekeeping_adjust().
With that, we have also walked through the kernel's timer interrupt handling.