This article assumes the kernel is configured with CONFIG_PREEMPT_NONE.
Assume process H has a higher priority than process L.
H enters the kernel via sys_read() to perform IO. Because the IO is slow, after submit_bio() it voluntarily sleeps: it puts itself on a wait queue, sets its state to TASK_INTERRUPTIBLE, and calls schedule() to give up the CPU. Suppose L is scheduled next and becomes current, and L has also entered the kernel (via a system call or some other path). While L is running, the IO completes and raises an interrupt, so the CPU runs the ISR: it wakes H, compares priorities, and because H's priority is higher than current's (L's), it sets TIF_NEED_RESCHED = 1 in current's thread_info and sets H's state to TASK_RUNNING. When the ISR returns, it returns to kernel mode, so no scheduling takes place and L keeps running in the kernel. Later a timer interrupt fires, and L's runtime accounting happens to meet CFS's criterion for requesting a reschedule, so TIF_NEED_RESCHED = 1 is set again. Returning from that ISR also lands back in kernel mode, so still no scheduling. Only when L finally returns to user mode is TIF_NEED_RESCHED checked, and only then does H become current. H first removes the wait descriptor it inserted into the wait queue before sleeping, then continues running.
So this preemption is a kernel-mode concept: being switched out on the way back to user mode is not called preemption.
The kernel sets TIF_NEED_RESCHED in the following two cases:
1. try_to_wake_up() wakes a process p; if p's priority is higher than current's, it sets TIF_NEED_RESCHED = 1.
2. The timer interrupt handler updates the current process's time accounting; if the scheduler's criterion is met, it sets TIF_NEED_RESCHED = 1.
It used to be "time slice == 0" that triggered this; CFS has its own criterion, but the flag is still set in the same place. A simplified sketch of both paths follows.
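To make the ordering concrete, here is a small userspace simulation of the two set points and the single check point under CONFIG_PREEMPT_NONE. None of these names are kernel APIs; it is only an illustration of the flow described above.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative simulation of TIF_NEED_RESCHED handling under
 * CONFIG_PREEMPT_NONE; none of these names are real kernel APIs. */
struct task {
	const char *name;
	int prio;               /* lower value = higher priority */
	bool need_resched;      /* stands in for TIF_NEED_RESCHED */
};

static struct task L = { "L", 120, false };
static struct task H = { "H", 100, false };
static struct task *cur = &L;

/* Case 1: the try_to_wake_up() path, the woken task beats current. */
static void wake_up_task(struct task *p)
{
	if (p->prio < cur->prio)
		cur->need_resched = true;
}

/* Case 2: the scheduler_tick() path, current ran past its slice. */
static void timer_tick(unsigned long ran_ns, unsigned long slice_ns)
{
	if (ran_ns > slice_ns)
		cur->need_resched = true;
}

/* With CONFIG_PREEMPT_NONE the flag is only honoured here, on the way
 * back to user space, not on return from an interrupt to kernel mode. */
static void return_to_user(void)
{
	if (cur->need_resched) {
		cur->need_resched = false;
		cur = &H;               /* schedule() would pick H */
	}
}

int main(void)
{
	wake_up_task(&H);               /* ISR for the finished IO wakes H */
	timer_tick(6000000, 4000000);   /* a later tick: L exceeded its slice */
	printf("still current: %s\n", cur->name);               /* L */
	return_to_user();
	printf("after return to user: %s\n", cur->name);        /* H */
	return 0;
}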
Let's walk through the chain of functions pulled in by wait_event_interruptible():
267 /**
268 * wait_event_interruptible - sleep until a condition gets true
269 * @wq: the waitqueue to wait on
270 * @condition: a C expression for the event to wait for
271 *
272 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
273 * @condition evaluates to true or a signal is received.
274 * The @condition is checked each time the waitqueue @wq is woken up.
275 *
276 * wake_up() has to be called after changing any variable that could
277 * change the result of the wait condition.
278 *
279 * The function will return -ERESTARTSYS if it was interrupted by a
280 * signal and 0 if @condition evaluated to true.
281 */
282 #define wait_event_interruptible(wq, condition) \
283 ({ \
284 int __ret = 0; \
285 if (!(condition)) \
286 __wait_event_interruptible(wq, condition, __ret); \
287 __ret; \
288 })
249 #define __wait_event_interruptible(wq, condition, ret) \
250 do { \
251 DEFINE_WAIT(__wait); \
252 \
253 for (;;) { \
254 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
255 if (condition) \
256 break; \
257 if (!signal_pending(current)) { \
258 schedule(); \
259 continue; \
260 } \
261 ret = -ERESTARTSYS; \
262 break; \
263 } \
264 finish_wait(&wq, &__wait); \
265 } while (0)
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
448 #define DEFINE_WAIT_FUNC(name, function) \
449 wait_queue_t name = { \
450 .private = current, \
451 .func = function, \
452 .task_list = LIST_HEAD_INIT((name).task_list), \
453 }
typedef struct __wait_queue wait_queue_t;
32 struct __wait_queue {
33 unsigned int flags;
34 #define WQ_FLAG_EXCLUSIVE 0x01
35 void *private;
36 wait_queue_func_t func;
37 struct list_head task_list;
38 };
This initializes the wait queue entry; note the wakeup function it installs, autoremove_wake_function. prepare_to_wait() then adds the entry to the wait queue and sets the task state to whatever was passed in (TASK_INTERRUPTIBLE here):
55 /*
56 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active
59 * will be guaranteed to see waitqueue addition _or_ subsequent
60 * tests in this thread will see the wakeup having taken place.
61 *
62 * The spin_unlock() itself is semi-permeable and only protects
63 * one way (it only protects stuff inside the critical region and
64 * stops them from bleeding out - it would still allow subsequent
65 * loads to move into the critical region).
66 */
67 void
68 prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
69 {
70 unsigned long flags;
71
72 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
73 spin_lock_irqsave(&q->lock, flags);
74 if (list_empty(&wait->task_list))
75 __add_wait_queue(q, wait);
76 set_current_state(state);
77 spin_unlock_irqrestore(&q->lock, flags);
78 }
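As a side note, here is a minimal sketch of how this pair of APIs is typically used from a driver. It is only a fragment (the usual module boilerplate is assumed around it, and the demo_* names are made up): the sleeper calls wait_event_interruptible(), and the completion path changes the condition first and then calls wake_up, exactly as the comment above wait_event_interruptible() demands.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_data_ready;

/* Sleeper side: what a ->read() handler would do while no data exists. */
static int demo_wait_for_data(void)
{
	int ret = wait_event_interruptible(demo_wq, demo_data_ready != 0);
	if (ret)                        /* -ERESTARTSYS: a signal arrived first */
		return ret;
	demo_data_ready = 0;
	return 0;
}

/* Waker side: what the ISR or bottom half does when the data shows up.
 * Update the condition first, then wake the sleepers. */
static void demo_data_arrived(void)
{
	demo_data_ready = 1;
	wake_up_interruptible(&demo_wq);
}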
Now let's see how the wake_up path sets TIF_NEED_RESCHED:
163 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
164 {
165 int ret = default_wake_function(wait, mode, sync, key);
166
167 if (ret)
168 list_del_init(&wait->task_list);
169 return ret;
170 }
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
}
2353 /***
2354 * try_to_wake_up - wake up a thread
2355 * @p: the to-be-woken-up thread
2356 * @state: the mask of task states that can be woken
2357 * @sync: do a synchronous wakeup?
2358 *
2359 * Put it on the run-queue if it's not already there. The "current"
2360 * thread is always on the run-queue (except when the actual
2361 * re-schedule is in progress), and as such you're allowed to do
2362 * the simpler "current->state = TASK_RUNNING" to mark yourself
2363 * runnable without the overhead of this.
2364 *
2365 * returns failure only if the task is already active.
2366 */
2367 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2368 int wake_flags)
2369 {
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags;
2372 struct rq *rq, *orig_rq;
2373
2374 if (!sched_feat(SYNC_WAKEUPS))
2375 wake_flags &= ~WF_SYNC;
2376
2377 this_cpu = get_cpu();
2378
2379 smp_wmb();
2380 rq = orig_rq = task_rq_lock(p, &flags);
2381 update_rq_clock(rq);
2382 if (!(p->state & state))
2383 goto out;
2384
2385 if (p->se.on_rq)
2386 goto out_running;
2387
2388 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390
2391 #ifdef CONFIG_SMP
2392 if (unlikely(task_running(rq, p)))
2393 goto out_activate;
2394
2395 /*
2396 * In order to handle concurrent wakeups and release the rq->lock
2397 * we put the task in TASK_WAKING state.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */
2401 if (task_contributes_to_load(p))
2402 rq->nr_uninterruptible--;
2403 p->state = TASK_WAKING;
2404
2405 if (p->sched_class->task_waking)
2406 p->sched_class->task_waking(rq, p);
2407
2408 __task_rq_unlock(rq);
2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu)
2412 set_task_cpu(p, cpu);
2413
2414 rq = __task_rq_lock(p);
2415 update_rq_clock(rq);
2416
2417 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419
2420 #ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count);
2422 if (cpu == this_cpu)
2423 schedstat_inc(rq, ttwu_local);
2424 else {
2425 struct sched_domain *sd;
2426 for_each_domain(this_cpu, sd) {
2427 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2428 schedstat_inc(sd, ttwu_wake_remote);
2429 break;
2430 }
2431 }
2432 }
2433 #endif /* CONFIG_SCHEDSTATS */
2434
2435 out_activate:
2436 #endif /* CONFIG_SMP */
2437 schedstat_inc(p, se.nr_wakeups);
2438 if (wake_flags & WF_SYNC)
2439 schedstat_inc(p, se.nr_wakeups_sync);
2440 if (orig_cpu != cpu)
2441 schedstat_inc(p, se.nr_wakeups_migrate);
2442 if (cpu == this_cpu)
2443 schedstat_inc(p, se.nr_wakeups_local);
2444 else
2445 schedstat_inc(p, se.nr_wakeups_remote);
2446 activate_task(rq, p, 1);
2447 success = 1;
2448
2449 /*
2450 * Only attribute actual wakeups done by this task.
2451 */
2452 if (!in_interrupt()) {
2453 struct sched_entity *se = &current->se;
2454 u64 sample = se->sum_exec_runtime;
2455
2456 if (se->last_wakeup)
2457 sample -= se->last_wakeup;
2458 else
2459 sample -= se->start_runtime;
2460 update_avg(&se->avg_wakeup, sample);
2461
2462 se->last_wakeup = se->sum_exec_runtime;
2463 }
2464
2465 out_running:
2466 trace_sched_wakeup(rq, p, success);
2467 check_preempt_curr(rq, p, wake_flags);
2468
2469 p->state = TASK_RUNNING;
2470 #ifdef CONFIG_SMP
2471 if (p->sched_class->task_woken)
2472 p->sched_class->task_woken(rq, p);
2473
2474 if (unlikely(rq->idle_stamp)) {
2475 u64 delta = rq->clock - rq->idle_stamp;
2476 u64 max = 2*sysctl_sched_migration_cost;
2477
2478 if (delta > max)
2479 rq->avg_idle = max;
2480 else
2481 update_avg(&rq->avg_idle, delta);
2482 rq->idle_stamp = 0;
2483 }
2484 #endif
2485 out:
2486 task_rq_unlock(rq, &flags);
2487 put_cpu();
2488
2489 return success;
2490 }
The parameter p is the thread to be woken up (@p in the comment above).
TIF_NEED_RESCHED is set inside check_preempt_curr().
Note put_cpu(): only when CONFIG_PREEMPT is set does put_cpu() (that is, preempt_enable()) go on to call schedule(); with CONFIG_PREEMPT_NONE it does not reschedule and simply returns. So as long as check_preempt_curr() never calls schedule(), the behavior matches the description at the beginning.
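For reference, the definitions behind put_cpu() look roughly like the following (paraphrased from include/linux/smp.h and include/linux/preempt.h of this kernel generation; check your own tree for the exact text). Under CONFIG_PREEMPT_NONE, preempt_enable() compiles away to nothing, which is why put_cpu() cannot trigger a reschedule here.

/* include/linux/smp.h (roughly) */
#define get_cpu()	({ preempt_disable(); smp_processor_id(); })
#define put_cpu()	preempt_enable()

/* include/linux/preempt.h (roughly) */
#ifdef CONFIG_PREEMPT
#define preempt_enable() \
do { \
	preempt_enable_no_resched(); \
	barrier(); \
	preempt_check_resched(); \
} while (0)

#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)
#else
#define preempt_disable()	do { } while (0)
#define preempt_enable()	do { } while (0)
#endif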
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}
Next, find where the check_preempt_curr method is assigned.
# grep -r check_preempt_curr kernel/ | less    turns up three places
1. # vi kernel/sched_rt.c    the comment at the top of the file says:
/*
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
This class serves SCHED_FIFO and SCHED_RR.
2. # vi kernel/sched_idletask.c    the idle-task scheduling class. Tracing back to the policy definitions in include/linux/sched.h, note that SCHED_ISO (value 4) is still only reserved:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
3. # vi kernel/sched_fair.c    here is the assignment we are after:
.check_preempt_curr = check_preempt_wakeup,
Now look at check_preempt_wakeup():
1699 /*
1700 * Preempt the current task with a newly woken task if needed:
1701 */
1702 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1703 {
1704 struct task_struct *curr = rq->curr;
1705 struct sched_entity *se = &curr->se, *pse = &p->se;
1706 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1707 int sync = wake_flags & WF_SYNC;
1708 int scale = cfs_rq->nr_running >= sched_nr_latency;
1709
1710 if (unlikely(rt_prio(p->prio)))
1711 goto preempt;
1712
1713 if (unlikely(p->sched_class != &fair_sched_class))
1714 return;
1715
1716 if (unlikely(se == pse))
1717 return;
1718
1719 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1720 set_next_buddy(pse);
1721
1722 /*
1723 * We can come here with TIF_NEED_RESCHED already set from new task
1724 * wake up path.
1725 */
1726 if (test_tsk_need_resched(curr))
1727 return;
1728
1729 /*
1730 * Batch and idle tasks do not preempt (their preemption is driven by
1731 * the tick):
1732 */
1733 if (unlikely(p->policy != SCHED_NORMAL))
1734 return;
1735
1736 /* Idle tasks are by definition preempted by everybody. */
1737 if (unlikely(curr->policy == SCHED_IDLE))
1738 goto preempt;
1739
1740 if (sched_feat(WAKEUP_SYNC) && sync)
1741 goto preempt;
1742
1743 if (sched_feat(WAKEUP_OVERLAP) &&
1744 se->avg_overlap < sysctl_sched_migration_cost &&
1745 pse->avg_overlap < sysctl_sched_migration_cost)
1746 goto preempt;
1747
1748 if (!sched_feat(WAKEUP_PREEMPT))
1749 return;
1750
1751 update_curr(cfs_rq);
1752 find_matching_se(&se, &pse);
1753 BUG_ON(!pse);
1754 if (wakeup_preempt_entity(se, pse) == 1)
1755 goto preempt;
1756
1757 return;
1758
1759 preempt:
1760 resched_task(curr);
1761 /*
1762 * Only set the backward buddy when the current task is still
1763 * on the rq. This can happen when a wakeup gets interleaved
1764 * with schedule on the ->pre_schedule() or idle_balance()
1765 * point, either of which can * drop the rq lock.
1766 *
1767 * Also, during early boot the idle thread is in the fair class,
1768 * for obvious reasons its a bad idea to schedule back to it.
1769 */
1770 if (unlikely(!se->on_rq || curr == rq->idle))
1771 return;
1772
1773 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1774 set_last_buddy(se);
1775 }
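The wakeup_preempt_entity() call near the end is not listed here; the rule it implements is roughly: the woken entity preempts only when its vruntime lags current's by more than a wakeup granularity. A small userspace sketch of that rule with made-up numbers (the real granularity comes from wakeup_gran(), and the real function returns -1/0/1):

#include <stdio.h>

typedef long long s64;

/* Illustrative version of the wakeup preemption rule; values are made up. */
static int should_preempt(s64 curr_vruntime, s64 woken_vruntime, s64 gran)
{
	s64 vdiff = curr_vruntime - woken_vruntime;

	if (vdiff <= 0)
		return 0;       /* woken task is not behind: no preemption */
	if (vdiff > gran)
		return 1;       /* far enough behind: goto preempt, resched_task() */
	return 0;               /* within the granularity: let current keep running */
}

int main(void)
{
	s64 gran = 1000000;     /* pretend 1 ms of virtual runtime */

	printf("%d\n", should_preempt(5000000, 2000000, gran));  /* 1: preempt */
	printf("%d\n", should_preempt(5000000, 4500000, gran));  /* 0: too close */
	return 0;
}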
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
It also contains a key function, resched_task():
1178 static void resched_task(struct task_struct *p)
1179 {
1180 int cpu;
1181
1182 assert_raw_spin_locked(&task_rq(p)->lock);
1183
1184 if (test_tsk_need_resched(p))
1185 return;
1186
1187 set_tsk_need_resched(p);
1188
1189 cpu = task_cpu(p);
1190 if (cpu == smp_processor_id())
1191 return;
1192
1193 /* NEED_RESCHED must be visible before we test polling */
1194 smp_mb();
1195 if (!tsk_is_polling(p))
1196 smp_send_reschedule(cpu);
1197 }
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
There is indeed no call to schedule() anywhere on this path. Now that p (H in our scenario) holds the CPU again, it cleans up its entry in the wait queue:
wait_event_interruptible() -> __wait_event_interruptible() -> finish_wait()
95 /*
96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on
98 * @wait: wait descriptor
99 *
100 * Sets current thread back to running state and removes
101 * the wait descriptor from the given waitqueue if still
102 * queued.
103 */
104 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
105 {
106 unsigned long flags;
107
108 __set_current_state(TASK_RUNNING);
109 /*
110 * We can check for list emptiness outside the lock
111 * IFF:
112 * - we use the "careful" check that verifies both
113 * the next and prev pointers, so that there cannot
114 * be any half-pending updates in progress on other
115 * CPU's that we haven't seen yet (and that might
116 * still change the stack area.
117 * and
118 * - all other users take the lock (ie we can only
119 * have _one_ other CPU that looks at or modifies
120 * the list).
121 */
122 if (!list_empty_careful(&wait->task_list)) {
123 spin_lock_irqsave(&q->lock, flags);
124 list_del_init(&wait->task_list);
125 spin_unlock_irqrestore(&q->lock, flags);
126 }
127 }
Now let's see where the timer interrupt handler sets TIF_NEED_RESCHED.
The timekeeping hardware here is the PIT (the "timer" line in the output below), which produces the classic timer interrupt; only the PIT case is discussed here.
[root@gridserver ~]# cat /proc/interrupts
CPU0 CPU1
0: 75698257 75667103 IO-APIC-edge timer
1: 161 57 IO-APIC-edge i8042
8: 1 0 IO-APIC-edge rtc
9: 0 0 IO-APIC-level acpi
12: 92 0 IO-APIC-edge i8042
14: 19 0 IO-APIC-edge ide0
169: 261 0 IO-APIC-level HDA Intel, uhci_hcd
177: 1 0 IO-APIC-level ehci_hcd, uhci_hcd
185: 103842 101556 IO-APIC-level libata, uhci_hcd
193: 0 0 IO-APIC-level uhci_hcd
201: 48 1597890 IO-APIC-level eth1
209: 321688 0 IO-APIC-level eth0
NMI: 0 0
LOC: 151375315 151375347
ERR: 0
MIS: 0
Here the timer is IRQ 0 and the rtc is IRQ 8. You can also see that interrupt balancing is working well: the interrupts are spread evenly across the two cores.
To find the callback function, look at how it is initialized during system boot:
start_kernel() -> time_init()
/*
* Initialize TSC and delay the periodic timer init to
* late x86_late_time_init() so ioremap works.
*/
void __init time_init(void)
{
late_time_init = x86_late_time_init;
}
static __init void x86_late_time_init(void)
{
x86_init.timers.timer_init();
tsc_init();
}
/* Default timer init function */
void __init hpet_time_init(void)
{
if (!hpet_enable())
setup_pit_timer();
setup_default_timer_irq();
}
void __init setup_default_timer_irq(void)
{
setup_irq(0, &irq0);
}
static struct irqaction irq0 = {
.handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
.name = "timer"
};
So the timer interrupt handler is timer_interrupt():
58 /*
59 * Default timer interrupt handler for PIT/HPET
60 */
61 static irqreturn_t timer_interrupt(int irq, void *dev_id)
62 {
63 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs);
65
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event);
81
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
83 if (MCA_bus)
84 outb_p(inb_p(0x61)| 0x80, 0x61);
85
86 return IRQ_HANDLED;
87 }
Another callback; find where it is initialized:
# grep -r event_handler kernel/time/ | less    it is set here, in kernel/time/tick-broadcast.c:
/*
* Set the periodic handler depending on broadcast on/off
*/
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
if (!broadcast)
dev->event_handler = tick_handle_periodic;
else
dev->event_handler = tick_handle_periodic_broadcast;
}
76 /*
77 * Event handler for periodic ticks
78 */
79 void tick_handle_periodic(struct clock_event_device *dev)
80 {
81 int cpu = smp_processor_id();
82 ktime_t next;
83
84 tick_periodic(cpu);
85
86 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
87 return;
88 /*
89 * Setup the next period for devices, which do not have
90 * periodic mode:
91 */
92 next = ktime_add(dev->next_event, tick_period);
93 for (;;) {
94 if (!clockevents_program_event(dev, next, ktime_get()))
95 return;
96 /*
97 * Have to be careful here. If we're in oneshot mode,
98 * before we call tick_periodic() in a loop, we need
99 * to be sure we're using a real hardware clocksource.
100 * Otherwise we could get trapped in an infinite
101 * loop, as the tick_periodic() increments jiffies,
102 * when then will increment time, posibly causing
103 * the loop to trigger again and again.
104 */
105 if (timekeeping_valid_for_hres())
106 tick_periodic(cpu);
107 next = ktime_add(next, tick_period);
108 }
109 }
tick_handle_periodic() -> tick_periodic()
57 /*
58 * Periodic tick
59 */
60 static void tick_periodic(int cpu)
61 {
62 if (tick_do_timer_cpu == cpu) {
63 write_seqlock(&xtime_lock);
64
65 /* Keep track of the next tick event */
66 tick_next_period = ktime_add(tick_next_period, tick_period);
67
68 do_timer(1);
69 write_sequnlock(&xtime_lock);
70 }
71
72 update_process_times(user_mode(get_irq_regs()));
73 profile_tick(CPU_PROFILING);
74 }
do_timer() mainly does update_wall_time() and the load calculation; we will come back to it at the end.
update_process_times() updates the per-process time accounting that the scheduler relies on:
tick_handle_periodic() -> tick_periodic() -> update_process_times()
1187 /*
1188 * Called from the timer interrupt handler to charge one tick to the current
1189 * process. user_tick is 1 if the tick is user time, 0 for system.
1190 */
1191 void update_process_times(int user_tick)
1192 {
1193 struct task_struct *p = current;
1194 int cpu = smp_processor_id();
1195
1196 /* Note: this timer irq context must be accounted for as well. */
1197 account_process_tick(p, user_tick);
1198 run_local_timers();
1199 rcu_check_callbacks(cpu, user_tick);
1200 printk_tick();
1201 perf_event_do_pending();
1202 scheduler_tick();
1203 run_posix_cpu_timers(p);
1204 }
account_process_tick() charges the tick to the process's system time or user time,
run_local_timers() belongs to the timer subsystem: it does raise_softirq(TIMER_SOFTIRQ),
and scheduler_tick() is where the reschedule flag may be set.
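Before following scheduler_tick(), here is a minimal userspace sketch of the decision account_process_tick() makes. It is only an illustration: the real function charges one full jiffy through account_user_time()/account_system_time()/account_idle_time(); the names below are made up.

#include <stdio.h>

/* Which bucket one tick is charged to, depending on whether the timer
 * interrupt arrived while the CPU was executing user code. Illustrative only. */
enum bucket { USER_TIME, SYSTEM_TIME, IDLE_TIME };

static enum bucket charge_tick(int user_tick, int current_is_idle)
{
	if (user_tick)
		return USER_TIME;       /* tick landed in user mode */
	if (!current_is_idle)
		return SYSTEM_TIME;     /* tick landed in kernel mode */
	return IDLE_TIME;               /* tick hit the idle task */
}

int main(void)
{
	printf("%d %d %d\n",
	       charge_tick(1, 0),       /* 0: USER_TIME */
	       charge_tick(0, 0),       /* 1: SYSTEM_TIME */
	       charge_tick(0, 1));      /* 2: IDLE_TIME */
	return 0;
}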
tick_handle_periodic() -> tick_periodic() -> update_process_times() -> scheduler_tick()
5291 /*
5292 * This function gets called by the timer code, with HZ frequency.
5293 * We call it with interrupts disabled.
5294 *
5295 * It also gets called by the fork code, when changing the parent's
5296 * timeslices.
5297 */
5298 void scheduler_tick(void)
5299 {
5300 int cpu = smp_processor_id();
5301 struct rq *rq = cpu_rq(cpu);
5302 struct task_struct *curr = rq->curr;
5303
5304 sched_clock_tick();
5305
5306 raw_spin_lock(&rq->lock);
5307 update_rq_clock(rq);
5308 update_cpu_load(rq);
5309 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock);
5311
5312 perf_event_task_tick(curr, cpu);
5313
5314 #ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu);
5316 trigger_load_balance(rq, cpu);
5317 #endif
5318 }
To find where task_tick is assigned: # grep -r task_tick kernel/ | less turns up three hits, the same classes we saw above,
so go straight to kernel/sched_fair.c:
# vi kernel/sched_fair.c
.task_tick = task_tick_fair,
task_tick_fair() is the one we want. CFS no longer relies on a fixed time slice, but that is only a matter of policy: whenever a timer interrupt fires, the tick handler still takes its chance to set the reschedule flag.
1967 /*
1968 * scheduler tick hitting a task of our scheduling class:
1969 */
1970 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1971 {
1972 struct cfs_rq *cfs_rq;
1973 struct sched_entity *se = &curr->se;
1974
1975 for_each_sched_entity(se) {
1976 cfs_rq = cfs_rq_of(se);
1977 entity_tick(cfs_rq, se, queued);
1978 }
1979 }
task_tick_fair() -> entity_tick()
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
check_preempt_tick() below is where the decision to set the reschedule flag is actually made:
task_tick_fair() -> entity_tick() -> check_preempt_tick()
852 /*
853 * Preempt the current task with a newly woken task if needed:
854 */
855 static void
856 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
857 {
858 unsigned long ideal_runtime, delta_exec;
859
860 ideal_runtime = sched_slice(cfs_rq, curr);
861 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
862 if (delta_exec > ideal_runtime) {
863 resched_task(rq_of(cfs_rq)->curr);
864 /*
865 * The current task ran long enough, ensure it doesn't get
866 * re-elected due to buddy favours.
867 */
868 clear_buddies(cfs_rq, curr);
869 return;
870 }
871
872 /*
873 * Ensure that a task that missed wakeup preemption by a
874 * narrow margin doesn't have to wait for a full slice.
875 * This also mitigates buddy induced latencies under load.
876 */
877 if (!sched_feat(WAKEUP_PREEMPT))
878 return;
879
880 if (delta_exec < sysctl_sched_min_granularity)
881 return;
882
883 if (cfs_rq->nr_running > 1) {
884 struct sched_entity *se = __pick_next_entity(cfs_rq);
885 s64 delta = curr->vruntime - se->vruntime;
886
887 if (delta > ideal_runtime)
888 resched_task(rq_of(cfs_rq)->curr);
889 }
890 }
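A worked numeric sketch of the first test above: suppose sched_slice() hands the task an ideal slice of 4 ms; once the runtime accumulated since it was last picked (delta_exec) exceeds that, resched_task() is called. The numbers below are made up; everything is in nanoseconds, matching the sched_entity fields.

#include <stdio.h>

/* Mirrors the delta_exec > ideal_runtime test in check_preempt_tick();
 * values are invented for illustration. */
int main(void)
{
	unsigned long long ideal_runtime = 4000000;     /* what sched_slice() returned */
	unsigned long long sum_exec      = 97000000;    /* curr->sum_exec_runtime */
	unsigned long long prev_sum_exec = 92500000;    /* curr->prev_sum_exec_runtime */
	unsigned long long delta_exec    = sum_exec - prev_sum_exec;   /* 4.5 ms */

	if (delta_exec > ideal_runtime)
		printf("resched_task(): set TIF_NEED_RESCHED\n");
	else
		printf("keep running\n");
	return 0;
}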
1165 /*
1166 * resched_task - mark a task 'to be rescheduled now'.
1167 *
1168 * On UP this means the setting of the need_resched flag, on SMP it
1169 * might also involve a cross-CPU call to trigger the scheduler on
1170 * the target CPU.
1171 */
1172 #ifdef CONFIG_SMP
1173
1174 #ifndef tsk_is_polling
1175 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1176 #endif
1177
1178 static void resched_task(struct task_struct *p)
1179 {
1180 int cpu;
1181
1182 assert_raw_spin_locked(&task_rq(p)->lock);
1183
1184 if (test_tsk_need_resched(p))
1185 return;
1186
1187 set_tsk_need_resched(p);
1188
1189 cpu = task_cpu(p);
1190 if (cpu == smp_processor_id())
1191 return;
1192
1193 /* NEED_RESCHED must be visible before we test polling */
1194 smp_mb();
1195 if (!tsk_is_polling(p))
1196 smp_send_reschedule(cpu);
1197 }
As the comment says, this only marks a task 'to be rescheduled now'; it does not immediately force a new process onto the CPU.
set_tsk_need_resched() sets the TIF_NEED_RESCHED flag.
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
Finally, let's look at do_timer(), which is mostly about update_wall_time().
1229 /*
1230 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1231 * without sampling the sequence number in xtime_lock.
1232 * jiffies is defined in the linker script...
1233 */
1234
1235 void do_timer(unsigned long ticks)
1236 {
1237 jiffies_64 += ticks;
1238 update_wall_time();
1239 calc_global_load();
1240 }
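As a quick sanity check on what do_timer(1) means: each call advances jiffies_64 by one tick, and a tick is 1/HZ seconds. A tiny sketch of that arithmetic (HZ = 1000 is just an assumed configuration):

#include <stdio.h>

#define HZ 1000                         /* assumed kernel configuration */

int main(void)
{
	unsigned long long jiffies_64 = 0;
	unsigned long ticks = 250;      /* pretend 250 timer interrupts fired */

	jiffies_64 += ticks;            /* what do_timer(ticks) does first */
	printf("elapsed: %llu ms\n", jiffies_64 * 1000ULL / HZ);  /* 250 ms */
	return 0;
}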
do_timer() -> update_wall_time():
781 /**
782 * update_wall_time - Uses the current clocksource to increment the wall time
783 *
784 * Called from the timer interrupt, must hold a write on xtime_lock.
785 */
786 void update_wall_time(void)
787 {
788 struct clocksource *clock;
789 cycle_t offset;
790 u64 nsecs;
791 int shift = 0, maxshift;
792
793 /* Make sure we're fully resumed: */
794 if (unlikely(timekeeping_suspended))
795 return;
796
797 clock = timekeeper.clock;
798 #ifdef CONFIG_GENERIC_TIME
799 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
800 #else
801 offset = timekeeper.cycle_interval;
802 #endif
803 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
804
805 /*
806 * With NO_HZ we may have to accumulate many cycle_intervals
807 * (think "ticks") worth of time at once. To do this efficiently,
808 * we calculate the largest doubling multiple of cycle_intervals
809 * that is smaller then the offset. We then accumulate that
810 * chunk in one go, and then try to consume the next smaller
811 * doubled multiple.
812 */
813 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
814 shift = max(0, shift);
815 /* Bound shift to one less then what overflows tick_length */
816 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
817 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift);
820 shift--;
821 }
822
823 /* correct the clock when NTP error is too big */
824 timekeeping_adjust(offset);
825
826 /*
827 * Since in the loop above, we accumulate any amount of time
828 * in xtime_nsec over a second into xtime.tv_sec, its possible for
829 * xtime_nsec to be fairly small after the loop. Further, if we're
830 * slightly speeding the clocksource up in timekeeping_adjust(),
831 * its possible the required corrective factor to xtime_nsec could
832 * cause it to underflow.
833 *
834 * Now, we cannot simply roll the accumulated second back, since
835 * the NTP subsystem has been notified via second_overflow. So
836 * instead we push xtime_nsec forward by the amount we underflowed,
837 * and add that amount into the error.
838 *
839 * We'll correct this error next time through this function, when
840 * xtime_nsec is not as small.
841 */
842 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
843 s64 neg = -(s64)timekeeper.xtime_nsec;
844 timekeeper.xtime_nsec = 0;
845 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
846 }
847
848 /* store full nanoseconds into xtime after rounding it up and
849 * add the remainder to the error difference.
850 */
851 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
852 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
853 timekeeper.ntp_error += timekeeper.xtime_nsec <<
854 timekeeper.ntp_error_shift;
855
856 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
857 update_xtime_cache(nsecs);
858
859 /* check to see if there is a new clocksource to use */
860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
861 }
A lot of the NTP work revolves around wall time. For example, the first ntpdate run may report a large offset; run it a couple more times and it settles:
[root@cn122 linux-2.6.33]# ntpdate 210.72.145.44
21 Mar 17:15:53 ntpdate[14157]: step time server 210.72.145.44 offset 133.343336 sec
[root@cn122 linux-2.6.33]# ntpdate 210.72.145.44
21 Mar 17:15:56 ntpdate[14158]: adjust time server 210.72.145.44 offset 0.007530 sec
The comments above mention correcting the error; that is implemented by timekeeping_adjust().
With that, we have also walked through the kernel's timer interrupt handling.