假设kernel编译成CONFIG_PREEMPT_NONE。
一个进程B退出时执行do_exit(),释放自己占有的绝大多数资源如memory、fd等,但task_struct以及位于
kernel stack上的struct thread_info不会立刻被释放,因为task_struct中含有exit status,
parent有时会通过这个exit status了解child执行时遇到的问题,parent通过waitpid()来获取这个
exit status。还有就是进程处于切换阶段但还没切换到新进程时,如果发生了中断,ISR可能会利用待退出
进程的kernel stack,此时就暂时保留退出进程的kernel stack,进程切换之后再释放。如果能做到中断处
理时不依赖刚刚退出进程的kernel stack,那后一条原因就没了。
几个常识:
thread group 可以 man clone,为了简化源码,这里假设B所在的 thread group 就他一个。
1. fork()时,tsk->usage = 2; 进程退出时释放kernel stack过程的复杂就在于这个usage=2。
2. init在启动阶段会设置SIGCHLD handler,在handler函数中调用waitpid(),然后while(1)做其它事。
3. waitpid()工作过程:先执行add waitqueue操作,设置状态为TASK_INTERRUPTIBLE,然后调用schedule()
让出cpu,将来child用wakeup使得parent有机会重新得到执行,开始处理zombie进程,执行一次
release_task() usage--,此时usage=1,所以不释放退出进程的kernel stack。
总体过程是:无论进程变成EXIT_DEAD自己self reap,还是变成EXIT_ZOMBIE让init处理,都是先执行一次
release_task()做一次usage--,然后退出进程在do_exit()末尾执行schedule(),在schedule()里的
finish_task_switch()中通过put_task_struct()做第二次usage--,此时usage=0,释放B的kernel stack。
EXIT_DEAD 状态self reap时自己执行第一次release_task(),
EXIT_ZOMBIE 状态是由waitpid()执行的第一次release_task(),
而两种情况下,第二次usage--都是自己在schedule()->finish_task_switch()中通过put_task_struct()
完成的。也就是说,最后释放kernel stack的工作还要靠退出进程自己执行schedule()来完成,而不是由init释放的。
进程处于EXIT_ZOMBIE状态,就是这个进程只剩下kernel stack了,正等着被释放。此时 ps aux |grep Z
能够看见。但如果长时间存在,系统会变慢。google发现原因很多,有的说application, libc, kernel
都有嫌疑,这里看了一些源码发现kernel对 EXIT_ZOMBIE 处理的很全面,应该不是kernel的原因。
如果B的parent是init,B退出了,那回收过程如下:
1) B退出时先给自己的孩子C1,C2...等找养父。找养父的过程是:先在本thread group中找一个B1做为C1,
C2,...的养父,如果thread group中只有B自己,那就让init做C1,C2...的养父。然后B给init发SIGCHLD
信号,信号中携带B的exit status。
2) init收到SIGCHLD,进入handler,执行waitpid()。waitpid()执行add waitqueue,然后调用
schedule()让出cpu。
3) B继续执行wakeup函数让init有机会得到运行,B设置自己状态为 EXIT_ZOMBIE。init重新运行后开始
处理状态为EXIT_ZOMBIE 的进程,执行一次release_task(),usage--,remove waitqueue。得到退出
进程的exit status。但不能释放B的kernel stack,
4) B继续在do_exit()末尾执行schedule(),在里面的finish_task_switch()中执行put_task_struct(),
这次usage--之后终于可以释放kernel stack了。
而状态为 EXIT_DEAD 的进程是self reap:
自己先执行一次release_task(),然后在schedule()->finish_task_switch()中由put_task_struct()释放kernel stack。
man waitpid
POSIX.1-2001 specifies that if the disposition of SIGCHLD is set to SIG_IGN or the
SA_NOCLDWAIT flag is set for SIGCHLD, then children that terminate do not become zombies.
Linux 2.6 conforms to this specification.
man 7 signal 发现SIGCHLD对应的默认Action是Ign,但注意:默认的Ign并不等于显式地把disposition设成
SIG_IGN。只有parent显式设置SIG_IGN(或SA_NOCLDWAIT)时,child退出才不变成zombie;在默认disposition
下,parent不调用waitpid()仍然会产生zombie。
下面用init源码,do_exit()和sys_waitpid()验证一下:
# rpm -qif /sbin/init 知道init源码在SysVinit包里,init.c 中main()
2594 /*
2595 * Main entry for init and telinit.
2596 */
2597 int main(int argc, char **argv)
2598 {
2599 char *p;
2600 int f;
2601 int isinit;
2602
2603 /* Get my own name */
2604 if ((p = strrchr(argv[0], '/')) != NULL)
2605 p++;
2606 else
2607 p = argv[0];
2608 umask(022);
2609
2610 /* Quick check */
2611 if (geteuid() != 0) {
2612 fprintf(stderr, "%s: must be superuser.\n", p);
2613 exit(1);
2614 }
2615
2616 /*
2617 * Is this telinit or init ?
2618 */
2619 isinit = (getpid() == 1);
2620 for (f = 1; f < argc; f++) {
2621 if (!strcmp(argv[f], "-i") || !strcmp(argv[f], "--init"))
2622 isinit = 1;
2623 break;
2624 }
2625 if (!isinit) exit(telinit(p, argc, argv));
2626
2627 /*
2628 * Check for re-exec
2629 */
2630 if (check_pipe(STATE_PIPE)) {
2631
2632 receive_state(STATE_PIPE);
2633
2634 myname = istrdup(argv[0]);
2635 argv0 = argv[0];
2636 maxproclen = 0;
2637 for (f = 0; f < argc; f++)
2638 maxproclen += strlen(argv[f]) + 1;
2639 reload = 1;
2640 setproctitle("init [%c]",runlevel);
2641
2642 init_main();
2643 }
2644
2645 /* Check command line arguments */
2646 maxproclen = strlen(argv[0]) + 1;
2647 for(f = 1; f < argc; f++) {
2648 if (!strcmp(argv[f], "single") || !strcmp(argv[f], "-s"))
2649 dfl_level = 'S';
2650 else if (!strcmp(argv[f], "-a") || !strcmp(argv[f], "auto"))
2651 putenv("AUTOBOOT=YES");
2652 else if (!strcmp(argv[f], "-b") || !strcmp(argv[f],"emergency"))
2653 emerg_shell = 1;
2654 else if (!strcmp(argv[f], "-z")) {
2655 /* Ignore -z xxx */
2656 if (argv[f + 1]) f++;
2657 } else if (strchr("0123456789sS", argv[f][0])
2658 && strlen(argv[f]) == 1)
2659 dfl_level = argv[f][0];
2660 /* "init u" in the very beginning makes no sense */
2661 if (dfl_level == 's') dfl_level = 'S';
2662 maxproclen += strlen(argv[f]) + 1;
2663 }
2664
2665 /* Start booting. */
2666 argv0 = argv[0];
2667 argv[1] = NULL;
2668 setproctitle("init boot");
2669 init_main(dfl_level);
2670
2671 /*NOTREACHED*/
2672 return 0;
2673 }
main() -> init_main()
2340 /*
2341 * The main loop
2342 */
2343 int init_main()
2344 {
2345 CHILD *ch;
2346 struct sigaction sa;
2347 sigset_t sgt;
2348 pid_t rc;
2349 int f, st;
2350
2351 if (!reload) {
2352
2353 #if INITDEBUG
2354 /*
2355 * Fork so we can debug the init process.
2356 */
2357 if ((f = fork()) > 0) {
2358 static const char killmsg[] = "PRNT: init killed.\r\n";
2359 pid_t rc;
2360
2361 while((rc = wait(&st)) != f)
2362 if (rc < 0 && errno == ECHILD)
2363 break;
2364 write(1, killmsg, sizeof(killmsg) - 1);
2365 while(1) pause();
2366 }
2367 #endif
2368
2369 #ifdef __linux__
2370 /*
2371 * Tell the kernel to send us SIGINT when CTRL-ALT-DEL
2372 * is pressed, and that we want to handle keyboard signals.
2373 */
2374 init_reboot(BMAGIC_SOFT);
2375 if ((f = open(VT_MASTER, O_RDWR | O_NOCTTY)) >= 0) {
2376 (void) ioctl(f, KDSIGACCEPT, SIGWINCH);
2377 close(f);
2378 } else
2379 (void) ioctl(0, KDSIGACCEPT, SIGWINCH);
2380 #endif
2381
2382 /*
2383 * Ignore all signals.
2384 */
2385 for(f = 1; f <= NSIG; f++)
2386 SETSIG(sa, f, SIG_IGN, SA_RESTART);
2387 }
2388
2389 SETSIG(sa, SIGALRM, signal_handler, 0);
2390 SETSIG(sa, SIGHUP, signal_handler, 0);
2391 SETSIG(sa, SIGINT, signal_handler, 0);
2392 SETSIG(sa, SIGCHLD, chld_handler, SA_RESTART);
2393 SETSIG(sa, SIGPWR, signal_handler, 0);
2394 SETSIG(sa, SIGWINCH, signal_handler, 0);
2395 SETSIG(sa, SIGUSR1, signal_handler, 0);
2396 SETSIG(sa, SIGSTOP, stop_handler, SA_RESTART);
2397 SETSIG(sa, SIGTSTP, stop_handler, SA_RESTART);
2398 SETSIG(sa, SIGCONT, cont_handler, SA_RESTART);
2399 SETSIG(sa, SIGSEGV, (void (*)(int))segv_handler, SA_RESTART);
2400
2401 console_init();
2402
2403 if (!reload) {
2404
2405 /* Close whatever files are open, and reset the console. */
2406 close(0);
2407 close(1);
2408 close(2);
2409 console_stty();
2410 setsid();
2411
2412 /*
2413 * Set default PATH variable.
2414 */
2415 putenv(PATH_DFL);
2416
2417 /*
2418 * Initialize /var/run/utmp (only works if /var is on
2419 * root and mounted rw)
2420 */
2421 (void) close(open(UTMP_FILE, O_WRONLY|O_CREAT|O_TRUNC, 0644));
2422
2423 /*
2424 * Say hello to the world
2425 */
2426 initlog(L_CO, bootmsg, "booting");
2427
2428 /*
2429 * See if we have to start an emergency shell.
2430 */
2431 if (emerg_shell) {
2432 SETSIG(sa, SIGCHLD, SIG_DFL, SA_RESTART);
2433 if (spawn(&ch_emerg, &f) > 0) {
2434 while((rc = wait(&st)) != f)
2435 if (rc < 0 && errno == ECHILD)
2436 break;
2437 }
2438 SETSIG(sa, SIGCHLD, chld_handler, SA_RESTART);
2439 }
2440
2441 /*
2442 * Start normal boot procedure.
2443 */
2444 runlevel = '#';
2445 read_inittab();
2446
2447 } else {
2448 /*
2449 * Restart: unblock signals and let the show go on
2450 */
2451 initlog(L_CO, bootmsg, "reloading");
2452 sigfillset(&sgt);
2453 sigprocmask(SIG_UNBLOCK, &sgt, NULL);
2454 }
2455 start_if_needed();
2456
2457 while(1) {
2458
2459 /* See if we need to make the boot transitions. */
2460 boot_transitions();
2461 INITDBG(L_VB, "init_main: waiting..");
2462
2463 /* Check if there are processes to be waited on. */
2464 for(ch = family; ch; ch = ch->next)
2465 if ((ch->flags & RUNNING) && ch->action != BOOT) break;
2466
2467 #if CHANGE_WAIT
2468 /* Wait until we get hit by some signal. */
2469 while (ch != NULL && got_signals == 0) {
2470 if (ISMEMBER(got_signals, SIGHUP)) {
2471 /* See if there are processes to be waited on. */
2472 for(ch = family; ch; ch = ch->next)
2473 if (ch->flags & WAITING) break;
2474 }
2475 if (ch != NULL) check_init_fifo();
2476 }
2477 #else /* CHANGE_WAIT */
2478 if (ch != NULL && got_signals == 0) check_init_fifo();
2479 #endif /* CHANGE_WAIT */
2480
2481 /* Check the 'failing' flags */
2482 fail_check();
2483
2484 /* Process any signals. */
2485 process_signals();
2486
2487 /* See what we need to start up (again) */
2488 start_if_needed();
2489 }
2490 /*NOTREACHED*/
2491 }
SIGCHLD处理函数:
/*
 * SIGCHLD: one of our children has died.
 *
 * Signal handler installed by init_main() via
 * SETSIG(sa, SIGCHLD, chld_handler, SA_RESTART). Reaps every exited
 * child with a non-blocking waitpid() loop, records the exit status in
 * the matching CHILD entry, and marks it ZOMBIE so the main loop's
 * process_signals() can finish the bookkeeping later.
 */
void chld_handler()
{
CHILD *ch;
int pid, st;
/* Handler runs asynchronously: preserve errno for the interrupted code. */
int saved_errno = errno;
/*
 * Find out which process(es) this was (were)
 */
/* WNOHANG: collect all currently-dead children without blocking. */
while((pid = waitpid(-1, &st, WNOHANG)) != 0) {
if (errno == ECHILD) break;
/* Look the pid up in init's table of spawned children ("family"). */
for( ch = family; ch; ch = ch->next )
if ( ch->pid == pid && (ch->flags & RUNNING) ) {
INITDBG(L_VB, "chld_handler: marked %d as zombie", ch->pid);
/* Record the event for the main loop; one bit per signal. */
ADDSET(got_signals, SIGCHLD);
/* Save the exit status returned by waitpid(). */
ch->exstat = st;
ch->flags |= ZOMBIE;
if (ch->new) {
ch->new->exstat = st;
ch->new->flags |= ZOMBIE;
}
break;
}
if (ch == NULL)
INITDBG(L_VB, "chld_handler: unknown child %d exited.", pid);
}
errno = saved_errno;
}
ch->exstat = st; 是目的。可见init进程也是用waitpid()获取child的exit status的,这里把SIGCHLD记录
到全局变量got_signals中,got_signals变量的每个bit代表一种signal。
最后的2457 while(1) { 循环就是平时见到的init的状态,在这等着事件发生。其中的process_signals()
取消全局变量got_signals相应的bit。
2234 /*
2235 * Init got hit by a signal. See which signal it is,
2236 * and act accordingly.
2237 */
2238 void process_signals()
2239 {
2240 CHILD *ch;
2241 int pwrstat;
2242 int oldlevel;
2243 int fd;
2244 char c;
2245
2246 if (ISMEMBER(got_signals, SIGPWR)) {
2247 INITDBG(L_VB, "got SIGPWR");
2248 /* See _what_ kind of SIGPWR this is. */
2249 pwrstat = 0;
2250 if ((fd = open(PWRSTAT, O_RDONLY)) >= 0) {
2251 c = 0;
2252 read(fd, &c, 1);
2253 pwrstat = c;
2254 close(fd);
2255 unlink(PWRSTAT);
2256 }
2257 do_power_fail(pwrstat);
2258 DELSET(got_signals, SIGPWR);
2259 }
2260
2261 if (ISMEMBER(got_signals, SIGINT)) {
2262 INITDBG(L_VB, "got SIGINT");
2263 /* Tell ctrlaltdel entry to start up */
2264 for(ch = family; ch; ch = ch->next)
2265 if (ch->action == CTRLALTDEL)
2266 ch->flags &= ~XECUTED;
2267 DELSET(got_signals, SIGINT);
2268 }
2269
2270 if (ISMEMBER(got_signals, SIGWINCH)) {
2271 INITDBG(L_VB, "got SIGWINCH");
2272 /* Tell kbrequest entry to start up */
2273 for(ch = family; ch; ch = ch->next)
2274 if (ch->action == KBREQUEST)
2275 ch->flags &= ~XECUTED;
2276 DELSET(got_signals, SIGWINCH);
2277 }
2278
2279 if (ISMEMBER(got_signals, SIGALRM)) {
2280 INITDBG(L_VB, "got SIGALRM");
2281 /* The timer went off: check it out */
2282 DELSET(got_signals, SIGALRM);
2283 }
2284
2285 if (ISMEMBER(got_signals, SIGCHLD)) {
2286 INITDBG(L_VB, "got SIGCHLD");
2287 /* First set flag to 0 */
2288 DELSET(got_signals, SIGCHLD);
2289
2290 /* See which child this was */
2291 for(ch = family; ch; ch = ch->next)
2292 if (ch->flags & ZOMBIE) {
2293 INITDBG(L_VB, "Child died, PID= %d", ch->pid);
2294 ch->flags &= ~(RUNNING|ZOMBIE|WAITING);
2295 if (ch->process[0] != '+')
2296 write_utmp_wtmp("", ch->id, ch->pid, DEAD_PROCESS, NULL);
2297 }
2298
2299 }
2300
2301 if (ISMEMBER(got_signals, SIGHUP)) {
2302 INITDBG(L_VB, "got SIGHUP");
2303 #if CHANGE_WAIT
2304 /* Are we waiting for a child? */
2305 for(ch = family; ch; ch = ch->next)
2306 if (ch->flags & WAITING) break;
2307 if (ch == NULL)
2308 #endif
2309 {
2310 /* We need to go into a new runlevel */
2311 oldlevel = runlevel;
2312 #ifdef INITLVL
2313 runlevel = read_level(0);
2314 #endif
2315 if (runlevel == 'U') {
2316 runlevel = oldlevel;
2317 re_exec();
2318 } else {
2319 if (oldlevel != 'S' && runlevel == 'S') console_stty();
2320 if (runlevel == '6' || runlevel == '0' ||
2321 runlevel == '1') console_stty();
2322 read_inittab();
2323 fail_cancel();
2324 setproctitle("init [%c]", runlevel);
2325 DELSET(got_signals, SIGHUP);
2326 }
2327 }
2328 }
2329 if (ISMEMBER(got_signals, SIGUSR1)) {
2330 /*
2331 * SIGUSR1 means close and reopen /dev/initctl
2332 */
2333 INITDBG(L_VB, "got SIGUSR1");
2334 close(pipe_fd);
2335 pipe_fd = -1;
2336 DELSET(got_signals, SIGUSR1);
2337 }
2338 }
可见用户态的init的代码对zombie的处理很简单,关键是靠调用waitpid(),来看看sys_wait4():
直接vi -t sys_waitpid()没有,grep -r 发现了 kernel/exit.c,进入搜索如下内容:
#ifdef __ARCH_WANT_SYS_WAITPID
/*
* sys_waitpid() remains for compatibility. waitpid() should be
* implemented by calling sys_wait4() from libc.a.
*/
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
return sys_wait4(pid, stat_addr, options, NULL);
}
但grep -r 也找不到sys_wait4的定义,因为新内核改用SYSCALL_DEFINE4(wait4, ...)宏来定义,宏展开后
才生成sys_wait4,所以按字面grep找不到。上LXR在2.6.11中能找到定义,发现do_wait()是其核心函数,
好在 vi -t do_wait 存在。到2.6.29就没有字面的sys_wait4()定义了。
1600 static long do_wait(struct wait_opts *wo)
1601 {
1602 struct task_struct *tsk;
1603 int retval;
1604
1605 trace_sched_process_wait(wo->wo_pid);
1606
1607 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1608 wo->child_wait.private = current;
1609 add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
1610 repeat:
1611 /*
1612 * If there is nothing that can match our critiera just get out.
1613 * We will clear ->notask_error to zero if we see any child that
1614 * might later match our criteria, even if we are not able to reap
1615 * it yet.
1616 */
1617 wo->notask_error = -ECHILD;
1618 if ((wo->wo_type < PIDTYPE_MAX) &&
1619 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1620 goto notask;
1621
1622 set_current_state(TASK_INTERRUPTIBLE);
1623 read_lock(&tasklist_lock);
1624 tsk = current;
1625 do {
1626 retval = do_wait_thread(wo, tsk);
1627 if (retval)
1628 goto end;
1629
1630 retval = ptrace_do_wait(wo, tsk);
1631 if (retval)
1632 goto end;
1633
1634 if (wo->wo_flags & __WNOTHREAD)
1635 break;
1636 } while_each_thread(current, tsk);
1637 read_unlock(&tasklist_lock);
1638
1639 notask:
1640 retval = wo->notask_error;
1641 if (!retval && !(wo->wo_flags & WNOHANG)) {
1642 retval = -ERESTARTSYS;
1643 if (!signal_pending(current)) {
1644 schedule();
1645 goto repeat;
1646 }
1647 }
1648 end:
1649 __set_current_state(TASK_RUNNING);
1650 remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
1651 return retval;
1652 }
do_wait() -> do_wait_thread()
1543 /*
1544 * Do the work of do_wait() for one thread in the group, @tsk.
1545 *
1546 * -ECHILD should be in ->notask_error before the first call.
1547 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1548 * Returns zero if the search for a child should continue; then
1549 * ->notask_error is 0 if there were any eligible children,
1550 * or another error from security_task_wait(), or still -ECHILD.
1551 */
1552 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1553 {
1554 struct task_struct *p;
1555
1556 list_for_each_entry(p, &tsk->children, sibling) {
1557 int ret = wait_consider_task(wo, 0, p);
1558 if (ret)
1559 return ret;
1560 }
1561
1562 return 0;
1563 }
曾误以为init是系统进程的祖先,应该权力很大,会定期扫描系统所有进程,发现ZOMBIE后就回收。
由 list_for_each_entry(p, &tsk->children, sibling) 看出,原来是必须先有进程在退出时把它所有的
儿子过继给init,之后init才有机会回收那些养子。而且init必须先收到SIGCHLD信号才会处理zombie。
do_wait() -> do_wait_thread() -> wait_consider_task()
1483 /*
1484 * Consider @p for a wait by @parent.
1485 *
1486 * -ECHILD should be in ->notask_error before the first call.
1487 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1488 * Returns zero if the search for a child should continue;
1489 * then ->notask_error is 0 if @p is an eligible child,
1490 * or another error from security_task_wait(), or still -ECHILD.
1491 */
1492 static int wait_consider_task(struct wait_opts *wo, int ptrace,
1493 struct task_struct *p)
1494 {
1495 int ret = eligible_child(wo, p);
1496 if (!ret)
1497 return ret;
1498
1499 ret = security_task_wait(p);
1500 if (unlikely(ret < 0)) {
1501 /*
1502 * If we have not yet seen any eligible child,
1503 * then let this error code replace -ECHILD.
1504 * A permission error will give the user a clue
1505 * to look for security policy problems, rather
1506 * than for mysterious wait bugs.
1507 */
1508 if (wo->notask_error)
1509 wo->notask_error = ret;
1510 return 0;
1511 }
1512
1513 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1514 /*
1515 * This child is hidden by ptrace.
1516 * We aren't allowed to see it now, but eventually we will.
1517 */
1518 wo->notask_error = 0;
1519 return 0;
1520 }
1521
1522 if (p->exit_state == EXIT_DEAD)
1523 return 0;
1524
1525 /*
1526 * We don't reap group leaders with subthreads.
1527 */
1528 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1529 return wait_task_zombie(wo, p);
1530
1531 /*
1532 * It's stopped or running now, so it might
1533 * later continue, exit, or stop again.
1534 */
1535 wo->notask_error = 0;
1536
1537 if (task_stopped_code(p, ptrace))
1538 return wait_task_stopped(wo, ptrace, p);
1539
1540 return wait_task_continued(wo, p);
1541 }
do_wait() -> do_wait_thread() -> wait_consider_task() -> wait_task_zombie()
1164 /*
1165 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1166 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1167 * the lock and this task is uninteresting. If we return nonzero, we have
1168 * released the lock and the system call should return.
1169 */
1170 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1171 {
1172 unsigned long state;
1173 int retval, status, traced;
1174 pid_t pid = task_pid_vnr(p);
1175 uid_t uid = __task_cred(p)->uid;
1176 struct siginfo __user *infop;
1177
1178 if (!likely(wo->wo_flags & WEXITED))
1179 return 0;
1180
1181 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code;
1183 int why, status;
1184
1185 get_task_struct(p);
1186 read_unlock(&tasklist_lock);
1187 if ((exit_code & 0x7f) == 0) {
1188 why = CLD_EXITED;
1189 status = exit_code >> 8;
1190 } else {
1191 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1192 status = exit_code & 0x7f;
1193 }
1194 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1195 }
1196
1197 /*
1198 * Try to move the task's state to DEAD
1199 * only one thread is allowed to do this:
1200 */
1201 state = xchg(&p->exit_state, EXIT_DEAD);
1202 if (state != EXIT_ZOMBIE) {
1203 BUG_ON(state != EXIT_DEAD);
1204 return 0;
1205 }
1206
1207 traced = ptrace_reparented(p);
1208 /*
1209 * It can be ptraced but not reparented, check
1210 * !task_detached() to filter out sub-threads.
1211 */
1212 if (likely(!traced) && likely(!task_detached(p))) {
1213 struct signal_struct *psig;
1214 struct signal_struct *sig;
1215 unsigned long maxrss;
1216 cputime_t tgutime, tgstime;
1217
1218 /*
1219 * The resource counters for the group leader are in its
1220 * own task_struct. Those for dead threads in the group
1221 * are in its signal_struct, as are those for the child
1222 * processes it has previously reaped. All these
1223 * accumulate in the parent's signal_struct c* fields.
1224 *
1225 * We don't bother to take a lock here to protect these
1226 * p->signal fields, because they are only touched by
1227 * __exit_signal, which runs with tasklist_lock
1228 * write-locked anyway, and so is excluded here. We do
1229 * need to protect the access to parent->signal fields,
1230 * as other threads in the parent group can be right
1231 * here reaping other children at the same time.
1232 *
1233 * We use thread_group_times() to get times for the thread
1234 * group, which consolidates times for all threads in the
1235 * group including the group leader.
1236 */
1237 thread_group_times(p, &tgutime, &tgstime);
1238 spin_lock_irq(&p->real_parent->sighand->siglock);
1239 psig = p->real_parent->signal;
1240 sig = p->signal;
1241 psig->cutime =
1242 cputime_add(psig->cutime,
1243 cputime_add(tgutime,
1244 sig->cutime));
1245 psig->cstime =
1246 cputime_add(psig->cstime,
1247 cputime_add(tgstime,
1248 sig->cstime));
1249 psig->cgtime =
1250 cputime_add(psig->cgtime,
1251 cputime_add(p->gtime,
1252 cputime_add(sig->gtime,
1253 sig->cgtime)));
1254 psig->cmin_flt +=
1255 p->min_flt + sig->min_flt + sig->cmin_flt;
1256 psig->cmaj_flt +=
1257 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1258 psig->cnvcsw +=
1259 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1260 psig->cnivcsw +=
1261 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1262 psig->cinblock +=
1263 task_io_get_inblock(p) +
1264 sig->inblock + sig->cinblock;
1265 psig->coublock +=
1266 task_io_get_oublock(p) +
1267 sig->oublock + sig->coublock;
1268 maxrss = max(sig->maxrss, sig->cmaxrss);
1269 if (psig->cmaxrss < maxrss)
1270 psig->cmaxrss = maxrss;
1271 task_io_accounting_add(&psig->ioac, &p->ioac);
1272 task_io_accounting_add(&psig->ioac, &sig->ioac);
1273 spin_unlock_irq(&p->real_parent->sighand->siglock);
1274 }
1275
1276 /*
1277 * Now we are sure this task is interesting, and no other
1278 * thread can reap it because we set its state to EXIT_DEAD.
1279 */
1280 read_unlock(&tasklist_lock);
1281
1282 retval = wo->wo_rusage
1283 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1284 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1285 ? p->signal->group_exit_code : p->exit_code;
1286 if (!retval && wo->wo_stat)
1287 retval = put_user(status, wo->wo_stat);
1288
1289 infop = wo->wo_info;
1290 if (!retval && infop)
1291 retval = put_user(SIGCHLD, &infop->si_signo);
1292 if (!retval && infop)
1293 retval = put_user(0, &infop->si_errno);
1294 if (!retval && infop) {
1295 int why;
1296
1297 if ((status & 0x7f) == 0) {
1298 why = CLD_EXITED;
1299 status >>= 8;
1300 } else {
1301 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1302 status &= 0x7f;
1303 }
1304 retval = put_user((short)why, &infop->si_code);
1305 if (!retval)
1306 retval = put_user(status, &infop->si_status);
1307 }
1308 if (!retval && infop)
1309 retval = put_user(pid, &infop->si_pid);
1310 if (!retval && infop)
1311 retval = put_user(uid, &infop->si_uid);
1312 if (!retval)
1313 retval = pid;
1314
1315 if (traced) {
1316 write_lock_irq(&tasklist_lock);
1317 /* We dropped tasklist, ptracer could die and untrace */
1318 ptrace_unlink(p);
1319 /*
1320 * If this is not a detached task, notify the parent.
1321 * If it's still not detached after that, don't release
1322 * it now.
1323 */
1324 if (!task_detached(p)) {
1325 do_notify_parent(p, p->exit_signal);
1326 if (!task_detached(p)) {
1327 p->exit_state = EXIT_ZOMBIE;
1328 p = NULL;
1329 }
1330 }
1331 write_unlock_irq(&tasklist_lock);
1332 }
1333 if (p != NULL)
1334 release_task(p); // first usage--
1335
1336 return retval;
1337 }
进程的exit:
892 NORET_TYPE void do_exit(long code)
893 {
894 struct task_struct *tsk = current;
895 int group_dead;
896
897 profile_task_exit(tsk);
898
899 WARN_ON(atomic_read(&tsk->fs_excl));
900
901 if (unlikely(in_interrupt()))
902 panic("Aiee, killing interrupt handler!");
903 if (unlikely(!tsk->pid))
904 panic("Attempted to kill the idle task!");
905
906 tracehook_report_exit(&code);
907
908 validate_creds_for_do_exit(tsk);
909
910 /*
911 * We're taking recursive faults here in do_exit. Safest is to just
912 * leave this task alone and wait for reboot.
913 */
914 if (unlikely(tsk->flags & PF_EXITING)) {
915 printk(KERN_ALERT
916 "Fixing recursive fault but reboot is needed!\n");
917 /*
918 * We can do this unlocked here. The futex code uses
919 * this flag just to verify whether the pi state
920 * cleanup has been done or not. In the worst case it
921 * loops once more. We pretend that the cleanup was
922 * done as there is no way to return. Either the
923 * OWNER_DIED bit is set by now or we push the blocked
924 * task into the wait for ever nirwana as well.
925 */
926 tsk->flags |= PF_EXITPIDONE;
927 set_current_state(TASK_UNINTERRUPTIBLE);
928 schedule();
929 }
930
931 exit_irq_thread();
932
933 exit_signals(tsk); /* sets PF_EXITING */
934 /*
935 * tsk->flags are checked in the futex code to protect against
936 * an exiting task cleaning up the robust pi futexes.
937 */
938 smp_mb();
939 raw_spin_unlock_wait(&tsk->pi_lock);
940
941 if (unlikely(in_atomic()))
942 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
943 current->comm, task_pid_nr(current),
944 preempt_count());
945
946 acct_update_integrals(tsk);
947
948 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer);
951 exit_itimers(tsk->signal);
952 if (tsk->mm)
953 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
954 }
955 acct_collect(code, group_dead);
956 if (group_dead)
957 tty_audit_exit();
958 if (unlikely(tsk->audit_context))
959 audit_free(tsk);
960
961 tsk->exit_code = code;
962 taskstats_exit(tsk, group_dead);
963
964 exit_mm(tsk);
965
966 if (group_dead)
967 acct_process();
968 trace_sched_process_exit(tsk);
969
970 exit_sem(tsk);
971 exit_files(tsk);
972 exit_fs(tsk);
973 check_stack_usage();
974 exit_thread();
975 cgroup_exit(tsk, 1);
976
977 if (group_dead)
978 disassociate_ctty(1);
979
980 module_put(task_thread_info(tsk)->exec_domain->module);
981
982 proc_exit_connector(tsk);
983
984 /*
985 * FIXME: do that only when needed, using sched_exit tracepoint
986 */
987 flush_ptrace_hw_breakpoint(tsk);
988 /*
989 * Flush inherited counters to the parent - before the parent
990 * gets woken up by child-exit notifications.
991 */
992 perf_event_exit_task(tsk);
993
994 exit_notify(tsk, group_dead);
995 #ifdef CONFIG_NUMA
996 mpol_put(tsk->mempolicy);
997 tsk->mempolicy = NULL;
998 #endif
999 #ifdef CONFIG_FUTEX
1000 if (unlikely(current->pi_state_cache))
1001 kfree(current->pi_state_cache);
1002 #endif
1003 /*
1004 * Make sure we are holding no locks:
1005 */
1006 debug_check_no_locks_held(tsk);
1007 /*
1008 * We can do this unlocked here. The futex code uses this flag
1009 * just to verify whether the pi state cleanup has been done
1010 * or not. In the worst case it loops once more.
1011 */
1012 tsk->flags |= PF_EXITPIDONE;
1013
1014 if (tsk->io_context)
1015 exit_io_context(tsk);
1016
1017 if (tsk->splice_pipe)
1018 __free_pipe_info(tsk->splice_pipe);
1019
1020 validate_creds_for_do_exit(tsk);
1021
1022 preempt_disable();
1023 exit_rcu();
1024 /* causes final put_task_struct in finish_task_switch(). */
1025 tsk->state = TASK_DEAD;
1026 schedule();
1027 BUG();
1028 /* Avoid "noreturn function does return". */
1029 for (;;)
1030 cpu_relax(); /* For when BUG is null */
1031 }
注意上面函数末尾几行的注释已经说了
1024 /* causes final put_task_struct in finish_task_switch(). */
1025 tsk->state = TASK_DEAD;
1026 schedule();
与之呼应的注释是 do_fork() -> copy_process() -> dup_task_struct()
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
再看看注释说的finish_task_switch(): schedule() -> context_switch() -> finish_task_switch()
2761 /**
2762 * finish_task_switch - clean up after a task-switch
2763 * @rq: runqueue associated with task-switch
2764 * @prev: the thread we just switched away from.
2765 *
2766 * finish_task_switch must be called after the context switch, paired
2767 * with a prepare_task_switch call before the context switch.
2768 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2769 * and do any other architecture-specific cleanup actions.
2770 *
2771 * Note that we may have delayed dropping an mm in context_switch(). If
2772 * so, we finish that here outside of the runqueue lock. (Doing it
2773 * with the lock held can cause deadlocks; see schedule() for
2774 * details.)
2775 */
2776 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2777 __releases(rq->lock)
2778 {
2779 struct mm_struct *mm = rq->prev_mm;
2780 long prev_state;
2781
2782 rq->prev_mm = NULL;
2783
2784 /*
2785 * A task struct has one reference for the use as "current".
2786 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2787 * schedule one last time. The schedule call will never return, and
2788 * the scheduled task must drop that reference.
2789 * The test for TASK_DEAD must occur while the runqueue locks are
2790 * still held, otherwise prev could be scheduled on another cpu, die
2791 * there before we look at prev->state, and then the reference would
2792 * be dropped twice.
2793 * Manfred Spraul <manfred@colorfullife.com>
2794 */
2795 prev_state = prev->state;
2796 finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq));
2798 finish_lock_switch(rq, prev);
2799
2800 fire_sched_in_preempt_notifiers(current);
2801 if (mm)
2802 mmdrop(mm);
2803 if (unlikely(prev_state == TASK_DEAD)) {
2804 /*
2805 * Remove function-return probe instances associated with this
2806 * task and put them back on the free list.
2807 */
2808 kprobe_flush_task(prev);
2809 put_task_struct(prev); //最终释放了退出进程的kernel stack。
2810 }
2811 }
do_exit() -> exit_notify()
803 /*
804 * Send signals to all our closest relatives so that they know
805 * to properly mourn us..
806 */
807 static void exit_notify(struct task_struct *tsk, int group_dead)
808 {
809 int signal;
810 void *cookie;
811
812 /*
813 * This does two things:
814 *
815 * A. Make init inherit all the child processes
816 * B. Check to see if any process groups have become orphaned
817 * as a result of our exiting, and if they have any stopped
818 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
819 */
820 forget_original_parent(tsk);
821 exit_task_namespaces(tsk);
822
823 write_lock_irq(&tasklist_lock);
824 if (group_dead)
825 kill_orphaned_pgrp(tsk->group_leader, NULL);
826
827 /* Let father know we died
828 *
829 * Thread signals are configurable, but you aren't going to use
830 * that to send signals to arbitary processes.
831 * That stops right now.
832 *
833 * If the parent exec id doesn't match the exec id we saved
834 * when we started then we know the parent has changed security
835 * domain.
836 *
837 * If our self_exec id doesn't match our parent_exec_id then
838 * we have changed execution domain as these two values started
839 * the same after a fork.
840 */
841 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
842 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
843 tsk->self_exec_id != tsk->parent_exec_id))
844 tsk->exit_signal = SIGCHLD;
845
846 signal = tracehook_notify_death(tsk, &cookie, group_dead);
847 if (signal >= 0)
848 signal = do_notify_parent(tsk, signal);
849
850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
851
852 /* mt-exec, de_thread() is waiting for us */
853 if (thread_group_leader(tsk) &&
854 tsk->signal->group_exit_task &&
855 tsk->signal->notify_count < 0)
856 wake_up_process(tsk->signal->group_exit_task);
857
858 write_unlock_irq(&tasklist_lock);
859
860 tracehook_report_death(tsk, signal, cookie, group_dead);
861
862 /* If the process is dead, release it - nobody will wait for it */
863 if (signal == DEATH_REAP)
864 release_task(tsk);
865 }
forget_original_parent() 先给自己的孩子找养父,然后把自己的孩子都过继给这个养父。
do_exit() -> exit_notify() -> forget_original_parent()
769 static void forget_original_parent(struct task_struct *father)
770 {
771 struct task_struct *p, *n, *reaper;
772 LIST_HEAD(dead_children);
773
774 exit_ptrace(father);
775
776 write_lock_irq(&tasklist_lock);
777 reaper = find_new_reaper(father);
778
779 list_for_each_entry_safe(p, n, &father->children, sibling) {
780 struct task_struct *t = p;
781 do {
782 t->real_parent = reaper;
783 if (t->parent == father) {
784 BUG_ON(task_ptrace(t));
785 t->parent = t->real_parent;
786 }
787 if (t->pdeath_signal)
788 group_send_sig_info(t->pdeath_signal,
789 SEND_SIG_NOINFO, t);
790 } while_each_thread(p, t);
791 reparent_leader(father, p, &dead_children);
792 }
793 write_unlock_irq(&tasklist_lock);
794
795 BUG_ON(!list_empty(&father->children));
796
797 list_for_each_entry_safe(p, n, &dead_children, sibling) {
798 list_del_init(&p->sibling);
799 release_task(p);
800 }
801 }
其中 find_new_reaper() 的函数注释说的很明白:
/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
* the child reaper process (ie "init") in our pid
* space.
*/
do_exit() -> exit_notify() -> tracehook_notify_death()
507 #define DEATH_REAP -1
508 #define DEATH_DELAYED_GROUP_LEADER -2
509
510 /**
511 * tracehook_notify_death - task is dead, ready to notify parent
512 * @task: @current task now exiting
513 * @death_cookie: value to pass to tracehook_report_death()
514 * @group_dead: nonzero if this was the last thread in the group to die
515 *
516 * A return value >= 0 means call do_notify_parent() with that signal
517 * number. Negative return value can be %DEATH_REAP to self-reap right
518 * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our
519 * parent. Note that a return value of 0 means a do_notify_parent() call
520 * that sends no signal, but still wakes up a parent blocked in wait*().
521 *
522 * Called with write_lock_irq(&tasklist_lock) held.
523 */
524 static inline int tracehook_notify_death(struct task_struct *task,
525 void **death_cookie, int group_dead)
526 {
527 if (task_detached(task))
528 return task->ptrace ? SIGCHLD : DEATH_REAP;
529
530 /*
531 * If something other than our normal parent is ptracing us, then
532 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
533 * only has special meaning to our real parent.
534 */
535 if (thread_group_empty(task) && !ptrace_reparented(task))
536 return task->exit_signal;
537
538 return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER;
539 }
static inline int task_detached(struct task_struct *p)
{
return p->exit_signal == -1;
}
这里要明白exit_signal的含义:表示自己退出时,是否向parent发信号,-1就是不发。
如果不向parent发信号,并且没有被ptrace,那就返回DEATH_REAP(-1), 然后进程设置状态为EXIT_DEAD,
调用release_task(),但此时在release_task()中不能释放kernel stack
do_exit() -> exit_notify() -> do_notify_parent()
1413 /*
1414 * Let a parent know about the death of a child.
1415 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1416 *
1417 * Returns -1 if our parent ignored us and so we've switched to
1418 * self-reaping, or else @sig.
1419 */
1420 int do_notify_parent(struct task_struct *tsk, int sig)
1421 {
1422 struct siginfo info;
1423 unsigned long flags;
1424 struct sighand_struct *psig;
1425 int ret = sig;
1426
1427 BUG_ON(sig == -1);
1428
1429 /* do_notify_parent_cldstop should have been called instead. */
1430 BUG_ON(task_is_stopped_or_traced(tsk));
1431
1432 BUG_ON(!task_ptrace(tsk) &&
1433 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1434
1435 info.si_signo = sig;
1436 info.si_errno = 0;
1437 /*
1438 * we are under tasklist_lock here so our parent is tied to
1439 * us and cannot exit and release its namespace.
1440 *
1441 * the only it can is to switch its nsproxy with sys_unshare,
1442 * bu uncharing pid namespaces is not allowed, so we'll always
1443 * see relevant namespace
1444 *
1445 * write_lock() currently calls preempt_disable() which is the
1446 * same as rcu_read_lock(), but according to Oleg, this is not
1447 * correct to rely on this
1448 */
1449 rcu_read_lock();
1450 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1451 info.si_uid = __task_cred(tsk)->uid;
1452 rcu_read_unlock();
1453
1454 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1455 tsk->signal->utime));
1456 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1457 tsk->signal->stime));
1458
1459 info.si_status = tsk->exit_code & 0x7f;
1460 if (tsk->exit_code & 0x80)
1461 info.si_code = CLD_DUMPED;
1462 else if (tsk->exit_code & 0x7f)
1463 info.si_code = CLD_KILLED;
1464 else {
1465 info.si_code = CLD_EXITED;
1466 info.si_status = tsk->exit_code >> 8;
1467 }
1468
1469 psig = tsk->parent->sighand;
1470 spin_lock_irqsave(&psig->siglock, flags);
1471 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1472 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1473 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1474 /*
1475 * We are exiting and our parent doesn't care. POSIX.1
1476 * defines special semantics for setting SIGCHLD to SIG_IGN
1477 * or setting the SA_NOCLDWAIT flag: we should be reaped
1478 * automatically and not left for our parent's wait4 call.
1479 * Rather than having the parent do it as a magic kind of
1480 * signal handler, we just set this to tell do_exit that we
1481 * can be cleaned up without becoming a zombie. Note that
1482 * we still call __wake_up_parent in this case, because a
1483 * blocked sys_wait4 might now return -ECHILD.
1484 *
1485 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
1486 * is implementation-defined: we do (if you don't want
1487 * it, just use SIG_IGN instead).
1488 */
1489 ret = tsk->exit_signal = -1;
1490 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1491 sig = -1;
1492 }
1493 if (valid_signal(sig) && sig > 0)
1494 __group_send_sig_info(sig, &info, tsk->parent);
1495 __wake_up_parent(tsk, tsk->parent);
1496 spin_unlock_irqrestore(&psig->siglock, flags);
1497
1498 return ret;
1499 }
Wednesday, March 24, 2010
init process and process exit
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment