一个用户进程执行时,如果产生了信号,用户程序被打断,kernel会把用户进程的register内容保存到该进
程的kernel stack中,kernel还会执行其它工作,还会设定sigmask,然后返回用户态去执行sig_handler,
sig_handler执行结束后,恢复被信号打断的用户进程的执行。
一般的用户进程执行系统调用的需求是: user process -> kernel mode -> user process
解决方法是把用户进程的ss, sp, cs, eip等值保存到该进程的kernel stack中,返回到user mode时恢复出
来就行了。
而信号处理的需求是: user process -> kernel mode -> sig_handler(user mode) -> user process
也是先把用户进程的ss, sp, cs, eip等值保存到该进程的kernel stack中,在返回到user mode执行
sig_handler时就要特别处理了: 一旦返回到user mode,进程的kernel stack可能被破坏,那将来
sig_handler结束后,用户进程就没法恢复状态继续运行了。为了使用户进程不被打扰,还要undo sigmask。
Linux的解决方法是:
返回user mode执行sig_handler之前,把kernel stack中的上下文copy到当前进程的user stack中,在建立
sig_handler的stack环境时,将sigreturn()的地址插入到stack中,当sig_handler执行完毕后,就会执行
sigreturn()重新进入kernel mode。sigreturn()把保存的内容重新填入kernel stack,undo sigmask。这
样返回用户态之后,用户进程就能继续正常执行了。
man sigreturn:
When the Linux kernel creates the stack frame for a signal handler, a call to sigreturn() is
inserted into the stack frame so that upon return from the signal handler, sigreturn() will be
called.
This sigreturn() call undoes everything that was done—changing the process’s signal mask,
switching stacks (see sigaltstack(2))—in order to invoke the signal handler: it restores the
process’s signal mask, switches stacks, and restores the process’s context (registers, proces-
sor flags), so that the process directly resumes execution at the point where it was inter-
rupted by the signal.
这样信号处理的过程是:
user process -> kernel mode -> sig_handler(user mode) -> sigreturn(kernel mode) -> user process
用户态编程使用的sigaction()在kernel中对应的是do_sigaction(),可与 man sigaction 对照阅读。
/*
 * do_sigaction - kernel-side worker for sigaction(2).
 *
 * @sig:  signal number; must pass valid_signal() and be >= 1.
 * @act:  new action to install, or NULL to leave the current one in place.
 * @oact: if non-NULL, receives the action that was installed on entry.
 *
 * Returns 0 on success, or -EINVAL when @sig is out of range or when a new
 * action is supplied for a sig_kernel_only() signal (SIGKILL / SIGSTOP).
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
	struct task_struct *t = current;
	struct k_sigaction *k;
	sigset_t mask;

	/* SIGKILL/SIGSTOP may be queried (act == NULL) but never changed. */
	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
		return -EINVAL;

	/* The action table is indexed by signal number minus one. */
	k = &t->sighand->action[sig-1];

	spin_lock_irq(&current->sighand->siglock);
	if (oact)
		*oact = *k;

	if (act) {
		/* SIGKILL and SIGSTOP are stripped from the handler's mask. */
		sigdelsetmask(&act->sa.sa_mask,
			      sigmask(SIGKILL) | sigmask(SIGSTOP));
		*k = *act;
		/*
		 * POSIX 3.3.1.3:
		 *  "Setting a signal action to SIG_IGN for a signal that is
		 *   pending shall cause the pending signal to be discarded,
		 *   whether or not it is blocked."
		 *
		 *  "Setting a signal action to SIG_DFL for a signal that is
		 *   pending and whose default action is to ignore the signal
		 *   (for example, SIGCHLD), shall cause the pending signal to
		 *   be discarded, whether or not it is blocked"
		 */
		if (sig_handler_ignored(sig_handler(t, sig), sig)) {
			sigemptyset(&mask);
			sigaddset(&mask, sig);
			/* Discard from the shared queue, then from every thread. */
			rm_from_queue_full(&mask, &t->signal->shared_pending);
			do {
				rm_from_queue_full(&mask, &t->pending);
				t = next_thread(t);
			} while (t != current);
		}
	}

	spin_unlock_irq(&current->sighand->siglock);
	return 0;
}
SIGKILL 和 SIGSTOP 特殊,用户不能改变它们的disposition,所以不能对它们两个设置sig_handler()
这和man 7 signal的说明一致:
/*
 * True when sig is a non-realtime signal contained in SIG_KERNEL_ONLY_MASK.
 * do_sigaction() rejects a new action for such a signal with -EINVAL.
 */
#define sig_kernel_only(sig) \
(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
/* The two signals covered by sig_kernel_only(): SIGKILL and SIGSTOP. */
#define SIG_KERNEL_ONLY_MASK (\
rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))
关键就是 struct k_sigaction,里面有sig_handler(),mask等,kernel处理时会用到。
/*
 * Signal-handler table of a task, reached through task->sighand.
 */
struct sighand_struct {
/* NOTE(review): presumably a reference count on this table - confirm */
atomic_t count;
/* One action per signal; do_sigaction() indexes it with sig-1 */
struct k_sigaction action[_NSIG];
/* Spinlock held while actions and pending signals are read or changed */
spinlock_t siglock;
/* NOTE(review): presumably the wait queue used by signalfd - confirm */
wait_queue_head_t signalfd_wqh;
};
/* Kernel-side wrapper around struct sigaction; one entry per signal in sighand_struct.action[]. */
struct k_sigaction {
struct sigaction sa;
};
/* Disposition of one signal as installed through do_sigaction(). */
struct sigaction {
/* SIG_IGN, SIG_DFL, or the address of a user-defined handler */
__sighandler_t sa_handler;
/* SA_* flags; SA_ONESHOT and SA_RESTORER are the ones tested in this file */
unsigned long sa_flags;
/* Return address placed in the signal frame when SA_RESTORER is set */
__sigrestore_t sa_restorer;
/* Signal mask supplied with the action; do_sigaction() strips SIGKILL/SIGSTOP from it */
sigset_t sa_mask; /* mask last for extensibility */
};
假设kernel编译成 CONFIG_PREEMPT_NONE
信号的generate: send_sig(),会设置 TIF_SIGPENDING 。
信号的检测时机:
1。目标进程从kernel mode返回user mode时,如果检测到 TIF_SIGPENDING ,会调用 do_signal()来处理。
2。目标进程处于 TASK_INTERRUPTIBLE 状态,如果检测到 TIF_SIGPENDING ,会退出wait状态:
267 /**
268 * wait_event_interruptible - sleep until a condition gets true
269 * @wq: the waitqueue to wait on
270 * @condition: a C expression for the event to wait for
271 *
272 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
273 * @condition evaluates to true or a signal is received.
274 * The @condition is checked each time the waitqueue @wq is woken up.
275 *
276 * wake_up() has to be called after changing any variable that could
277 * change the result of the wait condition.
278 *
279 * The function will return -ERESTARTSYS if it was interrupted by a
280 * signal and 0 if @condition evaluated to true.
281 */
282 #define wait_event_interruptible(wq, condition) \
283 ({ \
284 int __ret = 0; \
285 if (!(condition)) \
286 __wait_event_interruptible(wq, condition, __ret); \
287 __ret; \
288 })
249 #define __wait_event_interruptible(wq, condition, ret) \
250 do { \
251 DEFINE_WAIT(__wait); \
252 \
253 for (;;) { \
254 prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
255 if (condition) \
256 break; \
257 if (!signal_pending(current)) { \
258 schedule(); \
259 continue; \
260 } \
261 ret = -ERESTARTSYS; \
262 break; \
263 } \
264 finish_wait(&wq, &__wait); \
265 } while (0)
一。用户态进程发信号给其它进程
对于 kill sigqueue 这类用户进程通过系统调用发送的信号,最终都是调用 do_send_sig_info() 系列,
所以只需走一遍 send_sig(),man 7 signal 对照
看看kill在kernel中的实现,man 2 kill 对照。
$ vi -t send_sig 对应kill的kernel source就在这个文件里:
/*
 * kill(2) entry point: fills a struct siginfo marked SI_USER with the
 * sender's identity and passes it to kill_something_info(), whose result
 * is returned to the caller.
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
/* Describe the signal as user-originated */
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
/* Record who sent it: the caller's id from task_tgid_vnr() and its uid */
info.si_pid = task_tgid_vnr(current);
info.si_uid = current_uid();
return kill_something_info(sig, &info, pid); // will call do_send_sig_info()
}
send_sig()更改接收信号的进程的task_struct中的相关field,struct sigqueue等,设置 TIF_SIGPENDING。
调用try_to_wake_up()唤醒目标进程,比如处于 TASK_INTERRUPTIBLE 状态的进程,看能否设置
TIF_NEEDRESCHED ,能设置就设置。这样current从kernel mode返回到user mode时,如果目标进程比
current优先级高,就会发生调度,目标进程开始执行,当返回到user mode时就会处理该信号。
过程挺多,必须要先理解schedule原理才行。平时kill一个进程很快就没了,以为对信号的处理是实时的。
看了源码就知道对信号的处理并不是实时的。
send_sig() -> send_sig_info() -> do_send_sig_info() -> send_signal() -> __send_signal():
864 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
865 int group, int from_ancestor_ns)
866 {
867 struct sigpending *pending;
868 struct sigqueue *q;
869 int override_rlimit;
870
871 trace_signal_generate(sig, info, t);
872
873 assert_spin_locked(&t->sighand->siglock);
874
875 if (!prepare_signal(sig, t, from_ancestor_ns))
876 return 0;
877
878 pending = group ? &t->signal->shared_pending : &t->pending;
879 /*
880 * Short-circuit ignored signals and support queuing
881 * exactly one non-rt signal, so that we can get more
882 * detailed information about the cause of the signal.
883 */
884 if (legacy_queue(pending, sig))
885 return 0;
886 /*
887 * fast-pathed signals for kernel-internal things like SIGSTOP
888 * or SIGKILL.
889 */
890 if (info == SEND_SIG_FORCED)
891 goto out_set;
892
893 /* Real-time signals must be queued if sent by sigqueue, or
894 some other real-time mechanism. It is implementation
895 defined whether kill() does so. We attempt to do so, on
896 the principle of least surprise, but since kill is not
897 allowed to fail with EAGAIN when low on memory we just
898 make sure at least one signal gets delivered and don't
899 pass on the info struct. */
900
901 if (sig < SIGRTMIN)
902 override_rlimit = (is_si_special(info) || info->si_code >= 0);
903 else
904 override_rlimit = 0;
905
906 q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
907 override_rlimit);
908 if (q) {
909 list_add_tail(&q->list, &pending->list);
910 switch ((unsigned long) info) {
911 case (unsigned long) SEND_SIG_NOINFO:
912 q->info.si_signo = sig;
913 q->info.si_errno = 0;
914 q->info.si_code = SI_USER;
915 q->info.si_pid = task_tgid_nr_ns(current,
916 task_active_pid_ns(t));
917 q->info.si_uid = current_uid();
918 break;
919 case (unsigned long) SEND_SIG_PRIV:
920 q->info.si_signo = sig;
921 q->info.si_errno = 0;
922 q->info.si_code = SI_KERNEL;
923 q->info.si_pid = 0;
924 q->info.si_uid = 0;
925 break;
926 default:
927 copy_siginfo(&q->info, info);
928 if (from_ancestor_ns)
929 q->info.si_pid = 0;
930 break;
931 }
932 } else if (!is_si_special(info)) {
933 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
934 /*
935 * Queue overflow, abort. We may abort if the
936 * signal was rt and sent by user using something
937 * other than kill().
938 */
939 trace_signal_overflow_fail(sig, group, info);
940 return -EAGAIN;
941 } else {
942 /*
943 * This is a silent loss of information. We still
944 * send the signal, but the *info bits are lost.
945 */
946 trace_signal_lose_info(sig, group, info);
947 }
948 }
949
950 out_set:
951 signalfd_notify(t, sig);
952 sigaddset(&pending->signal, sig);
953 complete_signal(sig, t, group);
954 return 0;
955 }
prepare_signal()是处理SIGSTOP/SIGCONT SIG_IGN的,不进去。
分配struct sigqueue,填充info,保存产生的信号信息。将来处理信号的时候会到这来找。
/* One queued signal instance, allocated by __send_signal() and released by collect_signal(). */
struct sigqueue {
/* Links the entry into the list of a struct sigpending */
struct list_head list;
/* NOTE(review): meaning of the flag bits is not visible here - confirm */
int flags;
/* Details of the signal: si_signo, si_errno, si_code, si_pid, si_uid, ... */
siginfo_t info;
/* NOTE(review): presumably the user charged for this entry - confirm */
struct user_struct *user;
};
由SEND_SIG_NOINFO SI_KERNEL 找到一些定义:
/*
 * Special values passed where a struct siginfo pointer is expected.
 * __send_signal() switches on them: NOINFO is filled in as SI_USER with the
 * sender's pid/uid, PRIV as SI_KERNEL with pid/uid 0, and FORCED skips
 * queue allocation altogether.
 */
#define SEND_SIG_NOINFO ((struct siginfo *) 0)
#define SEND_SIG_PRIV ((struct siginfo *) 1)
#define SEND_SIG_FORCED ((struct siginfo *) 2)
/*
* si_code values
* Digital reserves positive values for kernel-generated signals.
*
* Each value records where a signal came from; the per-value remarks below
* name the sender.
*/
#define SI_USER 0 /* sent by kill, sigsend, raise */
#define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
#define SI_QUEUE -1 /* sent by sigqueue */
#define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */
#define SI_ASYNCIO -4 /* sent by AIO completion */
#define SI_SIGIO -5 /* sent by queued SIGIO */
#define SI_TKILL -6 /* sent by tkill system call */
#define SI_DETHREAD -7 /* sent by execve() killing subsidiary threads */
由 SI_KERNEL 注释看出,有些信号是kernel本身发出的。
末尾的signalfd_notify()和complete_signal()都会调用try_to_wake_up()
complete_signal()会设置 TIF_SIGPENDING。
需要注意的是,即使信号是SIGKILL,也不能立即杀掉目标进程,还是要等目标进程被调度执行,然后
从kernel mode 返回 user mode 时才能杀掉。
__send_signal() -> complete_signal()
783 static void complete_signal(int sig, struct task_struct *p, int group)
784 {
785 struct signal_struct *signal = p->signal;
786 struct task_struct *t;
787
788 /*
789 * Now find a thread we can wake up to take the signal off the queue.
790 *
791 * If the main thread wants the signal, it gets first crack.
792 * Probably the least surprising to the average bear.
793 */
794 if (wants_signal(sig, p))
795 t = p;
796 else if (!group || thread_group_empty(p))
797 /*
798 * There is just one thread and it does not need to be woken.
799 * It will dequeue unblocked signals before it runs again.
800 */
801 return;
802 else {
803 /*
804 * Otherwise try to find a suitable thread.
805 */
806 t = signal->curr_target;
807 while (!wants_signal(sig, t)) {
808 t = next_thread(t);
809 if (t == signal->curr_target)
810 /*
811 * No thread needs to be woken.
812 * Any eligible threads will see
813 * the signal in the queue soon.
814 */
815 return;
816 }
817 signal->curr_target = t;
818 }
819
820 /*
821 * Found a killable thread. If the signal will be fatal,
822 * then start taking the whole group down immediately.
823 */
824 if (sig_fatal(p, sig) &&
825 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
826 !sigismember(&t->real_blocked, sig) &&
827 (sig == SIGKILL ||
828 !tracehook_consider_fatal_signal(t, sig))) {
829 /*
830 * This signal will be fatal to the whole group.
831 */
832 if (!sig_kernel_coredump(sig)) {
833 /*
834 * Start a group exit and wake everybody up.
835 * This way we don't have other threads
836 * running and doing things after a slower
837 * thread has the fatal signal pending.
838 */
839 signal->flags = SIGNAL_GROUP_EXIT;
840 signal->group_exit_code = sig;
841 signal->group_stop_count = 0;
842 t = p;
843 do {
844 sigaddset(&t->pending.signal, SIGKILL);
845 signal_wake_up(t, 1); // cant terminate process immediately
846 } while_each_thread(p, t);
847 return;
848 }
849 }
850
851 /*
852 * The signal is already in the shared-pending queue.
853 * Tell the chosen thread to wake up and dequeue it.
854 */
855 signal_wake_up(t, sig == SIGKILL);
856 return;
857 }
/*
 * Decide whether thread P should be chosen to take SIG.
 *
 * Returns 0 if P blocks SIG or is exiting, 1 for SIGKILL otherwise, 0 if P
 * is stopped or traced, and otherwise 1 when P is currently running or has
 * no signal pending yet.
 *
 * Once every thread has been tested, a thread that does not block SIG can
 * only have been turned down because it is not running and already has
 * signals pending.  Such a thread dequeues from the shared queue as soon as
 * it is available, so leaving SIG on the shared queue is equivalent to
 * sending it to that thread.
 */
static inline int wants_signal(int sig, struct task_struct *p)
{
	/* A thread that blocks the signal or is on its way out is never picked. */
	if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
		return 0;

	/* SIGKILL is checked before the stopped/traced test below. */
	if (sig == SIGKILL)
		return 1;

	if (task_is_stopped_or_traced(p))
		return 0;

	/* Running right now, or nothing pending yet. */
	if (task_curr(p))
		return 1;
	return !signal_pending(p);
}
__send_signal() -> complete_signal() -> signal_wake_up()
528 /*
529 * Tell a process that it has a new active signal..
530 *
531 * NOTE! we rely on the previous spin_lock to
532 * lock interrupts for us! We can only be called with
533 * "siglock" held, and the local interrupt must
534 * have been disabled when that got acquired!
535 *
536 * No need to set need_resched since signal event passing
537 * goes through ->blocked
538 */
539 void signal_wake_up(struct task_struct *t, int resume)
540 {
541 unsigned int mask;
542
543 set_tsk_thread_flag(t, TIF_SIGPENDING); // here
544
545 /*
546 * For SIGKILL, we want to wake it up in the stopped/traced/killable
547 * case. We don't check t->state here because there is a race with it
548 * executing another processor and just now entering stopped state.
549 * By using wake_up_state, we ensure the process will wake up and
550 * handle its death signal.
551 */
552 mask = TASK_INTERRUPTIBLE;
553 if (resume)
554 mask |= TASK_WAKEKILL;
555 if (!wake_up_state(t, mask)) // try_to_wake_up()
556 kick_process(t);
557 }
目标进程对信号的检测和处理:
假设目标进程已被scheduler选中,当目标进程从 kernel mode 返回到 user mode 时,会检验到
TIF_SIGPENDING,从而进行处理。
350 ret_from_exception:
351 preempt_stop(CLBR_ANY)
352 ret_from_intr:
353 GET_THREAD_INFO(%ebp)
354 check_userspace:
355 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
356 movb PT_CS(%esp), %al
357 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
358 cmpl $USER_RPL, %eax
359 jb resume_kernel # not returning to v8086 or userspace
360
361 ENTRY(resume_userspace)
362 LOCKDEP_SYS_EXIT
363 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
364 # setting need_resched or sigpending
365 # between sampling and the iret
366 TRACE_IRQS_OFF
367 movl TI_flags(%ebp), %ecx
368 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
369 # int/exception return?
370 jne work_pending
371 jmp restore_all
372 END(ret_from_exception)
......
638 work_pending:
639 testb $_TIF_NEED_RESCHED, %cl
640 jz work_notifysig
641 work_resched:
642 call schedule
643 LOCKDEP_SYS_EXIT
644 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
645 # setting need_resched or sigpending
646 # between sampling and the iret
647 TRACE_IRQS_OFF
648 movl TI_flags(%ebp), %ecx
649 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
650 # than syscall tracing?
651 jz restore_all
652 testb $_TIF_NEED_RESCHED, %cl
653 jnz work_resched
654
655 work_notifysig: # deal with pending signals and
656 # notify-resume requests
657 #ifdef CONFIG_VM86
658 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
659 movl %esp, %eax
660 jne work_notifysig_v86 # returning to kernel-space or
661 # vm86-space
662 xorl %edx, %edx
663 call do_notify_resume // here
664 jmp resume_userspace_sig
665
666 ALIGN
667 work_notifysig_v86:
......
834 /*
835 * notification of userspace execution resumption
836 * - triggered by the TIF_WORK_MASK flags
837 */
838 void
839 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
840 {
841 #ifdef CONFIG_X86_MCE
842 /* notify userspace of pending MCEs */
843 if (thread_info_flags & _TIF_MCE_NOTIFY)
844 mce_notify_process();
845 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
846
847 /* deal with pending signal delivery */
848 if (thread_info_flags & _TIF_SIGPENDING)
849 do_signal(regs);
850
851 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
852 clear_thread_flag(TIF_NOTIFY_RESUME);
853 tracehook_notify_resume(regs);
854 if (current->replacement_session_keyring)
855 key_replace_session_keyring();
856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
859
860 #ifdef CONFIG_X86_32
861 clear_thread_flag(TIF_IRET);
862 #endif /* CONFIG_X86_32 */
863 }
do_notify_resume() -> do_signal()
764 /*
765 * Note that 'init' is a special process: it doesn't get signals it doesn't
766 * want to handle. Thus you cannot kill init even with a SIGKILL even by
767 * mistake.
768 */
769 static void do_signal(struct pt_regs *regs)
770 {
771 struct k_sigaction ka;
772 siginfo_t info;
773 int signr;
774 sigset_t *oldset;
775
776 /*
777 * We want the common case to go fast, which is why we may in certain
778 * cases get here from kernel mode. Just return without doing anything
779 * if so.
780 * X86_32: vm86 regs switched out by assembly code before reaching
781 * here, so testing against kernel CS suffices.
782 */
783 if (!user_mode(regs))
784 return;
785
786 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
787 oldset = &current->saved_sigmask;
788 else
789 oldset = &current->blocked;
790
791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
792 if (signr > 0) {
793 /* Whee! Actually deliver the signal. */
794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
795 /*
796 * A signal was successfully delivered; the saved
797 * sigmask will have been stored in the signal frame,
798 * and will be restored by sigreturn, so we can simply
799 * clear the TS_RESTORE_SIGMASK flag.
800 */
801 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
802 }
803 return;
804 }
805
806 /* Did we come from a system call? */
807 if (syscall_get_nr(current, regs) >= 0) {
808 /* Restart the system call - no handlers present */
809 switch (syscall_get_error(current, regs)) {
810 case -ERESTARTNOHAND:
811 case -ERESTARTSYS:
812 case -ERESTARTNOINTR:
813 regs->ax = regs->orig_ax;
814 regs->ip -= 2;
815 break;
816
817 case -ERESTART_RESTARTBLOCK:
818 regs->ax = NR_restart_syscall;
819 regs->ip -= 2;
820 break;
821 }
822 }
823
824 /*
825 * If there's no signal to deliver, we just put the saved sigmask
826 * back.
827 */
828 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
829 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
830 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
831 }
832 }
get_signal_to_deliver() 从进程的信号队列中取得未被屏蔽的信号,能处理的就处理。
剩下的比如用户自己定义了signal_handler(),就要由handle_signal()处理。
do_signal() -> get_signal_to_deliver()
1807 int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1808 struct pt_regs *regs, void *cookie)
1809 {
1810 struct sighand_struct *sighand = current->sighand;
1811 struct signal_struct *signal = current->signal;
1812 int signr;
1813
1814 relock:
1815 /*
1816 * We'll jump back here after any time we were stopped in TASK_STOPPED.
1817 * While in TASK_STOPPED, we were considered "frozen enough".
1818 * Now that we woke up, it's crucial if we're supposed to be
1819 * frozen that we freeze now before running anything substantial.
1820 */
1821 try_to_freeze();
1822
1823 spin_lock_irq(&sighand->siglock);
1824 /*
1825 * Every stopped thread goes here after wakeup. Check to see if
1826 * we should notify the parent, prepare_signal(SIGCONT) encodes
1827 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1828 */
1829 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1830 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1831 ? CLD_CONTINUED : CLD_STOPPED;
1832 signal->flags &= ~SIGNAL_CLD_MASK;
1833
1834 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1835 spin_unlock_irq(&sighand->siglock);
1836
1837 if (why) {
1838 read_lock(&tasklist_lock);
1839 do_notify_parent_cldstop(current->group_leader, why);
1840 read_unlock(&tasklist_lock);
1841 }
1842 goto relock;
1843 }
1844
1845 for (;;) {
1846 struct k_sigaction *ka;
1847 /*
1848 * Tracing can induce an artifical signal and choose sigaction.
1849 * The return value in @signr determines the default action,
1850 * but @info->si_signo is the signal number we will report.
1851 */
1852 signr = tracehook_get_signal(current, regs, info, return_ka);
1853 if (unlikely(signr < 0))
1854 goto relock;
1855 if (unlikely(signr != 0))
1856 ka = return_ka;
1857 else {
1858 if (unlikely(signal->group_stop_count > 0) &&
1859 do_signal_stop(0))
1860 goto relock;
1861
1862 signr = dequeue_signal(current, &current->blocked,
1863 info);
1864
1865 if (!signr)
1866 break; /* will return 0 */
1867
1868 if (signr != SIGKILL) {
1869 signr = ptrace_signal(signr, info,
1870 regs, cookie);
1871 if (!signr)
1872 continue;
1873 }
1874
1875 ka = &sighand->action[signr-1];
1876 }
1877
1878 /* Trace actually delivered signals. */
1879 trace_signal_deliver(signr, info, ka);
1880
1881 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1882 continue;
1883 if (ka->sa.sa_handler != SIG_DFL) {
1884 /* Run the handler. */
1885 *return_ka = *ka;
1886
1887 if (ka->sa.sa_flags & SA_ONESHOT)
1888 ka->sa.sa_handler = SIG_DFL;
1889
1890 break; /* will return non-zero "signr" value */
1891 }
1892
1893 /*
1894 * Now we are doing the default action for this signal.
1895 */
1896 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1897 continue;
1898
1899 /*
1900 * Global init gets no signals it doesn't want.
1901 * Container-init gets no signals it doesn't want from same
1902 * container.
1903 *
1904 * Note that if global/container-init sees a sig_kernel_only()
1905 * signal here, the signal must have been generated internally
1906 * or must have come from an ancestor namespace. In either
1907 * case, the signal cannot be dropped.
1908 */
1909 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1910 !sig_kernel_only(signr))
1911 continue;
1912
1913 if (sig_kernel_stop(signr)) {
1914 /*
1915 * The default action is to stop all threads in
1916 * the thread group. The job control signals
1917 * do nothing in an orphaned pgrp, but SIGSTOP
1918 * always works. Note that siglock needs to be
1919 * dropped during the call to is_orphaned_pgrp()
1920 * because of lock ordering with tasklist_lock.
1921 * This allows an intervening SIGCONT to be posted.
1922 * We need to check for that and bail out if necessary.
1923 */
1924 if (signr != SIGSTOP) {
1925 spin_unlock_irq(&sighand->siglock);
1926
1927 /* signals can be posted during this window */
1928
1929 if (is_current_pgrp_orphaned())
1930 goto relock;
1931
1932 spin_lock_irq(&sighand->siglock);
1933 }
1934
1935 if (likely(do_signal_stop(info->si_signo))) {
1936 /* It released the siglock. */
1937 goto relock;
1938 }
1939
1940 /*
1941 * We didn't actually stop, due to a race
1942 * with SIGCONT or something like that.
1943 */
1944 continue;
1945 }
1946
1947 spin_unlock_irq(&sighand->siglock);
1948
1949 /*
1950 * Anything else is fatal, maybe with a core dump.
1951 */
1952 current->flags |= PF_SIGNALED;
1953
1954 if (sig_kernel_coredump(signr)) { // its true for SIGBUS
1955 if (print_fatal_signals)
1956 print_fatal_signal(regs, info->si_signo);
1957 /*
1958 * If it was able to dump core, this kills all
1959 * other threads in the group and synchronizes with
1960 * their demise. If we lost the race with another
1961 * thread getting here, it set group_exit_code
1962 * first and our do_group_exit call below will use
1963 * that value and ignore the one we pass it.
1964 */
1965 do_coredump(info->si_signo, info->si_signo, regs);
1966 }
1967
1968 /*
1969 * Death signals, no core dump.
1970 */
1971 do_group_exit(info->si_signo);
1972 /* NOTREACHED */
1973 }
1974 spin_unlock_irq(&sighand->siglock);
1975 return signr;
1976 }
dequeue_signal()每次提取一个未被屏蔽的信号sig,从struct sigqueue取得info,释放对应sig的sigqueue。
取得 struct sigaction:
1875 ka = &sighand->action[signr-1];
然后根据信号的disposition分三种情况处理: SIG_IGN、用户自定义handler(!SIG_DFL)、SIG_DFL。参见 man 7 signal
Signal Dispositions
Term Default action is to terminate the process.
Ign Default action is to ignore the signal.
Core Default action is to terminate the process and dump core (see core(5)).
Stop Default action is to stop the process.
Cont Default action is to continue the process if it is currently stopped.
/*
 * Classification of signals by their default action.  Each sig_kernel_*()
 * test is true only for a non-realtime signal (sig < SIGRTMIN) that is a
 * member of the matching mask defined further down.
 */
#define sig_kernel_only(sig) \
(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
#define sig_kernel_coredump(sig) \
(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
#define sig_kernel_ignore(sig) \
(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK))
#define sig_kernel_stop(sig) \
(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
/* True when task t has a handler for signr that is neither SIG_DFL nor SIG_IGN. */
#define sig_user_defined(t, signr) \
(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
/*
 * True when signr is left at SIG_DFL in task t and is in neither the ignore
 * mask nor the stop mask.
 */
#define sig_fatal(t, signr) \
(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
(t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
/* Signals whose action user space cannot change (see do_sigaction()). */
#define SIG_KERNEL_ONLY_MASK (\
rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))
/* Signals whose default action is to stop the process. */
#define SIG_KERNEL_STOP_MASK (\
rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \
rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) )
/* Signals whose default action is to terminate and dump core. */
#define SIG_KERNEL_COREDUMP_MASK (\
rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \
rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \
rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \
rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
SIGEMT_MASK )
/* Signals whose default action is to do nothing. */
#define SIG_KERNEL_IGNORE_MASK (\
rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \
rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) )
如果用户程序中没有对 SIGCHLD 设置sig_handler(),则其disposition仍是SIG_DFL;由于SIGCHLD属于sig_kernel_ignore类型,默认动作就是忽略,效果等同于SIG_IGN。
而处理 SIGKILL 时将执行 1971 do_group_exit(info->si_signo);
do_group_exit()最后调用do_exit()结束自己的生命,对SIGKILL的处理就结束了,符合man 7 signal。
而对于 SIGBUS SIGSEGV 等 sig_kernel_coredump() 类型,就要do_coredump()生成core文件了。
dequeue_signal() -> __dequeue_signal()
/*
 * Take the next deliverable signal off @pending.
 *
 * @pending: queue to search
 * @mask:    mask handed to next_signal() to pick the first available signal
 * @info:    filled in by collect_signal() for the signal that is returned
 *
 * Returns the signal number, or 0 when nothing is available or when the
 * task's notifier callback declined the signal.
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info)
{
	int signr = next_signal(pending, mask); // Given the mask, find the first available signal

	if (!signr)
		return 0;

	/*
	 * A registered notifier is consulted for signals in its mask; a zero
	 * return from it clears TIF_SIGPENDING and leaves the signal queued.
	 */
	if (current->notifier &&
	    sigismember(current->notifier_mask, signr) &&
	    !(current->notifier)(current->notifier_data)) {
		clear_thread_flag(TIF_SIGPENDING);
		return 0;
	}

	collect_signal(signr, pending, info);
	return signr;
}
dequeue_signal() -> __dequeue_signal() -> collect_signal()
/*
 * Remove one instance of @sig from @list and report it through @info.
 *
 * @sig:  signal number being dequeued
 * @list: pending set (bitmask plus queue of struct sigqueue entries)
 * @info: receives the siginfo of the first matching queue entry, or a
 *        synthesized SI_USER record when no entry is queued
 */
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
/*
* Collect the siginfo appropriate to this signal. Check if
* there is another siginfo for the same signal.
*/
list_for_each_entry(q, &list->list, list) {
if (q->info.si_signo == sig) {
/* A second match: jump past sigdelset() so the pending bit stays set */
if (first)
goto still_pending;
first = q;
}
}
/* Reached only when at most one entry matched: clear the pending bit */
sigdelset(&list->signal, sig);
if (first) {
still_pending:
/* Unlink the first match, hand its siginfo to the caller, free the entry */
list_del_init(&first->list);
copy_siginfo(info, &first->info);
__sigqueue_free(first);
} else {
/* Ok, it wasn't in the queue. This must be
a fast-pathed signal or we must have been
out of queue space. So zero out the info.
*/
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
info->si_pid = 0;
info->si_uid = 0;
}
}
handle_signal()设置各类context:调用put_user()把kernel stack中的内容保存到user stack中,然后修
改某些kernel stack中的register值,为用户态sig_handler()的运行做准备。这里就不看细节了。
handle_signal() -> setup_rt_frame() -> ia32_setup_frame()
/*
 * ia32_setup_frame - build the signal frame for a 32-bit handler on the user
 * stack and redirect the saved registers to that handler.
 *
 * @sig:  signal number being delivered
 * @ka:   action for @sig (handler address, flags, optional restorer)
 * @set:  signal mask stored in the frame (word 0 in the sigcontext, word 1
 *        in frame->extramask)
 * @regs: saved user registers; rewritten so that the return to user mode
 *        starts executing the handler with the frame as its stack
 *
 * Returns 0 on success, or -EFAULT when the frame cannot be written.
 */
int ia32_setup_frame(int sig, struct k_sigaction *ka, compat_sigset_t *set, struct pt_regs *regs)
{
	struct sigframe_ia32 __user *frame;
	void __user *restorer;
	int err = 0;
	void __user *fpstate = NULL;

	/* copy_to_user optimizes that into a single 8 byte store */
	static const struct {
		u16 poplmovl;
		u32 val;
		u16 int80;
	} __attribute__((packed)) code = {
		0xb858,		 /* popl %eax ; movl $...,%eax */
		__NR_ia32_sigreturn,
		0x80cd,		/* int $0x80 */
	};

	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); // determine user sp

	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
		return -EFAULT;

	if (__put_user(sig, &frame->sig))
		return -EFAULT;

	/* Save the interrupted register context into the user-space frame. */
	if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) // here
		return -EFAULT;

	if (_COMPAT_NSIG_WORDS > 1) {
		if (__copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)))
			return -EFAULT;
	}

	/*
	 * Pick the address the handler returns to: the user-supplied
	 * restorer, the vdso sigreturn stub, or the stub stored in the frame.
	 */
	if (ka->sa.sa_flags & SA_RESTORER) {
		restorer = ka->sa.sa_restorer;
	} else {
		/* Return stub is in 32bit vsyscall page */
		if (current->mm->context.vdso)
			restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
		else
			restorer = &frame->retcode;
	}

	put_user_try {
		put_user_ex(ptr_to_compat(restorer), &frame->pretcode);

		/*
		 * These are actually not used anymore, but left because some
		 * gdb versions depend on them as a marker.
		 */
		put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
	} put_user_catch(err);

	if (err)
		return -EFAULT;

	/* Set up registers for signal handler */
	regs->sp = (unsigned long) frame;
	regs->ip = (unsigned long) ka->sa.sa_handler;

	/* Make -mregparm=3 work */
	regs->ax = sig;
	regs->dx = 0;
	regs->cx = 0;

	loadsegment(ds, __USER32_DS);
	loadsegment(es, __USER32_DS);

	regs->cs = __USER32_CS;
	regs->ss = __USER32_DS;

	return 0;
}
handle_signal()之后就层层返回到汇编代码:
678 call do_notify_resume
679 jmp resume_userspace_sig
#define resume_userspace_sig check_userspace
在check_userspace内会执行jmp restore_all将进程的kernel stack内容恢复到各个register中,从而开始
在用户态执行用户自定义的sig_handler(),
执行完sig_handler()之后再调用sigreturn进入kernel mode。sigreturn先做一些设置,然后执行
restore_sigcontext() 用get_user()恢复kernel stack。最后返回到user mode继续执行用户程序。
sys_sigreturn()对应sigreturn,这里不看了。
二。SI_KERNEL 表示kernel发信号,使用的是 force_sig() 系列。
比如do_page_fault()中,handle_mm_fault()返回后会检查,如果发现 VM_FAULT_ERROR 就要处理错误了。
处理错误的过程中kernel就可能会发信号。
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
dotraplinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
....
fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address);
}
check_v8086_mode(regs, address, tsk);
up_read(&mm->mmap_sem);
}
do_page_fault() -> mm_fault_error()
/*
 * Dispatch a failed handle_mm_fault() result.
 *
 * @regs, @error_code, @address: fault context, passed through unchanged
 * @fault: VM_FAULT_* bits returned by handle_mm_fault()
 *
 * VM_FAULT_OOM goes to out_of_memory(); VM_FAULT_SIGBUS or VM_FAULT_HWPOISON
 * goes to do_sigbus(); any other value is a BUG().
 */
static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code,
				    unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM) {
		out_of_memory(regs, error_code, address); // call OOM killer
		return;
	}

	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) {
		do_sigbus(regs, error_code, address, fault);
		return;
	}

	BUG();
}
这里用上了 OOM Killer。
当系统实在没有内存可用时,会调用 OOM Killer 杀掉一个进程,希望之后能有空余内存。比如用户程序用
malloc()申请了很多内存,malloc()通过mmap()给进程分配了vma等虚拟存储结构,并不分配物理内存。即使
此时物理内存已经很少,并不能满足申请。这样当程序运行期间通过page fault获取物理内存时,无物理内
存可用,就会试图调用 OOM Killer 杀死一个进程。kernel发 SIGKILL 信号,通过force_sig(SIGKILL, p);
实现。但 OOM Killer 几乎没用:由于kernel thread 没有用户空间,也就没有返回到用户态的机会,所以
不选择杀kernel thread,要杀用户进程。而如果被选择的进程运行于user mode,则必须等它以某种方式进
入kernel mode,然后返回到user mode时,才能处理SIGKILL,被杀掉,不具实时性。
因此在用户态用 kill -9 杀kernel thread是没用的。
$ ps aux |less 带[]的是kernel thread
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.0 0.0 2028 580 ? Ss Apr22 0:00 /sbin/init
root 2 0.0 0.0 0 0 ? S< Apr22 0:00 [kthreadd]
...
root 806 0.0 0.1 35680 1084 ? Sl Apr22 0:00 /sbin/rsyslogd -c 4
....
root 13168 0.0 0.0 0 0 ? S 12:34 0:00 [pdflush]
root 13169 0.0 0.0 0 0 ? S 12:35 0:00 [pdflush]
...
kthreadd 和 pdflush 都是kernel thread,而 rsyslogd 和 klogd 不是,虽然它们四个都是daemon。
继续看另一种情况 do_sigbus()
/*
 * do_sigbus - handle a page fault that ended in VM_FAULT_SIGBUS or
 * VM_FAULT_HWPOISON.
 *
 * @regs:       register state at the time of the fault
 * @error_code: x86 page-fault error code (PF_* bits)
 * @address:    faulting address
 * @fault:      VM_FAULT_* result from handle_mm_fault()
 *
 * A kernel-mode fault is handed to no_context() and nothing more is done.
 * A user-mode fault records the fault details in the task and forces SIGBUS
 * on current.
 */
static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
		      unsigned int fault)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = BUS_ADRERR;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & PF_USER)) {
		no_context(regs, error_code, address); // fixup_exception()
		/*
		 * no_context() can return (e.g. after a successful exception
		 * fixup); stop here so a kernel-mode fault does not fall
		 * through and send SIGBUS to the current task.
		 */
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & VM_FAULT_HWPOISON) {
		printk(KERN_ERR
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk);
}
/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
*/
/* Bit flags of the x86 page-fault error code. */
enum x86_pf_error_code {
PF_PROT = 1 << 0, /* set: protection fault; clear: no page found */
PF_WRITE = 1 << 1, /* set: write access; clear: read access */
PF_USER = 1 << 2, /* set: user-mode access; clear: kernel-mode access */
PF_RSVD = 1 << 3, /* set: use of reserved bit detected */
PF_INSTR = 1 << 4, /* set: fault was an instruction fetch */
};
kernel fault用fixup_exception查表的方式来处理; user fault 就给current发 SIGBUS 信号。
由 man 7 signal 可知,SIGBUS 表示 Bus error (bad memory access)。
Signal Dispositions Core Default action is to terminate the process and dump core
接下来force_sig_info_fault()最终也是调用 send_signal()
/*
 * Build a siginfo for a memory fault and force it on @tsk.
 *
 * @si_signo: signal to raise
 * @si_code:  reason code stored in the siginfo
 * @address:  faulting address, reported as si_addr
 * @tsk:      task that receives the signal
 */
static void force_sig_info_fault(int si_signo, int si_code, unsigned long address,
				 struct task_struct *tsk)
{
	siginfo_t fault_info;
	int addr_lsb = 0;

	/* Only BUS_MCEERR_AR carries a granularity: PAGE_SHIFT. */
	if (si_code == BUS_MCEERR_AR)
		addr_lsb = PAGE_SHIFT;

	fault_info.si_signo = si_signo;
	fault_info.si_errno = 0;
	fault_info.si_code = si_code;
	fault_info.si_addr = (void __user *)address;
	fault_info.si_addr_lsb = addr_lsb;

	force_sig_info(si_signo, &fault_info, tsk);
}
1029 /*
1030 * Force a signal that the process can't ignore: if necessary
1031 * we unblock the signal and change any SIG_IGN to SIG_DFL.
1032 *
1033 * Note: If we unblock the signal, we always reset it to SIG_DFL,
1034 * since we do not want to have a signal handler that was blocked
1035 * be invoked when user space had explicitly blocked it.
1036 *
1037 * We don't want to have recursive SIGSEGV's etc, for example,
1038 * that is why we also clear SIGNAL_UNKILLABLE.
1039 */
1040 int
1041 force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1042 {
1043 unsigned long int flags;
1044 int ret, blocked, ignored;
1045 struct k_sigaction *action;
1046
1047 spin_lock_irqsave(&t->sighand->siglock, flags);
1048 action = &t->sighand->action[sig-1];
1049 ignored = action->sa.sa_handler == SIG_IGN;
1050 blocked = sigismember(&t->blocked, sig);
1051 if (blocked || ignored) {
1052 action->sa.sa_handler = SIG_DFL;
1053 if (blocked) {
1054 sigdelset(&t->blocked, sig);
1055 recalc_sigpending_and_wake(t);
1056 }
1057 }
1058 if (action->sa.sa_handler == SIG_DFL)
1059 t->signal->flags &= ~SIGNAL_UNKILLABLE;
1060 ret = specific_send_sig_info(sig, info, t);
1061 spin_unlock_irqrestore(&t->sighand->siglock, flags);
1062
1063 return ret;
1064 }
/*
 * Send @sig with @info to task @t through send_signal() with a final
 * argument of 0, and return send_signal()'s result.
 * NOTE(review): the 0 is presumably the "group" flag, i.e. the signal is
 * queued on the thread's own pending list - confirm against send_signal().
 */
static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
return send_signal(sig, info, t, 0);
}
force_sig()和 send_sig()的区别在于:
/* Force @sig on task @p via force_sig_info(), using SEND_SIG_PRIV as the siginfo. */
void force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, SEND_SIG_PRIV, p);
}
/*
 * Send @sig to task @p via send_sig_info() and return its result.
 * A non-zero @priv selects SEND_SIG_PRIV as the siginfo, zero selects
 * SEND_SIG_NOINFO (see __si_special()).
 */
int send_sig(int sig, struct task_struct *p, int priv)
{
return send_sig_info(sig, __si_special(priv), p);
}
#define __si_special(priv) \
((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
/* These can be the second arg to send_sig_info/send_group_sig_info. */
#define SEND_SIG_NOINFO ((struct siginfo *) 0)
#define SEND_SIG_PRIV ((struct siginfo *) 1)
#define SEND_SIG_FORCED ((struct siginfo *) 2)
在处理SIGBUS时,要进行coredump。而处理 SIGKILL时是不用coredump的,这正符合man 7 signal的说明。