When a page fault occurs on an mmap()ed region, do_page_fault() has a few extra steps to take:
On the first access to a mapped page after mmap(), whether a read or a write, the required physical page has never been brought into memory, so a physical page must be allocated; do_page_fault() therefore always takes the demand-paging path, do_linear_fault() or do_anonymous_page().
On subsequent accesses:
I) If the physical page brought in earlier has not been swapped out to backing store, the page fault is distinguished by read vs. write. A read needs no special handling; a write-triggered page fault is handled by do_wp_page(), which implements COW.
II) If the physical page brought in earlier has already been evicted, demand paging is needed again, via do_linear_fault(), do_anonymous_page(), do_nonlinear_fault() or do_swap_page(), to read the missing physical page back from disk.
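Before going into the kernel side, here is a minimal user-space sketch (not from the original post; the file name data.bin and the single-page length are assumptions) of the scenario examined in the rest of this section: the first read of the mapping triggers demand paging, and a later write to the same page faults on a present but write-protected pte, which is the do_wp_page() case described above.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("data.bin", O_RDWR);      /* hypothetical file, at least one page long */
        char *p, c;

        if (fd < 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        c = p[0];       /* 1st fault: demand paging -> do_linear_fault()/filemap_fault() */
        p[0] = c + 1;   /* 2nd fault: pte present but write-protected -> do_wp_page() */

        munmap(p, 4096);
        close(fd);
        return 0;
}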
For example, suppose mmap() creates a MAP_SHARED writable mapping (see man mmap); the kernel implements this with do_mmap(): vm_flags gets VM_SHARED | VM_WRITE, and a comment notes that some drivers drop the PROT_WRITE permission from vma->vm_page_prot. When page faults are later triggered, the first one always goes through the demand-paging path do_linear_fault(), which sets the pte from vma->vm_page_prot; the pte then lacks _PAGE_RW, which amounts to a pte_wrprotect(), much like the fork() case. On later page faults, if the page allocated earlier has not been swapped out to disk, demand paging is skipped, and if the fault was caused by a write, do_wp_page() is called. Let's see how do_wp_page() handles this case:
do_page_fault() -> handle_mm_fault() -> handle_pte_fault() -> do_wp_page()
1985 /*
1986 * This routine handles present pages, when users try to write
1987 * to a shared page. It is done by copying the page to a new address
1988 * and decrementing the shared-page counter for the old page.
1989 *
1990 * Note that this routine assumes that the protection checks have been
1991 * done by the caller (the low-level page fault routine in most cases).
1992 * Thus we can safely just mark it writable once we've done any necessary
1993 * COW.
1994 *
1995 * We also mark the page dirty at this point even though the page will
1996 * change only once the write actually happens. This avoids a few races,
1997 * and potentially makes it more efficient.
1998 *
1999 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2000 * but allow concurrent faults), with pte both mapped and locked.
2001 * We return with mmap_sem still held, but pte unmapped and unlocked.
2002 */
2003 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2004 unsigned long address, pte_t *page_table, pmd_t *pmd,
2005 spinlock_t *ptl, pte_t orig_pte)
2006 {
2007 struct page *old_page, *new_page;
2008 pte_t entry;
2009 int reuse = 0, ret = 0;
2010 int page_mkwrite = 0;
2011 struct page *dirty_page = NULL;
2012
2013 old_page = vm_normal_page(vma, address, orig_pte);
2014 if (!old_page) {
2015 /*
2016 * VM_MIXEDMAP !pfn_valid() case
2017 *
2018 * We should not cow pages in a shared writeable mapping.
2019 * Just mark the pages writable as we can't do any dirty
2020 * accounting on raw pfn maps.
2021 */
2022 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2023 (VM_WRITE|VM_SHARED))
2024 goto reuse;
2025 goto gotten;
2026 }
2027
2028 /*
2029 * Take out anonymous pages first, anonymous shared vmas are
2030 * not dirty accountable.
2031 */
2032 if (PageAnon(old_page) && !PageKsm(old_page)) {
2033 if (!trylock_page(old_page)) {
2034 page_cache_get(old_page);
2035 pte_unmap_unlock(page_table, ptl);
2036 lock_page(old_page);
2037 page_table = pte_offset_map_lock(mm, pmd, address,
2038 &ptl);
2039 if (!pte_same(*page_table, orig_pte)) {
2040 unlock_page(old_page);
2041 page_cache_release(old_page);
2042 goto unlock;
2043 }
2044 page_cache_release(old_page);
2045 }
2046 reuse = reuse_swap_page(old_page);
2047 unlock_page(old_page);
2048 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2049 (VM_WRITE|VM_SHARED))) { // here
2050 /*
2051 * Only catch write-faults on shared writable pages,
2052 * read-only shared pages can get COWed by
2053 * get_user_pages(.write=1, .force=1).
2054 */
2055 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2056 struct vm_fault vmf;
2057 int tmp;
2058
2059 vmf.virtual_address = (void __user *)(address &
2060 PAGE_MASK);
2061 vmf.pgoff = old_page->index;
2062 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2063 vmf.page = old_page;
2064
2065 /*
2066 * Notify the address space that the page is about to
2067 * become writable so that it can prohibit this or wait
2068 * for the page to get into an appropriate state.
2069 *
2070 * We do this without the lock held, so that it can
2071 * sleep if it needs to.
2072 */
2073 page_cache_get(old_page);
2074 pte_unmap_unlock(page_table, ptl);
2075
2076 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2077 if (unlikely(tmp &
2078 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2079 ret = tmp;
2080 goto unwritable_page;
2081 }
2082 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2083 lock_page(old_page);
2084 if (!old_page->mapping) {
2085 ret = 0; /* retry the fault */
2086 unlock_page(old_page);
2087 goto unwritable_page;
2088 }
2089 } else
2090 VM_BUG_ON(!PageLocked(old_page));
2091
2092 /*
2093 * Since we dropped the lock we need to revalidate
2094 * the PTE as someone else may have changed it. If
2095 * they did, we just return, as we can count on the
2096 * MMU to tell us if they didn't also make it writable.
2097 */
2098 page_table = pte_offset_map_lock(mm, pmd, address,
2099 &ptl);
2100 if (!pte_same(*page_table, orig_pte)) {
2101 unlock_page(old_page);
2102 page_cache_release(old_page);
2103 goto unlock;
2104 }
2105
2106 page_mkwrite = 1;
2107 }
2108 dirty_page = old_page;
2109 get_page(dirty_page);
2110 reuse = 1;
2111 }
2112
2113 if (reuse) {
2114 reuse:
2115 flush_cache_page(vma, address, pte_pfn(orig_pte));
2116 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry);
2120 ret |= VM_FAULT_WRITE;
2121 goto unlock;
2122 }
2123
2124 /*
2125 * Ok, we need to copy. Oh, well..
2126 */
2127 page_cache_get(old_page);
2128 gotten:
2129 pte_unmap_unlock(page_table, ptl);
2130
2131 if (unlikely(anon_vma_prepare(vma)))
2132 goto oom;
2133
2134 if (is_zero_pfn(pte_pfn(orig_pte))) {
2135 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2136 if (!new_page)
2137 goto oom;
2138 } else {
2139 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2140 if (!new_page)
2141 goto oom;
2142 cow_user_page(new_page, old_page, address, vma);
2143 }
2144 __SetPageUptodate(new_page);
2145
2146 /*
2147 * Don't let another task, with possibly unlocked vma,
2148 * keep the mlocked page.
2149 */
2150 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2151 lock_page(old_page); /* for LRU manipulation */
2152 clear_page_mlock(old_page);
2153 unlock_page(old_page);
2154 }
2155
2156 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2157 goto oom_free_new;
2158
2159 /*
2160 * Re-check the pte - we dropped the lock
2161 */
2162 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2163 if (likely(pte_same(*page_table, orig_pte))) {
2164 if (old_page) {
2165 if (!PageAnon(old_page)) {
2166 dec_mm_counter(mm, file_rss);
2167 inc_mm_counter(mm, anon_rss);
2168 }
2169 } else
2170 inc_mm_counter(mm, anon_rss);
2171 flush_cache_page(vma, address, pte_pfn(orig_pte));
2172 entry = mk_pte(new_page, vma->vm_page_prot);
2173 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2174 /*
2175 * Clear the pte entry and flush it first, before updating the
2176 * pte with the new entry. This will avoid a race condition
2177 * seen in the presence of one thread doing SMC and another
2178 * thread doing COW.
2179 */
2180 ptep_clear_flush(vma, address, page_table);
2181 page_add_new_anon_rmap(new_page, vma, address);
2182 /*
2183 * We call the notify macro here because, when using secondary
2184 * mmu page tables (such as kvm shadow page tables), we want the
2185 * new page to be mapped directly into the secondary page table.
2186 */
2187 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry);
2189 if (old_page) {
2190 /*
2191 * Only after switching the pte to the new page may
2192 * we remove the mapcount here. Otherwise another
2193 * process may come and find the rmap count decremented
2194 * before the pte is switched to the new page, and
2195 * "reuse" the old page writing into it while our pte
2196 * here still points into it and can be read by other
2197 * threads.
2198 *
2199 * The critical issue is to order this
2200 * page_remove_rmap with the ptp_clear_flush above.
2201 * Those stores are ordered by (if nothing else,)
2202 * the barrier present in the atomic_add_negative
2203 * in page_remove_rmap.
2204 *
2205 * Then the TLB flush in ptep_clear_flush ensures that
2206 * no process can access the old page before the
2207 * decremented mapcount is visible. And the old page
2208 * cannot be reused until after the decremented
2209 * mapcount is visible. So transitively, TLBs to
2210 * old page will be flushed before it can be reused.
2211 */
2212 page_remove_rmap(old_page);
2213 }
2214
2215 /* Free the old page.. */
2216 new_page = old_page;
2217 ret |= VM_FAULT_WRITE;
2218 } else
2219 mem_cgroup_uncharge_page(new_page);
2220
2221 if (new_page)
2222 page_cache_release(new_page);
2223 if (old_page)
2224 page_cache_release(old_page);
2225 unlock:
2226 pte_unmap_unlock(page_table, ptl);
2227 if (dirty_page) {
2228 /*
2229 * Yes, Virginia, this is actually required to prevent a race
2230 * with clear_page_dirty_for_io() from clearing the page dirty
2231 * bit after it clear all dirty ptes, but before a racing
2232 * do_wp_page installs a dirty pte.
2233 *
2234 * do_no_page is protected similarly.
2235 */
2236 if (!page_mkwrite) {
2237 wait_on_page_locked(dirty_page);
2238 set_page_dirty_balance(dirty_page, page_mkwrite);
2239 }
2240 put_page(dirty_page);
2241 if (page_mkwrite) {
2242 struct address_space *mapping = dirty_page->mapping;
2243
2244 set_page_dirty(dirty_page);
2245 unlock_page(dirty_page);
2246 page_cache_release(dirty_page);
2247 if (mapping) {
2248 /*
2249 * Some device drivers do not set page.mapping
2250 * but still dirty their pages
2251 */
2252 balance_dirty_pages_ratelimited(mapping);
2253 }
2254 }
2255
2256 /* file_update_time outside page_lock */
2257 if (vma->vm_file)
2258 file_update_time(vma->vm_file);
2259 }
2260 return ret;
2261 oom_free_new:
2262 page_cache_release(new_page);
2263 oom:
2264 if (old_page) {
2265 if (page_mkwrite) {
2266 unlock_page(old_page);
2267 page_cache_release(old_page);
2268 }
2269 page_cache_release(old_page);
2270 }
2271 return VM_FAULT_OOM;
2272
2273 unwritable_page:
2274 page_cache_release(old_page);
2275 return ret;
2276 }
As we can see, this too is handled with reference counting. At line 2073, page_cache_get(old_page) takes a reference; the comment explains that the pte lock is dropped so that page_mkwrite() can sleep if it needs to. After dirty_page = old_page; get_page(dirty_page); reuse = 1; the code enters reuse: and adjusts the pte permissions, and then at unlock: it does put_page(dirty_page) and page_cache_release(dirty_page). So there are two _count increments and two decrements here, which is not quite the same as the fork() path: fork() performs one standalone get_page(), whereas the ++/-- operations in do_mmap() and do_linear_fault() are all symmetric, with no unpaired increment to be found. Now let's look at do_mmap():
do_mmap() -> do_mmap_pgoff()
908 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
909 unsigned long len, unsigned long prot,
910 unsigned long flags, unsigned long pgoff)
911 {
912 struct mm_struct * mm = current->mm;
913 struct inode *inode;
914 unsigned int vm_flags;
915 int error;
916 unsigned long reqprot = prot;
917
918 /*
919 * Does the application expect PROT_READ to imply PROT_EXEC?
920 *
921 * (the exception is when the underlying filesystem is noexec
922 * mounted, in which case we dont add PROT_EXEC.)
923 */
924 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
925 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
926 prot |= PROT_EXEC;
927
928 if (!len)
929 return -EINVAL;
930
931 if (!(flags & MAP_FIXED))
932 addr = round_hint_to_min(addr);
933
934 /* Careful about overflows.. */
935 len = PAGE_ALIGN(len);
936 if (!len)
937 return -ENOMEM;
938
939 /* offset overflow? */
940 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
941 return -EOVERFLOW;
942
943 /* Too many mappings? */
944 if (mm->map_count > sysctl_max_map_count)
945 return -ENOMEM;
946
947 /* Obtain the address to map to. we verify (or select) it and ensure
948 * that it represents a valid section of the address space.
949 */
950 addr = get_unmapped_area(file, addr, len, pgoff, flags);
951 if (addr & ~PAGE_MASK)
952 return addr;
953
954 /* Do simple checking here so the lower-level routines won't have
955 * to. we assume access permissions have been handled by the open
956 * of the memory object, so we don't do any here.
957 */
958 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
959 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
960
961 if (flags & MAP_LOCKED)
962 if (!can_do_mlock())
963 return -EPERM;
964
965 /* mlock MCL_FUTURE? */
966 if (vm_flags & VM_LOCKED) {
967 unsigned long locked, lock_limit;
968 locked = len >> PAGE_SHIFT;
969 locked += mm->locked_vm;
970 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
971 lock_limit >>= PAGE_SHIFT;
972 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
973 return -EAGAIN;
974 }
975
976 inode = file ? file->f_path.dentry->d_inode : NULL;
977
978 if (file) { // file memory mapping
979 switch (flags & MAP_TYPE) {
980 case MAP_SHARED:
981 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
982 return -EACCES;
983
984 /*
985 * Make sure we don't allow writing to an append-only
986 * file..
987 */
988 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
989 return -EACCES;
990
991 /*
992 * Make sure there are no mandatory locks on the file.
993 */
994 if (locks_verify_locked(inode))
995 return -EAGAIN;
996
997 vm_flags |= VM_SHARED | VM_MAYSHARE;
998 if (!(file->f_mode & FMODE_WRITE))
999 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1000
1001 /* fall through */
1002 case MAP_PRIVATE:
1003 if (!(file->f_mode & FMODE_READ))
1004 return -EACCES;
1005 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1006 if (vm_flags & VM_EXEC)
1007 return -EPERM;
1008 vm_flags &= ~VM_MAYEXEC;
1009 }
1010
1011 if (!file->f_op || !file->f_op->mmap)
1012 return -ENODEV;
1013 break;
1014
1015 default:
1016 return -EINVAL;
1017 }
1018 } else { // Anonymous mapping
1019 switch (flags & MAP_TYPE) {
1020 case MAP_SHARED:
1021 /*
1022 * Ignore pgoff.
1023 */
1024 pgoff = 0;
1025 vm_flags |= VM_SHARED | VM_MAYSHARE;
1026 break;
1027 case MAP_PRIVATE:
1028 /*
1029 * Set pgoff according to addr for anon_vma.
1030 */
1031 pgoff = addr >> PAGE_SHIFT;
1032 break;
1033 default:
1034 return -EINVAL;
1035 }
1036 }
1037
1038 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1039 if (error)
1040 return error;
1041
1042 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1043 }
VM_SHARED is what distinguishes MAP_SHARED from MAP_PRIVATE.
All this code does is set vm_flags according to MAP_SHARED vs. MAP_PRIVATE and resolve conflicting permissions. For a disk-file mapping, for instance, the permissions of the mapped region must not conflict with the permissions the file was opened with: if the file was opened without write permission, the mapped region cannot be writable either, otherwise an error is returned (a user-space illustration follows the list below).
First the anonymous-mapping cases, which are simpler since no file is involved:
1) private writable mapping: (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE
2) shared anonymous mapping, used for IPC; here vm_flags must have VM_SHARED set
A file mapping involves disk I/O and the page cache, which is far more complex.
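A small user-space sketch (not from the original post; data.bin is a hypothetical file) of the permission check just described: a file opened O_RDONLY cannot be mapped MAP_SHARED with PROT_WRITE, and do_mmap_pgoff() returns -EACCES, while MAP_PRIVATE is still allowed because writes only go to private COW copies.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        int fd = open("data.bin", O_RDONLY);    /* hypothetical file */
        void *p;

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                printf("MAP_SHARED + PROT_WRITE on O_RDONLY fd: errno=%d (EACCES=%d)\n",
                       errno, EACCES);

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        printf("MAP_PRIVATE: %s\n", p == MAP_FAILED ? "failed" : "ok");
        return 0;
}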
do_mmap() -> do_mmap_pgoff() -> mmap_region()
1134 unsigned long mmap_region(struct file *file, unsigned long addr,
1135 unsigned long len, unsigned long flags,
1136 unsigned int vm_flags, unsigned long pgoff)
1137 {
1138 struct mm_struct *mm = current->mm;
1139 struct vm_area_struct *vma, *prev;
1140 int correct_wcount = 0;
1141 int error;
1142 struct rb_node **rb_link, *rb_parent;
1143 unsigned long charged = 0;
1144 struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
1145
1146 /* Clear old maps */
1147 error = -ENOMEM;
1148 munmap_back:
1149 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1150 if (vma && vma->vm_start < addr + len) {
1151 if (do_munmap(mm, addr, len))
1152 return -ENOMEM;
1153 goto munmap_back;
1154 }
1155
1156 /* Check against address space limit. */
1157 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1158 return -ENOMEM;
1159
1160 /*
1161 * Set 'VM_NORESERVE' if we should not account for the
1162 * memory use of this mapping.
1163 */
1164 if ((flags & MAP_NORESERVE)) {
1165 /* We honor MAP_NORESERVE if allowed to overcommit */
1166 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1167 vm_flags |= VM_NORESERVE;
1168
1169 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1170 if (file && is_file_hugepages(file))
1171 vm_flags |= VM_NORESERVE;
1172 }
1173
1174 /*
1175 * Private writable mapping: check memory availability
1176 */
1177 if (accountable_mapping(file, vm_flags)) {
1178 charged = len >> PAGE_SHIFT;
1179 if (security_vm_enough_memory(charged))
1180 return -ENOMEM;
1181 vm_flags |= VM_ACCOUNT;
1182 }
1183
1184 /*
1185 * Can we just expand an old mapping?
1186 */
1187 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1188 if (vma)
1189 goto out;
1190
1191 /*
1192 * Determine the object being mapped and call the appropriate
1193 * specific mapper. the address has already been validated, but
1194 * not unmapped, but the maps are removed from the list.
1195 */
1196 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1197 if (!vma) {
1198 error = -ENOMEM;
1199 goto unacct_error;
1200 }
1201
1202 vma->vm_mm = mm;
1203 vma->vm_start = addr;
1204 vma->vm_end = addr + len;
1205 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff;
1208
1209 if (file) {
1210 error = -EINVAL;
1211 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1212 goto free_vma;
1213 if (vm_flags & VM_DENYWRITE) {
1214 error = deny_write_access(file);
1215 if (error)
1216 goto free_vma;
1217 correct_wcount = 1;
1218 }
1219 vma->vm_file = file;
1220 get_file(file);
1221 error = file->f_op->mmap(file, vma); // generic_file_mmap()
1222 if (error)
1223 goto unmap_and_free_vma;
1224 if (vm_flags & VM_EXECUTABLE)
1225 added_exe_file_vma(mm);
1226
1227 /* Can addr have changed??
1228 *
1229 * Answer: Yes, several device drivers can do it in their
1230 * f_op->mmap method. -DaveM
1231 */
1232 addr = vma->vm_start;
1233 pgoff = vma->vm_pgoff;
1234 vm_flags = vma->vm_flags;
1235 } else if (vm_flags & VM_SHARED) { // IPC, shared anonymous mapping
1236 error = shmem_zero_setup(vma);
1237 if (error)
1238 goto free_vma;
1239 }
1240
1241 if (vma_wants_writenotify(vma)) {
1242 pgprot_t pprot = vma->vm_page_prot;
1243
1244 /* Can vma->vm_page_prot have changed??
1245 *
1246 * Answer: Yes, drivers may have changed it in their
1247 * f_op->mmap method.
1248 *
1249 * Ensures that vmas marked as uncached stay that way.
1250 */
1251 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1252 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1253 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1254 }
1255
1256 vma_link(mm, vma, prev, rb_link, rb_parent);
1257 file = vma->vm_file;
1258
1259 /* Once vma denies write, undo our temporary denial count */
1260 if (correct_wcount)
1261 atomic_inc(&inode->i_writecount);
1262 out:
1263 perf_event_mmap(vma);
1264
1265 mm->total_vm += len >> PAGE_SHIFT;
1266 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1267 if (vm_flags & VM_LOCKED) {
1268 /*
1269 * makes pages present; downgrades, drops, reacquires mmap_sem
1270 */
1271 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1272 if (nr_pages < 0)
1273 return nr_pages; /* vma gone! */
1274 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1275 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1276 make_pages_present(addr, addr + len);
1277 return addr;
1278
1279 unmap_and_free_vma:
1280 if (correct_wcount)
1281 atomic_inc(&inode->i_writecount);
1282 vma->vm_file = NULL;
1283 fput(file);
1284
1285 /* Undo any partial mapping done by a device driver. */
1286 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1287 charged = 0;
1288 free_vma:
1289 kmem_cache_free(vm_area_cachep, vma);
1290 unacct_error:
1291 if (charged)
1292 vm_unacct_memory(charged);
1293 return error;
1294 }
The rb_* variables refer to the red-black tree of vmas.
If the VM_LOCKED flag is set, physical pages are allocated and locked into memory right away, so they never have to be faulted in later. Concretely this goes through make_pages_present() -> get_user_pages(), which itself is also implemented on top of handle_mm_fault().
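A user-space sketch (not from the original post) of that path: MAP_LOCKED (or an mlock() right after mmap()) makes mmap_region() populate and pin every page before mmap() returns, instead of leaving them to later page faults. This may require CAP_IPC_LOCK or a sufficiently large RLIMIT_MEMLOCK.

#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        memset(p, 0, len);      /* no page faults here: the pages are already present */
        munmap(p, len);
        return 0;
}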
As for line 1221, error = file->f_op->mmap(file, vma);
most filesystems and block device files use generic_file_mmap():
1597 /* This is used for a general mmap of a disk file */
1598
1599 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1600 {
1601 struct address_space *mapping = file->f_mapping;
1602
1603 if (!mapping->a_ops->readpage)
1604 return -ENOEXEC;
1605 file_accessed(file);
1606 vma->vm_ops = &generic_file_vm_ops;
1607 vma->vm_flags |= VM_CAN_NONLINEAR;
1608 return 0;
1609 }
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
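By contrast, a device driver that implements its own mmap usually installs its own vm_operations_struct here instead of generic_file_vm_ops. A rough sketch against the .fault interface of this kernel generation (my_dev_* and my_page are hypothetical names; the ->fault contract requires returning the page with a reference held):

/* Hypothetical driver-side sketch: hand back a page the driver already owns. */
static struct page *my_page;

static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        get_page(my_page);              /* return the page with an extra reference, per the ->fault contract */
        vmf->page = my_page;            /* the generic fault code maps it into the pte */
        return 0;
}

static const struct vm_operations_struct my_dev_vm_ops = {
        .fault = my_dev_fault,
};

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &my_dev_vm_ops;   /* later faults are routed to my_dev_fault() */
        return 0;
}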
1465 /**
1466 * filemap_fault - read in file data for page fault handling
1467 * @vma: vma in which the fault was taken
1468 * @vmf: struct vm_fault containing details of the fault
1469 *
1470 * filemap_fault() is invoked via the vma operations vector for a
1471 * mapped memory region to read in file data during a page fault.
1472 *
1473 * The goto's are kind of ugly, but this streamlines the normal case of having
1474 * it in the page cache, and handles the special cases reasonably without
1475 * having a lot of duplicated code.
1476 */
1477 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1478 {
1479 int error;
1480 struct file *file = vma->vm_file;
1481 struct address_space *mapping = file->f_mapping;
1482 struct file_ra_state *ra = &file->f_ra;
1483 struct inode *inode = mapping->host;
1484 pgoff_t offset = vmf->pgoff;
1485 struct page *page;
1486 pgoff_t size;
1487 int ret = 0;
1488
1489 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1490 if (offset >= size)
1491 return VM_FAULT_SIGBUS;
1492
1493 /*
1494 * Do we have something in the page cache already?
1495 */
1496 page = find_get_page(mapping, offset);
1497 if (likely(page)) {
1498 /*
1499 * We found the page, so try async readahead before
1500 * waiting for the lock.
1501 */
1502 do_async_mmap_readahead(vma, ra, file, page, offset);
1503 lock_page(page);
1504
1505 /* Did it get truncated? */
1506 if (unlikely(page->mapping != mapping)) {
1507 unlock_page(page);
1508 put_page(page);
1509 goto no_cached_page;
1510 }
1511 } else {
1512 /* No page in the page cache at all */
1513 do_sync_mmap_readahead(vma, ra, file, offset);
1514 count_vm_event(PGMAJFAULT);
1515 ret = VM_FAULT_MAJOR;
1516 retry_find:
1517 page = find_lock_page(mapping, offset);
1518 if (!page)
1519 goto no_cached_page;
1520 }
1521
1522 /*
1523 * We have a locked page in the page cache, now we need to check
1524 * that it's up-to-date. If not, it is going to be due to an error.
1525 */
1526 if (unlikely(!PageUptodate(page)))
1527 goto page_not_uptodate;
1528
1529 /*
1530 * Found the page and have a reference on it.
1531 * We must recheck i_size under page lock.
1532 */
1533 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1534 if (unlikely(offset >= size)) {
1535 unlock_page(page);
1536 page_cache_release(page);
1537 return VM_FAULT_SIGBUS;
1538 }
1539
1540 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1541 vmf->page = page;
1542 return ret | VM_FAULT_LOCKED;
1543
1544 no_cached_page:
1545 /*
1546 * We're only likely to ever get here if MADV_RANDOM is in
1547 * effect.
1548 */
1549 error = page_cache_read(file, offset);
1550
1551 /*
1552 * The page we want has now been added to the page cache.
1553 * In the unlikely event that someone removed it in the
1554 * meantime, we'll just come back here and read it again.
1555 */
1556 if (error >= 0)
1557 goto retry_find;
1558
1559 /*
1560 * An error return from page_cache_read can result if the
1561 * system is low on memory, or a problem occurs while trying
1562 * to schedule I/O.
1563 */
1564 if (error == -ENOMEM)
1565 return VM_FAULT_OOM;
1566 return VM_FAULT_SIGBUS;
1567
1568 page_not_uptodate:
1569 /*
1570 * Umm, take care of errors if the page isn't up-to-date.
1571 * Try to re-read it _once_. We do this synchronously,
1572 * because there really aren't any performance issues here
1573 * and we need to check for errors.
1574 */
1575 ClearPageError(page);
1576 error = mapping->a_ops->readpage(file, page);
1577 if (!error) {
1578 wait_on_page_locked(page);
1579 if (!PageUptodate(page))
1580 error = -EIO;
1581 }
1582 page_cache_release(page);
1583
1584 if (!error || error == AOP_TRUNCATED_PAGE)
1585 goto retry_find;
1586
1587 /* Things didn't work out. Return zero to tell the mm layer so. */
1588 shrink_readahead_size_eio(file, ra);
1589 return VM_FAULT_SIGBUS;
1590 }
From .fault = filemap_fault we know that filemap_fault() is a callback; as its comment says, it is invoked when a page fault later happens on the mapping. It first looks in the page cache, and only reads from disk if the page is not there. The page structs are organized into a radix tree, which speeds up page lookup. If the page is found in the page cache, the observed access pattern decides whether to do readahead; if the page must come from disk, readahead is considered first and the page cache is probed again; if the page really is not there, page_cache_read() allocates a page, adds the new page to one of the page cache's LRU lists and reads the data from disk, after which the needed page can be found in the page cache.
After a page is allocated, the kernel puts it on an LRU list; pages on the LRU lists are the candidates for reclaim, while all other pages are either free or in active use and cannot be reclaimed.
I will not go into the readahead algorithm here, nor analyze do_generic_file_read(), the generic routine for reading data from disk.
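The page-cache lookup that find_get_page() performs is conceptually a radix tree lookup keyed by (mapping, index). A simplified sketch of the idea (kernel context; the real code takes a lock-free speculative reference, which is glossed over here):

/* Simplified idea of find_get_page(): look the index up in the mapping's radix tree. */
static struct page *page_cache_lookup(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        rcu_read_lock();
        page = radix_tree_lookup(&mapping->page_tree, index);
        if (page)
                page_cache_get(page);   /* simplification: real code uses page_cache_get_speculative() */
        rcu_read_unlock();
        return page;
}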
filemap_fault() -> page_cache_read()
1363 #ifdef CONFIG_MMU
1364 /**
1365 * page_cache_read - adds requested page to the page cache if not already there
1366 * @file: file to read
1367 * @offset: page index
1368 *
1369 * This adds the requested page to the page cache if it isn't already there,
1370 * and schedules an I/O to read in its contents from disk.
1371 */
1372 static int page_cache_read(struct file *file, pgoff_t offset)
1373 {
1374 struct address_space *mapping = file->f_mapping;
1375 struct page *page;
1376 int ret;
1377
1378 do {
1379 page = page_cache_alloc_cold(mapping); // alloc page
1380 if (!page)
1381 return -ENOMEM;
1382
1383 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1384 if (ret == 0)
1385 ret = mapping->a_ops->readpage(file, page); // read from disk
1386 else if (ret == -EEXIST)
1387 ret = 0; /* losing race to add is OK */
1388
1389 page_cache_release(page);
1390
1391 } while (ret == AOP_TRUNCATED_PAGE);
1392
1393 return ret;
1394 }
The demand-paging part of do_page_fault() handles three cases:
1) The page has never been accessed before: an anonymous mapping or a linear file mapping.
2) A nonlinear file mapping, i.e. mapped pages rearranged on top of an existing mmap; see man remap_file_pages for details.
3) Swap area: the page was accessed before, was later pushed out to the swap area, and is now being accessed again.
For case 1), the linear-file-mapping handler do_linear_fault() does indeed call the filemap_fault() shown above, and the handler for case 2), do_nonlinear_fault(), calls filemap_fault() as well.
do_linear_fault() contains the formula:
pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pgoff is the offset of address within the inode's whole address space: the offset inside the vma plus the vma's file offset (vm_pgoff).
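A quick worked example of that formula (made-up numbers, PAGE_SHIFT = 12): if vma->vm_start = 0x08000000, vma->vm_pgoff = 16 (the vma maps the file starting at byte 16 * 4096) and the faulting address is 0x080032a0, then
pgoff = ((0x08003000 - 0x08000000) >> 12) + 16 = 3 + 16 = 19
i.e. the fault is served from page index 19 of the file (byte offset 19 * 4096).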
do_page_fault()->handle_mm_fault()->handle_pte_fault()
......
2966 if (!pte_present(entry)) { // demand paging
2967 if (pte_none(entry)) {
2968 if (vma->vm_ops) {
2969 if (likely(vma->vm_ops->fault))
2970 return do_linear_fault(mm, vma, address,
2971 pte, pmd, flags, entry);
2972 }
2973 return do_anonymous_page(mm, vma, address,
2974 pte, pmd, flags);
2975 }
2976 if (pte_file(entry))
2977 return do_nonlinear_fault(mm, vma, address,
2978 pte, pmd, flags, entry);
2979 return do_swap_page(mm, vma, address,
2980 pte, pmd, flags, entry);
2981 }
.....
do_anonymous_page() handles anonymous mappings and optimizes the read case.
When a read triggers the page fault, this is the first read of the page, so it does not matter what the newly mapped page contains, but zeros are safe. For this purpose the system reserves a zero page at initialization, ready for any process to use. Here the zero page is used directly and the pte is set, so no page allocation is needed at all. If the page fault was triggered by a write, a page is allocated and the pte is set.
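A user-space sketch (not from the original post) of the two paths: the first read of a fresh anonymous mapping maps the shared zero page read-only, and the first write then allocates a real zeroed page (through the do_wp_page() zero-page path shown earlier when the read came first, or directly in do_anonymous_page() when the write comes first).

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        printf("%d\n", p[0]);   /* read fault: pte points at the zero page, no allocation */
        p[0] = 42;              /* write fault: a private zeroed page is allocated */
        munmap(p, 4096);
        return 0;
}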
do_page_fault()->handle_mm_fault()->handle_pte_fault() -> do_anonymous_page():
2647 /*
2648 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2649 * but allow concurrent faults), and pte mapped but not yet locked.
2650 * We return with mmap_sem still held, but pte unmapped and unlocked.
2651 */
2652 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2653 unsigned long address, pte_t *page_table, pmd_t *pmd,
2654 unsigned int flags)
2655 {
2656 struct page *page;
2657 spinlock_t *ptl;
2658 pte_t entry;
2659
2660 if (!(flags & FAULT_FLAG_WRITE)) {
2661 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), // zero page
2662 vma->vm_page_prot));
2663 ptl = pte_lockptr(mm, pmd);
2664 spin_lock(ptl);
2665 if (!pte_none(*page_table))
2666 goto unlock;
2667 goto setpte;
2668 }
2669
2670 /* Allocate our own private page. */
2671 pte_unmap(page_table);
2672
2673 if (unlikely(anon_vma_prepare(vma)))
2674 goto oom;
2675 page = alloc_zeroed_user_highpage_movable(vma, address);
2676 if (!page)
2677 goto oom;
2678 __SetPageUptodate(page);
2679
2680 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2681 goto oom_free_page;
2682
2683 entry = mk_pte(page, vma->vm_page_prot);
2684 if (vma->vm_flags & VM_WRITE)
2685 entry = pte_mkwrite(pte_mkdirty(entry));
2686
2687 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2688 if (!pte_none(*page_table))
2689 goto release;
2690
2691 inc_mm_counter(mm, anon_rss);
2692 page_add_new_anon_rmap(page, vma, address);
2693 setpte:
2694 set_pte_at(mm, address, page_table, entry);
2695
2696 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry);
2698 unlock:
2699 pte_unmap_unlock(page_table, ptl);
2700 return 0;
2701 release:
2702 mem_cgroup_uncharge_page(page);
2703 page_cache_release(page);
2704 goto unlock;
2705 oom_free_page:
2706 page_cache_release(page);
2707 oom:
2708 return VM_FAULT_OOM;
2709 }
Here page_add_new_anon_rmap() is responsible for setting up the page struct fields and adding the page to the appropriate LRU list.
do_anonymous_page() -> page_add_new_anon_rmap():
690 /**
691 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
692 * @page: the page to add the mapping to
693 * @vma: the vm area in which the mapping is added
694 * @address: the user virtual address mapped
695 *
696 * Same as page_add_anon_rmap but must only be called on *new* pages.
697 * This means the inc-and-test can be bypassed.
698 * Page does not have to be locked.
699 */
700 void page_add_new_anon_rmap(struct page *page,
701 struct vm_area_struct *vma, unsigned long address)
702 {
703 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
704 SetPageSwapBacked(page);
705 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
706 __inc_zone_page_state(page, NR_ANON_PAGES);
707 __page_set_anon_rmap(page, vma, address);
708 if (page_evictable(page, vma))
709 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
710 else
711 add_page_to_unevictable_list(page);
712 }
do_anonymous_page() -> page_add_new_anon_rmap() -> __page_set_anon_rmap():
/**
* __page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
static void __page_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
}
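The PAGE_MAPPING_ANON arithmetic above is what PageAnon() checks later: the low bit of page->mapping tags the pointer as an anon_vma rather than an address_space. Roughly, simplified from the headers of this kernel generation (the second helper name is hypothetical):

#define PAGE_MAPPING_ANON 1

static inline int PageAnon(struct page *page)
{
        return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/* rmap recovers the anon_vma by stripping the tag bit again: */
static inline struct anon_vma *page_anon_vma_of(struct page *page)
{
        return (struct anon_vma *)((unsigned long)page->mapping & ~PAGE_MAPPING_ANON);
}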
Finally, the case where the page lives in the swap area. Here, too, the swap cache is searched first; if the page is found it is returned. If not, a page is allocated and locked, added to the swap cache and to an LRU list, and the request is then packaged into a bio and submitted to the block device layer, where the actual disk I/O starts. The swap cache is a special page cache.
A swap area may live in its own partition or in a file; the kernel treats both the same way. A swap area consists of many page slots. If the swap area is in a file, the filesystem cannot guarantee that the file is laid out contiguously, so for performance the swap area is split into swap extents: each physically contiguous region on disk is one swap extent. If the swap area is a partition, it is already laid out contiguously, so swap extents are not really needed and there is just a single swap extent. Some servers create many swap areas; Linux supports up to 32. Even the heap allocated by malloc() can be swapped out.
do_page_fault() -> handle_mm_fault()->handle_pte_fault() -> do_swap_page()
2505 /*
2506 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2507 * but allow concurrent faults), and pte mapped but not yet locked.
2508 * We return with mmap_sem still held, but pte unmapped and unlocked.
2509 */
2510 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2511 unsigned long address, pte_t *page_table, pmd_t *pmd,
2512 unsigned int flags, pte_t orig_pte)
2513 {
2514 spinlock_t *ptl;
2515 struct page *page;
2516 swp_entry_t entry;
2517 pte_t pte;
2518 struct mem_cgroup *ptr = NULL;
2519 int ret = 0;
2520
2521 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2522 goto out;
2523
2524 entry = pte_to_swp_entry(orig_pte);
2525 if (unlikely(non_swap_entry(entry))) {
2526 if (is_migration_entry(entry)) {
2527 migration_entry_wait(mm, pmd, address);
2528 } else if (is_hwpoison_entry(entry)) {
2529 ret = VM_FAULT_HWPOISON;
2530 } else {
2531 print_bad_pte(vma, address, orig_pte, NULL);
2532 ret = VM_FAULT_SIGBUS;
2533 }
2534 goto out;
2535 }
2536 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2537 page = lookup_swap_cache(entry);
2538 if (!page) {
2539 grab_swap_token(mm); /* Contend for token _before_ read-in */
2540 page = swapin_readahead(entry,
2541 GFP_HIGHUSER_MOVABLE, vma, address);
2542 if (!page) {
2543 /*
2544 * Back out if somebody else faulted in this pte
2545 * while we released the pte lock.
2546 */
2547 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2548 if (likely(pte_same(*page_table, orig_pte)))
2549 ret = VM_FAULT_OOM;
2550 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2551 goto unlock;
2552 }
2553
2554 /* Had to read the page from swap area: Major fault */
2555 ret = VM_FAULT_MAJOR;
2556 count_vm_event(PGMAJFAULT);
2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2562 ret = VM_FAULT_HWPOISON;
2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2564 goto out_release;
2565 }
2566
2567 lock_page(page);
2568 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2569
2570 page = ksm_might_need_to_copy(page, vma, address);
2571 if (!page) {
2572 ret = VM_FAULT_OOM;
2573 goto out;
2574 }
2575
2576 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2577 ret = VM_FAULT_OOM;
2578 goto out_page;
2579 }
2580
2581 /*
2582 * Back out if somebody else already faulted in this pte.
2583 */
2584 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2585 if (unlikely(!pte_same(*page_table, orig_pte)))
2586 goto out_nomap;
2587
2588 if (unlikely(!PageUptodate(page))) {
2589 ret = VM_FAULT_SIGBUS;
2590 goto out_nomap;
2591 }
2592
2593 /*
2594 * The page isn't present yet, go ahead with the fault.
2595 *
2596 * Be careful about the sequence of operations here.
2597 * To get its accounting right, reuse_swap_page() must be called
2598 * while the page is counted on swap but not yet in mapcount i.e.
2599 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2600 * must be called after the swap_free(), or it will never succeed.
2601 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2602 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2603 * in page->private. In this case, a record in swap_cgroup is silently
2604 * discarded at swap_free().
2605 */
2606
2607 inc_mm_counter(mm, anon_rss);
2608 pte = mk_pte(page, vma->vm_page_prot);
2609 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2610 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2611 flags &= ~FAULT_FLAG_WRITE;
2612 }
2613 flush_icache_page(vma, page);
2614 set_pte_at(mm, address, page_table, pte);
2615 page_add_anon_rmap(page, vma, address);
2616 /* It's better to call commit-charge after rmap is established */
2617 mem_cgroup_commit_charge_swapin(page, ptr);
2618
2619 swap_free(entry);
2620 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2621 try_to_free_swap(page);
2622 unlock_page(page);
2623
2624 if (flags & FAULT_FLAG_WRITE) {
2625 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2626 if (ret & VM_FAULT_ERROR)
2627 ret &= VM_FAULT_ERROR;
2628 goto out;
2629 }
2630
2631 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte);
2633 unlock:
2634 pte_unmap_unlock(page_table, ptl);
2635 out:
2636 return ret;
2637 out_nomap:
2638 mem_cgroup_cancel_charge_swapin(ptr);
2639 pte_unmap_unlock(page_table, ptl);
2640 out_page:
2641 unlock_page(page);
2642 out_release:
2643 page_cache_release(page);
2644 return ret;
2645 }
In the code, swap_info_struct and swap_extent describe a swap area and a swap extent respectively.
A swp_entry_t value is converted from a pte_t; it acts as an address with two fields, identifying which swap area the swapped-out page lives in and which page slot within that swap area. The contents of the pte are therefore enough to locate the swapped-out page. In this state the pte's _PAGE_PRESENT bit is clear and the remaining bits form the address; helper functions extract each field or combine the two fields back into an entry.
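Those helpers are swp_type()/swp_offset()/swp_entry() plus the pte conversions; a sketch of how they fit together (kernel context; the exact bit layout is architecture specific):

/* Sketch: decoding and re-encoding a swap entry. */
static void swp_entry_example(pte_t orig_pte)
{
        swp_entry_t entry = pte_to_swp_entry(orig_pte); /* non-present pte -> swap entry */
        unsigned type = swp_type(entry);        /* which swap area (index into swap_info[]) */
        pgoff_t offset = swp_offset(entry);     /* which page slot inside that area */

        /* the other direction, as used by swapin_readahead() above */
        swp_entry_t next = swp_entry(type, offset + 1);
        (void)next;
}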
grab_swap_token() decides whether to grant the swap token; a process that holds the token only has its pages swapped out when the system is under extreme memory pressure.
do_swap_page() -> swapin_readahead()
351 /**
352 * swapin_readahead - swap in pages in hope we need them soon
353 * @entry: swap entry of this memory
354 * @gfp_mask: memory allocation flags
355 * @vma: user vma this address belongs to
356 * @addr: target address for mempolicy
357 *
358 * Returns the struct page for entry and addr, after queueing swapin.
359 *
360 * Primitive swap readahead code. We simply read an aligned block of
361 * (1 << page_cluster) entries in the swap area. This method is chosen
362 * because it doesn't cost us any seek time. We also make sure to queue
363 * the 'original' request together with the readahead ones...
364 *
365 * This has been extended to use the NUMA policies from the mm triggering
366 * the readahead.
367 *
368 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
369 */
370 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
371 struct vm_area_struct *vma, unsigned long addr)
372 {
373 int nr_pages;
374 struct page *page;
375 unsigned long offset;
376 unsigned long end_offset;
377
378 /*
379 * Get starting offset for readaround, and number of pages to read.
380 * Adjust starting address by readbehind (for NUMA interleave case)?
381 * No, it's very unlikely that swap layout would follow vma layout,
382 * more likely that neighbouring swap pages came from the same node:
383 * so use the same "addr" to choose the same node for each swap read.
384 */
385 nr_pages = valid_swaphandles(entry, &offset);
386 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
387 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
389 gfp_mask, vma, addr);
390 if (!page)
391 break;
392 page_cache_release(page);
393 }
394 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async()
273 * Locate a page of swap in physical memory, reserving swap cache space
274 * and reading the disk if it is not already cached.
275 * A failure return means that either the page allocation failed or that
276 * the swap entry is no longer in use.
277 */
278 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
279 struct vm_area_struct *vma, unsigned long addr)
280 {
281 struct page *found_page, *new_page = NULL;
282 int err;
283
284 do {
285 /*
286 * First check the swap cache. Since this is normally
287 * called after lookup_swap_cache() failed, re-calling
288 * that would confuse statistics.
289 */
290 found_page = find_get_page(&swapper_space, entry.val);
291 if (found_page)
292 break;
293
294 /*
295 * Get a new page to read into from swap.
296 */
297 if (!new_page) {
298 new_page = alloc_page_vma(gfp_mask, vma, addr);
299 if (!new_page)
300 break; /* Out of memory */
301 }
302
303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
311 * Swap entry may have been freed since our caller observed it.
312 */
313 err = swapcache_prepare(entry);
314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
316 continue;
317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
320 break;
321 }
322
323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
324 __set_page_locked(new_page);
325 SetPageSwapBacked(new_page);
326 err = __add_to_swap_cache(new_page, entry);
327 if (likely(!err)) {
328 radix_tree_preload_end();
329 /*
330 * Initiate read into locked page and return.
331 */
332 lru_cache_add_anon(new_page);
333 swap_readpage(new_page);
334 return new_page;
335 }
336 radix_tree_preload_end();
337 ClearPageSwapBacked(new_page);
338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
343 swapcache_free(entry, NULL);
344 } while (err != -ENOMEM);
345
346 if (new_page)
347 page_cache_release(new_page);
348 return found_page;
349 }
Note in the code above that whenever a page allocation can sleep, the conditions that held before the allocation must be re-checked on return:
310 /*
311 * Swap entry may have been freed since our caller observed it.
312 */
313 err = swapcache_prepare(entry);
Next a bio struct is set up and submitted to the block device layer.
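A rough sketch (kernel context, not the real get_swap_bio()/swap_readpage() code; the function name and the way the sector is obtained are assumptions, the real code maps the swap offset to a sector via map_swap_page()) of what that amounts to: wrap the single page into a bio aimed at the swap device, attach the completion callback, and hand it to the block layer.

static void submit_swap_read(struct block_device *bdev, sector_t sector,
                             struct page *page)
{
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;                /* disk location of the page slot */
        bio_add_page(bio, page, PAGE_SIZE, 0);  /* read into this page */
        bio->bi_end_io = end_swap_bio_read;     /* completion callback (shown below) */
        submit_bio(READ, bio);                  /* hand off to the block device layer */
}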
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage()
117 int swap_readpage(struct page *page)
118 {
119 struct bio *bio;
120 int ret = 0;
121
122 VM_BUG_ON(!PageLocked(page));
123 VM_BUG_ON(PageUptodate(page));
124 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); // initialize bio struct
125 if (bio == NULL) {
126 unlock_page(page);
127 ret = -ENOMEM;
128 goto out;
129 }
130 count_vm_event(PSWPIN);
131 submit_bio(READ, bio);
132 out:
133 return ret;
134 }
Note that end_swap_bio_read above is a callback: it releases the page lock and calls bio_put() to drop the reference count of the bio struct, freeing the bio struct once the count reaches zero.
__generic_make_request() below is what hands the bio struct over to the device driver. Disk I/O is slow, and __generic_make_request() calls might_sleep() (line 1412 below). On a server kernel configured with CONFIG_PREEMPT_NONE, might_sleep() is essentially a no-op and the current process simply carries on; a desktop kernel defaults to CONFIG_PREEMPT_VOLUNTARY, in which case a reschedule can happen here and another process may take the CPU. When the disk read completes, an interrupt is raised and the waiting process is woken up; end_swap_bio_read() does the cleanup.
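The CONFIG_PREEMPT_NONE vs. CONFIG_PREEMPT_VOLUNTARY difference boils down to how might_sleep()/might_resched() expand; simplified from the headers of this kernel generation:

#ifdef CONFIG_PREEMPT_VOLUNTARY
# define might_resched() _cond_resched()        /* voluntary preemption point: may schedule */
#else
# define might_resched() do { } while (0)       /* CONFIG_PREEMPT_NONE servers: a no-op */
#endif

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
# define might_sleep() \
        do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else
# define might_sleep() do { might_resched(); } while (0)
#endif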
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio()
1542 /**
1543 * submit_bio - submit a bio to the block device layer for I/O
1544 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1545 * @bio: The &struct bio which describes the I/O
1546 *
1547 * submit_bio() is very similar in purpose to generic_make_request(), and
1548 * uses that function to do most of the work. Both are fairly rough
1549 * interfaces; @bio must be presetup and ready for I/O.
1550 *
1551 */
1552 void submit_bio(int rw, struct bio *bio)
1553 {
1554 int count = bio_sectors(bio);
1555
1556 bio->bi_rw |= rw;
1557
1558 /*
1559 * If it's a regular read/write or a barrier with data attached,
1560 * go through the normal accounting stuff before submission.
1561 */
1562 if (bio_has_data(bio)) {
1563 if (rw & WRITE) {
1564 count_vm_events(PGPGOUT, count);
1565 } else {
1566 task_io_account_read(bio->bi_size);
1567 count_vm_events(PGPGIN, count);
1568 }
1569
1570 if (unlikely(block_dump)) {
1571 char b[BDEVNAME_SIZE];
1572 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1573 current->comm, task_pid_nr(current),
1574 (rw & WRITE) ? "WRITE" : "READ",
1575 (unsigned long long)bio->bi_sector,
1576 bdevname(bio->bi_bdev, b));
1577 }
1578 }
1579
1580 generic_make_request(bio);
1581 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio() -> generic_make_request()
1490 /*
1491 * We only want one ->make_request_fn to be active at a time,
1492 * else stack usage with stacked devices could be a problem.
1493 * So use current->bio_{list,tail} to keep a list of requests
1494 * submited by a make_request_fn function.
1495 * current->bio_tail is also used as a flag to say if
1496 * generic_make_request is currently active in this task or not.
1497 * If it is NULL, then no make_request is active. If it is non-NULL,
1498 * then a make_request is active, and new requests should be added
1499 * at the tail
1500 */
1501 void generic_make_request(struct bio *bio)
1502 {
1503 if (current->bio_tail) {
1504 /* make_request is active */
1505 *(current->bio_tail) = bio;
1506 bio->bi_next = NULL;
1507 current->bio_tail = &bio->bi_next;
1508 return;
1509 }
1510 /* following loop may be a bit non-obvious, and so deserves some
1511 * explanation.
1512 * Before entering the loop, bio->bi_next is NULL (as all callers
1513 * ensure that) so we have a list with a single bio.
1514 * We pretend that we have just taken it off a longer list, so
1515 * we assign bio_list to the next (which is NULL) and bio_tail
1516 * to &bio_list, thus initialising the bio_list of new bios to be
1517 * added. __generic_make_request may indeed add some more bios
1518 * through a recursive call to generic_make_request. If it
1519 * did, we find a non-NULL value in bio_list and re-enter the loop
1520 * from the top. In this case we really did just take the bio
1521 * of the top of the list (no pretending) and so fixup bio_list and
1522 * bio_tail or bi_next, and call into __generic_make_request again.
1523 *
1524 * The loop was structured like this to make only one call to
1525 * __generic_make_request (which is important as it is large and
1526 * inlined) and to keep the structure simple.
1527 */
1528 BUG_ON(bio->bi_next);
1529 do {
1530 current->bio_list = bio->bi_next;
1531 if (bio->bi_next == NULL)
1532 current->bio_tail = &current->bio_list;
1533 else
1534 bio->bi_next = NULL;
1535 __generic_make_request(bio);
1536 bio = current->bio_list;
1537 } while (bio);
1538 current->bio_tail = NULL; /* deactivate */
1539 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio() -> generic_make_request() -> __generic_make_request()
1380 /**
1381 * generic_make_request - hand a buffer to its device driver for I/O
1382 * @bio: The bio describing the location in memory and on the device.
1383 *
1384 * generic_make_request() is used to make I/O requests of block
1385 * devices. It is passed a &struct bio, which describes the I/O that needs
1386 * to be done.
1387 *
1388 * generic_make_request() does not return any status. The
1389 * success/failure status of the request, along with notification of
1390 * completion, is delivered asynchronously through the bio->bi_end_io
1391 * function described (one day) else where.
1392 *
1393 * The caller of generic_make_request must make sure that bi_io_vec
1394 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1395 * set to describe the device address, and the
1396 * bi_end_io and optionally bi_private are set to describe how
1397 * completion notification should be signaled.
1398 *
1399 * generic_make_request and the drivers it calls may use bi_next if this
1400 * bio happens to be merged with someone else, and may change bi_dev and
1401 * bi_sector for remaps as it sees fit. So the values of these fields
1402 * should NOT be depended on after the call to generic_make_request.
1403 */
1404 static inline void __generic_make_request(struct bio *bio)
1405 {
1406 struct request_queue *q;
1407 sector_t old_sector;
1408 int ret, nr_sectors = bio_sectors(bio);
1409 dev_t old_dev;
1410 int err = -EIO;
1411
1412 might_sleep();
1413
1414 if (bio_check_eod(bio, nr_sectors))
1415 goto end_io;
1416
1417 /*
1418 * Resolve the mapping until finished. (drivers are
1419 * still free to implement/resolve their own stacking
1420 * by explicitly returning 0)
1421 *
1422 * NOTE: we don't repeat the blk_size check for each new device.
1423 * Stacking drivers are expected to know what they are doing.
1424 */
1425 old_sector = -1;
1426 old_dev = 0;
1427 do {
1428 char b[BDEVNAME_SIZE];
1429
1430 q = bdev_get_queue(bio->bi_bdev);
1431 if (unlikely(!q)) {
1432 printk(KERN_ERR
1433 "generic_make_request: Trying to access "
1434 "nonexistent block-device %s (%Lu)\n",
1435 bdevname(bio->bi_bdev, b),
1436 (long long) bio->bi_sector);
1437 goto end_io;
1438 }
1439
1440 if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1441 nr_sectors > queue_max_hw_sectors(q))) {
1442 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1443 bdevname(bio->bi_bdev, b),
1444 bio_sectors(bio),
1445 queue_max_hw_sectors(q));
1446 goto end_io;
1447 }
1448
1449 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1450 goto end_io;
1451
1452 if (should_fail_request(bio))
1453 goto end_io;
1454
1455 /*
1456 * If this device has partitions, remap block n
1457 * of partition p to block n+start(p) of the disk.
1458 */
1459 blk_partition_remap(bio);
1460
1461 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1462 goto end_io;
1463
1464 if (old_sector != -1)
1465 trace_block_remap(q, bio, old_dev, old_sector);
1466
1467 old_sector = bio->bi_sector;
1468 old_dev = bio->bi_bdev->bd_dev;
1469
1470 if (bio_check_eod(bio, nr_sectors))
1471 goto end_io;
1472
1473 if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1474 !blk_queue_discard(q)) {
1475 err = -EOPNOTSUPP;
1476 goto end_io;
1477 }
1478
1479 trace_block_bio_queue(q, bio);
1480
1481 ret = q->make_request_fn(q, bio);
1482 } while (ret);
1483
1484 return;
1485
1486 end_io:
1487 bio_endio(bio, err); // end_swap_bio_read()
1488 }
Now the cleanup work:
68 void end_swap_bio_read(struct bio *bio, int err)
69 {
70 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
71 struct page *page = bio->bi_io_vec[0].bv_page;
72
73 if (!uptodate) {
74 SetPageError(page);
75 ClearPageUptodate(page);
76 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
77 imajor(bio->bi_bdev->bd_inode),
78 iminor(bio->bi_bdev->bd_inode),
79 (unsigned long long)bio->bi_sector);
80 } else {
81 SetPageUptodate(page);
82 }
83 unlock_page(page);
84 bio_put(bio);
85 }