When a page fault occurs on an mmap()ed region, do_page_fault() has a few extra steps to take:
On the first access to a mapped page after mmap(), whether a read or a write, the required physical page has never been brought into memory, so a physical page must be allocated; do_page_fault() therefore always takes the demand-paging path, do_linear_fault() or do_anonymous_page().
On subsequent accesses:
I) If the physical page brought in earlier has not been swapped out to backing store, the page fault is distinguished by read vs. write. A read needs no special handling; a write-triggered page fault is handled by do_wp_page(), which implements COW.
II) If the physical page brought in earlier has already been evicted, demand paging is needed again, via do_linear_fault(), do_anonymous_page(), do_nonlinear_fault() or do_swap_page(), to read the missing physical page back from disk.
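Before going into the kernel side, here is a minimal user-space sketch (not from the original post; the file name data.bin and the single-page length are assumptions) of the scenario examined in the rest of this section: the first read of the mapping triggers demand paging, and a later write to the same page faults on a present but write-protected pte, which is the do_wp_page() case described above.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("data.bin", O_RDWR);      /* hypothetical file, at least one page long */
        char *p, c;

        if (fd < 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        c = p[0];       /* 1st fault: demand paging -> do_linear_fault()/filemap_fault() */
        p[0] = c + 1;   /* 2nd fault: pte present but write-protected -> do_wp_page() */

        munmap(p, 4096);
        close(fd);
        return 0;
}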
For example, suppose mmap() creates a MAP_SHARED writable mapping (see man mmap); the kernel implements this with do_mmap(): vm_flags gets VM_SHARED | VM_WRITE, and a comment notes that some drivers drop the PROT_WRITE permission from vma->vm_page_prot. When page faults are later triggered, the first one always goes through the demand-paging path do_linear_fault(), which sets the pte from vma->vm_page_prot; the pte then lacks _PAGE_RW, which amounts to a pte_wrprotect(), much like the fork() case. On later page faults, if the page allocated earlier has not been swapped out to disk, demand paging is skipped, and if the fault was caused by a write, do_wp_page() is called. Let's see how do_wp_page() handles this case:
do_page_fault() -> handle_mm_fault() -> handle_pte_fault() -> do_wp_page()
1985 /*
1986 * This routine handles present pages, when users try to write
1987 * to a shared page. It is done by copying the page to a new address
1988 * and decrementing the shared-page counter for the old page.
1989 *
1990 * Note that this routine assumes that the protection checks have been
1991 * done by the caller (the low-level page fault routine in most cases).
1992 * Thus we can safely just mark it writable once we've done any necessary
1993 * COW.
1994 *
1995 * We also mark the page dirty at this point even though the page will
1996 * change only once the write actually happens. This avoids a few races,
1997 * and potentially makes it more efficient.
1998 *
1999 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2000 * but allow concurrent faults), with pte both mapped and locked.
2001 * We return with mmap_sem still held, but pte unmapped and unlocked.
2002 */
2003 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2004 unsigned long address, pte_t *page_table, pmd_t *pmd,
2005 spinlock_t *ptl, pte_t orig_pte)
2006 {
2007 struct page *old_page, *new_page;
2008 pte_t entry;
2009 int reuse = 0, ret = 0;
2010 int page_mkwrite = 0;
2011 struct page *dirty_page = NULL;
2012
2013 old_page = vm_normal_page(vma, address, orig_pte);
2014 if (!old_page) {
2015 /*
2016 * VM_MIXEDMAP !pfn_valid() case
2017 *
2018 * We should not cow pages in a shared writeable mapping.
2019 * Just mark the pages writable as we can't do any dirty
2020 * accounting on raw pfn maps.
2021 */
2022 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2023 (VM_WRITE|VM_SHARED))
2024 goto reuse;
2025 goto gotten;
2026 }
2027
2028 /*
2029 * Take out anonymous pages first, anonymous shared vmas are
2030 * not dirty accountable.
2031 */
2032 if (PageAnon(old_page) && !PageKsm(old_page)) {
2033 if (!trylock_page(old_page)) {
2034 page_cache_get(old_page);
2035 pte_unmap_unlock(page_table, ptl);
2036 lock_page(old_page);
2037 page_table = pte_offset_map_lock(mm, pmd, address,
2038 &ptl);
2039 if (!pte_same(*page_table, orig_pte)) {
2040 unlock_page(old_page);
2041 page_cache_release(old_page);
2042 goto unlock;
2043 }
2044 page_cache_release(old_page);
2045 }
2046 reuse = reuse_swap_page(old_page);
2047 unlock_page(old_page);
2048 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2049 (VM_WRITE|VM_SHARED))) { // here
2050 /*
2051 * Only catch write-faults on shared writable pages,
2052 * read-only shared pages can get COWed by
2053 * get_user_pages(.write=1, .force=1).
2054 */
2055 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2056 struct vm_fault vmf;
2057 int tmp;
2058
2059 vmf.virtual_address = (void __user *)(address &
2060 PAGE_MASK);
2061 vmf.pgoff = old_page->index;
2062 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2063 vmf.page = old_page;
2064
2065 /*
2066 * Notify the address space that the page is about to
2067 * become writable so that it can prohibit this or wait
2068 * for the page to get into an appropriate state.
2069 *
2070 * We do this without the lock held, so that it can
2071 * sleep if it needs to.
2072 */
2073 page_cache_get(old_page);
2074 pte_unmap_unlock(page_table, ptl);
2075
2076 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2077 if (unlikely(tmp &
2078 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2079 ret = tmp;
2080 goto unwritable_page;
2081 }
2082 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2083 lock_page(old_page);
2084 if (!old_page->mapping) {
2085 ret = 0; /* retry the fault */
2086 unlock_page(old_page);
2087 goto unwritable_page;
2088 }
2089 } else
2090 VM_BUG_ON(!PageLocked(old_page));
2091
2092 /*
2093 * Since we dropped the lock we need to revalidate
2094 * the PTE as someone else may have changed it. If
2095 * they did, we just return, as we can count on the
2096 * MMU to tell us if they didn't also make it writable.
2097 */
2098 page_table = pte_offset_map_lock(mm, pmd, address,
2099 &ptl);
2100 if (!pte_same(*page_table, orig_pte)) {
2101 unlock_page(old_page);
2102 page_cache_release(old_page);
2103 goto unlock;
2104 }
2105
2106 page_mkwrite = 1;
2107 }
2108 dirty_page = old_page;
2109 get_page(dirty_page);
2110 reuse = 1;
2111 }
2112
2113 if (reuse) {
2114 reuse:
2115 flush_cache_page(vma, address, pte_pfn(orig_pte));
2116 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry);
2120 ret |= VM_FAULT_WRITE;
2121 goto unlock;
2122 }
2123
2124 /*
2125 * Ok, we need to copy. Oh, well..
2126 */
2127 page_cache_get(old_page);
2128 gotten:
2129 pte_unmap_unlock(page_table, ptl);
2130
2131 if (unlikely(anon_vma_prepare(vma)))
2132 goto oom;
2133
2134 if (is_zero_pfn(pte_pfn(orig_pte))) {
2135 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2136 if (!new_page)
2137 goto oom;
2138 } else {
2139 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2140 if (!new_page)
2141 goto oom;
2142 cow_user_page(new_page, old_page, address, vma);
2143 }
2144 __SetPageUptodate(new_page);
2145
2146 /*
2147 * Don't let another task, with possibly unlocked vma,
2148 * keep the mlocked page.
2149 */
2150 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2151 lock_page(old_page); /* for LRU manipulation */
2152 clear_page_mlock(old_page);
2153 unlock_page(old_page);
2154 }
2155
2156 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2157 goto oom_free_new;
2158
2159 /*
2160 * Re-check the pte - we dropped the lock
2161 */
2162 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2163 if (likely(pte_same(*page_table, orig_pte))) {
2164 if (old_page) {
2165 if (!PageAnon(old_page)) {
2166 dec_mm_counter(mm, file_rss);
2167 inc_mm_counter(mm, anon_rss);
2168 }
2169 } else
2170 inc_mm_counter(mm, anon_rss);
2171 flush_cache_page(vma, address, pte_pfn(orig_pte));
2172 entry = mk_pte(new_page, vma->vm_page_prot);
2173 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2174 /*
2175 * Clear the pte entry and flush it first, before updating the
2176 * pte with the new entry. This will avoid a race condition
2177 * seen in the presence of one thread doing SMC and another
2178 * thread doing COW.
2179 */
2180 ptep_clear_flush(vma, address, page_table);
2181 page_add_new_anon_rmap(new_page, vma, address);
2182 /*
2183 * We call the notify macro here because, when using secondary
2184 * mmu page tables (such as kvm shadow page tables), we want the
2185 * new page to be mapped directly into the secondary page table.
2186 */
2187 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry);
2189 if (old_page) {
2190 /*
2191 * Only after switching the pte to the new page may
2192 * we remove the mapcount here. Otherwise another
2193 * process may come and find the rmap count decremented
2194 * before the pte is switched to the new page, and
2195 * "reuse" the old page writing into it while our pte
2196 * here still points into it and can be read by other
2197 * threads.
2198 *
2199 * The critical issue is to order this
2200 * page_remove_rmap with the ptp_clear_flush above.
2201 * Those stores are ordered by (if nothing else,)
2202 * the barrier present in the atomic_add_negative
2203 * in page_remove_rmap.
2204 *
2205 * Then the TLB flush in ptep_clear_flush ensures that
2206 * no process can access the old page before the
2207 * decremented mapcount is visible. And the old page
2208 * cannot be reused until after the decremented
2209 * mapcount is visible. So transitively, TLBs to
2210 * old page will be flushed before it can be reused.
2211 */
2212 page_remove_rmap(old_page);
2213 }
2214
2215 /* Free the old page.. */
2216 new_page = old_page;
2217 ret |= VM_FAULT_WRITE;
2218 } else
2219 mem_cgroup_uncharge_page(new_page);
2220
2221 if (new_page)
2222 page_cache_release(new_page);
2223 if (old_page)
2224 page_cache_release(old_page);
2225 unlock:
2226 pte_unmap_unlock(page_table, ptl);
2227 if (dirty_page) {
2228 /*
2229 * Yes, Virginia, this is actually required to prevent a race
2230 * with clear_page_dirty_for_io() from clearing the page dirty
2231 * bit after it clear all dirty ptes, but before a racing
2232 * do_wp_page installs a dirty pte.
2233 *
2234 * do_no_page is protected similarly.
2235 */
2236 if (!page_mkwrite) {
2237 wait_on_page_locked(dirty_page);
2238 set_page_dirty_balance(dirty_page, page_mkwrite);
2239 }
2240 put_page(dirty_page);
2241 if (page_mkwrite) {
2242 struct address_space *mapping = dirty_page->mapping;
2243
2244 set_page_dirty(dirty_page);
2245 unlock_page(dirty_page);
2246 page_cache_release(dirty_page);
2247 if (mapping) {
2248 /*
2249 * Some device drivers do not set page.mapping
2250 * but still dirty their pages
2251 */
2252 balance_dirty_pages_ratelimited(mapping);
2253 }
2254 }
2255
2256 /* file_update_time outside page_lock */
2257 if (vma->vm_file)
2258 file_update_time(vma->vm_file);
2259 }
2260 return ret;
2261 oom_free_new:
2262 page_cache_release(new_page);
2263 oom:
2264 if (old_page) {
2265 if (page_mkwrite) {
2266 unlock_page(old_page);
2267 page_cache_release(old_page);
2268 }
2269 page_cache_release(old_page);
2270 }
2271 return VM_FAULT_OOM;
2272
2273 unwritable_page:
2274 page_cache_release(old_page);
2275 return ret;
2276 }
As we can see, this too is handled with reference counting. At line 2073, page_cache_get(old_page) takes a reference; the comment explains that the pte lock is dropped so that page_mkwrite() can sleep if it needs to. After dirty_page = old_page; get_page(dirty_page); reuse = 1; the code enters reuse: and adjusts the pte permissions, and then at unlock: it does put_page(dirty_page) and page_cache_release(dirty_page). So there are two _count increments and two decrements here, which is not quite the same as the fork() path: fork() performs one standalone get_page(), whereas the ++/-- operations in do_mmap() and do_linear_fault() are all symmetric, with no unpaired increment to be found. Now let's look at do_mmap():
do_mmap() -> do_mmap_pgoff()
908 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
909 unsigned long len, unsigned long prot,
910 unsigned long flags, unsigned long pgoff)
911 {
912 struct mm_struct * mm = current->mm;
913 struct inode *inode;
914 unsigned int vm_flags;
915 int error;
916 unsigned long reqprot = prot;
917
918 /*
919 * Does the application expect PROT_READ to imply PROT_EXEC?
920 *
921 * (the exception is when the underlying filesystem is noexec
922 * mounted, in which case we dont add PROT_EXEC.)
923 */
924 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
925 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
926 prot |= PROT_EXEC;
927
928 if (!len)
929 return -EINVAL;
930
931 if (!(flags & MAP_FIXED))
932 addr = round_hint_to_min(addr);
933
934 /* Careful about overflows.. */
935 len = PAGE_ALIGN(len);
936 if (!len)
937 return -ENOMEM;
938
939 /* offset overflow? */
940 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
941 return -EOVERFLOW;
942
943 /* Too many mappings? */
944 if (mm->map_count > sysctl_max_map_count)
945 return -ENOMEM;
946
947 /* Obtain the address to map to. we verify (or select) it and ensure
948 * that it represents a valid section of the address space.
949 */
950 addr = get_unmapped_area(file, addr, len, pgoff, flags);
951 if (addr & ~PAGE_MASK)
952 return addr;
953
954 /* Do simple checking here so the lower-level routines won't have
955 * to. we assume access permissions have been handled by the open
956 * of the memory object, so we don't do any here.
957 */
958 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
959 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
960
961 if (flags & MAP_LOCKED)
962 if (!can_do_mlock())
963 return -EPERM;
964
965 /* mlock MCL_FUTURE? */
966 if (vm_flags & VM_LOCKED) {
967 unsigned long locked, lock_limit;
968 locked = len >> PAGE_SHIFT;
969 locked += mm->locked_vm;
970 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
971 lock_limit >>= PAGE_SHIFT;
972 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
973 return -EAGAIN;
974 }
975
976 inode = file ? file->f_path.dentry->d_inode : NULL;
977
978 if (file) { // file memory mapping
979 switch (flags & MAP_TYPE) {
980 case MAP_SHARED:
981 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
982 return -EACCES;
983
984 /*
985 * Make sure we don't allow writing to an append-only
986 * file..
987 */
988 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
989 return -EACCES;
990
991 /*
992 * Make sure there are no mandatory locks on the file.
993 */
994 if (locks_verify_locked(inode))
995 return -EAGAIN;
996
997 vm_flags |= VM_SHARED | VM_MAYSHARE;
998 if (!(file->f_mode & FMODE_WRITE))
999 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1000
1001 /* fall through */
1002 case MAP_PRIVATE:
1003 if (!(file->f_mode & FMODE_READ))
1004 return -EACCES;
1005 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1006 if (vm_flags & VM_EXEC)
1007 return -EPERM;
1008 vm_flags &= ~VM_MAYEXEC;
1009 }
1010
1011 if (!file->f_op || !file->f_op->mmap)
1012 return -ENODEV;
1013 break;
1014
1015 default:
1016 return -EINVAL;
1017 }
1018 } else { // Anonymous mapping
1019 switch (flags & MAP_TYPE) {
1020 case MAP_SHARED:
1021 /*
1022 * Ignore pgoff.
1023 */
1024 pgoff = 0;
1025 vm_flags |= VM_SHARED | VM_MAYSHARE;
1026 break;
1027 case MAP_PRIVATE:
1028 /*
1029 * Set pgoff according to addr for anon_vma.
1030 */
1031 pgoff = addr >> PAGE_SHIFT;
1032 break;
1033 default:
1034 return -EINVAL;
1035 }
1036 }
1037
1038 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1039 if (error)
1040 return error;
1041
1042 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1043 }
VM_SHARED is what distinguishes MAP_SHARED from MAP_PRIVATE.
All this code does is set vm_flags according to MAP_SHARED vs. MAP_PRIVATE and resolve conflicting permissions. For a disk-file mapping, for instance, the permissions of the mapped region must not conflict with the permissions the file was opened with: if the file was opened without write permission, the mapped region cannot be writable either, otherwise an error is returned (a user-space illustration follows the list below).
First the anonymous-mapping cases, which are simpler since no file is involved:
1) private writable mapping: (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE
2) shared anonymous mapping, used for IPC; here vm_flags must have VM_SHARED set
A file mapping involves disk I/O and the page cache, which is far more complex.
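A small user-space sketch (not from the original post; data.bin is a hypothetical file) of the permission check just described: a file opened O_RDONLY cannot be mapped MAP_SHARED with PROT_WRITE, and do_mmap_pgoff() returns -EACCES, while MAP_PRIVATE is still allowed because writes only go to private COW copies.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        int fd = open("data.bin", O_RDONLY);    /* hypothetical file */
        void *p;

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                printf("MAP_SHARED + PROT_WRITE on O_RDONLY fd: errno=%d (EACCES=%d)\n",
                       errno, EACCES);

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        printf("MAP_PRIVATE: %s\n", p == MAP_FAILED ? "failed" : "ok");
        return 0;
}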
do_mmap() -> do_mmap_pgoff() -> mmap_region()
1134 unsigned long mmap_region(struct file *file, unsigned long addr,
1135 unsigned long len, unsigned long flags,
1136 unsigned int vm_flags, unsigned long pgoff)
1137 {
1138 struct mm_struct *mm = current->mm;
1139 struct vm_area_struct *vma, *prev;
1140 int correct_wcount = 0;
1141 int error;
1142 struct rb_node **rb_link, *rb_parent;
1143 unsigned long charged = 0;
1144 struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
1145
1146 /* Clear old maps */
1147 error = -ENOMEM;
1148 munmap_back:
1149 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1150 if (vma && vma->vm_start < addr + len) {
1151 if (do_munmap(mm, addr, len))
1152 return -ENOMEM;
1153 goto munmap_back;
1154 }
1155
1156 /* Check against address space limit. */
1157 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1158 return -ENOMEM;
1159
1160 /*
1161 * Set 'VM_NORESERVE' if we should not account for the
1162 * memory use of this mapping.
1163 */
1164 if ((flags & MAP_NORESERVE)) {
1165 /* We honor MAP_NORESERVE if allowed to overcommit */
1166 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1167 vm_flags |= VM_NORESERVE;
1168
1169 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1170 if (file && is_file_hugepages(file))
1171 vm_flags |= VM_NORESERVE;
1172 }
1173
1174 /*
1175 * Private writable mapping: check memory availability
1176 */
1177 if (accountable_mapping(file, vm_flags)) {
1178 charged = len >> PAGE_SHIFT;
1179 if (security_vm_enough_memory(charged))
1180 return -ENOMEM;
1181 vm_flags |= VM_ACCOUNT;
1182 }
1183
1184 /*
1185 * Can we just expand an old mapping?
1186 */
1187 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1188 if (vma)
1189 goto out;
1190
1191 /*
1192 * Determine the object being mapped and call the appropriate
1193 * specific mapper. the address has already been validated, but
1194 * not unmapped, but the maps are removed from the list.
1195 */
1196 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1197 if (!vma) {
1198 error = -ENOMEM;
1199 goto unacct_error;
1200 }
1201
1202 vma->vm_mm = mm;
1203 vma->vm_start = addr;
1204 vma->vm_end = addr + len;
1205 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff;
1208
1209 if (file) {
1210 error = -EINVAL;
1211 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1212 goto free_vma;
1213 if (vm_flags & VM_DENYWRITE) {
1214 error = deny_write_access(file);
1215 if (error)
1216 goto free_vma;
1217 correct_wcount = 1;
1218 }
1219 vma->vm_file = file;
1220 get_file(file);
1221 error = file->f_op->mmap(file, vma); // generic_file_mmap()
1222 if (error)
1223 goto unmap_and_free_vma;
1224 if (vm_flags & VM_EXECUTABLE)
1225 added_exe_file_vma(mm);
1226
1227 /* Can addr have changed??
1228 *
1229 * Answer: Yes, several device drivers can do it in their
1230 * f_op->mmap method. -DaveM
1231 */
1232 addr = vma->vm_start;
1233 pgoff = vma->vm_pgoff;
1234 vm_flags = vma->vm_flags;
1235 } else if (vm_flags & VM_SHARED) { // IPC, shared anonymous mapping
1236 error = shmem_zero_setup(vma);
1237 if (error)
1238 goto free_vma;
1239 }
1240
1241 if (vma_wants_writenotify(vma)) {
1242 pgprot_t pprot = vma->vm_page_prot;
1243
1244 /* Can vma->vm_page_prot have changed??
1245 *
1246 * Answer: Yes, drivers may have changed it in their
1247 * f_op->mmap method.
1248 *
1249 * Ensures that vmas marked as uncached stay that way.
1250 */
1251 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1252 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1253 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1254 }
1255
1256 vma_link(mm, vma, prev, rb_link, rb_parent);
1257 file = vma->vm_file;
1258
1259 /* Once vma denies write, undo our temporary denial count */
1260 if (correct_wcount)
1261 atomic_inc(&inode->i_writecount);
1262 out:
1263 perf_event_mmap(vma);
1264
1265 mm->total_vm += len >> PAGE_SHIFT;
1266 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1267 if (vm_flags & VM_LOCKED) {
1268 /*
1269 * makes pages present; downgrades, drops, reacquires mmap_sem
1270 */
1271 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1272 if (nr_pages < 0)
1273 return nr_pages; /* vma gone! */
1274 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1275 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1276 make_pages_present(addr, addr + len);
1277 return addr;
1278
1279 unmap_and_free_vma:
1280 if (correct_wcount)
1281 atomic_inc(&inode->i_writecount);
1282 vma->vm_file = NULL;
1283 fput(file);
1284
1285 /* Undo any partial mapping done by a device driver. */
1286 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1287 charged = 0;
1288 free_vma:
1289 kmem_cache_free(vm_area_cachep, vma);
1290 unacct_error:
1291 if (charged)
1292 vm_unacct_memory(charged);
1293 return error;
1294 }
The rb_* variables refer to the red-black tree of vmas.
If the VM_LOCKED flag is set, physical pages are allocated and locked into memory right away, so they never have to be faulted in later. Concretely this goes through make_pages_present() -> get_user_pages(), which itself is also implemented on top of handle_mm_fault().
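A user-space sketch (not from the original post) of that path: MAP_LOCKED (or an mlock() right after mmap()) makes mmap_region() populate and pin every page before mmap() returns, instead of leaving them to later page faults. This may require CAP_IPC_LOCK or a sufficiently large RLIMIT_MEMLOCK.

#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        memset(p, 0, len);      /* no page faults here: the pages are already present */
        munmap(p, len);
        return 0;
}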
As for line 1221, error = file->f_op->mmap(file, vma);
most filesystems and block device files use generic_file_mmap():
1597 /* This is used for a general mmap of a disk file */
1598
1599 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1600 {
1601 struct address_space *mapping = file->f_mapping;
1602
1603 if (!mapping->a_ops->readpage)
1604 return -ENOEXEC;
1605 file_accessed(file);
1606 vma->vm_ops = &generic_file_vm_ops;
1607 vma->vm_flags |= VM_CAN_NONLINEAR;
1608 return 0;
1609 }
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
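By contrast, a device driver that implements its own mmap usually installs its own vm_operations_struct here instead of generic_file_vm_ops. A rough sketch against the .fault interface of this kernel generation (my_dev_* and my_page are hypothetical names; the ->fault contract requires returning the page with a reference held):

/* Hypothetical driver-side sketch: hand back a page the driver already owns. */
static struct page *my_page;

static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        get_page(my_page);              /* return the page with an extra reference, per the ->fault contract */
        vmf->page = my_page;            /* the generic fault code maps it into the pte */
        return 0;
}

static const struct vm_operations_struct my_dev_vm_ops = {
        .fault = my_dev_fault,
};

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &my_dev_vm_ops;   /* later faults are routed to my_dev_fault() */
        return 0;
}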
1465 /**
1466 * filemap_fault - read in file data for page fault handling
1467 * @vma: vma in which the fault was taken
1468 * @vmf: struct vm_fault containing details of the fault
1469 *
1470 * filemap_fault() is invoked via the vma operations vector for a
1471 * mapped memory region to read in file data during a page fault.
1472 *
1473 * The goto's are kind of ugly, but this streamlines the normal case of having
1474 * it in the page cache, and handles the special cases reasonably without
1475 * having a lot of duplicated code.
1476 */
1477 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1478 {
1479 int error;
1480 struct file *file = vma->vm_file;
1481 struct address_space *mapping = file->f_mapping;
1482 struct file_ra_state *ra = &file->f_ra;
1483 struct inode *inode = mapping->host;
1484 pgoff_t offset = vmf->pgoff;
1485 struct page *page;
1486 pgoff_t size;
1487 int ret = 0;
1488
1489 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1490 if (offset >= size)
1491 return VM_FAULT_SIGBUS;
1492
1493 /*
1494 * Do we have something in the page cache already?
1495 */
1496 page = find_get_page(mapping, offset);
1497 if (likely(page)) {
1498 /*
1499 * We found the page, so try async readahead before
1500 * waiting for the lock.
1501 */
1502 do_async_mmap_readahead(vma, ra, file, page, offset);
1503 lock_page(page);
1504
1505 /* Did it get truncated? */
1506 if (unlikely(page->mapping != mapping)) {
1507 unlock_page(page);
1508 put_page(page);
1509 goto no_cached_page;
1510 }
1511 } else {
1512 /* No page in the page cache at all */
1513 do_sync_mmap_readahead(vma, ra, file, offset);
1514 count_vm_event(PGMAJFAULT);
1515 ret = VM_FAULT_MAJOR;
1516 retry_find:
1517 page = find_lock_page(mapping, offset);
1518 if (!page)
1519 goto no_cached_page;
1520 }
1521
1522 /*
1523 * We have a locked page in the page cache, now we need to check
1524 * that it's up-to-date. If not, it is going to be due to an error.
1525 */
1526 if (unlikely(!PageUptodate(page)))
1527 goto page_not_uptodate;
1528
1529 /*
1530 * Found the page and have a reference on it.
1531 * We must recheck i_size under page lock.
1532 */
1533 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1534 if (unlikely(offset >= size)) {
1535 unlock_page(page);
1536 page_cache_release(page);
1537 return VM_FAULT_SIGBUS;
1538 }
1539
1540 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1541 vmf->page = page;
1542 return ret | VM_FAULT_LOCKED;
1543
1544 no_cached_page:
1545 /*
1546 * We're only likely to ever get here if MADV_RANDOM is in
1547 * effect.
1548 */
1549 error = page_cache_read(file, offset);
1550
1551 /*
1552 * The page we want has now been added to the page cache.
1553 * In the unlikely event that someone removed it in the
1554 * meantime, we'll just come back here and read it again.
1555 */
1556 if (error >= 0)
1557 goto retry_find;
1558
1559 /*
1560 * An error return from page_cache_read can result if the
1561 * system is low on memory, or a problem occurs while trying
1562 * to schedule I/O.
1563 */
1564 if (error == -ENOMEM)
1565 return VM_FAULT_OOM;
1566 return VM_FAULT_SIGBUS;
1567
1568 page_not_uptodate:
1569 /*
1570 * Umm, take care of errors if the page isn't up-to-date.
1571 * Try to re-read it _once_. We do this synchronously,
1572 * because there really aren't any performance issues here
1573 * and we need to check for errors.
1574 */
1575 ClearPageError(page);
1576 error = mapping->a_ops->readpage(file, page);
1577 if (!error) {
1578 wait_on_page_locked(page);
1579 if (!PageUptodate(page))
1580 error = -EIO;
1581 }
1582 page_cache_release(page);
1583
1584 if (!error || error == AOP_TRUNCATED_PAGE)
1585 goto retry_find;
1586
1587 /* Things didn't work out. Return zero to tell the mm layer so. */
1588 shrink_readahead_size_eio(file, ra);
1589 return VM_FAULT_SIGBUS;
1590 }
From .fault = filemap_fault we know that filemap_fault() is a callback; as its comment says, it is invoked when a page fault later happens on the mapping. It first looks in the page cache, and only reads from disk if the page is not there. The page structs are organized into a radix tree, which speeds up page lookup. If the page is found in the page cache, the observed access pattern decides whether to do readahead; if the page must come from disk, readahead is considered first and the page cache is probed again; if the page really is not there, page_cache_read() allocates a page, adds the new page to one of the page cache's LRU lists and reads the data from disk, after which the needed page can be found in the page cache.
After a page is allocated, the kernel puts it on an LRU list; pages on the LRU lists are the candidates for reclaim, while all other pages are either free or in active use and cannot be reclaimed.
I will not go into the readahead algorithm here, nor analyze do_generic_file_read(), the generic routine for reading data from disk.
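The page-cache lookup that find_get_page() performs is conceptually a radix tree lookup keyed by (mapping, index). A simplified sketch of the idea (kernel context; the real code takes a lock-free speculative reference, which is glossed over here):

/* Simplified idea of find_get_page(): look the index up in the mapping's radix tree. */
static struct page *page_cache_lookup(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        rcu_read_lock();
        page = radix_tree_lookup(&mapping->page_tree, index);
        if (page)
                page_cache_get(page);   /* simplification: real code uses page_cache_get_speculative() */
        rcu_read_unlock();
        return page;
}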
filemap_fault() -> page_cache_read()
1363 #ifdef CONFIG_MMU
1364 /**
1365 * page_cache_read - adds requested page to the page cache if not already there
1366 * @file: file to read
1367 * @offset: page index
1368 *
1369 * This adds the requested page to the page cache if it isn't already there,
1370 * and schedules an I/O to read in its contents from disk.
1371 */
1372 static int page_cache_read(struct file *file, pgoff_t offset)
1373 {
1374 struct address_space *mapping = file->f_mapping;
1375 struct page *page;
1376 int ret;
1377
1378 do {
1379 page = page_cache_alloc_cold(mapping); // alloc page
1380 if (!page)
1381 return -ENOMEM;
1382
1383 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1384 if (ret == 0)
1385 ret = mapping->a_ops->readpage(file, page); // read from disk
1386 else if (ret == -EEXIST)
1387 ret = 0; /* losing race to add is OK */
1388
1389 page_cache_release(page);
1390
1391 } while (ret == AOP_TRUNCATED_PAGE);
1392
1393 return ret;
1394 }
The demand-paging part of do_page_fault() handles three cases:
1) The page has never been accessed before: an anonymous mapping or a linear file mapping.
2) A nonlinear file mapping, i.e. mapped pages rearranged on top of an existing mmap; see man remap_file_pages for details.
3) Swap area: the page was accessed before, was later pushed out to the swap area, and is now being accessed again.
For case 1), the linear-file-mapping handler do_linear_fault() does indeed call the filemap_fault() shown above, and the handler for case 2), do_nonlinear_fault(), calls filemap_fault() as well.
do_linear_fault() contains the formula:
pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pgoff is the offset of address within the inode's whole address space: the offset inside the vma plus the vma's file offset (vm_pgoff).
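A quick worked example of that formula (made-up numbers, PAGE_SHIFT = 12): if vma->vm_start = 0x08000000, vma->vm_pgoff = 16 (the vma maps the file starting at byte 16 * 4096) and the faulting address is 0x080032a0, then
pgoff = ((0x08003000 - 0x08000000) >> 12) + 16 = 3 + 16 = 19
i.e. the fault is served from page index 19 of the file (byte offset 19 * 4096).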
do_page_fault()->handle_mm_fault()->handle_pte_fault()
......
2966 if (!pte_present(entry)) { // demand paging
2967 if (pte_none(entry)) {
2968 if (vma->vm_ops) {
2969 if (likely(vma->vm_ops->fault))
2970 return do_linear_fault(mm, vma, address,
2971 pte, pmd, flags, entry);
2972 }
2973 return do_anonymous_page(mm, vma, address,
2974 pte, pmd, flags);
2975 }
2976 if (pte_file(entry))
2977 return do_nonlinear_fault(mm, vma, address,
2978 pte, pmd, flags, entry);
2979 return do_swap_page(mm, vma, address,
2980 pte, pmd, flags, entry);
2981 }
.....
do_anonymous_page() handles anonymous mappings and optimizes the read case.
When a read triggers the page fault, this is the first read of the page, so it does not matter what the newly mapped page contains, but zeros are safe. For this purpose the system reserves a zero page at initialization, ready for any process to use. Here the zero page is used directly and the pte is set, so no page allocation is needed at all. If the page fault was triggered by a write, a page is allocated and the pte is set.
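A user-space sketch (not from the original post) of the two paths: the first read of a fresh anonymous mapping maps the shared zero page read-only, and the first write then allocates a real zeroed page (through the do_wp_page() zero-page path shown earlier when the read came first, or directly in do_anonymous_page() when the write comes first).

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        printf("%d\n", p[0]);   /* read fault: pte points at the zero page, no allocation */
        p[0] = 42;              /* write fault: a private zeroed page is allocated */
        munmap(p, 4096);
        return 0;
}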
do_page_fault()->handle_mm_fault()->handle_pte_fault() -> do_anonymous_page():
2647 /*
2648 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2649 * but allow concurrent faults), and pte mapped but not yet locked.
2650 * We return with mmap_sem still held, but pte unmapped and unlocked.
2651 */
2652 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2653 unsigned long address, pte_t *page_table, pmd_t *pmd,
2654 unsigned int flags)
2655 {
2656 struct page *page;
2657 spinlock_t *ptl;
2658 pte_t entry;
2659
2660 if (!(flags & FAULT_FLAG_WRITE)) {
2661 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), // zero page
2662 vma->vm_page_prot));
2663 ptl = pte_lockptr(mm, pmd);
2664 spin_lock(ptl);
2665 if (!pte_none(*page_table))
2666 goto unlock;
2667 goto setpte;
2668 }
2669
2670 /* Allocate our own private page. */
2671 pte_unmap(page_table);
2672
2673 if (unlikely(anon_vma_prepare(vma)))
2674 goto oom;
2675 page = alloc_zeroed_user_highpage_movable(vma, address);
2676 if (!page)
2677 goto oom;
2678 __SetPageUptodate(page);
2679
2680 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2681 goto oom_free_page;
2682
2683 entry = mk_pte(page, vma->vm_page_prot);
2684 if (vma->vm_flags & VM_WRITE)
2685 entry = pte_mkwrite(pte_mkdirty(entry));
2686
2687 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2688 if (!pte_none(*page_table))
2689 goto release;
2690
2691 inc_mm_counter(mm, anon_rss);
2692 page_add_new_anon_rmap(page, vma, address);
2693 setpte:
2694 set_pte_at(mm, address, page_table, entry);
2695
2696 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry);
2698 unlock:
2699 pte_unmap_unlock(page_table, ptl);
2700 return 0;
2701 release:
2702 mem_cgroup_uncharge_page(page);
2703 page_cache_release(page);
2704 goto unlock;
2705 oom_free_page:
2706 page_cache_release(page);
2707 oom:
2708 return VM_FAULT_OOM;
2709 }
Here page_add_new_anon_rmap() is responsible for setting up the page struct fields and adding the page to the appropriate LRU list.
do_anonymous_page() -> page_add_new_anon_rmap():
690 /**
691 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
692 * @page: the page to add the mapping to
693 * @vma: the vm area in which the mapping is added
694 * @address: the user virtual address mapped
695 *
696 * Same as page_add_anon_rmap but must only be called on *new* pages.
697 * This means the inc-and-test can be bypassed.
698 * Page does not have to be locked.
699 */
700 void page_add_new_anon_rmap(struct page *page,
701 struct vm_area_struct *vma, unsigned long address)
702 {
703 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
704 SetPageSwapBacked(page);
705 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
706 __inc_zone_page_state(page, NR_ANON_PAGES);
707 __page_set_anon_rmap(page, vma, address);
708 if (page_evictable(page, vma))
709 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
710 else
711 add_page_to_unevictable_list(page);
712 }
do_anonymous_page() -> page_add_new_anon_rmap() -> __page_set_anon_rmap():
/**
* __page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
static void __page_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
}
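The PAGE_MAPPING_ANON arithmetic above is what PageAnon() checks later: the low bit of page->mapping tags the pointer as an anon_vma rather than an address_space. Roughly, simplified from the headers of this kernel generation (the second helper name is hypothetical):

#define PAGE_MAPPING_ANON 1

static inline int PageAnon(struct page *page)
{
        return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/* rmap recovers the anon_vma by stripping the tag bit again: */
static inline struct anon_vma *page_anon_vma_of(struct page *page)
{
        return (struct anon_vma *)((unsigned long)page->mapping & ~PAGE_MAPPING_ANON);
}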
Finally, the case where the page lives in the swap area. Here, too, the swap cache is searched first; if the page is found it is returned. If not, a page is allocated and locked, added to the swap cache and to an LRU list, and the request is then packaged into a bio and submitted to the block device layer, where the actual disk I/O starts. The swap cache is a special page cache.
A swap area may live in its own partition or in a file; the kernel treats both the same way. A swap area consists of many page slots. If the swap area is in a file, the filesystem cannot guarantee that the file is laid out contiguously, so for performance the swap area is split into swap extents: each physically contiguous region on disk is one swap extent. If the swap area is a partition, it is already laid out contiguously, so swap extents are not really needed and there is just a single swap extent. Some servers create many swap areas; Linux supports up to 32. Even the heap allocated by malloc() can be swapped out.
do_page_fault() -> handle_mm_fault()->handle_pte_fault() -> do_swap_page()
2505 /*
2506 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2507 * but allow concurrent faults), and pte mapped but not yet locked.
2508 * We return with mmap_sem still held, but pte unmapped and unlocked.
2509 */
2510 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2511 unsigned long address, pte_t *page_table, pmd_t *pmd,
2512 unsigned int flags, pte_t orig_pte)
2513 {
2514 spinlock_t *ptl;
2515 struct page *page;
2516 swp_entry_t entry;
2517 pte_t pte;
2518 struct mem_cgroup *ptr = NULL;
2519 int ret = 0;
2520
2521 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2522 goto out;
2523
2524 entry = pte_to_swp_entry(orig_pte);
2525 if (unlikely(non_swap_entry(entry))) {
2526 if (is_migration_entry(entry)) {
2527 migration_entry_wait(mm, pmd, address);
2528 } else if (is_hwpoison_entry(entry)) {
2529 ret = VM_FAULT_HWPOISON;
2530 } else {
2531 print_bad_pte(vma, address, orig_pte, NULL);
2532 ret = VM_FAULT_SIGBUS;
2533 }
2534 goto out;
2535 }
2536 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2537 page = lookup_swap_cache(entry);
2538 if (!page) {
2539 grab_swap_token(mm); /* Contend for token _before_ read-in */
2540 page = swapin_readahead(entry,
2541 GFP_HIGHUSER_MOVABLE, vma, address);
2542 if (!page) {
2543 /*
2544 * Back out if somebody else faulted in this pte
2545 * while we released the pte lock.
2546 */
2547 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2548 if (likely(pte_same(*page_table, orig_pte)))
2549 ret = VM_FAULT_OOM;
2550 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2551 goto unlock;
2552 }
2553
2554 /* Had to read the page from swap area: Major fault */
2555 ret = VM_FAULT_MAJOR;
2556 count_vm_event(PGMAJFAULT);
2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2562 ret = VM_FAULT_HWPOISON;
2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2564 goto out_release;
2565 }
2566
2567 lock_page(page);
2568 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2569
2570 page = ksm_might_need_to_copy(page, vma, address);
2571 if (!page) {
2572 ret = VM_FAULT_OOM;
2573 goto out;
2574 }
2575
2576 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2577 ret = VM_FAULT_OOM;
2578 goto out_page;
2579 }
2580
2581 /*
2582 * Back out if somebody else already faulted in this pte.
2583 */
2584 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2585 if (unlikely(!pte_same(*page_table, orig_pte)))
2586 goto out_nomap;
2587
2588 if (unlikely(!PageUptodate(page))) {
2589 ret = VM_FAULT_SIGBUS;
2590 goto out_nomap;
2591 }
2592
2593 /*
2594 * The page isn't present yet, go ahead with the fault.
2595 *
2596 * Be careful about the sequence of operations here.
2597 * To get its accounting right, reuse_swap_page() must be called
2598 * while the page is counted on swap but not yet in mapcount i.e.
2599 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2600 * must be called after the swap_free(), or it will never succeed.
2601 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2602 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2603 * in page->private. In this case, a record in swap_cgroup is silently
2604 * discarded at swap_free().
2605 */
2606
2607 inc_mm_counter(mm, anon_rss);
2608 pte = mk_pte(page, vma->vm_page_prot);
2609 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2610 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2611 flags &= ~FAULT_FLAG_WRITE;
2612 }
2613 flush_icache_page(vma, page);
2614 set_pte_at(mm, address, page_table, pte);
2615 page_add_anon_rmap(page, vma, address);
2616 /* It's better to call commit-charge after rmap is established */
2617 mem_cgroup_commit_charge_swapin(page, ptr);
2618
2619 swap_free(entry);
2620 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2621 try_to_free_swap(page);
2622 unlock_page(page);
2623
2624 if (flags & FAULT_FLAG_WRITE) {
2625 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2626 if (ret & VM_FAULT_ERROR)
2627 ret &= VM_FAULT_ERROR;
2628 goto out;
2629 }
2630
2631 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte);
2633 unlock:
2634 pte_unmap_unlock(page_table, ptl);
2635 out:
2636 return ret;
2637 out_nomap:
2638 mem_cgroup_cancel_charge_swapin(ptr);
2639 pte_unmap_unlock(page_table, ptl);
2640 out_page:
2641 unlock_page(page);
2642 out_release:
2643 page_cache_release(page);
2644 return ret;
2645 }
In the code, swap_info_struct and swap_extent describe a swap area and a swap extent respectively.
A swp_entry_t value is converted from a pte_t; it acts as an address with two fields, identifying which swap area the swapped-out page lives in and which page slot within that swap area. The contents of the pte are therefore enough to locate the swapped-out page. In this state the pte's _PAGE_PRESENT bit is clear and the remaining bits form the address; helper functions extract each field or combine the two fields back into an entry.
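Those helpers are swp_type()/swp_offset()/swp_entry() plus the pte conversions; a sketch of how they fit together (kernel context; the exact bit layout is architecture specific):

/* Sketch: decoding and re-encoding a swap entry. */
static void swp_entry_example(pte_t orig_pte)
{
        swp_entry_t entry = pte_to_swp_entry(orig_pte); /* non-present pte -> swap entry */
        unsigned type = swp_type(entry);        /* which swap area (index into swap_info[]) */
        pgoff_t offset = swp_offset(entry);     /* which page slot inside that area */

        /* the other direction, as used by swapin_readahead() above */
        swp_entry_t next = swp_entry(type, offset + 1);
        (void)next;
}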
grab_swap_token() decides whether to grant the swap token; a process that holds the token only has its pages swapped out when the system is under extreme memory pressure.
do_swap_page() -> swapin_readahead()
351 /**
352 * swapin_readahead - swap in pages in hope we need them soon
353 * @entry: swap entry of this memory
354 * @gfp_mask: memory allocation flags
355 * @vma: user vma this address belongs to
356 * @addr: target address for mempolicy
357 *
358 * Returns the struct page for entry and addr, after queueing swapin.
359 *
360 * Primitive swap readahead code. We simply read an aligned block of
361 * (1 << page_cluster) entries in the swap area. This method is chosen
362 * because it doesn't cost us any seek time. We also make sure to queue
363 * the 'original' request together with the readahead ones...
364 *
365 * This has been extended to use the NUMA policies from the mm triggering
366 * the readahead.
367 *
368 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
369 */
370 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
371 struct vm_area_struct *vma, unsigned long addr)
372 {
373 int nr_pages;
374 struct page *page;
375 unsigned long offset;
376 unsigned long end_offset;
377
378 /*
379 * Get starting offset for readaround, and number of pages to read.
380 * Adjust starting address by readbehind (for NUMA interleave case)?
381 * No, it's very unlikely that swap layout would follow vma layout,
382 * more likely that neighbouring swap pages came from the same node:
383 * so use the same "addr" to choose the same node for each swap read.
384 */
385 nr_pages = valid_swaphandles(entry, &offset);
386 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
387 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
389 gfp_mask, vma, addr);
390 if (!page)
391 break;
392 page_cache_release(page);
393 }
394 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async()
273 * Locate a page of swap in physical memory, reserving swap cache space
274 * and reading the disk if it is not already cached.
275 * A failure return means that either the page allocation failed or that
276 * the swap entry is no longer in use.
277 */
278 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
279 struct vm_area_struct *vma, unsigned long addr)
280 {
281 struct page *found_page, *new_page = NULL;
282 int err;
283
284 do {
285 /*
286 * First check the swap cache. Since this is normally
287 * called after lookup_swap_cache() failed, re-calling
288 * that would confuse statistics.
289 */
290 found_page = find_get_page(&swapper_space, entry.val);
291 if (found_page)
292 break;
293
294 /*
295 * Get a new page to read into from swap.
296 */
297 if (!new_page) {
298 new_page = alloc_page_vma(gfp_mask, vma, addr);
299 if (!new_page)
300 break; /* Out of memory */
301 }
302
303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
311 * Swap entry may have been freed since our caller observed it.
312 */
313 err = swapcache_prepare(entry);
314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
316 continue;
317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
320 break;
321 }
322
323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
324 __set_page_locked(new_page);
325 SetPageSwapBacked(new_page);
326 err = __add_to_swap_cache(new_page, entry);
327 if (likely(!err)) {
328 radix_tree_preload_end();
329 /*
330 * Initiate read into locked page and return.
331 */
332 lru_cache_add_anon(new_page);
333 swap_readpage(new_page);
334 return new_page;
335 }
336 radix_tree_preload_end();
337 ClearPageSwapBacked(new_page);
338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
343 swapcache_free(entry, NULL);
344 } while (err != -ENOMEM);
345
346 if (new_page)
347 page_cache_release(new_page);
348 return found_page;
349 }
Note in the code above that whenever a page allocation can sleep, the conditions that held before the allocation must be re-checked on return:
310 /*
311 * Swap entry may have been freed since our caller observed it.
312 */
313 err = swapcache_prepare(entry);
Next a bio struct is set up and submitted to the block device layer.
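A rough sketch (kernel context, not the real get_swap_bio()/swap_readpage() code; the function name and the way the sector is obtained are assumptions, the real code maps the swap offset to a sector via map_swap_page()) of what that amounts to: wrap the single page into a bio aimed at the swap device, attach the completion callback, and hand it to the block layer.

static void submit_swap_read(struct block_device *bdev, sector_t sector,
                             struct page *page)
{
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;                /* disk location of the page slot */
        bio_add_page(bio, page, PAGE_SIZE, 0);  /* read into this page */
        bio->bi_end_io = end_swap_bio_read;     /* completion callback (shown below) */
        submit_bio(READ, bio);                  /* hand off to the block device layer */
}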
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage()
117 int swap_readpage(struct page *page)
118 {
119 struct bio *bio;
120 int ret = 0;
121
122 VM_BUG_ON(!PageLocked(page));
123 VM_BUG_ON(PageUptodate(page));
124 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); // initialize bio struct
125 if (bio == NULL) {
126 unlock_page(page);
127 ret = -ENOMEM;
128 goto out;
129 }
130 count_vm_event(PSWPIN);
131 submit_bio(READ, bio);
132 out:
133 return ret;
134 }
Note that end_swap_bio_read above is a callback: it releases the page lock and calls bio_put() to drop the reference count of the bio struct, freeing the bio struct once the count reaches zero.
__generic_make_request() below is what hands the bio struct over to the device driver. Disk I/O is slow, and __generic_make_request() calls might_sleep() (line 1412 below). On a server kernel configured with CONFIG_PREEMPT_NONE, might_sleep() is essentially a no-op and the current process simply carries on; a desktop kernel defaults to CONFIG_PREEMPT_VOLUNTARY, in which case a reschedule can happen here and another process may take the CPU. When the disk read completes, an interrupt is raised and the waiting process is woken up; end_swap_bio_read() does the cleanup.
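The CONFIG_PREEMPT_NONE vs. CONFIG_PREEMPT_VOLUNTARY difference boils down to how might_sleep()/might_resched() expand; simplified from the headers of this kernel generation:

#ifdef CONFIG_PREEMPT_VOLUNTARY
# define might_resched() _cond_resched()        /* voluntary preemption point: may schedule */
#else
# define might_resched() do { } while (0)       /* CONFIG_PREEMPT_NONE servers: a no-op */
#endif

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
# define might_sleep() \
        do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else
# define might_sleep() do { might_resched(); } while (0)
#endif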
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio()
1542 /**
1543 * submit_bio - submit a bio to the block device layer for I/O
1544 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1545 * @bio: The &struct bio which describes the I/O
1546 *
1547 * submit_bio() is very similar in purpose to generic_make_request(), and
1548 * uses that function to do most of the work. Both are fairly rough
1549 * interfaces; @bio must be presetup and ready for I/O.
1550 *
1551 */
1552 void submit_bio(int rw, struct bio *bio)
1553 {
1554 int count = bio_sectors(bio);
1555
1556 bio->bi_rw |= rw;
1557
1558 /*
1559 * If it's a regular read/write or a barrier with data attached,
1560 * go through the normal accounting stuff before submission.
1561 */
1562 if (bio_has_data(bio)) {
1563 if (rw & WRITE) {
1564 count_vm_events(PGPGOUT, count);
1565 } else {
1566 task_io_account_read(bio->bi_size);
1567 count_vm_events(PGPGIN, count);
1568 }
1569
1570 if (unlikely(block_dump)) {
1571 char b[BDEVNAME_SIZE];
1572 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1573 current->comm, task_pid_nr(current),
1574 (rw & WRITE) ? "WRITE" : "READ",
1575 (unsigned long long)bio->bi_sector,
1576 bdevname(bio->bi_bdev, b));
1577 }
1578 }
1579
1580 generic_make_request(bio);
1581 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio() -> generic_make_request()
1490 /*
1491 * We only want one ->make_request_fn to be active at a time,
1492 * else stack usage with stacked devices could be a problem.
1493 * So use current->bio_{list,tail} to keep a list of requests
1494 * submited by a make_request_fn function.
1495 * current->bio_tail is also used as a flag to say if
1496 * generic_make_request is currently active in this task or not.
1497 * If it is NULL, then no make_request is active. If it is non-NULL,
1498 * then a make_request is active, and new requests should be added
1499 * at the tail
1500 */
1501 void generic_make_request(struct bio *bio)
1502 {
1503 if (current->bio_tail) {
1504 /* make_request is active */
1505 *(current->bio_tail) = bio;
1506 bio->bi_next = NULL;
1507 current->bio_tail = &bio->bi_next;
1508 return;
1509 }
1510 /* following loop may be a bit non-obvious, and so deserves some
1511 * explanation.
1512 * Before entering the loop, bio->bi_next is NULL (as all callers
1513 * ensure that) so we have a list with a single bio.
1514 * We pretend that we have just taken it off a longer list, so
1515 * we assign bio_list to the next (which is NULL) and bio_tail
1516 * to &bio_list, thus initialising the bio_list of new bios to be
1517 * added. __generic_make_request may indeed add some more bios
1518 * through a recursive call to generic_make_request. If it
1519 * did, we find a non-NULL value in bio_list and re-enter the loop
1520 * from the top. In this case we really did just take the bio
1521 * of the top of the list (no pretending) and so fixup bio_list and
1522 * bio_tail or bi_next, and call into __generic_make_request again.
1523 *
1524 * The loop was structured like this to make only one call to
1525 * __generic_make_request (which is important as it is large and
1526 * inlined) and to keep the structure simple.
1527 */
1528 BUG_ON(bio->bi_next);
1529 do {
1530 current->bio_list = bio->bi_next;
1531 if (bio->bi_next == NULL)
1532 current->bio_tail = &current->bio_list;
1533 else
1534 bio->bi_next = NULL;
1535 __generic_make_request(bio);
1536 bio = current->bio_list;
1537 } while (bio);
1538 current->bio_tail = NULL; /* deactivate */
1539 }
do_swap_page() -> swapin_readahead() -> read_swap_cache_async() -> swap_readpage() ->
submit_bio() -> generic_make_request() -> __generic_make_request()
1380 /**
1381 * generic_make_request - hand a buffer to its device driver for I/O
1382 * @bio: The bio describing the location in memory and on the device.
1383 *
1384 * generic_make_request() is used to make I/O requests of block
1385 * devices. It is passed a &struct bio, which describes the I/O that needs
1386 * to be done.
1387 *
1388 * generic_make_request() does not return any status. The
1389 * success/failure status of the request, along with notification of
1390 * completion, is delivered asynchronously through the bio->bi_end_io
1391 * function described (one day) else where.
1392 *
1393 * The caller of generic_make_request must make sure that bi_io_vec
1394 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1395 * set to describe the device address, and the
1396 * bi_end_io and optionally bi_private are set to describe how
1397 * completion notification should be signaled.
1398 *
1399 * generic_make_request and the drivers it calls may use bi_next if this
1400 * bio happens to be merged with someone else, and may change bi_dev and
1401 * bi_sector for remaps as it sees fit. So the values of these fields
1402 * should NOT be depended on after the call to generic_make_request.
1403 */
1404 static inline void __generic_make_request(struct bio *bio)
1405 {
1406 struct request_queue *q;
1407 sector_t old_sector;
1408 int ret, nr_sectors = bio_sectors(bio);
1409 dev_t old_dev;
1410 int err = -EIO;
1411
1412 might_sleep();
1413
1414 if (bio_check_eod(bio, nr_sectors))
1415 goto end_io;
1416
1417 /*
1418 * Resolve the mapping until finished. (drivers are
1419 * still free to implement/resolve their own stacking
1420 * by explicitly returning 0)
1421 *
1422 * NOTE: we don't repeat the blk_size check for each new device.
1423 * Stacking drivers are expected to know what they are doing.
1424 */
1425 old_sector = -1;
1426 old_dev = 0;
1427 do {
1428 char b[BDEVNAME_SIZE];
1429
1430 q = bdev_get_queue(bio->bi_bdev);
1431 if (unlikely(!q)) {
1432 printk(KERN_ERR
1433 "generic_make_request: Trying to access "
1434 "nonexistent block-device %s (%Lu)\n",
1435 bdevname(bio->bi_bdev, b),
1436 (long long) bio->bi_sector);
1437 goto end_io;
1438 }
1439
1440 if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1441 nr_sectors > queue_max_hw_sectors(q))) {
1442 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1443 bdevname(bio->bi_bdev, b),
1444 bio_sectors(bio),
1445 queue_max_hw_sectors(q));
1446 goto end_io;
1447 }
1448
1449 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1450 goto end_io;
1451
1452 if (should_fail_request(bio))
1453 goto end_io;
1454
1455 /*
1456 * If this device has partitions, remap block n
1457 * of partition p to block n+start(p) of the disk.
1458 */
1459 blk_partition_remap(bio);
1460
1461 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1462 goto end_io;
1463
1464 if (old_sector != -1)
1465 trace_block_remap(q, bio, old_dev, old_sector);
1466
1467 old_sector = bio->bi_sector;
1468 old_dev = bio->bi_bdev->bd_dev;
1469
1470 if (bio_check_eod(bio, nr_sectors))
1471 goto end_io;
1472
1473 if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1474 !blk_queue_discard(q)) {
1475 err = -EOPNOTSUPP;
1476 goto end_io;
1477 }
1478
1479 trace_block_bio_queue(q, bio);
1480
1481 ret = q->make_request_fn(q, bio);
1482 } while (ret);
1483
1484 return;
1485
1486 end_io:
1487 bio_endio(bio, err); // end_swap_bio_read()
1488 }
Now the cleanup work:
68 void end_swap_bio_read(struct bio *bio, int err)
69 {
70 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
71 struct page *page = bio->bi_io_vec[0].bv_page;
72
73 if (!uptodate) {
74 SetPageError(page);
75 ClearPageUptodate(page);
76 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
77 imajor(bio->bi_bdev->bd_inode),
78 iminor(bio->bi_bdev->bd_inode),
79 (unsigned long long)bio->bi_sector);
80 } else {
81 SetPageUptodate(page);
82 }
83 unlock_page(page);
84 bio_put(bio);
85 }