/*
 * Try finding a free buddy page on the fallback list and put it on the free
 * list of requested migratetype, possibly along with other pages from the same
 * block, depending on fragmentation avoidance heuristics. Returns true if
 * fallback was found so that __rmqueue_smallest() can grab it.
 *
 * The use of signed ints for order and current_order is a deliberate
 * deviation from the rest of this file, to make the for loop
 * condition simpler.
 */
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
                   unsigned int alloc_flags)
{
    struct free_area *area;
    int current_order;
    int min_order = order;
    struct page *page;
    int fallback_mt;
    bool can_steal;
    /*
     * Do not steal pages from freelists belonging to other pageblocks
     * i.e. orders < pageblock_order. If there are no local zones free,
     * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
     */
    // pageblock_order is MAX_ORDER - 1 unless CONFIG_HUGETLB_PAGE is enabled,
    // in which case it is HUGETLB_PAGE_ORDER
    if (alloc_flags & ALLOC_NOFRAGMENT)
        min_order = pageblock_order;
    /*
     * Find the largest available free page in the other list. This roughly
     * approximates finding the pageblock with the most free pages, which
     * would be too costly to do exactly.
     */
    for (current_order = MAX_ORDER - 1; current_order >= min_order;
                --current_order) {
        area = &(zone->free_area[current_order]);
        // ------------------------------------------------------------------------------------------------ (1)
        fallback_mt = find_suitable_fallback(area, current_order,
                start_migratetype, false, &can_steal);
        if (fallback_mt == -1)
            continue;
        /*
         * We cannot steal all free pages from the pageblock and the
         * requested migratetype is movable. In that case it's better to
         * steal and split the smallest available page instead of the
         * largest available page, because even if the next movable
         * allocation falls back into a different pageblock than this
         * one, it won't cause permanent fragmentation.
         */
        // ------------------------------------------------------------------------------------------------ (2)
        if (!can_steal && start_migratetype == MIGRATE_MOVABLE
                    && current_order > order)
            goto find_smallest;
        goto do_steal;
    }
    return false;
find_smallest:
    // ---------------------------------------------------------------------------------------------------- (3)
    for (current_order = order; current_order < MAX_ORDER; current_order++) {
        area = &(zone->free_area[current_order]);
        fallback_mt = find_suitable_fallback(area, current_order,
                start_migratetype, false, &can_steal);
        if (fallback_mt != -1)
            break;
    }
    /*
     * This should not happen - we already found a suitable fallback
     * when looking for the largest page.
     */
    VM_BUG_ON(current_order == MAX_ORDER);
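Both loops above lean on find_suitable_fallback() to pick a donor migratetype for the requested order. The sketch below is a userspace-only model of that decision, not the kernel's exact code: the table, the helper name pick_fallback() and the can_steal rule are simplified assumptions. The idea is the same, though: walk a per-migratetype preference table and return the first fallback type that still has free pages at this order, also reporting whether taking over the whole pageblock looks worthwhile.

// Userspace model only: names mirror the kernel's, but types, the fallback
// table and the can_steal rule are simplified assumptions for illustration.
#include <stdbool.h>
#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_TYPES };
#define NR_ORDERS 11

static int nr_free[NR_ORDERS][NR_TYPES];    /* mock free-list occupancy */

/* which migratetypes a request may fall back to, in preference order */
static const int fallback_table[NR_TYPES][2] = {
    [UNMOVABLE]   = { RECLAIMABLE, MOVABLE },
    [MOVABLE]     = { RECLAIMABLE, UNMOVABLE },
    [RECLAIMABLE] = { UNMOVABLE,   MOVABLE },
};

/* return the first fallback type with free pages at this order, or -1 */
static int pick_fallback(int order, int start_type, bool *can_steal)
{
    /* stand-in for the kernel's can_steal_fallback() heuristic */
    *can_steal = order >= NR_ORDERS / 2;
    for (int i = 0; i < 2; i++) {
        int mt = fallback_table[start_type][i];
        if (nr_free[order][mt] > 0)
            return mt;
    }
    return -1;
}

int main(void)
{
    bool can_steal;

    nr_free[9][MOVABLE] = 3;    /* only MOVABLE has order-9 blocks free */
    int mt = pick_fallback(9, UNMOVABLE, &can_steal);
    printf("fallback migratetype %d, can_steal %d\n", mt, can_steal);
    return 0;
}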
// #####################################################################################
// steal_suitable_fallback()
// #####################################################################################
/*
 * This function implements actual steal behaviour. If order is large enough,
 * we can steal whole pageblock. If not, we first move freepages in this
 * pageblock to our migratetype and determine how many already-allocated pages
 * are there in the pageblock with a compatible migratetype. If at least half
 * of pages are free or compatible, we can change migratetype of the pageblock
 * itself, so pages freed in the future will be put on the correct free list.
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
        unsigned int alloc_flags, int start_type, bool whole_block)
{
    unsigned int current_order = page_order(page);
    struct free_area *area;
    int free_pages, movable_pages, alike_pages;
    int old_block_type;
    old_block_type = get_pageblock_migratetype(page);
    /*
     * This can happen due to races and we want to prevent broken
     * highatomic accounting.
     */
    if (is_migrate_highatomic(old_block_type))
        goto single_page;
    /* Take ownership for orders >= pageblock_order */
    if (current_order >= pageblock_order) {
        change_pageblock_range(page, current_order, start_type);
        goto single_page;
    }
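    // change_pageblock_range() (paraphrased sketch, not verbatim kernel
    // source): for a stolen range of order >= pageblock_order it simply
    // re-tags every pageblock the range covers, one pageblock at a time:
    //
    //     nr_pageblocks = 1 << (current_order - pageblock_order);
    //     while (nr_pageblocks--) {
    //         set_pageblock_migratetype(page, start_type);
    //         page += pageblock_nr_pages;
    //     }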
    /*
     * Boost watermarks to increase reclaim pressure to reduce the
     * likelihood of future fallbacks. Wake kswapd now as the node
     * may be balanced overall and kswapd will not wake naturally.
     */
    boost_watermark(zone);
    if (alloc_flags & ALLOC_KSWAPD)
        set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    /* We are not allowed to try stealing from the whole block */
    // --------------------------------------------------------------------------------- (1)
    if (!whole_block)
        goto single_page;
    free_pages = move_freepages_block(zone, page, start_type,
                        &movable_pages);
    /*
     * Determine how many pages are compatible with our allocation.
     * For movable allocation, it's the number of movable pages which
     * we just obtained. For other types it's a bit more tricky.
     */
    if (start_type == MIGRATE_MOVABLE) {
        alike_pages = movable_pages;
    } else {
        /*
         * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
         * to MOVABLE pageblock, consider all non-movable pages as
         * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
         * vice versa, be conservative since we can't distinguish the
         * exact migratetype of non-movable pages.
         */
        if (old_block_type == MIGRATE_MOVABLE)
            alike_pages = pageblock_nr_pages
                        - (free_pages + movable_pages);
        else
            alike_pages = 0;
    }
    /* moving whole block can fail due to zone boundary conditions */
    if (!free_pages)
        goto single_page;
    /*
     * If a sufficient number of pages in the block are either free or of
     * comparable migratability as our allocation, claim the whole block.
     */
    if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
            page_group_by_mobility_disabled)
        set_pageblock_migratetype(page, start_type);
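The threshold 1 << (pageblock_order - 1) is simply half a pageblock. A tiny worked example, assuming the common pageblock_order of 9 (512 pages per pageblock) and made-up page counts:

// Worked example of the "claim the whole pageblock" test, assuming a
// pageblock_order of 9, i.e. 512 pages per pageblock.
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    int pageblock_order = 9;
    int half_block = 1 << (pageblock_order - 1);    /* 256 pages */

    int free_pages = 180;     /* free pages we just moved to our list */
    int alike_pages = 100;    /* allocated pages of a compatible type */

    bool claim = free_pages + alike_pages >= half_block;
    printf("%d + %d >= %d ? %s: claim the pageblock\n",
           free_pages, alike_pages, half_block, claim ? "yes" : "no");
    return 0;
}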
// #####################################################################################
// move_freepages()
// #####################################################################################
/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
static int move_freepages(struct zone *zone,
              struct page *start_page,
              struct page *end_page,
              int migratetype, int *num_movable)
{
    struct page *page;
    unsigned int order;
    int pages_moved = 0;
#ifndef CONFIG_HOLES_IN_ZONE
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */
    VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
          pfn_valid(page_to_pfn(end_page)) &&
          page_zone(start_page) != page_zone(end_page));
#endif
    for (page = start_page; page <= end_page;) {
        if (!pfn_valid_within(page_to_pfn(page))) {
            page++;
            continue;
        }
        /* Make sure we are not inadvertently changing nodes */
        VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
        if (!PageBuddy(page)) {
            /*
             * We assume that pages that could be isolated for
             * migration are movable. But we don't actually try
             * isolating, as that would be expensive.
             */
            if (num_movable &&
                    (PageLRU(page) || __PageMovable(page)))
                (*num_movable)++;
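move_freepages() itself accepts an arbitrary page range; it is its caller move_freepages_block() that rounds the range out to pageblock boundaries and bails out if the block straddles the zone. A minimal standalone sketch of that alignment step follows. The pfn and zone bounds here are made-up example values, and the real kernel checks them with zone_spans_pfn() rather than a plain comparison.

// Standalone sketch of the pageblock alignment done by the caller; the pfn
// and zone bounds are made-up example values.
#include <stdio.h>

#define PAGEBLOCK_ORDER     9
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

int main(void)
{
    unsigned long pfn = 0x12345;    /* pfn of the page we hold */
    unsigned long zone_start = 0x10000, zone_end = 0x20000;

    /* round down/up to the enclosing pageblock */
    unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
    unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

    /* the whole block must lie inside the zone, or nothing is moved */
    if (start_pfn < zone_start || end_pfn >= zone_end) {
        printf("pageblock straddles the zone boundary, skip\n");
        return 0;
    }
    printf("move pfns 0x%lx..0x%lx\n", start_pfn, end_pfn);
    return 0;
}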
struct per_cpu_pages {
    int count;      /* number of pages in the list */
    int high;       /* high watermark, emptying needed */
    int batch;      /* chunk size for buddy add/remove */
    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};
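The count/high/batch triple drives both directions of the per-CPU cache: order-0 allocations pull batch pages from the buddy allocator when the list runs dry, and frees hand a batch back once count climbs past high. Below is a toy userspace model of that policy; the numbers and the alloc_one()/free_one() helpers are illustrative assumptions, not kernel code.

// Toy model of the pcp count/high/batch policy; thresholds are example values.
#include <stdio.h>

struct pcp_model {
    int count;    /* pages currently cached on this CPU */
    int high;     /* free pages back to the buddy above this */
    int batch;    /* how many pages move per refill/drain */
};

static void alloc_one(struct pcp_model *pcp)
{
    if (pcp->count == 0) {
        pcp->count += pcp->batch;    /* refill from the buddy allocator */
        printf("refilled %d pages from buddy\n", pcp->batch);
    }
    pcp->count--;                    /* hand one page out */
}

static void free_one(struct pcp_model *pcp)
{
    pcp->count++;                    /* page goes back on the pcp list */
    if (pcp->count >= pcp->high) {
        pcp->count -= pcp->batch;    /* drain a batch back to the buddy */
        printf("drained %d pages to buddy\n", pcp->batch);
    }
}

int main(void)
{
    struct pcp_model pcp = { .count = 0, .high = 186, .batch = 31 };

    alloc_one(&pcp);                 /* first allocation triggers a refill */
    for (int i = 0; i < 200; i++)    /* enough frees to trigger drains */
        free_one(&pcp);
    printf("final count %d\n", pcp.count);
    return 0;
}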
/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, unsigned int alloc_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
            struct zone *zone, gfp_t gfp_flags,
            int migratetype, unsigned int alloc_flags)
{
    struct per_cpu_pages *pcp;
    struct list_head *list;
    struct page *page;
    unsigned long flags;
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
            unsigned int alloc_flags,
            struct per_cpu_pages *pcp,
            struct list_head *list)
{
    struct page *page;
    do {
        // Case 1: the list is empty, so refill it with pcp->batch pages
        // pulled from the buddy allocator via rmqueue_bulk()
        if (list_empty(list)) {
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, alloc_flags);
            if (unlikely(list_empty(list)))
                return NULL;
        }
        // Case 2: the list is not empty, so simply take the first page off it
        page = list_first_entry(list, struct page, lru);
        list_del(&page->lru);
        pcp->count--;
    } while (check_new_pcp(page));    // retry with another page if this one fails the sanity checks
/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, unsigned int alloc_flags)
{
    int i, alloced = 0;
    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) {
        struct page *page = __rmqueue(zone, order, migratetype,
                                alloc_flags);
        if (unlikely(page == NULL))
            break;
        if (unlikely(check_pcp_refill(page)))
            continue;
        /*
         * Split buddy pages returned by expand() are received here in
         * physical page order. The page is added to the tail of
         * caller's list. From the callers perspective, the linked list
         * is ordered by page number under some conditions. This is
         * useful for IO devices that can forward direction from the
         * head, thus also in the physical page order. This is useful
         * for IO devices that can merge IO requests if the physical
         * pages are ordered properly.
         */
        // each newly obtained page is appended to the tail of the list
        list_add_tail(&page->lru, list);
        alloced++;
        if (is_migrate_cma(get_pcppage_migratetype(page)))
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                          -(1 << order));
    }
    /*
     * i pages were removed from the buddy list even if some leak due
     * to check_pcp_refill failing so adjust NR_FREE_PAGES based
     * on i. Do not confuse with 'alloced' which is the number of
     * pages added to the pcp list.
     */
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    spin_unlock(&zone->lock);
    return alloced;
}
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PageBuddy.
 * Page's order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
continue_merging:
    while (order < max_order - 1) {
        if (compaction_capture(capc, page, order, migratetype)) {
            __mod_zone_freepage_state(zone, -(1 << order),
                                migratetype);
            return;
        }
        // ------------------------------------------------------------- (1)
        // buddy_pfn = page_pfn ^ (1 << order)
        buddy_pfn = __find_buddy_pfn(pfn, order);
        buddy = page + (buddy_pfn - pfn);
        // pfn_valid_within() always returns 1 when CONFIG_HOLES_IN_ZONE is not defined
        if (!pfn_valid_within(buddy_pfn))
            goto done_merging;
        // the buddy must have the same zone and order as page, must not sit in a
        // hole, and must still be in the buddy system (i.e. not currently allocated)
        if (!page_is_buddy(page, buddy, order))
            goto done_merging;
        /*
         * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
         * merge with it and move up one order.
         */
        if (page_is_guard(buddy))
            clear_page_guard(zone, buddy, order, migratetype);
        else
            // take the buddy off its free list
            del_page_from_free_area(buddy, &zone->free_area[order]);
        // pfn of the merged block
        combined_pfn = buddy_pfn & pfn;
        // page of the merged block
        page = page + (combined_pfn - pfn);
        pfn = combined_pfn;
        // bump the order and keep merging upward
        order++;
        // ------------------------------------------------------------- (1)
    }
    // ----------------------------------------------------------------- (2)
    if (max_order < MAX_ORDER) {
        /* If we are here, it means order is >= pageblock_order.
         * We want to prevent merge between freepages on isolate
         * pageblock and normal pageblock. Without this, pageblock
         * isolation could cause incorrect freepage or CMA accounting.
         *
         * We don't want to hit this code for the more frequent
         * low-order merging.
         */
        if (unlikely(has_isolate_pageblock(zone))) {
            int buddy_mt;
    // ----------------------------------------------------------------- (3)
    // this path (done_merging) is reached when page_is_buddy() above was not satisfied
    /*
     * If this is not the largest possible page, check if the buddy
     * of the next-highest order is free. If it is, it's possible
     * that pages are being freed that will coalesce soon. In case,
     * that is happening, add the free page to the tail of the list
     * so it's less likely to be used soon and more likely to be merged
     * as a higher order page
     */
    if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) &&
            !is_shuffle_order(order)) {
        struct page *higher_page, *higher_buddy;
        combined_pfn = buddy_pfn & pfn;
        higher_page = page + (combined_pfn - pfn);
        // look for the buddy one order higher
        buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
        higher_buddy = higher_page + (buddy_pfn - combined_pfn);
        if (pfn_valid_within(buddy_pfn) &&
            page_is_buddy(higher_page, higher_buddy, order + 1)) {
            add_to_free_area_tail(page, &zone->free_area[order],
                          migratetype);
            return;
        }
    }
    // ----------------------------------------------------------------- (4)
    // shuffling means the page is inserted at a random position, i.e. either
    // the head or the tail of the free list
    if (is_shuffle_order(order))
        add_to_free_area_random(page, &zone->free_area[order],
                migratetype);
    else
        add_to_free_area(page, &zone->free_area[order], migratetype);
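All of the buddy arithmetic in __free_one_page() reduces to two bit tricks: XOR-ing the pfn with 1 << order flips exactly the bit that separates the two buddies, and AND-ing the two pfns clears it again, yielding the start of the merged block. A small standalone demonstration with concrete numbers:

// Standalone demo of the buddy pfn arithmetic: pfn ^ (1 << order) locates
// the buddy block, and buddy_pfn & pfn gives the start of the merged block.
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
    return pfn ^ (1UL << order);
}

int main(void)
{
    unsigned long pfn = 0x1234;     /* start of an order-2 free block */
    unsigned int order = 2;

    unsigned long buddy_pfn = find_buddy_pfn(pfn, order);   /* 0x1230 */
    unsigned long combined_pfn = buddy_pfn & pfn;           /* 0x1230 */

    printf("pfn 0x%lx, buddy 0x%lx, merged order-%u block starts at 0x%lx\n",
           pfn, buddy_pfn, order + 1, combined_pfn);
    return 0;
}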
    /*
     * We only track unmovable, reclaimable and movable on pcp lists.
     * Free ISOLATE pages back to the allocator because they are being
     * offlined but treat HIGHATOMIC as movable pages so we can get those
     * areas back if necessary. Otherwise, we may have to free
     * excessively into the page allocator
     */
    if (migratetype >= MIGRATE_PCPTYPES) {
        // release the page straight through the buddy system, which can
        // of course handle order == 0 as well
        if (unlikely(is_migrate_isolate(migratetype))) {
            free_one_page(zone, page, pfn, 0, migratetype);
            return;
        }
        migratetype = MIGRATE_MOVABLE;
    }
/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
                    struct per_cpu_pages *pcp)
{
    int migratetype = 0;
    int batch_free = 0;
    int prefetch_nr = 0;
    bool isolated_pageblocks;
    struct page *page, *tmp;
    LIST_HEAD(head);
    // ------------------------------------------------------ (1)
    while (count) {
        struct list_head *list;
        /*
         * Remove pages from lists in a round-robin fashion. A
         * batch_free count is maintained that is incremented when an
         * empty list is encountered. This is so more pages are freed
         * off fuller lists instead of spinning excessively around empty
         * lists
         */
        do {
            batch_free++;
            if (++migratetype == MIGRATE_PCPTYPES)
                migratetype = 0;
            list = &pcp->lists[migratetype];
        } while (list_empty(list));
        /* This is the only non-empty list. Free them all. */
        if (batch_free == MIGRATE_PCPTYPES)
            batch_free = count;
        do {
            page = list_last_entry(list, struct page, lru);
            /* must delete to avoid corrupting pcp list */
            list_del(&page->lru);
            pcp->count--;
            if (bulkfree_pcp_prepare(page))
                continue;
            list_add_tail(&page->lru, &head);
            /*
             * We are going to put the page back to the global
             * pool, prefetch its buddy to speed up later access
             * under zone->lock. It is believed the overhead of
             * an additional test and calculating buddy_pfn here
             * can be offset by reduced memory latency later. To
             * avoid excessive prefetching due to large count, only
             * prefetch buddy for the first pcp->batch nr of pages.
             */
            if (prefetch_nr++ < pcp->batch)
                prefetch_buddy(page);
        } while (--count && --batch_free && !list_empty(list));
    }
    /*
     * Use safe version since after __free_one_page(),
     * page->lru.next will not point to original list.
     */
    // ------------------------------------------------------ (2)
    list_for_each_entry_safe(page, tmp, &head, lru) {
        int mt = get_pcppage_migratetype(page);
        /* MIGRATE_ISOLATE page should not go to pcplists */
        VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
        /* Pageblock could have been isolated meanwhile */
        if (unlikely(isolated_pageblocks))
            mt = get_pageblock_migratetype(page);
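The round-robin drain earlier in free_pcppages_bulk() is easy to lose track of: batch_free grows by one for every empty list skipped, so fuller lists give up more pages per outer iteration, and once every other list is empty the remaining one is drained for the whole remaining count. A standalone model of just that selection logic, with made-up list sizes (count is assumed not to exceed the total number of cached pages):

// Standalone model of the round-robin draining policy in free_pcppages_bulk():
// batch_free counts the empty lists skipped, and that many pages are then
// taken from the first non-empty list found.
#include <stdio.h>

#define MIGRATE_PCPTYPES 3

int main(void)
{
    int lists[MIGRATE_PCPTYPES] = { 0, 8, 2 };    /* pages cached per migratetype */
    int count = 6;                                /* pages we want to free */
    int migratetype = 0, batch_free = 0;

    while (count) {
        /* walk the lists round-robin, counting empty ones as we go */
        do {
            batch_free++;
            if (++migratetype == MIGRATE_PCPTYPES)
                migratetype = 0;
        } while (lists[migratetype] == 0);

        /* every other list was empty: drain the rest from this one */
        if (batch_free == MIGRATE_PCPTYPES)
            batch_free = count;

        do {
            lists[migratetype]--;
            printf("freed one page from migratetype %d\n", migratetype);
        } while (--count && --batch_free && lists[migratetype]);
    }
    return 0;
}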