/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)
{
	struct page *page;
	// By default the allocation may proceed as long as the zone stays above the low watermark
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = { };
	/*
	 * There are several places where we assume that the order value is sane
	 * so bail out early if the request is out of bound.
	 */
	if (unlikely(order >= MAX_ORDER)) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}
	/*
	 * Forbid the first pass from falling back to types that fragment
	 * memory until all local zones are considered.
	 */
	// Allocation-flag setup related to fragmentation avoidance; it can be ignored for now
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
	/* First allocation attempt */
	// ------------------------------------------------------------------------------------------------ (3)
	// First allocation attempt; the watermark used here is ALLOC_WMARK_LOW
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

	// Reaching this point means the fast-path allocation above failed, so we retry
	// via __alloc_pages_slowpath()
	/*
	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
	 * resp. GFP_NOIO which has to be inherited for all allocation requests
	 * from a particular context which has been marked by
	 * memalloc_no{fs,io}_{save,restore}.
	 */
	// Strip __GFP_FS / __GFP_IO from the GFP mask if the task is inside a
	// memalloc_nofs/memalloc_noio scope
	alloc_mask = current_gfp_context(gfp_mask);
	ac.spread_dirty_pages = false;
	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	if (unlikely(ac.nodemask != nodemask))
		ac.nodemask = nodemask;

	// Try the allocation again. Since the first attempt failed, memory is probably
	// running low, so the slow path does extra work (memory reclaim, compaction, etc.)
	// before allocating
	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
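// A quick aside on the "order >= MAX_ORDER" check above: order is the log2 of the
// number of contiguous pages requested, so the largest single buddy allocation is
// order MAX_ORDER - 1. The stand-alone user-space sketch below only illustrates that
// relationship and the early rejection; the TOY_MAX_ORDER and TOY_PAGE_SIZE values are
// assumptions (the common defaults), not taken from any particular kernel build.
#include <stdio.h>
#include <stddef.h>

#define TOY_MAX_ORDER 11        /* assumed: the usual kernel default */
#define TOY_PAGE_SIZE 4096UL    /* assumed: 4 KiB pages */

/* Reject out-of-range orders up front, like __alloc_pages_nodemask() does. */
static size_t toy_order_to_bytes(unsigned int order)
{
	if (order >= TOY_MAX_ORDER)
		return 0;
	return (1UL << order) * TOY_PAGE_SIZE;
}

int main(void)
{
	for (unsigned int order = 0; order <= TOY_MAX_ORDER; order++)
		printf("order %2u -> %8zu bytes%s\n", order, toy_order_to_bytes(order),
		       order >= TOY_MAX_ORDER ? " (rejected)" : "");
	return 0;
}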
// #####################################################################################
// finalise_ac()
// #####################################################################################
/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
	/* Dirty zone balancing only done in the fast path */
	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	// In the for_next_zone_zonelist_nodemask() macro, ac->preferred_zoneref is the
	// cursor the iteration starts from
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
}
// ./include/linux/mmzone.h
// #####################################################################################
// first_zones_zonelist()
// #####################################################################################
/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist - The zonelist to search for a suitable zone
 * @highest_zoneidx - The zone index of the highest zone to return
 * @nodes - An optional nodemask to filter the zonelist with
 * @return - Zoneref pointer for the first suitable zone found (see below)
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	return next_zones_zonelist(zonelist->_zonerefs,
							highest_zoneidx, nodes);
}
// #####################################################################################
// next_zones_zonelist()
// #####################################################################################
/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z - The cursor used as a starting point for the search
 * @highest_zoneidx - The zone index of the highest zone to return
 * @nodes - An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	// The common case: 1) no nodemask is given; 2) the cursor's current zone index
	// is already <= highest_zoneidx. In that case the current cursor is returned directly
	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
		return z;
	return __next_zones_zonelist(z, highest_zoneidx, nodes);
}
// ./mm/mmzone.c
// #####################################################################################
// __next_zones_zonelist()
// #####################################################################################
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *__next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	/*
	 * Find the next suitable zone to use for the allocation.
	 * Only filter based on nodemask if it's set
	 */
	// Without a nodemask, simply take the first zone with zone_idx <= highest_zoneidx
	if (unlikely(nodes == NULL))
		while (zonelist_zone_idx(z) > highest_zoneidx)
			z++;
	// With a nodemask, the zone must both satisfy zone_idx <= highest_zoneidx and
	// belong to one of the allowed nodes
	else
		while (zonelist_zone_idx(z) > highest_zoneidx ||
				(z->zone && !zref_in_nodemask(z, nodes)))
			z++;

	return z;
}
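// To make the cursor semantics above concrete, here is a small stand-alone model of a
// zonelist: an array of (zone index, node id) pairs terminated by a NULL-zone sentinel,
// filtered with the same two conditions as __next_zones_zonelist(). All names, values
// and the nodemask representation below are invented for illustration only.
#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for struct zoneref; zone_idx < 0 plays the role of zone == NULL. */
struct toy_zoneref { int zone_idx; int nid; };

static bool toy_nid_allowed(const struct toy_zoneref *z, const bool *nodes)
{
	return nodes == NULL || nodes[z->nid];
}

/* Skip entries above highest_zoneidx or (when a nodemask is given) on a
 * disallowed node; the sentinel always terminates the scan, mirroring the
 * "z->zone && ..." test in the kernel. */
static struct toy_zoneref *toy_next_zones_zonelist(struct toy_zoneref *z,
						   int highest_zoneidx,
						   const bool *nodes)
{
	while (z->zone_idx > highest_zoneidx ||
	       (z->zone_idx >= 0 && !toy_nid_allowed(z, nodes)))
		z++;
	return z;
}

int main(void)
{
	/* e.g. ZONE_NORMAL = 2, ZONE_DMA32 = 1, ZONE_DMA = 0 on two fake nodes */
	struct toy_zoneref zonelist[] = {
		{ 2, 0 }, { 1, 0 }, { 0, 0 },	/* local node 0, highest zone first */
		{ 2, 1 }, { 1, 1 },		/* remote node 1 */
		{ -1, 0 },			/* sentinel */
	};
	bool allowed[2] = { false, true };	/* nodemask: only node 1 allowed */

	struct toy_zoneref *z = toy_next_zones_zonelist(zonelist, 1, allowed);
	printf("first eligible zone: idx=%d nid=%d\n", z->zone_idx, z->nid);
	return 0;
}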
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;
	bool no_fallback;
retry:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
	z = ac->preferred_zoneref;
	// ----------------------------------------------------------------------------------------- (1)
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		struct page *page;
		unsigned long mark;
		// ------------------------------------------------------------------------------------- (2)
		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue;
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a node that is within its dirty
		 * limit, such that no single node holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the node's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-node dirty limit in the slowpath
		 * (spread_dirty_pages unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * nodes are together not big enough to reach the
		 * global limit. The proper fix for these situations
		 * will require awareness of nodes in the
		 * dirty-throttling and the flusher threads.
		 */
		if (ac->spread_dirty_pages) {
			if (last_pgdat_dirty_limit == zone->zone_pgdat)
				continue;
			if (!node_dirty_ok(zone->zone_pgdat)) {
				last_pgdat_dirty_limit = zone->zone_pgdat;
				continue;
			}
		}
		if (no_fallback && nr_online_nodes > 1 &&
		    zone != ac->preferred_zoneref->zone) {
			int local_nid;
			/*
			 * If moving to a remote node, retry but allow
			 * fragmenting fallbacks. Locality is more important
			 * than fragmentation avoidance.
			 */
			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
			if (zone_to_nid(zone) != local_nid) {
				alloc_flags &= ~ALLOC_NOFRAGMENT;
				goto retry;
			}
		}

		// ALLOC_WMARK_LOW is set here
		// ------------------------------------------------------------------------------------- (3)
		// Look up how many free pages the current watermark requires
		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
		// Check whether this zone's free pages satisfy that watermark
		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {
			int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/*
			 * Watermark failed for this zone, but see if we can
			 * grow this zone if it contains deferred pages.
			 */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;
			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;
			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue;
			}
		}

try_this_zone:
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);
			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);
			return page;
		} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/* Try again if zone has deferred pages */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
		}
	}
	/*
	 * It's possible on a UMA machine to get through all zones that are
	 * fragmented. If avoiding fragmentation, reset and try again.
	 */
	if (no_fallback) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

	return NULL;
}
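// The watermark test at step (3) boils down to: pick one of the zone's min/low/high
// watermarks based on the allocation flags, then check that enough free pages would
// remain above it (plus the lowmem reserve) after the request is carved out. The
// stand-alone model below captures that shape only; the flag encoding, field names and
// numbers are invented, and the per-order free-list check of the real
// zone_watermark_ok() is omitted.
#include <stdio.h>
#include <stdbool.h>

enum { TOY_WMARK_MIN = 0, TOY_WMARK_LOW = 1, TOY_WMARK_HIGH = 2 };
#define TOY_WMARK_MASK 0x3u

struct toy_zone {
	unsigned long watermark[3];	/* min, low, high (in pages) */
	unsigned long free_pages;
	unsigned long lowmem_reserve;	/* simplified: a single reserve value */
};

static bool toy_watermark_ok(const struct toy_zone *z, unsigned int order,
			     unsigned int alloc_flags)
{
	unsigned long mark = z->watermark[alloc_flags & TOY_WMARK_MASK];
	unsigned long request = 1UL << order;

	if (z->free_pages < request)
		return false;
	return z->free_pages - request >= mark + z->lowmem_reserve;
}

int main(void)
{
	struct toy_zone zone = {
		.watermark = { 128, 256, 384 },
		.free_pages = 200,
		.lowmem_reserve = 32,
	};

	/* The fast path uses the LOW mark; the slow path may drop down to MIN. */
	printf("order-0 vs LOW: %d\n", toy_watermark_ok(&zone, 0, TOY_WMARK_LOW));
	printf("order-0 vs MIN: %d\n", toy_watermark_ok(&zone, 0, TOY_WMARK_MIN));
	printf("order-6 vs MIN: %d\n", toy_watermark_ok(&zone, 6, TOY_WMARK_MIN));
	return 0;
}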
/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
	spin_lock_irqsave(&zone->lock, flags);
	do {
		page = NULL;
		// ----------------------------------------------------------------------------- (2)
		// If ALLOC_HARDER is set, first try to take a block from the
		// free_list[MIGRATE_HIGHATOMIC] reserve via __rmqueue_smallest()
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		// ----------------------------------------------------------------------------- (3)
		// Otherwise allocate according to the requested order, migratetype, etc.
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
		// Loop again only if a page was obtained but fails the sanity checks in
		// check_new_pages(); if no page was obtained at all, the loop exits
	} while (page && check_new_pages(page, order));
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));
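// The order in which rmqueue() tries its sources can be summed up as: the
// MIGRATE_HIGHATOMIC reserve first (only when ALLOC_HARDER is set), then the regular
// __rmqueue() path for the requested migratetype. The toy below only mimics that
// decision order with invented counters; it is not kernel code.
#include <stdio.h>
#include <stdbool.h>

#define TOY_ALLOC_HARDER 0x1u

static int toy_highatomic_blocks = 1;	/* invented reserve size */
static int toy_normal_blocks = 1;	/* invented regular free blocks */

static bool toy_take(int *blocks)
{
	if (*blocks > 0) {
		(*blocks)--;
		return true;
	}
	return false;
}

/* Same shape as the locked loop in rmqueue(): reserve first if allowed, then
 * the normal path. */
static bool toy_rmqueue(unsigned int alloc_flags)
{
	bool got = false;

	if (alloc_flags & TOY_ALLOC_HARDER)
		got = toy_take(&toy_highatomic_blocks);
	if (!got)
		got = toy_take(&toy_normal_blocks);
	return got;
}

int main(void)
{
	printf("harder request: %d\n", toy_rmqueue(TOY_ALLOC_HARDER)); /* from the reserve */
	printf("normal request: %d\n", toy_rmqueue(0));                /* regular path     */
	printf("normal request: %d\n", toy_rmqueue(0));                /* exhausted        */
	return 0;
}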
// #####################################################################################
// __rmqueue()
// #####################################################################################
/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;
retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);
		if (!page && __rmqueue_fallback(zone, order, migratetype,
								alloc_flags))
			goto retry;
	}

	return page;
}
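// __rmqueue() itself tries three sources in order: the requested migratetype via
// __rmqueue_smallest(), then (for MIGRATE_MOVABLE requests) the CMA fallback, then
// __rmqueue_fallback(), which steals a block from another migratetype and retries the
// normal path. A stand-alone sketch of that chain with invented counters follows; it
// illustrates the control flow only, not the real data structures.
#include <stdio.h>
#include <stdbool.h>

static int toy_movable_free;		/* requested migratetype: empty at start  */
static int toy_cma_free = 1;		/* CMA blocks usable for movable requests */
static int toy_stealable_free = 1;	/* blocks reachable via the fallback path */

static bool toy_take(int *blocks)
{
	if (*blocks > 0) {
		(*blocks)--;
		return true;
	}
	return false;
}

static bool toy_rmqueue_chain(bool movable)
{
retry:
	if (toy_take(&toy_movable_free))	/* __rmqueue_smallest()            */
		return true;
	if (movable && toy_take(&toy_cma_free))	/* __rmqueue_cma_fallback()        */
		return true;
	if (toy_take(&toy_stealable_free)) {	/* __rmqueue_fallback() steals...  */
		toy_movable_free++;		/* ...moves a block over...        */
		goto retry;			/* ...and the normal path retries  */
	}
	return false;
}

int main(void)
{
	printf("1st movable request: %d\n", toy_rmqueue_chain(true));	/* served by CMA      */
	printf("2nd movable request: %d\n", toy_rmqueue_chain(true));	/* served by fallback */
	printf("3rd movable request: %d\n", toy_rmqueue_chain(true));	/* nothing left       */
	return 0;
}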
// #####################################################################################
// __rmqueue_smallest()
// #####################################################################################
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;
	/* Find a page of the appropriate size in the preferred list */
	// Starting from the requested order, look for the first order that still has a free block
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		// Unlink the page from the free_list
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		// One of the core functions here; analyzed in detail below
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}
	return NULL;
}
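// The scan above reduces to "walk the free areas from the requested order upward and
// take the first one that is not empty"; if the block found is larger than requested,
// expand() splits it back down, as the next sketch shows. The stand-alone sketch below
// models each free_area with a simple counter; the array contents and TOY_MAX_ORDER
// are made up for illustration.
#include <stdio.h>

#define TOY_MAX_ORDER 11

/* Invented per-order free-block counters standing in for zone->free_area[]. */
static int toy_nr_free[TOY_MAX_ORDER] = { 0, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0 };

/* Mirrors the loop in __rmqueue_smallest(): return the first order >= the
 * requested one that still has a free block, or -1 if there is none. */
static int toy_rmqueue_smallest(unsigned int order)
{
	unsigned int current_order;

	for (current_order = order; current_order < TOY_MAX_ORDER; current_order++) {
		if (toy_nr_free[current_order] > 0) {
			toy_nr_free[current_order]--;
			return (int)current_order;
		}
	}
	return -1;
}

int main(void)
{
	printf("request order 2 -> got a block of order %d\n", toy_rmqueue_smallest(2));
	printf("request order 6 -> got a block of order %d\n", toy_rmqueue_smallest(6));
	printf("request order 8 -> got a block of order %d\n", toy_rmqueue_smallest(8));
	return 0;
}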
// #####################################################################################
// expand()
// #####################################################################################
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
// This function is important, so here is a worked example of how it operates.
// Assume high = 4 and low = 1: size = 1 << 4 = 16 pages, while the block we
// actually need has 1 << 1 = 2 pages.
//
// First iteration:
//     [0][1][2][3][4][5][6][7][8][9][10][11][12][13][14][15]
//     area = area3 (4 - 1)
//     high = 3 (4 - 1)
//     size = 16 / 2 = 8
//     area3->free_list -> [8][9][10][11][12][13][14][15]
//     remaining: [0][1][2][3][4][5][6][7]
//
// Second iteration:
//     area = area2 (3 - 1)
//     high = 2 (3 - 1)
//     size = 8 / 2 = 4
//     area2->free_list -> [4][5][6][7]
//     remaining: [0][1][2][3]
//
// Third iteration:
//     area = area1 (2 - 1)
//     high = 1 (2 - 1)
//     size = 4 / 2 = 2
//     area1->free_list -> [2][3]
//     remaining: [0][1]
//
// Now high == low, so the loop exits.
// [0][1] is returned, which is exactly what we need.
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;
	while (high > low) {
		area--;
		high--;
		size >>= 1;

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		// Only the head page (&page[size]) of the split-off buddy block is linked
		// into the free_list; the block's order is recorded with set_page_order()
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
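// The walked-through example (high = 4, low = 1) can be reproduced with the little
// simulator below: each iteration hands the upper half of the current block back to
// the next lower free_area and keeps halving until only the requested 2^low pages
// remain. It only prints what the loop above does; the guard-page path is ignored.
#include <stdio.h>

static void toy_expand(int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* pages [size .. 2*size-1] of the block go back to free_area[high] */
		printf("free_area[%d] <- pages [%lu..%lu]\n", high, size, 2 * size - 1);
	}
	printf("caller keeps pages [0..%lu]\n", size - 1);
}

int main(void)
{
	toy_expand(1, 4);	/* the high = 4, low = 1 case from the comment above */
	return 0;
}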