从零到负一

【LM11】The Buddy System: Memory Allocation

2023/06/03

Continuing from the previous note, 【LM10】The Buddy System: Basic Information, which ended with an introduction to the __alloc_pages_nodemask() function. I'll include the function's source code here so it's easier to follow along without having to look it up elsewhere.

// ./mm/page_alloc.c

/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
// By default, allocation can proceed normally as long as the zone stays above the low watermark
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };

/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}

// gfp_mask must stay within gfp_allowed_mask - the latter is initialised during boot to (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
// ------------------------------------------------------------------------------------------------ (1)
// See the analysis of prepare_alloc_pages() below
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
// ------------------------------------------------------------------------------------------------ (2)
// Get the first (highest-priority) zoneref in ac->zonelist whose zone index is <= ac->high_zoneidx, optionally restricted to the specified node(s)
finalise_ac(gfp_mask, &ac);

/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
// Allocation-flag tweaks for avoiding memory fragmentation; can be ignored for now
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

/* First allocation attempt */
// ------------------------------------------------------------------------------------------------ (3)
// First allocation attempt; the watermark used here is ALLOC_WMARK_LOW
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;

// Reaching this point means the allocation above failed, so we retry via __alloc_pages_slowpath()
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
// Clear (__GFP_IO | __GFP_FS) from the GFP mask when the current context requires it
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;

/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;

// Try the allocation again. Since the first attempt failed, memory is probably running low, so this path
// does extra work such as reclaiming memory before attempting the allocation once more
page = __alloc_pages_slowpath(alloc_mask, order, &ac);

// Common exit path. Coming from the fast path the allocation succeeded; after the slow path, page may still be NULL. Either way, only a bit of bookkeeping remains before returning
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}

trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

return page;
}

prepare_alloc_pages()

Let's look at this function first. Before every memory allocation, a struct alloc_context *ac has to be prepared; it holds the key information the allocation needs. The function below shows which of its fields get filled in.
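
For reference, this is roughly how struct alloc_context is defined in ./mm/internal.h in the kernel version this series follows; treat it as a sketch for orientation rather than the authoritative definition.

// ./mm/internal.h (approximate definition, for reference)

/* Everything the page allocator needs to know about one allocation request */
struct alloc_context {
	struct zonelist *zonelist;		/* the zonelist to walk */
	nodemask_t *nodemask;			/* allowed nodes, or NULL */
	struct zoneref *preferred_zoneref;	/* where the zonelist walk starts */
	int migratetype;			/* derived from the GFP mask */
	enum zone_type high_zoneidx;		/* highest zone index allowed */
	bool spread_dirty_pages;		/* __GFP_WRITE: spread dirty page-cache pages across nodes */
};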

// ./mm/page_alloc.c

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
// Determine the highest usable zone (zone index) from the GFP mask
ac->high_zoneidx = gfp_zone(gfp_mask);
// NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); the latter is normally 0
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
// Determine the migratetype from the GFP mask
ac->migratetype = gfpflags_to_migratetype(gfp_mask);

if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
if (!ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}

fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);

might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
// Fault-injection hook (fail_page_alloc): may deliberately make this allocation fail; if it triggers, return failure immediately
if (should_fail_alloc_page(gfp_mask, order))
return false;

if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
*alloc_flags |= ALLOC_CMA;

return true;
}

Let me briefly explain a few of the fields in ac:

  1. ac->high_zoneidx - derived from the supplied GFP mask, this is the highest zone_idx the kernel will consider when looking for a zone; it is used while walking the zonelist. We'll see exactly how when next_zones_zonelist is introduced below;
  2. ac->nodemask - this decides which nodes the kernel will walk zones in; again, we'll see how it is used when next_zones_zonelist is introduced below.

The rest of the function is nothing special, so let's move on to the next function, finalise_ac().

finalise_ac()

This function also updates struct alloc_context *ac, but the field it fills in - ac->preferred_zoneref - deserves a careful look. Several macros and functions are involved, so the details are best read straight from the source and its comments.

// ./mm/page_alloc.c

// #####################################################################################
// finalise_ac()
// #####################################################################################
/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
// In the for_next_zone_zonelist_nodemask() macro, ac->preferred_zoneref is the iterator's starting point
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask);
}

// ./include/linux/mmzone.h

// #####################################################################################
// first_zones_zonelist()
// #####################################################################################
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
* @zonelist - The zonelist to search for a suitable zone
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @return - Zoneref pointer for the first suitable zone found (see below)
*
* This function returns the first zone at or below a given zone index that is
* within the allowed nodemask. The zoneref returned is a cursor that can be
* used to iterate the zonelist with next_zones_zonelist by advancing it by
* one before calling.
*
* When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
* never NULL). This may happen either genuinely, or due to concurrent nodemask
* update due to cpuset modification.
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx, nodemask_t *nodes)
{
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes);
}

// #####################################################################################
// next_zones_zonelist()
// #####################################################################################
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
*
* This function returns the next zone at or below a given zone index that is
* within the allowed nodemask using a cursor as the starting point for the
* search. The zoneref returned is a cursor that represents the current zone
* being examined. It should be advanced by one before calling
* next_zones_zonelist again.
*/
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
// The common case - 1) no nodemask is given; 2) the zone_idx at the cursor is already <= highest_zoneidx
// (e.g. highest_zoneidx is the largest zone_idx in the zonelist); then the zone at the cursor is returned directly
if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
return z;
return __next_zones_zonelist(z, highest_zoneidx, nodes);
}

// ./mm/mmzone.c

// #####################################################################################
// __next_zones_zonelist()
// #####################################################################################
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
// If no nodemask is given, find the first zone with zone_idx <= highest_zoneidx
if (unlikely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
// If a nodemask is given, the zone must additionally belong to one of the specified nodes (besides zone_idx <= highest_zoneidx)
else
while (zonelist_zone_idx(z) > highest_zoneidx || (z->zone && !zref_in_nodemask(z, nodes)))
z++;

return z;
}

The __next_zones_zonelist() function answers some of the questions we left open earlier:

  1. The role of ac->nodemask is clear: it is used to check whether a zone's node is in the nodemask. If it is NULL, the check is skipped entirely; otherwise the cursor keeps advancing until it reaches a zone on an allowed node;
  2. ac->high_zoneidx filters zones: only a zone whose zone_idx is less than or equal to ac->high_zoneidx can be selected;
  3. ac->preferred_zoneref is simply the first zone in the zonelist that satisfies both ac->nodemask and ac->high_zoneidx; it serves as the starting point for the iteration.

To illustrate these points more concretely, here are a few examples:

// Assume the zonelist is ordered as follows - 3 nodes, each with at most 3 zones
// Note that within the zonelist, higher-priority zones (listed earlier within a node) have larger zone_idx values
// [N0Z2][N0Z1][N0Z0] | [N1Z1][N1Z0] | [N2Z2][N2Z1][N2Z0]

// Assume ac->nodemask = 0x5 and ac->high_zoneidx = 0x1
// N0 is scanned first; its first zone has zone_idx = 0x2 > 0x1, so it is skipped
// The second zone has zone_idx = 0x1 <= 0x1, which satisfies the requirement, so that zone ([N0Z1]) is returned

// Assume ac->nodemask = 0x4 and ac->high_zoneidx = 0x2
// The returned zone is [N2Z2]

// Assume ac->nodemask = 0x2 and ac->high_zoneidx = 0x0
// The returned zone is [N1Z0]

With these examples, the members of ac should be clear. To double-check them, a small userspace sketch follows; after that we continue with the rest of the code.
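
The sketch below is a toy model of the lookup done by first_zones_zonelist()/__next_zones_zonelist() over the example zonelist above. The struct and function names (zoneref_sketch, first_zone) are invented for illustration; only the filtering logic mirrors the kernel's, so treat it as a sanity check rather than real kernel code.

#include <stdio.h>

/* One zonelist entry: which node it belongs to and its zone_idx.
 * The real kernel stores a struct zone pointer plus a cached zone_idx. */
struct zoneref_sketch { int nid; int zone_idx; };

static const struct zoneref_sketch zonelist[] = {
	{0, 2}, {0, 1}, {0, 0},		/* [N0Z2][N0Z1][N0Z0] */
	{1, 1}, {1, 0},			/* [N1Z1][N1Z0]       */
	{2, 2}, {2, 1}, {2, 0},		/* [N2Z2][N2Z1][N2Z0] */
	{-1, 0},			/* terminator (zone == NULL in the kernel) */
};

/* Mimics __next_zones_zonelist() starting from the first entry:
 * skip entries whose zone_idx is too high or whose node is filtered out. */
static const struct zoneref_sketch *first_zone(unsigned int nodemask, int high_zoneidx)
{
	const struct zoneref_sketch *z = zonelist;

	while (z->nid >= 0 &&
	       (z->zone_idx > high_zoneidx || !(nodemask & (1u << z->nid))))
		z++;
	return z;
}

int main(void)
{
	const struct zoneref_sketch *z;

	z = first_zone(0x5, 0x1); printf("N%dZ%d\n", z->nid, z->zone_idx);	/* N0Z1 */
	z = first_zone(0x4, 0x2); printf("N%dZ%d\n", z->nid, z->zone_idx);	/* N2Z2 */
	z = first_zone(0x2, 0x0); printf("N%dZ%d\n", z->nid, z->zone_idx);	/* N1Z0 */
	return 0;
}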

get_page_from_freelist()

This function is important: whether allocation takes the normal (fast) path or the slow path, it is the function that eventually gets called. It is also quite tedious, with a lot of detail. If we don't get bogged down in those details, though, it is easy to understand:

  1. Walk all the zones in zonelist priority order, guided by the input ac->preferred_zoneref, ac->high_zoneidx, ac->nodemask, and so on;
  2. For each zone, check the watermarks; if the requirement is met (the remaining free memory is above the watermark), allocate normally, otherwise fall back to work such as memory reclaim;
  3. Allocate the memory through the buddy system.

With this rough picture in mind, let's read the source. Note that I only cover the case where the remaining free memory is above the watermark here.

// ./mm/page_alloc.c

/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;

retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
// ----------------------------------------------------------------------------------------- (1)
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;

// ------------------------------------------------------------------------------------- (2)
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;

if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}

if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;

/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
// ALLOC_WMARK_LOW is set here
// ------------------------------------------------------------------------------------- (3)
// Get the number of free pages required by the selected watermark
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
// Check whether the zone's free pages satisfy that watermark
if (!zone_watermark_fast(zone, order, mark, ac_classzone_idx(ac), alloc_flags)) {
int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;

if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;

ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;

continue;
}
}

try_this_zone:
// ------------------------------------------------------------------------------------- (4)
// Enter the buddy allocator proper
page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype);
if (page) {
// A small side note here - compound pages
// If gfp_flags & __GFP_COMP is set, the pages of the block are (logically) tied together into a compound page
// prep_compound_page() does that work
prep_new_page(page, order, gfp_mask, alloc_flags);

/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);

return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}

/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}

return NULL;
}

(1) For this macro, the comments say it all; it also explains the role of ac->preferred_zoneref:

// ./include/linux/mmzone.h

// 1. z is ac->preferred_zoneref; it is the first element of the zonelist traversal;
// 2. the last element of the zonelist has z->zone == NULL, which ends the traversal;
// 3. the next zone is chosen by next_zones_zonelist(++z, highidx, nodemask), so the zones visited may well not be consecutive entries
#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))

(2) - (3) I'm not entirely sure what the code between these two markers does; I'll fill it in once I've figured it out;
(3) Check whether the zone's free pages are above the watermark; if so, allocate normally, otherwise fall back to work such as memory reclaim (a simplified sketch of this check follows after this list);
(4) This is the core of buddy-system memory allocation; let's look at that function next.
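
To make the watermark test in (3) a bit more concrete, here is a minimal, purely illustrative sketch of its essence. It is not the kernel's zone_watermark_fast()/__zone_watermark_ok(); the various alloc_flags adjustments and the per-order free_area scan are simplified away, and the function name is made up.

#include <stdbool.h>

/*
 * free_pages: free pages currently in the zone
 * mark: the watermark picked by wmark_pages()
 * lowmem_reserve: pages this zone keeps back for requests that could have
 *                 been satisfied from a lower zone
 */
static bool watermark_ok_sketch(long free_pages, unsigned int order,
				long mark, long lowmem_reserve)
{
	/* pages that would remain free if this allocation went ahead */
	long remaining = free_pages - ((1L << order) - 1);

	/*
	 * The zone is usable only if the remaining free pages stay above the
	 * watermark plus the lowmem reserve. For order > 0 the real kernel
	 * additionally walks zone->free_area[] to confirm that a block of at
	 * least the requested order actually exists.
	 */
	return remaining > mark + lowmem_reserve;
}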

rmqueue()

// ./mm/page_alloc.c

/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;

// --------------------------------------------------------------------------------- (1)
// If only a single page is needed (order 0), try the per-CPU pcplist first
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
goto out;
}

/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);

do {
page = NULL;
// ----------------------------------------------------------------------------- (2)
// If ALLOC_HARDER is set, try allocating from free_list[MIGRATE_HIGHATOMIC] via __rmqueue_smallest()
if (alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
// ----------------------------------------------------------------------------- (3)
// Allocate according to the requested order, migratetype and so on
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
// If the page we got fails validation (check_new_pages()), loop and try again; if no page was obtained, the loop simply exits
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));

__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
// Update the zone's allocation statistics
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);

out:
/* Separate test+clear to avoid unnecessary atomics */
if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}

VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;

failed:
local_irq_restore(flags);
return NULL;
}

This function has three paths it can take: (1), (2) and (3) each obtain pages in a different way. (2) is really a special case of (3), so I'll analyze (3) first. The source of __rmqueue() is shown below; as you can see, it also calls __rmqueue_smallest(). The biggest difference from calling __rmqueue_smallest() alone is that, if that fails, it additionally tries the CMA or fallback paths. Let's look at these functions now.

// ./mm/page_alloc.c

// #####################################################################################
// __rmqueue()
// #####################################################################################
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags)
{
struct page *page;

retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);

if (!page && __rmqueue_fallback(zone, order, migratetype, alloc_flags))
goto retry;
}

trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}

// #####################################################################################
// __rmqueue_smallest()
// #####################################################################################
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;

/* Find a page of the appropriate size in the preferred list */
// Starting from the requested order, find the first order whose free list has a block available
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
if (!page)
continue;
// Unlink the page from the free_list
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
// One of the most important functions here; analyzed in detail below
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}

return NULL;
}

// #####################################################################################
// expand()
// #####################################################################################
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
// This function matters a lot, so here is an example of how it works
// Assume high = 4 and low = 1: size = 1 << 4 = 16 pages, while the block we actually need is 1 << 1 = 2 pages
// First iteration:
// [0][1][2][3][4][5][6][7][8][9][10][11][12][13][14][15]
// area = area3 (4 - 1)
// high = 3 (4 - 1)
// size = 16 / 2 = 8
// area3->free_list->[8][9][10][11][12][13][14][15]
// [0][1][2][3][4][5][6][7] remain
//
// Second iteration:
// area = area2 (3 - 1)
// high = 2 (3 - 1)
// size = 8 / 2 = 4
// area2->free_list->[4][5][6][7]
// [0][1][2][3] remain
//
// Third iteration:
// area = area1 (2 - 1)
// high = 1 (2 - 1)
// size = 4 / 2 = 2
// area1->free_list->[2][3]
// [0][1] remain
//
// high == low, so the loop exits
// [0][1] is returned, which is exactly what we need
static inline void expand(struct zone *zone, struct page *page, int low,
int high, struct free_area *area, int migratetype)
{
unsigned long size = 1 << high;

while (high > low) {
area--;
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
// The split-off block is put on the free_list directly; only its first page (&page[size]) is actually linked into the list
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
}
}

Of the functions above, only expand() really implements the buddy-system algorithm itself, and my comments on it should explain how it works quite well; a small toy simulation of the splitting follows below. Since this note is already long enough, I'll leave the remaining fallback-related functions for the next note. Alright, keep going!
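
As a quick sanity check of the worked example in the comments above, here is a toy, userspace-only simulation of expand()'s splitting loop. Nothing in it is kernel code; it just prints which page ranges would be handed back to which order's free_list under the assumed high = 4, low = 1 scenario.

#include <stdio.h>

/*
 * Toy model of expand(): we hold a free block of 2^high pages but only need
 * 2^low of them. Each iteration hands the upper half of the remaining block
 * back to the free list of the next lower order. Indices are page offsets
 * within the original block.
 */
static void expand_sketch(int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put pages [%lu..%lu] on free_list of order %d\n",
		       size, 2 * size - 1, high);
	}
	printf("pages [0..%lu] are returned to the caller\n", size - 1);
}

int main(void)
{
	expand_sketch(1, 4);	/* the high = 4, low = 1 example from the comments */
	return 0;
}

Running it prints exactly the three splits from the example: [8..15] to order 3, [4..7] to order 2, [2..3] to order 1, with pages [0..1] returned.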

References

  1. 深度剖析 Linux 伙伴系统的设计与实现(上)
  2. 深度剖析 Linux 伙伴系统的设计与实现(下)