从零到负一

【LM12】Buddy System - Freeing Memory

2023/07/01

Continuing from the previous note,【LM11】Buddy System - Memory Allocation, let's first look at how fallback is implemented during memory allocation.

__rmqueue_fallback()

// ./mm/page_alloc.c 

/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;

/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
// pageblock_order is usually equal to MAX_ORDER - 1
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;

/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order; --current_order) {
area = &(zone->free_area[current_order]);
// ------------------------------------------------------------------------------------------------ (1)
fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;

/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
// ------------------------------------------------------------------------------------------------ (2)
if (!can_steal && start_migratetype == MIGRATE_MOVABLE && current_order > order)
goto find_smallest;

goto do_steal;
}

return false;

find_smallest:
// ---------------------------------------------------------------------------------------------------- (3)
for (current_order = order; current_order < MAX_ORDER; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}

/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);

do_steal:
// page is the first free block on the free list at the chosen order + fallback migratetype
page = list_first_entry(&area->free_list[fallback_mt], struct page, lru);
// ---------------------------------------------------------------------------------------------------- (4)
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, can_steal);

trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt);

return true;

}

The kernel comment above briefly describes how fallback works. Before going through the numbered annotations in the code, let me summarize the idea:

  1. fallback does not steal memory merely from a different migratetype, it steals from a different order + migratetype combination, so the first step is to determine that order + migratetype pair;
  2. The order is found by plain iteration, while the migratetype is found by walking the two-dimensional fallbacks array. Because we are stealing from another migratetype anyway, the kernel iterates from the highest order downward to limit fragmentation. Why? My understanding: during allocation the kernel serves requests from low orders upward, so stealing a low-order block makes low-order shortages more likely, which later forces high-order blocks to be split to refill the low orders anyway; it is better to steal the high-order block up front (see the sketch after this list);
  3. We steal the block from the target order + migratetype and put it on the free list of that order under the original migratetype. The next time an allocation with the original migratetype arrives, the buddy system can find this free high-order block and split it for use.
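
To make point 2 concrete, here is a minimal standalone sketch (toy code, not the kernel's): it walks orders from high to low and, for each order, tries candidate migratetypes from a fixed preference table whose shape loosely mirrors the kernel's fallbacks[] array. All names (MT_*, nr_free, find_fallback) and the table contents are invented for illustration.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

enum migratetype { MT_UNMOVABLE, MT_MOVABLE, MT_RECLAIMABLE, MT_TYPES };

/* Preference order to try when the requested type has no free block
 * (shape mirrors the kernel's fallbacks[] table; contents are illustrative). */
static const int fallbacks[MT_TYPES][2] = {
	[MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE },
	[MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE },
	[MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE },
};

/* Toy free-list state: number of free blocks per order per migratetype. */
static int nr_free[MAX_ORDER][MT_TYPES];

/* Walk orders from high to low, like __rmqueue_fallback() does. */
static bool find_fallback(int order, int start_mt, int *out_order, int *out_mt)
{
	for (int cur = MAX_ORDER - 1; cur >= order; cur--) {
		for (int i = 0; i < 2; i++) {
			int mt = fallbacks[start_mt][i];
			if (nr_free[cur][mt] > 0) {
				*out_order = cur;
				*out_mt = mt;
				return true;
			}
		}
	}
	return false;
}

int main(void)
{
	int o, mt;

	/* Only a single order-5 MOVABLE block is free. */
	nr_free[5][MT_MOVABLE] = 1;

	/* An order-0 UNMOVABLE request falls back to the order-5 MOVABLE block. */
	if (find_fallback(0, MT_UNMOVABLE, &o, &mt))
		printf("steal from order=%d migratetype=%d\n", o, mt);
	return 0;
}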

Now for the numbered annotations in the code:
(1) Two things are determined here: first, the fallback migratetype, i.e. fallback_mt; second, whether can_steal is true. If it is true we will try to steal the whole pageblock, otherwise only a single block. I will not expand find_suitable_fallback() here;
(2) When this condition holds we no longer need to worry about permanent fragmentation, so instead of the largest block we search for a suitable free_list starting from the smallest order;
(3) This performs the search required by (2);
(4) This is the core stealing function. Look at its arguments first: page is the first block found on the free list of the chosen order + migratetype; start_migratetype is the originally requested migratetype and is passed in as start_type, while can_steal is passed in as whole_block. In the source below, focus mainly on the overall logic of steal_suitable_fallback().

// ./mm/page_alloc.c

// #####################################################################################
// steal_suitable_fallback()
// #####################################################################################
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
* pageblock to our migratetype and determine how many already-allocated pages
* are there in the pageblock with a compatible migratetype. If at least half
* of pages are free or compatible, we can change migratetype of the pageblock
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
int free_pages, movable_pages, alike_pages;
int old_block_type;

old_block_type = get_pageblock_migratetype(page);

/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
if (is_migrate_highatomic(old_block_type))
goto single_page;

/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
goto single_page;
}

/*
* Boost watermarks to increase reclaim pressure to reduce the
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
boost_watermark(zone);
if (alloc_flags & ALLOC_KSWAPD)
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);

/* We are not allowed to try stealing from the whole block */
// --------------------------------------------------------------------------------- (1)
if (!whole_block)
goto single_page;

free_pages = move_freepages_block(zone, page, start_type, &movable_pages);
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
* we just obtained. For other types it's a bit more tricky.
*/
if (start_type == MIGRATE_MOVABLE) {
alike_pages = movable_pages;
} else {
/*
* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
* to MOVABLE pageblock, consider all non-movable pages as
* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
if (old_block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
alike_pages = 0;
}

/* moving whole block can fail due to zone boundary conditions */
if (!free_pages)
goto single_page;

/*
* If a sufficient number of pages in the block are either free or of
* comparable migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);

return;
// ------------------------------------------------------------------------------------- (2)
single_page:
area = &zone->free_area[current_order];
list_move(&page->lru, &area->free_list[start_type]);
}

// #####################################################################################
// move_freepages_block()
// #####################################################################################
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;

if (num_movable)
*num_movable = 0;

start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
end_page = start_page + pageblock_nr_pages - 1;
end_pfn = start_pfn + pageblock_nr_pages - 1;

/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
start_page = page;
if (!zone_spans_pfn(zone, end_pfn))
return 0;

return move_freepages(zone, start_page, end_page, migratetype, num_movable);
}

// #####################################################################################
// move_freepages()
// #####################################################################################
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
/*
* page_zone is not safe to call in this context when
* CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
* anyway as we check zone boundaries in move_freepages_block().
* Remove at a later date when no bug reports exist related to
* grouping pages by mobility
*/
VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
pfn_valid(page_to_pfn(end_page)) &&
page_zone(start_page) != page_zone(end_page));
#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}

/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);

if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;

page++;
continue;
}

order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
page += 1 << order;
pages_moved += 1 << order;
}

return pages_moved;
}

(1) If the incoming can_steal is true, we go straight to move_freepages_block(); otherwise only a single block is moved. What does move_freepages_block() do? From the source we can see that it tries to move all the free pages inside the whole pageblock, i.e. a region of pageblock_nr_pages pages (with pageblock_order = MAX_ORDER - 1 this is 2^(MAX_ORDER - 1) pages).
(2) In this case we only steal a single block, so a plain list operation is enough.
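
As a quick check of the alignment arithmetic move_freepages_block() uses, the standalone snippet below rounds a pfn down to its pageblock boundary with the same mask. The pageblock_order value (10) and the pfn are assumed example numbers, not taken from the kernel.

#include <stdio.h>

int main(void)
{
	/* Assume pageblock_order == 10, i.e. 1024 pages per pageblock. */
	const unsigned long pageblock_nr_pages = 1UL << 10;
	unsigned long pfn = 135000;

	/* Same mask as move_freepages_block(): round down to the block start. */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* 131 * 1024 = 134144, so the block spans [134144, 135167]. */
	printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}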

At this point we have a basic picture of how fallback works. There are plenty of details, but as long as you hold on to the main logic, fallback is not hard to follow. With fallback done, let's move on to the next topic - pcplist.

pcplist

PCP is a small but important corner of the buddy system. In short, the kernel allocates and frees order = 0 blocks very frequently; if every such request still went through the buddy system, contention on the zone spin_lock would get worse as the number of CPUs grows. To ease this, the Linux kernel sets aside some pages for each CPU, and these pages are kept on the pcplist. The following summary is quoted (translated) from the web:

Order-0 allocations are the most common in a Linux system, and they are frequently allocated and freed. If every request went to the buddy system, zone->lock would have to be taken to lock the whole zone each time, and the lock contention grows as the number of CPU cores increases. To improve this, the kernel introduced per_cpu_pageset (pcp for short). The idea is to take a batch of pages out of the zone in one go and keep them in each CPU's local cache; frees also go back there first, and the pages are returned to the zone in bulk once the cache is full.

From the earlier analysis we already know that when only a single page is needed, it is taken directly from the pcplist. Let's first look at the pcp-related structures.

// ./include/linux/mmzone.h

struct zone {
...
struct per_cpu_pageset __percpu *pageset;
...
}

struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */

/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
};

As we can see, the pcp lives inside the zone (side note: a zone can be shared by multiple CPUs, and those CPUs all belong to the same node). Because pageset is a __percpu pointer, every CPU gets its own per_cpu_pageset instance for each zone, which gives each CPU its own small private pool of pages.
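
Before going back to the real code, here is a tiny standalone model of the pcp idea (not kernel code; BATCH, HIGH, and the helper names are invented): order-0 allocations are served from a per-CPU cache, which is refilled in batches from a shared pool when it runs empty and drained back in batches once it passes a high watermark.

#include <stdio.h>

#define BATCH 4
#define HIGH  8

static int shared_pool = 64;   /* pages still in the zone's buddy lists */
static int pcp_count;          /* pages cached on this CPU's pcplist */

/* Refill the per-CPU cache from the shared pool, BATCH pages at a time. */
static int pcp_alloc(void)
{
	if (pcp_count == 0) {
		int take = shared_pool < BATCH ? shared_pool : BATCH;
		shared_pool -= take;
		pcp_count += take;
		if (pcp_count == 0)
			return -1;       /* nothing left anywhere */
	}
	pcp_count--;
	return 0;
}

/* Free back to the per-CPU cache; drain a batch if it grows too large. */
static void pcp_free(void)
{
	pcp_count++;
	if (pcp_count >= HIGH) {
		pcp_count -= BATCH;
		shared_pool += BATCH;
	}
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		pcp_alloc();
	printf("after 6 allocs: pcp=%d pool=%d\n", pcp_count, shared_pool);

	for (int i = 0; i < 10; i++)
		pcp_free();
	printf("after 10 frees: pcp=%d pool=%d\n", pcp_count, shared_pool);
	return 0;
}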

Going back to rmqueue(): this function calls rmqueue_pcplist() to take a page from the pcplist.

// ./mm/page_alloc.c

/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;

// if only a single page is needed, try the pcplist first
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, migratetype, alloc_flags);
goto out;
}
...
}

rmqueue_pcplist()

This function mainly grabs the pcp and then calls __rmqueue_pcplist() to obtain a page. Let's look at rmqueue_pcplist() first and then __rmqueue_pcplist().

// ./mm/page_alloc.c

/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, gfp_t gfp_flags,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;

local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
// the pcplist is also split by migratetype
list = &pcp->lists[migratetype];
// take one page from the pcplist
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
}

__rmqueue_pcplist()

// ./mm/page_alloc.c

/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;

do {
// Case 1: if the list is empty, fetch pcp->batch pages via rmqueue_bulk()
// these pages are used to refill the pcplist
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
// Case 2: if the list is not empty, simply take one page from it
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));

return page;
}

In the second case above a page can be handed out directly; in the first case we have to go to the buddy system for blocks and then take a page from the pcplist. Let's look at rmqueue_bulk() next.

rmqueue_bulk()

For this function it is enough to understand that it calls __rmqueue(); the rest can be skimmed (to be honest, I do not fully understand it either).

// ./mm/page_alloc.c

/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;

spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype, alloc_flags);
if (unlikely(page == NULL))
break;

if (unlikely(check_pcp_refill(page)))
continue;

/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
// pages obtained are appended to the tail of the list
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}

/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}

Good - at this point both fallback and pcplist are clear, so next comes freeing memory. Compared with allocation, freeing is much simpler; I will try to cover all of it in this note.

Freeing memory

Here I would normally include a diagram from bin的技术小屋 (see the references; the image is not reproduced here). It shows that every memory-freeing path of the buddy system eventually reaches the free_the_page() function. Let's take a quick look at these functions first:

// ./include/linux/gfp.h
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

// ./mm/page_alloc.c
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
__free_pages(virt_to_page((void *)addr), order);
}
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
}

static inline void free_the_page(struct page *page, unsigned int order)
{
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
__free_pages_ok(page, order);
}

Similar to allocation, there are two paths here: the first is the order == 0 case, which uses the pcplist; the second is the order > 0 case, which goes through the buddy system. Let's look at the second case first.

__free_pages_ok()

// ./mm/page_alloc.c

static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
// sanitize the page's flags; if the page cannot be freed, simply return
if (!free_pages_prepare(page, order, true))
return;

migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}

static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
int migratetype)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype);
spin_unlock(&zone->lock);
}

__free_one_page()

Now let's look at __free_one_page():

/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PageBuddy.
* Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- nyc
*/

static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long combined_pfn;
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
struct capture_control *capc = task_capc(zone);
// in most cases pageblock_order + 1 <= MAX_ORDER
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);

VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
while (order < max_order - 1) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
// ------------------------------------------------------------- (1)
// page_pfn ^ (1 << order)
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
// pfn_valid_within() returns 1 when CONFIG_HOLES_IN_ZONE is not defined
if (!pfn_valid_within(buddy_pfn))
goto done_merging;
// make sure the buddy has the same zone and order as page, is not in a hole, and is still in the buddy system (not allocated out)
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
// remove the buddy from its free_list
del_page_from_free_area(buddy, &zone->free_area[order]);
// pfn of the merged block
combined_pfn = buddy_pfn & pfn;
// page of the merged block
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
// bump the order and keep merging upward
order++;
// ------------------------------------------------------------- (1)
}

// ----------------------------------------------------------------- (2)
if (max_order < MAX_ORDER) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;

buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);

if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order++;
goto continue_merging;
}

done_merging:
set_page_order(page, order);

// ----------------------------------------------------------------- (3)
// this path is taken when page_is_buddy() above did not hold (merging stopped early)
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
// look for the buddy one order higher
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
add_to_free_area_tail(page, &zone->free_area[order],
migratetype);
return;
}
}

// ----------------------------------------------------------------- (4)
// decide whether to insert at a random position, i.e. randomly at the head or the tail of the free_list
if (is_shuffle_order(order))
add_to_free_area_random(page, &zone->free_area[order], migratetype);
else
add_to_free_area(page, &zone->free_area[order], migratetype);

}

The comments above already cover most of how the buddy system frees memory; here is a short recap plus an example:
(1) This is the core of the buddy system: find the buddy, merge, move up one order, and repeat until no valid buddy can be found;
(2) I am not entirely clear on this part; it only runs when max_order < MAX_ORDER (i.e. pageblock_order + 1 < MAX_ORDER) and prevents merging free pages across an isolated pageblock and a normal pageblock;
(3) This applies when i) the block cannot find its buddy at the next order, but ii) a buddy does exist one order further up (its own buddy has already merged to that level). In that case we insert the block at the tail of the free_list so it is less likely to be handed out again soon; if that higher-order block splits, it can merge with this block;
(4) Decide whether to insert the merged block at a random end (head or tail) of the free_list.

Now for an example of how the buddy system merges blocks on free:

// case 1 - can merge all the way up to the highest order
// ---------------------
// [8,9,10,11,12,13,14,15]
// [4,5,6,7][16,17,18,19]
// [0,1][20,21]
// free([2,3])
// 1) find the buddy of 2: buddy_pfn = 2 ^ (1 << 1) = 0
// 2) combined_pfn = 0 & 2 = 0
// 3) order++, merge into [0,1,2,3]
// 4) find the buddy of 0: buddy_pfn = 0 ^ (1 << 2) = 4
// 5) combined_pfn = 0 & 4 = 0
// 6) order++, merge into [0,1,2,3,4,5,6,7]
// keep repeating steps 1) - 3) until everything merges into [0 - 15]
//
// case 2 - cannot merge up to the highest order
// ---------------------
// [32,33,34,35,36,37,38,39]
// [4,5,6,7][16,17,18,19]
// [0,1][20,21]
// here the result can only merge up to [0 - 7]; no further merging is possible
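
Case 1 can be checked with a few lines of arithmetic. The standalone snippet below (not kernel code) simply applies buddy_pfn = pfn ^ (1 << order) and combined_pfn = buddy_pfn & pfn repeatedly, assuming every buddy it finds is free, and prints the merge chain.

#include <stdio.h>

int main(void)
{
	unsigned long pfn = 2;              /* freeing the order-1 block [2,3] */
	unsigned int order = 1;
	const unsigned int max_order = 4;   /* stop once we reach [0,15] */

	while (order < max_order) {
		/* same formulas as __find_buddy_pfn() and the merge step */
		unsigned long buddy_pfn = pfn ^ (1UL << order);
		unsigned long combined_pfn = buddy_pfn & pfn;

		printf("order %u: block %lu merges with buddy %lu -> %lu\n",
		       order, pfn, buddy_pfn, combined_pfn);

		pfn = combined_pfn;
		order++;
	}
	printf("final block: pfn %lu, order %u (pages [0,%lu])\n",
	       pfn, order, (1UL << order) - 1);
	return 0;
}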

Everything above covers how the buddy system itself frees memory; now let's see how the pcplist side frees memory.

free_unref_page()

// ./mm/page_alloc.c

/*
* Free a 0-order page
*/
void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
// preprocess the page; this ends up calling free_pages_prepare()
if (!free_unref_page_prepare(page, pfn))
return;

local_irq_save(flags);
free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}

This function is essentially a wrapper: apart from some preprocessing of the page, the core part is the call to free_unref_page_commit(). Let's look at that function next.

free_unref_page_commit()

// ./mm/page_alloc.c

static void free_unref_page_commit(struct page *page, unsigned long pfn)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int migratetype;

migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);

/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
// in this case free directly through the buddy system - it can handle order == 0 as well
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
return;
}
migratetype = MIGRATE_MOVABLE;
}

// put the page straight back on the pcplist
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
// too many pages on the pcplist; release a batch back to the buddy system
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
}
}

This function is simple and direct; the only part that needs extra attention is free_pcppages_bulk(), so let's look at that next.

// ./mm/page_alloc.c

/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
* If the zone was previously in an "all pages pinned" state then look to
* see if this freeing clears that state.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);

// ------------------------------------------------------ (1)
while (count) {
struct list_head *list;

/*
* Remove pages from lists in a round-robin fashion. A
* batch_free count is maintained that is incremented when an
* empty list is encountered. This is so more pages are freed
* off fuller lists instead of spinning excessively around empty
* lists
*/
do {
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
list = &pcp->lists[migratetype];
} while (list_empty(list));

/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
batch_free = count;

do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
pcp->count--;

if (bulkfree_pcp_prepare(page))
continue;

list_add_tail(&page->lru, &head);

/*
* We are going to put the page back to the global
* pool, prefetch its buddy to speed up later access
* under zone->lock. It is believed the overhead of
* an additional test and calculating buddy_pfn here
* can be offset by reduced memory latency later. To
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
if (prefetch_nr++ < pcp->batch)
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}

spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);

/*
* Use safe version since after __free_one_page(),
* page->lru.next will not point to original list.
*/
// ------------------------------------------------------ (2)
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);

__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
}

This function is fairly simple; I split it into two parts:
(1) Walk the non-empty migratetype lists in a round-robin fashion, removing pages from their original lists and adding them to a temporary list;
(2) Every page that needs to go back to the buddy system is now on that temporary list; each one is freed back to the buddy system (with order == 0) according to its migratetype.
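
As an illustration of point (1), here is a deliberately simplified standalone sketch of a round-robin drain (not kernel code, and it omits the batch_free heuristic that lets fuller lists give up more pages per visit): it skips empty migratetype lists and takes one page at a time until the requested count has been drained.

#include <stdio.h>

#define MIGRATE_PCPTYPES 3

int main(void)
{
	/* Toy pcplists: number of pages per migratetype (illustrative only). */
	int lists[MIGRATE_PCPTYPES] = { 5, 0, 2 };
	int count = 4;                /* pages to drain back to the buddy system */
	int mt = 0, drained = 0;

	/* Simplified round-robin: visit the types in turn, skip empty lists. */
	while (count > 0 && (lists[0] + lists[1] + lists[2]) > 0) {
		if (lists[mt] > 0) {
			lists[mt]--;
			count--;
			drained++;
			printf("drained one page from migratetype %d\n", mt);
		}
		mt = (mt + 1) % MIGRATE_PCPTYPES;
	}
	printf("drained %d pages, remaining: [%d, %d, %d]\n",
	       drained, lists[0], lists[1], lists[2]);
	return 0;
}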

Case study - how Memblock releases its memory

Now let's fill in a gap left earlier: how Memblock hands its memory back to the buddy system.

// ./mm/memblock.c

// ################################################################################
// __free_pages_memory()
// ################################################################################
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order;

while (start < end) {
order = min(MAX_ORDER - 1UL, __ffs(start));
// free with as large an order as possible
while (start + (1UL << order) > end)
order--;

memblock_free_pages(pfn_to_page(start), start, order);

start += (1UL << order);
}
}

// ################################################################################
// memblock_free_pages()
// ################################################################################
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
__free_pages_core(page, order);
}

// ################################################################################
// __free_pages_core()
// ################################################################################
void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;

prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);
// update the number of pages managed by the buddy system in this zone
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
set_page_refcounted(page);
__free_pages(page, order);
}

// ################################################################################
// __free_pages()
// ################################################################################
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
// from here on we are back in the "Freeing memory" path described above
free_the_page(page, order);
}

This code is easy to follow. The only open question is: what migratetype do these Memblock pages end up with, and where is it set? If I find the answer later I will come back and update this. Apart from that, the code can be read straight through.
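
The order-selection arithmetic in __free_pages_memory() is easier to see with numbers. The standalone sketch below reimplements just that logic, using __builtin_ctzl() in place of the kernel's __ffs(); the pfn range is an arbitrary example.

#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
	/* An example pfn range handed back by memblock (values are made up). */
	unsigned long start = 6, end = 40;

	while (start < end) {
		/* __builtin_ctzl() plays the role of __ffs(): the index of the
		 * lowest set bit, i.e. the largest order start is aligned to. */
		unsigned long order = __builtin_ctzl(start);
		if (order > MAX_ORDER - 1)
			order = MAX_ORDER - 1;

		/* Shrink the order until the block fits inside the range. */
		while (start + (1UL << order) > end)
			order--;

		printf("free pfns [%lu, %lu] as one order-%lu block\n",
		       start, start + (1UL << order) - 1, order);
		start += 1UL << order;
	}
	return 0;
}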

References

  1. 深度剖析 Linux 伙伴系统的设计与实现(上)
  2. 深度剖析 Linux 伙伴系统的设计与实现(下)