12. 【Slab】4 - slab的基本操作 - kmem_cache_alloc()

Linux内核内存管理 Linux 2.6.11

 2023/02/27 

在上一篇笔记 11. 【Slab】3 - Slab的基本操作 - kmem_cache_create() 中，我们完成了对高速缓存的分配。在在那篇笔记中，我们提到了kmem_cache_alloc()函数，这个函数用于获取对象，是一个很重要的函数。在这篇笔记中，我将仔细分析这个函数。有了这个函数，我们就可以获取对象了，并且这里和伙伴系统又联系在了一起。通过学习这个函数，我们可以知道slab是何时真正获取页和对象。时间紧任务重，赶紧开始源码分析吧。

kmem_cache_alloc()

这个函数只是一个封装函数，他并没有做其它任何事情。

/*
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
{
    return __cache_alloc(cachep, flags);
}

__cache_alloc()

这个函数逻辑上是比较简单的，首先判断ac->avail是否为0，然后根据情况从CPU的本地缓存获取对象或者通过cache_alloc_refill()获取对象。下面分别对这两种情况进行讨论，首先看简单的 - ac->avail != 0，也就是CPU的本地高速缓存有可用的对象。

static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
{
    unsigned long save_flags;
    void* objp;
    struct array_cache *ac;

    cache_alloc_debugcheck_before(cachep, flags);

    local_irq_save(save_flags);
    
    /* 
        ac_data(cachep)用于获取CPU的cache_array的指针，其函数原型如下：
           
        static inline struct array_cache *ac_data(kmem_cache_t *cachep)
        {
            return cachep->array[smp_processor_id()];
        }
    */
    ac = ac_data(cachep);
    if (likely(ac->avail)) {
        STATS_INC_ALLOCHIT(cachep);
        ac->touched = 1;

        /*
            这个函数我们是将(ac + 1)这个array_cache类型的指针转换成(void **)，为什么这样转换？
            因为在(ac + 1)后正好是对象的地址，(ac + 1)是第一个对象地址(void *)的地址(void **)，因此这里用指针的指针。
            [array_cache][p_obj][p_obj]...[p_obj]
                         |
                       void **

            static inline void **ac_entry(struct array_cache *ac)
            {
                return (void **)(ac + 1);
            }    

            [--ac->avail]等价于[--(ac->avail)]，avail指向第一个空的位置，因此减一后就是最后一个非空的位置。
        */
        objp = ac_entry(ac)[--ac->avail];
    } else {
        ...
        ...
    }
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
    return objp;
}

上面这部分还是比较简单的，但如果ac->avail = 0，那么情况就比较复杂了，我们需要调用cache_alloc_refill()函数，下面就来看这个函数吧。

static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
{
    ...
    } else {
        STATS_INC_ALLOCMISS(cachep);
        objp = cache_alloc_refill(cachep, flags);
    }
    ...
}

cache_alloc_refill()

static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;

    check_irq_off();
    ac = ac_data(cachep);
retry:
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /* if there was little recent activity on this
         * cache, then perform only a partial refill.
         * Otherwise we could generate refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = list3_data(cachep);

    BUG_ON(ac->avail > 0);
    spin_lock(&cachep->spinlock);

    // 先检查shared的本地高速缓存是否有空闲对象可以使用
    if (l3->shared) {
        struct array_cache *shared_array = l3->shared;
        if (shared_array->avail) {
            // 确保使用的对象块不超过shared_array->avail
            if (batchcount > shared_array->avail)
                batchcount = shared_array->avail;
            shared_array->avail -= batchcount;
            ac->avail = batchcount;
            // 将shared的本地高速缓存中最后batchcount个对象的地址复制到ac的对象地址区
            memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail], sizeof(void *) * batchcount);
            shared_array->touched = 1;
            // 到了alloc_done后就可以直接返回ac_entry(ac)[--ac->avail]，也就是最后一个对象的地址。
            goto alloc_done;
        }
    }

    // 如果没有通过shared的本地高速缓存获取对象，那么就将执行下面语句
    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        // 这里主要是判断用哪个list - partial还是free，如果都为空 - entry == &l3->slabs_partial 和 entry == &l3->slabs_free，
        // 那么必须要去must_grow了
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        // 只要partial或者free list有slab，那么就从该slab中获取对象
        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);
        // 如果需要的batchcount过多，那么slabp->inuse < cachep->num就会提前不满足，跳出循环
        // 这个函数只是补充对象，因此就算其剩余对象不够batchcount也没关系
        while (slabp->inuse < cachep->num && batchcount--) {
            kmem_bufctl_t next;
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            /* get obj pointer */
            // 这里有个小知识 - void *指针+/-数字就是直接加数字，和有类型的指针不一样，不等于sizeof(type) * number
            ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free * cachep->objsize;
            slabp->inuse++;

            // 这里需要先了解slab描述符和对象描述符，我们假设这两个描述符都在slab内部
            //
            //             对象描述符            对象        
            // [slab描述符][X][3][X][BUFCTL_END][  ][空闲][  ][空闲]
            //
            // 在ULK P333中的图更能直观的解释这两个描述符，简单来说，对象描述符和对象一一对应，但只有空闲对象的对象描述符才有意义。
            // 它的值是下一个空闲对象的下标。比如上面例子中，obj_des[1]的值是3，其中3是下一个空闲的对象快，这样就形成了一个类似链表
            // 的数据结构。
            // 
            // slabp->free是下一个空闲块的下标（现在这个块已经不是空闲的了），用这个下标可以定位对象描述符从而可以获得下一个空闲块的
            // 下标，用这个下标可以更新slab->free
            next = slab_bufctl(slabp)[slabp->free];
            slabp->free = next;
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        // 从partial/free list到partial/full list
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    // 这里之所以要减去ac->avail是因为正常情况下，程序也会运行到这里，我们需要更新list的free_objects
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&cachep->spinlock);

    // 初始化以及其它极少情况需要执行这部分代码，比如上面跳到must_grow的情况
    if (unlikely(!ac->avail)) {
        int x;
        // 到这里我们真的需要增加高速缓存的容量了，使用伙伴系统获取新的页，下面再分析这个函数
        x = cache_grow(cachep, flags, -1);
        
        // cache_grow can reenable interrupts, then ac could change.
        ac = ac_data(cachep);
        if (!x && ac->avail == 0)    // no objects in sight? abort
            return NULL;

        // 我们终于增加了高速缓存的容量了，赶紧重新去获取对象吧！
        if (!ac->avail)
            goto retry;
    }
    ac->touched = 1;
    return ac_entry(ac)[--ac->avail];
}

cache_grow()

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow (kmem_cache_t * cachep, int flags, int nodeid)
{
    struct slab   *slabp;
    void          *objp;
    size_t        offset;
    int           local_flags;
    unsigned long ctor_flags;

    /* Be lazy and only check for valid flags here,
      * keeping it out of the critical path in kmem_cache_alloc().
     */
    if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
        BUG();
    if (flags & SLAB_NO_GROW)
        return 0;

    ctor_flags = SLAB_CTOR_CONSTRUCTOR;
    local_flags = (flags & SLAB_LEVEL_MASK);
    if (!(local_flags & __GFP_WAIT))
        /*
         * Not allowed to sleep.  Need to tell a constructor about
         * this - it might need to know...
         */
        ctor_flags |= SLAB_CTOR_ATOMIC;

    /* About to mess with non-constant members - lock. */
    check_irq_off();
    spin_lock(&cachep->spinlock);

    /* Get colour for the slab, and cal the next value. */
    // 开始计算这个slab需要偏移多少，偏移多少由 (颜色 * cachep->colour_off) 计算出
    offset = cachep->colour_next;
    cachep->colour_next++;
    if (cachep->colour_next >= cachep->colour)
        cachep->colour_next = 0;
    offset *= cachep->colour_off;

    spin_unlock(&cachep->spinlock);

    if (local_flags & __GFP_WAIT)
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags);

    // 这个函数我就不具体分析了，简单来说就是通过伙伴系统获取空白页、修改slab标志位并返回其地址
    // 这里的objp并不是对象地址，而是获取页的首地址
    if (!(objp = kmem_getpages(cachep, flags, nodeid)))
        goto failed;

    // 这个函数看下面代码分析，其返回是slab描述符的地址
    if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
        goto opps1;

    // 主要是设置这两个 - SET_PAGE_CACHE(page, cachep)和SET_PAGE_SLAB(page, slabp)
    // 这里是用page->lru.next和page->lru.prev实现page和cache/slab的联系
    set_slab_attr(cachep, slabp, objp);

    // 这里实现对象描述符的链表化的初始化 - [slab描述符][1][2][3][BUFCTL_END]...[空闲][空闲][空闲][空闲]
    cache_init_objs(cachep, slabp, ctor_flags);

    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    check_irq_off();
    spin_lock(&cachep->spinlock);

    // 将新分配的slab放入相应链表
    list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
    STATS_INC_GROWN(cachep);
    list3_data(cachep)->free_objects += cachep->num;
    spin_unlock(&cachep->spinlock);
    return 1;
opps1:
    kmem_freepages(cachep, objp);
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}

alloc_slabmgmt()

/* Get the memory for a slab management obj. */
static struct slab* alloc_slabmgmt (kmem_cache_t *cachep, void *objp, int colour_off, int local_flags)
{
    struct slab *slabp;

    // slab描述符不在该slab中，我们需要重新获取一个对象，这个对象在cachep->slabp_cache中
    if (OFF_SLAB(cachep)) {
        /* Slab management obj is off-slab. */
        slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
        if (!slabp)
            return NULL;
    // slab描述符在该slab中，那就直接放在colour_off后面就行
    } else {
        slabp = objp + colour_off;
        colour_off += cachep->slab_size;
    }
    
    // 更新slab的部分成员变量
    slabp->inuse = 0;
    // 此时的slabp->colouroff以及包含了colour_offset + slab_des + object_des
    slabp->colouroff = colour_off;
    slabp->s_mem = objp + colour_off;

    return slabp;
}

至此，kmem_cache_alloc()函数分析就结束了，不求细节的话，基本能够理解slab如何分配对象了。从这个函数可以看出，刚创建的高速缓存是没有对象的，需要调用cache_grow()来获取slab以及其中的对象。当然，初始化时和之后的情况又不太一样，我会在最后一篇总结笔记中做一些对比。下一篇笔记暂定是和slab销毁、释放空间有关。这部分还没有开始看代码，争取3月底前完成吧。

Next Post

13. 【Slab】5 - slab、高速缓存以及对象的释放和销毁
Previous Post

11. 【Slab】3 - slab的基本操作 - kmem_cache_create()

CATALOG

1. kmem_cache_alloc()
2. __cache_alloc()