【LM14】slab系统的创建和分配

硬核技术 Linux内核 Linux_5_0 ARMv8_64 Linux内存管理

 2023/12/26 

在上一篇笔记【LM13】slab系统的初始化中，我已经详细介绍了slab系统的初始化。slab的初始化阶段完成了启动slab缓存以及通用slab缓存的创建，在这篇笔记中我将先分析下一般情况下slab缓存是如何创建的；剩下的大部分内容将重点放在slab的内存分配上。

slab缓存的创建

kmem_cache_create()

这个是非启动阶段使用的slab缓存创建函数，它是一个封装函数，直接调用kmem_cache_create_usercopy()这个函数。

// ./mm/slab_common.c

/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Cannot be called within a interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align,
                                     slab_flags_t flags, void (*ctor)(void *))
{
    return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, ctor);
}

kmem_cache_create_usercopy()

// ./mm/slab_common.c

/**
 * kmem_cache_create_usercopy - Create a cache with a region suitable
 * for copying to userspace
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
 *
 * Cannot be called within a interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
                           unsigned int size, unsigned int align,
                           slab_flags_t flags,
                           unsigned int useroffset, unsigned int usersize,
                           void (*ctor)(void *))
{
    struct kmem_cache *s = NULL;
    const char *cache_name;
    int err;

    // 获取cpu hotplug的锁(定义CONFIG_HOTPLUG_CPU)
    get_online_cpus();
    // 获取mem hotplug的锁(定义CONFIG_MEMORY_HOTPLUG)
    get_online_mems();
    // 未定义CONFIG_MEMCG_KMEM时为空函数    
    memcg_get_cache_ids();
    // 获取slab的mutex
    mutex_lock(&slab_mutex);
    
    // CONFIG_DEBUG_VM = 1时对slab的名字、对象大小、是否在中断中等进行检查
    err = kmem_cache_sanity_check(name, size);
    if (err) {
        goto out_unlock;
    }

    /* Refuse requests with allocator specific flags */
    // 如果flag有SLAB_FLAGS_PERMITTED外的位，那么就直接返回错误
    if (flags & ~SLAB_FLAGS_PERMITTED) {
        err = -EINVAL;
        goto out_unlock;
    }

    /*
     * Some allocators will constraint the set of valid flags to a subset
     * of all flags. We expect them to define CACHE_CREATE_MASK in this
     * case, and we'll just provide them with a sanitized version of the
     * passed flags.
     */
    flags &= CACHE_CREATE_MASK;
    /* Fail closed on bad usersize of useroffset values. */
    if (WARN_ON(!usersize && useroffset) ||
        WARN_ON(size < usersize || size - usersize < useroffset))
        usersize = useroffset = 0;
    
    // 根据size, align, flag等在slab_caches中寻找和当前需求相似的slab专用缓存
    if (!usersize)
        s = __kmem_cache_alias(name, size, align, flags, ctor);
    if (s)
        goto out_unlock;
    
    // 根据输入名字和gfp flag生成slab cache的名字
    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }
    
    // ----------------------------------------------------------------------- (1)
    // 正式创建slab cache
    s = create_cache(cache_name, size, calculate_alignment(flags, align, size),
                     flags, useroffset, usersize, ctor, NULL, NULL);
    if (IS_ERR(s)) {
        err = PTR_ERR(s);
        kfree_const(cache_name);
    }

out_unlock:
    // 至此，slab cache已经创建成功(或失败)，解除之前所有的锁
    mutex_unlock(&slab_mutex);
    memcg_put_cache_ids();
    put_online_mems();
    put_online_cpus();
    
    // 如果slab创建失败，输出相应的错误信息
    if (err) {
        if (flags & SLAB_PANIC)
            panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                name, err);
        else {
            pr_warn("kmem_cache_create(%s) failed with error %d\n",
                name, err);
            dump_stack();
        }
        return NULL;
    }
    return s;
}

这个函数主要干了两件事情：1. 上锁并在slab_caches中寻找和当前需求相似的专用缓存；2. 如果没有找到类似的专用缓存，则调用create_cache()来创建该专用缓存(参考上面(1)处)。

create_cache()

static struct kmem_cache *create_cache(const char *name,
                                       unsigned int object_size, unsigned int align,
                                       slab_flags_t flags, unsigned int useroffset,
                                       unsigned int usersize, void (*ctor)(void *),
                                       struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
    struct kmem_cache *s;
    int err;

    if (WARN_ON(useroffset + usersize > object_size))
        useroffset = usersize = 0;

    err = -ENOMEM;
    // ------------------------------------------------------------------- (1)
    // 这是一个slab分配函数，这里我们只需要知道它会从kmem_cache分配一个对象
    // kmem_cache是kmem_cache类型的slab缓存
    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
    if (!s)
        goto out;
    
    // 更新kmem_cache对象的部分成员变量
    s->name = name;
    s->size = s->object_size = object_size;
    s->align = align;
    s->ctor = ctor;
    s->useroffset = useroffset;
    s->usersize = usersize;
    
    // CONFIG_MEMCG_KMEM未定义时函数返回0 
    err = init_memcg_params(s, memcg, root_cache);
    if (err)
        goto out_free_cache;

    // ------------------------------------------------------------------- (2)
    err = __kmem_cache_create(s, flags);
    if (err)
        goto out_free_cache;

    s->refcount = 1;
    list_add(&s->list, &slab_caches);
    memcg_link_cache(s);
out:
    if (err)
        return ERR_PTR(err);
    return s;

out_free_cache:
    destroy_memcg_params(s);
    kmem_cache_free(kmem_cache, s);
    goto out;
}

这个函数重要的地方就是(1)和(2)，(1)是使用slab分配kmem_cache缓存的对象；(2)是将这个分配的kmem_cache缓存的对象进行初始化。关于__kmem_cache_create()的细节，请参考【LM13】slab系统的初始化 - __kmem_cache_create()

slab缓存的分配

上面我已经分析了slab系统的初始化和创建，接下来我们就进入这篇笔记的重点部分 - slab缓存的分配。
在Linux内核中，我们有一系列的函数可以用于分配slab内存，其中最常见的就是kmalloc()，这个函数和malloc()函数类似。除此之外，在slab缓存的初始化和创建过程中，我们使用了一个函数 - kmem_cache_zalloc()，这个函数会从特定的kmem_cache中分配对象。下面我们就从这个函数出发，一步一步地分析slab函数的分配过程。

kmem_cache_zalloc()

在正式深入代码前，我们来简单看看整个流程大概是怎样的，

kmem_cache_zalloc()
---> kmem_cache_alloc()
     ---> slab_alloc()
          ---> __do_cache_alloc()
               ---> ____cache_alloc()
                    ---> cache_alloc_refill()

在这几个函数中，slab_alloc()是一个重要的函数，因为所有的顶层slab分配函数都会调用它。而它的前两层函数仅仅是封装函数，我们这里简单来看看。

// ./include/linux/slab.h

// ###################################################################
// kmem_cache_zalloc()
// ###################################################################
/*
 * Shortcuts
 */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
    // 获取一个kmem_cache类型的对象，如果需要获取新的页，则填充为0
    return kmem_cache_alloc(k, flags | __GFP_ZERO);
}

// ./mm/slab.c

// ###################################################################
// kmem_cache_allock()
// ###################################################################
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *ret = slab_alloc(cachep, flags, _RET_IP_);

    trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags);

    return ret;
}

slab_alloc()

这个函数做的事情就要多些了，在slab进行内存分配前后会进行一系列的准备和清理工作。核心部分是下面的(1)，内存的分配就是通过(1)完成的。(2)也比较重要，它会根据情况对slab分配后的对象进行RED_ZONE和POISON的处理。
这里我简单地介绍下RED_ZONE以及USER信息在对象中的布局，

// 简化模型 
// [padding][RED_ZONE][obj][RED_ZONE][USER_DATA]

// 详细模型
// [0, cachep->obj_offset - BYTES_PER_WORD - 1]                           - 这部分是padding
// [cachep->obj_offset - BYTES_PER_WORD][cachep->obj_offset - 1]          - 第一部分RED_ZONE 
// [cachep->obj_offset, cachep->size - 2 * BYTES_PER_WORD - 1]            - 对象
// [cachep->size - 2 * BYTES_PER_WORD, cachep->size - BYTES_PER_WORD - 1] - 第二部分RED_ZONE 
// [cachep->size - BYTES_PER_WORD, cachep->size - 1]                      - 用户信息

下面关于DEBUG相关的内容不是重点，了解一下就行了。下面来看看slab_alloc()函数。

// ./mm/slab.c

// ######################################################################
// slab_alloc()
// ######################################################################
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;
    // slab分配前的准备函数 - 根据flag判断是否允许睡眠、是否开启错误注入等
    cachep = slab_pre_alloc_hook(cachep, flags);
    if (unlikely(!cachep))
        return NULL;

    // 这个函数只做一件事 - 根据flag判断是否允许睡眠    
    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    // --------------------------------------------------------------------- (1) 
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    // --------------------------------------------------------------------- (2) 
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    prefetchw(objp);

    if (unlikely(flags & __GFP_ZERO) && objp)
        memset(objp, 0, cachep->object_size);
    // 做一些kasan, kmemleak和memcg相关工作(宏定义未开启都是空函数)
    slab_post_alloc_hook(cachep, flags, 1, &objp);
    return objp;
}

// ######################################################################
// cache_alloc_debugcheck_afters()
// ######################################################################
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                gfp_t flags, void *objp, unsigned long caller)
{
    WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
    if (!objp)
        return objp;
    // 当开启SLAB_POISON后，SLAB_STORE_USER和SLAB_RED_ZONE都将被清除，因此后面
    // 两个判断语句不会被执行
    if (cachep->flags & SLAB_POISON) {
        // 这两句不清楚作用    
        check_poison_obj(cachep, objp);
        slab_kernel_map(cachep, objp, 1, 0);
        // 向对象中写入特殊字段
        poison_obj(cachep, objp, POISON_INUSE);
    }
    // 在对象最后插入一些用户数据
    if (cachep->flags & SLAB_STORE_USER)
        *dbg_userword(cachep, objp) = (void *)caller;

    // 当开启RED_ZONE后，对象的大小增加了两个long long，一个位于对象的最后，一个位于对象的offset后面
    if (cachep->flags & SLAB_RED_ZONE) {
        // 检查RED_ZONE区域是否被修改
        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
                *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
            slab_error(cachep, "double free, or memory outside object was overwritten");
            pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
                   objp, *dbg_redzone1(cachep, objp),
                   *dbg_redzone2(cachep, objp));
        }
        // 更新RED_ZONE的值
        *dbg_redzone1(cachep, objp) = RED_ACTIVE;
        *dbg_redzone2(cachep, objp) = RED_ACTIVE;
    }

    // 在开启debug并设置SLAB_RED_ZONE后，obj_offset(cachep)返回的是真正对象的相对地址，
    // 在这里void指针+/-数字等于+/- 1Byte而不是4Byte，因此objp是真正对象的地址
    objp += obj_offset(cachep);
    // 如果存在对象的构造函数，调用它来初始化分配的对象    
    if (cachep->ctor && cachep->flags & SLAB_POISON)
        cachep->ctor(objp);
    if (ARCH_SLAB_MINALIGN &&
        ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
        pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
               objp, (int)ARCH_SLAB_MINALIGN);
    }
    return objp;
}
#else
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif

上面的代码对slab_alloc()中的(2)进行了分析，下面我们来单独看看slab_alloc()中的(1)这部分。

__do_cache_alloc()和___cache_alloc()

对于UMA的系统，__do_cache_alloc()函数就是一个纯粹的封装函数，它直接调用____cache_alloc()，

// ./mm/slab.c

// ##############################################################################
// __do_cache_alloc()
// ##############################################################################
static __always_inline void *__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    return ____cache_alloc(cachep, flags);
}

// ##############################################################################
// ____cache_alloc()
// ##############################################################################
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;

    check_irq_off();

    // ----------------------------------------------------------------- (1)
    // 先尝试从per-cpu的缓存中获取对象
    // 优点：
    // 1. 让一个对象尽可能地运行在同一个CPU上，可以让对象尽可能地使用同一个CPU的cache，有助于提高性能;
    // 2. 访问Per-CPU类型的本地对象缓冲池不需要获取额外的自旋锁，因为不会有另外的CPU来访问这些Per-CPU
    //    类型的对象缓存池，避免自旋锁的争用。  
    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        // 直接使用ac->avail - 1获取对象，ac->avail - 1指向的是最后一个有效的对象
        // 获取后并不需要更新其它的entry
        objp = ac->entry[--ac->avail];

        STATS_INC_ALLOCHIT(cachep);
        goto out;
    }

    STATS_INC_ALLOCMISS(cachep);
    // ----------------------------------------------------------------- (2)
    // 如果(1)失败那么就要去kmem_cache_node[]中找对象了(通过各种方式对per-cpu的cache进行补充)
    objp = cache_alloc_refill(cachep, flags);
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

和伙伴系统的分配函数一样，这个函数也分了几条路线。第一条路线就是在____cache_alloc()的(1)处先尝试从per-cpu的缓存获取对象，如果失败，则到____cache_alloc()的(2)处通过kmem_cache_node获取对象。(1)很简单，直接看代码即可，这里我们仔细看(2)处的函数。

cache_alloc_refill()

这个函数就比较复杂了，读源码的时候需要点耐心，下面就开始吧。

// ./mm/slab.c

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac, *shared;
    int node;
    void *list = NULL;
    struct page *page;

    check_irq_off();
    node = numa_mem_id();

    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;

    // 如果最近这个per-cpu的缓存没有被访问过并且其要求补充的对象过多，
    // 为了节约空间(因为这个per-cpu可能很少被访问)，函数会减少每次
    // 补充对象的数量
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    // 根据nid获取kmem_cache_node[]
    n = get_node(cachep, node);
    
    // 检查per-cpu缓存是否为空，正常情况下per-cpu缓存为空才会调用这个函数
    BUG_ON(ac->avail > 0 || !n);
    
    // 获取共享的array_cache    
    shared = READ_ONCE(n->shared);
    // 如果kmem_cache_node[]中没有空余的内存空间并且shared array_cache也没有空余的内存空间，
    // 那么就需要使用伙伴系统获取新的页
    if (!n->free_objects && (!shared || !shared->avail))
        goto direct_grow;

    spin_lock(&n->list_lock);
    shared = READ_ONCE(n->shared);

    /* See if we can refill from the shared array */
    // static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max)
    // 如果shared array_cache有至少一个对象的内存空间，那么我们就将shared array_cache中的对象复制到per-cpu缓存
    // 复制完后，不会清除shared array_cache中的内容(只更新ac->avail)
    if (shared && transfer_objects(ac, shared, batchcount)) {
        shared->touched = 1;
        goto alloc_done;
    }
    
    // 能走到这里，说明满足(n->free_objects && !shared)
    while (batchcount > 0) {
        /* Get slab alloc is to come from. */
        // 通过kmem_cache_node[]中的slabs_partial(第一优先级)和slabs_free(第二优先级)寻找slab page    
        page = get_first_slab(n, false);
        // 如果还是没有slab page，那么我们就需要通过伙伴系统获取新的页了
        if (!page)
            goto must_grow;

        check_spinlock_acquired(cachep);
        // ---------------------------------------------------------------- (1)
        batchcount = alloc_block(cachep, ac, page, batchcount);
        fixup_slab_list(cachep, n, page, &list);
    }

must_grow:
    // 如果上面成功完成对ac->entry[]的补充，kmem_cache_node[]中的对象就减少ac->avail
    // 如果上面对ac->entry[]的补充失败，ac->avail = 0，kmem_cache_node[]中的对象也是减少ac->avail
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);
    // 向对象中注入预设的值 - POISON_FREE    
    fixup_objfreelist_debug(cachep, &list);

direct_grow:
    // 大部分情况到这里ac->avail != 0, 少量情况到这里ac->avail == 0
    // 所以大部分情况这里不运行，少部分情况这里才会被执行进行slab缓存的补充 
    if (unlikely(!ac->avail)) {
        /* Check if we can use obj in pfmemalloc slab */
        // 未定义CONFIG_NET时该函数返回0
        if (sk_memalloc_socks()) {
            void *obj = cache_alloc_pfmemalloc(cachep, n, flags);

            if (obj)
                return obj;
        }
        // ---------------------------------------------------------------- (2)
        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

        /*
         * cache_grow_begin() can reenable interrupts,
         * then ac could change.
         */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
            // 到这里，我们有很大的概率获取新的页，可以再次调用alloc_block()函数来补充内存空间了    
            alloc_block(cachep, ac, page, batchcount);
        // ---------------------------------------------------------------- (3)
        cache_grow_end(cachep, page);

        // 如果到这里ac->avail还是0，那么我们就没有办法了
        if (!ac->avail)
            return NULL;
    }
    ac->touched = 1;

    return ac->entry[--ac->avail];
}

这个函数内容很干，这里我来简单总结下整个函数的流程，

整个函数都是通过kmem_cache_node[]来补充per-cpu缓存，首先需要在shared缓存寻找是否有空余的内存空间(至少有一个对象的空间)，如果有，就优先将其复制到per-cpu的缓存中；
如果shared缓存没有足够的内存空间，那么就去slab_list中寻找，然后再填充per-cpu的缓存；
如果slab_list还是没有足够的内存空间，那么就需要通过伙伴系统来获取新的页，并用来填充per-cpu的缓存，然后再将该页添加入slab_list中。

有了大概了认识，我们接下来就要深入看cache_alloc_refill()的(1), (2)和(3)了。首先我们来看alloc_block()这个函数，

alloc_block()

在进入这个函数之前，我想先介绍下对象、freelist是如何工作的。这里假设使用的是默认的方式 - 也就是对象和freelist都放在同一个页，并且freelist不是放在一个对象中也不是用一个byte用于索引。对于这种情况，下图显示了它们的排列方式以及和页的关系，

从图中我们可以看出，一个slab页被分成了三部分 - 绿色的部分是着色的offset(注意，根据下面源码可知，offset在不同的page中是不同的，这样可以确保不同page中的对象地址能错开，从而可以映射不同的cache line，提高cache的利用效率)，橘黄色的部分是对象，蓝色的部分是对象的索引。在页中，有三个变量 - s_mem, freelist和active一起用于获取空闲的对象，下面的例子展示了它们是如何工作的，

// 1. 初始状态 
// page->active = 0, 它指向freelist[0], 目前所有对象都是空闲的
// [X][O0空闲][O1空闲][O2空闲][O3空闲][O4空闲][...][0][1][2][3][4][...][N]
//                                                |
// 2. 连续获取4个对象
// page->active = 4, 它指向freelist[4], 目前前4个对象已经被使用
// [X][O0使用][O1使用][O2使用][O3使用][O4空闲][...][0][1][2][3][4][...][N]
//                                                            |
// 3. 释放对象1
// page->active = 3(先减1), 更新freelist[3]为1, 目前通过page->active获取的对象索引是1，也是下一个空闲的对象 
// [X][O0使用][O1空闲][O2使用][O3使用][O4空闲][...][0][1][2][1][4][...][N]
//                                                         |

从上面这个例子，我们还可以总结出几点，

freelist[page->active]及其后的freelist[X]都是指向空闲的对象；
在分配对象时，优先分配最近释放的对象。

有了上面的基础，我们可以来看alloc_block()函数了，这个函数完成从slab page到ac->entry的补充。

// ./mm/slab.c

// ##########################################################################
// alloc_block()
// ##########################################################################
/*
 * Slab list should be fixed up by fixup_slab_list() for existing slab
 * or cache_grow_end() for new slab
 */
static __always_inline int alloc_block(struct kmem_cache *cachep,
        struct array_cache *ac, struct page *page, int batchcount)
{
    /*
     * There must be at least one object available for
     * allocation.
     */
    // page->active是当前的freelist的索引，它不是对象的直接索引，但必须要小于kmem_cache中对象的数量 
    BUG_ON(page->active >= cachep->num);

    while (page->active < cachep->num && batchcount--) {
        STATS_INC_ALLOCED(cachep);
        STATS_INC_ACTIVE(cachep);
        STATS_SET_HIGH(cachep);

        ac->entry[ac->avail++] = slab_get_obj(cachep, page);
    }

    return batchcount;
}

// ##########################################################################
// slab_get_obj()
// ##########################################################################
static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
    void *objp;

    // 这个函数完成如下功能 - page->s_mem + cache->size * idx, 其中idx = get_free_obj(page, page->active)
    // 2. get_free_obj()函数功能如下 - ((freelist_idx_t *)page->freelist)[idx]
    // 
    // 通过上面可以看出几点，
    // 1. page->s_mem对应的是第一个对象的地址;
    // 2. 对象的idx并不直接等于page->active;
    // 3. page->freelist[page->active]存的才是对象的索引。
    // 对于前面两点很容易理解，对于第3点，参考上面的例子
    objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
    page->active++;

#if DEBUG
    if (cachep->flags & SLAB_STORE_USER)
        set_store_user_dirty(cachep);
#endif

    return objp;
}

// ##########################################################################
// index_to_obj()
// ##########################################################################
static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx)
{
    return page->s_mem + cache->size * idx;
}

下面进入cache_alloc_refill()函数的(2)部分，

cache_grow_begin()

这个函数最关键的部分是通过伙伴系统获取页，第二重要的内容是对slab对象管理信息的更新。

// ./mm/slab.c

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static struct page *cache_grow_begin(struct kmem_cache *cachep,
                gfp_t flags, int nodeid)
{
    void *freelist;
    size_t offset;
    gfp_t local_flags;
    int page_node;
    struct kmem_cache_node *n;
    struct page *page;

    /*
     * Be lazy and only check for valid flags here,  keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
        gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
        flags &= ~GFP_SLAB_BUG_MASK;
        pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
                invalid_mask, &invalid_mask, flags, &flags);
        dump_stack();
    }
    WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

    check_irq_off();
    if (gfpflags_allow_blocking(local_flags))
        local_irq_enable();

    /*
     * Get mem for the objs.  Attempt to allocate a physical page from
     * 'nodeid'.
     */
    // ------------------------------------------------------------------- (1)    
    page = kmem_getpages(cachep, local_flags, nodeid);
    if (!page)
        goto failed;

    page_node = page_to_nid(page);
    // 获取page对应的kmem_cache_node[]
    n = get_node(cachep, page_node);

    /* Get colour for the slab, and cal the next value. */
    // colour以及offset主要用于调整对象在页中的起始地址，在下面(2)处进行详细地讨论
    n->colour_next++;
    if (n->colour_next >= cachep->colour)
        n->colour_next = 0;

    offset = n->colour_next;
    if (offset >= cachep->colour)
        offset = 0;

    offset *= cachep->colour_off;

    /*
     * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
     * page_address() in the latter returns a non-tagged pointer,
     * as it should be for slab pages.
     */
    kasan_poison_slab(page);

    /* Get slab management. */
    // ------------------------------------------------------------------- (2)    
    freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, page_node);
    if (OFF_SLAB(cachep) && !freelist)
        goto opps1;

    // 将page和kmem_cache和freelist联系起来
    // page->slab_cache = cache;
    // page->freelist = freelist;
    slab_map_pages(cachep, page, freelist);

    // 对于OBJFREELIST_SLAB的情况，如果不随机化，那么就将将最后一个对象地址作为freelist;如果随机化，
    // 那么就不做任何操作
    cache_init_objs(cachep, page);

    if (gfpflags_allow_blocking(local_flags))
        local_irq_disable();

    return page;

opps1:
    kmem_freepages(cachep, page);
failed:
    if (gfpflags_allow_blocking(local_flags))
        local_irq_disable();
    return NULL;
}

cache_grow_begin()的(1)通过__alloc_pages_node()函数获取页，这个函数伙伴系统相关的文章有讲；
cache_grow_begin()的(2)处函数值得单独拎出来看看，

// ./mm/slab.c

/*
 * Get the memory for a slab management obj.
 *
 * For a slab cache when the slab descriptor is off-slab, the
 * slab descriptor can't come from the same cache which is being created,
 * Because if it is the case, that means we defer the creation of
 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
 * And we eventually call down to __kmem_cache_create(), which
 * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
 * This is a "chicken-and-egg" problem.
 *
 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
 * which are all initialized during kmem_cache_init().
 */
static void *alloc_slabmgmt(struct kmem_cache *cachep,
                           struct page *page, int colour_off,
                           gfp_t local_flags, int nodeid)
{
    void *freelist;
    void *addr = page_address(page);

    // 这里我们可以看出对象是放在页的开始部分
    page->s_mem = addr + colour_off;
    page->active = 0;

    if (OBJFREELIST_SLAB(cachep))
        freelist = NULL;
    else if (OFF_SLAB(cachep)) {
        /* Slab management obj is off-slab. */
        freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid);
        freelist = kasan_reset_tag(freelist);
        if (!freelist)
            return NULL;
    } else {
        /* We will use last bytes at the slab for freelist */
        // 这里我们可以看出freelist是放在页的末尾
        // 这里我们还可以看出，每个slab cache可以有多个page，对于多个page的情况，freelist
        // 依然在页的末尾
        freelist = addr + (PAGE_SIZE << cachep->gfporder) - cachep->freelist_size;
    }

    return freelist;
}

这个函数最大的作用是告诉我们对象和freelist是如何分布在页中的，这点印证了cache_alloc_refill()(1)中的相关分析。注意这里OBJFREELIST_SLAB的情况并没有给freelist赋值，它会在之后的cache_init_objs()函数中被赋值。
现在我们回到cache_alloc_refill()函数，在它的(3)之前，我们获得了新的页，也将页中对象的地址给了per-cpu缓存，接下来我们就需要将这个页也添加进kmem_cache_node中适合的list中了，这部分在cache_grow_end()中完成。

// ./mm/slab.c

static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
{
    struct kmem_cache_node *n;
    void *list = NULL;

    check_irq_off();

    if (!page)
        return;

    INIT_LIST_HEAD(&page->lru);
    n = get_node(cachep, page_to_nid(page));

    spin_lock(&n->list_lock);
    n->total_slabs++;
    // 根据page->active的值，将页放入free/partial/full的list中
    if (!page->active) {
        list_add_tail(&page->lru, &(n->slabs_free));
        n->free_slabs++;
    } else
        // 对于OBJFREELIST_SLAB的情况，在page->active == cachep->num时(这说明page已经满了)，
        // page->freelist = NULL
        fixup_slab_list(cachep, n, page, &list);

    STATS_INC_GROWN(cachep);
    n->free_objects += cachep->num - page->active;
    spin_unlock(&n->list_lock);

    fixup_objfreelist_debug(cachep, &list);
}

至此，cache_alloc_refill()就分析完了，同时___cache_alloc()等函数也都完成了。我们已经了解从kmem_cache_zalloc()开始，slab是如何分配内存空间的，接下来再简单看看另外一个函数。

kmalloc()

和kmem_cache_zalloc()不同，kmalloc()不是通过某个kmem_cache获取相应的对象，而是在通用的缓存中获取对象。下面我们先来看看kmalloc()是如何调用底层分配函数的，

kmalloc()
----> __kmalloc()
      ----> __do_kmalloc()
            ----> kmalloc_slab()
            ----> slab_alloc()

kmalloc()函数如下，看上去很长，真正有作用的就是最后一句，而最后调用的函数__kmalloc()同样只是一个封装函数，

// ./mm/slab.c

// ####################################################################
// kmalloc()
// ####################################################################
/**
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *    Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *    Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *    Allocation will not sleep.  May use emergency pools.
 *
 * %GFP_HIGHUSER
 *    Allocate memory from high memory on behalf of user.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_HIGH
 *    This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *    Indicate that this allocation is in no way allowed to fail
 *    (think twice before using).
 *
 * %__GFP_NORETRY
 *    If memory is not immediately available,
 *    then give up at once.
 *
 * %__GFP_NOWARN
 *    If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *    Try really hard to succeed the allocation but fail
 *    eventually.
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
        unsigned int index;
#endif
        if (size > KMALLOC_MAX_CACHE_SIZE)
            return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
        index = kmalloc_index(size);

        if (!index)
            return ZERO_SIZE_PTR;

        return kmem_cache_alloc_trace(
                kmalloc_caches[kmalloc_type(flags)][index],
                flags, size);
#endif
    }
    return __kmalloc(size, flags);
}

// ####################################################################
// __kmalloc()
// ####################################################################
void *__kmalloc(size_t size, gfp_t flags)
{
    return __do_kmalloc(size, flags, _RET_IP_);
}

__do_kmalloc()

接下来看做点事情的函数了，这个函数很简单，主要做两件事情。第一，获取通用的slab缓存，第二，在缓存中获取对象。

/**
 * __do_kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 * @caller: function caller for debug tracking of the caller
 */
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
                      unsigned long caller)
{
    struct kmem_cache *cachep;
    void *ret;

    if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
        return NULL;
    // 通过通用的slab缓存获取通用缓存
    cachep = kmalloc_slab(size, flags);
    if (unlikely(ZERO_OR_NULL_PTR(cachep)))
        return cachep;
    // 获取对象    
    ret = slab_alloc(cachep, flags, caller);

    ret = kasan_kmalloc(cachep, ret, size, flags);
    trace_kmalloc(caller, ret, size, cachep->size, flags);

    return ret;
}

总结

话不多说了，最后放一张简化版的slab缓存分配流程图作为总结吧。

参考资料

Next Post

2023投资总结
Previous Post

【LM13】slab系统的初始化

CATALOG

1. slab缓存的创建
2. slab缓存的分配
3. 总结
4. 参考资料