从零到负一

【LM15】Freeing in the slab system

2024/01/22

In the previous note, 【LM14】Creation and Allocation in the slab System, I analyzed how objects are allocated from a slab cache. In this note I continue with the next important part of the slab system: freeing. Compared with allocation, the free path of a slab cache is simpler. Starting from kmem_cache_free(), let us walk through the freeing of a slab object step by step.
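Before diving into the internals, here is a minimal, hypothetical kernel-module sketch showing where kmem_cache_free() sits in the usual allocate/free pairing. The cache name foo_cache and struct foo are made up for illustration; the calls themselves (kmem_cache_create/alloc/free/destroy) are the real slab interface.

// A hypothetical minimal module; foo_cache and struct foo are made up.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo { int a, b; };

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	struct foo *f;

	/* one cache, one object, then the free path analyzed below */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, 0, NULL);
	if (!foo_cache)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
	if (f)
		kmem_cache_free(foo_cache, f);	/* enters the path below */

	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");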

kmem_cache_free()

// ./mm/slab.c

/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
	unsigned long flags;
	/* check that the object objp really comes from the kmem_cache cachep */
	cachep = cache_from_obj(cachep, objp);
	if (!cachep)
		return;

	local_irq_save(flags);
	/* I skip everything debug-related here */
	debug_check_no_locks_freed(objp, cachep->object_size);
	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(objp, cachep->object_size);
	__cache_free(cachep, objp, _RET_IP_);
	local_irq_restore(flags);

	trace_kmem_cache_free(_RET_IP_, objp);
}

cache_from_obj()

This function checks whether an object really belongs to a given kmem_cache.

// ./mm/slab.h

// #############################################################################
// cache_from_obj()
// #############################################################################
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
	struct kmem_cache *cachep;
	struct page *page;

	/*
	 * When kmemcg is not being used, both assignments should return the
	 * same value. but we don't want to pay the assignment price in that
	 * case. If it is not compiled in, the compiler should be smart enough
	 * to not do even the assignment. In that case, slab_equal_or_root
	 * will also be a constant.
	 */
	if (!memcg_kmem_enabled() &&
	    !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
		return s;
	// ------------------------------------------------------------ (1)
	page = virt_to_head_page(x);
	cachep = page->slab_cache;
	if (slab_equal_or_root(cachep, s))
		return cachep;

	pr_err("%s: Wrong slab cache. %s but object is from %s\n",
	       __func__, s->name, cachep->name);
	WARN_ON_ONCE(1);
	return s;
}

// ./include/linux/mm.h

// #############################################################################
// virt_to_head_page()
// #############################################################################
static inline struct page *virt_to_head_page(const void *x)
{
	struct page *page = virt_to_page(x);

	return compound_head(page);
}

// ./arch/arm64/include/asm/memory.h
#ifndef CONFIG_SPARSEMEM_VMEMMAP
#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#else
#define virt_to_page(vaddr)	((struct page *)((__virt_to_pgoff(vaddr)) | VMEMMAP_START))
#endif

#define __pa(x)			__virt_to_phys((unsigned long)(x))
#define __virt_to_phys(x)	__virt_to_phys_nodebug(x)
#define __virt_to_phys_nodebug(x) ({				\
	phys_addr_t __x = (phys_addr_t)(x);			\
	__is_lm_address(__x) ? __lm_to_phys(__x) :		\
			       __kimg_to_phys(__x);		\
})
/*
 * The linear kernel range starts in the middle of the virtual address
 * space. Testing the top bit for the start of the region is a
 * sufficient check.
 */
#define __is_lm_address(addr)	(!!((addr) & BIT(VA_BITS - 1)))
#define __lm_to_phys(addr)	(((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
#define __kimg_to_phys(addr)	((addr) - kimage_voffset)

(1) is the most important part of this function: it takes the address of obj and finds the page behind it, implemented through a chain of macros. Here we assume CONFIG_SPARSEMEM_VMEMMAP is not defined; a series of conversions then leads from the object's virtual address to its struct page. For the virtual memory layout and the VA->PA conversion, see the memory-layout part of 【LM05】paging_init() and the Memory Layout; for pfn_to_page(), see 【LM06】bootmem_init() and the Linux Memory Model. Once we have the page of obj, the rest of the function is straightforward; the source code speaks for itself.
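To make the macro chain concrete, below is a user-space toy (explicitly not kernel code) of the !CONFIG_SPARSEMEM_VMEMMAP path: __pa() strips the linear-map bits and adds the physical RAM base (like __lm_to_phys()), shifting right by PAGE_SHIFT yields the pfn, and pfn_to_page() becomes a plain index into a flat mem_map array. All constants and the tiny mem_map here are made-up stand-ins for the real kernel symbols.

// User-space sketch: virt_to_page(kaddr) == pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_OFFSET	0xffff000000000000UL	/* start of the linear map (toy value) */
#define PHYS_OFFSET	0x40000000UL		/* start of physical RAM   (toy value) */

struct page { int dummy; };
static struct page mem_map[1024];		/* toy flat memmap */

/* like __lm_to_phys(): strip the linear-map bits, add the RAM base */
static uint64_t pa(uint64_t va) { return (va & ~PAGE_OFFSET) + PHYS_OFFSET; }

static struct page *virt_to_page(uint64_t va)
{
	uint64_t pfn = pa(va) >> PAGE_SHIFT;	/* __pa(kaddr) >> PAGE_SHIFT */
	return &mem_map[pfn - (PHYS_OFFSET >> PAGE_SHIFT)];	/* pfn_to_page() */
}

int main(void)
{
	uint64_t obj = PAGE_OFFSET + 5 * PAGE_SIZE + 0x30;	/* an "object" in page 5 */
	printf("page index = %ld\n", (long)(virt_to_page(obj) - mem_map));	/* -> 5 */
	return 0;
}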

I do have one question here: why doesn't this function stop an erroneous free? From the code, if obj does not come from this kmem_cache, nothing beyond a warning message prevents the free. The object simply gets released into another kmem_cache, which plants a hidden hazard: the kernel may crash later because of it. My understanding is that kernel code is assumed to be correct and should never free an object into the wrong cache; if that happens, the code must be fixed, so the kernel does not bother to block the free here.

__cache_free()

This function is a thin wrapper; it calls ___cache_free() to do the actual freeing.

// ./mm/slab.c

// #############################################################################
// __cache_free()
// #############################################################################
/*
 * Release an obj back to its cache. If the obj has a constructed state, it must
 * be in this state _before_ it is released. Called with disabled ints.
 */
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
					 unsigned long caller)
{
	/* Put the object into the quarantine, don't touch it for now. */
	if (kasan_slab_free(cachep, objp, _RET_IP_))
		return;

	___cache_free(cachep, objp, caller);
}

// #############################################################################
// ___cache_free()
// #############################################################################
void ___cache_free(struct kmem_cache *cachep, void *objp,
		   unsigned long caller)
{
	struct array_cache *ac = cpu_cache_get(cachep);

	check_irq_off();
	kmemleak_free_recursive(objp, cachep->flags);
	/* debug-related code is not considered here */
	objp = cache_free_debugcheck(cachep, objp, caller);

	/*
	 * Skip calling cache_free_alien() when the platform is not numa.
	 * This will avoid cache misses that happen while accessing slabp (which
	 * is per page memory reference) to get nodeid. Instead use a global
	 * variable to skip the call, which is mostly likely to be present in
	 * the cache.
	 */
	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
		return;
	/* case1: ac has not reached its limit yet, so simply free obj into ac */
	if (ac->avail < ac->limit) {
		STATS_INC_FREEHIT(cachep);
	/* case2: ac is full, so a flush has to happen first */
	} else {
		STATS_INC_FREEMISS(cachep);
		cache_flusharray(cachep, ac);
	}
	/* net-related code is not discussed here; this function returns 0 */
	if (sk_memalloc_socks()) {
		struct page *page = virt_to_head_page(objp);

		if (unlikely(PageSlabPfmemalloc(page))) {
			cache_free_pfmemalloc(cachep, page, objp);
			return;
		}
	}

	ac->entry[ac->avail++] = objp;
}

Let us look at case1 first. It is simple: the per-cpu cache still has a free slot, so the object is simply dropped into it. One detail deserves attention: when the obj goes back into the per-cpu cache, the slab page it belongs to is not updated; only the per-cpu cache's avail counter changes. The slab page is updated only when objects are flushed from the per-cpu cache back into their slab pages, which is exactly what we will see in case2.
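A small user-space model of this fast path may help, assuming a much-simplified array_cache: freeing an object while avail < limit is nothing more than storing a pointer and bumping avail; no list or struct page is touched. The struct and function names below are toy stand-ins, not the kernel's.

// User-space toy of the per-cpu array_cache fast path.
#include <stdio.h>

#define AC_LIMIT 4

struct array_cache {
	unsigned int avail;
	unsigned int limit;
	void *entry[AC_LIMIT];
};

/* case1: there is room in the per-cpu array -> O(1) free, no list/page work */
static int ac_free(struct array_cache *ac, void *objp)
{
	if (ac->avail < ac->limit) {
		ac->entry[ac->avail++] = objp;
		return 1;		/* FREEHIT */
	}
	return 0;			/* FREEMISS: would call cache_flusharray() */
}

int main(void)
{
	struct array_cache ac = { .avail = 0, .limit = AC_LIMIT };
	int obj[5];

	for (int i = 0; i < 5; i++)
		printf("free obj%d: %s\n", i,
		       ac_free(&ac, &obj[i]) ? "hit (stays in ac)" : "miss (flush needed)");
	return 0;
}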

cache_flusharray()

static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
	int batchcount;
	struct kmem_cache_node *n;
	int node = numa_mem_id();
	LIST_HEAD(list);

	batchcount = ac->batchcount;

	check_irq_off();
	n = get_node(cachep, node);
	spin_lock(&n->list_lock);
	// --------------------------------------------------------------- (1)
	if (n->shared) {
		struct array_cache *shared_array = n->shared;
		int max = shared_array->limit - shared_array->avail;
		if (max) {
			if (batchcount > max)
				batchcount = max;
			memcpy(&(shared_array->entry[shared_array->avail]),
			       ac->entry, sizeof(void *) * batchcount);
			shared_array->avail += batchcount;
			goto free_done;
		}
	}

	// --------------------------------------------------------------- (2)
	free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
	{
		int i = 0;
		struct page *page;

		list_for_each_entry(page, &n->slabs_free, lru) {
			BUG_ON(page->active);

			i++;
		}
		STATS_SET_FREEABLE(cachep, i);
	}
#endif
	spin_unlock(&n->list_lock);
	/*
	 * (2) may have detached some completely free slab pages; they are all
	 * destroyed here. Destroying means updating the page/slab metadata,
	 * returning the pages to the buddy system, and freeing the freelist
	 * objects from the off-slab kmem_cache.
	 */
	slabs_destroy(cachep, &list);
	ac->avail -= batchcount;
	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}

Look at (1) first. As the previous note showed, not every kmem_cache has a shared array. For a cache that has one, as long as the shared array still has free room we can copy objects from the per-cpu cache into it, leaving the per-cpu cache with enough space for the objects being released. If the kmem_cache has no shared array (or the shared array is already full), we have to execute the function at (2).
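The following user-space sketch condenses the two branches just discussed: if a shared array with free room exists, the oldest batchcount pointers are memcpy'd into it; otherwise they would go to free_block(); either way the survivors are slid to the front of the per-cpu array with memmove(), exactly as at the end of cache_flusharray(). All types here are simplified toys, not the kernel structures.

// User-space toy of the flush logic in cache_flusharray().
#include <stdio.h>
#include <string.h>

struct toy_ac { unsigned avail, limit; void *entry[8]; };

static void toy_flush(struct toy_ac *ac, struct toy_ac *shared, unsigned batchcount)
{
	if (shared) {
		unsigned max = shared->limit - shared->avail;
		if (max) {
			if (batchcount > max)
				batchcount = max;
			memcpy(&shared->entry[shared->avail], ac->entry,
			       sizeof(void *) * batchcount);
			shared->avail += batchcount;
			goto done;
		}
	}
	/* no shared array (or it is full): free_block() would push the
	 * objects back into their slab pages here */
done:
	ac->avail -= batchcount;
	memmove(ac->entry, &ac->entry[batchcount], sizeof(void *) * ac->avail);
}

int main(void)
{
	struct toy_ac ac = { .avail = 4, .limit = 4 }, shared = { .avail = 0, .limit = 8 };
	int o[4];
	for (int i = 0; i < 4; i++)
		ac.entry[i] = &o[i];

	toy_flush(&ac, &shared, 2);	/* batchcount = 2 */
	printf("ac.avail=%u shared.avail=%u\n", ac.avail, shared.avail);	/* 2 and 2 */
	return 0;
}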

free_block()

With no usable shared array, the only option is to release the objects back into their slab pages.

/*
 * Caller needs to acquire correct kmem_cache_node's list_lock
 * @list: List of detached free slabs should be freed by caller
 */
static void free_block(struct kmem_cache *cachep, void **objpp,
		       int nr_objects, int node, struct list_head *list)
{
	int i;
	struct kmem_cache_node *n = get_node(cachep, node);
	struct page *page;

	n->free_objects += nr_objects;

	for (i = 0; i < nr_objects; i++) {
		void *objp;
		struct page *page;
		/* an object from the per-cpu cache */
		objp = objpp[i];
		/* find the slab page this object belongs to */
		page = virt_to_head_page(objp);
		list_del(&page->lru);
		check_spinlock_acquired_node(cachep, node);
		/*
		 * The obj was copied from this slab page into the per-cpu cache,
		 * so it can always be released back into this slab page: its
		 * objnr is written into the updated freelist.
		 */
		slab_put_obj(cachep, page, objp);
		STATS_DEC_ACTIVE(cachep);

		/* put the slab back on the appropriate list */
		/* fixup slab chains */
		if (page->active == 0) {
			list_add(&page->lru, &n->slabs_free);
			n->free_slabs++;
		} else {
			/* Unconditionally move a slab to the end of the
			 * partial list on free - maximum time for the
			 * other objects to be freed, too.
			 */
			list_add_tail(&page->lru, &n->slabs_partial);
		}
	}

	/*
	 * Since ac->batchcount objects are freed in one go, the node may now
	 * hold too many free objects; if so, try to release some free slab pages.
	 */
	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
		/* at least one free slab page exists here, so this never goes negative */
		n->free_objects -= cachep->num;

		page = list_last_entry(&n->slabs_free, struct page, lru);
		/* detach the slab page from the slab lists and add it to list */
		list_move(&page->lru, list);
		n->free_slabs--;
		n->total_slabs--;
	}
}
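The slab_put_obj() call above is where the freed object's index re-enters the slab's freelist. Its body is not quoted in this note, but based on the comment above it can be modeled as a small index stack: allocation consumes freelist[active] and increments active, freeing decrements active and stores the object's objnr back at freelist[active]. The sketch below is a user-space toy under that assumption, not the kernel implementation.

// User-space toy of the freelist index stack behind slab_get_obj/slab_put_obj.
#include <stdio.h>

#define OBJS_PER_SLAB 4

struct toy_slab {
	unsigned int active;			/* objects currently allocated */
	unsigned char freelist[OBJS_PER_SLAB];	/* indices of free objects     */
};

/* allocate: take the index stored at freelist[active], then active++ */
static unsigned int toy_get_obj(struct toy_slab *s)
{
	return s->freelist[s->active++];
}

/* free: active--, then store this object's index at freelist[active] */
static void toy_put_obj(struct toy_slab *s, unsigned int objnr)
{
	s->freelist[--s->active] = objnr;
}

int main(void)
{
	struct toy_slab s = { .active = 0, .freelist = { 0, 1, 2, 3 } };

	unsigned int a = toy_get_obj(&s), b = toy_get_obj(&s);
	toy_put_obj(&s, a);			/* a's index goes back on the stack */
	printf("active=%u, next alloc would reuse objnr %u\n",
	       s.active, s.freelist[s.active]);	/* active=1, reuses a (=0) */
	(void)b;
	return 0;
}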

This completes the analysis of the slab free path, and with it the study of the slab system comes to an end for now. Next I plan to look into vmalloc. Keep it up!
