	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. 'size' contains the total
	 * object size including these internal fields, while 'obj_offset'
	 * and 'object_size' contain the offset to the user object and its
	 * size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
// During the boot phase the per-cpu cache is set up in setup_cpu_cache(),
// with both limit and batchcount equal to 1; the shared cache is not
// created at that point.
// Once initialization is complete, the per-cpu cache and the shared cache
// are re-initialized in kmem_cache_init_late()->enable_cpucache(), where
// limit, batchcount, etc. are chosen according to the cache's object size,
// and whether a shared cache is created depends on the object size and the
// number of CPUs.
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};
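To make the LIFO behaviour concrete, here is a minimal userspace sketch (not kernel code; the toy_array_cache type and the warm_push()/warm_pop() helpers are invented for illustration). A free pushes the object pointer at index avail, and the next allocation pops the most recently freed, still cache-warm pointer:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct array_cache (illustration only). */
struct toy_array_cache {
	unsigned int avail;   /* number of cached object pointers */
	unsigned int limit;   /* capacity of entry[] */
	void *entry[];        /* LIFO stack of free objects */
};

/* A free pushes the object on top of the stack. */
static int warm_push(struct toy_array_cache *ac, void *obj)
{
	if (ac->avail >= ac->limit)
		return -1;              /* full; the kernel would flush a batch instead */
	ac->entry[ac->avail++] = obj;
	return 0;
}

/* An allocation pops the most recently freed (cache-warm) object. */
static void *warm_pop(struct toy_array_cache *ac)
{
	if (ac->avail == 0)
		return NULL;            /* empty; the kernel would refill a batch */
	return ac->entry[--ac->avail];
}

int main(void)
{
	struct toy_array_cache *ac = malloc(sizeof(*ac) + 4 * sizeof(void *));
	int a, b;

	ac->avail = 0;
	ac->limit = 4;
	warm_push(ac, &a);
	warm_push(ac, &b);
	/* LIFO: &b (freed last, still cache-warm) comes back first. */
	printf("%d\n", warm_pop(ac) == (void *)&b);
	free(ac);
	return 0;
}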
/*
 * Initialisation. Called after the page allocator have been initialised and
 * before smp_init().
 */
void __init kmem_cache_init(void)
{
	int i;

	// kmem_cache_boot is a statically defined slab cache of type
	// struct kmem_cache; kmem_cache is the global pointer to it
	kmem_cache = &kmem_cache_boot;
	if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
		use_alien_caches = 0;

	// The kernel statically defines NUM_INIT_LISTS kmem_cache_node
	// structures; here NUM_INIT_LISTS is 2
	for (i = 0; i < NUM_INIT_LISTS; i++)
		// initialize the statically defined kmem_cache_node structures
		kmem_cache_node_init(&init_kmem_cache_node[i]);
	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory if
	 * not overridden on the command line.
	 */
	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;
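As a quick sanity check of the threshold above, this standalone snippet evaluates (32 << 20) >> PAGE_SHIFT, assuming 4 KiB pages (PAGE_SHIFT == 12, which is not part of the excerpt):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

int main(void)
{
	/* (32 << 20) is 32 MiB in bytes; shifting by PAGE_SHIFT converts it to pages. */
	unsigned long threshold_pages = (32UL << 20) >> PAGE_SHIFT;

	printf("threshold = %lu pages (%lu MiB)\n",
	       threshold_pages, (threshold_pages << PAGE_SHIFT) >> 20);
	/* prints: threshold = 8192 pages (32 MiB) */
	return 0;
}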
	/*
	 * Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;
/**
 * __kmem_cache_create - Create a cache.
 * @cachep: cache management descriptor
 * @flags: SLAB flags
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
	size_t ralign = BYTES_PER_WORD;
	gfp_t gfp;
	int err;
	unsigned int size = cachep->size;
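For reference, this is roughly how a driver creates a cache with the flags documented above (a minimal sketch; struct foo and the cache name are made up, and the poison/red-zone flags only take effect on CONFIG_DEBUG_SLAB builds):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Hypothetical object type and cache name, for illustration only. */
struct foo {
	int id;
	char payload[120];
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* SLAB_HWCACHE_ALIGN pads objects to a cache line; the debug flags
	 * are honoured only on debug (CONFIG_DEBUG_SLAB) kernels. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_POISON |
				       SLAB_RED_ZONE, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static void __exit foo_cache_exit(void)
{
	kmem_cache_destroy(foo_cachep);
}

module_init(foo_cache_init);
module_exit(foo_cache_exit);
MODULE_LICENSE("GPL");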
	// Set the debug flags (SLAB_RED_ZONE, SLAB_STORE_USER, SLAB_POISON)
	// depending on the conditions below
#if DEBUG
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if (size < 4096 || fls(size - 1) == fls(size - 1 + REDZONE_ALIGN +
						2 * sizeof(unsigned long long)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_TYPESAFE_BY_RCU))
		flags |= SLAB_POISON;
#endif
#endif
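For objects of 4096 bytes and up, the fls() test alone decides: it asks whether adding the red-zone and store-user overhead would push the object past its current power-of-two bucket. The standalone sketch below replays that check with a portable fls() substitute, assuming a 64-bit build where the overhead works out to 24 bytes (REDZONE_ALIGN + 2 * sizeof(unsigned long long)):

#include <stdio.h>

/* Portable stand-in for the kernel's fls(): 1-based index of the highest set bit. */
static int my_fls(unsigned int x)
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

int main(void)
{
	/* Assumed 64-bit overhead: REDZONE_ALIGN (8) + 2 * sizeof(unsigned long long) (16). */
	const unsigned int overhead = 24;
	unsigned int sizes[] = { 5000, 8180, 12288, 16370 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned int size = sizes[i];
		int same_bucket = my_fls(size - 1) == my_fls(size - 1 + overhead);

		/* 5000 and 12288 stay within their bucket, so debugging is enabled;
		 * 8180 and 16370 would spill past it, so it is not. */
		printf("size %5u: debug overhead %s the power-of-two bucket\n",
		       size, same_bucket ? "stays within" : "spills past");
	}
	return 0;
}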
	// --------------------------------------------------------------------------------------------- (1)
	// Align the object size up to a word boundary
	/*
	 * Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	size = ALIGN(size, BYTES_PER_WORD);

	// update the required alignment of the objects
	if (flags & SLAB_RED_ZONE) {
		// #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		// align the object size once more, this time to REDZONE_ALIGN
		size = ALIGN(size, REDZONE_ALIGN);
	}
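ALIGN(x, a) simply rounds x up to the next multiple of the power-of-two a. A tiny userspace demonstration of that rounding, assuming 8-byte words:

#include <stdio.h>

/* Same round-up-to-power-of-two idiom as the kernel's ALIGN() macro. */
#define MY_ALIGN(x, a)       (((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define BYTES_PER_WORD_DEMO  8   /* assumed 64-bit word size */

int main(void)
{
	/* 13 -> 16, 24 -> 24, 33 -> 40: already-aligned sizes are unchanged. */
	printf("%lu %lu %lu\n",
	       MY_ALIGN(13, BYTES_PER_WORD_DEMO),
	       MY_ALIGN(24, BYTES_PER_WORD_DEMO),
	       MY_ALIGN(33, BYTES_PER_WORD_DEMO));
	return 0;
}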
	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	// store the final alignment in the cache
	cachep->align = ralign;
	// initialize the slab colouring offset
	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;

	// check whether the slab allocator is already up (slab_state >= UP)
	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;
	// update obj_offset and size according to the debug flags
#if DEBUG
	/*
	 * Both debugging options require word-alignment which is calculated
	 * into align above.
	 */
	if (flags & SLAB_RED_ZONE) {
		/* add space for red zone words */
		cachep->obj_offset += sizeof(unsigned long long);
		size += 2 * sizeof(unsigned long long);
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires one word storage behind the end of
		 * the real object. But if the second red zone needs to be
		 * aligned to 64 bits, we must allow that much space.
		 */
		if (flags & SLAB_RED_ZONE)
			size += REDZONE_ALIGN;
		else
			size += BYTES_PER_WORD;
	}
#endif

	// a no-op when CONFIG_KASAN is not defined
	kasan_cache_create(cachep, &size, &flags);

	// align the (possibly grown) object size once more
	size = ALIGN(size, cachep->align);
	/*
	 * We should restrict the number of objects in a slab to implement
	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
	 */
	// See the comment on SLAB_OBJ_MIN_SIZE. In short: when a single byte
	// is used as the freelist index, it can only address 2^8 objects
	// within one slab. The object size therefore must not be too small,
	// otherwise a single page could hold more than 2^8 objects.
	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
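To see why a one-byte freelist index implies a minimum object size, the small calculation below derives the lower bound for a few example slab sizes (assuming 4 KiB pages; the orders shown are arbitrary examples, not the kernel's limits):

#include <stdio.h>

#define PAGE_SIZE_DEMO   4096u   /* assumed 4 KiB pages */
#define BYTE_INDEX_MAX   256u    /* a one-byte freelist index addresses at most 2^8 objects */

int main(void)
{
	/* For each example slab order, the smallest object size that still
	 * keeps the object count within the one-byte index range. */
	for (unsigned int order = 0; order <= 3; order++) {
		unsigned int slab_bytes = PAGE_SIZE_DEMO << order;
		unsigned int min_obj = (slab_bytes + BYTE_INDEX_MAX - 1) / BYTE_INDEX_MAX;

		printf("order %u (%5u bytes): objects must be >= %u bytes\n",
		       order, slab_bytes, min_obj);
	}
	return 0;
}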
	// --------------------------------------------------------------------------------------------- (2)
#if DEBUG
	/*
	 * To activate debug pagealloc, off-slab management is necessary
	 * requirement. In early phase of initialization, small sized slab
	 * doesn't get initialized so it would not be possible. So, we need
	 * to check size >= 256. It guarantees that all necessary small
	 * sized slab is initialized in current slab initialization sequence.
	 */
	if (size >= 256 && cachep->object_size > cache_line_size()) {
		if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
			size_t tmp_size = ALIGN(size, PAGE_SIZE);

			if (set_off_slab_cache(cachep, tmp_size, flags)) {
				flags |= CFLGS_OFF_SLAB;
				cachep->obj_offset += tmp_size - size;
				size = tmp_size;
				goto done;
			}
		}
	}
#endif

	// Case 1: the whole freelist (all freelist_idx_t entries) is stored
	// inside one unused object of the slab.
	// Rejected when:
	//   if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) return false;
	if (set_objfreelist_slab_cache(cachep, size, flags)) {
		flags |= CFLGS_OBJFREELIST_SLAB;
		goto done;
	}

	// Case 2: the freelist is stored in a separate (off-slab) cache.
	// Rejected when:
	//   if (left >= cachep->num * sizeof(freelist_idx_t)) return false;
	if (set_off_slab_cache(cachep, size, flags)) {
		flags |= CFLGS_OFF_SLAB;
		goto done;
	}

	// Case 3: the basic case, the freelist lives on the slab itself,
	// after the objects.
	if (set_on_slab_cache(cachep, size, flags))
		goto done;
/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent. For now, try to avoid using
 * high order pages for slabs. When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
				size_t size, slab_flags_t flags)
{
	size_t left_over = 0;
	int gfporder;
	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;

		// work out how many objects a slab of this order can hold
		num = cache_estimate(gfporder, size, flags, &remainder);
		if (!num)
			continue;
		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
		// once the object count exceeds SLAB_OBJ_MAX_NUM, larger
		// orders cannot help either, so break out of the loop
		if (num > SLAB_OBJ_MAX_NUM)
			break;
		// only relevant when the freelist is kept off-slab
		if (flags & CFLGS_OFF_SLAB) {
			struct kmem_cache *freelist_cache;
			size_t freelist_size;

			freelist_size = num * sizeof(freelist_idx_t);
			freelist_cache = kmalloc_slab(freelist_size, 0u);
			if (!freelist_cache)
				continue;
			/*
			 * Needed to avoid possible looping condition
			 * in cache_grow_begin()
			 */
			if (OFF_SLAB(freelist_cache))
				continue;
			/* check if off slab has enough benefit */
			if (freelist_cache->size > cachep->size / 2)
				continue;
		}
		// this order is acceptable, so record it in the kmem_cache fields
		/* Found something acceptable - save it away */
		cachep->num = num;
		cachep->gfporder = gfporder;
		left_over = remainder;
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;
		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (gfporder >= slab_max_order)
			break;
/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
		slab_flags_t flags, size_t *left_over)
{
	unsigned int num;
	size_t slab_size = PAGE_SIZE << gfporder;
	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - @buffer_size bytes for each object
	 * - One freelist_idx_t for each object
	 *
	 * We don't need to consider alignment of freelist because
	 * freelist will be at the end of slab page. The objects will be
	 * at the correct alignment.
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	// In both of these cases the space taken by the freelist_idx_t array
	// is not counted: in the OBJFREELIST case the whole freelist lives
	// inside one object and is treated like an ordinary object, and in
	// the OFF_SLAB case the freelist lives in other pages entirely.
	if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
		num = slab_size / buffer_size;
		*left_over = slab_size % buffer_size;
	// Otherwise every object is paired with one on-slab freelist_idx_t,
	// so the two are sized together as a unit.
	} else {
		num = slab_size / (buffer_size + sizeof(freelist_idx_t));
		*left_over = slab_size % (buffer_size + sizeof(freelist_idx_t));
	}
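The same arithmetic can be replayed in a standalone snippet to show how much the on-slab freelist costs. The demo_estimate() helper below is an invented stand-in that mirrors the two branches above, assuming 4 KiB pages, a one-byte freelist_idx_t and a 128-byte object:

#include <stdio.h>

#define PAGE_SIZE_DEMO 4096u                 /* assumed page size */
typedef unsigned char demo_freelist_idx_t;   /* assumed one-byte freelist index */

/* Mirrors the two branches of cache_estimate() for illustration. */
static unsigned int demo_estimate(unsigned int order, unsigned int buffer_size,
				  int freelist_on_slab, unsigned int *left_over)
{
	unsigned int slab_size = PAGE_SIZE_DEMO << order;
	unsigned int per_obj = buffer_size;

	if (freelist_on_slab)
		per_obj += sizeof(demo_freelist_idx_t);

	*left_over = slab_size % per_obj;
	return slab_size / per_obj;
}

int main(void)
{
	unsigned int left, num;

	/* Off-slab / objfreelist: the index array costs nothing on this slab. */
	num = demo_estimate(0, 128, 0, &left);
	printf("off-slab: %u objects, %u bytes left over\n", num, left); /* 32, 0 */

	/* On-slab: each object carries one index byte, so fewer objects fit. */
	num = demo_estimate(0, 128, 1, &left);
	printf("on-slab:  %u objects, %u bytes left over\n", num, left); /* 31, 97 */
	return 0;
}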
	/*
	 * Always use on-slab management when SLAB_NOLEAKTRACE
	 * to avoid recursive calls into kmemleak.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		return false;
	/*
	 * Size is large, assume best to place the slab management obj
	 * off-slab (should allow better packing of objs).
	 */
	left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
	if (!cachep->num)
		return false;
	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	// If the pages that hold the objects have enough leftover space to
	// also hold the whole freelist_idx_t array, do not go off-slab.
	if (left >= cachep->num * sizeof(freelist_idx_t))
		return false;
	cachep->colour = left / cachep->colour_off;
	return true;
}
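The colour computation above just counts how many colour_off-sized steps fit into the leftover bytes; when slabs are later grown, their first object is placed at offset 0, colour_off, 2 * colour_off, ... in turn, spreading objects across cache sets. A tiny standalone illustration with arbitrary example numbers (300 leftover bytes, 64-byte cache lines):

#include <stdio.h>

int main(void)
{
	unsigned int left_over  = 300;  /* assumed leftover bytes in the slab */
	unsigned int colour_off = 64;   /* assumed cache line size */
	unsigned int colour     = left_over / colour_off;   /* 4 distinct colours */

	/* Successive slabs cycle their starting offset through the colours,
	 * mimicking the colour_next logic used when a new slab is grown. */
	for (unsigned int slab = 0; slab < 5; slab++)
		printf("slab %u places its first object at offset %u\n",
		       slab, (slab % colour) * colour_off);
	return 0;
}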
set_on_slab_cache()
// ./mm/slab.c
static bool set_on_slab_cache(struct kmem_cache *cachep,
			size_t size, slab_flags_t flags)
{
	size_t left;

	cachep->num = 0;

	left = calculate_slab_order(cachep, size, flags);
	// this is the last resort, so any non-zero object count is accepted
	if (!cachep->num)
		return false;

	cachep->colour = left / cachep->colour_off;

	return true;
}
// ###########################################################################
// init_arraycache()
// ###########################################################################
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
	/*
	 * The array_cache structures contain pointers to free object.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(ac);
	if (ac) {
		ac->avail = 0;
		ac->limit = limit;
		ac->batchcount = batch;
		ac->touched = 0;
	}
}
	/* 6) resize the head arrays to their final sizes */
	mutex_lock(&slab_mutex);
	list_for_each_entry(cachep, &slab_caches, list)
		if (enable_cpucache(cachep, GFP_NOWAIT))
			BUG();
	mutex_unlock(&slab_mutex);

	/* Done! */
	slab_state = FULL;

#ifdef CONFIG_NUMA
	/*
	 * Register a memory hotplug callback that initializes and frees
	 * node.
	 */
	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

	/*
	 * The reap timers are started later, with a module init call: That part
	 * of the kernel is not yet operational.
	 */
}