jemalloc 内存分配器完全指南 / 08 - 基准测试

第 8 章：基准测试

8.1 基准测试概述

内存分配器的基准测试需要评估以下维度：

维度	指标	工具/方法
吞吐量	ops/sec (malloc+free/s)	多线程循环计时
延迟	p50/p99/p999 延迟	高精度计时
碎片率	RSS / 实际使用量	RSS 监控
可扩展性	多线程下的性能衰减	不同线程数测试
CPU 开销	perf 上报的热点	perf record

8.2 自定义基准测试框架

8.2.1 多线程吞吐量测试

// bench_throughput.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <time.h>
#include <getopt.h>

#define MAX_THREADS  128

/* Per-thread benchmark parameters plus a result slot; one instance is handed
 * to each bench_worker thread. */
typedef struct {
    int id;          /* thread index assigned by main */
    int n_ops;       /* number of malloc calls the worker performs */
    int obj_size;    /* size in bytes of every allocation */
    double elapsed;  /* wall time of the worker's run in seconds, written by the worker */
} thread_arg_t;

static void *bench_worker(void *arg) {
    thread_arg_t *a = (thread_arg_t *)arg;
    void **ptrs = malloc(a->n_ops * sizeof(void *));
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);

    // 交替 malloc/free 模拟真实场景
    for (int i = 0; i < a->n_ops; i++) {
        ptrs[i] = malloc(a->obj_size);
        if (ptrs[i]) memset(ptrs[i], 0xAB, a->obj_size);
        // 每分配 4 个释放 1 个（模拟真实分配模式）
        if (i >= 4 && (i % 4 == 0)) {
            free(ptrs[i - 4]);
            ptrs[i - 4] = NULL;
        }
    }

    // 释放剩余
    for (int i = 0; i < a->n_ops; i++) {
        if (ptrs[i]) free(ptrs[i]);
    }

    clock_gettime(CLOCK_MONOTONIC, &t1);
    a->elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;

    free(ptrs);
    return NULL;
}

/* Upper bounds accepted on the command line; they keep every later size
 * computation comfortably inside the range of int. */
enum {
    MAX_N_OPS    = 1000000000,
    MAX_OBJ_SIZE = 1 << 30
};

/*
 * Parses s as a base-10 integer in [lo, hi].  Returns 1 and stores the value
 * in *out on success; returns 0 and leaves *out untouched when s is not a
 * complete number or lies outside the range (strtol saturates on overflow,
 * which also falls outside the range).
 */
static int parse_int_in_range(const char *s, long lo, long hi, int *out) {
    char *end;
    long v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || v < lo || v > hi) {
        return 0;
    }
    *out = (int)v;
    return 1;
}

/*
 * Multi-threaded malloc/free throughput benchmark.
 *
 * Options: -t <threads> (1..MAX_THREADS, default 8)
 *          -n <allocations per thread> (default 100000)
 *          -s <object size in bytes> (default 256)
 *
 * Starts the workers, waits for them, and prints the aggregate throughput.
 * Returns 0 on success, 1 on invalid arguments or if a thread cannot be
 * created.
 */
int main(int argc, char *argv[]) {
    int n_threads = 8;
    int n_ops     = 100000;
    int obj_size  = 256;

    int opt;
    while ((opt = getopt(argc, argv, "t:n:s:")) != -1) {
        int ok;
        switch (opt) {
            /* n_threads indexes two MAX_THREADS-sized arrays below, so it
             * must be range-checked here. */
            case 't': ok = parse_int_in_range(optarg, 1, MAX_THREADS, &n_threads); break;
            case 'n': ok = parse_int_in_range(optarg, 1, MAX_N_OPS, &n_ops); break;
            case 's': ok = parse_int_in_range(optarg, 0, MAX_OBJ_SIZE, &obj_size); break;
            default:  ok = 0; break;
        }
        if (!ok) {
            fprintf(stderr,
                    "Usage: %s [-t threads (1-%d)] [-n ops_per_thread] [-s obj_size]\n",
                    argv[0], MAX_THREADS);
            return 1;
        }
    }

    pthread_t threads[MAX_THREADS];
    thread_arg_t args[MAX_THREADS];

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    int started = 0;
    for (int i = 0; i < n_threads; i++) {
        args[i] = (thread_arg_t){i, n_ops, obj_size, 0};
        int rc = pthread_create(&threads[i], NULL, bench_worker, &args[i]);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed for thread %d: %s\n",
                    i, strerror(rc));
            break;
        }
        started++;
    }
    /* Join only the threads that were actually created. */
    for (int i = 0; i < started; i++) {
        pthread_join(threads[i], NULL);
    }
    if (started != n_threads) {
        return 1;
    }

    clock_gettime(CLOCK_MONOTONIC, &t1);
    double total = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    double total_ops = (double)n_threads * n_ops * 2; /* each op = one malloc + one free */

    printf("threads=%d obj_size=%d elapsed=%.3fs throughput=%.1f Mops/s\n",
           n_threads, obj_size, total, total_ops / total / 1e6);

    return 0;
}

# Build the throughput benchmark
gcc -O2 -g -o bench_throughput bench_throughput.c -lpthread

# Baseline: default malloc
./bench_throughput -t 8 -n 200000 -s 256

# Same workload with jemalloc preloaded
env LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 ./bench_throughput -t 8 -n 200000 -s 256

# Same workload with tcmalloc preloaded (must be installed)
env LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 ./bench_throughput -t 8 -n 200000 -s 256

8.2.2 延迟分布测试

// bench_latency.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>

#define N_SAMPLES  1000000

/* Returns the current CLOCK_MONOTONIC reading converted to nanoseconds. */
static inline uint64_t now_ns() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    uint64_t whole_seconds_ns = (uint64_t)now.tv_sec * 1000000000ULL;
    return whole_seconds_ns + now.tv_nsec;
}

/*
 * qsort comparator for uint64_t elements in ascending order.
 * Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
 */
static int cmp_u64(const void *a, const void *b) {
    const uint64_t *lhs = a;
    const uint64_t *rhs = b;

    if (*lhs < *rhs) {
        return -1;
    }
    if (*lhs > *rhs) {
        return 1;
    }
    return 0;
}

/*
 * Measures the latency of N_SAMPLES individual malloc(256) calls and prints
 * the p50/p90/p99/p999 percentiles, the maximum and the arithmetic mean, all
 * in nanoseconds.  Returns 0 on success, 1 if the sample buffer cannot be
 * allocated.
 */
int main(void) {
    uint64_t *latencies = malloc(N_SAMPLES * sizeof *latencies);
    if (latencies == NULL) {
        fprintf(stderr, "cannot allocate %d latency samples\n", N_SAMPLES);
        return 1;
    }

    /* Accumulated while sampling: once the array is sorted, the middle
     * element is the median, not the mean. */
    double sum_ns = 0.0;

    /* Collect one latency sample per malloc call. */
    for (int i = 0; i < N_SAMPLES; i++) {
        uint64_t t0 = now_ns();
        void *p = malloc(256);
        uint64_t t1 = now_ns();

        /* Use the block through a volatile pointer (outside the timed window):
         * an optimizing compiler may otherwise drop a malloc/free pair whose
         * result is never used, leaving nothing to measure. */
        if (p != NULL) {
            *(volatile char *)p = 0;
        }

        latencies[i] = t1 - t0;
        sum_ns += (double)latencies[i];
        free(p);
    }

    /* Sort so that percentiles can be read by index. */
    qsort(latencies, N_SAMPLES, sizeof(uint64_t), cmp_u64);

    printf("=== malloc(256) Latency (ns) ===\n");
    printf("p50:   %6" PRIu64 "\n", latencies[N_SAMPLES * 50  / 100]);
    printf("p90:   %6" PRIu64 "\n", latencies[N_SAMPLES * 90  / 100]);
    printf("p99:   %6" PRIu64 "\n", latencies[N_SAMPLES * 99  / 100]);
    printf("p999:  %6" PRIu64 "\n", latencies[N_SAMPLES * 999 / 1000]);
    printf("max:   %6" PRIu64 "\n", latencies[N_SAMPLES - 1]);
    printf("mean:  %6.0f\n", sum_ns / N_SAMPLES);

    free(latencies);
    return 0;
}

# Build the latency benchmark
gcc -O2 -o bench_latency bench_latency.c -lpthread

# Baseline run with the system (glibc) allocator
echo "=== glibc malloc ==="
./bench_latency

# Same binary with jemalloc preloaded
echo "=== jemalloc ==="
env LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 ./bench_latency

典型输出：

=== glibc malloc ===
p50:     45
p90:    120
p99:    380
p999:  1200
max:  12500

=== jemalloc ===
p50:     28
p90:     85
p99:    210
p999:   650
max:   4200

8.3 碎片率测试

// bench_fragmentation.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>

#ifdef USE_JEMALLOC
#include <jemalloc/jemalloc.h>
#endif

#define N_ALLOCS     100000
#define MIN_SIZE     16
#define MAX_SIZE     8192

/*
 * Fragmentation experiment: allocates N_ALLOCS blocks of random size in
 * [MIN_SIZE, MAX_SIZE], frees roughly 70% of them at random, then prints the
 * allocator's own accounting (glibc mallinfo) next to the number of bytes the
 * program asked for.  With USE_JEMALLOC defined it additionally dumps
 * jemalloc's statistics.  Always returns 0.
 */
int main(void) {
    /* static storage: two 100000-element tables are too large to place on
     * the stack safely, and keeping them off the heap avoids disturbing the
     * allocator under test. */
    static void *ptrs[N_ALLOCS];
    static size_t sizes[N_ALLOCS];   /* requested size per slot, 0 if the allocation failed */
    size_t total_requested = 0;

    /* Random allocations; the fixed seed makes runs repeatable. */
    srand(42);
    for (int i = 0; i < N_ALLOCS; i++) {
        size_t sz = MIN_SIZE + rand() % (MAX_SIZE - MIN_SIZE + 1);
        ptrs[i] = malloc(sz);
        if (ptrs[i] == NULL) {
            continue;   /* do not count bytes that were never obtained */
        }
        memset(ptrs[i], 0xAB, sz);
        sizes[i] = sz;
        total_requested += sz;
    }

    /* Free about 70% of the blocks at random to create holes. */
    size_t live_requested = total_requested;
    for (int i = 0; i < N_ALLOCS; i++) {
        if (rand() % 100 < 70) {
            free(ptrs[i]);
            ptrs[i] = NULL;
            live_requested -= sizes[i];
        }
    }

    /* Allocator statistics.  The mallinfo fields are int (hence %d below). */
    /* NOTE(review): when linked with -ljemalloc, mallinfo() presumably still
     * describes glibc's own heap rather than jemalloc's; confirm, and rely
     * on the jemalloc statistics printed below in that build. */
    struct mallinfo mi = mallinfo();

    printf("=== Fragmentation Report ===\n");
    printf("Total requested:  %12zu bytes (%.1f MB)\n",
           total_requested, total_requested / 1048576.0);
    printf("Live requested:   %12zu bytes (%.1f MB)\n",
           live_requested, live_requested / 1048576.0);
    printf("Arena (from OS):  %12d bytes (%.1f MB)\n",
           mi.arena, mi.arena / 1048576.0);
    printf("In-use:           %12d bytes (%.1f MB)\n",
           mi.uordblks, mi.uordblks / 1048576.0);
    printf("Free in arena:    %12d bytes (%.1f MB)\n",
           mi.fordblks, mi.fordblks / 1048576.0);

    /* Fragmentation = (arena - in-use) / arena.  This uses the allocator's
     * arena size as reported by mallinfo, not the process RSS. */
    double frag_ratio = mi.arena > 0 ?
        (double)(mi.arena - mi.uordblks) / mi.arena * 100 : 0;
    printf("Fragmentation:    %.1f%%\n", frag_ratio);

#ifdef USE_JEMALLOC
    printf("\n=== jemalloc Detailed Stats ===\n");
    /* NOTE(review): the je_ prefix is presumably only available when jemalloc
     * was configured with --with-jemalloc-prefix=je_; an unprefixed build
     * exposes malloc_stats_print instead. Confirm against the installed
     * library. */
    je_malloc_stats_print(NULL, NULL, NULL);
#endif

    /* Clean up the blocks that survived the random free pass. */
    for (int i = 0; i < N_ALLOCS; i++) {
        free(ptrs[i]);
    }

    return 0;
}

# glibc malloc: leave USE_JEMALLOC undefined. The source tests it with #ifdef,
# so even -DUSE_JEMALLOC=0 would switch the jemalloc code path on.
gcc -O2 -o bench_frag bench_fragmentation.c
./bench_frag

# jemalloc: define USE_JEMALLOC and link against libjemalloc
gcc -O2 -DUSE_JEMALLOC -o bench_frag_jemalloc bench_fragmentation.c -ljemalloc
./bench_frag_jemalloc

8.4 使用 memtier_benchmark 测试 Redis

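Redis 是 jemalloc 最重要的用户之一。使用 memtier_benchmark 测试不同分配器的 Redis 性能。注意：Redis 在 Linux 上默认就使用 jemalloc 编译，此时 LD_PRELOAD 不会替换它的分配器；要得到真正的 glibc 基线，需要使用以 `make MALLOC=libc` 编译的 redis-server（可用 `redis-cli info memory` 中的 mem_allocator 字段确认）：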

# Install memtier_benchmark
sudo apt install memtier-benchmark  # or build it from source

# Start the baseline Redis (expected to use glibc malloc)
redis-server --daemonize yes --port 6379
# Wait until the daemonized server accepts connections
until redis-cli -p 6379 ping > /dev/null 2>&1; do sleep 1; done
# Show the allocator this redis-server was compiled with; if it reports
# jemalloc, this run is not a glibc baseline.
redis-cli -p 6379 info memory | grep mem_allocator
memtier_benchmark -p 6379 --protocol=redis --data-size=256 \
  --key-maximum=100000 --threads=8 --clients=50 --test-time=30

# Restart Redis with jemalloc preloaded.
# Shut down through redis-cli without writing a snapshot, then wait until the
# old server has stopped answering so the new one can bind port 6379.
redis-cli -p 6379 shutdown nosave
while redis-cli -p 6379 ping > /dev/null 2>&1; do sleep 1; done
LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 \
MALLOC_CONF="narenas:4,background_thread:true" \
redis-server --daemonize yes --port 6379
until redis-cli -p 6379 ping > /dev/null 2>&1; do sleep 1; done
memtier_benchmark -p 6379 --protocol=redis --data-size=256 \
  --key-maximum=100000 --threads=8 --clients=50 --test-time=30

预期结果对比

指标	glibc malloc	jemalloc	提升
Ops/sec	~120,000	~180,000	+50%
p50 latency	0.8ms	0.5ms	-37%
p99 latency	2.1ms	1.2ms	-43%
RSS (1GB 数据)	~1.4GB	~1.1GB	-21%

注意：实际数据取决于硬件配置和工作负载。

8.5 综合对比测试脚本

#!/bin/bash
# bench_all.sh - compare bench_throughput results across memory allocators

BENCH=./bench_throughput
THREAD_COUNTS="1 2 4 8 16"

# ALLOCATORS[i] is the label for the library in PRELOADS[i];
# an empty preload entry means "run with the system allocator".
ALLOCATORS=("glibc" "jemalloc" "tcmalloc" "mimalloc")
PRELOADS=("" \
    "/usr/local/lib/libjemalloc.so.2" \
    "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" \
    "/usr/lib/x86_64-linux-gnu/libmimalloc.so")

if [ ! -x "$BENCH" ]; then
    echo "error: $BENCH not found or not executable; build it first" >&2
    exit 1
fi

echo "=== Memory Allocator Benchmark ==="
echo "Threads: $THREAD_COUNTS"
echo ""

for idx in "${!ALLOCATORS[@]}"; do
    name="${ALLOCATORS[$idx]}"
    preload="${PRELOADS[$idx]}"
    echo "--- $name ---"

    # A missing library only triggers a loader warning and the benchmark then
    # runs with the default allocator, so skip it instead of reporting
    # mislabeled numbers.
    if [ -n "$preload" ] && [ ! -e "$preload" ]; then
        echo "  skipped: $preload not found"
        echo ""
        continue
    fi

    for threads in $THREAD_COUNTS; do
        if [ -n "$preload" ]; then
            result=$(LD_PRELOAD="$preload" \
                "$BENCH" -t "$threads" -n 100000 -s 256 2>&1)
        else
            result=$("$BENCH" -t "$threads" -n 100000 -s 256 2>&1)
        fi
        printf "  threads=%-2d  %s\n" "$threads" "$result"
    done
    echo ""
done

8.6 RSS 监控脚本

#!/bin/bash
# monitor_rss.sh - print a process's RSS and virtual size once per second
# Usage: monitor_rss.sh <pid>

PID=$1
if [ -z "$PID" ]; then
    echo "Usage: $0 <pid>"
    exit 1
fi

echo "Monitoring RSS for PID $PID (Ctrl+C to stop)"
echo "Time(s)    RSS(MB)    VSZ(MB)"

# Reference point for the Time(s) column
start_time=$(date +%s)

while true; do
    if [ ! -d "/proc/$PID" ]; then
        echo "Process $PID exited"
        break
    fi
    # VmRSS / VmSize are reported in kB
    rss=$(awk '/VmRSS/{print $2}' "/proc/$PID/status" 2>/dev/null)
    vsz=$(awk '/VmSize/{print $2}' "/proc/$PID/status" 2>/dev/null)
    elapsed=$(($(date +%s) - start_time))
    # The process can vanish between the check above and the reads; default
    # to 0 so the arithmetic stays valid and the next pass reports the exit.
    printf "%-10d %-10s %-10s\n" \
        "$elapsed" $(( ${rss:-0} / 1024 )) $(( ${vsz:-0} / 1024 ))
    sleep 1
done

8.7 性能分析工具

perf

# Sample CPU hot spots (with call graphs) for 30 seconds.
# pgrep -d, joins several matching PIDs with commas, the list form perf -p accepts.
perf record -g -p "$(pgrep -d, my_server)" -- sleep 30
perf report

# Count mmap/brk system calls for 10 seconds, i.e. how often the process asks
# the kernel for memory. This is not the malloc/free call rate.
perf stat -e 'syscalls:sys_enter_mmap,syscalls:sys_enter_brk' \
  -p "$(pgrep -d, my_server)" -- sleep 10

Flame Graph

# Fetch the FlameGraph scripts (needed to render the flame graph)
git clone https://github.com/brendangregg/FlameGraph.git

# Record call stacks of the server for 30 seconds
perf record -g -p $(pgrep my_server) -- sleep 30

# Fold the recorded stacks and render them as an SVG
perf script \
  | FlameGraph/stackcollapse-perf.pl \
  | FlameGraph/flamegraph.pl > flame.svg

8.8 测试注意事项

要点	说明
关闭 ASLR	`echo 0 > /proc/sys/kernel/randomize_va_space`（可选，减少波动）
固定 CPU 频率	`cpupower frequency-set -g performance`
多轮取中位数	至少运行 5 轮，取中位数（比均值更不易受个别异常值影响）
预热	先跑一轮预热，使缓存和页表稳定
隔离环境	关闭不必要的后台进程
相同编译选项	`-O2 -g` 确保公平比较

8.9 本章小结

测试维度	关键指标	推荐工具
吞吐量	Mops/sec	自定义 bench
延迟	p50/p99/p999	自定义 bench + clock_gettime
碎片率	RSS / requested	mallinfo + RSS 监控
可扩展性	不同线程数的 ops	自定义 bench
实际应用	ops/sec, latency	memtier_benchmark

扩展阅读

上一章：第 7 章：系统集成 下一章：第 9 章：Docker 容器化