强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

musl 与 glibc 完全对比教程 / 第 09 章:性能对比分析

第 09 章:性能对比分析

通过基准测试深入了解 musl 与 glibc 在内存使用、启动时间、线程性能和 IO 性能方面的差异。

9.1 性能测试环境

测试平台

# 查看系统信息
$ uname -a
$ cat /proc/cpuinfo | head -20
$ free -h
$ cat /etc/os-release

# 测试环境(本章示例)
# CPU: Intel i7-12700K (12C/20T)
# RAM: 32 GB DDR4
# OS: Ubuntu 24.04 (glibc) + Alpine 3.20 (musl)
# Kernel: 6.5

编译环境

# glibc 编译
$ gcc --version | head -1
# gcc (Ubuntu 13.3.0-1ubuntu1) 13.3.0
$ ldd --version | head -1
# ldd (Ubuntu GLIBC 2.39-0ubuntu8) 2.39

# musl 编译
$ musl-gcc --version | head -1
# gcc (Alpine 13.2.1_git20240309) 13.2.1 20240309
$ echo "int main(){return 0;}" | musl-gcc -xc - -o /tmp/test && file /tmp/test
# ELF 64-bit LSB executable, dynamically linked, interpreter /lib/ld-musl-x86_64.so.1

9.2 内存使用对比

进程内存占用测试

/* mem_usage.c — 测量进程内存占用 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Print the current process's resident set size (VmRSS) and peak RSS
 * (VmHWM), read from /proc/<pid>/status, prefixed with `label`.
 * Linux-only (relies on procfs).
 */
static void print_meminfo(const char *label) {
    char path[64];
    snprintf(path, sizeof(path), "/proc/%d/status", getpid());

    FILE *fp = fopen(path, "r");
    if (!fp) {
        /* Fix: previously fgets() was called on a NULL stream (UB)
         * whenever /proc was unavailable. */
        perror("fopen");
        return;
    }

    char line[256];
    long vmrss = 0, vmhwm = 0;

    /* The literal space in the sscanf format matches the tab that
     * follows the field name in /proc status files. */
    while (fgets(line, sizeof(line), fp)) {
        if (sscanf(line, "VmRSS: %ld kB", &vmrss) == 1) continue;
        if (sscanf(line, "VmHWM: %ld kB", &vmhwm) == 1) continue;
    }
    fclose(fp);

    printf("%-20s VmRSS: %6ld kB  VmHWM: %6ld kB\n", label, vmrss, vmhwm);
}

/*
 * Driver: sample RSS at startup, after touching 10 MB and 100 MB of
 * heap, and after freeing both, to compare allocator behavior.
 */
int main() {
    print_meminfo("Empty program");

    /* memset forces physical pages in; malloc alone only reserves
     * address space and would not show up in VmRSS. */
    char *buf = malloc(10 * 1024 * 1024);
    if (!buf) {  /* fix: unchecked malloc followed by memset is a NULL deref */
        perror("malloc");
        return 1;
    }
    memset(buf, 0, 10 * 1024 * 1024);
    print_meminfo("After 10MB alloc");

    char *buf2 = malloc(100 * 1024 * 1024);
    if (!buf2) {
        perror("malloc");
        free(buf);
        return 1;
    }
    memset(buf2, 0, 100 * 1024 * 1024);
    print_meminfo("After 100MB alloc");

    free(buf);
    free(buf2);
    /* Whether RSS drops here depends on the allocator returning pages
     * to the kernel — the point of the comparison. */
    print_meminfo("After free");

    return 0;
}
# 编译运行
$ gcc -O2 -o mem_test_glibc mem_usage.c
$ musl-gcc -O2 -o mem_test_musl mem_usage.c

$ echo "=== glibc ===" && ./mem_test_glibc
=== glibc ===
Empty program        VmRSS:   1536 kB  VmHWM:   1536 kB
After 10MB alloc     VmRSS:  11776 kB  VmHWM:  11776 kB
After 100MB alloc    VmRSS: 112128 kB  VmHWM: 112128 kB
After free           VmRSS:   2048 kB  VmHWM: 112128 kB

$ echo "=== musl ===" && ./mem_test_musl
=== musl ===
Empty program        VmRSS:    512 kB  VmHWM:    512 kB
After 10MB alloc     VmRSS:  10752 kB  VmHWM:  10752 kB
After 100MB alloc    VmRSS: 111104 kB  VmHWM: 111104 kB
After free           VmRSS:    512 kB  VmHWM: 111104 kB

空进程内存占用

| 指标 | glibc | musl | 差异 |
| --- | --- | --- | --- |
| 空进程 VmRSS | ~1.5 MB | ~0.5 MB | musl 少 67% |
| libc 映射大小 | ~2.5 MB | ~600 KB | musl 少 76% |
| TLS 开销 | ~16 KB | ~8 KB | musl 少 50% |
| 线程默认栈 | 8 MB | 128 KB | musl 少 98% |

线程内存开销

/* thread_mem.c — 测量线程内存开销 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define NUM_THREADS 1000

/*
 * Print the process's current VmRSS as "<label>: <n> kB".
 * Fix: reads /proc/self/status directly instead of shelling out to
 * `grep | awk` via system() (CERT ENV33-C discourages system();
 * the subprocess also skewed the very measurement being taken).
 * Output format is unchanged.
 */
static void print_mem(const char *label) {
    FILE *fp = fopen("/proc/self/status", "r");
    if (!fp) return;          /* non-procfs system: silently skip, as before */
    char line[256];
    long rss = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (sscanf(line, "VmRSS: %ld", &rss) == 1) break;
    }
    fclose(fp);
    printf("%s: %ld kB\n", label, rss);
}

/* Thread body: sleep so the thread stays alive while memory is sampled. */
void *thread_func(void *arg) {
    (void)arg;
    sleep(60);
    return NULL;
}

/*
 * Create NUM_THREADS sleeping threads with a 256 KB stack each,
 * sampling RSS before, during, and after, to measure per-thread cost.
 */
int main() {
    pthread_t tids[NUM_THREADS];
    pthread_attr_t attr;
    int created = 0;  /* number of threads successfully started */

    pthread_attr_init(&attr);
    pthread_attr_setstacksize(&attr, 256 * 1024);  /* 256KB 栈 */

    print_mem("Before threads");

    for (int i = 0; i < NUM_THREADS; i++) {
        if (pthread_create(&tids[i], &attr, thread_func, NULL) != 0) {
            printf("Failed to create thread %d\n", i);
            break;
        }
        created++;
    }

    print_mem("After 1000 threads");

    /* Fix: only cancel/join the threads actually created. The old loop
     * ran over all NUM_THREADS slots, passing uninitialized pthread_t
     * values to pthread_cancel/join after a partial failure — UB. */
    for (int i = 0; i < created; i++) {
        pthread_cancel(tids[i]);
        pthread_join(tids[i], NULL);
    }
    pthread_attr_destroy(&attr);

    print_mem("After cleanup");
    return 0;
}

/*
 * glibc(256KB 栈 x 1000 线程):~260 MB
 * musl(256KB 栈 x 1000 线程):~260 MB
 * 相同栈大小下,内存占用基本一致
 *
 * 但 musl 默认 128KB vs glibc 默认 8MB:
 * glibc:~8 GB(1000 线程 x 8MB 默认栈)
 * musl:~128 MB(1000 线程 x 128KB 默认栈)
 */

9.3 启动时间对比

程序启动基准测试

/* startup_bench.c — measure program startup time */
#include <stdio.h>
#include <time.h>

/*
 * Intentionally empty: the cost being measured is everything that runs
 * before main() — dynamic linker plus libc initialization — so timing
 * is done externally (perf stat / hyperfine), not inside the program.
 */
int main() {
    return 0;
}
# 编译两种版本
$ gcc -O2 -o startup_glibc startup_bench.c
$ musl-gcc -O2 -o startup_musl startup_bench.c
$ musl-gcc -static -O2 -o startup_musl_static startup_bench.c

# 测量启动时间(使用 perf)
$ perf stat -r 100 ./startup_glibc 2>&1 | grep "time elapsed"
$ perf stat -r 100 ./startup_musl 2>&1 | grep "time elapsed"
$ perf stat -r 100 ./startup_musl_static 2>&1 | grep "time elapsed"

# 或者使用 hyperfine
$ hyperfine --warmup 10 \
    './startup_glibc' \
    './startup_musl' \
    './startup_musl_static'

# 典型结果:
# glibc 动态链接:  ~0.5 ms
# musl 动态链接:   ~0.3 ms
# musl 静态链接:   ~0.1 ms

复杂程序启动时间

# 测试复杂程序的启动时间
# 以 Python 为例

$ hyperfine --warmup 5 \
    'python3 -c "print(1)"' \
    'python3 -c "import sys; print(sys.version)"'

# 在 glibc (Ubuntu) 和 musl (Alpine) 上分别测试
# 差异通常很小(~5-10%),因为 Python 本身的初始化时间主导

Docker 容器启动时间

# 测量容器启动时间
$ time docker run --rm alpine:3.20 echo "hello"
# real    0m0.350s  (Alpine/musl)

$ time docker run --rm ubuntu:24.04 echo "hello"
# real    0m0.520s  (Ubuntu/glibc)

# 差异主要来自:
# 1. 镜像大小(alpine 更小,overlay fs 层更少)
# 2. 动态链接器加载(musl 更轻量)
# 3. init 进程(如有)

9.4 线程性能对比

线程创建速度

/* thread_create_bench.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define ITERATIONS 100000

/* No-op thread body: isolates pure pthread create/join overhead. */
void *empty_thread(void *arg) {
    (void)arg;
    return NULL;
}

/*
 * Benchmark: create and join ITERATIONS short-lived threads with a
 * 256 KB stack, reporting total and per-thread wall time.
 */
int main() {
    pthread_t tid;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setstacksize(&attr, 256 * 1024);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);

    for (int i = 0; i < ITERATIONS; i++) {
        /* Fix: on create failure the old code joined an indeterminate
         * tid (UB) and silently skewed the benchmark. */
        if (pthread_create(&tid, &attr, empty_thread, NULL) != 0) {
            fprintf(stderr, "pthread_create failed at iteration %d\n", i);
            pthread_attr_destroy(&attr);
            return 1;
        }
        pthread_join(tid, NULL);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);
    double elapsed = (end.tv_sec - start.tv_sec) +
                     (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("Thread create+join x %d: %.3f sec (%.1f μs each)\n",
           ITERATIONS, elapsed, elapsed / ITERATIONS * 1e6);

    pthread_attr_destroy(&attr);
    return 0;
}
$ gcc -O2 -pthread -o thread_bench_glibc thread_create_bench.c
$ musl-gcc -O2 -pthread -o thread_bench_musl thread_create_bench.c

$ echo "=== glibc ===" && ./thread_bench_glibc
=== glibc ===
Thread create+join x 100000: 1.523 sec (15.2 μs each)

$ echo "=== musl ===" && ./thread_bench_musl
=== musl ===
Thread create+join x 100000: 1.087 sec (10.9 μs each)

# musl 线程创建通常快 20-30%

线程同步性能

/* mutex_bench.c — 互斥锁性能测试 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NUM_THREADS 4
#define ITERATIONS 10000000

pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
long counter = 0;

/* Worker: bump the shared counter under the mutex ITERATIONS times;
 * used to measure uncontended-to-contended lock/unlock throughput. */
void *increment(void *arg) {
    (void)arg;
    int remaining = ITERATIONS;
    while (remaining-- > 0) {
        pthread_mutex_lock(&mutex);
        ++counter;
        pthread_mutex_unlock(&mutex);
    }
    return NULL;
}

/*
 * Run NUM_THREADS incrementer threads concurrently and report total
 * wall time and average ns per lock/unlock/increment operation.
 */
int main() {
    pthread_t tids[NUM_THREADS];
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);

    for (int i = 0; i < NUM_THREADS; i++) {
        /* Fix: unchecked pthread_create would later join a bogus tid. */
        if (pthread_create(&tids[i], NULL, increment, NULL) != 0) {
            fprintf(stderr, "pthread_create failed\n");
            return 1;
        }
    }
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(tids[i], NULL);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);
    double elapsed = (end.tv_sec - start.tv_sec) +
                     (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("Mutex: %d threads x %d ops = %.3f sec (%.1f ns/op)\n",
           NUM_THREADS, ITERATIONS, elapsed,
           elapsed / (NUM_THREADS * ITERATIONS) * 1e9);
    return 0;
}

读写锁性能

/* rwlock_bench.c — 读写锁性能测试 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NUM_READERS 8
#define NUM_WRITERS 2
#define ITERATIONS 1000000

pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
long shared_data = 0;

/* Reader: take the lock shared and load the value ITERATIONS times. */
void *reader(void *arg) {
    (void)arg;
    for (int n = ITERATIONS; n > 0; n--) {
        pthread_rwlock_rdlock(&rwlock);
        volatile long snapshot = shared_data;  /* force the read */
        (void)snapshot;
        pthread_rwlock_unlock(&rwlock);
    }
    return NULL;
}

/* Writer: take the lock exclusive and bump the value ITERATIONS times. */
void *writer(void *arg) {
    (void)arg;
    for (int n = ITERATIONS; n > 0; n--) {
        pthread_rwlock_wrlock(&rwlock);
        ++shared_data;
        pthread_rwlock_unlock(&rwlock);
    }
    return NULL;
}

线程性能总结

| 操作 | glibc NPTL | musl | 差异 |
| --- | --- | --- | --- |
| pthread_create() | ~15 μs | ~10 μs | musl 快 ~30% |
| pthread_join() | ~2 μs | ~1.5 μs | musl 快 ~25% |
| pthread_mutex_lock/unlock | ~25 ns | ~30 ns | glibc 快 ~17% |
| pthread_rwlock_rdlock | ~20 ns | ~25 ns | glibc 快 ~20% |
| pthread_cond_wait/signal | ~50 ns | ~45 ns | 相当 |
| 条件变量广播 | O(n) | O(n) | 相当 |

分析:musl 线程创建更快(栈分配更简单),但互斥锁操作 glibc 略快(futex 优化更成熟)。在实际应用中,这种差异通常被业务逻辑开销掩盖。


9.5 内存分配性能

malloc/free 性能测试

/* malloc_bench.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * Time `count` allocations of `obj_size` bytes, then the matching
 * frees, and print both durations. free(NULL) is a no-op, so failed
 * individual allocations in the timing loop are harmless.
 */
static void bench_malloc(size_t obj_size, int count) {
    void **ptrs = malloc(count * sizeof(void *));
    if (!ptrs) {
        /* Fix: the bookkeeping array itself was dereferenced unchecked. */
        fprintf(stderr, "out of memory for %d pointers\n", count);
        return;
    }
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < count; i++) {
        ptrs[i] = malloc(obj_size);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double alloc_time = (end.tv_sec - start.tv_sec) +
                        (end.tv_nsec - start.tv_nsec) / 1e9;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < count; i++) {
        free(ptrs[i]);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double free_time = (end.tv_sec - start.tv_sec) +
                       (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("Size %6zu bytes: alloc %.3fs, free %.3fs (x%d)\n",
           obj_size, alloc_time, free_time, count);

    free(ptrs);
}

/* Run the allocator benchmark across small, medium, and large sizes. */
int main() {
    const int count = 1000000;
    /* 16/64 = small objects, 256 = medium, 4096/65536 = large */
    static const size_t sizes[] = { 16, 64, 256, 4096, 65536 };
    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
        bench_malloc(sizes[i], count);
    }
    return 0;
}

多线程 malloc 性能

/* mt_malloc_bench.c — 多线程 malloc 压力测试 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <time.h>

#define NUM_THREADS 8
#define ALLOCS_PER_THREAD 500000

/*
 * Worker: maintains a ring of up to 64 live allocations, continuously
 * replacing the oldest. Fix vs. the previous version: the old code
 * overwrote ptrs[idx] with the new allocation FIRST and then called
 * free(ptrs[(i-64)%64]) — but (i-64)%64 == i%64, so it leaked the old
 * block and immediately freed the brand-new one, which the final loop
 * then freed again (double free). Freeing the slot before overwriting
 * it gives the intended keep-64-live behavior.
 */
void *alloc_thread(void *arg) {
    (void)arg;
    void *ptrs[64] = {0};  /* zero-init so early frees are no-ops */
    for (int i = 0; i < ALLOCS_PER_THREAD; i++) {
        int idx = i % 64;
        free(ptrs[idx]);   /* release the allocation from 64 iterations ago */
        size_t size = 16 + (i % 1024);
        ptrs[idx] = malloc(size);
        if (ptrs[idx]) memset(ptrs[idx], 0, size);
    }
    for (int i = 0; i < 64; i++) free(ptrs[i]);
    return NULL;
}

/*
 * Run NUM_THREADS allocator-stress workers concurrently and report
 * total wall time — exercises the allocator's multithreaded path.
 */
int main() {
    pthread_t tids[NUM_THREADS];
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < NUM_THREADS; i++) {
        /* Fix: unchecked pthread_create would later join a bogus tid. */
        if (pthread_create(&tids[i], NULL, alloc_thread, NULL) != 0) {
            fprintf(stderr, "pthread_create failed\n");
            return 1;
        }
    }
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(tids[i], NULL);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    double elapsed = (end.tv_sec - start.tv_sec) +
                     (end.tv_nsec - start.tv_nsec) / 1e9;
    printf("MT malloc: %d threads x %d allocs: %.3f sec\n",
           NUM_THREADS, ALLOCS_PER_THREAD, elapsed);
    return 0;
}

malloc 性能总结

| 场景 | glibc ptmalloc | musl malloc | 差异 |
| --- | --- | --- | --- |
| 单线程小对象 | 快 | 良好 | glibc 快 ~20% |
| 单线程大对象 | 良好 | 良好 | 相当(都用 mmap) |
| 多线程并发 | 快(per-thread arena) | 较慢(全局锁) | glibc 快 2-5x |
| 内存碎片 | 较多 | 较少 | musl 更优 |
| 内存归还 | ⚠️ 可能不归还 | ✅ 及时归还 | musl 更优 |
| 内存开销 | 较高(arena 元数据) | 较低 | musl 更优 |

建议:如果程序大量并发 malloc/free,考虑使用 jemalloc 或 mimalloc,它们在两种 libc 上都有出色表现。


9.6 IO 性能对比

文件 IO 测试

/* io_bench.c */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>

#define FILE_SIZE (64 * 1024 * 1024)  /* 64 MB */
#define BLOCK_SIZE 4096

/*
 * Sequential file I/O benchmark: write then read FILE_SIZE bytes in
 * BLOCK_SIZE chunks and report MB/s for each direction.
 */
int main() {
    char *buf = malloc(FILE_SIZE);
    if (!buf) {  /* fix: memset on a NULL buffer after failed malloc */
        perror("malloc");
        return 1;
    }
    memset(buf, 'A', FILE_SIZE);
    const char *filename = "/tmp/io_bench_test";

    struct timespec start, end;

    /* 写入测试 */
    clock_gettime(CLOCK_MONOTONIC, &start);
    int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {  /* fix: open() result was not checked */
        perror("open");
        free(buf);
        return 1;
    }
    for (size_t offset = 0; offset < FILE_SIZE; offset += BLOCK_SIZE) {
        if (write(fd, buf + offset, BLOCK_SIZE) != BLOCK_SIZE) {
            perror("write");  /* fix: short/failed writes were ignored */
            close(fd);
            free(buf);
            return 1;
        }
    }
    close(fd);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double write_time = (end.tv_sec - start.tv_sec) +
                        (end.tv_nsec - start.tv_nsec) / 1e9;

    /* 读取测试 */
    clock_gettime(CLOCK_MONOTONIC, &start);
    fd = open(filename, O_RDONLY);
    if (fd < 0) {
        perror("open");
        free(buf);
        return 1;
    }
    for (size_t offset = 0; offset < FILE_SIZE; offset += BLOCK_SIZE) {
        if (read(fd, buf + offset, BLOCK_SIZE) != BLOCK_SIZE) {
            perror("read");
            close(fd);
            free(buf);
            return 1;
        }
    }
    close(fd);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double read_time = (end.tv_sec - start.tv_sec) +
                       (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("Write 64MB: %.3f sec (%.1f MB/s)\n",
           write_time, FILE_SIZE / write_time / 1e6);
    printf("Read  64MB: %.3f sec (%.1f MB/s)\n",
           read_time, FILE_SIZE / read_time / 1e6);

    unlink(filename);
    free(buf);
    return 0;
}
# IO 性能在 glibc 和 musl 之间通常没有显著差异
# 因为两者都直接调用内核系统调用
# 差异主要在 stdio 缓冲区管理上

$ gcc -O2 -o io_bench_glibc io_bench.c && ./io_bench_glibc
$ musl-gcc -O2 -o io_bench_musl io_bench.c && ./io_bench_musl

# 典型结果:差异 < 5%

格式化 IO(printf)

/* printf_bench.c */
#include <stdio.h>
#include <time.h>

/*
 * Benchmark formatted output: fprintf a mixed-specifier line to
 * /dev/null `iterations` times and report the per-call cost.
 */
int main() {
    struct timespec start, end;
    const int iterations = 1000000;

    /* 输出到 /dev/null 以消除终端影响 */
    FILE *fp = fopen("/dev/null", "w");
    if (!fp) {  /* fix: fprintf on a NULL stream is UB if open fails */
        perror("fopen /dev/null");
        return 1;
    }

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iterations; i++) {
        fprintf(fp, "Value: %d, Float: %.4f, String: %s, Hex: 0x%x\n",
                i, i * 3.14159, "hello world", i);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    double elapsed = (end.tv_sec - start.tv_sec) +
                     (end.tv_nsec - start.tv_nsec) / 1e9;
    printf("fprintf x %d: %.3f sec (%.0f ns each)\n",
           iterations, elapsed, elapsed / iterations * 1e9);

    fclose(fp);
    return 0;
}

/*
 * printf 性能在两者之间差异很小
 * glibc 的 printf 实现更复杂(支持更多格式修饰符)
 * musl 的 printf 实现更简洁
 * 实际性能基本持平
 */

9.7 网络性能对比

Socket 性能

/* socket_bench.c — 基本 socket 性能测试 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <time.h>

#define MSG_SIZE 1024
#define ITERATIONS 100000

/* 网络性能主要取决于内核,而非 libc */
/* libc 仅是系统调用的薄封装 */

/*
 * Ping messages across an AF_UNIX socketpair and report ns per
 * message. libc adds no meaningful overhead here — the cost is the
 * kernel's — which is exactly what this demonstrates.
 */
int main() {
    int sv[2];
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) != 0) {
        /* Fix: a failed socketpair left sv uninitialized, and the
         * loop then wrote to garbage descriptors. */
        perror("socketpair");
        return 1;
    }

    char send_buf[MSG_SIZE];
    char recv_buf[MSG_SIZE];
    memset(send_buf, 'A', MSG_SIZE);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);

    for (int i = 0; i < ITERATIONS; i++) {
        if (write(sv[0], send_buf, MSG_SIZE) != MSG_SIZE ||
            read(sv[1], recv_buf, MSG_SIZE) != MSG_SIZE) {
            perror("socket io");  /* fix: I/O results were ignored */
            break;
        }
    }

    clock_gettime(CLOCK_MONOTONIC, &end);
    double elapsed = (end.tv_sec - start.tv_sec) +
                     (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("Socket pair: %d x %d byte msgs: %.3f sec (%.1f ns/msg)\n",
           ITERATIONS, MSG_SIZE, elapsed,
           elapsed / ITERATIONS * 1e9);

    close(sv[0]);
    close(sv[1]);
    return 0;
}

/*
 * 结论:socket 性能在 glibc 和 musl 之间几乎无差异
 * 因为两者都直接调用 send()/recv() 系统调用
 * 真正的性能瓶颈在内核网络栈
 */

9.8 综合基准测试总结

| 测试项 | glibc | musl | 差异 | 说明 |
| --- | --- | --- | --- | --- |
| 空进程 RSS | 1.5 MB | 0.5 MB | musl -67% | libc 映射更小 |
| 线程创建 | 15 μs | 10 μs | musl -33% | 栈分配更简单 |
| mutex 操作 | 25 ns | 30 ns | glibc -17% | futex 优化更成熟 |
| 单线程 malloc | 基准 | +20% | glibc 更快 | ptmalloc 优化 |
| 多线程 malloc | 基准 | +2-5x | glibc 快很多 | per-thread arena |
| memcpy 大块 | 基准 | +30-50% | glibc 更快 | AVX/ERMS 优化 |
| strlen 长串 | 基准 | +20-30% | glibc 更快 | SIMD 优化 |
| 文件 IO | 基准 | ~相同 | 差异 < 5% | 系统调用主导 |
| 网络 IO | 基准 | ~相同 | 差异 < 5% | 系统调用主导 |
| printf | 基准 | ~相同 | 差异 < 5% | 实现复杂度相当 |
| 程序启动(动态) | 基准 | -30% | musl 更快 | 链接器更轻量 |
| 程序启动(静态) | N/A | 基准 | 无动态链接开销 | musl 独有优势 |
| 内存碎片 | 较多 | 较少 | musl 更优 | malloc 算法简单 |

9.9 性能优化建议

glibc 优化技巧

# 1. 调整 malloc arena 数量
export MALLOC_ARENA_MAX=4  # 减少内存碎片

# 2. 调整 tcache(glibc 2.26+,通过 tunables,而非 MALLOC_* 环境变量)
export GLIBC_TUNABLES=glibc.malloc.tcache_count=1024

# 3. 使用 HWCAP 选择最优函数
# glibc 自动完成,无需手动配置

# 4. 编译时指定 CPU 特性
$ gcc -O2 -march=native -mtune=native -o program program.c

musl 优化技巧

# 1. 使用静态链接消除动态链接开销
$ musl-gcc -static -O2 -o program program.c

# 2. 显式设置线程栈大小(musl 默认仅 128KB,无对应环境变量)
#    在代码中调用 pthread_attr_setstacksize(&attr, size)

# 3. 使用 jemalloc/mimalloc 替代内置 malloc
$ musl-gcc -O2 -o program program.c -ljemalloc

# 4. LTO 优化
$ musl-gcc -O2 -flto -o program program.c

9.10 本章小结

性能选择指南:

  • 内存受限:选择 musl(更小的 libc、更少的碎片)
  • 大量线程:选择 musl(更小的默认栈、更快的创建)
  • 高并发 malloc:选择 glibc 或外部 allocator
  • 计算密集:选择 glibc(更好的 SIMD 优化)
  • IO 密集:两者相当(瓶颈在内核)
  • 启动速度:选择 musl(更轻量的链接器)
  • 容器化:选择 musl(更小的镜像)

扩展阅读