Julia 教程 / Julia 文件 I/O 与序列化

16. 文件 I/O 与序列化

文件读写和数据序列化是数据处理的基础。Julia 提供了丰富的 I/O 工具，从基本的文本操作到高效的二进制格式。

16.1 文件读写基础

读取整个文件

# 读取为字符串
content = read("/etc/hostname", String)
println(content)

# 读取为字节数组
bytes = read("/etc/hostname")
println(typeof(bytes))  # Vector{UInt8}

# 读取行
lines = readlines("/etc/hostname")
println(lines)  # ["your-hostname"]

写入文件

# 写入字符串
write("/tmp/test.txt", "Hello, Julia!\n第二行\n")

# 写入字节
write("/tmp/test.bin", [0x48, 0x65, 0x6c, 0x6c, 0x6f])

# 追加写入
open("/tmp/test.txt", "a") do f
    write(f, "追加的内容\n")
end

逐行读取

# eachline 返回惰性迭代器
for line in eachline("/etc/hostname")
    println(">> ", line)
end

# 带行号
for (i, line) in enumerate(eachline("/etc/hostname"))
    println("$i: $line")
end

16.2 do 语法（自动关闭）

# do 语法确保文件自动关闭
open("/tmp/test.txt", "r") do f
    content = read(f, String)
    println(content)
end  # f 在这里自动关闭

# 写入示例
open("/tmp/output.txt", "w") do f
    for i in 1:10
        write(f, "行 $i: $(rand())\n")
    end
end

# 不用 do 语法（需手动关闭）
f = open("/tmp/test.txt", "r")
try
    content = read(f, String)
finally
    close(f)
end

💡 最佳实践：始终使用 do 语法或 try-finally 确保文件关闭。忘记关闭文件会导致资源泄漏。

16.3 IOStream 与 IOBuffer

IOStream：文件流

# 读写模式打开
open("/tmp/iostream_test.txt", "w+") do io
    write(io, "第一行\n")
    write(io, "第二行\n")
    
    # 回到开头
    seekstart(io)
    
    # 逐行读取
    for line in eachline(io)
        println(line)
    end
end

IOBuffer：内存流

# 创建内存缓冲区
buf = IOBuffer()

# 写入
write(buf, "Hello, ")
write(buf, "World!")
write(buf, "\n数值: ", 42)

# 获取内容
str = String(take!(buf))
println(str)

# 从字符串创建 IOBuffer
io = IOBuffer("Hello, Julia!\n第二行\n")
for line in eachline(io)
    println(">> ", line)
end

# 用于格式化输出
buf = IOBuffer()
for i in 1:5
    println(buf, "项 $i: $(round(rand(), digits=4))")
end
formatted = String(take!(buf))
print(formatted)

临时 IOBuffer

# 捕获标准输出
"""
    capture_output(f) -> String

Run the zero-argument callable `f` with `stdout` redirected and return everything it
printed as a `String`.

`redirect_stdout` only accepts OS-backed streams (files, pipes, TTYs, sockets,
`devnull`); passing an in-memory `IOBuffer` raises a `MethodError`. The output is
therefore routed through a temporary file, which `mktemp` removes once the text has
been read back. `redirect_stdout(f, stream)` restores the previous `stdout` when `f`
returns or throws.
"""
function capture_output(f)
    return mktemp() do path, io
        redirect_stdout(f, io)
        flush(io)  # make sure buffered output reaches the file before reading it back
        read(path, String)
    end
end

output = capture_output() do
    println("这被捕获了")
    println("而不是打印到终端")
end
println("捕获到: ", output)

16.4 CSV 读写 (CSV.jl)

using CSV, DataFrames

# 写入 CSV
df = DataFrame(
    姓名 = ["Alice", "Bob", "Charlie"],
    年龄 = [25, 30, 35],
    分数 = [95.5, 87.3, 92.1]
)
CSV.write("/tmp/data.csv", df)

# 读取 CSV
df_read = CSV.read("/tmp/data.csv", DataFrame)
println(df_read)

# 带选项读取
df2 = CSV.read("/tmp/data.csv", DataFrame;
    delim = ',',           # 分隔符
    header = 1,            # 表头行
    types = Dict("年龄" => Int),  # 指定类型
    skipto = 2,            # 从第2行开始
)

# 流式读取大文件
rows = CSV.Rows("/tmp/data.csv")
for row in rows
    println(row.姓名, ": ", row.分数)
end

CSV 写入选项

# 完整选项
CSV.write("/tmp/output.csv", df;
    delim = '\t',         # Tab 分隔
    header = true,        # 写入表头
    append = false,       # 覆盖模式
    newline = '\n',       # 换行符
    quotestrings = true,  # 字符串加引号
)

16.5 JSON 读写 (JSON.jl)

using JSON

# 写入 JSON
data = Dict(
    "name" => "Alice",
    "age" => 25,
    "scores" => [95, 87, 92],
    "active" => true,
    "address" => Dict(
        "city" => "北京",
        "zip" => "100000"
    )
)

# 美化输出
json_str = JSON.json(data, 2)  # 缩进2格
println(json_str)

# 写入文件
open("/tmp/data.json", "w") do f
    JSON.print(f, data, 2)
end

# 读取 JSON
loaded = JSON.parsefile("/tmp/data.json")
println(loaded["name"])    # Alice
println(loaded["scores"])  # [95, 87, 92]

# 从字符串解析
from_str = JSON.parse("""{"x": 1, "y": 2}""")

JSON3.jl（更快的替代）

# JSON3.jl 通常比 JSON.jl 更快
# using JSON3
# obj = JSON3.read(json_string)
# json_str = JSON3.write(obj)

16.6 JLD2/HDF5 序列化

JLD2：Julia 原生格式

using JLD2

# 保存变量
x = rand(1000)
y = [1, 2, 3]
data = Dict("key" => "value", "count" => 42)

jldsave("/tmp/data.jld2"; x, y, data)

# 加载变量
loaded = load("/tmp/data.jld2")
println(keys(loaded))     # ["data", "x", "y"]
println(loaded["x"][1:5])  # 前5个值

# 加载单个变量
x_loaded = load("/tmp/data.jld2", "x")

# @save / @load 宏
@save "/tmp/vars.jld2" x y
# @load "/tmp/vars.jld2" x y  # 注意：@load 会覆盖当前变量

# 使用 jldopen 精细控制
jldopen("/tmp/data.jld2", "r") do file
    println(file["x"][1])
end

# 追加数据
jldopen("/tmp/data.jld2", "a+") do file
    file["new_key"] = [100, 200, 300]
end

HDF5 格式

using HDF5

# 写入 HDF5
h5write("/tmp/data.h5", "group/dataset", rand(100, 100))

# 读取
data = h5read("/tmp/data.h5", "group/dataset")

# 使用 h5open 精细控制（注意："w" 模式会清空已有文件，上面写入的 group/dataset 会丢失；如需保留请使用 "r+" 或 "cw"）
h5open("/tmp/data.h5", "w") do file
    file["matrix"] = rand(10, 10)
    file["vector"] = [1, 2, 3, 4, 5]
    attrs(file)["description"] = "测试数据"
end

# 读取属性
h5open("/tmp/data.h5", "r") do file
    println(attrs(file)["description"])
    println(size(file["matrix"]))
end

格式	特点	适用场景
JLD2	Julia 原生，保留类型	Julia 间数据交换
HDF5	通用科学格式	跨语言、大数据集
CSV	人类可读	表格数据、Excel 交换
JSON	结构化文本	API、配置文件

16.7 二进制 I/O (read!/write)

# 写入二进制数据
data = Float64[1.0, 2.0, 3.0, 4.0, 5.0]
open("/tmp/binary.dat", "w") do f
    write(f, length(data))  # 先写长度
    write(f, data)          # 再写数据
end

# 读取二进制数据
open("/tmp/binary.dat", "r") do f
    n = read(f, Int)        # 读长度
    result = Vector{Float64}(undef, n)
    read!(f, result)        # 读数据到已分配的数组
    println(result)
end

# read! 读入已分配的缓冲区，可以避免额外的内存分配
# read!(io, buffer)  — 读入已分配的 buffer
# read(io, T)        — 读取单个 T 类型的值（旧的 read(io, T, n) 自 Julia 1.0 起已移除，
#                      请改用 read!(io, Vector{T}(undef, n))）

大数组的二进制 I/O

using Mmap

# 写入大矩阵
matrix = rand(1000, 1000)
open("/tmp/matrix.bin", "w") do f
    write(f, size(matrix, 1))
    write(f, size(matrix, 2))
    write(f, matrix)
end

# 读取
open("/tmp/matrix.bin", "r") do f
    m = read(f, Int)
    n = read(f, Int)
    loaded = Matrix{Float64}(undef, m, n)
    read!(f, loaded)
    println("大小: ", size(loaded))
end

16.8 内存映射 mmap

using Mmap

# 创建文件并映射
open("/tmp/mmap_test.bin", "w+") do f
    # 写入初始数据
    write(f, zeros(Float64, 1000))
end

# 内存映射
open("/tmp/mmap_test.bin", "r+") do f
    arr = Mmap.mmap(f, Vector{Float64}, 1000)
    
    # 直接操作（不会立即写入磁盘）
    arr[1] = 42.0
    arr[2] = 3.14
    
    # 强制写入磁盘
    Mmap.sync!(arr)
    
    println(arr[1:5])
end

# 再次读取验证
open("/tmp/mmap_test.bin", "r") do f
    arr = Mmap.mmap(f, Vector{Float64}, 1000)
    println("验证: ", arr[1:5])
end

💡 mmap 优势：对于超大文件（超过可用内存），mmap 允许操作系统按需加载页面，无需一次性读入。

16.9 文件遍历 walkdir

# Recursively walk a directory tree, printing every sub-directory and file.
for (root, dirs, files) in walkdir("/tmp")
    println("目录: $root")
    for dir in dirs
        println("  子目录: $dir")
    end
    for file in files
        # Bound to `nbytes` rather than `size`, so the loop neither shadows nor tries
        # to assign to the `Base.size` function.
        nbytes = filesize(joinpath(root, file))
        println("  文件: $file ($nbytes bytes)")
    end
end

# 查找特定文件
"""
    find_files(dir, pattern) -> Vector{String}

Walk `dir` recursively and collect the full path of every file whose *name* (not its
full path) satisfies `occursin(pattern, name)`. Paths are returned in `walkdir` order.
"""
function find_files(dir, pattern)
    matches = String[]
    for (folder, _, names) in walkdir(dir)
        # Keep only the matching names of this folder and turn them into full paths.
        append!(matches, (joinpath(folder, name) for name in names if occursin(pattern, name)))
    end
    return matches
end

julia_files = find_files("/tmp", r"\.jl$")

实用目录操作

# 创建目录
mkpath("/tmp/test/nested/dir")

# 列出目录
readdir("/tmp")
readdir("/tmp"; join=true)  # 返回完整路径

# 检查路径
isfile("/tmp/test.txt")
isdir("/tmp")
ispath("/tmp/test.txt")  # 文件或目录都返回 true

# File metadata. The result is bound to `file_stat` rather than `stat`: assigning to
# `stat` would clash with the `Base.stat` function that is called on the same line.
file_stat = stat("/tmp/test.txt")
println("大小: ", file_stat.size)
println("修改时间: ", file_stat.mtime)

# 临时文件
tmpfile = tempname()
tmpdir = mktempdir()

16.10 GZip 压缩文件

using CodecZlib

# 写入 gzip 文件
open("/tmp/data.gz", "w") do f
    io = GzipCompressorStream(f)
    for i in 1:10000
        write(io, "行 $i: $(rand())\n")
    end
    close(io)
end

# 读取 gzip 文件
open("/tmp/data.gz", "r") do f
    io = GzipDecompressorStream(f)
    lines = readlines(io)
    println("读取了 $(length(lines)) 行")
    close(io)
end

# 使用 TranscodingStreams 接口
using TranscodingStreams, CodecZlib

# 一步完成压缩写入
open("/tmp/compressed.gz", "w") do f
    stream = GzipCompressorStream(f)
    write(stream, "Hello, compressed world!\n")
    close(stream)
end

# 一步完成解压读取
content = open("/tmp/compressed.gz", "r") do f
    stream = GzipDecompressorStream(f)
    result = read(stream, String)
    close(stream)
    result
end
println(content)

Tar 归档

using Tar, CodecZlib

# 创建 tar.gz 归档
open("/tmp/archive.tar.gz", "w") do f
    io = GzipCompressorStream(f)
    Tar.create("/tmp/test_dir", io)
    close(io)
end

# 解压归档
open("/tmp/archive.tar.gz", "r") do f
    io = GzipDecompressorStream(f)
    Tar.extract(io, "/tmp/extracted")
    close(io)
end

16.11 实际业务场景

场景一：配置文件管理

using JSON

# 配置文件管理器
"""
    Config

Wrapper around the string-keyed dictionary that holds the application settings.
"""
struct Config
    data::Dict{String,Any}
end

"""
    load_config(path::String) -> Config

Parse the JSON file at `path` into a `Config`. When `path` is not an existing file,
return a `Config` wrapping an empty dictionary instead.
"""
function load_config(path::String)
    data = isfile(path) ? JSON.parsefile(path) : Dict{String,Any}()
    return Config(data)
end

"""
    save_config(config::Config, path::String)

Write `config.data` to `path` as JSON indented by 2 spaces, replacing any existing file.
"""
function save_config(config::Config, path::String)
    open(path, "w") do f
        JSON.print(f, config.data, 2)
    end
end

"""
    get(config::Config, key::String, default=nothing)

Return the value stored under `key` in `config`, or `default` when the key is absent.
"""
function Base.get(config::Config, key::String, default=nothing)
    # This must extend `Base.get`. A plain `function get(...)` would create a separate
    # `get` in the current module, and the call below on a `Dict` would then fail with
    # a `MethodError` instead of reaching the dictionary lookup.
    return get(config.data, key, default)
end

# 使用
cfg = load_config("/tmp/app.json")
# get(cfg, "host", "localhost")
# get(cfg, "port", 8080)

场景二：批量数据处理

using CSV, DataFrames

"""
    process_csv_files(input_dir, output_file) -> DataFrame

Read every `.csv` file found under `input_dir` (recursively), tag each table with a
`source` column holding the file name, concatenate them with the union of all columns,
write the result to `output_file`, and return it.

Files that fail to load are reported with `@warn` and skipped. When nothing could be
read, an empty `DataFrame` is written and returned.

NOTE(review): if `output_file` lives inside `input_dir`, a later run will pick it up as
an input as well — confirm this is intended by callers.
"""
function process_csv_files(input_dir, output_file)
    frames = DataFrame[]

    for (root, _, files) in walkdir(input_dir)
        for file in files
            endswith(file, ".csv") || continue
            # Join with `root`, the directory walkdir is currently visiting; joining with
            # `input_dir` would yield wrong paths for files in sub-directories.
            path = joinpath(root, file)

            try
                df = CSV.read(path, DataFrame)
                df.source .= file  # add the originating file name as a column
                push!(frames, df)
            catch e
                @warn "读取失败: $path" exception=e
            end
        end
    end

    # Concatenate once at the end instead of re-copying the accumulated table per file.
    all_data = isempty(frames) ? DataFrame() : reduce(vcat, frames; cols=:union)

    CSV.write(output_file, all_data)
    return all_data
end

场景三：日志文件轮转

"""
    rotate_log(logpath; max_size_mb=100, max_files=5)

Rotate the log file at `logpath` once it has reached `max_size_mb` mebibytes.

Existing archives are shifted up by one (`logpath.1` becomes `logpath.2`, and so on up
to `logpath.max_files`, whose previous content is overwritten), then the current log is
moved to `logpath.1`.

Returns `nothing` when `logpath` is not a file or is still below the size limit;
otherwise returns the path of the new archive as returned by `mv`.
"""
function rotate_log(logpath; max_size_mb=100, max_files=5)
    isfile(logpath) || return

    size_mb = filesize(logpath) / (1024 * 1024)
    size_mb < max_size_mb && return

    # Shift archives from the oldest slot downwards so nothing is overwritten before it
    # has been moved: .(max_files-1) -> .max_files, ..., .1 -> .2.
    for i in max_files:-1:2
        old = "$logpath.$(i-1)"
        new = "$logpath.$i"
        isfile(old) && mv(old, new; force=true)
    end

    # `force=true` matters when the loop above did not run (max_files < 2): `.1` may
    # still exist from a previous rotation and a plain `mv` would throw.
    return mv(logpath, "$logpath.1"; force=true)
end

16.12 扩展阅读

资源	链接
Julia 官方文档 - I/O	https://docs.julialang.org/en/v1/base/io-network/
CSV.jl	https://github.com/JuliaData/CSV.jl
JSON.jl	https://github.com/JuliaIO/JSON.jl
JLD2.jl	https://github.com/JuliaIO/JLD2.jl
HDF5.jl	https://github.com/JuliaIO/HDF5.jl
CodecZlib.jl	https://github.com/JuliaIO/CodecZlib.jl

16.13 本章小结

主题	要点
基础读写	`read`/`write`/`readlines`/`eachline`
do 语法	自动关闭文件资源
IOBuffer	内存流，用于格式化
CSV.jl	表格数据的标准格式
JSON.jl	结构化数据交换
JLD2	Julia 原生序列化
mmap	大文件内存映射
walkdir	递归目录遍历
CodecZlib	压缩/解压文件