后端Pass简介——ExpandMemCmp
ExpandMemCmp
这是一个面向特定指令的 Pass,该 Pass 将对比内存的 memcmp 调用展开为针对目标架构最优大小的若干次加载(load)与比较(compare)指令序列,从而减少库调用开销并利用宽度更大的加载指令提升性能。
一句话:是不是决定内联 memcmp 函数
下面依次介绍一些相关的概念:
- memcmp(ptr1, ptr2, N) 在运行时通常会调用库函数:先检查长度、然后逐字节或分块比较,最后返回比较结果。库函数调用有调用开销,且不易让编译器做进一步优化。
- 该 Pass 在编译阶段判断 memcmp 的大小(常量或可推断长度)是否在可展开范围内,如果合适,则在调用点处内联展开:按照目标支持的最大对齐/宽度(例如 16 字节、8 字节或更大)分割成若干块加载 + 比较,最后处理残余字节。
先来个例子吧
- 调用前(伪 LLVM IR):
; C 代码: if (memcmp(p, q, 24) == 0) { ... }
%cmp = call i32 @memcmp(i8* %p, i8* %q, i64 24)
%is_zero = icmp eq i32 %cmp, 0
br i1 %is_zero, label %equal, label %notequal
- Pass 触发内联展开后,可能变为:
; 生成类似下面逻辑(伪 IR)
entry:
; 第1块:offset=0, load 8 字节
%p0 = bitcast i8* %p to i64*
%q0 = bitcast i8* %q to i64*
%v0p = load i64, ptr %p0 ; load 8 bytes from p+0
%v0q = load i64, ptr %q0
%neq0 = icmp ne i64 %v0p, %v0q
br i1 %neq0, label %notequal, label %cmp1
cmp1:
; 第2块:offset=8
%p1 = getelementptr i8, i8* %p, i64 8
%q1 = getelementptr i8, i8* %q, i64 8
%p1i = bitcast i8* %p1 to i64*
%q1i = bitcast i8* %q1 to i64*
%v1p = load i64, ptr %p1i
%v1q = load i64, ptr %q1i
%neq1 = icmp ne i64 %v1p, %v1q
br i1 %neq1, label %notequal, label %cmp2
cmp2:
; 第3块:offset=16,剩余 8 字节(24-16=8)
%p2 = getelementptr i8, i8* %p, i64 16
%q2 = getelementptr i8, i8* %q, i64 16
%p2i = bitcast i8* %p2 to i64*
%q2i = bitcast i8* %q2 to i64*
%v2p = load i64, ptr %p2i
%v2q = load i64, ptr %q2i
%neq2 = icmp ne i64 %v2p, %v2q
br i1 %neq2, label %notequal, label %equal
equal:
; memcmp == 0 的处理
br label %after
notequal:
; memcmp != 0 的处理
br label %after
after:
; 后续逻辑
如上代码针对 memcmp(比较)展开成内联的 load-and-compare 代码,以去掉函数调用开销并利用宽加载指令做高效比较。对于纯拷贝(memcpy),编译器一般已经识别并调用底层高效实现或产生非临时存储指令;而这个 Pass 重点在于比较操作,把 memcmp(p, q, N) 展开成多个按目标平台最佳大小的加载和比较,以便更早退出和利用硬件特性,从而提升性能。
这是一个可调参 Pass。
STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
STATISTIC(NumMemCmpGreaterThanMax,
"Number of memcmp calls with size greater than max size");
STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
// 用于配置当 memcmp 仅用于与零比较时,每个基本块允许的加载次数(默认 1)
static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
"memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
cl::desc("The number of loads per basic block for inline expansion of "
"memcmp that is only being compared against zero."));
// 用于设置一般情况下单次 memcmp 展开中最多允许的加载次数,以避免过多基本块或代码膨胀
static cl::opt<unsigned> MaxLoadsPerMemcmp(
"max-loads-per-memcmp", cl::Hidden,
cl::desc("Set maximum number of loads used in expanded memcmp"));
//专门用于 -Os/-Oz 优化级别下,设置更严格的最大加载次数,以控制代码尺寸
static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
"max-loads-per-memcmp-opt-size", cl::Hidden,
cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
从代码上看,该 Pass 依赖如下分析:
- TargetLibraryAnalysis
- TargetIRAnalysis
- DominatorTreeAnalysis
核心流程:
PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
const TargetLowering *TL, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, DominatorTree *DT) {
std::optional<DomTreeUpdater> DTU;
if (DT)
DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
const DataLayout& DL = F.getDataLayout();
bool MadeChanges = false;
for (auto BBIt = F.begin(); BBIt != F.end();) {
if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
MadeChanges = true;
// If changes were made, restart the function from the beginning, since
// the structure of the function was changed. BBIt = F.begin();
} else {
++BBIt;
}
}
if (MadeChanges)
for (BasicBlock &BB : F)
SimplifyInstructionsInBlock(&BB);
if (!MadeChanges)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
return PA;
}
附录-A
官方例子:
We want to transform:
%call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
To:
loadbb:
%0 = bitcast i32* %buffer2 to i8*
%1 = bitcast i32* %buffer1 to i8*
%2 = bitcast i8* %1 to i64*
%3 = bitcast i8* %0 to i64*
%4 = load i64, i64* %2
%5 = load i64, i64* %3
%6 = call i64 @llvm.bswap.i64(i64 %4)
%7 = call i64 @llvm.bswap.i64(i64 %5)
%8 = sub i64 %6, %7
%9 = icmp ne i64 %8, 0
br i1 %9, label %res_block, label %loadbb1
res_block: ; preds = %loadbb2,
%loadbb1, %loadbb
%phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
%phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
%10 = icmp ult i64 %phi.src1, %phi.src2
%11 = select i1 %10, i32 -1, i32 1
br label %endblock
loadbb1: ; preds = %loadbb
%12 = bitcast i32* %buffer2 to i8*
%13 = bitcast i32* %buffer1 to i8*
%14 = bitcast i8* %13 to i32*
%15 = bitcast i8* %12 to i32*
%16 = getelementptr i32, i32* %14, i32 2
%17 = getelementptr i32, i32* %15, i32 2
%18 = load i32, i32* %16
%19 = load i32, i32* %17
%20 = call i32 @llvm.bswap.i32(i32 %18)
%21 = call i32 @llvm.bswap.i32(i32 %19)
%22 = zext i32 %20 to i64
%23 = zext i32 %21 to i64
%24 = sub i64 %22, %23
%25 = icmp ne i64 %24, 0
br i1 %25, label %res_block, label %loadbb2
loadbb2: ; preds = %loadbb1
%26 = bitcast i32* %buffer2 to i8*
%27 = bitcast i32* %buffer1 to i8*
%28 = bitcast i8* %27 to i16*
%29 = bitcast i8* %26 to i16*
%30 = getelementptr i16, i16* %28, i16 6
%31 = getelementptr i16, i16* %29, i16 6
%32 = load i16, i16* %30
%33 = load i16, i16* %31
%34 = call i16 @llvm.bswap.i16(i16 %32)
%35 = call i16 @llvm.bswap.i16(i16 %33)
%36 = zext i16 %34 to i64
%37 = zext i16 %35 to i64
%38 = sub i64 %36, %37
%39 = icmp ne i64 %38, 0
br i1 %39, label %res_block, label %loadbb3
loadbb3: ; preds = %loadbb2
%40 = bitcast i32* %buffer2 to i8*
%41 = bitcast i32* %buffer1 to i8*
%42 = getelementptr i8, i8* %41, i8 14
%43 = getelementptr i8, i8* %40, i8 14
%44 = load i8, i8* %42
%45 = load i8, i8* %43
%46 = zext i8 %44 to i32
%47 = zext i8 %45 to i32
%48 = sub i32 %46, %47
br label %endblock
endblock: ; preds = %res_block,
%loadbb3
%phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
ret i32 %phi.res
评论