ExpandMemCmp

这是一个面向特定指令的 Pass,该 Pass 将对比内存的 memcmp 调用展开为针对目标架构最优大小的若干次加载(load)与比较(compare)指令序列,从而减少库调用开销并利用宽度更大的加载指令提升性能

一句话:是不是决定内联 memcmp 函数

下面依次介绍一些相关的概念:

  • memcmp(ptr1, ptr2, N) 在运行时通常会调用库函数:先检查长度、然后逐字节或分块比较,最后返回比较结果。库函数调用有调用开销,且不易让编译器做进一步优化
  • 该 Pass 在编译阶段判断 memcmp 的大小(常量或可推断长度)是否在可展开范围内,如果合适,则在调用点处内联展开:按照目标支持的最大对齐/宽度(例如 16 字节、8 字节或更大)分割成若干块加载 + 比较,最后处理残余字节。

先来个例子吧

  • 调用前(伪 LLVM IR):
; C 代码: if (memcmp(p, q, 24) == 0) { ... }
%cmp = call i32 @memcmp(i8* %p, i8* %q, i64 24)
%is_zero = icmp eq i32 %cmp, 0
br i1 %is_zero, label %equal, label %notequal
  • Pass 触发内联展开后,可能变为:
; 生成类似下面逻辑(伪 IR)
entry:
  ; 第1块:offset=0, load 8 字节
  %p0 = bitcast i8* %p to i64*
  %q0 = bitcast i8* %q to i64*
  %v0p = load i64, ptr %p0        ; load 8 bytes from p+0
  %v0q = load i64, ptr %q0
  %neq0 = icmp ne i64 %v0p, %v0q
  br i1 %neq0, label %notequal, label %cmp1

cmp1:
  ; 第2块:offset=8
  %p1 = getelementptr i8, i8* %p, i64 8
  %q1 = getelementptr i8, i8* %q, i64 8
  %p1i = bitcast i8* %p1 to i64*
  %q1i = bitcast i8* %q1 to i64*
  %v1p = load i64, ptr %p1i
  %v1q = load i64, ptr %q1i
  %neq1 = icmp ne i64 %v1p, %v1q
  br i1 %neq1, label %notequal, label %cmp2

cmp2:
  ; 第3块:offset=16,剩余 8 字节(24-16=8)
  %p2 = getelementptr i8, i8* %p, i64 16
  %q2 = getelementptr i8, i8* %q, i64 16
  %p2i = bitcast i8* %p2 to i64*
  %q2i = bitcast i8* %q2 to i64*
  %v2p = load i64, ptr %p2i
  %v2q = load i64, ptr %q2i
  %neq2 = icmp ne i64 %v2p, %v2q
  br i1 %neq2, label %notequal, label %equal

equal:
  ; memcmp == 0 的处理
  br label %after

notequal:
  ; memcmp != 0 的处理
  br label %after

after:
  ; 后续逻辑

如上代码针对 memcmp(比较)展开成内联的 load-and-compare 代码,以去掉函数调用开销并利用宽加载指令做高效比较。对于纯拷贝(memcpy),编译器一般已经识别并调用底层高效实现或产生非临时存储指令;而这个 Pass 重点在于比较操作,把 memcmp(p, q, N) 展开成多个按目标平台最佳大小的加载和比较,以便更早退出和利用硬件特性,从而提升性能。

这是一个可调参 Pass。

STATISTIC(NumMemCmpCalls, "Number of memcmp calls");  
STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");  
STATISTIC(NumMemCmpGreaterThanMax,  
          "Number of memcmp calls with size greater than max size");  
STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");

// 用于配置当 memcmp 仅用于与零比较时,每个基本块允许的加载次数(默认 1)
static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(  
    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),  
    cl::desc("The number of loads per basic block for inline expansion of "  
             "memcmp that is only being compared against zero."));  
// 用于设置一般情况下单次 memcmp 展开中最多允许的加载次数,以避免过多基本块或代码膨胀
static cl::opt<unsigned> MaxLoadsPerMemcmp(  
    "max-loads-per-memcmp", cl::Hidden,  
    cl::desc("Set maximum number of loads used in expanded memcmp"));  
//专门用于 -Os/-Oz 优化级别下,设置更严格的最大加载次数,以控制代码尺寸
static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(  
    "max-loads-per-memcmp-opt-size", cl::Hidden,  
    cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));

从代码上看,该 Pass 依赖如下分析:

  • TargetLibraryAnalysis
  • TargetIRAnalysis
  • DominatorTreeAnalysis
    核心流程:
PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,  
                          const TargetTransformInfo *TTI,  
                          const TargetLowering *TL, ProfileSummaryInfo *PSI,  
                          BlockFrequencyInfo *BFI, DominatorTree *DT) {  
  std::optional<DomTreeUpdater> DTU;  
  if (DT)  
    DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);  
  
  const DataLayout& DL = F.getDataLayout();  
  bool MadeChanges = false;  
  for (auto BBIt = F.begin(); BBIt != F.end();) {  
    if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {  
      MadeChanges = true;  
      // If changes were made, restart the function from the beginning, since  
      // the structure of the function was changed.      BBIt = F.begin();  
    } else {  
      ++BBIt;  
    }  
  }  
  if (MadeChanges)  
    for (BasicBlock &BB : F)  
      SimplifyInstructionsInBlock(&BB);  
  if (!MadeChanges)  
    return PreservedAnalyses::all();  
  PreservedAnalyses PA;  
  PA.preserve<DominatorTreeAnalysis>();  
  return PA;  
}

附录-A

官方例子:

We want to transform:  
%call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)  
To:  
loadbb:  
 %0 = bitcast i32* %buffer2 to i8*  
 %1 = bitcast i32* %buffer1 to i8*  
 %2 = bitcast i8* %1 to i64*  
 %3 = bitcast i8* %0 to i64*  
 %4 = load i64, i64* %2  
 %5 = load i64, i64* %3  
 %6 = call i64 @llvm.bswap.i64(i64 %4)  
 %7 = call i64 @llvm.bswap.i64(i64 %5)  
 %8 = sub i64 %6, %7  
 %9 = icmp ne i64 %8, 0  
 br i1 %9, label %res_block, label %loadbb1  
res_block:                                        ; preds = %loadbb2,  
%loadbb1, %loadbb  
 %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]  
 %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]  
 %10 = icmp ult i64 %phi.src1, %phi.src2  
 %11 = select i1 %10, i32 -1, i32 1  
 br label %endblock  
loadbb1:                                          ; preds = %loadbb  
 %12 = bitcast i32* %buffer2 to i8*  
 %13 = bitcast i32* %buffer1 to i8*  
 %14 = bitcast i8* %13 to i32*  
 %15 = bitcast i8* %12 to i32*  
 %16 = getelementptr i32, i32* %14, i32 2  
 %17 = getelementptr i32, i32* %15, i32 2  
 %18 = load i32, i32* %16  
 %19 = load i32, i32* %17  
 %20 = call i32 @llvm.bswap.i32(i32 %18)  
 %21 = call i32 @llvm.bswap.i32(i32 %19)  
 %22 = zext i32 %20 to i64  
 %23 = zext i32 %21 to i64  
 %24 = sub i64 %22, %23  
 %25 = icmp ne i64 %24, 0  
 br i1 %25, label %res_block, label %loadbb2  
loadbb2:                                          ; preds = %loadbb1  
 %26 = bitcast i32* %buffer2 to i8*  
 %27 = bitcast i32* %buffer1 to i8*  
 %28 = bitcast i8* %27 to i16*  
 %29 = bitcast i8* %26 to i16*  
 %30 = getelementptr i16, i16* %28, i16 6  
 %31 = getelementptr i16, i16* %29, i16 6  
 %32 = load i16, i16* %30  
 %33 = load i16, i16* %31  
 %34 = call i16 @llvm.bswap.i16(i16 %32)  
 %35 = call i16 @llvm.bswap.i16(i16 %33)  
 %36 = zext i16 %34 to i64  
 %37 = zext i16 %35 to i64  
 %38 = sub i64 %36, %37  
 %39 = icmp ne i64 %38, 0  
 br i1 %39, label %res_block, label %loadbb3  
loadbb3:                                          ; preds = %loadbb2  
 %40 = bitcast i32* %buffer2 to i8*  
 %41 = bitcast i32* %buffer1 to i8*  
 %42 = getelementptr i8, i8* %41, i8 14  
 %43 = getelementptr i8, i8* %40, i8 14  
 %44 = load i8, i8* %42  
 %45 = load i8, i8* %43  
 %46 = zext i8 %44 to i32  
 %47 = zext i8 %45 to i32  
 %48 = sub i32 %46, %47  
 br label %endblock  
endblock:                                         ; preds = %res_block,  
%loadbb3  
 %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]  
 ret i32 %phi.res