[DFSan] Add callback that allows tracking which function tainted data reaches.
author Andrew Browne <browneee@google.com>
Sat, 10 Dec 2022 01:47:54 +0000 (17:47 -0800)
committer Andrew Browne <browneee@google.com>
Mon, 12 Dec 2022 07:10:17 +0000 (23:10 -0800)
Authored-by: Christopher Liebchen <liebchen@google.com>
Co-authored-by: Andrew Browne <browneee@google.com>
Reviewed By: browneee

Differential Revision: https://reviews.llvm.org/D139543

compiler-rt/include/sanitizer/dfsan_interface.h
compiler-rt/lib/dfsan/dfsan.cpp
compiler-rt/lib/dfsan/done_abilist.txt
compiler-rt/test/dfsan/Inputs/flags_abilist.txt
compiler-rt/test/dfsan/reaches_function.c [new file with mode: 0644]
llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
llvm/test/Instrumentation/DataFlowSanitizer/reaches_function.ll [new file with mode: 0644]

index 8e581a6..519bfff 100644 (file)
@@ -31,6 +31,14 @@ typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count);
 typedef void (*dfsan_conditional_callback_t)(dfsan_label label,
                                              dfsan_origin origin);
 
+/// Signature of the callback argument to dfsan_set_reaches_function_callback().
+/// \c file, \c line and \c function describe the location that was reached.
+typedef void (*dfsan_reaches_function_callback_t)(dfsan_label label,
+                                                  dfsan_origin origin,
+                                                  const char *file,
+                                                  unsigned int line,
+                                                  const char *function);
+
 /// Computes the union of \c l1 and \c l2, resulting in a union label.
 dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2);
 
@@ -91,6 +99,18 @@ void dfsan_set_conditional_callback(dfsan_conditional_callback_t callback);
 /// This function returns all label bits seen in signal handler conditions.
 dfsan_label dfsan_get_labels_in_signal_conditional();
 
+/// Sets a callback to be invoked when tainted data reaches a function.
+/// This could occur at function entry, at a load, or at a call's return value.
+/// These callbacks will only be added if -dfsan-reaches-function-callbacks=1.
+void dfsan_set_reaches_function_callback(
+    dfsan_reaches_function_callback_t callback);
+
+/// Making callbacks that handle signals well is tricky, so when
+/// -dfsan-reaches-function-callbacks=true, functions reached in signal
+/// handlers will OR the labels they see into a global instead of calling back.
+/// This function returns all label bits seen during signal handlers.
+dfsan_label dfsan_get_labels_in_signal_reaches_function();
+
 /// Interceptor hooks.
 /// Whenever a dfsan's custom function is called the corresponding
 /// hook is called if non-zero. The hooks should be defined by the user.
index 0a6f319..faf5a66 100644 (file)
@@ -718,6 +718,67 @@ dfsan_get_labels_in_signal_conditional() {
   return __dfsan::labels_in_signal_conditional;
 }
 
+namespace __dfsan {
+
+typedef void (*dfsan_reaches_function_callback_t)(dfsan_label label,
+                                                  dfsan_origin origin,
+                                                  const char *file,
+                                                  unsigned int line,
+                                                  const char *function);
+static dfsan_reaches_function_callback_t reaches_function_callback = nullptr;
+static dfsan_label labels_in_signal_reaches_function = 0;
+
+static void ReachesFunctionCallback(dfsan_label label, dfsan_origin origin,
+                                    const char *file, unsigned int line,
+                                    const char *function) {
+  if (label == 0) {
+    return;
+  }
+  if (reaches_function_callback == nullptr) {
+    return;
+  }
+
+  // This initial ReachesFunctionCallback handler needs to be in here in dfsan
+  // runtime (rather than being an entirely user implemented hook) so that it
+  // has access to dfsan thread information.
+  DFsanThread *t = GetCurrentThread();
+  // A callback operation which does useful work (like record the flow) will
+  // likely be too long executed in a signal handler.
+  if (t && t->InSignalHandler()) {
+    // Record set of labels used in signal handler for completeness.
+    labels_in_signal_reaches_function |= label;
+    return;
+  }
+
+  reaches_function_callback(label, origin, file, line, function);
+}
+
+}  // namespace __dfsan
+
+// Entry point for the origin-carrying variant of the reaches-function
+// callback; forwards every argument to __dfsan::ReachesFunctionCallback.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__dfsan_reaches_function_callback_origin(dfsan_label label, dfsan_origin origin,
+                                         const char *file, unsigned int line,
+                                         const char *function) {
+  __dfsan::ReachesFunctionCallback(label, origin, file, line, function);
+}
+
+// Entry point for the variant without an origin; forwards to
+// __dfsan::ReachesFunctionCallback with 0 as the origin.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__dfsan_reaches_function_callback(dfsan_label label, const char *file,
+                                  unsigned int line, const char *function) {
+  __dfsan::ReachesFunctionCallback(label, 0, file, line, function);
+}
+
+// Stores `callback` as the hook invoked by __dfsan::ReachesFunctionCallback.
+// A null `callback` makes reaches-function events a no-op.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+dfsan_set_reaches_function_callback(
+    __dfsan::dfsan_reaches_function_callback_t callback) {
+  __dfsan::reaches_function_callback = callback;
+}
+
+// Returns the label bits accumulated from reaches-function events that were
+// seen inside signal handlers.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label
+dfsan_get_labels_in_signal_reaches_function() {
+  return __dfsan::labels_in_signal_reaches_function;
+}
+
 class Decorator : public __sanitizer::SanitizerCommonDecorator {
  public:
   Decorator() : SanitizerCommonDecorator() {}
@@ -1031,6 +1092,7 @@ extern "C" void dfsan_flush() {
     }
   }
   __dfsan::labels_in_signal_conditional = 0;
+  __dfsan::labels_in_signal_reaches_function = 0;
 }
 
 // TODO: CheckMemoryLayoutSanity is based on msan.
index e8fcd83..ff8a37f 100644 (file)
@@ -50,6 +50,12 @@ fun:dfsan_set_conditional_callback=uninstrumented
 fun:dfsan_set_conditional_callback=discard
 fun:dfsan_get_labels_in_signal_conditional=uninstrumented
 fun:dfsan_get_labels_in_signal_conditional=discard
+fun:dfsan_set_reaches_function_callback=uninstrumented
+fun:dfsan_set_reaches_function_callback=discard
+fun:dfsan_get_labels_in_signal_reaches_function=uninstrumented
+fun:dfsan_get_labels_in_signal_reaches_function=discard
+fun:dfsan_reaches_function_callback=uninstrumented
+fun:dfsan_reaches_function_callback=discard
 
 ###############################################################################
 # glibc
index 6245a41..f0dff9b 100644 (file)
@@ -13,3 +13,9 @@ fun:my_dfsan_conditional_callback=discard
 
 fun:dfsan_set_conditional_callback=uninstrumented
 fun:dfsan_set_conditional_callback=discard
+
+fun:my_dfsan_reaches_function_callback=uninstrumented
+fun:my_dfsan_reaches_function_callback=discard
+
+fun:dfsan_set_reaches_function_callback=uninstrumented
+fun:dfsan_set_reaches_function_callback=discard
diff --git a/compiler-rt/test/dfsan/reaches_function.c b/compiler-rt/test/dfsan/reaches_function.c
new file mode 100644 (file)
index 0000000..46a2b7b
--- /dev/null
@@ -0,0 +1,67 @@
+// RUN: %clang_dfsan -fno-sanitize=dataflow -O2 -fPIE -DCALLBACKS -c %s -o %t-callbacks.o
+// RUN: %clang_dfsan -gmlt -fsanitize-ignorelist=%S/Inputs/flags_abilist.txt -O2 -mllvm -dfsan-reaches-function-callbacks=1 %s %t-callbacks.o -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// RUN: %clang_dfsan -fno-sanitize=dataflow -O2 -fPIE -DCALLBACKS -DORIGIN_TRACKING -c %s -o %t-callbacks.o
+// RUN: %clang_dfsan -gmlt -fsanitize-ignorelist=%S/Inputs/flags_abilist.txt -O2 -mllvm -dfsan-reaches-function-callbacks=1 -mllvm -dfsan-track-origins=2 %s %t-callbacks.o -o %t
+// RUN: %run %t 2>&1 | FileCheck --check-prefix=CHECK-ORIGIN-TRACKING %s
+
+// REQUIRES: x86_64-target-arch
+
+// Tests that callbacks are inserted for reached functions when
+// -dfsan-reaches-function-callbacks is specified.
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <sanitizer/dfsan_interface.h>
+
+#ifdef CALLBACKS
+// Compile this code without DFSan to avoid recursive instrumentation.
+
+void my_dfsan_reaches_function_callback(dfsan_label label, dfsan_origin origin,
+                                        const char *file, unsigned int line,
+                                        const char *function) {
+#ifdef ORIGIN_TRACKING
+  dfsan_print_origin_id_trace(origin);
+#else
+  printf("%s:%d %s\n", file, line, function);
+#endif
+}
+
+#else
+
+__attribute__((noinline)) uint64_t add(uint64_t *a, uint64_t *b) {
+  // The expectations after the return refer to it by relative line offset.
+  return *a + *b;
+  // CHECK: {{.*}}compiler-rt/test/dfsan/reaches_function.c:[[# @LINE - 1]] add.dfsan
+  // CHECK-ORIGIN-TRACKING: Origin value: 0x10000002, Taint value was stored to memory at
+  // CHECK-ORIGIN-TRACKING: #0 {{.*}} in add.dfsan {{.*}}compiler-rt/test/dfsan/reaches_function.c:[[# @LINE - 3]]:{{.*}}
+  // CHECK-ORIGIN-TRACKING: Origin value: 0x1, Taint value was created at
+  // CHECK-ORIGIN-TRACKING: #0 {{.*}} in main {{.*}}compiler-rt/test/dfsan/reaches_function.c:{{.*}}
+}
+
+extern void my_dfsan_reaches_function_callback(dfsan_label label,
+                                               dfsan_origin origin,
+                                               const char *file,
+                                               unsigned int line,
+                                               const char *function);
+
+int main(int argc, char *argv[]) {
+  // Install the callback, taint `a` with label 8, then pass it to add().
+  dfsan_set_reaches_function_callback(my_dfsan_reaches_function_callback);
+
+  uint64_t a = 0;
+  uint64_t b = 0;
+
+  dfsan_set_label(8, &a, sizeof(a));
+  uint64_t c = add(&a, &b);
+  // CHECK: {{.*}}compiler-rt/test/dfsan/reaches_function.c:[[# @LINE - 1]] main
+  // CHECK-ORIGIN-TRACKING: Origin value: 0x10000002, Taint value was stored to memory at
+  // CHECK-ORIGIN-TRACKING: #0 {{.*}} in add.dfsan {{.*}}compiler-rt/test/dfsan/reaches_function.c:{{.*}}
+  // CHECK-ORIGIN-TRACKING: Origin value: 0x1, Taint value was created at
+  // CHECK-ORIGIN-TRACKING: #0 {{.*}} in main {{.*}}compiler-rt/test/dfsan/reaches_function.c:[[# @LINE - 6]]:{{.*}}
+  return c;
+}
+
+#endif // #ifdef CALLBACKS
index fe2a502..b62f150 100644 (file)
@@ -223,6 +223,14 @@ static cl::opt<bool> ClConditionalCallbacks(
     cl::desc("Insert calls to callback functions on conditionals."), cl::Hidden,
     cl::init(false));
 
+// Experimental feature that inserts callbacks for data reaching a function,
+// via function arguments, loads, or call return values.
+// This must be true for dfsan_set_reaches_function_callback() to have effect.
+static cl::opt<bool> ClReachesFunctionCallbacks(
+    "dfsan-reaches-function-callbacks",
+    cl::desc("Insert calls to callback functions on data reaching a function."),
+    cl::Hidden, cl::init(false));
+
 // Controls whether the pass tracks the control flow of select instructions.
 static cl::opt<bool> ClTrackSelectControlFlow(
     "dfsan-track-select-control-flow",
@@ -446,6 +454,8 @@ class DataFlowSanitizer {
   FunctionType *DFSanVarargWrapperFnTy;
   FunctionType *DFSanConditionalCallbackFnTy;
   FunctionType *DFSanConditionalCallbackOriginFnTy;
+  FunctionType *DFSanReachesFunctionCallbackFnTy;
+  FunctionType *DFSanReachesFunctionCallbackOriginFnTy;
   FunctionType *DFSanCmpCallbackFnTy;
   FunctionType *DFSanLoadStoreCallbackFnTy;
   FunctionType *DFSanMemTransferCallbackFnTy;
@@ -467,6 +477,8 @@ class DataFlowSanitizer {
   FunctionCallee DFSanMemTransferCallbackFn;
   FunctionCallee DFSanConditionalCallbackFn;
   FunctionCallee DFSanConditionalCallbackOriginFn;
+  FunctionCallee DFSanReachesFunctionCallbackFn;
+  FunctionCallee DFSanReachesFunctionCallbackOriginFn;
   FunctionCallee DFSanCmpCallbackFn;
   FunctionCallee DFSanChainOriginFn;
   FunctionCallee DFSanChainOriginIfTaintedFn;
@@ -673,6 +685,11 @@ struct DFSanFunction {
   // branch instruction using the given conditional expression.
   void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition);
 
+  // If ClReachesFunctionCallbacks is enabled, insert at IRB's insertion point
+  // a callback reporting the shadow of Data, using I for the debug location.
+  void addReachesFunctionCallbacksIfEnabled(IRBuilder<> &IRB, Instruction &I,
+                                            Value *Data);
+
   bool isLookupTableConstant(Value *P);
 
 private:
@@ -1025,6 +1042,45 @@ void DFSanFunction::addConditionalCallbacksIfEnabled(Instruction &I,
   }
 }
 
+void DFSanFunction::addReachesFunctionCallbacksIfEnabled(IRBuilder<> &IRB,
+                                                         Instruction &I,
+                                                         Value *Data) {
+  if (!ClReachesFunctionCallbacks) {
+    return;
+  }
+  const DebugLoc &dbgloc = I.getDebugLoc();
+  Value *DataShadow = collapseToPrimitiveShadow(getShadow(Data), IRB);
+  ConstantInt *CILine;
+  llvm::Value *FilePathPtr;
+
+  if (dbgloc.get() == nullptr) {
+    CILine = llvm::ConstantInt::get(I.getContext(), llvm::APInt(32, 0, false));
+    FilePathPtr = IRB.CreateGlobalStringPtr(
+        I.getFunction()->getParent()->getSourceFileName());
+  } else {
+    CILine = llvm::ConstantInt::get(I.getContext(),
+                                    llvm::APInt(32, dbgloc.getLine(), false));
+    FilePathPtr =
+        IRB.CreateGlobalStringPtr(dbgloc->getFilename());
+  }
+
+  llvm::Value *FunctionNamePtr =
+      IRB.CreateGlobalStringPtr(I.getFunction()->getName());
+
+  CallInst *CB;
+  std::vector<Value *> args;
+
+  if (DFS.shouldTrackOrigins()) {
+    Value *DataOrigin = getOrigin(Data);
+    args = { DataShadow, DataOrigin, FilePathPtr, CILine, FunctionNamePtr };
+    CB = IRB.CreateCall(DFS.DFSanReachesFunctionCallbackOriginFn, args);
+  } else {
+    args = { DataShadow, FilePathPtr, CILine, FunctionNamePtr };
+    CB = IRB.CreateCall(DFS.DFSanReachesFunctionCallbackFn, args);
+  }
+  CB->setDebugLoc(dbgloc);
+}
+
 Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) {
   if (!OrigTy->isSized())
     return PrimitiveShadowTy;
@@ -1097,6 +1153,16 @@ bool DataFlowSanitizer::initializeModule(Module &M) {
   DFSanConditionalCallbackOriginFnTy = FunctionType::get(
       Type::getVoidTy(*Ctx), DFSanConditionalCallbackOriginArgs,
       /*isVarArg=*/false);
+  Type *DFSanReachesFunctionCallbackArgs[4] = {PrimitiveShadowTy, Int8Ptr,
+                                               OriginTy, Int8Ptr};
+  DFSanReachesFunctionCallbackFnTy =
+      FunctionType::get(Type::getVoidTy(*Ctx), DFSanReachesFunctionCallbackArgs,
+                        /*isVarArg=*/false);
+  Type *DFSanReachesFunctionCallbackOriginArgs[5] = {
+      PrimitiveShadowTy, OriginTy, Int8Ptr, OriginTy, Int8Ptr};
+  DFSanReachesFunctionCallbackOriginFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), DFSanReachesFunctionCallbackOriginArgs,
+      /*isVarArg=*/false);
   DFSanCmpCallbackFnTy =
       FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
                         /*isVarArg=*/false);
@@ -1325,6 +1391,10 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
   DFSanRuntimeFunctions.insert(
       DFSanConditionalCallbackOriginFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
+      DFSanReachesFunctionCallbackFn.getCallee()->stripPointerCasts());
+  DFSanRuntimeFunctions.insert(
+      DFSanReachesFunctionCallbackOriginFn.getCallee()->stripPointerCasts());
+  DFSanRuntimeFunctions.insert(
       DFSanCmpCallbackFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
       DFSanChainOriginFn.getCallee()->stripPointerCasts());
@@ -1357,6 +1427,11 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
   DFSanConditionalCallbackOriginFn =
       Mod->getOrInsertFunction("__dfsan_conditional_callback_origin",
                                DFSanConditionalCallbackOriginFnTy);
+  DFSanReachesFunctionCallbackFn = Mod->getOrInsertFunction(
+      "__dfsan_reaches_function_callback", DFSanReachesFunctionCallbackFnTy);
+  DFSanReachesFunctionCallbackOriginFn =
+      Mod->getOrInsertFunction("__dfsan_reaches_function_callback_origin",
+                               DFSanReachesFunctionCallbackOriginFnTy);
 }
 
 void DataFlowSanitizer::injectMetadataGlobals(Module &M) {
@@ -1585,6 +1660,31 @@ bool DataFlowSanitizer::runImpl(
     DFSanFunction DFSF(*this, F, FnsWithNativeABI.count(F),
                        FnsWithForceZeroLabel.count(F), GetTLI(*F));
 
+    if (ClReachesFunctionCallbacks) {
+      // Add callback for arguments reaching this function.
+      for (auto &FArg : F->args()) {
+        Instruction *Next = &F->getEntryBlock().front();
+        Value *FArgShadow = DFSF.getShadow(&FArg);
+        if (isZeroShadow(FArgShadow))
+          continue;
+        if (Instruction *FArgShadowInst = dyn_cast<Instruction>(FArgShadow)) {
+          Next = FArgShadowInst->getNextNode();
+        }
+        if (shouldTrackOrigins()) {
+          if (Instruction *Origin =
+                  dyn_cast<Instruction>(DFSF.getOrigin(&FArg))) {
+            // Ensure IRB insertion point is after loads for shadow and origin.
+            Instruction *OriginNext = Origin->getNextNode();
+            if (Next->comesBefore(OriginNext)) {
+              Next = OriginNext;
+            }
+          }
+        }
+        IRBuilder<> IRB(Next);
+        DFSF.addReachesFunctionCallbacksIfEnabled(IRB, *Next, &FArg);
+      }
+    }
+
     // DFSanVisitor may create new basic blocks, which confuses df_iterator.
     // Build a copy of the list before iterating over it.
     SmallVector<BasicBlock *, 4> BBList(depth_first(&F->getEntryBlock()));
@@ -2267,6 +2367,7 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
   if (LI.isAtomic())
     LI.setOrdering(addAcquireOrdering(LI.getOrdering()));
 
+  Instruction *AfterLi = LI.getNextNode();
   Instruction *Pos = LI.isAtomic() ? LI.getNextNode() : &LI;
   std::vector<Value *> Shadows;
   std::vector<Value *> Origins;
@@ -2304,6 +2405,9 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
     Value *Addr8 = IRB.CreateBitCast(LI.getPointerOperand(), DFSF.DFS.Int8Ptr);
     IRB.CreateCall(DFSF.DFS.DFSanLoadCallbackFn, {PrimitiveShadow, Addr8});
   }
+
+  IRBuilder<> IRB(AfterLi);
+  DFSF.addReachesFunctionCallbacksIfEnabled(IRB, LI, &LI);
 }
 
 Value *DFSanFunction::updateOriginIfTainted(Value *Shadow, Value *Origin,
@@ -3303,6 +3407,8 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
       DFSF.SkipInsts.insert(LI);
       DFSF.setOrigin(&CB, LI);
     }
+
+    DFSF.addReachesFunctionCallbacksIfEnabled(NextIRB, CB, &CB);
   }
 }
 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/reaches_function.ll b/llvm/test/Instrumentation/DataFlowSanitizer/reaches_function.ll
new file mode 100644 (file)
index 0000000..6546968
--- /dev/null
@@ -0,0 +1,29 @@
+; RUN: opt < %s -passes=dfsan -dfsan-reaches-function-callbacks=1 -S | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare i32 @f()
+
+; Stores its i32 argument to a stack slot.
+define void @store(i32) {
+  ; CHECK-LABEL: define void @store.dfsan
+  ; CHECK: call{{.*}}@__dfsan_reaches_function_callback
+  %i = alloca i32
+  store i32 %0, ptr %i
+  ret void
+}
+
+; Loads an i32 through its pointer argument.
+define void @load(ptr) {
+  ; CHECK-LABEL: define void @load.dfsan
+  ; CHECK: call{{.*}}@__dfsan_reaches_function_callback
+  %load = load i32, ptr %0
+  ret void
+}
+
+define void @call() {
+  ; CHECK-LABEL: define void @call.dfsan
+  ; CHECK: call{{.*}}@__dfsan_reaches_function_callback
+  %ret = call i32 @f()
+  ret void
+}
+
+; CHECK-LABEL: @__dfsan_reaches_function_callback(i8, ptr, i32, ptr)