You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							462 lines
						
					
					
						
							12 KiB
						
					
					
				
			
		
		
	
	
							462 lines
						
					
					
						
							12 KiB
						
					
					
#!/usr/bin/env bash
#
# american fuzzy lop - corpus minimization tool
# ---------------------------------------------
#
# Written and maintained by Michal Zalewski <lcamtuf@google.com>
#
# Copyright 2014, 2015 Google LLC All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#

echo "corpus minimization tool for afl-fuzz by <lcamtuf@google.com>"
echo

#########
# SETUP #
#########

# Process command-line options...

#######################################
# Parse the command-line switches of this script.
# Globals written:
#   MEM_LIMIT (default 100), TIMEOUT (default "none"), IN_DIR, OUT_DIR,
#   STDIN_FILE, EXTRA_PAR, MEM_LIMIT_GIVEN, QEMU_MODE,
#   AFL_CMIN_CRASHES_ONLY (exported by -C), OPTIND (left by getopts at the
#   index of the first non-option argument).
# Arguments:
#   The script's argument vector ("$@").
# Returns:
#   0 on success; exits the script with status 1 when getopts reports "?".
#######################################
parse_options() {

  local opt

  MEM_LIMIT=100
  TIMEOUT=none

  # Start from a clean slate so values inherited from the environment cannot
  # leak into this run.
  unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
    AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE

  while getopts "+i:o:f:m:t:eQC" opt; do

    case "$opt" in

      i) IN_DIR="$OPTARG" ;;
      o) OUT_DIR="$OPTARG" ;;
      f) STDIN_FILE="$OPTARG" ;;

      m)
        MEM_LIMIT="$OPTARG"
        MEM_LIMIT_GIVEN=1
        ;;

      t) TIMEOUT="$OPTARG" ;;

      # -e and -Q are collected in EXTRA_PAR, which later sections expand
      # unquoted on the afl-showmap command line.
      e) EXTRA_PAR="$EXTRA_PAR -e" ;;

      C) export AFL_CMIN_CRASHES_ONLY=1 ;;

      Q)
        EXTRA_PAR="$EXTRA_PAR -Q"
        # Raise the default limit to 250 unless -m was already seen; a later
        # -m still overrides this.
        [[ -z "$MEM_LIMIT_GIVEN" ]] && MEM_LIMIT=250
        QEMU_MODE=1
        ;;

      "?") exit 1 ;;

    esac

  done

}

parse_options "$@"

# Drop the parsed switches; what remains is the target command line.
shift $((OPTIND-1))

TARGET_BIN="$1"

# All three of target binary, input directory and output directory are
# mandatory; print the usage text to stderr and bail out if any is missing.
# [[ ... || ... ]] is used instead of the obsolescent "[ a -o b ]" form, which
# can misparse operands that look like operators (e.g. "!" or "(").

if [[ -z "$TARGET_BIN" || -z "$IN_DIR" || -z "$OUT_DIR" ]]; then

  # Unquoted delimiter on purpose: $0 and $MEM_LIMIT are expanded in the text.
  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -Q            - use binary-only instrumentation (QEMU mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult docs/README.

_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script. Setting AFL_ALLOW_TMP to any
# non-empty value skips the check.

if [[ -z "$AFL_ALLOW_TMP" ]]; then

  # Match the temporary directory itself as well as anything beneath it.
  # $PWD never carries a trailing slash, so anchoring on "/tmp/" alone would
  # let a run started directly inside /tmp or /var/tmp slip through.
  tmp_re='^(/var)?/tmp(/|$)'

  for tmp_candidate in "$IN_DIR" "$TARGET_BIN" "$OUT_DIR" "$STDIN_FILE" "$PWD"; do

    if [[ "$tmp_candidate" =~ $tmp_re ]]; then
      echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
      exit 1
    fi

  done

  unset tmp_re tmp_candidate

fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

# Scratch directory for per-input traces and intermediate lists; it lives
# inside the output directory.
TRACE_DIR="$OUT_DIR/.traces"

# "$*" is the target command line joined into one string; a builtin substring
# match on it avoids spawning echo and grep.
if [[ -z "$STDIN_FILE" && "$*" == *@@* ]]; then
  STDIN_FILE="$TRACE_DIR/.cur_input"
fi

# Check for obvious errors.

if [[ "$MEM_LIMIT" != "none" ]]; then

  # "[" is kept for the numeric comparison on purpose: it compares decimal
  # strings, whereas "[[ -lt ]]" would evaluate the operand as an arithmetic
  # expression and treat non-numeric values differently.
  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [[ "$TIMEOUT" != "none" ]]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

# The target must be an executable regular file. If the name as given is not,
# fall back to a PATH lookup. "type -P" is a bash builtin that reports only
# on-disk executables, so this no longer depends on an external "which".

if [[ ! -f "$TARGET_BIN" || ! -x "$TARGET_BIN" ]]; then

  TNEW="$(type -P -- "$TARGET_BIN" 2>/dev/null)"

  if [[ ! -f "$TNEW" || ! -x "$TNEW" ]]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi

# Unless AFL_SKIP_BIN_CHECK is set or QEMU mode was requested, require the
# marker string "__AFL_SHM_ID" to be present in the binary.

if [[ -z "$AFL_SKIP_BIN_CHECK" && -z "$QEMU_MODE" ]]; then

  if ! grep -qF -- "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [[ ! -d "$IN_DIR" ]]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# When the input directory has a queue/ subdirectory, read the inputs from
# there instead (NOTE(review): presumably so an afl-fuzz output directory can
# be passed directly - confirm).
[[ -d "$IN_DIR/queue" ]] && IN_DIR="$IN_DIR/queue"

# Clear leftovers of an earlier run: top-level id:* / id_* entries and the
# trace directory. Errors are silenced because the output directory may not
# exist yet. -maxdepth goes before the tests, as find expects for options
# that affect the whole traversal.
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} + 2>/dev/null
rm -rf -- "$TRACE_DIR" 2>/dev/null

# rmdir only succeeds on an empty directory, so if the directory still exists
# afterwards it holds files we did not create.
rmdir -- "$OUT_DIR" 2>/dev/null

if [[ -d "$OUT_DIR" ]]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

# -p also creates the output directory itself; -m 700 applies to the trace
# directory.
mkdir -m 700 -p -- "$TRACE_DIR" || exit 1

# Make sure the file the target will read from can be (re)created.
if [[ -n "$STDIN_FILE" ]]; then
  rm -f -- "$STDIN_FILE" || exit 1
  touch -- "$STDIN_FILE" || exit 1
fi

# Locate afl-showmap: in $AFL_PATH when set, otherwise next to this script.
# The directory part of $0 is used rather than stripping a literal
# "/afl-cmin" suffix, so the lookup also works when the script is invoked
# under another name or without a path component.

if [[ -z "$AFL_PATH" ]]; then

  case "$0" in
    */*) SHOWMAP="${0%/*}/afl-showmap" ;;
    *)   SHOWMAP="./afl-showmap" ;;
  esac

else

  SHOWMAP="$AFL_PATH/afl-showmap"

fi

if [[ ! -x "$SHOWMAP" ]]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf -- "$TRACE_DIR"
  exit 1
fi

# Count the entries of the input directory. Wrapping the result in $(( ))
# normalises whatever padding wc emits into a plain integer.
IN_COUNT=$(( $(ls -- "$IN_DIR" 2>/dev/null | wc -l) ))

if [[ "$IN_COUNT" = "0" ]]; then
  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf -- "$TRACE_DIR"
  exit 1
fi

# The first entry is used below as a probe (directory check, link test) and
# by the target smoke test.
FIRST_FILE=$(ls -- "$IN_DIR" | head -n 1)

# Make sure that we're not dealing with a directory.

if [[ -d "$IN_DIR/$FIRST_FILE" ]]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf -- "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...
# If a hard link from the input directory into the trace directory works,
# use ln for the output files later on; otherwise fall back to cp.

if ln -- "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

# Arguments shared by both invocation modes. $EXTRA_PAR is expanded unquoted
# on purpose: it holds zero or more space-separated switches (-e, -Q).
# shellcheck disable=SC2206
showmap_test_args=(-m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR)

if [[ -z "$STDIN_FILE" ]]; then

  # No input file configured: feed the first test case on stdin.
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" "${showmap_test_args[@]}" -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  # Input file configured: place the test case there and pass it via -A.
  cp -- "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" "${showmap_test_args[@]}" -A "$STDIN_FILE" -- "$@" </dev/null

fi

unset showmap_test_args

# Number of non-empty lines in the trace; a missing file yields 0 because the
# empty substitution evaluates to 0 inside $(( )).
FIRST_COUNT=$(( $(grep -c . -- "$TRACE_DIR/.run_test") ))

if [[ "$FIRST_COUNT" -gt "0" ]]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  # Keep the trace directory for inspection only when AFL_KEEP_TRACES is set.
  test "$AFL_KEEP_TRACES" = "" && rm -rf -- "$TRACE_DIR"
  exit 1

fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

# Runs in a subshell, so CUR and fn do not leak into the rest of the script.
# Each input gets its own trace file, named after the input, in $TRACE_DIR.

(

  CUR=0

  # IFS= keeps leading/trailing whitespace in file names intact; without it
  # such inputs would be looked up under a truncated name.
  while IFS= read -r fn; do

    CUR=$((CUR+1))
    printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

    # $EXTRA_PAR is expanded unquoted on purpose (zero or more switches).
    if [[ -z "$STDIN_FILE" ]]; then

      # The test case is delivered on the target's stdin.
      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

    else

      # The test case is copied to the configured input file and passed
      # with -A.
      cp -- "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

    fi

  done < <(ls -- "$IN_DIR")

)

echo

##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

# Concatenate every per-input trace, then count how many traces contain each
# tuple ("uniq -c") and order the result from rarest to most common. The trace
# paths are built by the shell itself rather than by a sed substitution, so a
# trace directory containing "#", "&" or "\" cannot break the command.

while IFS= read -r fn; do
  cat -- "$TRACE_DIR/$fn"
done < <(ls -- "$IN_DIR") | \
  sort | uniq -c | sort -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$(( $(grep -c . -- "$TRACE_DIR/.all_uniq") ))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."

#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

#######################################
# Print every line of a trace file followed by a space and a file name.
# Arguments:
#   $1 - file name to append (used verbatim, whatever characters it contains)
#   $2 - path of the trace file to read
# Outputs:
#   "<trace line> <file name>" lines on stdout
#######################################
tag_trace_lines() {
  # The name travels through the environment so awk does not interpret
  # backslashes in it, and the trace is fed on stdin so its path can never be
  # mistaken for an awk "var=value" operand.
  CMIN_TAG_NAME="$1" awk '{ print $0 " " ENVIRON["CMIN_TAG_NAME"] }' <"$2"
}

echo "[*] Finding best candidates for each tuple..."

CUR=0

# ls -rS lists the inputs by size in ascending order, so for every tuple the
# first pair appended to the list belongs to the smallest file.

while IFS= read -r fn; do

  CUR=$((CUR+1))
  printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

  tag_trace_lines "$fn" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done < <(ls -rS -- "$IN_DIR")

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

# Turn each "tuple fname" pair into a BEST_FILE[tuple]=fname assignment.
# The file name is everything after the first space and is emitted with
# printf %q, so names containing quotes, "$", backticks or backslashes are
# loaded verbatim instead of being interpreted when the script is sourced.
# Lines whose key is not a plain number are dropped, which keeps anything
# unexpected out of the array subscript.

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  while IFS= read -r pair; do
    tuple_key=${pair%% *}
    [[ "$tuple_key" =~ ^[0-9]+$ ]] || continue
    printf 'BEST_FILE[%s]=%q\n' "$tuple_key" "${pair#* }"
  done >"$TRACE_DIR/.candidate_script"

if [[ ! -s "$TRACE_DIR/.candidate_script" ]]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf -- "$TRACE_DIR"
  exit 1
fi

# The loop above converted the sorted list to a shell script that populates
# BEST_FILE[tuple]=fname. Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch -- "$TRACE_DIR/.already_have"

# Each line of .all_uniq is "<count> <tuple>" as written by "uniq -c"; read
# strips the leading padding. The count itself is not needed here.

while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf '\r    Processing tuple %s/%s... ' "$CUR" "$TUPLE_COUNT"

  # If we already have this tuple, skip it.

  grep -qxF -- "$tuple" "$TRACE_DIR/.already_have" && continue

  # The subscript of an indexed array is evaluated arithmetically, so the
  # bare name "tuple" resolves to the variable's numeric value.
  FN=${BEST_FILE[tuple]}

  # No candidate recorded for this tuple: skip it. Going on with an empty
  # name would point the commands below at the directories themselves, and a
  # failed "sort -u" would replace .already_have with an empty file.
  [[ -n "$FN" ]] || continue

  "$CP_TOOL" -- "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Record every tuple of the chosen file. On every fifth tuple the list is
  # rewritten through "sort -u" to drop duplicates; otherwise the trace is
  # simply appended.
  if (( CUR % 5 == 0 )); then
    sort -u -- "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f -- "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat -- "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

# Normalise the count through $(( )), as is done for IN_COUNT, so that padded
# "wc -l" output still compares equal to "1" and prints cleanly.
OUT_COUNT=$(( $(ls -- "$OUT_DIR" | wc -l) ))

if [[ "$OUT_COUNT" = "1" ]]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

# Keep the trace directory for inspection only when AFL_KEEP_TRACES is set.
test "$AFL_KEEP_TRACES" = "" && rm -rf -- "$TRACE_DIR"

exit 0