diff --git a/afl-cmin b/afl-cmin new file mode 100644 index 0000000..147f320 --- /dev/null +++ b/afl-cmin @@ -0,0 +1,461 @@ +#!/usr/bin/env bash +# +# american fuzzy lop - corpus minimization tool +# --------------------------------------------- +# +# Written and maintained by Michal Zalewski +# +# Copyright 2014, 2015 Google LLC All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# This tool tries to find the smallest subset of files in the input directory +# that still trigger the full range of instrumentation data points seen in +# the starting corpus. This has two uses: +# +# - Screening large corpora of input files before using them as a seed for +# afl-fuzz. The tool will remove functionally redundant files and likely +# leave you with a much smaller set. +# +# (In this case, you probably also want to consider running afl-tmin on +# the individual files later on to reduce their size.) +# +# - Minimizing the corpus generated organically by afl-fuzz, perhaps when +# planning to feed it to more resource-intensive tools. The tool achieves +# this by removing all entries that used to trigger unique behaviors in the +# past, but have been made obsolete by later finds. +# +# Note that the tool doesn't modify the files themselves. For that, you want +# afl-tmin. +# +# This script must use bash because other shells may have hardcoded limits on +# array sizes. +# + +echo "corpus minimization tool for afl-fuzz by " +echo + +######### +# SETUP # +######### + +# Process command-line options... + +MEM_LIMIT=100 +TIMEOUT=none + +unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \ + AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE + +while getopts "+i:o:f:m:t:eQC" opt; do + + case "$opt" in + + "i") + IN_DIR="$OPTARG" + ;; + + "o") + OUT_DIR="$OPTARG" + ;; + "f") + STDIN_FILE="$OPTARG" + ;; + "m") + MEM_LIMIT="$OPTARG" + MEM_LIMIT_GIVEN=1 + ;; + "t") + TIMEOUT="$OPTARG" + ;; + "e") + EXTRA_PAR="$EXTRA_PAR -e" + ;; + "C") + export AFL_CMIN_CRASHES_ONLY=1 + ;; + "Q") + EXTRA_PAR="$EXTRA_PAR -Q" + test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250 + QEMU_MODE=1 + ;; + "?") + exit 1 + ;; + + esac + +done + +shift $((OPTIND-1)) + +TARGET_BIN="$1" + +if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then + + cat 1>&2 <<_EOF_ +Usage: $0 [ options ] -- /path/to/target_app [ ... ] + +Required parameters: + + -i dir - input directory with the starting corpus + -o dir - output directory for minimized files + +Execution control settings: + + -f file - location read by the fuzzed program (stdin) + -m megs - memory limit for child process ($MEM_LIMIT MB) + -t msec - run time limit for child process (none) + -Q - use binary-only instrumentation (QEMU mode) + +Minimization settings: + + -C - keep crashing inputs, reject everything else + -e - solve for edge coverage only, ignore hit counts + +For additional tips, please consult docs/README. + +_EOF_ + exit 1 +fi + +# Do a sanity check to discourage the use of /tmp, since we can't really +# handle this safely from a shell script. + +if [ "$AFL_ALLOW_TMP" = "" ]; then + + echo "$IN_DIR" | grep -qE '^(/var)?/tmp/' + T1="$?" + + echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/' + T2="$?" + + echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/' + T3="$?" + + echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/' + T4="$?" + + echo "$PWD" | grep -qE '^(/var)?/tmp/' + T5="$?" + + if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then + echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2 + exit 1 + fi + +fi + +# If @@ is specified, but there's no -f, let's come up with a temporary input +# file name. + +TRACE_DIR="$OUT_DIR/.traces" + +if [ "$STDIN_FILE" = "" ]; then + + if echo "$*" | grep -qF '@@'; then + STDIN_FILE="$TRACE_DIR/.cur_input" + fi + +fi + +# Check for obvious errors. + +if [ ! "$MEM_LIMIT" = "none" ]; then + + if [ "$MEM_LIMIT" -lt "5" ]; then + echo "[-] Error: dangerously low memory limit." 1>&2 + exit 1 + fi + +fi + +if [ ! "$TIMEOUT" = "none" ]; then + + if [ "$TIMEOUT" -lt "10" ]; then + echo "[-] Error: dangerously low timeout." 1>&2 + exit 1 + fi + +fi + +if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then + + TNEW="`which "$TARGET_BIN" 2>/dev/null`" + + if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then + echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2 + exit 1 + fi + + TARGET_BIN="$TNEW" + +fi + +if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" ]; then + + if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then + echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2 + exit 1 + fi + +fi + +if [ ! -d "$IN_DIR" ]; then + echo "[-] Error: directory '$IN_DIR' not found." 1>&2 + exit 1 +fi + +test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue" + +find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null +rm -rf "$TRACE_DIR" 2>/dev/null + +rmdir "$OUT_DIR" 2>/dev/null + +if [ -d "$OUT_DIR" ]; then + echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2 + exit 1 +fi + +mkdir -m 700 -p "$TRACE_DIR" || exit 1 + +if [ ! "$STDIN_FILE" = "" ]; then + rm -f "$STDIN_FILE" || exit 1 + touch "$STDIN_FILE" || exit 1 +fi + +if [ "$AFL_PATH" = "" ]; then + SHOWMAP="${0%/afl-cmin}/afl-showmap" +else + SHOWMAP="$AFL_PATH/afl-showmap" +fi + +if [ ! -x "$SHOWMAP" ]; then + echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2 + rm -rf "$TRACE_DIR" + exit 1 +fi + +IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`)) + +if [ "$IN_COUNT" = "0" ]; then + echo "[+] Hmm, no inputs in the target directory. Nothing to be done." + rm -rf "$TRACE_DIR" + exit 1 +fi + +FIRST_FILE=`ls "$IN_DIR" | head -1` + +# Make sure that we're not dealing with a directory. + +if [ -d "$IN_DIR/$FIRST_FILE" ]; then + echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2 + rm -rf "$TRACE_DIR" + exit 1 +fi + +# Check for the more efficient way to copy files... + +if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then + CP_TOOL=ln +else + CP_TOOL=cp +fi + +# Make sure that we can actually get anything out of afl-showmap before we +# waste too much time. + +echo "[*] Testing the target binary..." + +if [ "$STDIN_FILE" = "" ]; then + + AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE" + +else + + cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE" + AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" &2 + test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" + exit 1 + +fi + +# Let's roll! + +############################# +# STEP 1: COLLECTING TRACES # +############################# + +echo "[*] Obtaining traces for input files in '$IN_DIR'..." + +( + + CUR=0 + + if [ "$STDIN_FILE" = "" ]; then + + while read -r fn; do + + CUR=$((CUR+1)) + printf "\\r Processing file $CUR/$IN_COUNT... " + + "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn" + + done < <(ls "$IN_DIR") + + else + + while read -r fn; do + + CUR=$((CUR+1)) + printf "\\r Processing file $CUR/$IN_COUNT... " + + cp "$IN_DIR/$fn" "$STDIN_FILE" + + "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" "$TRACE_DIR/.all_uniq" + +TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`)) + +echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files." + +##################################### +# STEP 3: SELECTING CANDIDATE FILES # +##################################### + +# The next step is to find the best candidate for each tuple. The "best" +# part is understood simply as the smallest input that includes a particular +# tuple in its trace. Empirical evidence suggests that this produces smaller +# datasets than more involved algorithms that could be still pulled off in +# a shell script. + +echo "[*] Finding best candidates for each tuple..." + +CUR=0 + +while read -r fn; do + + CUR=$((CUR+1)) + printf "\\r Processing file $CUR/$IN_COUNT... " + + sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list" + +done < <(ls -rS "$IN_DIR") + +echo + +############################## +# STEP 4: LOADING CANDIDATES # +############################## + +# At this point, we have a file of tuple-file pairs, sorted by file size +# in ascending order (as a consequence of ls -rS). By doing sort keyed +# only by tuple (-k 1,1) and configured to output only the first line for +# every key (-s -u), we end up with the smallest file for each tuple. + +echo "[*] Sorting candidate list (be patient)..." + +sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \ + sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script" + +if [ ! -s "$TRACE_DIR/.candidate_script" ]; then + echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2 + test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" + exit 1 +fi + +# The sed command converted the sorted list to a shell script that populates +# BEST_FILE[tuple]="fname". Let's load that! + +. "$TRACE_DIR/.candidate_script" + +########################## +# STEP 5: WRITING OUTPUT # +########################## + +# The final trick is to grab the top pick for each tuple, unless said tuple is +# already set due to the inclusion of an earlier candidate; and then put all +# tuples associated with the newly-added file to the "already have" list. The +# loop works from least popular tuples and toward the most common ones. + +echo "[*] Processing candidates and writing output files..." + +CUR=0 + +touch "$TRACE_DIR/.already_have" + +while read -r cnt tuple; do + + CUR=$((CUR+1)) + printf "\\r Processing tuple $CUR/$TUPLE_COUNT... " + + # If we already have this tuple, skip it. + + grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue + + FN=${BEST_FILE[tuple]} + + $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN" + + if [ "$((CUR % 5))" = "0" ]; then + sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp" + mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have" + else + cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have" + fi + +done <"$TRACE_DIR/.all_uniq" + +echo + +OUT_COUNT=`ls -- "$OUT_DIR" | wc -l` + +if [ "$OUT_COUNT" = "1" ]; then + echo "[!] WARNING: All test cases had the same traces, check syntax!" +fi + +echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'." +echo + +test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" + +exit 0