You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1228 lines
40 KiB

/*
* Copyright 2002-2019 Intel Corporation.
*
* This software is provided to you as Sample Source Code as defined in the accompanying
* End User License Agreement for the Intel(R) Software Development Products ("Agreement")
* section 1.L.
*
* This software and the related documents are provided as is, with no express or implied
* warranties, other than those that are expressly stated in the License.
*/
/* ===================================================================== */
/*! @file This file contains a static and dynamic opcode/ISA extension/ISA
* category mix profiler
*
* This is derived from mix.cpp. Handles an arbitrary number of threads
* using TLS for data storage and avoids locking, except during I/O.
*/
#include <vector>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <map>
#include <unistd.h>
#include "pin.H"
#include "instlib.H"
using namespace INSTLIB;
// key for accessing TLS storage in the threads. initialized once in main()
static TLS_KEY tls_key;
typedef UINT32 stat_index_t;
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
static string disassemble(UINT64 start, UINT64 stop);
#endif
/* ===================================================================== */
/* Commandline Switches */
/* ===================================================================== */
// -o <file>: name of the profile output file (default "mix.out").
KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool",
"o", "mix.out", "specify profile file name");
// -top_blocks <n>: upper bound on the number of hottest blocks listed by
// emit_bbl_stats_sorted() (default 20).
KNOB<UINT32> KnobTopBlocks(KNOB_MODE_WRITEONCE, "pintool",
"top_blocks", "20", "specify a maximal number of top blocks for which icounts are printed");
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
// -disas: also print a XED disassembly for each listed top block (x86 targets only).
KNOB<BOOL> KnobShowDisassembly(KNOB_MODE_WRITEONCE, "pintool",
"disas", "0", "Show disassembly for top blocks");
#endif
// -i: append ".<pid>" to the output file name (see main()).
KNOB<BOOL> KnobPid(KNOB_MODE_WRITEONCE, "pintool",
"i", "0", "append pid to output");
// -p: insert a per-instruction predicated call so that predicated instructions
// that actually execute are counted separately (see Trace()).
KNOB<BOOL> KnobProfilePredicated(KNOB_MODE_WRITEONCE, "pintool",
"p", "0", "enable accurate profiling for predicated instructions");
// -s: declared here but not referenced in the visible portion of this file.
KNOB<BOOL> KnobProfileStaticOnly(KNOB_MODE_WRITEONCE, "pintool",
"s", "0", "terminate after collection of static profile for main image");
// -d: defaults to 0, except when TARGET_WINDOWS is defined where it defaults to 1.
// Not referenced in the visible portion of this file.
#ifndef TARGET_WINDOWS
KNOB<BOOL> KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool",
"d", "0", "Only collect dynamic profile");
#else
KNOB<BOOL> KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool",
"d", "1", "Only collect dynamic profile");
#endif
// -no_shared_libs: declared here but not referenced in the visible portion of this file.
KNOB<BOOL> KnobNoSharedLibs(KNOB_MODE_WRITEONCE, "pintool",
"no_shared_libs", "0", "do not instrument shared libraries");
// The next three knobs select what is histogrammed instead of the default
// (opcode); main() maps them onto 'measurement' below.
KNOB<BOOL> KnobInstructionLengthMix(KNOB_MODE_WRITEONCE, "pintool","ilen", "0", "Compute instruction length mix");
KNOB<BOOL> KnobCategoryMix(KNOB_MODE_WRITEONCE, "pintool", "category", "0", "Compute ISA category mix");
KNOB<BOOL> KnobIformMix(KNOB_MODE_WRITEONCE, "pintool", "iform", "0", "Compute ISA iform mix");
// -mapaddr: emit MAPADDR (file/line) records at instrumentation time and
// BLOCKCOUNT records when stats are emitted.
KNOB<BOOL> KnobMapToFile(KNOB_MODE_WRITEONCE, "pintool", "mapaddr", "0", "Map Addresses to File/Line information");
// Which per-instruction property is used as the histogram key.
typedef enum { measure_opcode=0, measure_category=1, measure_ilen=2, measure_iform=3 } measurement_t;
// Active measurement mode; opcode unless main() overrides it from the knobs above.
measurement_t measurement = measure_opcode;
/* ===================================================================== */
/*!
 * Write the tool description, the knob summary and the measurement-mode
 * rules to stderr.
 * @return always -1, so main() can use it directly as its exit status.
 */
INT32 Usage()
{
    cerr << "This pin tool computes a static and dynamic opcode, "
         << "instruction form, instruction length, extension or category mix profile\n\n"
         << KNOB_BASE::StringKnobSummary()
         << endl;
    cerr << "The default is to do opcode and ISA extension profileing" << endl
         << "At most one of -iform, -ilen or -category is allowed" << endl
         << endl;
    return -1;
}
/* ===================================================================== */
/* INDEX HELPERS */
/* ===================================================================== */
// Layout of the "special" statistic rows. Indices below INDEX_SPECIAL are the
// raw values returned by INS_GetIndex(); indices in
// [INDEX_SPECIAL, INDEX_SPECIAL_END) are synthetic rows named by IndexToString().
const UINT32 INDEX_SPECIAL = 3000;
// Number of distinct byte sizes that get their own mem-read / mem-write row.
const UINT32 MAX_MEM_SIZE = 520;
// Number of rows reserved for ISA extensions.
const UINT32 MAX_EXTENSION = 50;
const UINT32 INDEX_TOTAL = INDEX_SPECIAL + 0;
const UINT32 INDEX_MEM_ATOMIC = INDEX_SPECIAL + 1;
const UINT32 INDEX_STACK_READ = INDEX_SPECIAL + 2;
const UINT32 INDEX_STACK_WRITE = INDEX_SPECIAL + 3;
const UINT32 INDEX_IPREL_READ = INDEX_SPECIAL + 4;
const UINT32 INDEX_IPREL_WRITE = INDEX_SPECIAL + 5;
// Three consecutive ranges: MAX_MEM_SIZE read-size rows, MAX_MEM_SIZE
// write-size rows, then MAX_EXTENSION extension rows.
const UINT32 INDEX_MEM_READ_SIZE = INDEX_SPECIAL + 6;
const UINT32 INDEX_MEM_WRITE_SIZE = INDEX_SPECIAL + 6 + MAX_MEM_SIZE;
const UINT32 INDEX_EXTENSION = INDEX_SPECIAL + 6 + 2*MAX_MEM_SIZE;
// FMA-family rows start right after the extension range. INDEX_FMA_BASE itself
// and INDEX_FMA_BASE + 6 have no name in IndexToString().
const UINT32 INDEX_FMA_BASE = INDEX_EXTENSION + MAX_EXTENSION;
const UINT32 INDEX_FMA = INDEX_FMA_BASE + 1;
const UINT32 INDEX_FMA_ADD = INDEX_FMA_BASE + 2;
const UINT32 INDEX_FMA_MUL = INDEX_FMA_BASE + 3;
const UINT32 INDEX_FMA_S = INDEX_FMA_BASE + 4;
const UINT32 INDEX_FMA_S_ADD = INDEX_FMA_BASE + 5; // NOTE: skipped 6. does not matter
const UINT32 INDEX_FMA_S_MUL = INDEX_FMA_BASE + 7;
const UINT32 INDEX_FMA_D = INDEX_FMA_BASE + 8;
const UINT32 INDEX_FMA_D_ADD = INDEX_FMA_BASE + 9;
const UINT32 INDEX_FMA_D_MUL = INDEX_FMA_BASE + 10;
const UINT32 INDEX_FPMA = INDEX_FMA_BASE + 11;
const UINT32 INDEX_FPMA_ADD = INDEX_FMA_BASE + 12;
const UINT32 INDEX_FPMA_MUL = INDEX_FMA_BASE + 13;
const UINT32 INDEX_FMS = INDEX_FMA_BASE + 14;
const UINT32 INDEX_FMS_SUB = INDEX_FMA_BASE + 15;
const UINT32 INDEX_FMS_MUL = INDEX_FMA_BASE + 16;
const UINT32 INDEX_FMS_S = INDEX_FMA_BASE + 17;
const UINT32 INDEX_FMS_S_SUB = INDEX_FMA_BASE + 18;
const UINT32 INDEX_FMS_S_MUL = INDEX_FMA_BASE + 19;
const UINT32 INDEX_FMS_D = INDEX_FMA_BASE + 20;
const UINT32 INDEX_FMS_D_SUB = INDEX_FMA_BASE + 21;
const UINT32 INDEX_FMS_D_MUL = INDEX_FMA_BASE + 22;
const UINT32 INDEX_FPMS = INDEX_FMA_BASE + 23;
const UINT32 INDEX_FPMS_SUB = INDEX_FMA_BASE + 24;
const UINT32 INDEX_FPMS_MUL = INDEX_FMA_BASE + 25;
const UINT32 INDEX_FNMA = INDEX_FMA_BASE + 26;
const UINT32 INDEX_FNMA_ADD = INDEX_FMA_BASE + 27;
const UINT32 INDEX_FNMA_MUL = INDEX_FMA_BASE + 28;
const UINT32 INDEX_FNMA_S = INDEX_FMA_BASE + 29;
const UINT32 INDEX_FNMA_S_ADD = INDEX_FMA_BASE + 30;
const UINT32 INDEX_FNMA_S_MUL = INDEX_FMA_BASE + 31;
const UINT32 INDEX_FNMA_D = INDEX_FMA_BASE + 32;
const UINT32 INDEX_FNMA_D_ADD = INDEX_FMA_BASE + 33;
const UINT32 INDEX_FNMA_D_MUL = INDEX_FMA_BASE + 34;
const UINT32 INDEX_FPNMA = INDEX_FMA_BASE + 35;
const UINT32 INDEX_FPNMA_ADD = INDEX_FMA_BASE + 36;
const UINT32 INDEX_FPNMA_MUL = INDEX_FMA_BASE + 37;
// One past the last special row.
const UINT32 INDEX_SPECIAL_END = INDEX_FMA_BASE + 38;
/*! @return true when @a i lies in the memory-read-size row range
 *  [INDEX_MEM_READ_SIZE, INDEX_MEM_READ_SIZE + MAX_MEM_SIZE). */
BOOL IsMemReadIndex(UINT32 i)
{
    if (i < INDEX_MEM_READ_SIZE)
        return FALSE;
    return (i - INDEX_MEM_READ_SIZE) < MAX_MEM_SIZE;
}
/*! @return true when @a i lies in the memory-write-size row range
 *  [INDEX_MEM_WRITE_SIZE, INDEX_MEM_WRITE_SIZE + MAX_MEM_SIZE). */
BOOL IsMemWriteIndex(UINT32 i)
{
    if (i < INDEX_MEM_WRITE_SIZE)
        return FALSE;
    return (i - INDEX_MEM_WRITE_SIZE) < MAX_MEM_SIZE;
}
/* ===================================================================== */
/*!
 * Map an instruction to its primary histogram key according to the active
 * 'measurement' mode: opcode, encoded size, category, or (x86 only) XED iform.
 * @return the key, or 0 when the mode is iform on a non-x86 target.
 */
LOCALFUN UINT32 INS_GetIndex(INS ins)
{
    if (measurement == measure_opcode)
        return INS_Opcode(ins);
    if (measurement == measure_ilen)
        return INS_Size(ins);
    if (measurement == measure_category)
        return INS_Category(ins);
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
    if (measurement == measure_iform)
    {
        xed_decoded_inst_t* const decoded = INS_XedDec(ins);
        const xed_iform_enum_t form = xed_decoded_inst_get_iform_enum(decoded);
        return static_cast<UINT32>(form);
    }
#endif
    return 0;
}
/* ===================================================================== */
// Stub: FMA classification is not implemented in this file, so every
// instruction is reported as non-FMA. The argument is ignored.
LOCALFUN BOOL INS_IsFMA(INS ins)
{
return FALSE;
}
/* ===================================================================== */
/*!
 * Count how many stat-index slots the instructions of @a bbl will need,
 * excluding the terminating 0. Must stay in step with what
 * INS_GenerateIndexString() / INS_GenerateIndexFMA() actually emit, because
 * Trace() asserts that the two agree.
 * @param bbl                    basic block to measure
 * @param memory_access_profile  when set (and measuring opcodes) also reserve
 *                               slots for the memory-related rows
 */
LOCALFUN UINT32 IndexStringLength(BBL bbl, BOOL memory_access_profile)
{
    // One slot for the instruction itself, plus one for its ISA extension
    // unless iforms are being measured.
    const UINT32 slots_per_ins = (measurement != measure_iform) ? 2 : 1;
    const BOOL detailed = (measurement == measure_opcode && memory_access_profile);

    UINT32 total = 0;
    for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins))
    {
        total += slots_per_ins;
        if (!detailed)
            continue;
        total += INS_IsMemoryRead(ins)   ? 1 : 0;  // read size row
        total += INS_IsStackRead(ins)    ? 1 : 0;
        total += INS_IsIpRelRead(ins)    ? 1 : 0;
        total += INS_IsMemoryWrite(ins)  ? 1 : 0;  // write size row
        total += INS_IsStackWrite(ins)   ? 1 : 0;
        total += INS_IsIpRelWrite(ins)   ? 1 : 0;
        total += INS_IsAtomicUpdate(ins) ? 1 : 0;
        total += INS_IsFMA(ins)          ? 1 : 0;
    }
    return total;
}
/* ===================================================================== */
/*!
 * Translate a memory access size in bytes into its statistic row.
 *
 * Only MAX_MEM_SIZE rows exist per direction. A size at or above that limit
 * used to run past its own range and alias an unrelated row (read sizes
 * landing in the write rows, write sizes landing in the extension rows).
 * Such sizes are now folded into the last bucket of the correct range.
 *
 * @param size   access size in bytes
 * @param write  non-zero for a write, zero for a read
 * @return an index inside the read-size or write-size row range
 */
LOCALFUN UINT32 MemsizeToIndex(UINT32 size, BOOL write)
{
    const UINT32 range_start = write ? INDEX_MEM_WRITE_SIZE : INDEX_MEM_READ_SIZE;
    const UINT32 bucket = (size < MAX_MEM_SIZE) ? size : (MAX_MEM_SIZE - 1);
    return range_start + bucket;
}
// Stub matching INS_IsFMA(): emits no FMA rows and returns the write cursor
// 'stats' unchanged. The instruction argument is ignored.
LOCALFUN stat_index_t *INS_GenerateIndexFMA(INS ins, stat_index_t *stats)
{
return stats;
}
/* ===================================================================== */
/*!
 * Append the statistic rows touched by one instruction to a stat-index array.
 * @param ins                    instruction being summarized
 * @param stats                  write cursor into the block's index array
 * @param memory_access_profile  when set (and measuring opcodes) also append
 *                               the memory-size, atomic, stack and IP-relative rows
 * @return the cursor advanced past everything that was written
 */
LOCALFUN stat_index_t* INS_GenerateIndexString(INS ins, stat_index_t *stats, BOOL memory_access_profile)
{
    stat_index_t* cursor = stats;

    *cursor = INS_GetIndex(ins);
    ++cursor;

    if (measurement != measure_iform)
    {
        *cursor = INS_Extension(ins) + INDEX_EXTENSION;
        ++cursor;
    }

    if (measurement != measure_opcode || !memory_access_profile)
        return cursor;

    if (INS_IsMemoryRead(ins))
        *cursor++ = MemsizeToIndex(INS_MemoryReadSize(ins), 0);
    if (INS_IsMemoryWrite(ins))
        *cursor++ = MemsizeToIndex(INS_MemoryWriteSize(ins), 1);
    if (INS_IsAtomicUpdate(ins))
        *cursor++ = INDEX_MEM_ATOMIC;
    if (INS_IsStackRead(ins))
        *cursor++ = INDEX_STACK_READ;
    if (INS_IsStackWrite(ins))
        *cursor++ = INDEX_STACK_WRITE;
    if (INS_IsIpRelRead(ins))
        *cursor++ = INDEX_IPREL_READ;
    if (INS_IsIpRelWrite(ins))
        *cursor++ = INDEX_IPREL_WRITE;
    return cursor;
}
/* ===================================================================== */
/*!
 * Produce the printable name of a statistic row.
 *
 * In iform mode every index is an XED iform. Otherwise indices inside
 * [INDEX_SPECIAL, INDEX_SPECIAL_END) are synthetic rows (names start with '*')
 * and everything else is named according to the active measurement mode.
 * Asserts on a special index that has no name.
 */
LOCALFUN string IndexToString( UINT32 index )
{
    if (measurement == measure_iform)
    {
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
        return xed_iform_enum_t2str(static_cast<xed_iform_enum_t>(index));
#else
        return "???";
#endif
    }

    // Ordinary (non-special) rows: name depends on what is being measured.
    if (index < INDEX_SPECIAL || index >= INDEX_SPECIAL_END)
    {
        switch (measurement)
        {
          case measure_ilen:
          {
              ostringstream s;
              s << "ILEN-" << index;
              return s.str();
          }
          case measure_opcode:
              return OPCODE_StringShort(index);
          case measure_category:
              return CATEGORY_StringShort(index);
          default:
              break;
        }
        ASSERTX(0);
        return "";
    }

    // Special rows that are defined by a range rather than a single value.
    if (IsMemReadIndex(index))
        return "*mem-read-" + decstr( index - INDEX_MEM_READ_SIZE );
    if (IsMemWriteIndex(index))
        return "*mem-write-" + decstr( index - INDEX_MEM_WRITE_SIZE );
    if (INDEX_EXTENSION <= index && index < INDEX_EXTENSION + MAX_EXTENSION)
        return "*isa-ext-" + EXTENSION_StringShort(index - INDEX_EXTENSION);

    // Special rows identified by one exact index.
    static const struct { UINT32 id; const char* label; } named_rows[] =
    {
        { INDEX_TOTAL,       "*total" },
        { INDEX_MEM_ATOMIC,  "*mem-atomic" },
        { INDEX_STACK_READ,  "*stack-read" },
        { INDEX_STACK_WRITE, "*stack-write" },
        { INDEX_IPREL_READ,  "*iprel-read" },
        { INDEX_IPREL_WRITE, "*iprel-write" },
        { INDEX_FMA,         "*FMA" },
        { INDEX_FMA_ADD,     "*FMA_ADD" },
        { INDEX_FMA_MUL,     "*FMA_MUL" },
        { INDEX_FMA_S,       "*FMA_S" },
        { INDEX_FMA_S_ADD,   "*FMA_S_ADD" },
        { INDEX_FMA_S_MUL,   "*FMA_S_MUL" },
        { INDEX_FMA_D,       "*FMA_D" },
        { INDEX_FMA_D_ADD,   "*FMA_D_ADD" },
        { INDEX_FMA_D_MUL,   "*FMA_D_MUL" },
        { INDEX_FPMA,        "*FPMA" },
        { INDEX_FPMA_ADD,    "*FPMA_ADD" },
        { INDEX_FPMA_MUL,    "*FPMA_MUL" },
        { INDEX_FMS,         "*FMS" },
        { INDEX_FMS_SUB,     "*FMS_SUB" },
        { INDEX_FMS_MUL,     "*FMS_MUL" },
        { INDEX_FMS_S,       "*FMS_S" },
        { INDEX_FMS_S_SUB,   "*FMS_S_SUB" },
        { INDEX_FMS_S_MUL,   "*FMS_S_MUL" },
        { INDEX_FMS_D,       "*FMS_D" },
        { INDEX_FMS_D_SUB,   "*FMS_D_SUB" },
        { INDEX_FMS_D_MUL,   "*FMS_D_MUL" },
        { INDEX_FPMS,        "*FPMS" },
        { INDEX_FPMS_SUB,    "*FPMS_SUB" },
        { INDEX_FPMS_MUL,    "*FPMS_MUL" },
        { INDEX_FNMA,        "*FNMA" },
        { INDEX_FNMA_ADD,    "*FNMA_ADD" },
        { INDEX_FNMA_MUL,    "*FNMA_MUL" },
        { INDEX_FNMA_S,      "*FNMA_S" },
        { INDEX_FNMA_S_ADD,  "*FNMA_S_ADD" },
        { INDEX_FNMA_S_MUL,  "*FNMA_S_MUL" },
        { INDEX_FNMA_D,      "*FNMA_D" },
        { INDEX_FNMA_D_ADD,  "*FNMA_D_ADD" },
        { INDEX_FNMA_D_MUL,  "*FNMA_D_MUL" },
        { INDEX_FPNMA,       "*FPNMA" },
        { INDEX_FPNMA_ADD,   "*FPNMA_ADD" },
        { INDEX_FPNMA_MUL,   "*FPNMA_MUL" }
    };
    const UINT32 row_count = sizeof(named_rows) / sizeof(named_rows[0]);
    for (UINT32 k = 0; k < row_count; ++k)
    {
        if (named_rows[k].id == index)
            return named_rows[k].label;
    }

    // A special index without a name (e.g. one of the skipped FMA slots).
    ASSERTX(0);
    return "";
}
/* ===================================================================== */
/* ===================================================================== */
// Event counter type used for every statistic in this tool.
typedef UINT64 COUNTER;
// Sparse histogram: statistic row index -> count. A key looked up with
// operator[] for the first time starts at zero.
typedef map<UINT32,COUNTER> stat_map_t;
/*!
 * One set of histograms: counts for unpredicated execution, for predicated
 * instructions, and for predicated instructions whose predicate was true.
 */
class CSTATS
{
  public:
    stat_map_t unpredicated;
    stat_map_t predicated;
    stat_map_t predicated_true;

    CSTATS()
    {
        clear();
    }

    /*! Drop every row from all three histograms. */
    VOID clear()
    {
        unpredicated.clear();
        predicated.clear();
        predicated_true.clear();
    }
};
class BBL_SORT_STATS
{
public:
ADDRINT _pc;
UINT64 _icount;
UINT64 _executions;
UINT64 _nbytes;
};
// Histogram reported by emit_static_stats() as the static profile.
CSTATS GlobalStatsStatic;

/*!
 * Immutable description of one instrumented basic block, built once at
 * instrumentation time. _stats lists every statistic row the block touches
 * per execution and ends with a 0 entry; at report time each listed row is
 * incremented by the block's dynamic execution count.
 */
class BBLSTATS
{
  public:
    const stat_index_t* const _stats; // 0-terminated list of row indices
    const ADDRINT _pc;                // start PC of the block
    const UINT32 _ninst;              // # of instructions
    const UINT32 _nbytes;             // # of bytes in the block

    BBLSTATS(stat_index_t* stats, ADDRINT pc, UINT32 ninst, UINT32 nbytes)
        : _stats(stats),
          _pc(pc),
          _ninst(ninst),
          _nbytes(nbytes)
    {
    }
};

// One entry per instrumented block; the position in this list is the block id
// handed to docount_bbl(). Guarded by bbl_list_lock.
LOCALVAR vector<BBLSTATS*> statsList;
/* ===================================================================== */
// ALIGN_LOCK is the compiler-specific alignment attribute applied to the two
// global locks below. Where only 16-byte alignment is available, NEED_TO_PAD
// is defined and 48-byte char arrays are declared around the locks.
#if defined(__GNUC__)
# if defined(TARGET_MAC) || defined(TARGET_WINDOWS)
// macOS* XCODE2.4.1 gcc and Cygwin gcc 3.4.x only allow for 16b
// alignment! So we need to pad!
# define ALIGN_LOCK __attribute__ ((aligned(16)))
# define NEED_TO_PAD
# else
# define ALIGN_LOCK __attribute__ ((aligned(64)))
# endif
#else
# define ALIGN_LOCK __declspec(align(64))
#endif
#if defined(NEED_TO_PAD)
LOCALVAR char pad0[48];
#endif
// Serializes writes to the shared output stream '*out'.
LOCALVAR PIN_LOCK ALIGN_LOCK pinLock;
#if defined(NEED_TO_PAD)
LOCALVAR char pad1[48];
#endif
// Guards statsList, which Trace() appends to while reports read it.
LOCALVAR PIN_LOCK ALIGN_LOCK bbl_list_lock;
#if defined(NEED_TO_PAD)
LOCALVAR char pad2[48];
#endif
// Profile output stream; allocated in main() and closed in Fini().
static std::ofstream* out;
/*!
 * Per-thread profiling state, stored in Pin TLS under tls_key.
 */
class thread_data_t
{
  public:
    CSTATS cstats;                // this thread's histograms
    UINT32 enabled;               // 1 while counting is active, else 0
    vector<COUNTER> block_counts; // execution count per block id

    thread_data_t() : enabled(0)
    {
    }

    /*! Number of block-count slots currently allocated. */
    UINT32 size()
    {
        return block_counts.size();
    }

    /*! Make sure at least @a n slots exist; grows to 2*n to amortize reallocation. */
    void resize(UINT32 n)
    {
        if (n <= size())
            return;
        block_counts.resize(2*n);
    }
};
/*! Fetch the thread_data_t that ThreadStart() stored in TLS for thread @a tid. */
thread_data_t* get_tls(THREADID tid)
{
    return static_cast<thread_data_t*>(PIN_GetThreadData(tls_key, tid));
}
/*! Turn block counting on for thread @a tid. */
VOID activate_counting(THREADID tid)
{
    get_tls(tid)->enabled = 1;
}
/*! Turn block counting off for thread @a tid. */
VOID deactivate_counting(THREADID tid)
{
    get_tls(tid)->enabled = 0;
}
// Number of threads seen so far; Fini() reports thread ids 0..numThreads-1.
UINT32 numThreads = 0;

/*!
 * Thread-start callback: logs the new thread, allocates its thread_data_t and
 * stores it in TLS, and enables counting for every thread except tid 0.
 */
VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v)
{
    // The original author notes that this callback is already serialized, so
    // numThreads is updated without taking a lock.
    // NOTE(review): confirm against the Pin callback-locking rules.
    ++numThreads;

    PIN_GetLock(&pinLock, tid+1); // for output
    *out << "# Starting tid " << tid << endl;
    PIN_ReleaseLock(&pinLock);

    // Allocate this thread's state and remember the pointer in TLS.
    thread_data_t* const fresh = new thread_data_t;
    PIN_SetThreadData(tls_key, fresh, tid);

    // Make sure secondary threads count from the start.
    // FIXME: The controller should start all threads if no trigger
    // conditions are specified, but currently it only starts TID0. Starting
    // here is wrong if the controller has a nontrivial starting condition,
    // but this is what most people want. They can always stop the controller
    // and zero the stats using markers as a workaround.
    if (tid != 0)
        activate_counting(tid);
}
VOID emit_stats(THREADID tid); //forward prototype
VOID emit_pc_stats(THREADID tid); //forward prototype
VOID zero_stats(THREADID tid); //forward prototype
VOID emit_bbl_stats_sorted(THREADID tid);
// Controller object from instlib; main() registers Handler() with it via CheckKnobs().
LOCALVAR CONTROL_MANAGER control;
// Controller event callback. Each case first logs the event to '*out' under
// pinLock, releases the lock, and only then calls the counting/emit helpers
// (the emit helpers take pinLock themselves, so it must not be held here).
// Only 'ev' and 'tid' are used; the remaining parameters are ignored.
LOCALFUN VOID Handler(EVENT_TYPE ev, VOID *val, CONTEXT *ctxt, VOID *ip, THREADID tid, bool bcast)
{
switch(ev)
{
case EVENT_START:
PIN_GetLock(&pinLock, tid+1); // for output
*out << "# Start counting for tid " << tid << endl;
PIN_ReleaseLock(&pinLock);
activate_counting(tid);
break;
case EVENT_STOP:
PIN_GetLock(&pinLock, tid+1); // for output
*out << "# Stop counting for tid " << tid << endl;
// When PinPoints are active, also record which region just ended.
if (control.PinPointsActive()) {
UINT32 pp = control.CurrentPp(tid);
UINT32 phase = control.CurrentPhase(tid);
*out << "# PinPointNumber " << pp << endl;
*out << "# PinPointPhase " << phase << endl;
}
PIN_ReleaseLock(&pinLock);
deactivate_counting(tid);
if (control.PinPointsActive()) {
// when doing pinpoints "mixes" we want to emit and then zero the stats when we stop a region.
emit_stats(tid);
emit_bbl_stats_sorted(tid);
zero_stats(tid);
}
break;
case CONTROL_STATS_EMIT:
PIN_GetLock(&pinLock, tid+1); // for output
*out << "# Emit stats for tid " << tid << endl;
PIN_ReleaseLock(&pinLock);
emit_stats(tid);
break;
case CONTROL_STATS_RESET:
PIN_GetLock(&pinLock, tid+1); // for output
*out << "# Reset stats for tid " << tid << endl;
PIN_ReleaseLock(&pinLock);
zero_stats(tid);
break;
default:
// Any other event type is treated as a programming error.
ASSERTX(false);
}
}
/* ===================================================================== */
/*!
 * Analysis routine inserted at the head of every trace: makes sure the
 * calling thread has a counter slot for every block id up to
 * @a block_count_for_trace before any docount_bbl() in the trace runs.
 */
VOID validate_bbl_count(THREADID tid, ADDRINT block_count_for_trace)
{
    get_tls(tid)->resize(block_count_for_trace+1);
}
/*!
 * Analysis routine inserted at the head of every block: adds the thread's
 * 'enabled' flag (0 or 1) to the block's counter, so no branch is needed.
 * Performs no bounds check on @a block_id; the slot is expected to have been
 * created by validate_bbl_count().
 */
VOID PIN_FAST_ANALYSIS_CALL docount_bbl(ADDRINT block_id, THREADID tid)
{
    thread_data_t* const owner = get_tls(tid);
    COUNTER& hits = owner->block_counts[block_id];
    hits += owner->enabled;
}
/*!
 * Analysis routine for predicated instructions whose predicate is true:
 * bumps row @a index of the thread's predicated_true histogram, but only
 * while counting is enabled for the thread.
 */
VOID docount_predicated_true(UINT32 index, THREADID tid)
{
    thread_data_t* const tdata = get_tls(tid);
    if (!tdata->enabled)
        return;
    // operator[] creates a missing row with value 0, so the first hit yields 1.
    tdata->cstats.predicated_true[index] += 1;
}
/* ===================================================================== */
/*! Reset thread @a tid's histograms and every per-block execution counter to zero. */
VOID zero_stats(THREADID tid)
{
    thread_data_t* const tdata = get_tls(tid);
    tdata->cstats.clear();
    for (vector<COUNTER>::iterator slot = tdata->block_counts.begin();
         slot != tdata->block_counts.end();
         ++slot)
    {
        *slot = 0;
    }
}
/* ===================================================================== */
/*!
 * Recognize the special 3-byte marker NOPs and hook the matching control
 * action in front of the instruction:
 *   0F 1F F3 - start counting
 *   0F 1F F4 - stop counting
 *   0F 1F F5 - emit stats
 *   0F 1F F6 - zero stats
 * Any other instruction is left untouched.
 *
 * FIXME: if these encodings collide with existing instructions, change them here.
 * FIXME: ideally this would live in control.H so that any tool can use it.
 *
 * @param ins               instruction being instrumented
 * @param pc                address of the instruction's first byte (read directly)
 * @param instruction_size  encoded length of the instruction in bytes
 */
VOID CheckForSpecialMarkers(INS ins, ADDRINT pc, unsigned int instruction_size)
{
    if (instruction_size != 3)
        return;

    const UINT8* const bytes = reinterpret_cast<const UINT8*>(pc);
    if (bytes[0] != 0x0F || bytes[1] != 0x1F)
        return;

    AFUNPTR action = 0;
    switch (bytes[2])
    {
      case 0xF3: action = (AFUNPTR)activate_counting;   break; // start
      case 0xF4: action = (AFUNPTR)deactivate_counting; break; // stop
      case 0xF5: action = (AFUNPTR)emit_stats;          break; // emit
      case 0xF6: action = (AFUNPTR)zero_stats;          break; // zero
      default:
        return;
    }

    INS_InsertCall(ins,
                   IPOINT_BEFORE,
                   action,
                   IARG_THREAD_ID,
                   IARG_END);
}
/* ===================================================================== */
// Trace instrumentation callback. For every basic block with a valid head
// instruction it (1) builds a 0-terminated list of the statistic rows the
// block touches, (2) inserts docount_bbl() at the block head with a fresh
// block id, and (3) appends a BBLSTATS record to statsList at that same id.
// A single validate_bbl_count() call at the trace head grows the executing
// thread's counter array before any of the trace's blocks are counted.
VOID Trace(TRACE trace, VOID *v)
{
// Next block id to hand out; also equals the number of entries pushed to statsList so far.
static UINT32 basic_blocks = 0;
const BOOL accurate_handling_of_predicates = KnobProfilePredicated.Value();
// 'pc' is advanced by each instruction's size as the trace is walked;
// NOTE(review): this assumes the instructions of the trace are contiguous in memory.
ADDRINT pc = TRACE_Address(trace);
// NOTE(review): start_pc is set once per trace and never updated per block, so
// the _pc and byte count recorded for the 2nd and later blocks of a trace are
// measured from the trace start - confirm whether that is intended.
ADDRINT start_pc = pc;
// First pass: count the blocks that will receive an id.
UINT32 new_blocks = 0;
for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl))
{
const INS head = BBL_InsHead(bbl);
if (! INS_Valid(head)) continue;
new_blocks++;
}
// Ensure the running thread has counter slots for every id used by this trace.
TRACE_InsertCall(trace,
IPOINT_BEFORE,
AFUNPTR(validate_bbl_count),
IARG_THREAD_ID,
IARG_UINT32,
basic_blocks+new_blocks,
IARG_END);
// Second pass: summarize and instrument each block.
for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl))
{
const INS head = BBL_InsHead(bbl);
if (! INS_Valid(head)) continue;
// Summarize the stats for the bbl in a 0 terminated list
// This is done at instrumentation time
const UINT32 n = IndexStringLength(bbl, 1);
// stats is an array of index types. We later multiply it by the
// dynamic count for a block.
// The array is owned by the BBLSTATS created below; nothing in this file frees it.
stat_index_t *const stats = new stat_index_t[ n + 1];
stat_index_t *const stats_end = stats + (n + 1);
stat_index_t *curr = stats;
UINT32 ninsts = 0;
for (INS ins = head; INS_Valid(ins); ins = INS_Next(ins))
{
unsigned int instruction_size = INS_Size(ins);
// This checks for x86-specific opcodes
CheckForSpecialMarkers(ins, pc, instruction_size);
// Count the number of times a predicated instruction is actually executed
// this is expensive and hence disabled by default
if( INS_IsPredicated(ins) && accurate_handling_of_predicates )
{
INS_InsertPredicatedCall(ins,
IPOINT_BEFORE,
AFUNPTR(docount_predicated_true),
IARG_UINT32,
INS_GetIndex(ins),
IARG_THREAD_ID,
IARG_END);
}
// With -mapaddr, record the source location of every instruction address.
// NOTE(review): this writes to *out without taking pinLock.
if (KnobMapToFile) {
INT32 line;
string filename;
PIN_GetSourceLocation(pc, NULL, &line, &filename);
if (!filename.empty())
*out << "MAPADDR 0x" << hex << pc << " " << dec << line << " " << filename << endl;
}
curr = INS_GenerateIndexString(ins,curr,1);
if (measurement == measure_opcode)
curr = INS_GenerateIndexFMA(ins,curr);
pc = pc + instruction_size;
ninsts++;
}
// stats terminator
*curr++ = 0;
// The generators must have written exactly what IndexStringLength() predicted.
ASSERTX( curr == stats_end );
// Insert instrumentation to count the number of times the bbl is executed
BBLSTATS * bblstats = new BBLSTATS(stats, start_pc, ninsts, pc-start_pc);
INS_InsertCall(head,
IPOINT_BEFORE,
AFUNPTR(docount_bbl),
IARG_FAST_ANALYSIS_CALL,
IARG_UINT32,
basic_blocks,
IARG_THREAD_ID,
IARG_END);
// Remember the counter and stats so we can compute a summary at the end
basic_blocks++;
// statsList may be read concurrently by the emit_* routines, so append under the lock.
PIN_GetLock(&bbl_list_lock,1);
statsList.push_back(bblstats);
PIN_ReleaseLock(&bbl_list_lock);
}
}
/* ===================================================================== */
VOID DumpStats(ofstream& out,
CSTATS& stats,
BOOL predicated_true,
const string& title,
THREADID tid)
{
out << "#\n# " << title << "\n#\n";
if (tid != INVALID_THREADID)
out << "# TID " << tid << "\n";
out << "# ";
if (measurement == measure_opcode)
out << "opcode";
else if (measurement == measure_ilen)
out << "inslen";
else if (measurement == measure_category)
out << "catgry";
else if (measurement == measure_iform)
out << "iform ";
out<< " count-unpredicated count-predicated";
if( predicated_true )
out << " count-predicated-true";
out << "\n#\n";
// Compute the "total" bin. Stop at the INDEX_TOTAL for all histograms
// except the iform. Iforms donot use the special rows, so we count everything.
// build a map of the valid stats index values for all 3 tables.
map<UINT32, bool> m;
UINT32 tu=0, tp=0, tpt=0;
for(stat_map_t::iterator it = stats.unpredicated.begin() ; it != stats.unpredicated.end() ; it++) {
if (measurement == measure_iform || it->first < INDEX_TOTAL)
tu += it->second;
m[it->first]=true;
}
for(stat_map_t::iterator it = stats.predicated.begin() ; it != stats.predicated.end() ; it++) {
if (measurement == measure_iform || it->first < INDEX_TOTAL)
tp += it->second;
m[it->first]=true;
}
for(stat_map_t::iterator it=stats.predicated_true.begin();it != stats.predicated_true.end() ; it++) {
if (measurement == measure_iform || it->first < INDEX_TOTAL)
tpt += it->second;
m[it->first]=true;
}
for(map<UINT32,bool>::iterator it = m.begin(); it != m.end(); it++) {
stat_map_t::iterator s;
COUNTER up=0,pr=0,prt=0;
UINT32 indx = it->first;
s = stats.unpredicated.find(indx);
if (s != stats.unpredicated.end())
up = s->second;
s = stats.predicated.find(indx);
if (s != stats.predicated.end())
pr = s->second;
if (up == 0 && pr == 0)
continue;
out << setw(6) << indx << " "
<< ljstr(IndexToString(indx),25) << " "
<< setw(16) << up << " "
<< setw(16) << pr;
if( predicated_true ) {
s = stats.predicated_true.find(indx);
prt = 0;
if (s != stats.predicated_true.end())
prt = s->second;
out << " " << setw(16) << prt;
}
out << endl;
}
// print the totals
out << setw(6) << "000000" << " "
<< ljstr("*total",25) << " "
<< setw(16) << tu << " "
<< setw(16) << tp;
if( predicated_true )
out << " " << setw(16) << tpt;
out << endl;
}
/* ===================================================================== */
// Sequence number of the most recent dynamic-stats dump; printed in the
// EMIT_* header lines.
static UINT32 stat_dump_count = 0;

/*!
 * Expand thread @a tid's per-block execution counts into its unpredicated
 * histogram and print that histogram.
 *
 * Two defects of the earlier version are addressed:
 *  - the block count is kept in COUNTER (64 bit) instead of being truncated
 *    to 32 bits;
 *  - the unpredicated histogram is rebuilt from scratch on each call. It is
 *    derived entirely from block_counts, and block_counts are cumulative, so
 *    adding them onto the previous result counted every block again on each
 *    additional emit.
 */
VOID emit_bbl_stats(THREADID tid)
{
    thread_data_t* tdata = get_tls(tid);

    // dynamic Counts
    // Need to lock here because Trace() may be growing (and thus
    // reallocating) statsList with push_back while we read it.
    PIN_GetLock(&bbl_list_lock,tid+1);
    UINT32 limit = tdata->size();
    if ( limit > statsList.size() )
        limit = statsList.size();

    tdata->cstats.unpredicated.clear();
    for (UINT32 i = 0; i < limit; i++)
    {
        const COUNTER bcount = tdata->block_counts[i];
        BBLSTATS* b = statsList[i];
        if (b && b->_stats)
        {
            // _stats is 0-terminated; every listed row gets the block's count.
            for (const stat_index_t* stats = b->_stats; *stats; stats++)
                tdata->cstats.unpredicated[*stats] += bcount;
        }
    }
    PIN_ReleaseLock(&bbl_list_lock);

    PIN_GetLock(&pinLock, tid+1); // for output
    stat_dump_count++;
    *out << "# EMIT_STATS " << stat_dump_count << endl;
    DumpStats(*out, tdata->cstats, KnobProfilePredicated, "$dynamic-counts",tid);
    *out << "# END_STATS" << endl;
    PIN_ReleaseLock(&pinLock);
}
/*!
 * qsort() comparator ordering BBL_SORT_STATS records by _icount, largest
 * first.
 *
 * The counts are 64-bit unsigned values, so they are compared explicitly.
 * Returning their difference converted to int gives the wrong sign, or a
 * false "equal", whenever the difference does not fit in an int.
 *
 * @return >0 when @a b should sort before @a a, <0 for the opposite, 0 on a tie
 */
int qsort_compare_fn(const void *a, const void *b)
{
    const BBL_SORT_STATS* ba = static_cast<const BBL_SORT_STATS*>(a);
    const BBL_SORT_STATS* bb = static_cast<const BBL_SORT_STATS*>(b);
    if (bb->_icount > ba->_icount)
        return 1;
    if (bb->_icount < ba->_icount)
        return -1;
    return 0;
}
/*!
 * Print thread @a tid's hottest basic blocks, ranked by dynamic instruction
 * count (executions * instructions per block), limited to -top_blocks entries.
 *
 * Fixes relative to the earlier version:
 *  - execution counts and the icount product are computed in 64 bits instead
 *    of being truncated to / multiplied in 32 bits;
 *  - a slot whose statsList entry is null is explicitly zeroed rather than
 *    left uninitialized before sorting;
 *  - percentages are reported as 0 when the thread executed nothing, instead
 *    of dividing by zero.
 */
VOID emit_bbl_stats_sorted(THREADID tid)
{
    thread_data_t* tdata = get_tls(tid);

    // dynamic Counts
    // Need to lock here because Trace() may be growing (and thus
    // reallocating) statsList with push_back while we read it.
    PIN_GetLock(&bbl_list_lock,tid+1);
    UINT32 limit = tdata->size();
    if ( limit > statsList.size() )
        limit = statsList.size();
    BBL_SORT_STATS* icounts = new BBL_SORT_STATS[limit];
    UINT64 thread_total = 0;
    for (UINT32 i = 0; i < limit; i++)
    {
        BBLSTATS* b = statsList[i];
        if (b)
        {
            const COUNTER bcount = tdata->block_counts[i];
            icounts[i]._icount = bcount * b->_ninst;
            icounts[i]._pc = b->_pc;
            icounts[i]._executions = bcount;
            icounts[i]._nbytes = b->_nbytes;
            thread_total += icounts[i]._icount;
        }
        else
        {
            icounts[i]._icount = 0;
            icounts[i]._pc = 0;
            icounts[i]._executions = 0;
            icounts[i]._nbytes = 0;
        }
    }
    PIN_ReleaseLock(&bbl_list_lock);

    qsort(icounts, limit, sizeof(BBL_SORT_STATS), qsort_compare_fn);

    PIN_GetLock(&pinLock, tid+1); // for output
    *out << "# EMIT_STATS TOP BLOCKS " << stat_dump_count
         << " FOR TID " << tid
         << endl;
    if (limit > KnobTopBlocks.Value())
        limit = KnobTopBlocks.Value();
    UINT64 t = 0; // running icount of the blocks printed so far
    for (UINT32 i = 0; i < limit; i++)
    {
        t += icounts[i]._icount;
        const double share      = thread_total ? 100.0*icounts[i]._icount/thread_total : 0.0;
        const double cumulative = thread_total ? 100.0*t/thread_total : 0.0;
        *out << "BLOCK: " << setw(5) << i
             << " PC: "
             << hex
             << setfill('0')
             << setw(sizeof(ADDRINT)*2) << icounts[i]._pc
             << setfill(' ')
             << dec
             << " ICOUNT: "
             << setw(9) << icounts[i]._icount
             << " EXECUTIONS: "
             << setw(9) << icounts[i]._executions
             << " #BYTES: "
             << setw(2) << icounts[i]._nbytes
             << " %: "
             << setw(5) << setprecision(3) << share
             << " cumltv%: "
             << setw(5) << setprecision(3) << cumulative
             << endl;
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
        if (KnobShowDisassembly) {
            string s = disassemble(icounts[i]._pc, icounts[i]._pc + icounts[i]._nbytes);
            *out << s << endl;
        }
#endif
    }
    *out << "# END_STATS" << endl;
    PIN_ReleaseLock(&pinLock);
    delete [] icounts;
}
/*! Print the static histogram (GlobalStatsStatic) between its start/end marker lines. */
VOID emit_static_stats()
{
    std::ofstream& report = *out;
    report << "# EMIT_STATIC_STATS " << stat_dump_count << endl;
    DumpStats(report, GlobalStatsStatic, false, "$static-counts",INVALID_THREADID);
    report << endl << "# END_STATIC_STATS" << endl;
}
/*!
 * Print one "BLOCKCOUNT <pc> <icount>" line for every block thread @a tid
 * executed at least once.
 *
 * The execution count and the icount product are kept in COUNTER (64 bit);
 * they were previously truncated to, and multiplied in, 32 bits.
 */
VOID emit_pc_stats(THREADID tid)
{
    thread_data_t* tdata = get_tls(tid);

    PIN_GetLock(&pinLock, tid+1); // for output
    *out << "# EMIT_PC_STATS for TID " << tid << endl;

    // dynamic Counts
    // Need to lock here because Trace() may be growing (and thus
    // reallocating) statsList with push_back while we read it.
    PIN_GetLock(&bbl_list_lock,tid+1);
    UINT32 limit = tdata->size();
    if ( limit > statsList.size() )
        limit = statsList.size();
    for (UINT32 i = 0; i < limit; i++)
    {
        const COUNTER bcount = tdata->block_counts[i];
        BBLSTATS* b = statsList[i];
        if (bcount && b && b->_stats)
            *out << "BLOCKCOUNT 0x" << hex << b->_pc << " " << dec << (bcount * b->_ninst ) << endl;
    }
    PIN_ReleaseLock(&bbl_list_lock);

    *out << "# END_EMIT_PC_STATS for TID " << tid << endl;
    PIN_ReleaseLock(&pinLock);
}
/*! Emit thread @a tid's dynamic histogram and, with -mapaddr, its per-PC block counts. */
VOID emit_stats(THREADID tid)
{
    emit_bbl_stats(tid);
    if (KnobMapToFile.Value())
    {
        emit_pc_stats(tid);
    }
}
/* ===================================================================== */
void combine_dynamic_stats(unsigned int numThreads)
{
// combine all the rows from each thread in to the total variable.
CSTATS total;
for (THREADID i=0;i<numThreads; i++)
{
thread_data_t* tdata = get_tls(i);
for(stat_map_t::iterator it = tdata->cstats.unpredicated.begin(); it != tdata->cstats.unpredicated.end() ; it++) {
stat_map_t::iterator x = total.unpredicated.find(it->first);
if (x == total.unpredicated.end())
total.unpredicated[it->first] = it->second;
else
x->second += it->second;
}
for(stat_map_t::iterator it = tdata->cstats.predicated.begin(); it != tdata->cstats.predicated.end() ; it++) {
stat_map_t::iterator x = total.predicated.find(it->first);
if (x == total.predicated.end())
total.predicated[it->first] = it->second;
else
x->second += it->second;
}
for(stat_map_t::iterator it = tdata->cstats.predicated_true.begin(); it != tdata->cstats.predicated_true.end() ; it++) {
stat_map_t::iterator x = total.predicated_true.find(it->first);
if (x == total.predicated_true.end())
total.predicated_true[it->first] = it->second;
else
x->second += it->second;
}
}
*out << "# EMIT_GLOBAL_DYNAMIC_STATS " << stat_dump_count << endl;
DumpStats(*out, total, false, "$global-dynamic-counts",INVALID_THREADID);
*out << endl << "# END_GLOBAL_DYNAMIC_STATS" << endl;
}
/*!
 * Fini callback (runs once for the application): emits the per-thread dynamic
 * stats and top-block lists, then the static table and the combined global
 * dynamic table, and finally closes the output file.
 */
VOID Fini(int, VOID * v)
{
    *out << "# FINI: end of program" << endl;

    for (THREADID tid = 0; tid < numThreads; ++tid)
    {
        emit_stats(tid);
        emit_bbl_stats_sorted(tid);
    }

    emit_static_stats();
    combine_dynamic_stats(numThreads);
    out->close();
}
/* ===================================================================== */
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
/////////////////////////////////////////////////////////////////////////
// Add a disassembler
/////////////////////////////////////////////////////////////////////////
/*! Convert a value 0..15 to its upper-case hex digit; any larger value yields '?'. */
static char nibble_to_ascii_hex(UINT8 i) {
    static const char digits[] = "0123456789ABCDEF";
    return (i < 16) ? digits[i] : '?';
}
/*!
 * Write @a length bytes of @a array into @a buf as upper-case hex, two
 * characters per byte, followed by a terminating NUL.
 * A @a length of 0 means XED_MAX_INSTRUCTION_BYTES bytes.
 * @a buf must have room for two characters per byte plus the terminator.
 */
static void print_hex_line(char* buf, const UINT8* array, const int length) {
    const int count = (length == 0) ? XED_MAX_INSTRUCTION_BYTES : length;
    char* cursor = buf;
    for (int k = 0; k < count; ++k) {
        *cursor++ = nibble_to_ascii_hex(array[k] >> 4);
        *cursor++ = nibble_to_ascii_hex(array[k] & 0xF);
    }
    *cursor = 0;
}
/*!
 * Disassemble the bytes in [start, stop) with XED and return one "XDIS" line
 * per instruction: address, ISA extension, raw bytes in hex, and Intel-syntax
 * text. Bytes that fail to decode are printed one at a time as "????" lines.
 * The memory is read directly from the given addresses.
 *
 * Every record now ends with a newline. Previously the "Error
 * disasassembling" case omitted it, so the following record was appended to
 * the same line.
 */
static string
disassemble(UINT64 start, UINT64 stop) {
    UINT64 pc = start;
    xed_state_t dstate;
    xed_syntax_enum_t syntax = XED_SYNTAX_INTEL;
    xed_error_enum_t xed_error;
    xed_decoded_inst_t xedd;
    ostringstream os;

    // Pick the decoder mode from the pointer width of this build.
    if (sizeof(ADDRINT) == 4)
        xed_state_init(&dstate,
                       XED_MACHINE_MODE_LEGACY_32,
                       XED_ADDRESS_WIDTH_32b,
                       XED_ADDRESS_WIDTH_32b);
    else
        xed_state_init(&dstate,
                       XED_MACHINE_MODE_LONG_64,
                       XED_ADDRESS_WIDTH_64b,
                       XED_ADDRESS_WIDTH_64b);

    while( pc < stop ) {
        xed_decoded_inst_zero_set_mode(&xedd, &dstate);
        // Offer the decoder at most 15 bytes, and never bytes beyond 'stop'.
        UINT32 len = 15;
        if (stop - pc < 15)
            len = stop-pc;
        xed_error = xed_decode(&xedd, reinterpret_cast<const UINT8*>(pc), len);
        bool okay = (xed_error == XED_ERROR_NONE);

        // Save the stream flags so the hex/dec changes below can be undone.
        iostream::fmtflags fmt = os.flags();
        os << std::setfill('0')
           << "XDIS "
           << std::hex
           << std::setw(sizeof(ADDRINT)*2)
           << pc
           << std::dec
           << ": "
           << std::setfill(' ')
           << std::setw(4);
        if (okay) {
            char buffer[200];
            unsigned int dec_len, sp;
            os << xed_extension_enum_t2str(xed_decoded_inst_get_extension(&xedd));
            dec_len = xed_decoded_inst_get_length(&xedd);
            print_hex_line(buffer, reinterpret_cast<UINT8*>(pc), dec_len);
            os << " " << buffer;
            for ( sp=dec_len; sp < 12; sp++) // pad out the instruction bytes
                os << " ";
            os << " ";
            memset(buffer,0,200);
            int dis_okay = xed_format_context(syntax, &xedd, buffer, 200, pc, 0, 0);
            if (dis_okay)
                os << buffer << endl;
            else
                os << "Error disasassembling pc 0x" << std::hex << pc << std::dec << endl;
            pc += dec_len;
        }
        else { // print the byte and keep going.
            UINT8 memval = *reinterpret_cast<UINT8*>(pc);
            os << "???? " // no extension
               << std::hex
               << std::setw(2)
               << std::setfill('0')
               << static_cast<UINT32>(memval)
               << std::endl;
            pc += 1;
        }
        os.flags(fmt);
    }
    return os.str();
}
#endif
/* ===================================================================== */
int main(int argc, CHAR **argv)
{
if( PIN_Init(argc,argv) )
return Usage();
PIN_InitLock(&pinLock);
PIN_InitLock(&bbl_list_lock);
// obtain a key for TLS storage
tls_key = PIN_CreateThreadDataKey(0);
string filename = KnobOutputFile.Value();
if (KnobPid)
{
filename += "." + decstr(getpid());
}
out = new std::ofstream(filename.c_str());
control.CheckKnobs(Handler, 0);
// make sure that exactly one thing-to-count knob is specified.
if (KnobInstructionLengthMix.Value() && KnobCategoryMix.Value()) {
cerr << "Must have at most one of: -iform, -ilen or -category "
<< "as a pintool option" << endl;
exit(1);
}
if (KnobInstructionLengthMix.Value())
measurement = measure_ilen;
if (KnobCategoryMix.Value())
measurement = measure_category;
if (KnobIformMix.Value()) {
#if defined(TARGET_IA32) || defined(TARGET_IA32E)
measurement = measure_iform;
#else
cerr << "Cannot only compute iform mixes on IA32 and Intel64" << endl;
#endif
}
TRACE_AddInstrumentFunction(Trace, 0);
PIN_AddThreadStartFunction(ThreadStart, 0);
PIN_AddFiniFunction(Fini, 0);
PIN_StartProgram(); // Never returns
return 0;
#if defined(NEED_TO_PAD)
(void) pad0; //pacify compiler
(void) pad1;
(void) pad2;
#endif
}
/* ===================================================================== */
/* eof */
/* ===================================================================== */