You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1450 lines
49 KiB
1450 lines
49 KiB
/*
|
|
* Copyright 2002-2019 Intel Corporation.
|
|
*
|
|
* This software is provided to you as Sample Source Code as defined in the accompanying
|
|
* End User License Agreement for the Intel(R) Software Development Products ("Agreement")
|
|
* section 1.L.
|
|
*
|
|
* This software and the related documents are provided as is, with no express or implied
|
|
* warranties, other than those that are expressly stated in the License.
|
|
*/
|
|
|
|
/* ===================================================================== */
|
|
/*! @file This file contains a static and dynamic opcode/ISA extension/ISA
|
|
* category mix profiler
|
|
*
|
|
* This is derived from mix.cpp. Handles an arbitrary number of threads
|
|
* using TLS for data storage and avoids locking, except during I/O.
|
|
*/
|
|
|
|
#if defined(TARGET_WINDOWS)
|
|
#define strdup _strdup
|
|
#endif
|
|
|
|
#include <vector>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <iomanip>
|
|
#include <fstream>
|
|
#include <cstdlib>
|
|
#include <map>
|
|
#include <utility> /* for pair */
|
|
#include <vector>
|
|
#include <unistd.h>
|
|
#include "pin.H"
|
|
#include "control_manager.H"
|
|
#include "mix-fp-state.H"
|
|
using namespace CONTROLLER;
|
|
|
|
// key for accessing TLS storage in the threads. initialized once in main()
|
|
static TLS_KEY tls_key;
|
|
|
|
typedef UINT32 stat_index_t;
|
|
|
|
static string disassemble(UINT64 start, UINT64 stop);
|
|
|
|
/* ===================================================================== */
|
|
/* Commandline Switches */
|
|
/* ===================================================================== */
|
|
KNOB_COMMENT mix_knob_family("pintool:mix", "Mix knobs");
|
|
KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"o", "mix.out", "specify profile file name");
|
|
KNOB<UINT32> KnobTopBlocks(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"top_blocks", "20", "specify a maximal number of top blocks for which icounts are printed");
|
|
KNOB<BOOL> KnobShowDisassembly(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"disas", "0", "Show disassembly for top blocks");
|
|
KNOB<BOOL> KnobPid(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"i", "0", "append pid to output file name");
|
|
KNOB<BOOL> KnobProfilePredicated(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"p", "0", "enable accurate profiling for predicated instructions");
|
|
KNOB<BOOL> KnobProfileStaticOnly(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"s", "0", "terminate after collection of static profile for main image");
|
|
#ifndef TARGET_WINDOWS
|
|
KNOB<BOOL> KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"d", "0", "Only collect dynamic profile");
|
|
#else
|
|
KNOB<BOOL> KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"d", "1", "Only collect dynamic profile");
|
|
#endif
|
|
KNOB<BOOL> KnobNoSharedLibs(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"no_shared_libs", "0", "do not instrument shared libraries");
|
|
|
|
KNOB<BOOL> KnobInstructionLengthMix(KNOB_MODE_WRITEONCE, "pintool:mix","ilen", "0", "Compute instruction length mix");
|
|
KNOB<BOOL> KnobCategoryMix(KNOB_MODE_WRITEONCE, "pintool:mix", "category", "0", "Compute ISA category mix");
|
|
KNOB<BOOL> KnobIformMix(KNOB_MODE_WRITEONCE, "pintool:mix", "iform", "0", "Compute ISA iform mix");
|
|
KNOB<BOOL> KnobMapToFile(KNOB_MODE_WRITEONCE, "pintool:mix", "mapaddr", "0", "Map Addresses to File/Line information");
|
|
KNOB<BOOL> KnobEarlyOut(KNOB_MODE_WRITEONCE, "pintool:mix",
|
|
"early_out", "0" , "Exit after tracing the first region.");
|
|
|
|
|
|
typedef enum { measure_opcode=0, measure_category=1, measure_ilen=2, measure_iform=3 } measurement_t;
|
|
measurement_t measurement = measure_opcode;
|
|
|
|
/* ===================================================================== */
|
|
|
|
INT32 Usage()
|
|
{
|
|
cerr << "This pin tool computes a static and dynamic opcode, "
|
|
<< "instruction form, instruction length, extension or category mix profile\n\n";
|
|
cerr << KNOB_BASE::StringKnobSummary();
|
|
cerr << endl;
|
|
cerr << "The default is to do opcode and ISA extension profileing" << endl;
|
|
cerr << "At most one of -iform, -ilen or -category is allowed" << endl;
|
|
cerr << endl;
|
|
return -1;
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
/* INDEX HELPERS */
|
|
/* ===================================================================== */
|
|
|
|
const UINT32 INDEX_SPECIAL = 3000;
|
|
const UINT32 MAX_MEM_SIZE = 520;
|
|
const UINT32 MAX_EXTENSION = XED_EXTENSION_LAST+10;
|
|
|
|
const UINT32 INDEX_TOTAL = INDEX_SPECIAL + 0;
|
|
const UINT32 INDEX_MEM_ATOMIC = INDEX_SPECIAL + 1;
|
|
const UINT32 INDEX_STACK_READ = INDEX_SPECIAL + 2;
|
|
const UINT32 INDEX_STACK_WRITE = INDEX_SPECIAL + 3;
|
|
const UINT32 INDEX_IPREL_READ = INDEX_SPECIAL + 4;
|
|
const UINT32 INDEX_IPREL_WRITE = INDEX_SPECIAL + 5;
|
|
const UINT32 INDEX_MEM_READ_SIZE = INDEX_SPECIAL + 6;
|
|
const UINT32 INDEX_MEM_WRITE_SIZE = INDEX_SPECIAL + 6 + MAX_MEM_SIZE;
|
|
|
|
const UINT32 INDEX_EXTENSION = INDEX_SPECIAL + 6 + 2*MAX_MEM_SIZE;
|
|
|
|
const UINT32 INDEX_SCALAR_SIMD = INDEX_EXTENSION + MAX_EXTENSION;
|
|
const UINT32 INDEX_FMA_BASE = INDEX_SCALAR_SIMD + 1;
|
|
const UINT32 INDEX_FMA = INDEX_FMA_BASE + 1;
|
|
const UINT32 INDEX_FMA_ADD = INDEX_FMA_BASE + 2;
|
|
const UINT32 INDEX_FMA_MUL = INDEX_FMA_BASE + 3;
|
|
const UINT32 INDEX_FMA_S = INDEX_FMA_BASE + 4;
|
|
const UINT32 INDEX_FMA_S_ADD = INDEX_FMA_BASE + 5; // NOTE: skipped 6. does not matter
|
|
const UINT32 INDEX_FMA_S_MUL = INDEX_FMA_BASE + 7;
|
|
const UINT32 INDEX_FMA_D = INDEX_FMA_BASE + 8;
|
|
const UINT32 INDEX_FMA_D_ADD = INDEX_FMA_BASE + 9;
|
|
const UINT32 INDEX_FMA_D_MUL = INDEX_FMA_BASE + 10;
|
|
const UINT32 INDEX_FPMA = INDEX_FMA_BASE + 11;
|
|
const UINT32 INDEX_FPMA_ADD = INDEX_FMA_BASE + 12;
|
|
const UINT32 INDEX_FPMA_MUL = INDEX_FMA_BASE + 13;
|
|
const UINT32 INDEX_FMS = INDEX_FMA_BASE + 14;
|
|
const UINT32 INDEX_FMS_SUB = INDEX_FMA_BASE + 15;
|
|
const UINT32 INDEX_FMS_MUL = INDEX_FMA_BASE + 16;
|
|
const UINT32 INDEX_FMS_S = INDEX_FMA_BASE + 17;
|
|
const UINT32 INDEX_FMS_S_SUB = INDEX_FMA_BASE + 18;
|
|
const UINT32 INDEX_FMS_S_MUL = INDEX_FMA_BASE + 19;
|
|
const UINT32 INDEX_FMS_D = INDEX_FMA_BASE + 20;
|
|
const UINT32 INDEX_FMS_D_SUB = INDEX_FMA_BASE + 21;
|
|
const UINT32 INDEX_FMS_D_MUL = INDEX_FMA_BASE + 22;
|
|
const UINT32 INDEX_FPMS = INDEX_FMA_BASE + 23;
|
|
const UINT32 INDEX_FPMS_SUB = INDEX_FMA_BASE + 24;
|
|
const UINT32 INDEX_FPMS_MUL = INDEX_FMA_BASE + 25;
|
|
const UINT32 INDEX_FNMA = INDEX_FMA_BASE + 26;
|
|
const UINT32 INDEX_FNMA_ADD = INDEX_FMA_BASE + 27;
|
|
const UINT32 INDEX_FNMA_MUL = INDEX_FMA_BASE + 28;
|
|
const UINT32 INDEX_FNMA_S = INDEX_FMA_BASE + 29;
|
|
const UINT32 INDEX_FNMA_S_ADD = INDEX_FMA_BASE + 30;
|
|
const UINT32 INDEX_FNMA_S_MUL = INDEX_FMA_BASE + 31;
|
|
const UINT32 INDEX_FNMA_D = INDEX_FMA_BASE + 32;
|
|
const UINT32 INDEX_FNMA_D_ADD = INDEX_FMA_BASE + 33;
|
|
const UINT32 INDEX_FNMA_D_MUL = INDEX_FMA_BASE + 34;
|
|
const UINT32 INDEX_FPNMA = INDEX_FMA_BASE + 35;
|
|
const UINT32 INDEX_FPNMA_ADD = INDEX_FMA_BASE + 36;
|
|
const UINT32 INDEX_FPNMA_MUL = INDEX_FMA_BASE + 37;
|
|
|
|
const UINT32 INDEX_SPECIAL_END = INDEX_FMA_BASE + 38;
|
|
|
|
BOOL IsMemReadIndex(UINT32 i)
|
|
{
|
|
return (INDEX_MEM_READ_SIZE <= i && i < INDEX_MEM_READ_SIZE + MAX_MEM_SIZE );
|
|
}
|
|
|
|
BOOL IsMemWriteIndex(UINT32 i)
|
|
{
|
|
return (INDEX_MEM_WRITE_SIZE <= i && i < INDEX_MEM_WRITE_SIZE + MAX_MEM_SIZE );
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
LOCALFUN UINT32 INS_GetIndex(INS ins)
|
|
{
|
|
UINT32 index = 0;
|
|
switch(measurement) {
|
|
case measure_opcode:
|
|
index = INS_Opcode(ins);
|
|
break;
|
|
case measure_ilen:
|
|
index = INS_Size(ins);
|
|
break;
|
|
case measure_category:
|
|
index = INS_Category(ins);
|
|
break;
|
|
case measure_iform:
|
|
{
|
|
xed_decoded_inst_t* xedd = INS_XedDec(ins);
|
|
xed_iform_enum_t iform = xed_decoded_inst_get_iform_enum(xedd);
|
|
index = static_cast<UINT32>(iform);
|
|
}
|
|
break;
|
|
}
|
|
return index;
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
|
|
LOCALFUN bool IsScalarSimd(INS ins)
|
|
{
|
|
xed_decoded_inst_t* xedd = INS_XedDec(ins);
|
|
return xed_decoded_inst_get_attribute(xedd, XED_ATTRIBUTE_SIMD_SCALAR);
|
|
}
|
|
|
|
LOCALFUN UINT32 IndexStringLength(BBL bbl, BOOL memory_access_profile)
|
|
{
|
|
UINT32 count = 0;
|
|
|
|
for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins))
|
|
{
|
|
count++; // one for the ins
|
|
if (measurement != measure_iform)
|
|
count++; // one for the ISA extension.
|
|
|
|
if( measurement == measure_opcode && memory_access_profile )
|
|
{
|
|
if( INS_IsMemoryRead(ins) ) count++; // for size
|
|
|
|
if( INS_IsStackRead(ins) ) count++;
|
|
|
|
if( INS_IsIpRelRead(ins) ) count++;
|
|
|
|
|
|
if( INS_IsMemoryWrite(ins) ) count++; // for size
|
|
|
|
if( INS_IsStackWrite(ins) ) count++;
|
|
|
|
if( INS_IsIpRelWrite(ins) ) count++;
|
|
|
|
if( INS_IsAtomicUpdate(ins) ) count++;
|
|
|
|
if (IsScalarSimd(ins)) count++;
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
LOCALFUN UINT32 MemsizeToIndex(UINT32 size, BOOL write)
|
|
{
|
|
return (write ? INDEX_MEM_WRITE_SIZE : INDEX_MEM_READ_SIZE ) + size;
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
LOCALFUN stat_index_t* INS_GenerateIndexString(INS ins, stat_index_t *stats, BOOL memory_access_profile)
|
|
{
|
|
*stats++ = INS_GetIndex(ins);
|
|
if (measurement != measure_iform)
|
|
*stats++ = INS_Extension(ins) + INDEX_EXTENSION;
|
|
|
|
if( measurement == measure_opcode && memory_access_profile )
|
|
{
|
|
if( INS_IsMemoryRead(ins) ) *stats++ = MemsizeToIndex( INS_MemoryReadSize(ins), 0 );
|
|
if( INS_IsMemoryWrite(ins) ) *stats++ = MemsizeToIndex( INS_MemoryWriteSize(ins), 1 );
|
|
|
|
if( INS_IsAtomicUpdate(ins) ) *stats++ = INDEX_MEM_ATOMIC;
|
|
|
|
if( INS_IsStackRead(ins) ) *stats++ = INDEX_STACK_READ;
|
|
if( INS_IsStackWrite(ins) ) *stats++ = INDEX_STACK_WRITE;
|
|
|
|
if( INS_IsIpRelRead(ins) ) *stats++ = INDEX_IPREL_READ;
|
|
if( INS_IsIpRelWrite(ins) ) *stats++ = INDEX_IPREL_WRITE;
|
|
|
|
if (IsScalarSimd(ins)) *stats++ = INDEX_SCALAR_SIMD;
|
|
}
|
|
|
|
return stats;
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
|
|
LOCALFUN string IndexToString( UINT32 index )
|
|
{
|
|
if (measurement == measure_iform)
|
|
{
|
|
return xed_iform_enum_t2str(static_cast<xed_iform_enum_t>(index));
|
|
}
|
|
|
|
if( INDEX_SPECIAL <= index && index < INDEX_SPECIAL_END)
|
|
{
|
|
if( index == INDEX_TOTAL ) return "*total";
|
|
else if( IsMemReadIndex(index) ) return "*mem-read-" + decstr( index - INDEX_MEM_READ_SIZE );
|
|
else if( IsMemWriteIndex(index)) return "*mem-write-" + decstr( index - INDEX_MEM_WRITE_SIZE );
|
|
else if( index == INDEX_MEM_ATOMIC ) return "*mem-atomic";
|
|
else if( index == INDEX_STACK_READ ) return "*stack-read";
|
|
else if( index == INDEX_STACK_WRITE ) return "*stack-write";
|
|
else if( index == INDEX_IPREL_READ ) return "*iprel-read";
|
|
else if( index == INDEX_IPREL_WRITE ) return "*iprel-write";
|
|
else if( index == INDEX_SCALAR_SIMD) return "*scalar-simd";
|
|
else if (index >= INDEX_EXTENSION && index < INDEX_EXTENSION + MAX_EXTENSION)
|
|
return "*isa-ext-" + EXTENSION_StringShort(index - INDEX_EXTENSION);
|
|
|
|
else if ( index == INDEX_FMA ) return "*FMA";
|
|
else if ( index == INDEX_FMA_ADD ) return "*FMA_ADD";
|
|
else if ( index == INDEX_FMA_MUL ) return "*FMA_MUL";
|
|
else if ( index == INDEX_FMA_S ) return "*FMA_S";
|
|
else if ( index == INDEX_FMA_S_ADD ) return "*FMA_S_ADD";
|
|
else if ( index == INDEX_FMA_S_MUL ) return "*FMA_S_MUL";
|
|
else if ( index == INDEX_FMA_D ) return "*FMA_D";
|
|
else if ( index == INDEX_FMA_D_ADD ) return "*FMA_D_ADD";
|
|
else if ( index == INDEX_FMA_D_MUL ) return "*FMA_D_MUL";
|
|
else if ( index == INDEX_FPMA ) return "*FPMA";
|
|
else if ( index == INDEX_FPMA_ADD ) return "*FPMA_ADD";
|
|
else if ( index == INDEX_FPMA_MUL ) return "*FPMA_MUL";
|
|
else if ( index == INDEX_FMS ) return "*FMS";
|
|
else if ( index == INDEX_FMS_SUB ) return "*FMS_SUB";
|
|
else if ( index == INDEX_FMS_MUL ) return "*FMS_MUL";
|
|
else if ( index == INDEX_FMS_S ) return "*FMS_S";
|
|
else if ( index == INDEX_FMS_S_SUB ) return "*FMS_S_SUB";
|
|
else if ( index == INDEX_FMS_S_MUL ) return "*FMS_S_MUL";
|
|
else if ( index == INDEX_FMS_D ) return "*FMS_D";
|
|
else if ( index == INDEX_FMS_D_SUB ) return "*FMS_D_SUB";
|
|
else if ( index == INDEX_FMS_D_MUL ) return "*FMS_D_MUL";
|
|
else if ( index == INDEX_FPMS ) return "*FPMS";
|
|
else if ( index == INDEX_FPMS_SUB ) return "*FPMS_SUB";
|
|
else if ( index == INDEX_FPMS_MUL ) return "*FPMS_MUL";
|
|
else if ( index == INDEX_FNMA ) return "*FNMA";
|
|
else if ( index == INDEX_FNMA_ADD ) return "*FNMA_ADD";
|
|
else if ( index == INDEX_FNMA_MUL ) return "*FNMA_MUL";
|
|
else if ( index == INDEX_FNMA_S ) return "*FNMA_S";
|
|
else if ( index == INDEX_FNMA_S_ADD ) return "*FNMA_S_ADD";
|
|
else if ( index == INDEX_FNMA_S_MUL ) return "*FNMA_S_MUL";
|
|
else if ( index == INDEX_FNMA_D ) return "*FNMA_D";
|
|
else if ( index == INDEX_FNMA_D_ADD ) return "*FNMA_D_ADD";
|
|
else if ( index == INDEX_FNMA_D_MUL ) return "*FNMA_D_MUL";
|
|
else if ( index == INDEX_FPNMA ) return "*FPNMA";
|
|
else if ( index == INDEX_FPNMA_ADD ) return "*FPNMA_ADD";
|
|
else if ( index == INDEX_FPNMA_MUL ) return "*FPNMA_MUL";
|
|
|
|
else
|
|
{
|
|
ASSERTX(0);
|
|
return "";
|
|
}
|
|
}
|
|
else if (measurement == measure_ilen)
|
|
{
|
|
ostringstream s;
|
|
s << "ILEN-" << index;
|
|
return s.str();
|
|
}
|
|
else if (measurement == measure_opcode)
|
|
{
|
|
return OPCODE_StringShort(index);
|
|
}
|
|
else if (measurement == measure_category)
|
|
{
|
|
return CATEGORY_StringShort(index);
|
|
}
|
|
ASSERTX(0);
|
|
return "";
|
|
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
/* ===================================================================== */
|
|
typedef UINT64 COUNTER;
|
|
|
|
|
|
/* zero initialized */
|
|
|
|
typedef map<UINT32,COUNTER> stat_map_t;
|
|
|
|
class CSTATS
|
|
{
|
|
public:
|
|
CSTATS()
|
|
{
|
|
clear();
|
|
}
|
|
|
|
stat_map_t unpredicated;
|
|
stat_map_t predicated;
|
|
stat_map_t predicated_true;
|
|
|
|
VOID clear()
|
|
{
|
|
unpredicated.erase(unpredicated.begin(),unpredicated.end());
|
|
predicated.erase(predicated.begin(),predicated.end());
|
|
predicated_true.erase(predicated_true.begin(),predicated_true.end());
|
|
}
|
|
};
|
|
|
|
class BBL_SORT_STATS
|
|
{
|
|
public:
|
|
ADDRINT _pc;
|
|
UINT32 _rtn_num;
|
|
COUNTER _icount;
|
|
COUNTER _executions;
|
|
COUNTER _nbytes;
|
|
};
|
|
|
|
CSTATS GlobalStatsStatic; // summary stats for static analysis
|
|
|
|
class BBLSTATS
|
|
{
|
|
// Our first pass sets up the types of stats we need to update for this
|
|
// block. We have one stat per instruction in the block. The _stats
|
|
// array is null terminated.
|
|
public:
|
|
const stat_index_t* const _stats;
|
|
const ADDRINT _pc; // start PC of the block
|
|
const UINT32 _ninst; // # of instructions
|
|
const UINT32 _nbytes; // # of bytes in the block
|
|
const UINT32 _rtn_num; // index of the routine address/function name information
|
|
BBLSTATS(stat_index_t* stats, ADDRINT pc, UINT32 ninst, UINT32 nbytes, UINT32 rtn_number)
|
|
: _stats(stats),
|
|
_pc(pc),
|
|
_ninst(ninst),
|
|
_nbytes(nbytes),
|
|
_rtn_num(rtn_number) {};
|
|
};
|
|
|
|
LOCALVAR vector<BBLSTATS*> statsList;
|
|
|
|
/* ===================================================================== */
|
|
|
|
#if defined(__GNUC__)
|
|
# if defined(TARGET_MAC) || defined(TARGET_WINDOWS)
|
|
// macOS* XCODE2.4.1 gcc and Cgywin gcc 3.4.x only allow for 16b
|
|
// alignment! So we need to pad!
|
|
# define ALIGN_LOCK __attribute__ ((aligned(16)))
|
|
# else
|
|
# define ALIGN_LOCK __attribute__ ((aligned(64)))
|
|
# endif
|
|
#else
|
|
# define ALIGN_LOCK __declspec(align(64))
|
|
#endif
|
|
|
|
typedef struct {
|
|
char pad0[64];
|
|
PIN_LOCK ALIGN_LOCK lock; /* for mediating output */
|
|
char pad1[64];
|
|
PIN_LOCK ALIGN_LOCK bbl_list_lock; /* for the bbl list */
|
|
char pad2[64];
|
|
PIN_LOCK ALIGN_LOCK rtn_table_lock; /* for the rtn table */
|
|
char pad3[64];
|
|
} ALIGN_LOCK locks_t;
|
|
|
|
locks_t locks;
|
|
|
|
|
|
static std::ofstream* out;
|
|
|
|
class thread_data_t
|
|
{
|
|
public:
|
|
thread_data_t()
|
|
: enabled(0)
|
|
{
|
|
}
|
|
CSTATS cstats;
|
|
vector<CSTATS> stats_per_function; // indexed by rtn_num
|
|
UINT32 enabled;
|
|
|
|
vector<COUNTER> block_counts;
|
|
|
|
UINT32 size()
|
|
{
|
|
UINT32 limit;
|
|
limit = block_counts.size();
|
|
return limit;
|
|
}
|
|
|
|
void resize(UINT32 n)
|
|
{
|
|
if (size() < n)
|
|
block_counts.resize(2*n);
|
|
}
|
|
|
|
};
|
|
|
|
thread_data_t* get_tls(THREADID tid)
|
|
{
|
|
thread_data_t* tdata =
|
|
static_cast<thread_data_t*>(PIN_GetThreadData(tls_key, tid));
|
|
return tdata;
|
|
}
|
|
|
|
VOID activate_counting(THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
tdata->enabled = 1;
|
|
}
|
|
VOID deactivate_counting(THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
tdata->enabled = 0;
|
|
}
|
|
|
|
UINT32 numThreads = 0;
|
|
|
|
VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v)
|
|
{
|
|
// This function is locked no need for a Pin Lock here
|
|
numThreads++;
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
*out << "# Starting tid " << tid << endl;
|
|
PIN_ReleaseLock(&locks.lock);
|
|
|
|
thread_data_t* tdata = new thread_data_t;
|
|
// remember my pointer for later
|
|
PIN_SetThreadData(tls_key, tdata, tid);
|
|
|
|
// make sure the thread is counting stuff.
|
|
|
|
// FIXME: The controller should start all threads if no trigger
|
|
// conditions are specified, but currently it only starts
|
|
// TID0. Starting here is wrong if the controller has a nontrivial
|
|
// starting condition, but this is what most people want. They can
|
|
// always stop the controller and zero the stats using markers as a
|
|
// workaround.
|
|
|
|
if (tid)
|
|
activate_counting(tid);
|
|
}
|
|
|
|
|
|
|
|
|
|
VOID emit_stats(THREADID tid); //forward prototype
|
|
VOID emit_pc_stats(THREADID tid); //forward prototype
|
|
VOID zero_stats(THREADID tid); //forward prototype
|
|
VOID Fini(int, VOID * v);
|
|
VOID emit_bbl_stats_sorted(THREADID tid);
|
|
LOCALVAR CONTROL_MANAGER control;
|
|
|
|
|
|
|
|
|
|
LOCALFUN VOID Handler(EVENT_TYPE ev, VOID *val, CONTEXT *ctxt, VOID *ip, THREADID tid, bool bcast)
|
|
{
|
|
switch(ev)
|
|
{
|
|
case EVENT_START:
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
*out << "# Start counting for tid " << tid << endl;
|
|
PIN_ReleaseLock(&locks.lock);
|
|
activate_counting(tid);
|
|
break;
|
|
case EVENT_STOP:
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
*out << "# Stop counting for tid " << tid << endl;
|
|
PIN_ReleaseLock(&locks.lock);
|
|
deactivate_counting(tid);
|
|
if (KnobEarlyOut) {
|
|
*out << "Exiting due to -early-out" << endl;
|
|
Fini(0, 0);
|
|
exit(0);
|
|
}
|
|
|
|
break;
|
|
case EVENT_STATS_EMIT:
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
*out << "# Emit stats for tid " << static_cast<int>(tid) << endl;
|
|
PIN_ReleaseLock(&locks.lock);
|
|
emit_stats(tid);
|
|
break;
|
|
case EVENT_STATS_RESET:
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
*out << "# Reset stats for tid " << static_cast<int>(tid) << endl;
|
|
PIN_ReleaseLock(&locks.lock);
|
|
zero_stats(tid);
|
|
break;
|
|
|
|
default:
|
|
ASSERTX(false);
|
|
}
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
VOID validate_bbl_count(THREADID tid, ADDRINT block_count_for_trace)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
tdata->resize(block_count_for_trace+1);
|
|
}
|
|
|
|
VOID PIN_FAST_ANALYSIS_CALL docount_bbl(ADDRINT block_id, THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
//ASSERTX(tdata->size() > block_id);
|
|
tdata->block_counts[block_id] += tdata->enabled;
|
|
}
|
|
|
|
|
|
VOID docount_predicated_true(UINT32 index, THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
if (tdata->enabled) {
|
|
stat_map_t::iterator i = tdata->cstats.predicated_true.find(index);
|
|
if (i == tdata->cstats.predicated_true.end())
|
|
tdata->cstats.predicated_true[index] = 1;
|
|
else
|
|
i->second += 1;
|
|
}
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
|
|
VOID zero_stats(THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
tdata->cstats.clear();
|
|
UINT32 limit = tdata->size();
|
|
for(UINT32 i=0;i< limit;i++)
|
|
tdata->block_counts[i]=0;
|
|
tdata->stats_per_function.erase( tdata->stats_per_function.begin(),
|
|
tdata->stats_per_function.end());
|
|
}
|
|
/* ===================================================================== */
|
|
|
|
VOID CheckForSpecialMarkers(INS ins, ADDRINT pc, unsigned int instruction_size)
|
|
{
|
|
// This checks for single instances of special 3B NOPs.
|
|
// 0F1FF3 - start
|
|
// 0F1FF4 - stop
|
|
// 0F1FF5 - emit stats
|
|
// 0F1FF6 - zero stats
|
|
|
|
// FIXME: if there are collisions with existing instructions, we can
|
|
// change them here.
|
|
|
|
//FIXME: Ideally this would be integrated in to the control.H so file
|
|
//so that anything can use it.
|
|
if (instruction_size != 3)
|
|
return;
|
|
|
|
UINT8* pc_ptr = reinterpret_cast<UINT8*>(pc);
|
|
if (pc_ptr[0] == 0x0F &&
|
|
pc_ptr[1] == 0x1F)
|
|
{
|
|
switch(pc_ptr[2])
|
|
{
|
|
case 0xF3: // start
|
|
INS_InsertCall(ins,
|
|
IPOINT_BEFORE,
|
|
(AFUNPTR)activate_counting,
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
break;
|
|
case 0xF4: // stop
|
|
INS_InsertCall(ins,
|
|
IPOINT_BEFORE,
|
|
(AFUNPTR)deactivate_counting,
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
break;
|
|
case 0xF5: // emit
|
|
INS_InsertCall(ins,
|
|
IPOINT_BEFORE,
|
|
(AFUNPTR)emit_stats,
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
break;
|
|
case 0xF6: // zero
|
|
INS_InsertCall(ins,
|
|
IPOINT_BEFORE,
|
|
(AFUNPTR)zero_stats,
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
class RTN_TABLE_ENTRY
|
|
{
|
|
public:
|
|
ADDRINT _address;
|
|
ADDRINT _end_address;
|
|
const char* _name;
|
|
UINT32 _rtn_num; /* facilitates sorting later */
|
|
UINT64 _total;
|
|
|
|
RTN_TABLE_ENTRY(ADDRINT address=0, ADDRINT end_address=0, const char* name=0, UINT32 rtn_num=0)
|
|
: _address(address),
|
|
_end_address(end_address),
|
|
_name(name),
|
|
_rtn_num(rtn_num), _total(0) {}
|
|
};
|
|
|
|
static vector<RTN_TABLE_ENTRY> rtn_table;
|
|
|
|
static int qsort_rtn_compare_fn(const void *a, const void *b)
|
|
{
|
|
/*descending sort*/
|
|
const RTN_TABLE_ENTRY* ba = static_cast<const RTN_TABLE_ENTRY*>(a);
|
|
const RTN_TABLE_ENTRY* bb = static_cast<const RTN_TABLE_ENTRY*>(b);
|
|
|
|
if (ba->_total < bb->_total)
|
|
return 1;
|
|
if (ba->_total > bb->_total)
|
|
return -1;
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
UINT32 routine_identifier(RTN rtn, THREADID tid)
|
|
{
|
|
if (!RTN_Valid(rtn)) {
|
|
return 0; /* we never return nonzero global routine numbers */
|
|
}
|
|
ADDRINT rtn_address = RTN_Address(rtn);
|
|
ADDRINT rtn_end_address;
|
|
const char *rtn_name = RTN_Name(rtn).c_str();
|
|
|
|
/* RTN_Id returns a unique global identifier. However, the numeration is not necessarily consecutive
|
|
so we keep our own numeration for convenience (and legacy) - see comment below. */
|
|
UINT32 rtn_num = RTN_Id(rtn);
|
|
UINT32 img_num = IMG_Id(SEC_Img(RTN_Sec(rtn)));
|
|
|
|
/* The pair <img_id, rtn_id> is used to index a map of global routine
|
|
numbers. The function is stored at the global routine number in the
|
|
rtn_table. The basic blocks are tagged with the same number so they
|
|
can be reassociated later for printing. */
|
|
static UINT32 next_rtn_num = 1;
|
|
UINT32 global_rtn_num;
|
|
typedef pair<UINT32,UINT32> rtn_identifier_t;
|
|
static map<rtn_identifier_t, UINT32> rtn_mapper;
|
|
rtn_identifier_t ri(img_num,rtn_num);
|
|
PIN_GetLock(&locks.rtn_table_lock,tid+2);
|
|
map<rtn_identifier_t, UINT32>::iterator it = rtn_mapper.find(ri);
|
|
if (it == rtn_mapper.end()) {
|
|
rtn_mapper[ri] = next_rtn_num;
|
|
global_rtn_num = next_rtn_num;
|
|
next_rtn_num++;
|
|
|
|
/* store the RTN info using the global routine number */
|
|
|
|
if (global_rtn_num >= rtn_table.size())
|
|
rtn_table.resize(global_rtn_num*2+1);
|
|
rtn_end_address = RTN_Address(rtn) + RTN_Size(rtn);
|
|
rtn_table[global_rtn_num ] = RTN_TABLE_ENTRY (rtn_address,
|
|
rtn_end_address,
|
|
strdup(rtn_name), global_rtn_num);
|
|
}
|
|
else {
|
|
global_rtn_num = it->second;
|
|
}
|
|
PIN_ReleaseLock(&locks.rtn_table_lock);
|
|
|
|
return global_rtn_num;
|
|
}
|
|
|
|
|
|
VOID Trace(TRACE trace, VOID *v)
|
|
{
|
|
static UINT32 basic_blocks = 0;
|
|
|
|
if ( KnobNoSharedLibs.Value()
|
|
&& IMG_Type(SEC_Img(RTN_Sec(TRACE_Rtn(trace)))) == IMG_TYPE_SHAREDLIB)
|
|
return;
|
|
|
|
|
|
const BOOL accurate_handling_of_predicates = KnobProfilePredicated.Value();
|
|
ADDRINT pc = TRACE_Address(trace);
|
|
|
|
RTN rtn = TRACE_Rtn(trace);
|
|
UINT32 rtn_id = routine_identifier(rtn,1); /* fake tid for the instrumenting code */
|
|
|
|
UINT32 new_blocks = 0;
|
|
for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl))
|
|
{
|
|
const INS head = BBL_InsHead(bbl);
|
|
if (! INS_Valid(head)) continue;
|
|
new_blocks++;
|
|
}
|
|
|
|
|
|
TRACE_InsertCall(trace,
|
|
IPOINT_BEFORE,
|
|
AFUNPTR(validate_bbl_count),
|
|
IARG_THREAD_ID,
|
|
IARG_UINT32,
|
|
basic_blocks+new_blocks,
|
|
IARG_END);
|
|
|
|
for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl))
|
|
{
|
|
const INS head = BBL_InsHead(bbl);
|
|
if (! INS_Valid(head)) continue;
|
|
|
|
// Summarize the stats for the bbl in a 0 terminated list
|
|
// This is done at instrumentation time
|
|
const UINT32 n = IndexStringLength(bbl, 1);
|
|
|
|
ADDRINT block_start_pc = pc;
|
|
|
|
// stats is an array of index types. We later multiply it by the
|
|
// dynamic count for a block.
|
|
stat_index_t *const stats = new stat_index_t[ n + 1];
|
|
stat_index_t *const stats_end = stats + (n + 1);
|
|
stat_index_t *curr = stats;
|
|
UINT32 ninsts = 0;
|
|
for (INS ins = head; INS_Valid(ins); ins = INS_Next(ins))
|
|
{
|
|
unsigned int instruction_size = INS_Size(ins);
|
|
// This checks for x86-specific opcodes
|
|
CheckForSpecialMarkers(ins, pc, instruction_size);
|
|
|
|
// Count the number of times a predicated instruction is actually executed
|
|
// this is expensive and hence disabled by default
|
|
if( INS_IsPredicated(ins) && accurate_handling_of_predicates )
|
|
{
|
|
INS_InsertPredicatedCall(ins,
|
|
IPOINT_BEFORE,
|
|
AFUNPTR(docount_predicated_true),
|
|
IARG_UINT32,
|
|
INS_GetIndex(ins),
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
}
|
|
|
|
|
|
if (KnobMapToFile) {
|
|
INT32 line;
|
|
string filename;
|
|
PIN_GetSourceLocation(pc, NULL, &line, &filename);
|
|
if (!filename.empty())
|
|
*out << "MAPADDR 0x" << hex << pc << " "
|
|
<< dec << line << " " << filename
|
|
<< endl;
|
|
}
|
|
curr = INS_GenerateIndexString(ins,curr,1);
|
|
pc = pc + instruction_size;
|
|
ninsts++;
|
|
}
|
|
|
|
// stats terminator
|
|
*curr++ = 0;
|
|
ASSERTX( curr == stats_end );
|
|
|
|
// Insert instrumentation to count the number of times the bbl is executed
|
|
BBLSTATS * bblstats = new BBLSTATS(stats, block_start_pc, ninsts, pc-block_start_pc, rtn_id);
|
|
INS_InsertCall(head,
|
|
IPOINT_BEFORE,
|
|
AFUNPTR(docount_bbl),
|
|
IARG_FAST_ANALYSIS_CALL,
|
|
IARG_UINT32,
|
|
basic_blocks,
|
|
IARG_THREAD_ID,
|
|
IARG_END);
|
|
|
|
// Remember the counter and stats so we can compute a summary at the end
|
|
basic_blocks++;
|
|
PIN_GetLock(&locks.bbl_list_lock,1);
|
|
statsList.push_back(bblstats);
|
|
PIN_ReleaseLock(&locks.bbl_list_lock);
|
|
}
|
|
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
VOID DumpStats(ofstream& out,
|
|
CSTATS& stats,
|
|
BOOL predicated_true,
|
|
const string& title,
|
|
THREADID tid)
|
|
{
|
|
out << "#\n# " << title << "\n#\n";
|
|
if (tid != INVALID_THREADID)
|
|
out << "# TID " << tid << "\n";
|
|
out << "# ";
|
|
const char *label = 0;
|
|
if (measurement == measure_opcode)
|
|
label = "opcode";
|
|
else if (measurement == measure_ilen)
|
|
label = "inslen";
|
|
else if (measurement == measure_category)
|
|
label = "category";
|
|
else if (measurement == measure_iform)
|
|
label = "iform";
|
|
|
|
if (label)
|
|
out << ljstr(label,24);
|
|
|
|
out<< setw(16) << "count";
|
|
if( predicated_true )
|
|
out << " count-predicated-true";
|
|
out << "\n#\n";
|
|
|
|
// Compute the "total" bin. Stop at the INDEX_SPECIAL for all histograms
|
|
// except the iform. Iforms do not use the special rows, so we count everything.
|
|
|
|
// build a map of the valid stats index values for all 3 tables.
|
|
map<UINT32, bool> m;
|
|
|
|
COUNTER tu=0;
|
|
for(stat_map_t::iterator it = stats.unpredicated.begin() ; it != stats.unpredicated.end() ; it++) {
|
|
if (measurement == measure_iform || it->first < INDEX_SPECIAL)
|
|
tu += it->second;
|
|
m[it->first]=true;
|
|
}
|
|
|
|
COUNTER tpt=0;
|
|
for(stat_map_t::iterator it=stats.predicated_true.begin();it != stats.predicated_true.end() ; it++) {
|
|
if (measurement == measure_iform || it->first < INDEX_SPECIAL)
|
|
tpt += it->second;
|
|
m[it->first]=true;
|
|
}
|
|
|
|
for(map<UINT32,bool>::iterator it = m.begin(); it != m.end(); it++) {
|
|
stat_map_t::iterator s;
|
|
COUNTER up=0;
|
|
UINT32 indx = it->first;
|
|
|
|
s = stats.unpredicated.find(indx);
|
|
if (s != stats.unpredicated.end())
|
|
up = s->second;
|
|
|
|
if (up == 0)
|
|
continue;
|
|
|
|
out << ljstr(IndexToString(indx),25) << " " << setw(16) << up;
|
|
if( predicated_true ) {
|
|
COUNTER prt=0;
|
|
s = stats.predicated_true.find(indx);
|
|
if (s != stats.predicated_true.end())
|
|
{
|
|
prt = s->second;
|
|
out << " " << setw(16) << prt;
|
|
}
|
|
}
|
|
out << endl;
|
|
}
|
|
|
|
// print the totals
|
|
out << ljstr("*total",25) << " " << setw(16) << tu;
|
|
if( predicated_true )
|
|
out << " " << setw(16) << tpt;
|
|
out << endl;
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
static UINT32 stat_dump_count = 0;
|
|
|
|
VOID emit_bbl_stats_by_function(THREADID tid)
|
|
{
|
|
// dynamic counts for one thread and for each function of that thread.
|
|
|
|
thread_data_t* tdata = get_tls(tid);
|
|
|
|
// grab a copy of the routines list that we can sort and where we can
|
|
// compute function totals. Need to lock because instrumentation might
|
|
// reallocate the vector.
|
|
PIN_GetLock(&locks.rtn_table_lock,tid+2);
|
|
UINT32 functions = rtn_table.size();
|
|
RTN_TABLE_ENTRY* rtn_table_sorted = new RTN_TABLE_ENTRY[functions];
|
|
for(UINT32 i=0; i<functions;i++) {
|
|
rtn_table_sorted[i]=rtn_table[i];
|
|
}
|
|
PIN_ReleaseLock(&locks.rtn_table_lock);
|
|
|
|
|
|
// Need to lock here because we might be resize (and thus reallocing)
|
|
// the statsList when we do a push_back in the instrumentation.
|
|
PIN_GetLock(&locks.bbl_list_lock,tid+2);
|
|
|
|
if (tdata->stats_per_function.size() < functions)
|
|
tdata->stats_per_function.resize(functions+1);
|
|
|
|
UINT32 limit = tdata->size();
|
|
if ( limit > statsList.size() )
|
|
limit = statsList.size();
|
|
for(UINT32 i=0;i< limit ; i++)
|
|
{
|
|
COUNTER bcount = tdata->block_counts[i];
|
|
BBLSTATS* b = statsList[i];
|
|
/* the last test below is for when new bbl's get jitted while we
|
|
* are emitting stats */
|
|
if (b && b->_stats && b->_rtn_num < functions)
|
|
for (const stat_index_t* stats = b->_stats; *stats; stats++) {
|
|
tdata->cstats.unpredicated[*stats] += bcount;
|
|
//*out << "# stat for block " << b->_rtn_num << endl;
|
|
tdata->stats_per_function[b->_rtn_num].unpredicated[*stats] += bcount;
|
|
if (*stats < INDEX_SPECIAL)
|
|
rtn_table_sorted[b->_rtn_num]._total += bcount;
|
|
}
|
|
}
|
|
|
|
|
|
PIN_ReleaseLock(&locks.bbl_list_lock);
|
|
|
|
// emit the "normal" dynamic stats
|
|
|
|
|
|
*out << "# EMIT_DYNAMIC_STATS FOR TID " << tid << " EMIT #" << stat_dump_count << endl;
|
|
DumpStats(*out, tdata->cstats, KnobProfilePredicated, "$dynamic-counts",tid);
|
|
*out << "# END_DYNAMIC_STATS" << endl;
|
|
|
|
// sort the routines by the total instr count
|
|
qsort(rtn_table_sorted, functions, sizeof(RTN_TABLE_ENTRY), qsort_rtn_compare_fn);
|
|
|
|
// total every thing up for all functions in this thread
|
|
UINT64 total = 0;
|
|
for(UINT32 i=0; i<functions;i++)
|
|
if (rtn_table_sorted[i]._address && rtn_table_sorted[i]._total)
|
|
total += rtn_table_sorted[i]._total;
|
|
|
|
// print the functions out per thread, sorted order
|
|
double cumulative = 0;
|
|
*out << "# FUNCTION TOTALS FOR TID " << tid << endl;
|
|
*out << "#rank total % cumulative% address function name" << endl;
|
|
for(UINT32 i=0; i<functions;i++) {
|
|
if (rtn_table_sorted[i]._address && rtn_table_sorted[i]._total) {
|
|
cumulative += rtn_table_sorted[i]._total; // FP GUARD
|
|
*out << setw(4) << i << ": "
|
|
<< setw(12) << rtn_table_sorted[i]._total
|
|
<< " "
|
|
<< fltstr(100.0 * rtn_table_sorted[i]._total / total, 3,7) //FP GUARD
|
|
<< " "
|
|
<< fltstr(100.0 * cumulative / total,3,7) //FP GUARD
|
|
<< " "
|
|
<< hex << setw(16) << rtn_table_sorted[i]._address << dec
|
|
<< " ";
|
|
if (rtn_table_sorted[i]._name)
|
|
*out << rtn_table_sorted[i]._name;
|
|
else
|
|
*out << "UNKNOWN";
|
|
*out << endl;
|
|
}
|
|
}
|
|
*out << setw(4) << " " << " " << setw(12) << total << " " << "TOTAL" << endl;
|
|
*out << "# END FUNCTION TOTALS" << endl;
|
|
|
|
// print the functions histos for the nonempty functions
|
|
*out << "# EMIT_PER_FUNCTION_STATS FOR TID " << tid << " EMIT# " << stat_dump_count << endl;
|
|
for(UINT32 i=0; i< functions;i++) {
|
|
if (rtn_table_sorted[i]._address && rtn_table_sorted[i]._total) {
|
|
string title = "$dynamic-counts-for-function: ";
|
|
if (rtn_table_sorted[i]._name)
|
|
title += rtn_table_sorted[i]._name ;
|
|
else
|
|
title += "UNKNOWN";
|
|
/* {x|a <= x < b}, or [a,b) */
|
|
title += " at [" + hexstr(rtn_table_sorted[i]._address) + ", ";
|
|
title += hexstr(rtn_table_sorted[i]._end_address) + ")"; /* right endpoint is open interval */
|
|
double pct = 100.0 * rtn_table_sorted[i]._total / total; // FP GUARD
|
|
title += " " + fltstr(pct,3,7) + "%";
|
|
// we sorted, so get the original routine number
|
|
UINT32 rtn_num = rtn_table_sorted[i]._rtn_num;
|
|
DumpStats(*out, tdata->stats_per_function[rtn_num], KnobProfilePredicated, title,tid);
|
|
}
|
|
}
|
|
*out << "# END_PER_FUNCTION_STATS " << endl;
|
|
delete [] rtn_table_sorted;
|
|
}
|
|
|
|
|
|
int qsort_compare_fn(const void *a, const void *b)
|
|
{
|
|
const BBL_SORT_STATS* ba = static_cast<const BBL_SORT_STATS*>(a);
|
|
const BBL_SORT_STATS* bb = static_cast<const BBL_SORT_STATS*>(b);
|
|
if (bb->_icount > ba->_icount)
|
|
return 1;
|
|
if (bb->_icount < ba->_icount)
|
|
return -1;
|
|
return 0;
|
|
}
|
|
|
|
VOID emit_bbl_stats_sorted(THREADID tid)
|
|
{
|
|
/* emit the top blocks for this tid */
|
|
|
|
thread_data_t* tdata = get_tls(tid);
|
|
// dynamic Counts
|
|
|
|
// Need to lock here because we might be resize (and thus reallocing)
|
|
// the statsList when we do a push_back in the instrumentation.
|
|
PIN_GetLock(&locks.bbl_list_lock,tid+2);
|
|
UINT32 limit = tdata->size();
|
|
if ( limit > statsList.size() )
|
|
limit = statsList.size();
|
|
BBL_SORT_STATS* icounts = new BBL_SORT_STATS[limit];
|
|
COUNTER thread_total = 0;
|
|
for(UINT32 i=0;i< limit ; i++)
|
|
{
|
|
BBLSTATS* b = statsList[i];
|
|
if (b) {
|
|
COUNTER bcount = tdata->block_counts[i];
|
|
COUNTER x = b->_ninst;
|
|
x = x * bcount;
|
|
//*out << hex << "ALL PC: " << b->_pc << " COUNT: " << x << endl;
|
|
icounts[i]._icount = x;
|
|
icounts[i]._pc = b->_pc;
|
|
icounts[i]._rtn_num = b->_rtn_num;
|
|
icounts[i]._executions = bcount;
|
|
icounts[i]._nbytes = b->_nbytes;
|
|
thread_total += icounts[i]._icount;
|
|
}
|
|
}
|
|
PIN_ReleaseLock(&locks.bbl_list_lock);
|
|
|
|
qsort(icounts, limit, sizeof(BBL_SORT_STATS), qsort_compare_fn);
|
|
|
|
PIN_GetLock(&locks.rtn_table_lock,tid+2);
|
|
|
|
*out << "# EMIT_TOP_BLOCK_STATS FOR TID " << tid
|
|
<< " EMIT # " << stat_dump_count
|
|
<< endl;
|
|
if (limit > KnobTopBlocks.Value())
|
|
limit = KnobTopBlocks.Value();
|
|
COUNTER t =0;
|
|
for(UINT32 i=0;i<limit;i++) {
|
|
t+= icounts[i]._icount;
|
|
*out << "BLOCK: " << setw(5) << i
|
|
<< " PC: "
|
|
<< hex
|
|
<< setfill('0')
|
|
<< setw(sizeof(ADDRINT)*2) << icounts[i]._pc
|
|
<< setfill(' ')
|
|
<< dec
|
|
<< " ICOUNT: "
|
|
<< setw(9) << icounts[i]._icount
|
|
<< " EXECUTIONS: "
|
|
<< setw(9) << icounts[i]._executions
|
|
<< " #BYTES: "
|
|
<< setw(2) << icounts[i]._nbytes
|
|
<< " %: "
|
|
<< setw(5) << setprecision(3) << 100.0*icounts[i]._icount/thread_total // FP GUARD
|
|
<< " cumltv%: "
|
|
<< setw(5) << setprecision(3) << 100.0*t/thread_total; // FP GUARD
|
|
UINT32 idx = icounts[i]._rtn_num;
|
|
|
|
if (rtn_table.size() > idx)
|
|
if (rtn_table[idx]._name)
|
|
*out << " FN: " << rtn_table[idx]._name;
|
|
*out << endl;
|
|
if (KnobShowDisassembly) {
|
|
string s = disassemble(icounts[i]._pc, icounts[i]._pc + icounts[i]._nbytes);
|
|
*out << s << endl;
|
|
}
|
|
}
|
|
|
|
*out << "# END_TOP_BLOCK_STATS" << endl;
|
|
PIN_ReleaseLock(&locks.rtn_table_lock);
|
|
delete [] icounts;
|
|
}
|
|
|
|
VOID emit_static_stats()
|
|
{
|
|
*out << "# EMIT_STATIC_STATS " << endl;
|
|
DumpStats(*out, GlobalStatsStatic, false, "$static-counts",INVALID_THREADID);
|
|
*out << endl << "# END_STATIC_STATS" << endl;
|
|
}
|
|
|
|
VOID emit_pc_stats(THREADID tid)
|
|
{
|
|
thread_data_t* tdata = get_tls(tid);
|
|
// dynamic Counts
|
|
|
|
// Need to lock here because we might be resize (and thus reallocing)
|
|
// the statsList when we do a push_back in the instrumentation.
|
|
|
|
*out << "# EMIT_PC_STATS FOR TID " << tid << endl;
|
|
PIN_GetLock(&locks.bbl_list_lock,tid+2);
|
|
UINT32 limit = tdata->size();
|
|
if ( limit > statsList.size() )
|
|
limit = statsList.size();
|
|
for(UINT32 i=0;i< limit ; i++)
|
|
{
|
|
COUNTER bcount = tdata->block_counts[i];
|
|
BBLSTATS* b = statsList[i];
|
|
if (bcount && b && b->_stats)
|
|
*out << "BLOCKCOUNT 0x" << hex << b->_pc << " " << dec << (bcount * b->_ninst ) << endl;
|
|
}
|
|
PIN_ReleaseLock(&locks.bbl_list_lock);
|
|
*out << "# END_EMIT_PC_STATS FOR TID " << tid << endl;
|
|
}
|
|
|
|
VOID emit_stats(THREADID tid)
|
|
{
|
|
/* we must save the X87 state because we do floating point computation
|
|
* when doing output and on 32b platforms, that can use x87. Pin does
|
|
* not save the X87 state, so analysis routines must do it
|
|
* themselves. */
|
|
mix_fp_save_buffer_t save_buf;
|
|
unsigned char* p = MIX_FP_ALIGN(&save_buf); /* make sure it is 16B aligned */
|
|
MIX_FP_SAVE(p);
|
|
|
|
PIN_GetLock(&locks.lock, tid+2); // for output
|
|
stat_dump_count++;
|
|
*out << "# ==============================================" << endl;
|
|
*out << "# STATS FOR TID " << tid << " EMIT# " << stat_dump_count << endl;
|
|
*out << "# ==============================================" << endl;
|
|
emit_bbl_stats_sorted(tid); // top blocks
|
|
emit_bbl_stats_by_function(tid); // function level stats
|
|
if (KnobMapToFile)
|
|
emit_pc_stats(tid);
|
|
PIN_ReleaseLock(&locks.lock);
|
|
MIX_FP_RELOAD(p);
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
|
|
void combine_dynamic_stats(unsigned int numThreads)
|
|
{
|
|
// combine all the rows from each thread in to the total variable.
|
|
CSTATS total;
|
|
for (THREADID i=0;i<numThreads; i++)
|
|
{
|
|
thread_data_t* tdata = get_tls(i);
|
|
|
|
for(stat_map_t::iterator it = tdata->cstats.unpredicated.begin(); it != tdata->cstats.unpredicated.end() ; it++) {
|
|
stat_map_t::iterator x = total.unpredicated.find(it->first);
|
|
if (x == total.unpredicated.end())
|
|
total.unpredicated[it->first] = it->second;
|
|
else
|
|
x->second += it->second;
|
|
}
|
|
|
|
for(stat_map_t::iterator it = tdata->cstats.predicated.begin(); it != tdata->cstats.predicated.end() ; it++) {
|
|
stat_map_t::iterator x = total.predicated.find(it->first);
|
|
if (x == total.predicated.end())
|
|
total.predicated[it->first] = it->second;
|
|
else
|
|
x->second += it->second;
|
|
}
|
|
|
|
|
|
for(stat_map_t::iterator it = tdata->cstats.predicated_true.begin(); it != tdata->cstats.predicated_true.end() ; it++) {
|
|
stat_map_t::iterator x = total.predicated_true.find(it->first);
|
|
if (x == total.predicated_true.end())
|
|
total.predicated_true[it->first] = it->second;
|
|
else
|
|
x->second += it->second;
|
|
}
|
|
}
|
|
|
|
*out << "# EMIT_GLOBAL_DYNAMIC_STATS EMIT# " << stat_dump_count << endl;
|
|
DumpStats(*out, total, false, "$global-dynamic-counts",INVALID_THREADID);
|
|
*out << endl << "# END_GLOBAL_DYNAMIC_STATS" << endl;
|
|
|
|
}
|
|
|
|
VOID Fini(int, VOID * v) // only runs once for the application
|
|
{
|
|
*out << "# FINI: end of program" << endl;
|
|
for(unsigned int i=0;i<numThreads;i++)
|
|
emit_stats(i);
|
|
emit_static_stats();
|
|
combine_dynamic_stats(numThreads);
|
|
|
|
out->close();
|
|
}
|
|
|
|
|
|
/* ===================================================================== */
|
|
|
|
VOID Image(IMG img, VOID * v)
|
|
{
|
|
for (SEC sec = IMG_SecHead(img); SEC_Valid(sec); sec = SEC_Next(sec))
|
|
{
|
|
for (RTN rtn = SEC_RtnHead(sec); RTN_Valid(rtn); rtn = RTN_Next(rtn))
|
|
{
|
|
// Prepare for processing of RTN, an RTN is not broken up into BBLs,
|
|
// it is merely a sequence of INSs
|
|
RTN_Open(rtn);
|
|
|
|
for (INS ins = RTN_InsHead(rtn); INS_Valid(ins); ins = INS_Next(ins))
|
|
{
|
|
stat_index_t array[256];
|
|
stat_index_t* mid = INS_GenerateIndexString(ins,array,1);
|
|
stat_index_t* end = mid;
|
|
|
|
if( INS_IsPredicated(ins) )
|
|
{
|
|
for( stat_index_t *start= array; start < end; start++)
|
|
{
|
|
GlobalStatsStatic.predicated[ *start ]++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for( stat_index_t *start= array; start < end; start++)
|
|
{
|
|
GlobalStatsStatic.unpredicated[ *start ]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// to preserve space, release data associated with RTN after we have processed it
|
|
RTN_Close(rtn);
|
|
}
|
|
}
|
|
|
|
if( KnobProfileStaticOnly.Value() )
|
|
{
|
|
Fini(0,0);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////
|
|
// Add a disassembler
|
|
/////////////////////////////////////////////////////////////////////////
|
|
|
|
static char nibble_to_ascii_hex(UINT8 i) {
|
|
if (i<10) return i+'0';
|
|
if (i<16) return i-10+'A';
|
|
return '?';
|
|
}
|
|
|
|
static void print_hex_line(char* buf, const UINT8* array, const int length) {
|
|
int n = length;
|
|
int i=0;
|
|
if (length == 0)
|
|
n = XED_MAX_INSTRUCTION_BYTES;
|
|
for( i=0 ; i< n; i++) {
|
|
buf[2*i+0] = nibble_to_ascii_hex(array[i]>>4);
|
|
buf[2*i+1] = nibble_to_ascii_hex(array[i]&0xF);
|
|
}
|
|
buf[2*i]=0;
|
|
}
|
|
|
|
|
|
static string
|
|
disassemble(UINT64 start, UINT64 stop) {
|
|
UINT64 pc = start;
|
|
xed_state_t dstate;
|
|
xed_syntax_enum_t syntax = XED_SYNTAX_INTEL;
|
|
xed_error_enum_t xed_error;
|
|
xed_decoded_inst_t xedd;
|
|
ostringstream os;
|
|
if (sizeof(ADDRINT) == 4)
|
|
xed_state_init(&dstate,
|
|
XED_MACHINE_MODE_LEGACY_32,
|
|
XED_ADDRESS_WIDTH_32b,
|
|
XED_ADDRESS_WIDTH_32b);
|
|
else
|
|
xed_state_init(&dstate,
|
|
XED_MACHINE_MODE_LONG_64,
|
|
XED_ADDRESS_WIDTH_64b,
|
|
XED_ADDRESS_WIDTH_64b);
|
|
|
|
while( pc < stop ) {
|
|
xed_decoded_inst_zero_set_mode(&xedd, &dstate);
|
|
UINT32 len = 15;
|
|
if (stop - pc < 15)
|
|
len = stop-pc;
|
|
|
|
xed_error = xed_decode(&xedd, reinterpret_cast<const UINT8*>(pc), len);
|
|
bool okay = (xed_error == XED_ERROR_NONE);
|
|
iostream::fmtflags fmt = os.flags();
|
|
os << std::setfill('0')
|
|
<< "XDIS "
|
|
<< std::hex
|
|
<< std::setw(sizeof(ADDRINT)*2)
|
|
<< pc
|
|
<< std::dec
|
|
<< ": "
|
|
<< std::setfill(' ')
|
|
<< std::setw(4);
|
|
|
|
if (okay) {
|
|
char buffer[200];
|
|
unsigned int dec_len, sp;
|
|
|
|
os << xed_extension_enum_t2str(xed_decoded_inst_get_extension(&xedd));
|
|
dec_len = xed_decoded_inst_get_length(&xedd);
|
|
print_hex_line(buffer, reinterpret_cast<UINT8*>(pc), dec_len);
|
|
os << " " << buffer;
|
|
for ( sp=dec_len; sp < 12; sp++) // pad out the instruction bytes
|
|
os << " ";
|
|
os << " ";
|
|
memset(buffer,0,200);
|
|
int dis_okay = xed_format_context(syntax, &xedd, buffer, 200, pc, 0, 0);
|
|
if (dis_okay)
|
|
os << buffer << endl;
|
|
else
|
|
os << "Error disasassembling pc 0x" << std::hex << pc << std::dec << endl;
|
|
pc += dec_len;
|
|
}
|
|
else { // print the byte and keep going.
|
|
UINT8 memval = *reinterpret_cast<UINT8*>(pc);
|
|
os << "???? " // no extension
|
|
<< std::hex
|
|
<< std::setw(2)
|
|
<< std::setfill('0')
|
|
<< static_cast<UINT32>(memval)
|
|
<< std::endl;
|
|
pc += 1;
|
|
}
|
|
os.flags(fmt);
|
|
}
|
|
return os.str();
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
|
|
int main(int argc, CHAR **argv)
|
|
{
|
|
PIN_InitSymbols();
|
|
if( PIN_Init(argc,argv) )
|
|
return Usage();
|
|
|
|
PIN_InitLock(&locks.lock);
|
|
PIN_InitLock(&locks.bbl_list_lock);
|
|
PIN_InitLock(&locks.rtn_table_lock);
|
|
|
|
// obtain a key for TLS storage
|
|
tls_key = PIN_CreateThreadDataKey(0);
|
|
|
|
string filename = KnobOutputFile.Value();
|
|
if (KnobPid)
|
|
{
|
|
filename += "." + decstr(getpid());
|
|
}
|
|
out = new std::ofstream(filename.c_str());
|
|
*out << "# Mix output version 2" << endl;
|
|
control.RegisterHandler(Handler, 0, FALSE);
|
|
control.Activate();
|
|
|
|
|
|
// make sure that exactly one thing-to-count knob is specified.
|
|
if (KnobInstructionLengthMix.Value() && KnobCategoryMix.Value()) {
|
|
cerr << "Must have at most one of: -iform, -ilen or -category "
|
|
<< "as a pintool option" << endl;
|
|
exit(1);
|
|
}
|
|
if (KnobInstructionLengthMix.Value())
|
|
measurement = measure_ilen;
|
|
if (KnobCategoryMix.Value())
|
|
measurement = measure_category;
|
|
if (KnobIformMix.Value()) {
|
|
measurement = measure_iform;
|
|
}
|
|
|
|
|
|
TRACE_AddInstrumentFunction(Trace, 0);
|
|
PIN_AddThreadStartFunction(ThreadStart, 0);
|
|
PIN_AddFiniFunction(Fini, 0);
|
|
|
|
if( !KnobProfileDynamicOnly.Value() )
|
|
IMG_AddInstrumentFunction(Image, 0);
|
|
|
|
PIN_StartProgram(); // Never returns
|
|
return 0;
|
|
}
|
|
|
|
/* ===================================================================== */
|
|
/* eof */
|
|
/* ===================================================================== */
|