/* * Copyright 2002-2019 Intel Corporation. * * This software is provided to you as Sample Source Code as defined in the accompanying * End User License Agreement for the Intel(R) Software Development Products ("Agreement") * section 1.L. * * This software and the related documents are provided as is, with no express or implied * warranties, other than those that are expressly stated in the License. */ /* ===================================================================== */ /*! @file This file contains a static and dynamic opcode/ISA extension/ISA * category mix profiler * * This is derived from mix.cpp. Handles an arbitrary number of threads * using TLS for data storage and avoids locking, except during I/O. */ #if defined(TARGET_WINDOWS) #define strdup _strdup #endif #include #include #include #include #include #include #include #include /* for pair */ #include #include #include "pin.H" #include "control_manager.H" #include "mix-fp-state.H" using namespace CONTROLLER; // key for accessing TLS storage in the threads. initialized once in main() static TLS_KEY tls_key; typedef UINT32 stat_index_t; static string disassemble(UINT64 start, UINT64 stop); /* ===================================================================== */ /* Commandline Switches */ /* ===================================================================== */ KNOB_COMMENT mix_knob_family("pintool:mix", "Mix knobs"); KNOB KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool:mix", "o", "mix.out", "specify profile file name"); KNOB KnobTopBlocks(KNOB_MODE_WRITEONCE, "pintool:mix", "top_blocks", "20", "specify a maximal number of top blocks for which icounts are printed"); KNOB KnobShowDisassembly(KNOB_MODE_WRITEONCE, "pintool:mix", "disas", "0", "Show disassembly for top blocks"); KNOB KnobPid(KNOB_MODE_WRITEONCE, "pintool:mix", "i", "0", "append pid to output file name"); KNOB KnobProfilePredicated(KNOB_MODE_WRITEONCE, "pintool:mix", "p", "0", "enable accurate profiling for predicated instructions"); KNOB KnobProfileStaticOnly(KNOB_MODE_WRITEONCE, "pintool:mix", "s", "0", "terminate after collection of static profile for main image"); #ifndef TARGET_WINDOWS KNOB KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool:mix", "d", "0", "Only collect dynamic profile"); #else KNOB KnobProfileDynamicOnly(KNOB_MODE_WRITEONCE, "pintool:mix", "d", "1", "Only collect dynamic profile"); #endif KNOB KnobNoSharedLibs(KNOB_MODE_WRITEONCE, "pintool:mix", "no_shared_libs", "0", "do not instrument shared libraries"); KNOB KnobInstructionLengthMix(KNOB_MODE_WRITEONCE, "pintool:mix","ilen", "0", "Compute instruction length mix"); KNOB KnobCategoryMix(KNOB_MODE_WRITEONCE, "pintool:mix", "category", "0", "Compute ISA category mix"); KNOB KnobIformMix(KNOB_MODE_WRITEONCE, "pintool:mix", "iform", "0", "Compute ISA iform mix"); KNOB KnobMapToFile(KNOB_MODE_WRITEONCE, "pintool:mix", "mapaddr", "0", "Map Addresses to File/Line information"); KNOB KnobEarlyOut(KNOB_MODE_WRITEONCE, "pintool:mix", "early_out", "0" , "Exit after tracing the first region."); typedef enum { measure_opcode=0, measure_category=1, measure_ilen=2, measure_iform=3 } measurement_t; measurement_t measurement = measure_opcode; /* ===================================================================== */ INT32 Usage() { cerr << "This pin tool computes a static and dynamic opcode, " << "instruction form, instruction length, extension or category mix profile\n\n"; cerr << KNOB_BASE::StringKnobSummary(); cerr << endl; cerr << "The default is to do opcode and ISA extension profileing" << endl; cerr << "At most one of -iform, -ilen or -category is allowed" << endl; cerr << endl; return -1; } /* ===================================================================== */ /* INDEX HELPERS */ /* ===================================================================== */ const UINT32 INDEX_SPECIAL = 3000; const UINT32 MAX_MEM_SIZE = 520; const UINT32 MAX_EXTENSION = XED_EXTENSION_LAST+10; const UINT32 INDEX_TOTAL = INDEX_SPECIAL + 0; const UINT32 INDEX_MEM_ATOMIC = INDEX_SPECIAL + 1; const UINT32 INDEX_STACK_READ = INDEX_SPECIAL + 2; const UINT32 INDEX_STACK_WRITE = INDEX_SPECIAL + 3; const UINT32 INDEX_IPREL_READ = INDEX_SPECIAL + 4; const UINT32 INDEX_IPREL_WRITE = INDEX_SPECIAL + 5; const UINT32 INDEX_MEM_READ_SIZE = INDEX_SPECIAL + 6; const UINT32 INDEX_MEM_WRITE_SIZE = INDEX_SPECIAL + 6 + MAX_MEM_SIZE; const UINT32 INDEX_EXTENSION = INDEX_SPECIAL + 6 + 2*MAX_MEM_SIZE; const UINT32 INDEX_SCALAR_SIMD = INDEX_EXTENSION + MAX_EXTENSION; const UINT32 INDEX_FMA_BASE = INDEX_SCALAR_SIMD + 1; const UINT32 INDEX_FMA = INDEX_FMA_BASE + 1; const UINT32 INDEX_FMA_ADD = INDEX_FMA_BASE + 2; const UINT32 INDEX_FMA_MUL = INDEX_FMA_BASE + 3; const UINT32 INDEX_FMA_S = INDEX_FMA_BASE + 4; const UINT32 INDEX_FMA_S_ADD = INDEX_FMA_BASE + 5; // NOTE: skipped 6. does not matter const UINT32 INDEX_FMA_S_MUL = INDEX_FMA_BASE + 7; const UINT32 INDEX_FMA_D = INDEX_FMA_BASE + 8; const UINT32 INDEX_FMA_D_ADD = INDEX_FMA_BASE + 9; const UINT32 INDEX_FMA_D_MUL = INDEX_FMA_BASE + 10; const UINT32 INDEX_FPMA = INDEX_FMA_BASE + 11; const UINT32 INDEX_FPMA_ADD = INDEX_FMA_BASE + 12; const UINT32 INDEX_FPMA_MUL = INDEX_FMA_BASE + 13; const UINT32 INDEX_FMS = INDEX_FMA_BASE + 14; const UINT32 INDEX_FMS_SUB = INDEX_FMA_BASE + 15; const UINT32 INDEX_FMS_MUL = INDEX_FMA_BASE + 16; const UINT32 INDEX_FMS_S = INDEX_FMA_BASE + 17; const UINT32 INDEX_FMS_S_SUB = INDEX_FMA_BASE + 18; const UINT32 INDEX_FMS_S_MUL = INDEX_FMA_BASE + 19; const UINT32 INDEX_FMS_D = INDEX_FMA_BASE + 20; const UINT32 INDEX_FMS_D_SUB = INDEX_FMA_BASE + 21; const UINT32 INDEX_FMS_D_MUL = INDEX_FMA_BASE + 22; const UINT32 INDEX_FPMS = INDEX_FMA_BASE + 23; const UINT32 INDEX_FPMS_SUB = INDEX_FMA_BASE + 24; const UINT32 INDEX_FPMS_MUL = INDEX_FMA_BASE + 25; const UINT32 INDEX_FNMA = INDEX_FMA_BASE + 26; const UINT32 INDEX_FNMA_ADD = INDEX_FMA_BASE + 27; const UINT32 INDEX_FNMA_MUL = INDEX_FMA_BASE + 28; const UINT32 INDEX_FNMA_S = INDEX_FMA_BASE + 29; const UINT32 INDEX_FNMA_S_ADD = INDEX_FMA_BASE + 30; const UINT32 INDEX_FNMA_S_MUL = INDEX_FMA_BASE + 31; const UINT32 INDEX_FNMA_D = INDEX_FMA_BASE + 32; const UINT32 INDEX_FNMA_D_ADD = INDEX_FMA_BASE + 33; const UINT32 INDEX_FNMA_D_MUL = INDEX_FMA_BASE + 34; const UINT32 INDEX_FPNMA = INDEX_FMA_BASE + 35; const UINT32 INDEX_FPNMA_ADD = INDEX_FMA_BASE + 36; const UINT32 INDEX_FPNMA_MUL = INDEX_FMA_BASE + 37; const UINT32 INDEX_SPECIAL_END = INDEX_FMA_BASE + 38; BOOL IsMemReadIndex(UINT32 i) { return (INDEX_MEM_READ_SIZE <= i && i < INDEX_MEM_READ_SIZE + MAX_MEM_SIZE ); } BOOL IsMemWriteIndex(UINT32 i) { return (INDEX_MEM_WRITE_SIZE <= i && i < INDEX_MEM_WRITE_SIZE + MAX_MEM_SIZE ); } /* ===================================================================== */ LOCALFUN UINT32 INS_GetIndex(INS ins) { UINT32 index = 0; switch(measurement) { case measure_opcode: index = INS_Opcode(ins); break; case measure_ilen: index = INS_Size(ins); break; case measure_category: index = INS_Category(ins); break; case measure_iform: { xed_decoded_inst_t* xedd = INS_XedDec(ins); xed_iform_enum_t iform = xed_decoded_inst_get_iform_enum(xedd); index = static_cast(iform); } break; } return index; } /* ===================================================================== */ LOCALFUN bool IsScalarSimd(INS ins) { xed_decoded_inst_t* xedd = INS_XedDec(ins); return xed_decoded_inst_get_attribute(xedd, XED_ATTRIBUTE_SIMD_SCALAR); } LOCALFUN UINT32 IndexStringLength(BBL bbl, BOOL memory_access_profile) { UINT32 count = 0; for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) { count++; // one for the ins if (measurement != measure_iform) count++; // one for the ISA extension. if( measurement == measure_opcode && memory_access_profile ) { if( INS_IsMemoryRead(ins) ) count++; // for size if( INS_IsStackRead(ins) ) count++; if( INS_IsIpRelRead(ins) ) count++; if( INS_IsMemoryWrite(ins) ) count++; // for size if( INS_IsStackWrite(ins) ) count++; if( INS_IsIpRelWrite(ins) ) count++; if( INS_IsAtomicUpdate(ins) ) count++; if (IsScalarSimd(ins)) count++; } } return count; } /* ===================================================================== */ LOCALFUN UINT32 MemsizeToIndex(UINT32 size, BOOL write) { return (write ? INDEX_MEM_WRITE_SIZE : INDEX_MEM_READ_SIZE ) + size; } /* ===================================================================== */ LOCALFUN stat_index_t* INS_GenerateIndexString(INS ins, stat_index_t *stats, BOOL memory_access_profile) { *stats++ = INS_GetIndex(ins); if (measurement != measure_iform) *stats++ = INS_Extension(ins) + INDEX_EXTENSION; if( measurement == measure_opcode && memory_access_profile ) { if( INS_IsMemoryRead(ins) ) *stats++ = MemsizeToIndex( INS_MemoryReadSize(ins), 0 ); if( INS_IsMemoryWrite(ins) ) *stats++ = MemsizeToIndex( INS_MemoryWriteSize(ins), 1 ); if( INS_IsAtomicUpdate(ins) ) *stats++ = INDEX_MEM_ATOMIC; if( INS_IsStackRead(ins) ) *stats++ = INDEX_STACK_READ; if( INS_IsStackWrite(ins) ) *stats++ = INDEX_STACK_WRITE; if( INS_IsIpRelRead(ins) ) *stats++ = INDEX_IPREL_READ; if( INS_IsIpRelWrite(ins) ) *stats++ = INDEX_IPREL_WRITE; if (IsScalarSimd(ins)) *stats++ = INDEX_SCALAR_SIMD; } return stats; } /* ===================================================================== */ LOCALFUN string IndexToString( UINT32 index ) { if (measurement == measure_iform) { return xed_iform_enum_t2str(static_cast(index)); } if( INDEX_SPECIAL <= index && index < INDEX_SPECIAL_END) { if( index == INDEX_TOTAL ) return "*total"; else if( IsMemReadIndex(index) ) return "*mem-read-" + decstr( index - INDEX_MEM_READ_SIZE ); else if( IsMemWriteIndex(index)) return "*mem-write-" + decstr( index - INDEX_MEM_WRITE_SIZE ); else if( index == INDEX_MEM_ATOMIC ) return "*mem-atomic"; else if( index == INDEX_STACK_READ ) return "*stack-read"; else if( index == INDEX_STACK_WRITE ) return "*stack-write"; else if( index == INDEX_IPREL_READ ) return "*iprel-read"; else if( index == INDEX_IPREL_WRITE ) return "*iprel-write"; else if( index == INDEX_SCALAR_SIMD) return "*scalar-simd"; else if (index >= INDEX_EXTENSION && index < INDEX_EXTENSION + MAX_EXTENSION) return "*isa-ext-" + EXTENSION_StringShort(index - INDEX_EXTENSION); else if ( index == INDEX_FMA ) return "*FMA"; else if ( index == INDEX_FMA_ADD ) return "*FMA_ADD"; else if ( index == INDEX_FMA_MUL ) return "*FMA_MUL"; else if ( index == INDEX_FMA_S ) return "*FMA_S"; else if ( index == INDEX_FMA_S_ADD ) return "*FMA_S_ADD"; else if ( index == INDEX_FMA_S_MUL ) return "*FMA_S_MUL"; else if ( index == INDEX_FMA_D ) return "*FMA_D"; else if ( index == INDEX_FMA_D_ADD ) return "*FMA_D_ADD"; else if ( index == INDEX_FMA_D_MUL ) return "*FMA_D_MUL"; else if ( index == INDEX_FPMA ) return "*FPMA"; else if ( index == INDEX_FPMA_ADD ) return "*FPMA_ADD"; else if ( index == INDEX_FPMA_MUL ) return "*FPMA_MUL"; else if ( index == INDEX_FMS ) return "*FMS"; else if ( index == INDEX_FMS_SUB ) return "*FMS_SUB"; else if ( index == INDEX_FMS_MUL ) return "*FMS_MUL"; else if ( index == INDEX_FMS_S ) return "*FMS_S"; else if ( index == INDEX_FMS_S_SUB ) return "*FMS_S_SUB"; else if ( index == INDEX_FMS_S_MUL ) return "*FMS_S_MUL"; else if ( index == INDEX_FMS_D ) return "*FMS_D"; else if ( index == INDEX_FMS_D_SUB ) return "*FMS_D_SUB"; else if ( index == INDEX_FMS_D_MUL ) return "*FMS_D_MUL"; else if ( index == INDEX_FPMS ) return "*FPMS"; else if ( index == INDEX_FPMS_SUB ) return "*FPMS_SUB"; else if ( index == INDEX_FPMS_MUL ) return "*FPMS_MUL"; else if ( index == INDEX_FNMA ) return "*FNMA"; else if ( index == INDEX_FNMA_ADD ) return "*FNMA_ADD"; else if ( index == INDEX_FNMA_MUL ) return "*FNMA_MUL"; else if ( index == INDEX_FNMA_S ) return "*FNMA_S"; else if ( index == INDEX_FNMA_S_ADD ) return "*FNMA_S_ADD"; else if ( index == INDEX_FNMA_S_MUL ) return "*FNMA_S_MUL"; else if ( index == INDEX_FNMA_D ) return "*FNMA_D"; else if ( index == INDEX_FNMA_D_ADD ) return "*FNMA_D_ADD"; else if ( index == INDEX_FNMA_D_MUL ) return "*FNMA_D_MUL"; else if ( index == INDEX_FPNMA ) return "*FPNMA"; else if ( index == INDEX_FPNMA_ADD ) return "*FPNMA_ADD"; else if ( index == INDEX_FPNMA_MUL ) return "*FPNMA_MUL"; else { ASSERTX(0); return ""; } } else if (measurement == measure_ilen) { ostringstream s; s << "ILEN-" << index; return s.str(); } else if (measurement == measure_opcode) { return OPCODE_StringShort(index); } else if (measurement == measure_category) { return CATEGORY_StringShort(index); } ASSERTX(0); return ""; } /* ===================================================================== */ /* ===================================================================== */ typedef UINT64 COUNTER; /* zero initialized */ typedef map stat_map_t; class CSTATS { public: CSTATS() { clear(); } stat_map_t unpredicated; stat_map_t predicated; stat_map_t predicated_true; VOID clear() { unpredicated.erase(unpredicated.begin(),unpredicated.end()); predicated.erase(predicated.begin(),predicated.end()); predicated_true.erase(predicated_true.begin(),predicated_true.end()); } }; class BBL_SORT_STATS { public: ADDRINT _pc; UINT32 _rtn_num; COUNTER _icount; COUNTER _executions; COUNTER _nbytes; }; CSTATS GlobalStatsStatic; // summary stats for static analysis class BBLSTATS { // Our first pass sets up the types of stats we need to update for this // block. We have one stat per instruction in the block. The _stats // array is null terminated. public: const stat_index_t* const _stats; const ADDRINT _pc; // start PC of the block const UINT32 _ninst; // # of instructions const UINT32 _nbytes; // # of bytes in the block const UINT32 _rtn_num; // index of the routine address/function name information BBLSTATS(stat_index_t* stats, ADDRINT pc, UINT32 ninst, UINT32 nbytes, UINT32 rtn_number) : _stats(stats), _pc(pc), _ninst(ninst), _nbytes(nbytes), _rtn_num(rtn_number) {}; }; LOCALVAR vector statsList; /* ===================================================================== */ #if defined(__GNUC__) # if defined(TARGET_MAC) || defined(TARGET_WINDOWS) // macOS* XCODE2.4.1 gcc and Cgywin gcc 3.4.x only allow for 16b // alignment! So we need to pad! # define ALIGN_LOCK __attribute__ ((aligned(16))) # else # define ALIGN_LOCK __attribute__ ((aligned(64))) # endif #else # define ALIGN_LOCK __declspec(align(64)) #endif typedef struct { char pad0[64]; PIN_LOCK ALIGN_LOCK lock; /* for mediating output */ char pad1[64]; PIN_LOCK ALIGN_LOCK bbl_list_lock; /* for the bbl list */ char pad2[64]; PIN_LOCK ALIGN_LOCK rtn_table_lock; /* for the rtn table */ char pad3[64]; } ALIGN_LOCK locks_t; locks_t locks; static std::ofstream* out; class thread_data_t { public: thread_data_t() : enabled(0) { } CSTATS cstats; vector stats_per_function; // indexed by rtn_num UINT32 enabled; vector block_counts; UINT32 size() { UINT32 limit; limit = block_counts.size(); return limit; } void resize(UINT32 n) { if (size() < n) block_counts.resize(2*n); } }; thread_data_t* get_tls(THREADID tid) { thread_data_t* tdata = static_cast(PIN_GetThreadData(tls_key, tid)); return tdata; } VOID activate_counting(THREADID tid) { thread_data_t* tdata = get_tls(tid); tdata->enabled = 1; } VOID deactivate_counting(THREADID tid) { thread_data_t* tdata = get_tls(tid); tdata->enabled = 0; } UINT32 numThreads = 0; VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v) { // This function is locked no need for a Pin Lock here numThreads++; PIN_GetLock(&locks.lock, tid+2); // for output *out << "# Starting tid " << tid << endl; PIN_ReleaseLock(&locks.lock); thread_data_t* tdata = new thread_data_t; // remember my pointer for later PIN_SetThreadData(tls_key, tdata, tid); // make sure the thread is counting stuff. // FIXME: The controller should start all threads if no trigger // conditions are specified, but currently it only starts // TID0. Starting here is wrong if the controller has a nontrivial // starting condition, but this is what most people want. They can // always stop the controller and zero the stats using markers as a // workaround. if (tid) activate_counting(tid); } VOID emit_stats(THREADID tid); //forward prototype VOID emit_pc_stats(THREADID tid); //forward prototype VOID zero_stats(THREADID tid); //forward prototype VOID Fini(int, VOID * v); VOID emit_bbl_stats_sorted(THREADID tid); LOCALVAR CONTROL_MANAGER control; LOCALFUN VOID Handler(EVENT_TYPE ev, VOID *val, CONTEXT *ctxt, VOID *ip, THREADID tid, bool bcast) { switch(ev) { case EVENT_START: PIN_GetLock(&locks.lock, tid+2); // for output *out << "# Start counting for tid " << tid << endl; PIN_ReleaseLock(&locks.lock); activate_counting(tid); break; case EVENT_STOP: PIN_GetLock(&locks.lock, tid+2); // for output *out << "# Stop counting for tid " << tid << endl; PIN_ReleaseLock(&locks.lock); deactivate_counting(tid); if (KnobEarlyOut) { *out << "Exiting due to -early-out" << endl; Fini(0, 0); exit(0); } break; case EVENT_STATS_EMIT: PIN_GetLock(&locks.lock, tid+2); // for output *out << "# Emit stats for tid " << static_cast(tid) << endl; PIN_ReleaseLock(&locks.lock); emit_stats(tid); break; case EVENT_STATS_RESET: PIN_GetLock(&locks.lock, tid+2); // for output *out << "# Reset stats for tid " << static_cast(tid) << endl; PIN_ReleaseLock(&locks.lock); zero_stats(tid); break; default: ASSERTX(false); } } /* ===================================================================== */ VOID validate_bbl_count(THREADID tid, ADDRINT block_count_for_trace) { thread_data_t* tdata = get_tls(tid); tdata->resize(block_count_for_trace+1); } VOID PIN_FAST_ANALYSIS_CALL docount_bbl(ADDRINT block_id, THREADID tid) { thread_data_t* tdata = get_tls(tid); //ASSERTX(tdata->size() > block_id); tdata->block_counts[block_id] += tdata->enabled; } VOID docount_predicated_true(UINT32 index, THREADID tid) { thread_data_t* tdata = get_tls(tid); if (tdata->enabled) { stat_map_t::iterator i = tdata->cstats.predicated_true.find(index); if (i == tdata->cstats.predicated_true.end()) tdata->cstats.predicated_true[index] = 1; else i->second += 1; } } /* ===================================================================== */ VOID zero_stats(THREADID tid) { thread_data_t* tdata = get_tls(tid); tdata->cstats.clear(); UINT32 limit = tdata->size(); for(UINT32 i=0;i< limit;i++) tdata->block_counts[i]=0; tdata->stats_per_function.erase( tdata->stats_per_function.begin(), tdata->stats_per_function.end()); } /* ===================================================================== */ VOID CheckForSpecialMarkers(INS ins, ADDRINT pc, unsigned int instruction_size) { // This checks for single instances of special 3B NOPs. // 0F1FF3 - start // 0F1FF4 - stop // 0F1FF5 - emit stats // 0F1FF6 - zero stats // FIXME: if there are collisions with existing instructions, we can // change them here. //FIXME: Ideally this would be integrated in to the control.H so file //so that anything can use it. if (instruction_size != 3) return; UINT8* pc_ptr = reinterpret_cast(pc); if (pc_ptr[0] == 0x0F && pc_ptr[1] == 0x1F) { switch(pc_ptr[2]) { case 0xF3: // start INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)activate_counting, IARG_THREAD_ID, IARG_END); break; case 0xF4: // stop INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)deactivate_counting, IARG_THREAD_ID, IARG_END); break; case 0xF5: // emit INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)emit_stats, IARG_THREAD_ID, IARG_END); break; case 0xF6: // zero INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)zero_stats, IARG_THREAD_ID, IARG_END); break; default: break; } } } /* ===================================================================== */ class RTN_TABLE_ENTRY { public: ADDRINT _address; ADDRINT _end_address; const char* _name; UINT32 _rtn_num; /* facilitates sorting later */ UINT64 _total; RTN_TABLE_ENTRY(ADDRINT address=0, ADDRINT end_address=0, const char* name=0, UINT32 rtn_num=0) : _address(address), _end_address(end_address), _name(name), _rtn_num(rtn_num), _total(0) {} }; static vector rtn_table; static int qsort_rtn_compare_fn(const void *a, const void *b) { /*descending sort*/ const RTN_TABLE_ENTRY* ba = static_cast(a); const RTN_TABLE_ENTRY* bb = static_cast(b); if (ba->_total < bb->_total) return 1; if (ba->_total > bb->_total) return -1; return 0; } UINT32 routine_identifier(RTN rtn, THREADID tid) { if (!RTN_Valid(rtn)) { return 0; /* we never return nonzero global routine numbers */ } ADDRINT rtn_address = RTN_Address(rtn); ADDRINT rtn_end_address; const char *rtn_name = RTN_Name(rtn).c_str(); /* RTN_Id returns a unique global identifier. However, the numeration is not necessarily consecutive so we keep our own numeration for convenience (and legacy) - see comment below. */ UINT32 rtn_num = RTN_Id(rtn); UINT32 img_num = IMG_Id(SEC_Img(RTN_Sec(rtn))); /* The pair is used to index a map of global routine numbers. The function is stored at the global routine number in the rtn_table. The basic blocks are tagged with the same number so they can be reassociated later for printing. */ static UINT32 next_rtn_num = 1; UINT32 global_rtn_num; typedef pair rtn_identifier_t; static map rtn_mapper; rtn_identifier_t ri(img_num,rtn_num); PIN_GetLock(&locks.rtn_table_lock,tid+2); map::iterator it = rtn_mapper.find(ri); if (it == rtn_mapper.end()) { rtn_mapper[ri] = next_rtn_num; global_rtn_num = next_rtn_num; next_rtn_num++; /* store the RTN info using the global routine number */ if (global_rtn_num >= rtn_table.size()) rtn_table.resize(global_rtn_num*2+1); rtn_end_address = RTN_Address(rtn) + RTN_Size(rtn); rtn_table[global_rtn_num ] = RTN_TABLE_ENTRY (rtn_address, rtn_end_address, strdup(rtn_name), global_rtn_num); } else { global_rtn_num = it->second; } PIN_ReleaseLock(&locks.rtn_table_lock); return global_rtn_num; } VOID Trace(TRACE trace, VOID *v) { static UINT32 basic_blocks = 0; if ( KnobNoSharedLibs.Value() && IMG_Type(SEC_Img(RTN_Sec(TRACE_Rtn(trace)))) == IMG_TYPE_SHAREDLIB) return; const BOOL accurate_handling_of_predicates = KnobProfilePredicated.Value(); ADDRINT pc = TRACE_Address(trace); RTN rtn = TRACE_Rtn(trace); UINT32 rtn_id = routine_identifier(rtn,1); /* fake tid for the instrumenting code */ UINT32 new_blocks = 0; for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { const INS head = BBL_InsHead(bbl); if (! INS_Valid(head)) continue; new_blocks++; } TRACE_InsertCall(trace, IPOINT_BEFORE, AFUNPTR(validate_bbl_count), IARG_THREAD_ID, IARG_UINT32, basic_blocks+new_blocks, IARG_END); for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { const INS head = BBL_InsHead(bbl); if (! INS_Valid(head)) continue; // Summarize the stats for the bbl in a 0 terminated list // This is done at instrumentation time const UINT32 n = IndexStringLength(bbl, 1); ADDRINT block_start_pc = pc; // stats is an array of index types. We later multiply it by the // dynamic count for a block. stat_index_t *const stats = new stat_index_t[ n + 1]; stat_index_t *const stats_end = stats + (n + 1); stat_index_t *curr = stats; UINT32 ninsts = 0; for (INS ins = head; INS_Valid(ins); ins = INS_Next(ins)) { unsigned int instruction_size = INS_Size(ins); // This checks for x86-specific opcodes CheckForSpecialMarkers(ins, pc, instruction_size); // Count the number of times a predicated instruction is actually executed // this is expensive and hence disabled by default if( INS_IsPredicated(ins) && accurate_handling_of_predicates ) { INS_InsertPredicatedCall(ins, IPOINT_BEFORE, AFUNPTR(docount_predicated_true), IARG_UINT32, INS_GetIndex(ins), IARG_THREAD_ID, IARG_END); } if (KnobMapToFile) { INT32 line; string filename; PIN_GetSourceLocation(pc, NULL, &line, &filename); if (!filename.empty()) *out << "MAPADDR 0x" << hex << pc << " " << dec << line << " " << filename << endl; } curr = INS_GenerateIndexString(ins,curr,1); pc = pc + instruction_size; ninsts++; } // stats terminator *curr++ = 0; ASSERTX( curr == stats_end ); // Insert instrumentation to count the number of times the bbl is executed BBLSTATS * bblstats = new BBLSTATS(stats, block_start_pc, ninsts, pc-block_start_pc, rtn_id); INS_InsertCall(head, IPOINT_BEFORE, AFUNPTR(docount_bbl), IARG_FAST_ANALYSIS_CALL, IARG_UINT32, basic_blocks, IARG_THREAD_ID, IARG_END); // Remember the counter and stats so we can compute a summary at the end basic_blocks++; PIN_GetLock(&locks.bbl_list_lock,1); statsList.push_back(bblstats); PIN_ReleaseLock(&locks.bbl_list_lock); } } /* ===================================================================== */ VOID DumpStats(ofstream& out, CSTATS& stats, BOOL predicated_true, const string& title, THREADID tid) { out << "#\n# " << title << "\n#\n"; if (tid != INVALID_THREADID) out << "# TID " << tid << "\n"; out << "# "; const char *label = 0; if (measurement == measure_opcode) label = "opcode"; else if (measurement == measure_ilen) label = "inslen"; else if (measurement == measure_category) label = "category"; else if (measurement == measure_iform) label = "iform"; if (label) out << ljstr(label,24); out<< setw(16) << "count"; if( predicated_true ) out << " count-predicated-true"; out << "\n#\n"; // Compute the "total" bin. Stop at the INDEX_SPECIAL for all histograms // except the iform. Iforms do not use the special rows, so we count everything. // build a map of the valid stats index values for all 3 tables. map m; COUNTER tu=0; for(stat_map_t::iterator it = stats.unpredicated.begin() ; it != stats.unpredicated.end() ; it++) { if (measurement == measure_iform || it->first < INDEX_SPECIAL) tu += it->second; m[it->first]=true; } COUNTER tpt=0; for(stat_map_t::iterator it=stats.predicated_true.begin();it != stats.predicated_true.end() ; it++) { if (measurement == measure_iform || it->first < INDEX_SPECIAL) tpt += it->second; m[it->first]=true; } for(map::iterator it = m.begin(); it != m.end(); it++) { stat_map_t::iterator s; COUNTER up=0; UINT32 indx = it->first; s = stats.unpredicated.find(indx); if (s != stats.unpredicated.end()) up = s->second; if (up == 0) continue; out << ljstr(IndexToString(indx),25) << " " << setw(16) << up; if( predicated_true ) { COUNTER prt=0; s = stats.predicated_true.find(indx); if (s != stats.predicated_true.end()) { prt = s->second; out << " " << setw(16) << prt; } } out << endl; } // print the totals out << ljstr("*total",25) << " " << setw(16) << tu; if( predicated_true ) out << " " << setw(16) << tpt; out << endl; } /* ===================================================================== */ static UINT32 stat_dump_count = 0; VOID emit_bbl_stats_by_function(THREADID tid) { // dynamic counts for one thread and for each function of that thread. thread_data_t* tdata = get_tls(tid); // grab a copy of the routines list that we can sort and where we can // compute function totals. Need to lock because instrumentation might // reallocate the vector. PIN_GetLock(&locks.rtn_table_lock,tid+2); UINT32 functions = rtn_table.size(); RTN_TABLE_ENTRY* rtn_table_sorted = new RTN_TABLE_ENTRY[functions]; for(UINT32 i=0; istats_per_function.size() < functions) tdata->stats_per_function.resize(functions+1); UINT32 limit = tdata->size(); if ( limit > statsList.size() ) limit = statsList.size(); for(UINT32 i=0;i< limit ; i++) { COUNTER bcount = tdata->block_counts[i]; BBLSTATS* b = statsList[i]; /* the last test below is for when new bbl's get jitted while we * are emitting stats */ if (b && b->_stats && b->_rtn_num < functions) for (const stat_index_t* stats = b->_stats; *stats; stats++) { tdata->cstats.unpredicated[*stats] += bcount; //*out << "# stat for block " << b->_rtn_num << endl; tdata->stats_per_function[b->_rtn_num].unpredicated[*stats] += bcount; if (*stats < INDEX_SPECIAL) rtn_table_sorted[b->_rtn_num]._total += bcount; } } PIN_ReleaseLock(&locks.bbl_list_lock); // emit the "normal" dynamic stats *out << "# EMIT_DYNAMIC_STATS FOR TID " << tid << " EMIT #" << stat_dump_count << endl; DumpStats(*out, tdata->cstats, KnobProfilePredicated, "$dynamic-counts",tid); *out << "# END_DYNAMIC_STATS" << endl; // sort the routines by the total instr count qsort(rtn_table_sorted, functions, sizeof(RTN_TABLE_ENTRY), qsort_rtn_compare_fn); // total every thing up for all functions in this thread UINT64 total = 0; for(UINT32 i=0; istats_per_function[rtn_num], KnobProfilePredicated, title,tid); } } *out << "# END_PER_FUNCTION_STATS " << endl; delete [] rtn_table_sorted; } int qsort_compare_fn(const void *a, const void *b) { const BBL_SORT_STATS* ba = static_cast(a); const BBL_SORT_STATS* bb = static_cast(b); if (bb->_icount > ba->_icount) return 1; if (bb->_icount < ba->_icount) return -1; return 0; } VOID emit_bbl_stats_sorted(THREADID tid) { /* emit the top blocks for this tid */ thread_data_t* tdata = get_tls(tid); // dynamic Counts // Need to lock here because we might be resize (and thus reallocing) // the statsList when we do a push_back in the instrumentation. PIN_GetLock(&locks.bbl_list_lock,tid+2); UINT32 limit = tdata->size(); if ( limit > statsList.size() ) limit = statsList.size(); BBL_SORT_STATS* icounts = new BBL_SORT_STATS[limit]; COUNTER thread_total = 0; for(UINT32 i=0;i< limit ; i++) { BBLSTATS* b = statsList[i]; if (b) { COUNTER bcount = tdata->block_counts[i]; COUNTER x = b->_ninst; x = x * bcount; //*out << hex << "ALL PC: " << b->_pc << " COUNT: " << x << endl; icounts[i]._icount = x; icounts[i]._pc = b->_pc; icounts[i]._rtn_num = b->_rtn_num; icounts[i]._executions = bcount; icounts[i]._nbytes = b->_nbytes; thread_total += icounts[i]._icount; } } PIN_ReleaseLock(&locks.bbl_list_lock); qsort(icounts, limit, sizeof(BBL_SORT_STATS), qsort_compare_fn); PIN_GetLock(&locks.rtn_table_lock,tid+2); *out << "# EMIT_TOP_BLOCK_STATS FOR TID " << tid << " EMIT # " << stat_dump_count << endl; if (limit > KnobTopBlocks.Value()) limit = KnobTopBlocks.Value(); COUNTER t =0; for(UINT32 i=0;i idx) if (rtn_table[idx]._name) *out << " FN: " << rtn_table[idx]._name; *out << endl; if (KnobShowDisassembly) { string s = disassemble(icounts[i]._pc, icounts[i]._pc + icounts[i]._nbytes); *out << s << endl; } } *out << "# END_TOP_BLOCK_STATS" << endl; PIN_ReleaseLock(&locks.rtn_table_lock); delete [] icounts; } VOID emit_static_stats() { *out << "# EMIT_STATIC_STATS " << endl; DumpStats(*out, GlobalStatsStatic, false, "$static-counts",INVALID_THREADID); *out << endl << "# END_STATIC_STATS" << endl; } VOID emit_pc_stats(THREADID tid) { thread_data_t* tdata = get_tls(tid); // dynamic Counts // Need to lock here because we might be resize (and thus reallocing) // the statsList when we do a push_back in the instrumentation. *out << "# EMIT_PC_STATS FOR TID " << tid << endl; PIN_GetLock(&locks.bbl_list_lock,tid+2); UINT32 limit = tdata->size(); if ( limit > statsList.size() ) limit = statsList.size(); for(UINT32 i=0;i< limit ; i++) { COUNTER bcount = tdata->block_counts[i]; BBLSTATS* b = statsList[i]; if (bcount && b && b->_stats) *out << "BLOCKCOUNT 0x" << hex << b->_pc << " " << dec << (bcount * b->_ninst ) << endl; } PIN_ReleaseLock(&locks.bbl_list_lock); *out << "# END_EMIT_PC_STATS FOR TID " << tid << endl; } VOID emit_stats(THREADID tid) { /* we must save the X87 state because we do floating point computation * when doing output and on 32b platforms, that can use x87. Pin does * not save the X87 state, so analysis routines must do it * themselves. */ mix_fp_save_buffer_t save_buf; unsigned char* p = MIX_FP_ALIGN(&save_buf); /* make sure it is 16B aligned */ MIX_FP_SAVE(p); PIN_GetLock(&locks.lock, tid+2); // for output stat_dump_count++; *out << "# ==============================================" << endl; *out << "# STATS FOR TID " << tid << " EMIT# " << stat_dump_count << endl; *out << "# ==============================================" << endl; emit_bbl_stats_sorted(tid); // top blocks emit_bbl_stats_by_function(tid); // function level stats if (KnobMapToFile) emit_pc_stats(tid); PIN_ReleaseLock(&locks.lock); MIX_FP_RELOAD(p); } /* ===================================================================== */ void combine_dynamic_stats(unsigned int numThreads) { // combine all the rows from each thread in to the total variable. CSTATS total; for (THREADID i=0;icstats.unpredicated.begin(); it != tdata->cstats.unpredicated.end() ; it++) { stat_map_t::iterator x = total.unpredicated.find(it->first); if (x == total.unpredicated.end()) total.unpredicated[it->first] = it->second; else x->second += it->second; } for(stat_map_t::iterator it = tdata->cstats.predicated.begin(); it != tdata->cstats.predicated.end() ; it++) { stat_map_t::iterator x = total.predicated.find(it->first); if (x == total.predicated.end()) total.predicated[it->first] = it->second; else x->second += it->second; } for(stat_map_t::iterator it = tdata->cstats.predicated_true.begin(); it != tdata->cstats.predicated_true.end() ; it++) { stat_map_t::iterator x = total.predicated_true.find(it->first); if (x == total.predicated_true.end()) total.predicated_true[it->first] = it->second; else x->second += it->second; } } *out << "# EMIT_GLOBAL_DYNAMIC_STATS EMIT# " << stat_dump_count << endl; DumpStats(*out, total, false, "$global-dynamic-counts",INVALID_THREADID); *out << endl << "# END_GLOBAL_DYNAMIC_STATS" << endl; } VOID Fini(int, VOID * v) // only runs once for the application { *out << "# FINI: end of program" << endl; for(unsigned int i=0;iclose(); } /* ===================================================================== */ VOID Image(IMG img, VOID * v) { for (SEC sec = IMG_SecHead(img); SEC_Valid(sec); sec = SEC_Next(sec)) { for (RTN rtn = SEC_RtnHead(sec); RTN_Valid(rtn); rtn = RTN_Next(rtn)) { // Prepare for processing of RTN, an RTN is not broken up into BBLs, // it is merely a sequence of INSs RTN_Open(rtn); for (INS ins = RTN_InsHead(rtn); INS_Valid(ins); ins = INS_Next(ins)) { stat_index_t array[256]; stat_index_t* mid = INS_GenerateIndexString(ins,array,1); stat_index_t* end = mid; if( INS_IsPredicated(ins) ) { for( stat_index_t *start= array; start < end; start++) { GlobalStatsStatic.predicated[ *start ]++; } } else { for( stat_index_t *start= array; start < end; start++) { GlobalStatsStatic.unpredicated[ *start ]++; } } } // to preserve space, release data associated with RTN after we have processed it RTN_Close(rtn); } } if( KnobProfileStaticOnly.Value() ) { Fini(0,0); exit(0); } } ///////////////////////////////////////////////////////////////////////// // Add a disassembler ///////////////////////////////////////////////////////////////////////// static char nibble_to_ascii_hex(UINT8 i) { if (i<10) return i+'0'; if (i<16) return i-10+'A'; return '?'; } static void print_hex_line(char* buf, const UINT8* array, const int length) { int n = length; int i=0; if (length == 0) n = XED_MAX_INSTRUCTION_BYTES; for( i=0 ; i< n; i++) { buf[2*i+0] = nibble_to_ascii_hex(array[i]>>4); buf[2*i+1] = nibble_to_ascii_hex(array[i]&0xF); } buf[2*i]=0; } static string disassemble(UINT64 start, UINT64 stop) { UINT64 pc = start; xed_state_t dstate; xed_syntax_enum_t syntax = XED_SYNTAX_INTEL; xed_error_enum_t xed_error; xed_decoded_inst_t xedd; ostringstream os; if (sizeof(ADDRINT) == 4) xed_state_init(&dstate, XED_MACHINE_MODE_LEGACY_32, XED_ADDRESS_WIDTH_32b, XED_ADDRESS_WIDTH_32b); else xed_state_init(&dstate, XED_MACHINE_MODE_LONG_64, XED_ADDRESS_WIDTH_64b, XED_ADDRESS_WIDTH_64b); while( pc < stop ) { xed_decoded_inst_zero_set_mode(&xedd, &dstate); UINT32 len = 15; if (stop - pc < 15) len = stop-pc; xed_error = xed_decode(&xedd, reinterpret_cast(pc), len); bool okay = (xed_error == XED_ERROR_NONE); iostream::fmtflags fmt = os.flags(); os << std::setfill('0') << "XDIS " << std::hex << std::setw(sizeof(ADDRINT)*2) << pc << std::dec << ": " << std::setfill(' ') << std::setw(4); if (okay) { char buffer[200]; unsigned int dec_len, sp; os << xed_extension_enum_t2str(xed_decoded_inst_get_extension(&xedd)); dec_len = xed_decoded_inst_get_length(&xedd); print_hex_line(buffer, reinterpret_cast(pc), dec_len); os << " " << buffer; for ( sp=dec_len; sp < 12; sp++) // pad out the instruction bytes os << " "; os << " "; memset(buffer,0,200); int dis_okay = xed_format_context(syntax, &xedd, buffer, 200, pc, 0, 0); if (dis_okay) os << buffer << endl; else os << "Error disasassembling pc 0x" << std::hex << pc << std::dec << endl; pc += dec_len; } else { // print the byte and keep going. UINT8 memval = *reinterpret_cast(pc); os << "???? " // no extension << std::hex << std::setw(2) << std::setfill('0') << static_cast(memval) << std::endl; pc += 1; } os.flags(fmt); } return os.str(); } /* ===================================================================== */ int main(int argc, CHAR **argv) { PIN_InitSymbols(); if( PIN_Init(argc,argv) ) return Usage(); PIN_InitLock(&locks.lock); PIN_InitLock(&locks.bbl_list_lock); PIN_InitLock(&locks.rtn_table_lock); // obtain a key for TLS storage tls_key = PIN_CreateThreadDataKey(0); string filename = KnobOutputFile.Value(); if (KnobPid) { filename += "." + decstr(getpid()); } out = new std::ofstream(filename.c_str()); *out << "# Mix output version 2" << endl; control.RegisterHandler(Handler, 0, FALSE); control.Activate(); // make sure that exactly one thing-to-count knob is specified. if (KnobInstructionLengthMix.Value() && KnobCategoryMix.Value()) { cerr << "Must have at most one of: -iform, -ilen or -category " << "as a pintool option" << endl; exit(1); } if (KnobInstructionLengthMix.Value()) measurement = measure_ilen; if (KnobCategoryMix.Value()) measurement = measure_category; if (KnobIformMix.Value()) { measurement = measure_iform; } TRACE_AddInstrumentFunction(Trace, 0); PIN_AddThreadStartFunction(ThreadStart, 0); PIN_AddFiniFunction(Fini, 0); if( !KnobProfileDynamicOnly.Value() ) IMG_AddInstrumentFunction(Image, 0); PIN_StartProgram(); // Never returns return 0; } /* ===================================================================== */ /* eof */ /* ===================================================================== */