mirror of
https://github.com/libretro/hatari.git
synced 2024-11-27 02:00:31 +00:00
initial profiler support for getting full costs for subroutine calls
includes: - common code for collecting, processing and outputting the data about the full costs - full CPU side support - related structure changes in DSP code - some extra refactoring to improve code CPU/DSP profiling code - post-processor support for parsing the new data and storing it internally missing: - DSP side code (opcode classification) needed for getting the full costs - post-processor actually using the new data - testing the new data correctness
This commit is contained in:
parent
e5fd99c8c3
commit
57694ea262
@ -11,6 +11,7 @@
|
||||
const char Profile_fileid[] = "Hatari profile.c : " __DATE__ " " __TIME__;
|
||||
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "main.h"
|
||||
#include "version.h"
|
||||
#include "debugui.h"
|
||||
@ -44,17 +45,60 @@ static const struct {
|
||||
*/
|
||||
static int cmp_callers(const void *c1, const void *c2)
|
||||
{
|
||||
Uint32 count1 = ((const caller_t*)c1)->count;
|
||||
Uint32 count2 = ((const caller_t*)c2)->count;
|
||||
if (count1 > count2) {
|
||||
Uint32 calls1 = ((const caller_t*)c1)->calls;
|
||||
Uint32 calls2 = ((const caller_t*)c2)->calls;
|
||||
if (calls1 > calls2) {
|
||||
return -1;
|
||||
}
|
||||
if (count1 < count2) {
|
||||
if (calls1 < calls2) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* output caller counter information
|
||||
*/
|
||||
static bool output_counter_info(FILE *fp, counters_t *counter)
|
||||
{
|
||||
if (!counter->count) {
|
||||
return false;
|
||||
}
|
||||
fprintf(fp, " %lld/%lld/%lld", counter->calls, counter->count, counter->cycles);
|
||||
if (counter->misses) {
|
||||
/* these are only with specific WinUAE CPU core */
|
||||
fprintf(fp, "/%lld", counter->misses);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* output caller call counts, call type(s) and costs
|
||||
*/
|
||||
static void output_caller_info(FILE *fp, caller_t *info, Uint32 *typeaddr)
|
||||
{
|
||||
int k, typecount;
|
||||
|
||||
fprintf(fp, "0x%x = %d", info->addr, info->calls);
|
||||
if (info->flags) { /* calltypes supported? */
|
||||
fputc(' ', fp);
|
||||
typecount = 0;
|
||||
for (k = 0; k < ARRAYSIZE(flaginfo); k++) {
|
||||
if (info->flags & flaginfo[k].bit) {
|
||||
fputc(flaginfo[k].chr, fp);
|
||||
typecount++;
|
||||
}
|
||||
}
|
||||
if (typecount > 1) {
|
||||
*typeaddr = info->addr;
|
||||
}
|
||||
}
|
||||
if (output_counter_info(fp, &(info->all))) {
|
||||
output_counter_info(fp, &(info->own));
|
||||
}
|
||||
fputs(", ", fp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Show collected CPU/DSP callee/caller information.
|
||||
*
|
||||
@ -63,19 +107,19 @@ static int cmp_callers(const void *c1, const void *c2)
|
||||
*/
|
||||
void Profile_ShowCallers(FILE *fp, int sites, callee_t *callsite, const char * (*addr2name)(Uint32, Uint64 *))
|
||||
{
|
||||
int typecount, countissues, countdiff;
|
||||
int i, j, k;
|
||||
int i, j, countissues, countdiff;
|
||||
const char *name;
|
||||
caller_t *info;
|
||||
Uint64 total;
|
||||
Uint32 addr, typeaddr;
|
||||
|
||||
/* legend */
|
||||
fputs("# ", fp);
|
||||
for (k = 0; k < ARRAYSIZE(flaginfo); k++) {
|
||||
fprintf(fp, "%c = %s, ", flaginfo[k].chr, flaginfo[k].info);
|
||||
fputs("# <callee>: <caller1> = <calls> <types>[ <inclusive/totals>[ <exclusive/totals>]], <caller2> ..., <callee name>", fp);
|
||||
fputs("\n# types: ", fp);
|
||||
for (i = 0; i < ARRAYSIZE(flaginfo); i++) {
|
||||
fprintf(fp, "%c = %s, ", flaginfo[i].chr, flaginfo[i].info);
|
||||
}
|
||||
fputs("\n", fp);
|
||||
fputs("- = info missing\n# totals: calls/instructions/cycles/misses\n", fp);
|
||||
|
||||
countdiff = 0;
|
||||
countissues = 0;
|
||||
@ -94,25 +138,11 @@ void Profile_ShowCallers(FILE *fp, int sites, callee_t *callsite, const char * (
|
||||
if (!info->addr) {
|
||||
break;
|
||||
}
|
||||
fprintf(fp, "0x%x = %d", info->addr, info->count);
|
||||
total -= info->count;
|
||||
if (info->flags) { /* calltypes supported? */
|
||||
fputs(" ", fp);
|
||||
typecount = 0;
|
||||
for (k = 0; k < ARRAYSIZE(flaginfo); k++) {
|
||||
if (info->flags & flaginfo[k].bit) {
|
||||
fputc(flaginfo[k].chr, fp);
|
||||
typecount++;
|
||||
}
|
||||
}
|
||||
if (typecount > 1) {
|
||||
typeaddr = info->addr;
|
||||
}
|
||||
}
|
||||
fputs(", ", fp);
|
||||
total -= info->calls;
|
||||
output_caller_info(fp, info, &typeaddr);
|
||||
}
|
||||
if (name) {
|
||||
fprintf(fp, "(%s)", name);
|
||||
fprintf(fp, "%s", name);
|
||||
}
|
||||
fputs("\n", fp);
|
||||
if (total) {
|
||||
@ -138,14 +168,61 @@ void Profile_ShowCallers(FILE *fp, int sites, callee_t *callsite, const char * (
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Update CPU/DSP callee / caller information, if called address contains
|
||||
* symbol address (= function, or other interesting place in code)
|
||||
* add second counter values to first counters
|
||||
*/
|
||||
void Profile_UpdateCaller(callee_t *callsite, Uint32 pc, Uint32 caller, calltype_t flag)
|
||||
static void add_counter_costs(counters_t *dst, counters_t *src)
|
||||
{
|
||||
dst->calls += src->calls;
|
||||
dst->count += src->count;
|
||||
dst->cycles += src->cycles;
|
||||
dst->misses += src->misses;
|
||||
}
|
||||
|
||||
/**
|
||||
* set first counter values to their difference from a reference value
|
||||
*/
|
||||
static void set_counter_diff(counters_t *dst, counters_t *ref)
|
||||
{
|
||||
dst->calls = ref->calls - dst->calls;
|
||||
dst->count = ref->count - dst->count;
|
||||
dst->cycles = ref->cycles - dst->cycles;
|
||||
dst->misses = ref->misses - dst->misses;
|
||||
}
|
||||
|
||||
/**
|
||||
* add called (callee) function costs to caller information
|
||||
*/
|
||||
static void add_callee_cost(callee_t *callsite, callstack_t *stack)
|
||||
{
|
||||
caller_t *info = callsite->callers;
|
||||
counters_t owncost;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < callsite->count; i++, info++) {
|
||||
if (info->addr == stack->caller_addr) {
|
||||
/* own cost for callee is its child (out) costs
|
||||
* deducted from full (all) costs
|
||||
*/
|
||||
owncost = stack->out;
|
||||
set_counter_diff(&owncost, &(stack->all));
|
||||
add_counter_costs(&(info->own), &owncost);
|
||||
add_counter_costs(&(info->all), &(stack->all));
|
||||
return;
|
||||
}
|
||||
}
|
||||
/* cost is only added for updated callers,
|
||||
* so they should always exist
|
||||
*/
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
static void add_caller(callee_t *callsite, Uint32 pc, Uint32 prev_pc, calltype_t flag)
|
||||
{
|
||||
int i, count;
|
||||
caller_t *info;
|
||||
int i, count;
|
||||
|
||||
/* need to store real call addresses as symbols can change
|
||||
* after profiling has been stopped
|
||||
@ -166,16 +243,16 @@ void Profile_UpdateCaller(callee_t *callsite, Uint32 pc, Uint32 caller, calltype
|
||||
count = callsite->count;
|
||||
for (;;) {
|
||||
for (i = 0; i < count; i++, info++) {
|
||||
if (info->addr == caller) {
|
||||
if (info->addr == prev_pc) {
|
||||
info->flags |= flag;
|
||||
info->count++;
|
||||
info->calls++;
|
||||
return;
|
||||
}
|
||||
if (!info->addr) {
|
||||
/* empty slot */
|
||||
info->addr = caller;
|
||||
info->addr = prev_pc;
|
||||
info->flags |= flag;
|
||||
info->count = 1;
|
||||
info->calls = 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -192,29 +269,151 @@ void Profile_UpdateCaller(callee_t *callsite, Uint32 pc, Uint32 caller, calltype
|
||||
}
|
||||
}
|
||||
|
||||
int Profile_AllocCallerInfo(const char *info, int oldcount, int count, callee_t **callsite)
|
||||
static void subcall_start_check(int idx, callinfo_t *callinfo, Uint32 prev_pc, calltype_t flag, Uint32 pc, Uint32 ret_pc, counters_t *runcounts)
|
||||
{
|
||||
if (*callsite) {
|
||||
/* free old data */
|
||||
int i;
|
||||
callee_t *site = *callsite;
|
||||
for (i = 0; i < oldcount; i++, site++) {
|
||||
callstack_t *stack;
|
||||
int count;
|
||||
|
||||
if (flag != CALL_SUBROUTINE) {
|
||||
return;
|
||||
}
|
||||
runcounts->calls++;
|
||||
|
||||
if (unlikely(!callinfo->count)) {
|
||||
/* initial stack alloc, can be a bit larger */
|
||||
count = 8;
|
||||
stack = calloc(count, sizeof(*stack));
|
||||
if (!stack) {
|
||||
fputs("ERROR: callstack alloc failed!\n", stderr);
|
||||
return;
|
||||
}
|
||||
callinfo->stack = stack;
|
||||
callinfo->count = count;
|
||||
|
||||
} else if (unlikely(callinfo->depth+1 >= callinfo->count)) {
|
||||
/* need to alloc more stack space for new call? */
|
||||
count = callinfo->count * 2;
|
||||
stack = realloc(callinfo->stack, count * sizeof(*stack));
|
||||
if (!stack) {
|
||||
fputs("ERROR: callstack alloc failed!\n", stderr);
|
||||
return;
|
||||
}
|
||||
memset(stack + callinfo->count, 0, callinfo->count * sizeof(*stack));
|
||||
callinfo->stack = stack;
|
||||
callinfo->count = count;
|
||||
}
|
||||
|
||||
/* called function */
|
||||
stack = &(callinfo->stack[callinfo->depth++]);
|
||||
|
||||
/* store current running totals & zero subcall costs */
|
||||
stack->all = *runcounts;
|
||||
memset(&(stack->out), 0, sizeof(stack->out));
|
||||
|
||||
/* set subroutine call information */
|
||||
assert(ret_pc);
|
||||
stack->ret_addr = ret_pc;
|
||||
stack->callee_idx = idx;
|
||||
stack->caller_addr = prev_pc;
|
||||
stack->callee_addr = pc;
|
||||
}
|
||||
|
||||
static void subcall_end_check(callinfo_t *callinfo, Uint32 prev_pc, calltype_t flag, Uint32 pc, Uint32 ret_pc, counters_t *runcounts)
|
||||
{
|
||||
callstack_t *stack;
|
||||
|
||||
if (!callinfo->depth) {
|
||||
/* no calls yet */
|
||||
return;
|
||||
}
|
||||
stack = &(callinfo->stack[callinfo->depth-1]);
|
||||
|
||||
if (likely(pc != stack->ret_addr)) {
|
||||
/* not return address */
|
||||
return;
|
||||
}
|
||||
if (unlikely(flag == CALL_SUBROUTINE)) {
|
||||
/* return address, but not return
|
||||
* (EmuTOS perversity: JSR back from JSR)
|
||||
*/
|
||||
return;
|
||||
}
|
||||
/* remove call info from stack */
|
||||
callinfo->depth--;
|
||||
|
||||
if (flag != CALL_SUBRETURN) {
|
||||
/* wasn't a real subroutine call... */
|
||||
if (flag == CALL_EXCRETURN) {
|
||||
/* back from interrupt handler, ignore whole call */
|
||||
return;
|
||||
}
|
||||
/* ...but a mystery to debug */
|
||||
fprintf(stderr, "WARNING: subroutine call from 0x%x -> 0x%x returned 0x%x -> 0x%x!\n",
|
||||
stack->caller_addr, stack->callee_addr, prev_pc, pc);
|
||||
DebugUI(REASON_USER);
|
||||
return;
|
||||
}
|
||||
|
||||
/* full cost is orignal global cost (in ->all)
|
||||
* deducted from current global (runcost) cost
|
||||
*/
|
||||
set_counter_diff(&(stack->all), runcounts);
|
||||
add_callee_cost(callinfo->site + stack->callee_idx, stack);
|
||||
|
||||
/* add full cost of this to parent caller's outside costs */
|
||||
if (callinfo->depth) {
|
||||
callstack_t *pstack = stack - 1;
|
||||
add_counter_costs(&(pstack->out), &(stack->all));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update CPU/DSP callee / caller information, if called address contains
|
||||
* symbol address (= function, or other interesting place in code)
|
||||
*/
|
||||
void Profile_UpdateCallinfo(int idx, callinfo_t *callinfo, Uint32 prev_pc, calltype_t flag, Uint32 pc, Uint32 ret_pc, counters_t *runcounts)
|
||||
{
|
||||
if (unlikely(idx >= 0 && idx < callinfo->sites)) {
|
||||
subcall_start_check(idx, callinfo, prev_pc, flag, pc, ret_pc, runcounts);
|
||||
add_caller(callinfo->site + idx, pc, prev_pc, flag);
|
||||
|
||||
} else {
|
||||
subcall_end_check(callinfo, prev_pc, flag, pc, ret_pc, runcounts);
|
||||
}
|
||||
}
|
||||
|
||||
int Profile_AllocCallinfo(callinfo_t *callinfo, int count, const char *name)
|
||||
{
|
||||
callinfo->sites = count;
|
||||
if (count) {
|
||||
/* alloc & clear new data */
|
||||
callinfo->site = calloc(count, sizeof(callee_t));
|
||||
if (callinfo->site) {
|
||||
printf("Allocated %s profile callsite buffer for %d symbols.\n", name, count);
|
||||
} else {
|
||||
fprintf(stderr, "ERROR: callesite buffer alloc failed!\n");
|
||||
callinfo->sites = 0;
|
||||
}
|
||||
}
|
||||
return callinfo->sites;
|
||||
}
|
||||
|
||||
void Profile_FreeCallinfo(callinfo_t *callinfo)
|
||||
{
|
||||
int i;
|
||||
if (callinfo->sites) {
|
||||
callee_t *site = callinfo->site;
|
||||
for (i = 0; i < callinfo->sites; i++, site++) {
|
||||
if (site->callers) {
|
||||
free(site->callers);
|
||||
}
|
||||
}
|
||||
free(*callsite);
|
||||
*callsite = NULL;
|
||||
}
|
||||
if (count) {
|
||||
/* alloc & clear new data */
|
||||
*callsite = calloc(count, sizeof(callee_t));
|
||||
if (*callsite) {
|
||||
printf("Allocated %s profile callsite buffer for %d symbols.\n", info, count);
|
||||
return count;
|
||||
free(callinfo->site);
|
||||
if (callinfo->stack) {
|
||||
free(callinfo->stack);
|
||||
}
|
||||
memset(callinfo, 0, sizeof(*callinfo));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -21,11 +21,26 @@ typedef enum {
|
||||
CALL_INTERRUPT = 128
|
||||
} calltype_t;
|
||||
|
||||
typedef struct {
|
||||
Uint64 calls, count, cycles, misses;
|
||||
} counters_t;
|
||||
|
||||
typedef struct {
|
||||
int callee_idx; /* index of called function */
|
||||
Uint32 ret_addr; /* address after returning from call */
|
||||
Uint32 caller_addr; /* remove informational caller address */
|
||||
Uint32 callee_addr; /* remove informational callee address */
|
||||
counters_t all; /* totals including everything called code does */
|
||||
counters_t out; /* totals for subcalls done from callee */
|
||||
} callstack_t;
|
||||
|
||||
/* callee/caller information */
|
||||
typedef struct {
|
||||
calltype_t flags:8; /* what kind of call it was */
|
||||
unsigned int addr:24; /* address for the caller */
|
||||
Uint32 count; /* number of calls */
|
||||
Uint32 calls; /* number of calls, exclusive */
|
||||
counters_t all; /* totals including everything called code does */
|
||||
counters_t own; /* totals excluding called code (=sum(all-out)) */
|
||||
} caller_t;
|
||||
|
||||
typedef struct {
|
||||
@ -34,19 +49,29 @@ typedef struct {
|
||||
caller_t *callers; /* who called this address */
|
||||
} callee_t;
|
||||
|
||||
typedef struct {
|
||||
int sites; /* number of symbol callsites */
|
||||
int depth; /* how many callstack calls haven't yet returned */
|
||||
int count; /* number of items allocated for stack */
|
||||
callee_t *site; /* symbol specific caller information */
|
||||
callstack_t *stack; /* calls that will return */
|
||||
} callinfo_t;
|
||||
|
||||
|
||||
/* CPU/DSP memory area statistics */
|
||||
typedef struct {
|
||||
Uint64 all_cycles, all_count, all_misses;
|
||||
Uint64 max_cycles; /* for overflow check (cycles > count or misses) */
|
||||
counters_t counters; /* counters for this area */
|
||||
Uint32 lowest, highest; /* active address range within memory area */
|
||||
int active; /* number of active addresses */
|
||||
bool overflow; /* whether counters overflowed */
|
||||
} profile_area_t;
|
||||
|
||||
|
||||
/* generic profile caller functions */
|
||||
extern void Profile_ShowCallers(FILE *fp, int sites, callee_t *callsite, const char * (*addr2name)(Uint32, Uint64 *));
|
||||
extern int Profile_AllocCallerInfo(const char *info, int oldcount, int count, callee_t **callsite);
|
||||
extern void Profile_UpdateCaller(callee_t *callsite, Uint32 pc, Uint32 caller, calltype_t flag);
|
||||
extern void Profile_UpdateCallinfo(int idx, callinfo_t *callinfo, Uint32 prev_pc, calltype_t flag, Uint32 pc, Uint32 ret_pc, counters_t *runcounts);
|
||||
extern int Profile_AllocCallinfo(callinfo_t *callinfo, int count, const char *info);
|
||||
extern void Profile_FreeCallinfo(callinfo_t *callinfo);
|
||||
|
||||
/* parser helpers */
|
||||
extern void Profile_CpuGetPointers(bool **enabled, Uint32 **disasm_addr);
|
||||
|
@ -27,6 +27,8 @@ const char Profilecpu_fileid[] = "Hatari profilecpu.c : " __DATE__ " " __TIME__;
|
||||
/* if non-zero, output (more) warnings on syspicious cycle/instruction counts */
|
||||
#define DEBUG 0
|
||||
|
||||
static callinfo_t cpu_callinfo;
|
||||
|
||||
/* This is relevant with WinUAE CPU core:
|
||||
* - the default cycle exact variant needs this define to be non-zero
|
||||
* - non-cycle exact and MMU variants need this define to be 0
|
||||
@ -37,7 +39,7 @@ const char Profilecpu_fileid[] = "Hatari profilecpu.c : " __DATE__ " " __TIME__;
|
||||
#define MAX_CPU_PROFILE_VALUE 0xFFFFFFFF
|
||||
|
||||
typedef struct {
|
||||
Uint32 count; /* how many times this address is used */
|
||||
Uint32 count; /* how many times this address instrcution is executed */
|
||||
Uint32 cycles; /* how many CPU cycles was taken at this address */
|
||||
Uint32 misses; /* how many CPU cache misses happened at this address */
|
||||
} cpu_profile_item_t;
|
||||
@ -45,16 +47,14 @@ typedef struct {
|
||||
#define MAX_MISS 4
|
||||
|
||||
static struct {
|
||||
Uint64 all_cycles, all_count, all_misses;
|
||||
Uint32 miss_counts[MAX_MISS];
|
||||
counters_t all; /* total counts for all areas */
|
||||
Uint32 miss_counts[MAX_MISS]; /* cache miss counts */
|
||||
cpu_profile_item_t *data; /* profile data items */
|
||||
Uint32 size; /* number of allocated profile data items */
|
||||
profile_area_t ram; /* normal RAM stats */
|
||||
profile_area_t rom; /* cartridge ROM stats */
|
||||
profile_area_t tos; /* ROM TOS stats */
|
||||
int active; /* number of active data items in all areas */
|
||||
int sites; /* number of symbol callsites */
|
||||
callee_t *callsite; /* symbol specific caller information */
|
||||
Uint32 *sort_arr; /* data indexes used for sorting */
|
||||
Uint32 prev_cycles; /* previous instruction cycles counter */
|
||||
Uint32 prev_pc; /* previous instruction address */
|
||||
@ -129,8 +129,8 @@ bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint3
|
||||
*misses = cpu_profile.data[idx].misses;
|
||||
*cycles = cpu_profile.data[idx].cycles;
|
||||
*count = cpu_profile.data[idx].count;
|
||||
if (cpu_profile.all_count) {
|
||||
*percentage = 100.0*(*count)/cpu_profile.all_count;
|
||||
if (cpu_profile.all.count) {
|
||||
*percentage = 100.0*(*count)/cpu_profile.all.count;
|
||||
} else {
|
||||
*percentage = 0.0;
|
||||
}
|
||||
@ -153,20 +153,20 @@ static void show_cpu_area_stats(profile_area_t *area)
|
||||
area->active,
|
||||
100.0 * area->active / cpu_profile.active);
|
||||
fprintf(stderr, "- executed instructions:\n %llu (%.2f%% of all)\n",
|
||||
area->all_count,
|
||||
100.0 * area->all_count / cpu_profile.all_count);
|
||||
area->counters.count,
|
||||
100.0 * area->counters.count / cpu_profile.all.count);
|
||||
#if ENABLE_WINUAE_CPU
|
||||
if (cpu_profile.all_misses) { /* CPU cache in use? */
|
||||
if (cpu_profile.all.misses) { /* CPU cache in use? */
|
||||
fprintf(stderr, "- instruction cache misses:\n %llu (%.2f%% of all)\n",
|
||||
area->all_misses,
|
||||
100.0 * area->all_misses / cpu_profile.all_misses);
|
||||
area->counters.misses,
|
||||
100.0 * area->counters.misses / cpu_profile.all.misses);
|
||||
}
|
||||
#endif
|
||||
fprintf(stderr, "- used cycles:\n %llu (%.2f%% of all)\n = %.5fs\n",
|
||||
area->all_cycles,
|
||||
100.0 * area->all_cycles / cpu_profile.all_cycles,
|
||||
(double)area->all_cycles / MachineClocks.CPU_Freq);
|
||||
if (area->max_cycles == MAX_CPU_PROFILE_VALUE) {
|
||||
area->counters.cycles,
|
||||
100.0 * area->counters.cycles / cpu_profile.all.cycles,
|
||||
(double)area->counters.cycles / MachineClocks.CPU_Freq);
|
||||
if (area->overflow) {
|
||||
fprintf(stderr, " *** COUNTER OVERFLOW! ***\n");
|
||||
}
|
||||
}
|
||||
@ -187,10 +187,10 @@ void Profile_CpuShowStats(void)
|
||||
show_cpu_area_stats(&cpu_profile.rom);
|
||||
|
||||
fprintf(stderr, "\n= %.5fs\n",
|
||||
(double)cpu_profile.all_cycles / MachineClocks.CPU_Freq);
|
||||
(double)cpu_profile.all.cycles / MachineClocks.CPU_Freq);
|
||||
|
||||
#if ENABLE_WINUAE_CPU
|
||||
if (cpu_profile.all_misses) { /* CPU cache in use? */
|
||||
if (cpu_profile.all.misses) { /* CPU cache in use? */
|
||||
int i;
|
||||
fprintf(stderr, "\nCache misses per instruction, number of occurrences:\n");
|
||||
for (i = 0; i < MAX_MISS; i++) {
|
||||
@ -309,7 +309,7 @@ void Profile_CpuShowMisses(int show)
|
||||
for (end = sort_arr + show; sort_arr < end; sort_arr++) {
|
||||
addr = index2address(*sort_arr);
|
||||
count = data[*sort_arr].misses;
|
||||
percentage = 100.0*count/cpu_profile.all_misses;
|
||||
percentage = 100.0*count/cpu_profile.all.misses;
|
||||
printf("0x%06x\t%.2f%%\t%d%s\n", addr, percentage, count,
|
||||
count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
}
|
||||
@ -357,7 +357,7 @@ void Profile_CpuShowCycles(int show)
|
||||
for (end = sort_arr + show; sort_arr < end; sort_arr++) {
|
||||
addr = index2address(*sort_arr);
|
||||
count = data[*sort_arr].cycles;
|
||||
percentage = 100.0*count/cpu_profile.all_cycles;
|
||||
percentage = 100.0*count/cpu_profile.all.cycles;
|
||||
printf("0x%06x\t%.2f%%\t%d%s\n", addr, percentage, count,
|
||||
count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
}
|
||||
@ -410,7 +410,7 @@ void Profile_CpuShowCounts(int show, bool only_symbols)
|
||||
for (end = sort_arr + show; sort_arr < end; sort_arr++) {
|
||||
addr = index2address(*sort_arr);
|
||||
count = data[*sort_arr].count;
|
||||
percentage = 100.0*count/cpu_profile.all_count;
|
||||
percentage = 100.0*count/cpu_profile.all.count;
|
||||
printf("0x%06x\t%.2f%%\t%d%s\n",
|
||||
addr, percentage, count,
|
||||
count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
@ -435,7 +435,7 @@ void Profile_CpuShowCounts(int show, bool only_symbols)
|
||||
continue;
|
||||
}
|
||||
count = data[*sort_arr].count;
|
||||
percentage = 100.0*count/cpu_profile.all_count;
|
||||
percentage = 100.0*count/cpu_profile.all.count;
|
||||
printf("0x%06x\t%.2f%%\t%d\t%s%s\n",
|
||||
addr, percentage, count, name,
|
||||
count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
@ -461,7 +461,7 @@ static const char * addr2name(Uint32 addr, Uint64 *total)
|
||||
*/
|
||||
void Profile_CpuShowCallers(FILE *fp)
|
||||
{
|
||||
Profile_ShowCallers(fp, cpu_profile.sites, cpu_profile.callsite, addr2name);
|
||||
Profile_ShowCallers(fp, cpu_callinfo.sites, cpu_callinfo.site, addr2name);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -494,6 +494,9 @@ void Profile_CpuSave(FILE *out)
|
||||
*/
|
||||
bool Profile_CpuStart(void)
|
||||
{
|
||||
int size;
|
||||
|
||||
Profile_FreeCallinfo(&(cpu_callinfo));
|
||||
if (cpu_profile.sort_arr) {
|
||||
/* remove previous results */
|
||||
free(cpu_profile.sort_arr);
|
||||
@ -505,30 +508,36 @@ bool Profile_CpuStart(void)
|
||||
if (!cpu_profile.enabled) {
|
||||
return false;
|
||||
}
|
||||
/* zero everything */
|
||||
memset(&cpu_profile, 0, sizeof(cpu_profile));
|
||||
|
||||
/* Shouldn't change within same debug session */
|
||||
cpu_profile.size = (STRamEnd + 0x20000 + TosSize) / 2;
|
||||
size = (STRamEnd + 0x20000 + TosSize) / 2;
|
||||
|
||||
/* Add one entry for catching invalid PC values */
|
||||
cpu_profile.data = calloc(cpu_profile.size+1, sizeof(*cpu_profile.data));
|
||||
if (cpu_profile.data) {
|
||||
printf("Allocated CPU profile buffer (%d MB).\n",
|
||||
(int)sizeof(*cpu_profile.data)*cpu_profile.size/(1024*1024));
|
||||
} else {
|
||||
cpu_profile.data = calloc(size + 1, sizeof(*cpu_profile.data));
|
||||
if (!cpu_profile.data) {
|
||||
perror("ERROR, new CPU profile buffer alloc failed");
|
||||
cpu_profile.enabled = false;
|
||||
return false;
|
||||
}
|
||||
cpu_profile.sites = Profile_AllocCallerInfo("CPU", cpu_profile.sites, Symbols_CpuCount(), &(cpu_profile.callsite));
|
||||
printf("Allocated CPU profile buffer (%d MB).\n",
|
||||
(int)sizeof(*cpu_profile.data)*size/(1024*1024));
|
||||
cpu_profile.size = size;
|
||||
|
||||
Profile_AllocCallinfo(&(cpu_callinfo), Symbols_CpuCount(), "CPU");
|
||||
|
||||
memset(cpu_profile.miss_counts, 0, sizeof(cpu_profile.miss_counts));
|
||||
cpu_profile.prev_cycles = Cycles_GetCounter(CYCLES_COUNTER_CPU);
|
||||
cpu_profile.prev_pc = M68000_GetPC();
|
||||
|
||||
cpu_profile.disasm_addr = 0;
|
||||
cpu_profile.processed = false;
|
||||
cpu_profile.enabled = true;
|
||||
return cpu_profile.enabled;
|
||||
}
|
||||
|
||||
/* return branch type based on caller instruction type */
|
||||
/**
|
||||
* return caller instruction type classification
|
||||
*/
|
||||
static calltype_t cpu_opcode_type(Uint32 prev_pc, Uint32 pc)
|
||||
{
|
||||
switch (OpcodeFamily) {
|
||||
@ -570,7 +579,6 @@ static calltype_t cpu_opcode_type(Uint32 prev_pc, Uint32 pc)
|
||||
return CALL_UNKNOWN;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Update CPU cycle and count statistics for PC address.
|
||||
*
|
||||
@ -588,15 +596,6 @@ void Profile_CpuUpdate(void)
|
||||
prev_pc = cpu_profile.prev_pc;
|
||||
cpu_profile.prev_pc = pc = M68000_GetPC();
|
||||
|
||||
if (cpu_profile.sites) {
|
||||
int idx = Symbols_GetCpuAddressIndex(pc);
|
||||
if (unlikely(idx >= 0 && idx < cpu_profile.sites)) {
|
||||
calltype_t flag = cpu_opcode_type(prev_pc, pc);
|
||||
Profile_UpdateCaller(cpu_profile.callsite + idx,
|
||||
pc, prev_pc, flag);
|
||||
}
|
||||
}
|
||||
|
||||
idx = address2index(prev_pc);
|
||||
assert(idx <= cpu_profile.size);
|
||||
prev = cpu_profile.data + idx;
|
||||
@ -604,6 +603,7 @@ void Profile_CpuUpdate(void)
|
||||
if (likely(prev->count < MAX_CPU_PROFILE_VALUE)) {
|
||||
prev->count++;
|
||||
}
|
||||
cpu_profile.all.count++;
|
||||
|
||||
#if USE_CYCLES_COUNTER
|
||||
/* Confusingly, with DSP enabled, cycle counter is for this instruction,
|
||||
@ -627,6 +627,7 @@ void Profile_CpuUpdate(void)
|
||||
} else {
|
||||
prev->cycles = MAX_CPU_PROFILE_VALUE;
|
||||
}
|
||||
cpu_profile.all.cycles += cycles;
|
||||
|
||||
#if ENABLE_WINUAE_CPU
|
||||
misses = CpuInstruction.iCacheMisses;
|
||||
@ -637,6 +638,7 @@ void Profile_CpuUpdate(void)
|
||||
} else {
|
||||
prev->misses = MAX_CPU_PROFILE_VALUE;
|
||||
}
|
||||
cpu_profile.all.misses += misses;
|
||||
#endif
|
||||
|
||||
#if DEBUG
|
||||
@ -657,13 +659,27 @@ void Profile_CpuUpdate(void)
|
||||
Disasm(stderr, prev_pc, &nextpc, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (cpu_callinfo.sites) {
|
||||
int symidx;
|
||||
Uint32 ret_pc;
|
||||
calltype_t flag = cpu_opcode_type(prev_pc, pc);
|
||||
if (flag == CALL_SUBROUTINE) {
|
||||
/* slow, so needs to be checked only when needed */
|
||||
ret_pc = Disasm_GetNextPC(prev_pc);
|
||||
} else {
|
||||
ret_pc = 0;
|
||||
}
|
||||
symidx = Symbols_GetCpuAddressIndex(pc);
|
||||
Profile_UpdateCallinfo(symidx, &cpu_callinfo, prev_pc, flag, pc, ret_pc, &(cpu_profile.all));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Helper for collecting CPU profile area statistics.
|
||||
* Helper for accounting CPU profile area item.
|
||||
*/
|
||||
static void update_cpu_area(Uint32 addr, cpu_profile_item_t *item, profile_area_t *area)
|
||||
static void update_area_item(profile_area_t *area, Uint32 addr, cpu_profile_item_t *item)
|
||||
{
|
||||
Uint32 cycles = item->cycles;
|
||||
Uint32 count = item->count;
|
||||
@ -671,14 +687,13 @@ static void update_cpu_area(Uint32 addr, cpu_profile_item_t *item, profile_area_
|
||||
if (!count) {
|
||||
return;
|
||||
}
|
||||
area->all_count += count;
|
||||
area->all_misses += item->misses;
|
||||
area->all_cycles += cycles;
|
||||
area->counters.count += count;
|
||||
area->counters.misses += item->misses;
|
||||
area->counters.cycles += cycles;
|
||||
|
||||
if (cycles > area->max_cycles) {
|
||||
area->max_cycles = cycles;
|
||||
if (cycles == MAX_CPU_PROFILE_VALUE) {
|
||||
area->overflow = true;
|
||||
}
|
||||
|
||||
if (addr < area->lowest) {
|
||||
area->lowest = addr;
|
||||
}
|
||||
@ -687,6 +702,40 @@ static void update_cpu_area(Uint32 addr, cpu_profile_item_t *item, profile_area_
|
||||
area->active++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper for collecting CPU profile area statistics.
|
||||
*/
|
||||
static Uint32 update_area(profile_area_t *area, Uint32 start, Uint32 end)
|
||||
{
|
||||
cpu_profile_item_t *item;
|
||||
Uint32 addr;
|
||||
|
||||
memset(area, 0, sizeof(profile_area_t));
|
||||
area->lowest = cpu_profile.size;
|
||||
|
||||
item = &(cpu_profile.data[start]);
|
||||
for (addr = start; addr < end; addr++, item++) {
|
||||
update_area_item(area, addr, item);
|
||||
}
|
||||
return addr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper for initializing CPU profile area sorting indexes.
|
||||
*/
|
||||
static Uint32* index_area(profile_area_t *area, Uint32 *sort_arr)
|
||||
{
|
||||
cpu_profile_item_t *item;
|
||||
Uint32 addr;
|
||||
|
||||
item = &(cpu_profile.data[area->lowest]);
|
||||
for (addr = area->lowest; addr <= area->highest; addr++, item++) {
|
||||
if (item->count) {
|
||||
*sort_arr++ = addr;
|
||||
}
|
||||
}
|
||||
return sort_arr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop and process the CPU profiling data; collect stats and
|
||||
@ -694,11 +743,8 @@ static void update_cpu_area(Uint32 addr, cpu_profile_item_t *item, profile_area_
|
||||
*/
|
||||
void Profile_CpuStop(void)
|
||||
{
|
||||
cpu_profile_item_t *item;
|
||||
profile_area_t *area;
|
||||
Uint32 *sort_arr;
|
||||
Uint32 *sort_arr, next;
|
||||
int active;
|
||||
Uint32 i;
|
||||
|
||||
if (cpu_profile.processed || !cpu_profile.enabled) {
|
||||
return;
|
||||
@ -706,39 +752,15 @@ void Profile_CpuStop(void)
|
||||
/* user didn't change RAM or TOS size in the meanwhile? */
|
||||
assert(cpu_profile.size == (STRamEnd + 0x20000 + TosSize) / 2);
|
||||
|
||||
/* find lowest and highest addresses executed... */
|
||||
item = cpu_profile.data;
|
||||
/* find lowest and highest addresses executed etc */
|
||||
next = update_area(&cpu_profile.ram, 0, STRamEnd/2);
|
||||
next = update_area(&cpu_profile.tos, next, (STRamEnd + TosSize)/2);
|
||||
next = update_area(&cpu_profile.ram, next, cpu_profile.size);
|
||||
assert(next == cpu_profile.size);
|
||||
|
||||
/* ...for normal RAM */
|
||||
area = &cpu_profile.ram;
|
||||
memset(area, 0, sizeof(profile_area_t));
|
||||
area->lowest = cpu_profile.size;
|
||||
|
||||
for (i = 0; i < STRamEnd/2; i++, item++) {
|
||||
update_cpu_area(i, item, area);
|
||||
}
|
||||
|
||||
/* ...for ROM TOS */
|
||||
area = &cpu_profile.tos;
|
||||
memset(area, 0, sizeof(profile_area_t));
|
||||
area->lowest = cpu_profile.size;
|
||||
|
||||
for (; i < (STRamEnd + TosSize)/2; i++, item++) {
|
||||
update_cpu_area(i, item, area);
|
||||
}
|
||||
|
||||
/* ... for Cartridge ROM */
|
||||
area = &cpu_profile.rom;
|
||||
memset(area, 0, sizeof(profile_area_t));
|
||||
area->lowest = cpu_profile.size;
|
||||
|
||||
for (; i < cpu_profile.size; i++, item++) {
|
||||
update_cpu_area(i, item, area);
|
||||
}
|
||||
|
||||
cpu_profile.all_misses = cpu_profile.ram.all_misses + cpu_profile.rom.all_misses + cpu_profile.tos.all_misses;
|
||||
cpu_profile.all_cycles = cpu_profile.ram.all_cycles + cpu_profile.rom.all_cycles + cpu_profile.tos.all_cycles;
|
||||
cpu_profile.all_count = cpu_profile.ram.all_count + cpu_profile.rom.all_count + cpu_profile.tos.all_count;
|
||||
assert(cpu_profile.all.misses == cpu_profile.ram.counters.misses + cpu_profile.rom.counters.misses + cpu_profile.tos.counters.misses);
|
||||
assert(cpu_profile.all.cycles == cpu_profile.ram.counters.cycles + cpu_profile.rom.counters.cycles + cpu_profile.tos.counters.cycles);
|
||||
assert(cpu_profile.all.count = cpu_profile.ram.counters.count + cpu_profile.rom.counters.count + cpu_profile.tos.counters.count);
|
||||
|
||||
/* allocate address array for sorting */
|
||||
active = cpu_profile.ram.active + cpu_profile.rom.active + cpu_profile.tos.active;
|
||||
@ -756,33 +778,10 @@ void Profile_CpuStop(void)
|
||||
cpu_profile.active = active;
|
||||
|
||||
/* and fill addresses for used instructions... */
|
||||
|
||||
/* ...for normal RAM */
|
||||
area = &cpu_profile.ram;
|
||||
item = cpu_profile.data + area->lowest;
|
||||
for (i = area->lowest; i <= area->highest; i++, item++) {
|
||||
if (item->count) {
|
||||
*sort_arr++ = i;
|
||||
}
|
||||
}
|
||||
|
||||
/* ...for TOS ROM */
|
||||
area = &cpu_profile.tos;
|
||||
item = cpu_profile.data + area->lowest;
|
||||
for (i = area->lowest; i <= area->highest; i++, item++) {
|
||||
if (item->count) {
|
||||
*sort_arr++ = i;
|
||||
}
|
||||
}
|
||||
|
||||
/* ...for Cartridge ROM */
|
||||
area = &cpu_profile.rom;
|
||||
item = cpu_profile.data + area->lowest;
|
||||
for (i = area->lowest; i <= area->highest; i++, item++) {
|
||||
if (item->count) {
|
||||
*sort_arr++ = i;
|
||||
}
|
||||
}
|
||||
sort_arr = index_area(&cpu_profile.ram, sort_arr);
|
||||
sort_arr = index_area(&cpu_profile.tos, sort_arr);
|
||||
sort_arr = index_area(&cpu_profile.rom, sort_arr);
|
||||
assert(sort_arr == cpu_profile.sort_arr + cpu_profile.active);
|
||||
//printf("%d/%d/%d\n", area->active, sort_arr-cpu_profile.sort_arr, active);
|
||||
|
||||
Profile_CpuShowStats();
|
||||
|
@ -19,6 +19,8 @@ const char Profiledsp_fileid[] = "Hatari profiledsp.c : " __DATE__ " " __TIME__;
|
||||
#include "profile_priv.h"
|
||||
#include "symbols.h"
|
||||
|
||||
static callinfo_t dsp_callinfo;
|
||||
|
||||
#define DSP_PROFILE_ARR_SIZE 0x10000
|
||||
#define MAX_DSP_PROFILE_VALUE 0xFFFFFFFFFFFFFFFFLL
|
||||
|
||||
@ -31,9 +33,7 @@ typedef struct {
|
||||
|
||||
static struct {
|
||||
dsp_profile_item_t *data; /* profile data */
|
||||
profile_area_t ram; /* normal RAM stats */
|
||||
int sites; /* number of symbol callsites */
|
||||
callee_t *callsite; /* symbol specific caller information */
|
||||
profile_area_t ram; /* statistics for whole memory */
|
||||
Uint16 *sort_arr; /* data indexes used for sorting */
|
||||
Uint16 prev_pc; /* previous PC for which the cycles are for */
|
||||
Uint32 disasm_addr; /* 'dspaddresses' command start address */
|
||||
@ -63,8 +63,8 @@ bool Profile_DspAddressData(Uint16 addr, float *percentage, Uint64 *count, Uint6
|
||||
} else {
|
||||
*cycle_diff = 0;
|
||||
}
|
||||
if (dsp_profile.ram.all_count) {
|
||||
*percentage = 100.0*(*count)/dsp_profile.ram.all_count;
|
||||
if (dsp_profile.ram.counters.count) {
|
||||
*percentage = 100.0*(*count)/dsp_profile.ram.counters.count;
|
||||
} else {
|
||||
*percentage = 0.0;
|
||||
}
|
||||
@ -87,20 +87,20 @@ void Profile_DspShowStats(void)
|
||||
fprintf(stderr, "- active instruction addresses:\n %d\n",
|
||||
area->active);
|
||||
fprintf(stderr, "- executed instructions:\n %llu\n",
|
||||
area->all_count);
|
||||
area->counters.count);
|
||||
/* indicates either instruction(s) that address different memory areas
|
||||
* (they can have different access costs), or more significantly,
|
||||
* DSP code that has changed during profiling.
|
||||
*/
|
||||
fprintf(stderr, "- sum of per instruction cycle changes\n (can indicate code change during profiling):\n %llu\n",
|
||||
area->all_misses);
|
||||
area->counters.misses);
|
||||
|
||||
fprintf(stderr, "- used cycles:\n %llu\n",
|
||||
area->all_cycles);
|
||||
if (area->max_cycles == MAX_DSP_PROFILE_VALUE) {
|
||||
area->counters.cycles);
|
||||
if (area->overflow) {
|
||||
fprintf(stderr, " *** COUNTERS OVERFLOW! ***\n");
|
||||
}
|
||||
fprintf(stderr, "\n= %.5fs\n", (double)(area->all_cycles) / MachineClocks.DSP_Freq);
|
||||
fprintf(stderr, "\n= %.5fs\n", (double)(area->counters.cycles) / MachineClocks.DSP_Freq);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -199,7 +199,7 @@ void Profile_DspShowCycles(int show)
|
||||
for (end = sort_arr + show; sort_arr < end; sort_arr++) {
|
||||
addr = *sort_arr;
|
||||
count = data[addr].cycles;
|
||||
percentage = 100.0*count/dsp_profile.ram.all_cycles;
|
||||
percentage = 100.0*count/dsp_profile.ram.counters.cycles;
|
||||
printf("0x%04x\t%.2f%%\t%lld%s\n", addr, percentage, count,
|
||||
count == MAX_DSP_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
}
|
||||
@ -253,7 +253,7 @@ void Profile_DspShowCounts(int show, bool only_symbols)
|
||||
for (end = sort_arr + show; sort_arr < end; sort_arr++) {
|
||||
addr = *sort_arr;
|
||||
count = data[addr].count;
|
||||
percentage = 100.0*count/dsp_profile.ram.all_count;
|
||||
percentage = 100.0*count/dsp_profile.ram.counters.count;
|
||||
printf("0x%04x\t%.2f%%\t%lld%s\n",
|
||||
addr, percentage, count,
|
||||
count == MAX_DSP_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
@ -278,7 +278,7 @@ void Profile_DspShowCounts(int show, bool only_symbols)
|
||||
continue;
|
||||
}
|
||||
count = data[addr].count;
|
||||
percentage = 100.0*count/dsp_profile.ram.all_count;
|
||||
percentage = 100.0*count/dsp_profile.ram.counters.count;
|
||||
printf("0x%04x\t%.2f%%\t%lld\t%s%s\n",
|
||||
addr, percentage, count, name,
|
||||
count == MAX_DSP_PROFILE_VALUE ? " (OVERFLOW)" : "");
|
||||
@ -303,7 +303,7 @@ static const char * addr2name(Uint32 addr, Uint64 *total)
|
||||
*/
|
||||
void Profile_DspShowCallers(FILE *fp)
|
||||
{
|
||||
Profile_ShowCallers(fp, dsp_profile.sites, dsp_profile.callsite, addr2name);
|
||||
Profile_ShowCallers(fp, dsp_callinfo.sites, dsp_callinfo.site, addr2name);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -334,6 +334,7 @@ bool Profile_DspStart(void)
|
||||
dsp_profile_item_t *item;
|
||||
int i;
|
||||
|
||||
Profile_FreeCallinfo(&(dsp_callinfo));
|
||||
if (dsp_profile.sort_arr) {
|
||||
/* remove previous results */
|
||||
free(dsp_profile.sort_arr);
|
||||
@ -345,33 +346,34 @@ bool Profile_DspStart(void)
|
||||
if (!dsp_profile.enabled) {
|
||||
return false;
|
||||
}
|
||||
/* zero everything */
|
||||
memset(&dsp_profile, 0, sizeof(dsp_profile));
|
||||
|
||||
dsp_profile.data = calloc(DSP_PROFILE_ARR_SIZE, sizeof(*dsp_profile.data));
|
||||
if (dsp_profile.data) {
|
||||
printf("Allocated DSP profile buffer (%d KB).\n",
|
||||
(int)sizeof(*dsp_profile.data)*DSP_PROFILE_ARR_SIZE/1024);
|
||||
} else {
|
||||
if (!dsp_profile.data) {
|
||||
perror("ERROR, new DSP profile buffer alloc failed");
|
||||
dsp_profile.enabled = false;
|
||||
return false;
|
||||
}
|
||||
printf("Allocated DSP profile buffer (%d KB).\n",
|
||||
(int)sizeof(*dsp_profile.data)*DSP_PROFILE_ARR_SIZE/1024);
|
||||
|
||||
Profile_AllocCallinfo(&(dsp_callinfo), Symbols_DspCount(), "DSP");
|
||||
|
||||
item = dsp_profile.data;
|
||||
for (i = 0; i < DSP_PROFILE_ARR_SIZE; i++, item++) {
|
||||
item->min_cycle = 0xFFFF;
|
||||
}
|
||||
|
||||
dsp_profile.sites = Profile_AllocCallerInfo("DSP", dsp_profile.sites, Symbols_DspCount(), &(dsp_profile.callsite));
|
||||
|
||||
dsp_profile.prev_pc = DSP_GetPC();
|
||||
|
||||
dsp_profile.disasm_addr = 0;
|
||||
dsp_profile.processed = false;
|
||||
dsp_profile.enabled = true;
|
||||
return dsp_profile.enabled;
|
||||
}
|
||||
|
||||
/* return branch type based on caller instruction type */
|
||||
static calltype_t dsp_opcode_type(Uint16 prev_pc, Uint16 pc)
|
||||
{
|
||||
/* not supported (yet) */
|
||||
return CALL_UNDEFINED;
|
||||
}
|
||||
|
||||
@ -388,18 +390,12 @@ void Profile_DspUpdate(void)
|
||||
|
||||
prev_pc = dsp_profile.prev_pc;
|
||||
dsp_profile.prev_pc = pc = DSP_GetPC();
|
||||
if (dsp_profile.sites) {
|
||||
int idx = Symbols_GetDspAddressIndex(pc);
|
||||
if (unlikely(idx >= 0 && idx < dsp_profile.sites)) {
|
||||
calltype_t flag = dsp_opcode_type(prev_pc, pc);
|
||||
Profile_UpdateCaller(dsp_profile.callsite + idx,
|
||||
pc, prev_pc, flag);
|
||||
}
|
||||
}
|
||||
prev = dsp_profile.data + prev_pc;
|
||||
|
||||
if (likely(prev->count < MAX_DSP_PROFILE_VALUE)) {
|
||||
prev->count++;
|
||||
}
|
||||
dsp_profile.ram.counters.count++;
|
||||
|
||||
cycles = DSP_GetInstrCycles();
|
||||
if (likely(prev->cycles < MAX_DSP_PROFILE_VALUE - cycles)) {
|
||||
@ -407,18 +403,34 @@ void Profile_DspUpdate(void)
|
||||
} else {
|
||||
prev->cycles = MAX_DSP_PROFILE_VALUE;
|
||||
}
|
||||
dsp_profile.ram.counters.cycles += cycles;
|
||||
|
||||
if (unlikely(cycles < prev->min_cycle)) {
|
||||
prev->min_cycle = cycles;
|
||||
}
|
||||
if (unlikely(cycles > prev->max_cycle)) {
|
||||
prev->max_cycle = cycles;
|
||||
}
|
||||
|
||||
if (dsp_callinfo.sites) {
|
||||
int symidx;
|
||||
Uint16 ret_pc;
|
||||
calltype_t flag = dsp_opcode_type(prev_pc, pc);
|
||||
if (flag == CALL_SUBROUTINE) {
|
||||
/* slow, so needs to be checked only when needed */
|
||||
ret_pc = DSP_GetNextPC(prev_pc);
|
||||
} else {
|
||||
ret_pc = 0;
|
||||
}
|
||||
symidx = Symbols_GetDspAddressIndex(pc);
|
||||
Profile_UpdateCallinfo(symidx, &dsp_callinfo, prev_pc, flag, pc, ret_pc, &(dsp_profile.ram.counters));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper for collecting DSP profile area statistics.
|
||||
*/
|
||||
static void update_dsp_area(Uint16 addr, dsp_profile_item_t *item, profile_area_t *area)
|
||||
static void update_area_item(profile_area_t *area, Uint16 addr, dsp_profile_item_t *item)
|
||||
{
|
||||
Uint64 cycles = item->cycles;
|
||||
Uint64 count = item->count;
|
||||
@ -427,17 +439,15 @@ static void update_dsp_area(Uint16 addr, dsp_profile_item_t *item, profile_area_
|
||||
if (!count) {
|
||||
return;
|
||||
}
|
||||
area->all_count += count;
|
||||
area->all_cycles += cycles;
|
||||
if (cycles > area->max_cycles) {
|
||||
area->max_cycles = cycles;
|
||||
if (cycles == MAX_DSP_PROFILE_VALUE) {
|
||||
area->overflow = true;
|
||||
}
|
||||
if (item->max_cycle) {
|
||||
diff = item->max_cycle - item->min_cycle;
|
||||
} else {
|
||||
diff = 0;
|
||||
}
|
||||
area->all_misses += diff;
|
||||
area->counters.misses += diff;
|
||||
|
||||
if (addr < area->lowest) {
|
||||
area->lowest = addr;
|
||||
@ -456,19 +466,19 @@ void Profile_DspStop(void)
|
||||
dsp_profile_item_t *item;
|
||||
profile_area_t *area;
|
||||
Uint16 *sort_arr;
|
||||
Uint32 i;
|
||||
Uint32 addr;
|
||||
|
||||
if (dsp_profile.processed || !dsp_profile.enabled) {
|
||||
return;
|
||||
}
|
||||
/* find lowest and highest addresses executed */
|
||||
item = dsp_profile.data;
|
||||
area = &dsp_profile.ram;
|
||||
item = dsp_profile.data;
|
||||
memset(area, 0, sizeof(profile_area_t));
|
||||
area->lowest = DSP_PROFILE_ARR_SIZE;
|
||||
|
||||
for (i = 0; i < DSP_PROFILE_ARR_SIZE; i++, item++) {
|
||||
update_dsp_area(i, item, area);
|
||||
for (addr = 0; addr < DSP_PROFILE_ARR_SIZE; addr++, item++) {
|
||||
update_area_item(area, addr, item);
|
||||
}
|
||||
|
||||
/* allocate address array for sorting */
|
||||
@ -486,10 +496,10 @@ void Profile_DspStop(void)
|
||||
|
||||
/* ...and fill addresses for used instructions... */
|
||||
area = &dsp_profile.ram;
|
||||
item = dsp_profile.data + area->lowest;
|
||||
for (i = area->lowest; i <= area->highest; i++, item++) {
|
||||
item = &(dsp_profile.data[area->lowest]);
|
||||
for (addr = area->lowest; addr <= area->highest; addr++, item++) {
|
||||
if (item->count) {
|
||||
*sort_arr++ = i;
|
||||
*sort_arr++ = addr;
|
||||
}
|
||||
}
|
||||
//printf("%d/%d/%d\n", area->active, sort_arr-dsp_profile.sort_arr, active);
|
||||
|
@ -528,6 +528,46 @@ class ProfileSymbols(Output):
|
||||
return self.get_area(addr)
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
class Callinfo:
|
||||
"class representing parsed profile data caller/callee linkage information"
|
||||
|
||||
def __init__(self, addr, calls, flags, intotal, extotal):
|
||||
self.addr = addr
|
||||
# how many times this caller called the destination
|
||||
self.calls = int(calls, 10)
|
||||
# what kind of "call" it was?
|
||||
if flags:
|
||||
self.flags = flags.strip()
|
||||
else:
|
||||
self.flags = "-"
|
||||
self.total = [None, None]
|
||||
# cost of these calls including further function calls
|
||||
self._parse_totals(0, intotal)
|
||||
# costs exclusing further function calls
|
||||
self._parse_totals(1, extotal)
|
||||
|
||||
def _parse_totals(self, idx, totalstr):
|
||||
"parse '/' separate totals string into integer sequence, or None"
|
||||
if totalstr:
|
||||
self.total[idx] = [int(x, 10) for x in totalstr.strip().split('/')]
|
||||
|
||||
def _add_total(self, idx, newtotal):
|
||||
"add given totals to own totals"
|
||||
mytotal = self.total[idx]
|
||||
if mytotal and newtotal:
|
||||
self.total[idx] = [x + y for x, y in zip(mytotal, newtotal)]
|
||||
if newtotal:
|
||||
self.total[idx] = newtotal
|
||||
|
||||
def add(self, newinfo):
|
||||
"add counts and totals from other item to this one"
|
||||
self.calls += newinfo.calls
|
||||
for i in range(len(self.total)):
|
||||
self._add_total(i, newinfo.total[i])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
class ProfileCallers(Output):
|
||||
"profile data callee/caller information parser & handler"
|
||||
@ -535,9 +575,8 @@ class ProfileCallers(Output):
|
||||
def __init__(self):
|
||||
Output.__init__(self)
|
||||
# caller info in callee line:
|
||||
# 0x<hex>: 0x<hex> = <count>, N*[0x<hex> = <count>,][ (<symbol>)
|
||||
# 0x<hex> = <count>
|
||||
self.r_caller = re.compile("^0x([0-9a-f]+) = ([0-9]+)( [a-z]+)?$")
|
||||
# 0x<hex> = <count> <typeletter>[ inclusive/totals][ exclusive/totals]
|
||||
self.r_caller = re.compile("^0x([0-9a-f]+) = ([0-9]+)( [a-z]+)?( [0-9/]+)?( [0-9/]+)?$")
|
||||
# whether there is any caller info
|
||||
self.present = False
|
||||
# address dicts
|
||||
@ -567,7 +606,7 @@ class ProfileCallers(Output):
|
||||
|
||||
def parse_callers(self, fobj, parsed, line):
|
||||
"parse callee: caller call count information"
|
||||
#0x<hex>: 0x<hex> = <count>, N*[0x<hex> = <count>,][ (<symbol>)
|
||||
#0x<hex>: 0x<hex> = <count>[ <flags>][, <inclusive/totals>[, <exclusive/totals>]]; N*[0x<hex> = <count>...;][ <symbol>]
|
||||
self.callinfo = {}
|
||||
while True:
|
||||
parsed += 1
|
||||
@ -583,24 +622,19 @@ class ProfileCallers(Output):
|
||||
self.error_exit("caller info missing on callee line %d\n\t'%s'" % (parsed, line))
|
||||
if ':' not in callers[0]:
|
||||
self.error_exit("callee/caller separator ':' missing on callee line %d\n\t'%s'" % (parsed, line))
|
||||
addr, callers[0] = callers[0].split(':')
|
||||
last = callers[-1].strip()
|
||||
if len(last) and last[-1] != ')':
|
||||
if last.startswith('0x'):
|
||||
self.error_exit("last item isn't empty or symbol name on callee line %d\n\t'%s'" % (parsed, line))
|
||||
|
||||
addr, callers[0] = callers[0].split(':')
|
||||
callinfo = {}
|
||||
for caller in callers[:-1]:
|
||||
caller = caller.strip()
|
||||
match = self.r_caller.match(caller)
|
||||
if match:
|
||||
caddr, count, flags = match.groups()
|
||||
caddr = int(caddr, 16)
|
||||
count = int(count, 10)
|
||||
if flags:
|
||||
flags = flags.strip()
|
||||
else:
|
||||
flags = '-'
|
||||
callinfo[caddr] = (count, flags)
|
||||
items = match.groups()
|
||||
caddr = int(items[0], 16)
|
||||
callinfo[caddr] = Callinfo(caddr, *items[1:])
|
||||
else:
|
||||
self.error_exit("unrecognized caller info '%s' on callee line %d\n\t'%s'" % (caller, parsed, line))
|
||||
self.callinfo[int(addr, 16)] = callinfo
|
||||
@ -621,14 +655,12 @@ class ProfileCallers(Output):
|
||||
continue
|
||||
# ...and their callers
|
||||
ignore = total = 0
|
||||
for item in caller.items():
|
||||
laddr, info = item
|
||||
count, flags = info
|
||||
for laddr, info in caller.items():
|
||||
pname, offset = symbols.get_preceeding_symbol(laddr)
|
||||
if len(flags) > 1:
|
||||
self.warning("caller instruction change ('%s') detected for '%s', did its code change during profiling?" % (flags, pname))
|
||||
elif flags in self.removable_calltypes:
|
||||
ignore += count
|
||||
if len(info.flags) > 1:
|
||||
self.warning("caller instruction change ('%s') detected for '%s', did its code change during profiling?" % (info.flags, pname))
|
||||
elif info.flags in self.removable_calltypes:
|
||||
ignore += info.calls
|
||||
continue
|
||||
# function address for the caller
|
||||
paddr = laddr - offset
|
||||
@ -637,17 +669,15 @@ class ProfileCallers(Output):
|
||||
self.warning("overriding parsed function 0x%x name '%s' with resolved caller 0x%x name '%s'" % (parent.addr, parent.name, paddr, pname))
|
||||
parent.name = pname
|
||||
# link parent and child function together
|
||||
item = (laddr, count, flags)
|
||||
if paddr in child.parent:
|
||||
if self.compact:
|
||||
oldcount = child.parent[paddr][0][1]
|
||||
child.parent[paddr] = ((paddr, oldcount + count, flags),)
|
||||
child.parent[paddr][0].add(info)
|
||||
else:
|
||||
child.parent[paddr] += (item,)
|
||||
child.parent[paddr] += (info,)
|
||||
else:
|
||||
child.parent[paddr] = (item,)
|
||||
child.parent[paddr] = (info,)
|
||||
parent.child[caddr] = True
|
||||
total += count
|
||||
total += info.calls
|
||||
# validate call count
|
||||
if ignore:
|
||||
all_ignored += ignore
|
||||
@ -687,7 +717,7 @@ class ProfileCallers(Output):
|
||||
total = 0
|
||||
# add together calls from this particular parent
|
||||
for item in info:
|
||||
total += item[1] # count
|
||||
total += item.calls
|
||||
if parents == 1 and total != calls:
|
||||
self.error_exit("%s -> %s, single parent, but callcounts don't match: %d != %d" % (parent.name, child.name, total, calls))
|
||||
# parent's share of current propagated child costs
|
||||
@ -784,7 +814,7 @@ class ProfileCallgrind(Output):
|
||||
info = child.parent[paddr]
|
||||
total = 0
|
||||
for item in info:
|
||||
total += item[1] # count
|
||||
total += item.calls
|
||||
self.write("cfn=%s\n" % child.name)
|
||||
self.write("calls=%d %d\n" % (total, child.line))
|
||||
if child.total:
|
||||
@ -1438,8 +1468,8 @@ label="%s";
|
||||
# total calls count for child
|
||||
calls = profile[caddr].data[0]
|
||||
# calls to child done from different locations in parent
|
||||
for laddr, count, flags in info:
|
||||
self.edges[(laddr, caddr)] = (paddr, count, calls, flags)
|
||||
for edge in info:
|
||||
self.edges[(edge.addr, caddr)] = (paddr, calls, edge)
|
||||
return (len(profile) - len(self.nodes))
|
||||
|
||||
def _output_nodes(self, stats, field, limit):
|
||||
@ -1487,7 +1517,7 @@ label="%s";
|
||||
"output graph edges from filtered edges dict, after nodes is called"
|
||||
for linkage, info in self.edges.items():
|
||||
laddr, caddr = linkage
|
||||
paddr, count, calls, flags = info
|
||||
paddr, calls, edge = info
|
||||
pname = self.profile[paddr].name
|
||||
offset = laddr - paddr
|
||||
style = ""
|
||||
@ -1496,6 +1526,7 @@ label="%s";
|
||||
# arrowhead/tail styles:
|
||||
# none, normal, inv, dot, odot, invdot, invodot, tee, empty,
|
||||
# invempty, open, halfopen, diamond, odiamond, box, obox, crow
|
||||
flags = edge.flags
|
||||
if flags == CALL_NEXT or flags == CALL_BRANCH:
|
||||
style += " arrowhead=dot"
|
||||
elif flags == CALL_SUBROUTINE:
|
||||
@ -1512,9 +1543,9 @@ label="%s";
|
||||
label = "%s+%d\\n($%x)" % (pname, offset, laddr)
|
||||
else:
|
||||
label = pname
|
||||
if count != calls:
|
||||
percentage = 100.0 * count / calls
|
||||
label = "%s\\n%d calls\\n=%.2f%%" % (label, count, percentage)
|
||||
if edge.calls != calls:
|
||||
percentage = 100.0 * edge.calls / calls
|
||||
label = "%s\\n%d calls\\n=%.2f%%" % (label, edge.calls, percentage)
|
||||
self.write("N_%X -> N_%X [label=\"%s\"%s];\n" % (paddr, caddr, label, style))
|
||||
|
||||
def do_output(self, profobj, fname):
|
||||
@ -1522,7 +1553,6 @@ label="%s";
|
||||
if not (self.output_enabled and profobj.callers.present):
|
||||
return
|
||||
stats = profobj.stats
|
||||
basename = os.path.splitext(fname)[0]
|
||||
for field in range(profobj.stats.items):
|
||||
if not stats.totals[field]:
|
||||
continue
|
||||
|
Loading…
Reference in New Issue
Block a user