ARM ETM traces contain packets of executed instructions. The addresses recorded in the trace correspond to the beginning and one instruction past the end of the block of executed instructions. In order to record the address and destination of executed branches, keep track of two packets: the jump occurs from the last executed instruction of the previous packet to the address of the first instruction in the current packet.
Signed-off-by: Sebastian Pop s.pop@samsung.com Signed-off-by: Brian Rzycki b.rzycki@samsung.com --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 12 ++++ tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 1 + tools/perf/util/cs-etm.c | 91 +++++++++++++++++-------- 3 files changed, 75 insertions(+), 29 deletions(-)
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index b4cbf4c..6164bfe 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -143,6 +143,18 @@ static int cs_etm_decoder__buffer_packet(struct cs_etm_decoder *decoder, decoder->packet_buffer[et].sample_type = sample_type; decoder->packet_buffer[et].start_addr = elem->st_addr; decoder->packet_buffer[et].end_addr = elem->en_addr; + switch (elem->last_i_type) { + case OCSD_INSTR_BR: + case OCSD_INSTR_BR_INDIRECT: + decoder->packet_buffer[et].last_instruction_is_branch = true; + break; + case OCSD_INSTR_OTHER: + case OCSD_INSTR_ISB: + case OCSD_INSTR_DSB_DMB: + default: + decoder->packet_buffer[et].last_instruction_is_branch = false; + break; + } decoder->packet_buffer[et].exc = false; decoder->packet_buffer[et].exc_ret = false; decoder->packet_buffer[et].cpu = *((int*)inode->priv); diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index d15a288..4fd5a9b 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -49,6 +49,7 @@ struct cs_etm_packet { enum cs_etm_sample_type sample_type; uint64_t start_addr; uint64_t end_addr; + bool last_instruction_is_branch; bool exc; bool exc_ret; int cpu; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 8270203..c99b728 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -104,6 +104,10 @@ struct cs_etm_queue { * inserted. */ size_t last_branch_pos; + /* + * A pointer to the last and current decoded packets. + */ + struct cs_etm_packet *prev_packet, *packet; };
static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq); @@ -226,6 +230,8 @@ static void cs_etm__free_queue(void *priv) zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb); + zfree(&etmq->prev_packet); + zfree(&etmq->packet); zfree(&etmq->chain); free(etmq); } @@ -392,6 +398,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
if (etm->synth_opts.last_branch) { size_t sz = sizeof(struct branch_stack); + size_t szp = sizeof(struct cs_etm_packet);
sz += etm->synth_opts.last_branch_sz * sizeof(struct branch_entry); @@ -401,6 +408,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, etmq->last_branch_rb = zalloc(sz); if (!etmq->last_branch_rb) goto out_free; + etmq->prev_packet = zalloc(szp); + if (!etmq->prev_packet) + goto out_free; + etmq->packet = zalloc(szp); + if (!etmq->packet) + goto out_free; }
etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); @@ -451,6 +464,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb); + zfree(&etmq->prev_packet); + zfree(&etmq->packet); zfree(&etmq->chain); free(etmq); return NULL; @@ -594,8 +609,7 @@ static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) etmq->last_branch_rb->nr = 0; }
-static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq, - struct cs_etm_packet *packet) +static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq) { struct branch_stack *bs = etmq->last_branch_rb; struct branch_entry *be; @@ -611,10 +625,16 @@ static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
etmq->last_branch_pos -= 1;
- be = &bs->entries[etmq->last_branch_pos]; - be->from = packet->start_addr; - be->to = packet->end_addr; - /* No support for mispredict */ + be = &bs->entries[etmq->last_branch_pos]; + + /* + * The FROM address needs to be the instruction before the end of the + * packet at END_ADDR: subtract from END_ADDR the size of the last + * instruction: 4 bytes. + */ + be->from = etmq->prev_packet->end_addr - 4; + be->to = etmq->packet->start_addr; + /* No support for mispredict. */ be->flags.mispred = 0; be->flags.predicted = 1;
@@ -721,13 +741,13 @@ static struct cs_etm_cache_entry *cs_etm__cache_lookup(struct dso *dso, } #endif
-static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, - struct cs_etm_packet *packet) +static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq) { int ret = 0; struct cs_etm_auxtrace *etm = etmq->etm; union perf_event *event = etmq->event_buf; struct perf_sample sample = {.ip = 0,}; + struct cs_etm_packet *packet = etmq->packet; uint64_t start_addr = packet->start_addr; uint64_t end_addr = packet->end_addr;
@@ -941,17 +961,18 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm,
static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) { - //const struct cs_etm_state *state = etmq->state; - struct cs_etm_packet packet; - //struct cs_etm_auxtrace *etm = etmq->etm; - int err; + struct cs_etm_auxtrace *etm = etmq->etm; + struct cs_etm_packet *tmp; + int err = cs_etm_decoder__get_packet(etmq->decoder, etmq->packet);
- err = cs_etm_decoder__get_packet(etmq->decoder,&packet); - // if there is no sample, it returns err = -1, no real error + /* + * If there is no sample, it returns err = -1, no real error. + */ if (err) return err; - if (etmq->etm->synth_opts.last_branch) { - *cpu = packet.cpu; + + if (etm->synth_opts.last_branch) { + *cpu = etmq->packet->cpu;
/* * FIXME: as the trace sampling does not work for now, (for @@ -961,17 +982,30 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) * should be rewritten as "if reached sampling period". */ if (etmq->last_branch_rb->nr == - etmq->etm->synth_opts.last_branch_sz) { - err = cs_etm__synth_instruction_sample(etmq, &packet); + etm->synth_opts.last_branch_sz) { + err = cs_etm__synth_instruction_sample(etmq); if (err) return err; } - cs_etm__update_last_branch_rb(etmq, &packet); - } else if (packet.sample_type & CS_ETM_RANGE) { - err = cs_etm__synth_instruction_sample(etmq,&packet); - if (err) - return err; - } + /* + * Record a branch when the last instruction in PREV_PACKET is a + * branch. + */ + if (etmq->prev_packet->last_instruction_is_branch) + cs_etm__update_last_branch_rb(etmq); + + /* + * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for + * the next incoming packet. + */ + tmp = etmq->packet; + etmq->packet = etmq->prev_packet; + etmq->prev_packet = tmp; + } else if (etmq->packet->sample_type & CS_ETM_RANGE) { + err = cs_etm__synth_instruction_sample(etmq); + if (err) + return err; + } return 0; }
@@ -1016,15 +1050,14 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp) * buffer at the end of the trace. */ if (etmq->etm->synth_opts.last_branch) { - struct cs_etm_packet packet; struct branch_stack *bs = etmq->last_branch_rb; struct branch_entry *be = &bs->entries[etmq->last_branch_pos];
- packet.cpu = cpu; - packet.start_addr = be->from; - packet.end_addr = be->to; + etmq->packet->cpu = cpu; + etmq->packet->start_addr = be->to; + etmq->packet->end_addr = be->to + 4;
- err = cs_etm__synth_instruction_sample(etmq, &packet); + err = cs_etm__synth_instruction_sample(etmq); if (err) return err; }
The original patch adding support for translating ARM ETM traces into last branch records was copied from Intel-PT. We kept the code that inserts the branches decoded from the trace into a circular buffer in reverse chronological order, and that produces branch stacks in which the branch addresses are in reverse order for ARM ETM.
This patch records the branches in the same chronological order as they appear in the ETM trace. The order of the branches in a branch stack produced by "perf inject" now matches the branch order of a stack recorded by "perf record -b" on x86.
Signed-off-by: Sebastian Pop s.pop@samsung.com Signed-off-by: Brian Rzycki b.rzycki@samsung.com --- tools/perf/util/cs-etm.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-)
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index c99b728..271b742 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -580,32 +580,28 @@ static inline void cs_etm__copy_last_branch_rb(struct cs_etm_queue *etmq) return;
/* - * As bs_src->entries is a circular buffer, we need to copy from it in - * two steps. First, copy the branches from the most recently inserted - * branch ->last_branch_pos until the end of bs_src->entries buffer. + * If we wrapped around at least once, the branches from last_branch_pos + * element to last_branch_sz are older valid branches: copy them over. */ - nr = etmq->etm->synth_opts.last_branch_sz - etmq->last_branch_pos; - memcpy(&bs_dst->entries[0], - &bs_src->entries[etmq->last_branch_pos], - sizeof(struct branch_entry) * nr); + if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) { + nr = etmq->etm->synth_opts.last_branch_sz + - etmq->last_branch_pos - 1; + memcpy(&bs_dst->entries[0], + &bs_src->entries[etmq->last_branch_pos + 1], + sizeof(struct branch_entry) * nr); + }
/* - * If we wrapped around at least once, the branches from the beginning - * of the bs_src->entries buffer and until the ->last_branch_pos element - * are older valid branches: copy them over. The total number of - * branches copied over will be equal to the number of branches asked by - * the user in last_branch_sz. + * Copy the branches from the most recently inserted branches from 0 + * to last_branch_pos included. */ - if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) { - memcpy(&bs_dst->entries[nr], - &bs_src->entries[0], - sizeof(struct branch_entry) * etmq->last_branch_pos); - } + memcpy(&bs_dst->entries[nr], &bs_src->entries[0], + sizeof(struct branch_entry) * (etmq->last_branch_pos + 1)); }
static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) { - etmq->last_branch_pos = 0; + etmq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz - 1; etmq->last_branch_rb->nr = 0; }
@@ -615,15 +611,14 @@ static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq) struct branch_entry *be;
/* - * The branches are recorded in a circular buffer in reverse - * chronological order: we start recording from the last element of the - * buffer down. After writing the first element of the stack, move the - * insert position back to the end of the buffer. + * Record branches in a circular buffer in chronological order. After + * writing the last element of the stack, move the insert position back + * to the beginning of the buffer. */ - if (!etmq->last_branch_pos) - etmq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz; - - etmq->last_branch_pos -= 1; + if (etmq->last_branch_pos == etmq->etm->synth_opts.last_branch_sz - 1) + etmq->last_branch_pos = 0; + else + etmq->last_branch_pos += 1;
be = &bs->entries[etmq->last_branch_pos];
Hi Sebastian,
On Tue, May 23, 2017 at 09:09:30AM -0500, Sebastian Pop wrote:
ARM ETM traces contain packets of executed instructions. The addresses recorded in the trace correspond to the beginning and one instruction past the end of the block of executed instructions. In order to record the address and destination of executed branches, keep track of two packets: the jump occurs from the last executed instruction of the previous packet to the address of the first instruction in the current packet.
Are the instructions in the autoFDO section of the HOWTO.md on GitHub sufficient to test this or there is another way?
Thanks, Mathieu
Signed-off-by: Sebastian Pop s.pop@samsung.com Signed-off-by: Brian Rzycki b.rzycki@samsung.com
tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 12 ++++ tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 1 + tools/perf/util/cs-etm.c | 91 +++++++++++++++++-------- 3 files changed, 75 insertions(+), 29 deletions(-)
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index b4cbf4c..6164bfe 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -143,6 +143,18 @@ static int cs_etm_decoder__buffer_packet(struct cs_etm_decoder *decoder, decoder->packet_buffer[et].sample_type = sample_type; decoder->packet_buffer[et].start_addr = elem->st_addr; decoder->packet_buffer[et].end_addr = elem->en_addr;
- switch (elem->last_i_type) {
- case OCSD_INSTR_BR:
- case OCSD_INSTR_BR_INDIRECT:
decoder->packet_buffer[et].last_instruction_is_branch = true;
break;
- case OCSD_INSTR_OTHER:
- case OCSD_INSTR_ISB:
- case OCSD_INSTR_DSB_DMB:
- default:
decoder->packet_buffer[et].last_instruction_is_branch = false;
break;
- } decoder->packet_buffer[et].exc = false; decoder->packet_buffer[et].exc_ret = false; decoder->packet_buffer[et].cpu = *((int*)inode->priv);
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index d15a288..4fd5a9b 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -49,6 +49,7 @@ struct cs_etm_packet { enum cs_etm_sample_type sample_type; uint64_t start_addr; uint64_t end_addr;
- bool last_instruction_is_branch; bool exc; bool exc_ret; int cpu;
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 8270203..c99b728 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -104,6 +104,10 @@ struct cs_etm_queue { * inserted. */ size_t last_branch_pos;
- /*
* A pointer to the last and current decoded packets.
*/
- struct cs_etm_packet *prev_packet, *packet;
}; static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq); @@ -226,6 +230,8 @@ static void cs_etm__free_queue(void *priv) zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb);
- zfree(&etmq->prev_packet);
- zfree(&etmq->packet); zfree(&etmq->chain); free(etmq);
} @@ -392,6 +398,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, if (etm->synth_opts.last_branch) { size_t sz = sizeof(struct branch_stack);
size_t szp = sizeof(struct cs_etm_packet);
sz += etm->synth_opts.last_branch_sz * sizeof(struct branch_entry); @@ -401,6 +408,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, etmq->last_branch_rb = zalloc(sz); if (!etmq->last_branch_rb) goto out_free;
etmq->prev_packet = zalloc(szp);
if (!etmq->prev_packet)
goto out_free;
etmq->packet = zalloc(szp);
if (!etmq->packet)
goto out_free;
}
etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); @@ -451,6 +464,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb);
- zfree(&etmq->prev_packet);
- zfree(&etmq->packet); zfree(&etmq->chain); free(etmq); return NULL;
@@ -594,8 +609,7 @@ static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) etmq->last_branch_rb->nr = 0; } -static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
struct cs_etm_packet *packet)
+static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq) { struct branch_stack *bs = etmq->last_branch_rb; struct branch_entry *be; @@ -611,10 +625,16 @@ static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq, etmq->last_branch_pos -= 1;
- be = &bs->entries[etmq->last_branch_pos];
- be->from = packet->start_addr;
- be->to = packet->end_addr;
- /* No support for mispredict */
- be = &bs->entries[etmq->last_branch_pos];
- /*
* The FROM address needs to be the instruction before the end of the
* packet at END_ADDR: subtract from END_ADDR the size of the last
* instruction: 4 bytes.
*/
- be->from = etmq->prev_packet->end_addr - 4;
- be->to = etmq->packet->start_addr;
- /* No support for mispredict. */ be->flags.mispred = 0; be->flags.predicted = 1;
@@ -721,13 +741,13 @@ static struct cs_etm_cache_entry *cs_etm__cache_lookup(struct dso *dso, } #endif -static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
struct cs_etm_packet *packet)
+static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq) { int ret = 0; struct cs_etm_auxtrace *etm = etmq->etm; union perf_event *event = etmq->event_buf; struct perf_sample sample = {.ip = 0,};
- struct cs_etm_packet *packet = etmq->packet; uint64_t start_addr = packet->start_addr; uint64_t end_addr = packet->end_addr;
@@ -941,17 +961,18 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) {
//const struct cs_etm_state *state = etmq->state;
struct cs_etm_packet packet;
//struct cs_etm_auxtrace *etm = etmq->etm;
int err;
- struct cs_etm_auxtrace *etm = etmq->etm;
- struct cs_etm_packet *tmp;
- int err = cs_etm_decoder__get_packet(etmq->decoder, etmq->packet);
err = cs_etm_decoder__get_packet(etmq->decoder,&packet);
// if there is no sample, it returns err = -1, no real error
- /*
* If there is no sample, it returns err = -1, no real error.
*/
if (err)
	return err;
- if (etmq->etm->synth_opts.last_branch) {
*cpu = packet.cpu;
- if (etm->synth_opts.last_branch) {
*cpu = etmq->packet->cpu;
/* * FIXME: as the trace sampling does not work for now, (for @@ -961,17 +982,30 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) * should be rewritten as "if reached sampling period". */ if (etmq->last_branch_rb->nr ==
etmq->etm->synth_opts.last_branch_sz) {
err = cs_etm__synth_instruction_sample(etmq, &packet);
etm->synth_opts.last_branch_sz) {
err = cs_etm__synth_instruction_sample(etmq);
if (err)
	return err;
}
cs_etm__update_last_branch_rb(etmq, &packet);
- } else if (packet.sample_type & CS_ETM_RANGE) {
err = cs_etm__synth_instruction_sample(etmq,&packet);
if (err)
return err;
}
/*
* Record a branch when the last instruction in PREV_PACKET is a
* branch.
*/
if (etmq->prev_packet->last_instruction_is_branch)
cs_etm__update_last_branch_rb(etmq);
/*
* Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for
* the next incoming packet.
*/
tmp = etmq->packet;
etmq->packet = etmq->prev_packet;
etmq->prev_packet = tmp;
- } else if (etmq->packet->sample_type & CS_ETM_RANGE) {
err = cs_etm__synth_instruction_sample(etmq);
if (err)
return err;
- } return 0;
} @@ -1016,15 +1050,14 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp) * buffer at the end of the trace. */ if (etmq->etm->synth_opts.last_branch) {
struct cs_etm_packet packet;
struct branch_stack *bs = etmq->last_branch_rb;
struct branch_entry *be = &bs->entries[etmq->last_branch_pos];
packet.cpu = cpu;
packet.start_addr = be->from;
packet.end_addr = be->to;
etmq->packet->cpu = cpu;
etmq->packet->start_addr = be->to;
etmq->packet->end_addr = be->to + 4;
Just like you did above, please add a comment that explains the "+ 4".
err = cs_etm__synth_instruction_sample(etmq, &packet);
err = cs_etm__synth_instruction_sample(etmq);
if (err)
	return err;
}
-- 2.7.4
On Wed, May 24, 2017 at 11:36 AM, Mathieu Poirier mathieu.poirier@linaro.org wrote:
Are the instructions in the autoFDO section of the HOWTO.md on GitHub sufficient to test this or there is another way?
Here is how I tested it: (supposing that perf.data contains an ETM trace)
# perf inject -i perf.data -o inj --itrace=il64 --strip # perf report -i inj -D &> dump
and I inspected the addresses from the last branch stack in the output dump with the addresses of the disassembled program from:
# objdump -d sort
On Wed, 24 May 2017 12:48:04 -0500 Sebastian Pop sebpop@gmail.com wrote:
On Wed, May 24, 2017 at 11:36 AM, Mathieu Poirier mathieu.poirier@linaro.org wrote:
Are the instructions in the autoFDO section of the HOWTO.md on GitHub sufficient to test this or there is another way?
Here is how I tested it: (supposing that perf.data contains an ETM trace)
# perf inject -i perf.data -o inj --itrace=il64 --strip # perf report -i inj -D &> dump
and I inspected the addresses from the last branch stack in the output dump with the addresses of the disassembled program from:
# objdump -d sort
Re-running the AutoFDO process with these two patches continue to make the resultant executable perform worse, however:
$ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5306 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5304 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5851 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5889 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5888 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5318 ms
The gcov file generated from the inj.data (no matter whether it's --itrace=il64 or --itrace=i100usle) still looks wrong:
$ ~/git/autofdo/dump_gcov -gcov_version=1 sort-O3.gcov sort_array total:19309128 head:0 0: 0 1: 0 5: 0 6: 0 7.1: 0 7.3: 0 8.3: 0 15: 2 16: 2 17: 2 10: start total:0 1: 0 11: bubble_sort total:19309119 2: 1566 4: 6266668 5: 6071341 7: 6266668 9: 702876 12: stop total:3 2: 0 3: 1 4: 1 5: 1 main total:1 head:0 0: 0 2: 0 4: 1 1: cmd_line total:0 3: 0 4: 0 5: 0 6: 0
Whereas the one generated by intel-pt run looks correct, showing the swap (11: bubble_sort 7,8) as executed less times:
kim@juno sort-etm$ ~/git/autofdo/dump_gcov -gcov_version=1 ../sort-O3.gcov sort_array total:105658 head:0 0: 0 5: 0 6: 0 7.1: 0 7.3: 0 8.3: 0 16: 0 17: 0 1: printf total:0 2: 0 10: start total:0 1: 0 11: bubble_sort total:105658 2: 14 4: 28740 5: 28628 7: 9768 8: 9768 9: 28740 12: stop total:0 2: 0 3: 0 4: 0 5: printf total:0 2: 0 15: printf total:0 2: 0
I have to run the 'perf inject' on the x86 host because of the aforementioned:
0x350 [0x50]: failed to process type: 1
problem when trying to run it natively on the aarch64 target.
However, it doesn't matter whether I run the create_gcov - like so btw:
~/git/autofdo/create_gcov --binary=sort-O3 --profile=inj.data --gcov=sort-O3.gcov -gcov_version=1
on the x86 host or the aarch64 target: I still get the same (negative performance) results.
As Sebastian asked, if I take the intel-pt sourced inject generated .gcov onto the target and rebuild sort, the performance improves:
$ gcc -g -O3 -fauto-profile=../sort-O3.gcov ./sort.c -o ./sort-O3-autofdo $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5309 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5310 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 4443 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 4443 ms
And if I take the ETM-generated gcov and use that to build a new x86_64 binary, it indeed performs worse on x86_64 also:
$ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1502 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1500 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1501 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1907 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1893 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1907 ms
Kim
Hi,
Tried out Sebastians patches and got some similarities to Kim but a couple of differences and some interesting results if you look at the disassemble of the resulting routines.
So as per the AutoFDO instructions I built a sort program with no optimisations and debug: gcc -g sort.c -o sort This I profiled on juno with 3000 iterations
The resulting disassembly of the bubble_sort routine is in bubble-sorts-disass.txt and the dump gcov profile is below... -------------------------------- bubble_sort total:33987051 head:0 0: 0 1: 0 2: 2839 3: 2839 4: 2839 4.1: 8522673 4.2: 8519834 5: 8517035 6: 2104748 7: 2104748 8: 2104748 9: 2104748 13: 0 ------------------------------- So in my view - the swap lines (6:-9:) - see attached sort.c, are run less than the enclosing loop (2:-4:,4.1:-5:) - which is what Kim observed with the intel version. The synthesized LBR records looked reasonable from comparison with the disassembly too.
Trying out the O3 and O3-autofdo from this profile resulted in O3 running marginally faster, but both faster than unoptimised debug.
So now look at the disassemblies from the -O3 and -autofdo-O3 versions of the sort routine [bubble-sorts-disass.txt again]. Both appear to define a bubble_sort routine, but embed the same / similar code into sort_array. Unsurprisingly the O3 version is considerably more compact - hence it runs faster. I have no idea what the autofdo version is up to, but I cannot see how the massive expansion of the routine with compare and jump tables is going to help.
So perhaps:- 1) the LBR stacks are still not correct - though code and dump inspection might suggest otherwise - are there features in the intel LBR we are not yet synthesizing? 2) There is some adverse interaction with the profiles we are generating and the autofdo code generation. 3) The amount of coverage in the file is hitting the process - looking at gcov above then we only have trace from the bubble sort routine. I did reduce the number of iterations to get more of the program captured in coverage but this did not seem to make a difference. Mike
On 25 May 2017 at 05:12, Kim Phillips kim.phillips@arm.com wrote:
On Wed, 24 May 2017 12:48:04 -0500 Sebastian Pop sebpop@gmail.com wrote:
On Wed, May 24, 2017 at 11:36 AM, Mathieu Poirier mathieu.poirier@linaro.org wrote:
Are the instructions in the autoFDO section of the HOWTO.md on GitHub sufficient to test this or there is another way?
Here is how I tested it: (supposing that perf.data contains an ETM trace)
# perf inject -i perf.data -o inj --itrace=il64 --strip # perf report -i inj -D &> dump
and I inspected the addresses from the last branch stack in the output dump with the addresses of the disassembled program from:
# objdump -d sort
Re-running the AutoFDO process with these two patches continue to make the resultant executable perform worse, however:
$ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5306 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5304 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5851 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5889 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 5888 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5318 ms
The gcov file generated from the inj.data (no matter whether it's --itrace=il64 or --itrace=i100usle) still looks wrong:
$ ~/git/autofdo/dump_gcov -gcov_version=1 sort-O3.gcov sort_array total:19309128 head:0 0: 0 1: 0 5: 0 6: 0 7.1: 0 7.3: 0 8.3: 0 15: 2 16: 2 17: 2 10: start total:0 1: 0 11: bubble_sort total:19309119 2: 1566 4: 6266668 5: 6071341 7: 6266668 9: 702876 12: stop total:3 2: 0 3: 1 4: 1 5: 1 main total:1 head:0 0: 0 2: 0 4: 1 1: cmd_line total:0 3: 0 4: 0 5: 0 6: 0
Whereas the one generated by intel-pt run looks correct, showing the swap (11: bubble_sort 7,8) as executed less times:
kim@juno sort-etm$ ~/git/autofdo/dump_gcov -gcov_version=1 ../sort-O3.gcov sort_array total:105658 head:0 0: 0 5: 0 6: 0 7.1: 0 7.3: 0 8.3: 0 16: 0 17: 0 1: printf total:0 2: 0 10: start total:0 1: 0 11: bubble_sort total:105658 2: 14 4: 28740 5: 28628 7: 9768 8: 9768 9: 28740 12: stop total:0 2: 0 3: 0 4: 0 5: printf total:0 2: 0 15: printf total:0 2: 0
I have to run the 'perf inject' on the x86 host because of the aforementioned:
0x350 [0x50]: failed to process type: 1
problem when trying to run it natively on the aarch64 target.
However, it doesn't matter whether I run the create_gcov - like so btw:
~/git/autofdo/create_gcov --binary=sort-O3 --profile=inj.data --gcov=sort-O3.gcov -gcov_version=1
on the x86 host or the aarch64 target: I still get the same (negative performance) results.
As Sebastian asked, if I take the intel-pt sourced inject generated .gcov onto the target and rebuild sort, the performance improves:
$ gcc -g -O3 -fauto-profile=../sort-O3.gcov ./sort.c -o ./sort-O3-autofdo $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5309 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 5310 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 4443 ms $ taskset -c 2 ./sort-O3-autofdo Bubble sorting array of 30000 elements 4443 ms
And if I take the ETM-generated gcov and use that to build a new x86_64 binary, it indeed performs worse on x86_64 also:
$ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1502 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1500 ms $ taskset -c 2 ./sort-O3 Bubble sorting array of 30000 elements 1501 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1907 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1893 ms $ taskset -c 2 ./sort-O3-autofdo-etmgcov Bubble sorting array of 30000 elements 1907 ms
Kim _______________________________________________ CoreSight mailing list CoreSight@lists.linaro.org https://lists.linaro.org/mailman/listinfo/coresight
I also have seen a slow-down when generating the ETM trace with sorting 3000 elements, and using that profile in optimizing the sort of 30000 elements. I see some speedup when generating the perf.data with the same number of sorted elements as the base:
+ gcc -O3 -g3 sort.c -o sort_30k-base -DARRAY_LEN=30000 + export LD_LIBRARY_PATH=/root/etm/decoder-may/decoder/tests/bin/linux-arm64/dbg + LD_LIBRARY_PATH=/root/etm/decoder-may/decoder/tests/bin/linux-arm64/dbg + /root/etm/OpenCSD-may/tools/perf/perf record -e cs_etm/@20070000.etr/u --per-thread ./sort_30k-base Bubble sorting array of 30000 elements 8686 ms [ perf record: Woken up 10 times to write data ] Warning: AUX data lost 7 times out of 14!
[ perf record: Captured and wrote 22.132 MB perf.data ] + /root/etm/OpenCSD-may/tools/perf/perf inject -f -i perf.data -o inj --itrace=i100usl --strip + create_gcov --binary=./sort_30k-base --profile=inj --gcov=sort_30k-base.gcov -gcov_version=1 + dump_gcov sort_30k-base.gcov -gcov_version=1 sort_array total:44830 head:1 0: 1 3.3: 4981 4.3: 4981 7: 4981 1: printf total:4981 2: 4981 6: bubble_sort total:24905 2: 4981 4: 4981 5: 4981 7: 4981 9: 4981 + gcc -O3 -fauto-profile=sort_30k-base.gcov sort.c -o sort_autofdo1 -DARRAY_LEN=30000 + taskset -c 2 ./sort_autofdo1 Bubble sorting array of 30000 elements 5687 ms # taskset -c 2 ./sort_30k-base Bubble sorting array of 30000 elements 5910 ms # taskset -c 2 ./sort_30k-base Bubble sorting array of 30000 elements 5911 ms # taskset -c 2 ./sort_autofdo1 Bubble sorting array of 30000 elements 5686 ms # taskset -c 2 ./sort_autofdo1 Bubble sorting array of 30000 elements 5685 ms
The output of dump_gcov looks wrong. And this is far from the compiler generated profile:
# gcc sort.c -O3 -g -fprofile-generate -o sort_30k-gen-profile -DARRAY_LEN=30000 # taskset -c 2 ./sort_30k-gen-profile Bubble sorting array of 30000 elements 5877 ms # gcc sort.c -O3 -g -fprofile-use -o sort_30k-profile-use -DARRAY_LEN=30000 # taskset -c 2 ./sort_30k-profile-use Bubble sorting array of 30000 elements 4332 ms # taskset -c 2 ./sort_30k-profile-use Bubble sorting array of 30000 elements 4328 ms # taskset -c 2 ./sort_30k-profile-use Bubble sorting array of 30000 elements 4331 ms
It looks like the profile we generate from the ETM has some problem. The compiler can generate much better code with a correct profile. The profile from x86 also gets better performance, though it does not match the compiler generated profile:
# taskset -c 2 ./sort_autofdo_x86 Bubble sorting array of 30000 elements 4971 ms
By popular demand, I started debugging this problem again.
With the two patches that I posted earlier, the traces seem correct with the exception of a few "holes" where the trace seems to jump over a few instructions that are not reported in the trace, creating jumps that do not exist in the control flow graph of the code.
The nested loop for bubble sort is:
4008a0: 9100e3a0 add x0, x29, #0x38 4008a4: 52800004 mov w4, #0x0 // #0 4008a8: 29400402 ldp w2, w1, [x0] 4008ac: 6b02003f cmp w1, w2 4008b0: 5400006a b.ge 4008bc <sort_array+0x84> 4008b4: 52800024 mov w4, #0x1 // #1 4008b8: 29000801 stp w1, w2, [x0] 4008bc: 91001000 add x0, x0, #0x4 4008c0: eb00007f cmp x3, x0 4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70> 4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
..... 34: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 35: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 36: 00000000004008b0 -> 00000000004008a8 0 cycles P 0
edge #36 does not exist in the code: the trace is not correct here. 4008b0 is "b.ge 4008bc" and should either jump to 4008bc or fall through to the next instruction 4008b4, and the trace wrongly jumps to 4008a8.
Several hundred jumps later, we see this following sequence:
..... 40: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 41: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 42: 00000000004008c4 -> 00000000004008b4 0 cycles P 0 ..... 43: 00000000004008c4 -> 00000000004008a8 0 cycles P 0
where edge #42 is not correct either: 4008c4 should either branch to 4008a8 or fall through to 4008c8.
Maybe these inconsistencies are due to interruptions in trace recordings? I think that these interruptions could not be avoided in trace collections.
Dehao, could these wrong edges be fixed in the compiler when reading the coverage file? Thanks, Sebastian
On Tue, Sep 19, 2017 at 1:02 PM, Sebastian Pop sebpop@gmail.com wrote:
By popular demand, I started debugging this problem again.
With the two patches that I posted earlier, the traces seem correct with the exception of a few "holes" where the trace seems to jump over a few instructions that are not reported in the trace, creating jumps that do not exist in the control flow graph of the code.
The nested loop for bubble sort is:
4008a0: 9100e3a0 add x0, x29, #0x38 4008a4: 52800004 mov w4, #0x0 // #0 4008a8: 29400402 ldp w2, w1, [x0] 4008ac: 6b02003f cmp w1, w2 4008b0: 5400006a b.ge 4008bc <sort_array+0x84> 4008b4: 52800024 mov w4, #0x1 // #1 4008b8: 29000801 stp w1, w2, [x0] 4008bc: 91001000 add x0, x0, #0x4 4008c0: eb00007f cmp x3, x0 4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70> 4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
..... 34: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 35: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 36: 00000000004008b0 -> 00000000004008a8 0 cycles P 0
edge #36 does not exist in the code: the trace is not correct here. 4008b0 is "b.ge 4008bc" and should either jump to 4008bc or fall through to the next instruction 4008b4, and the trace wrongly jumps to 4008a8.
Several hundred jumps later, we see this following sequence:
..... 40: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 41: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 42: 00000000004008c4 -> 00000000004008b4 0 cycles P 0 ..... 43: 00000000004008c4 -> 00000000004008a8 0 cycles P 0
where edge #42 is not correct either: 4008c4 should either branch to 4008a8 or fall through to 4008c8.
Maybe these inconsistencies are due to interruptions in trace recordings? I think that these interruptions could not be avoided in trace collections.
Dehao, could these wrong edges be fixed in the compiler when reading the coverage file?
I cannot see an easy way for the compiler/create_gcov tool to cover these issues. Why can't the trace collection tool fix these issues? Looks like a bug to me.
Thanks, Dehao
Thanks, Sebastian
On Tue, Sep 19, 2017 at 3:16 PM, Dehao Chen dehao@google.com wrote:
On Tue, Sep 19, 2017 at 1:02 PM, Sebastian Pop sebpop@gmail.com wrote:
By popular demand, I started debugging this problem again.
With the two patches that I posted earlier, the traces seem correct with the exception of a few "holes" where the trace seems to jump over a few instructions that are not reported in the trace, creating jumps that do not exist in the control flow graph of the code.
The nested loop for bubble sort is:
4008a0: 9100e3a0 add x0, x29, #0x38 4008a4: 52800004 mov w4, #0x0 // #0 4008a8: 29400402 ldp w2, w1, [x0] 4008ac: 6b02003f cmp w1, w2 4008b0: 5400006a b.ge 4008bc <sort_array+0x84> 4008b4: 52800024 mov w4, #0x1 // #1 4008b8: 29000801 stp w1, w2, [x0] 4008bc: 91001000 add x0, x0, #0x4 4008c0: eb00007f cmp x3, x0 4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70> 4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
..... 34: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 35: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 36: 00000000004008b0 -> 00000000004008a8 0 cycles P 0
edge #36 does not exist in the code: the trace is not correct here. 4008b0 is "b.ge 4008bc" and should either jump to 4008bc or fall through to the next instruction 4008b4, and the trace wrongly jumps to 4008a8.
Several hundred jumps later, we see this following sequence:
..... 40: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 41: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 42: 00000000004008c4 -> 00000000004008b4 0 cycles P 0 ..... 43: 00000000004008c4 -> 00000000004008a8 0 cycles P 0
where edge #42 is not correct either: 4008c4 should either branch to 4008a8 or fall through to 4008c8.
Maybe these inconsistencies are due to interruptions in trace recordings? I think that these interruptions could not be avoided in trace collections.
Dehao, could these wrong edges be fixed in the compiler when reading the coverage file?
I cannot see an easy way for the compiler/create_gcov tool to cover these issues. Why can't the trace collection tool fix these issues? Looks like a bug to me.
My thinking was that the compiler knows that there are no edges between the blocks at these addresses and may just ignore the counts at these addresses.
Maybe we can figure out why this pattern occurs and try to solve it in either perf inject or in the decoder? The pattern looks very regular.
These first two errors occur at a distance of 389 branches, the next error occurs after again 389 branches. If we call the first incorrect jump "A" and the second incorrect jump "B", we have this pattern:
A, 389 correct jumps, B, 389 correct jumps, A, 389 correct jumps, B,
There are 343 occurrences of A and 322 of B in a trace of sorting 3000 elements.
I agree. If we can fix the issue upstream, we don't want to have hacks downstream to patch the issue.
Dehao
On Tue, Sep 19, 2017 at 2:59 PM, Sebastian Pop sebpop@gmail.com wrote:
On Tue, Sep 19, 2017 at 3:16 PM, Dehao Chen dehao@google.com wrote:
On Tue, Sep 19, 2017 at 1:02 PM, Sebastian Pop sebpop@gmail.com wrote:
By popular demand, I started debugging this problem again.
With the two patches that I posted earlier, the traces seem correct with the exception of a few "holes" where the trace seems to jump over a few instructions that are not reported in the trace, creating jumps that do not exist in the control flow graph of the code.
The nested loop for bubble sort is:
4008a0: 9100e3a0 add x0, x29, #0x38 4008a4: 52800004 mov w4, #0x0 // #0 4008a8: 29400402 ldp w2, w1, [x0] 4008ac: 6b02003f cmp w1, w2 4008b0: 5400006a b.ge 4008bc <sort_array+0x84> 4008b4: 52800024 mov w4, #0x1 // #1 4008b8: 29000801 stp w1, w2, [x0] 4008bc: 91001000 add x0, x0, #0x4 4008c0: eb00007f cmp x3, x0 4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70> 4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
..... 34: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 35: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 36: 00000000004008b0 -> 00000000004008a8 0 cycles P 0
edge #36 does not exist in the code: the trace is not correct here. 4008b0 is "b.ge 4008bc" and should either jump to 4008bc or fall through to the next instruction 4008b4, and the trace wrongly jumps to 4008a8.
Several hundred jumps later, we see this following sequence:
..... 40: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 41: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 42: 00000000004008c4 -> 00000000004008b4 0 cycles P 0 ..... 43: 00000000004008c4 -> 00000000004008a8 0 cycles P 0
where edge #42 is not correct either: 4008c4 should either branch to 4008a8 or fall through to 4008c8.
Maybe these inconsistencies are due to interruptions in trace recordings? I think that these interruptions could not be avoided in trace collections.
Dehao, could these wrong edges be fixed in the compiler when reading the coverage file?
I cannot see an easy way for the compiler/create_gcov tool to cover these issues. Why can't the trace collection tool fix these issues? Looks like a bug to me.
My thinking was that the compiler knows that there are no edges between the blocks at these addresses and may just ignore the counts at these addresses.
Maybe we can figure out why this pattern occurs and try to solve it in either perf inject or in the decoder? The pattern looks very regular.
These first two errors occur at a distance of 389 branches, the next error occurs after again 389 branches. If we call the first incorrect jump "A" and the second incorrect jump "B", we have this pattern:
A, 389 correct jumps, B, 389 correct jumps, A, 389 correct jumps, B,
There are 343 occurrences of A and 322 of B in a trace of sorting 3000 elements.
Hi Sebastian,
I agree that this is either a decoder or inject issue.
First, can you confirm you are using version 0.7.3 of the decoder - I recently fixed an addressing bug in there.
Otherwise there are a couple of possibilities.... A) an as yet undiscovered decoder bug. B) gaps in the trace that are not being communicated correctly making inject assume continuous trace when it is not. C) some other misunderstanding/misinterpretation between decoder and perf inject.
Whichever it is I need to look at the raw trace data alongside the inject output at one of the A/B points and follow the packet => decode => inject flow to see why we get a bad address value.
Can you send me the capture you are using to create the examples you quote above - plus instructions on how to reproduce the output you were getting. I'll then up the logging on the decoder and walk through the trace decode path.
Regards
Mike On Tue, 19 Sep 2017 at 23:05, Dehao Chen dehao@google.com wrote:
I agree. If we can fix the issue upstream, we don't want to have hacks downstream to patch the issue.
Dehao
On Tue, Sep 19, 2017 at 2:59 PM, Sebastian Pop sebpop@gmail.com wrote:
On Tue, Sep 19, 2017 at 3:16 PM, Dehao Chen dehao@google.com wrote:
On Tue, Sep 19, 2017 at 1:02 PM, Sebastian Pop sebpop@gmail.com wrote:
By popular demand, I started debugging this problem again.
With the two patches that I posted earlier, the traces seem correct with the exception of a few "holes" where the trace seems to jump over a few instructions that are not reported in the trace, creating jumps that do not exist in the control flow graph of the code.
The nested loop for bubble sort is:
4008a0: 9100e3a0 add x0, x29, #0x38 4008a4: 52800004 mov w4, #0x0 // #0 4008a8: 29400402 ldp w2, w1, [x0] 4008ac: 6b02003f cmp w1, w2 4008b0: 5400006a b.ge 4008bc <sort_array+0x84> 4008b4: 52800024 mov w4, #0x1 // #1 4008b8: 29000801 stp w1, w2, [x0] 4008bc: 91001000 add x0, x0, #0x4 4008c0: eb00007f cmp x3, x0 4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70> 4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
..... 34: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 35: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 36: 00000000004008b0 -> 00000000004008a8 0 cycles P 0
edge #36 does not exist in the code: the trace is not correct here. 4008b0 is "b.ge 4008bc" and should either jump to 4008bc or fall through to the next instruction 4008b4, and the trace wrongly jumps to 4008a8.
Several hundred jumps later, we see this following sequence:
..... 40: 00000000004008c4 -> 00000000004008a8 0 cycles P 0 ..... 41: 00000000004008b0 -> 00000000004008b4 0 cycles P 0 ..... 42: 00000000004008c4 -> 00000000004008b4 0 cycles P 0 ..... 43: 00000000004008c4 -> 00000000004008a8 0 cycles P 0
where edge #42 is not correct either: 4008c4 should either branch to 4008a8 or fall through to 4008c8.
Maybe these inconsistencies are due to interruptions in trace recordings? I think that these interruptions could not be avoided in trace collections.
Dehao, could these wrong edges be fixed in the compiler when reading the coverage file?
I cannot see an easy way for the compiler/create_gcov tool to cover these issues. Why can't the trace collection tool fix these issues? Looks like a bug to me.
My thinking was that the compiler knows that there are no edges between the blocks at these addresses and may just ignore the counts at these addresses.
Maybe we can figure out why this pattern occurs and try to solve it in either perf inject or in the decoder? The pattern looks very regular.
These first two errors occur at a distance of 389 branches, the next error occurs after again 389 branches. If we call the first incorrect jump "A" and the second incorrect jump "B", we have this pattern:
A, 389 correct jumps, B, 389 correct jumps, A, 389 correct jumps, B,
There are 343 occurrences of A and 322 of B in a trace of sorting 3000 elements.
Hi Mike,
On 09/19/2017 06:06 PM, Mike Leach wrote:
Hi Sebastian,
I agree that this is either a decoder or inject issue.
First, can you confirm you are using version 0.7.3 of the decoder - I recently fixed an addressing bug in there.
I have updated my decoder as of this morning master branch of OpenCSD git: commit 5eb5e0e850339fe42a230c9d5ff93cd37cf9ae36 Author: Mike Leach mike.leach@linaro.org Date: Fri Sep 8 14:03:16 2017 +0100
I have applied the attached patch to remove two compile warnings.
Otherwise there are a couple of possibilities.... A) an as yet undiscovered decoder bug. B) gaps in the trace that are not being communicated correctly making inject assume continuous trace when it is not. C) some other misunderstanding/misinterpretation between decoder and perf inject.
Whichever it is I need to look at the raw trace data alongside the inject output at one of the A/B points and follow the packet => decode => inject flow to see why we get a bad address value.
Can you send me the capture you are using to create the examples you quote above - plus instructions on how to reproduce the output you were getting. I'll then up the logging on the decoder and walk through the trace decode path.
Here are the steps that I followed on a Juno-r0 machine:
$ gcc -O3 -g3 sort.c -o sort_3k-base -DARRAY_LEN=3000 $ cat sort.c #include <stdio.h> #include <stdlib.h> #include <sys/time.h> #ifndef ARRAY_LEN #define ARRAY_LEN 30000 #endif
static struct timeval tm1;
static inline void start() { gettimeofday(&tm1, NULL); }
static inline void stop() { struct timeval tm2; gettimeofday(&tm2, NULL); unsigned long long t = 1000 * (tm2.tv_sec - tm1.tv_sec) +\ (tm2.tv_usec - tm1.tv_usec) / 1000; printf("%llu ms\n", t); }
void bubble_sort (int *a, int n) { int i, t, s = 1; while (s) { s = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { t = a[i]; a[i] = a[i - 1]; a[i - 1] = t; s = 1; } } } }
void sort_array() { printf("Bubble sorting array of %d elements\n", ARRAY_LEN); int data[ARRAY_LEN], i; for(i=0; i<ARRAY_LEN; ++i){ data[i] = rand(); } bubble_sort(data, ARRAY_LEN); }
int main(){ start(); sort_array(); stop(); return 0; }
$ export LD_LIBRARY_PATH=.../decoder/tests/bin/linux-arm64/rel $ perf record -e cs_etm/@20070000.etr/u --per-thread ./sort_3k-base $ perf inject -f -i perf.data -o inj --itrace=i100usl --strip $ tools/perf/perf report -D -i inj &> dump
The dump still shows the wrong jumps with the new decoder: $ grep '00000000004008c4 -> 00000000004008b4' dump | wc -l 603 $ grep '00000000004008b0 -> 00000000004008a8' dump | wc -l 723
The trace of the new decoder is longer than the trace I got yesterday with the old decoder.
Sebastian
Hi Mike,
The issue seems to be that the decoder->packet_buffer contains twice the same start and end address in two consecutive entries:
(gdb) p etmq->decoder->packet_buffer[etmq->decoder->head] $300 = {sample_type = CS_ETM_RANGE, start_addr = 4196520, end_addr = 4196532, last_instruction_is_branch = true, exc = false, exc_ret = false, cpu = 1} (gdb) p etmq->decoder->packet_buffer[etmq->decoder->head - 1] $301 = {sample_type = CS_ETM_RANGE, start_addr = 4196520, end_addr = 4196532, last_instruction_is_branch = true, exc = false, exc_ret = false, cpu = 1} (gdb) p etmq->decoder->head $302 = 550
This leads to the wrong jump computed as follows: from prev_packet(at head - 1)->end_addr - 4 to current_packet(at head)->start_addr i.e., 4196532 - 4 to 4196520 in hex: 0x4008b0 to 0x4008a8
I will try to understand why decoder->packet_buffer contains the duplicate entries.
Sebastian
On Tue, May 23, 2017 at 09:09:30AM -0500, Sebastian Pop wrote:
ARM ETM traces contain packets of executed instructions. The addresses recorded in the trace correspond to the beginning and one instruction past the end of the block of executed instructions. In order to record the address and destination of executed branches, keep track of two packets: the jump occurs from the last executed instruction of the previous packet to the address of the first instruction in the current packet.
Signed-off-by: Sebastian Pop s.pop@samsung.com Signed-off-by: Brian Rzycki b.rzycki@samsung.com
tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 12 ++++ tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 1 + tools/perf/util/cs-etm.c | 91 +++++++++++++++++-------- 3 files changed, 75 insertions(+), 29 deletions(-)
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index b4cbf4c..6164bfe 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -143,6 +143,18 @@ static int cs_etm_decoder__buffer_packet(struct cs_etm_decoder *decoder, decoder->packet_buffer[et].sample_type = sample_type; decoder->packet_buffer[et].start_addr = elem->st_addr; decoder->packet_buffer[et].end_addr = elem->en_addr;
+	switch (elem->last_i_type) {
+	case OCSD_INSTR_BR:
+	case OCSD_INSTR_BR_INDIRECT:
+		decoder->packet_buffer[et].last_instruction_is_branch = true;
+		break;
+	case OCSD_INSTR_OTHER:
+	case OCSD_INSTR_ISB:
+	case OCSD_INSTR_DSB_DMB:
+	default:
+		decoder->packet_buffer[et].last_instruction_is_branch = false;
+		break;
+	}
 	decoder->packet_buffer[et].exc = false;
 	decoder->packet_buffer[et].exc_ret = false;
 	decoder->packet_buffer[et].cpu = *((int*)inode->priv);
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index d15a288..4fd5a9b 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -49,6 +49,7 @@ struct cs_etm_packet { enum cs_etm_sample_type sample_type; uint64_t start_addr; uint64_t end_addr;
+	bool last_instruction_is_branch;
 	bool exc;
 	bool exc_ret;
 	int cpu;
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 8270203..c99b728 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -104,6 +104,10 @@ struct cs_etm_queue { * inserted. */ size_t last_branch_pos;
+	/*
+	 * A pointer to the last and current decoded packets.
+	 */
+	struct cs_etm_packet *prev_packet, *packet;
}; static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq); @@ -226,6 +230,8 @@ static void cs_etm__free_queue(void *priv) zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb);
+	zfree(&etmq->prev_packet);
+	zfree(&etmq->packet);
 	zfree(&etmq->chain);
 	free(etmq);
} @@ -392,6 +398,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, if (etm->synth_opts.last_branch) { size_t sz = sizeof(struct branch_stack);
size_t szp = sizeof(struct cs_etm_packet);
sz += etm->synth_opts.last_branch_sz * sizeof(struct branch_entry); @@ -401,6 +408,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, etmq->last_branch_rb = zalloc(sz); if (!etmq->last_branch_rb) goto out_free;
+		etmq->prev_packet = zalloc(szp);
+		if (!etmq->prev_packet)
+			goto out_free;
+
+		etmq->packet = zalloc(szp);
+		if (!etmq->packet)
+			goto out_free;
 	}
etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); @@ -451,6 +464,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, zfree(&etmq->event_buf); zfree(&etmq->last_branch); zfree(&etmq->last_branch_rb);
+	zfree(&etmq->prev_packet);
+	zfree(&etmq->packet);
 	zfree(&etmq->chain);
 	free(etmq);
 	return NULL;
@@ -594,8 +609,7 @@ static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) etmq->last_branch_rb->nr = 0; } -static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
struct cs_etm_packet *packet)
+static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq) { struct branch_stack *bs = etmq->last_branch_rb; struct branch_entry *be; @@ -611,10 +625,16 @@ static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq, etmq->last_branch_pos -= 1;
-	be = &bs->entries[etmq->last_branch_pos];
-	be->from = packet->start_addr;
-	be->to = packet->end_addr;
-	/* No support for mispredict */
+	be = &bs->entries[etmq->last_branch_pos];
+	/*
+	 * The FROM address needs to be the instruction before the end of the
+	 * packet at END_ADDR: substract from END_ADDR the size of the last
+	 * instruction: 4 bytes.
+	 */
...and checkpatch complains about 'substract' being misspelled.
Thanks, Mathieu
+	be->from = etmq->prev_packet->end_addr - 4;
+	be->to = etmq->packet->start_addr;
+	/* No support for mispredict. */
 	be->flags.mispred = 0;
 	be->flags.predicted = 1;
@@ -721,13 +741,13 @@ static struct cs_etm_cache_entry *cs_etm__cache_lookup(struct dso *dso, } #endif -static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
struct cs_etm_packet *packet)
+static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq) { int ret = 0; struct cs_etm_auxtrace *etm = etmq->etm; union perf_event *event = etmq->event_buf; struct perf_sample sample = {.ip = 0,};
- struct cs_etm_packet *packet = etmq->packet; uint64_t start_addr = packet->start_addr; uint64_t end_addr = packet->end_addr;
@@ -941,17 +961,18 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) {
//const struct cs_etm_state *state = etmq->state;
struct cs_etm_packet packet;
//struct cs_etm_auxtrace *etm = etmq->etm;
int err;
- struct cs_etm_auxtrace *etm = etmq->etm;
- struct cs_etm_packet *tmp;
- int err = cs_etm_decoder__get_packet(etmq->decoder, etmq->packet);
err = cs_etm_decoder__get_packet(etmq->decoder,&packet);
// if there is no sample, it returns err = -1, no real error
+	/*
+	 * If there is no sample, it returns err = -1, no real error.
+	 */
 	if (err)
 		return err;
- if (etmq->etm->synth_opts.last_branch) {
*cpu = packet.cpu;
- if (etm->synth_opts.last_branch) {
*cpu = etmq->packet->cpu;
/* * FIXME: as the trace sampling does not work for now, (for @@ -961,17 +982,30 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) * should be rewritten as "if reached sampling period". */ if (etmq->last_branch_rb->nr ==
etmq->etm->synth_opts.last_branch_sz) {
err = cs_etm__synth_instruction_sample(etmq, &packet);
etm->synth_opts.last_branch_sz) {
}err = cs_etm__synth_instruction_sample(etmq); if (err) return err;
cs_etm__update_last_branch_rb(etmq, &packet);
- } else if (packet.sample_type & CS_ETM_RANGE) {
err = cs_etm__synth_instruction_sample(etmq,&packet);
if (err)
return err;
}
+	/*
+	 * Record a branch when the last instruction in PREV_PACKET is a
+	 * branch.
+	 */
+	if (etmq->prev_packet->last_instruction_is_branch)
+		cs_etm__update_last_branch_rb(etmq);
+
+	/*
+	 * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for
+	 * the next incoming packet.
+	 */
+	tmp = etmq->packet;
+	etmq->packet = etmq->prev_packet;
+	etmq->prev_packet = tmp;
- } else if (etmq->packet->sample_type & CS_ETM_RANGE) {
err = cs_etm__synth_instruction_sample(etmq);
if (err)
return err;
- } return 0;
} @@ -1016,15 +1050,14 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp) * buffer at the end of the trace. */ if (etmq->etm->synth_opts.last_branch) {
struct branch_stack *bs = etmq->last_branch_rb; struct branch_entry *be = &bs->entries[etmq->last_branch_pos];struct cs_etm_packet packet;
packet.cpu = cpu;
packet.start_addr = be->from;
packet.end_addr = be->to;
etmq->packet->cpu = cpu;
etmq->packet->start_addr = be->to;
etmq->packet->end_addr = be->to + 4;
err = cs_etm__synth_instruction_sample(etmq, &packet);
if (err) return err; }err = cs_etm__synth_instruction_sample(etmq);
-- 2.7.4