From 642350c084e9203b3e998821962b003cfcfac076 Mon Sep 17 00:00:00 2001 From: Pantelis Antoniou Date: Sat, 31 Mar 2012 16:40:25 +0000 Subject: [PATCH 2/2] [perf] Add perf sched spr-replay functionality --- tools/perf/builtin-sched.c | 2201 +++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 2185 insertions(+), 16 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index fb8b5f8..e886459 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -22,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + static const char *input_name; static char default_sort_order[] = "avg, max, switch, runtime"; @@ -42,6 +48,9 @@ static u64 sleep_measurement_overhead; static unsigned long nr_tasks; +#define mb() asm volatile ("" : : : "memory") +static u64 bogoloops = 0; + struct sched_atom; struct task_desc { @@ -60,6 +69,11 @@ struct task_desc { sem_t work_done_sem; u64 cpu_usage; + + u64 task_state; + + unsigned int exited : 1; /* exited */ + unsigned int selected : 1; /* part of playback */ }; enum sched_event_type { @@ -67,6 +81,9 @@ enum sched_event_type { SCHED_EVENT_SLEEP, SCHED_EVENT_WAKEUP, SCHED_EVENT_MIGRATION, + SCHED_EVENT_EXIT, + SCHED_EVENT_FORK_PARENT, + SCHED_EVENT_FORK_CHILD, }; struct sched_atom { @@ -77,11 +94,38 @@ struct sched_atom { unsigned long nr; sem_t *wait_sem; struct task_desc *wakee; + struct sched_atom *wakee_event; + u64 task_state; + int pid; + struct task_desc *parent; + struct task_desc *child; + + /* extra */ + unsigned int exited; /* event was generated after task has called exit() */ + char *msg; + + unsigned int waker_count; + struct task_desc **wakers; + struct sched_atom **waker_events; + unsigned long wake_id; }; +static const char *sched_atom_str(const struct sched_atom *atom); + static struct task_desc *pid_to_task[MAX_PID]; +static u64 replay_start_time; +static u64 replay_end_time; static struct task_desc **tasks; +static unsigned long next_wake_id; +static bool preserve_time = false; +static bool dry_run = false; +static bool generate = false; +static int debug = 0; +static bool spr_list = false; +static const char *spr_filename = NULL; +static bool bogoburn = false; +static bool spr_replay = false; static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER; static u64 start_time; @@ -91,6 +135,9 @@ static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER; static unsigned long nr_run_events; static unsigned long nr_sleep_events; static unsigned long nr_wakeup_events; +static unsigned long nr_exit_events; +static unsigned long nr_fork_parent_events; +static unsigned long nr_fork_child_events; static unsigned long nr_sleep_corrections; static unsigned long nr_run_events_optimized; @@ -217,6 +264,120 @@ static void calibrate_sleep_measurement_overhead(void) printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta); } +static u64 bogoloops_measure_single(u64 loops) +{ + u64 cnt; + u64 ns1, ns2; + + /* loop to make sure you start close to the monotonic clock */ + ns2 = get_nsecs(); + while ((ns1 = get_nsecs()) - ns2 == 0) + ; + cnt = loops; + while (cnt-- > 0) + mb(); + ns2 = get_nsecs(); + return ns2 - ns1; +} + +static u64 bogoloops_measure_stable(u64 loops) +{ + int i; + unsigned int j, k; + u64 delta; + u64 delta_samples[16]; /* 16 is an adequate number */ + u64 estimate, maxdiff, new_estimate; + int imin, imax; /* indices of min & max */ + + i = 0; + do { + imin = 0; + imax = 0; + estimate = 0; + for (j = 0; j < ARRAY_SIZE(delta_samples); j++) { + /* measure a single time */ + delta = bogoloops_measure_single(loops); + delta_samples[j] = delta; + if (delta < delta_samples[imin]) + imin = j; + if (delta > delta_samples[imax]) + imax = j; + estimate += delta; + } + estimate /= ARRAY_SIZE(delta_samples); + + maxdiff = estimate >> 3; /* maxdiff is 12.5% of estimate */ + + /* only keep the ones that are with the proper range */ + k = 0; + new_estimate = 0; + for (j = 0; j < ARRAY_SIZE(delta_samples); j++) { + if (delta_samples[j] > estimate) + delta = delta_samples[j] - estimate; + else + delta = estimate - delta_samples[j]; + + /* too bad, continue */ + if (delta > maxdiff) + continue; + k++; + new_estimate += delta_samples[j]; + } + + /* we need at least half the samples to be right */ + if (k >= ARRAY_SIZE(delta_samples) / 2) + return new_estimate / k; + + } while (i++ < 8); /* try for 8 times */ + + /* we have failed to find a stable measurement */ + /* (this is normal for low loops values) */ + return 0; +} + +static void calculate_bogoloops_value(void) +{ + u64 loops, last_loops, new_loops; + u64 sample_period; + u64 ns1, ns2, delta, delta_new, delta_diff; + + /* 1 ms */ + sample_period = 1000000; + + ns1 = get_nsecs(); + loops = 0; + while (((ns2 = get_nsecs()) - ns1) < sample_period) + loops++; + + /* it is guaranteed that the loops value would be lower */ + /* we grow until we get right after the sample period */ + /* then we interpolate for the period */ + do { + last_loops = loops; + delta = bogoloops_measure_stable(loops); + loops *= 2; + } while (delta < sample_period); + + new_loops = ((last_loops * sample_period) + (delta / 2)) / delta; + delta_new = bogoloops_measure_stable(new_loops); + + if (delta_new > sample_period) + delta_diff = delta_new - sample_period; + else + delta_diff = sample_period - delta_new; + + bogoloops = new_loops; +} + +static void bogoburn_nsecs(u64 nsecs) +{ + u64 cnt; + + cnt = (bogoloops * nsecs) / 1000000LLU; + while (cnt-- > 0) + mb(); +} + static struct sched_atom * get_new_event(struct task_desc *task, u64 timestamp) { @@ -227,6 +388,8 @@ get_new_event(struct task_desc *task, u64 timestamp) event->timestamp = timestamp; event->nr = idx; + event->exited = task->exited; + task->nr_events++; size = sizeof(struct sched_atom *) * task->nr_events; task->atoms = realloc(task->atoms, size); @@ -248,7 +411,8 @@ static struct sched_atom *last_event(struct task_desc *task) static void add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration) { - struct sched_atom *event, *curr_event = last_event(task); + struct sched_atom *curr_event = last_event(task); + struct sched_atom *event; /* * optimize an existing RUN event by merging this one @@ -269,6 +433,18 @@ add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration) } static void +add_sched_event_exit(struct task_desc *task, u64 timestamp) +{ + struct sched_atom *event; + + event = get_new_event(task, timestamp); + + event->type = SCHED_EVENT_EXIT; + + nr_exit_events++; +} + +static int add_sched_event_wakeup(struct task_desc *task, u64 timestamp, struct task_desc *wakee) { @@ -281,11 +457,26 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp, wakee_event = last_event(wakee); if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) { targetless_wakeups++; - return; + return 1; /* targetless wakeup */ } + event->wakee_event = wakee_event; + + /* add to the waker array */ + wakee_event->waker_count++; + wakee_event->wakers = realloc(wakee_event->wakers, + wakee_event->waker_count * sizeof(*wakee_event->wakers)); + BUG_ON(wakee_event->wakers == NULL); + wakee_event->wakers[wakee_event->waker_count-1] = task; + + /* add to the waker event array */ + wakee_event->waker_events = realloc(wakee_event->waker_events, + wakee_event->waker_count * sizeof(*wakee_event->waker_events)); + BUG_ON(wakee_event->waker_events == NULL); + wakee_event->waker_events[wakee_event->waker_count-1] = event; + if (wakee_event->wait_sem) { multitarget_wakeups++; - return; + return 2; /* multi target wakup */ } wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem)); @@ -294,19 +485,51 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp, event->wait_sem = wakee_event->wait_sem; nr_wakeup_events++; + + return 0; /* normal wakeup with both waker & wakee */ } static void add_sched_event_sleep(struct task_desc *task, u64 timestamp, - u64 task_state __used) + u64 task_state) { struct sched_atom *event = get_new_event(task, timestamp); event->type = SCHED_EVENT_SLEEP; + event->task_state = task_state; + + event->wake_id = ++next_wake_id; + BUG_ON(next_wake_id == 0); /* do not allow overflow */ nr_sleep_events++; } +static void +add_sched_event_fork_parent(struct task_desc *task, u64 timestamp, int child_pid) +{ + struct sched_atom *event; + + event = get_new_event(task, timestamp); + + event->type = SCHED_EVENT_FORK_PARENT; + event->pid = child_pid; + + nr_fork_parent_events++; +} + +static void +add_sched_event_fork_child(struct task_desc *task, u64 timestamp, int parent_pid) +{ + struct sched_atom *event; + + event = get_new_event(task, timestamp); + + event->type = SCHED_EVENT_FORK_CHILD; + event->pid = parent_pid; + + nr_fork_child_events++; +} + static struct task_desc *register_pid(unsigned long pid, const char *comm) { struct task_desc *task; @@ -315,18 +538,29 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm) task = pid_to_task[pid]; - if (task) + if (task) { + /* update task name */ + if (strcmp(task->comm, comm) != 0) { + if (verbose) + printf("task #%lu with pid %ld updates name from '%s' to '%s'\n", + task->nr, task->pid, task->comm, comm); + strcpy(task->comm, comm); + } return task; + } task = zalloc(sizeof(*task)); task->pid = pid; task->nr = nr_tasks; strcpy(task->comm, comm); + /* * every task starts in sleeping state - this gets ignored * if there's no wakeup pointing to this sleep state: + * NOTE: spr-replay get's awfully messy with this; removed. */ - add_sched_event_sleep(task, 0, 0); + if (!spr_replay) + add_sched_event_sleep(task, 0, 0); pid_to_task[pid] = task; nr_tasks++; @@ -389,6 +623,11 @@ process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom) break; case SCHED_EVENT_MIGRATION: break; + /* unused */ + case SCHED_EVENT_EXIT: + case SCHED_EVENT_FORK_PARENT: + case SCHED_EVENT_FORK_CHILD: + break; default: BUG_ON(1); } @@ -708,6 +947,21 @@ struct trace_fork_event { u32 child_pid; }; +struct trace_exit_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + u32 pid, ppid; + u32 tid, ptid; + u64 time; +}; + + struct trace_migrate_task_event { u32 size; @@ -758,18 +1012,63 @@ struct trace_sched_handler { int cpu, u64 timestamp, struct thread *thread); + + void (*exit_event)(struct trace_exit_event *, + struct machine *machine, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); }; + __used static void dump_event(struct event *event) +{ + printf("dump of event=%p\n", event); + printf(" %-20s %s\n", "name", event->name); + printf(" %-20s %d\n", "id", event->id); + printf(" %-20s %d (%s|%s|%s|%s|%s|%s|%s)\n", "flags", event->flags, + (event->flags & EVENT_FL_ISFTRACE) ? "ISFTRACE" : "", + (event->flags & EVENT_FL_ISPRINT) ? "ISPRINT" : "", + (event->flags & EVENT_FL_ISBPRINT) ? "ISBPRINT" : "", + (event->flags & EVENT_FL_ISFUNC) ? "ISFUNC" : "", + (event->flags & EVENT_FL_ISFUNCENT) ? "ISFUNCENT" : "", + (event->flags & EVENT_FL_ISFUNCRET) ? "ISFUNCRET" : "", + (event->flags & EVENT_FL_FAILED) ? "FAILED" : ""); + printf(" %-20s %s\n", "system", event->system); +} + +__used static void dump_perf_sample(struct perf_sample *sample) +{ + printf("dump of perf_sample=%p\n", sample); + printf(" %-20s %" PRIu64 "\n", "ip", sample->ip); + printf(" %-20s %" PRIu32 "\n", "pid", sample->pid); + printf(" %-20s %" PRIu32 "\n", "tid", sample->tid); + printf(" %-20s %" PRIu64 "\n", "time", sample->time); + printf(" %-20s %" PRIu64 "\n", "addr", sample->addr); + printf(" %-20s %" PRIu64 "\n", "id", sample->id); + printf(" %-20s %" PRIu64 "\n", "stream_id", sample->stream_id); + printf(" %-20s %" PRIu64 "\n", "period", sample->period); + printf(" %-20s %" PRIu32 "\n", "cpu", sample->cpu); + printf(" %-20s %" PRIu32 "\n", "raw_size", sample->raw_size); + printf(" %-20s %p\n", "raw_data", sample->raw_data); +} + +static u64 cpu_last_switched[MAX_CPUS]; +struct task_desc *cpu_last_task[MAX_CPUS]; static void replay_wakeup_event(struct trace_wakeup_event *wakeup_event, struct machine *machine __used, struct event *event, int cpu __used, - u64 timestamp __used, + u64 timestamp, struct thread *thread __used) { struct task_desc *waker, *wakee; + struct sched_atom *waker_event, *wakee_event; + char *s, *e; + char buf[BUFSIZ]; + int ret; if (verbose) { printf("sched_wakeup event %p\n", event); @@ -783,10 +1082,44 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event, waker = register_pid(wakeup_event->common_pid, ""); wakee = register_pid(wakeup_event->pid, wakeup_event->comm); - add_sched_event_wakeup(waker, timestamp, wakee); -} + ret = add_sched_event_wakeup(waker, timestamp, wakee); -static u64 cpu_last_switched[MAX_CPUS]; + waker_event = last_event(waker); + BUG_ON(waker_event == NULL); + + wakee_event = last_event(wakee); + + if (debug > 2) { + s = buf; + e = &buf[ARRAY_SIZE(buf)]; + e[-1] = '\0'; + + snprintf(s, e - s, "wakee %d preempt=%u prio=%" PRIu32 " succ=%" PRIu32 "", + wakeup_event->pid, + (unsigned int)wakeup_event->common_preempt_count, + wakeup_event->prio, wakeup_event->cpu); + e[-1] = '\0'; + s += strlen(s); + + if (ret == 1) { + snprintf(s, e - s, " (targetless )"); + e[-1] = '\0'; + s += strlen(s); + } else if (ret == 2) { + snprintf(s, e - s, " (multitarget)"); + e[-1] = '\0'; + s += strlen(s); + } + + if (wakee_event != NULL) { + snprintf(s, e - s, " (%s)", sched_atom_str(wakee_event)); + e[-1] = '\0'; + s += strlen(s); + } + + waker_event->msg = strdup(buf); + } +} static void replay_switch_event(struct trace_switch_event *switch_event, @@ -796,9 +1129,11 @@ replay_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct task_desc *prev, __used *next; + struct task_desc *prev, *next; + struct sched_atom *prev_atom; u64 timestamp0; s64 delta; + char buf[BUFSIZ]; if (verbose) printf("sched_switch event %p\n", event); @@ -826,9 +1161,20 @@ replay_switch_event(struct trace_switch_event *switch_event, next = register_pid(switch_event->next_pid, switch_event->next_comm); cpu_last_switched[cpu] = timestamp; + cpu_last_task[cpu] = next; add_sched_event_run(prev, timestamp, delta); - add_sched_event_sleep(prev, timestamp, switch_event->prev_state); + add_sched_event_sleep(prev, timestamp, switch_event->prev_state); /* was adding +delta */ + prev->task_state = switch_event->prev_state; + next->task_state = 0; /* task running */ + + prev_atom = last_event(prev); + BUG_ON(prev_atom == NULL); + + if (debug > 2) { + snprintf(buf, sizeof(buf), "switch to %d", switch_event->next_pid); + prev_atom->msg = strdup(buf); + } } @@ -836,22 +1182,65 @@ static void replay_fork_event(struct trace_fork_event *fork_event, struct event *event, int cpu __used, - u64 timestamp __used, + u64 timestamp, struct thread *thread __used) { + struct task_desc *parent, *child; + struct sched_atom *atom; + char buf[BUFSIZ]; + if (verbose) { printf("sched_fork event %p\n", event); printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); } - register_pid(fork_event->parent_pid, fork_event->parent_comm); - register_pid(fork_event->child_pid, fork_event->child_comm); + + parent = register_pid(fork_event->parent_pid, fork_event->parent_comm); + child = register_pid(fork_event->child_pid, fork_event->child_comm); + + add_sched_event_fork_parent(parent, timestamp, fork_event->child_pid); + + atom = last_event(parent); + BUG_ON(atom == NULL); + atom->child = child; + + if (debug > 2) { + snprintf(buf, sizeof(buf), "child %d", fork_event->child_pid); + atom->msg = strdup(buf); + } + + add_sched_event_fork_child(child, timestamp, fork_event->parent_pid); + + atom = last_event(child); + BUG_ON(atom == NULL); + atom->parent = parent; + + if (debug > 2) { + snprintf(buf, sizeof(buf), "parent %d", fork_event->parent_pid); + atom->msg = strdup(buf); + } +} + +static void +replay_exit_event(struct trace_exit_event *exit_event __used, + struct machine *machine __used, + struct event *event __used, + int cpu __used, + u64 timestamp, + struct thread *thread __used) +{ + struct task_desc *task; + + task = register_pid(exit_event->pid, ""); /* should never pick up unkown */ + add_sched_event_exit(task, timestamp); + task->exited = 1; /* mark that this task has exited */ } static struct trace_sched_handler replay_ops = { .wakeup_event = replay_wakeup_event, .switch_event = replay_switch_event, .fork_event = replay_fork_event, + .exit_event = replay_exit_event, }; struct sort_dimension { @@ -1558,8 +1947,23 @@ process_sched_exit_event(struct perf_tool *tool __used, struct machine *machine __used, struct thread *thread __used) { + void *data = sample->raw_data; + struct trace_exit_event exit_event; + if (verbose) printf("sched_exit event %p\n", event); + + FILL_COMMON_FIELDS(exit_event, event, data); + + FILL_FIELD(exit_event, pid, event, data); + FILL_FIELD(exit_event, ppid, event, data); + FILL_FIELD(exit_event, tid, event, data); + FILL_FIELD(exit_event, ptid, event, data); + FILL_FIELD(exit_event, time, event, data); + + if (trace_handler->exit_event) + trace_handler->exit_event(&exit_event, machine, event, + sample->cpu, sample->time, thread); } static void @@ -1624,6 +2028,7 @@ static struct perf_tool perf_sched = { .comm = perf_event__process_comm, .lost = perf_event__process_lost, .fork = perf_event__process_task, + .exit = perf_event__process_task, .ordered_samples = true, }; @@ -1779,7 +2184,7 @@ static void __cmd_replay(void) static const char * const sched_usage[] = { - "perf sched [] {record|latency|map|replay|script}", + "perf sched [] {record|latency|map|replay|script|spr-replay}", NULL }; @@ -1882,6 +2287,1762 @@ static int __cmd_record(int argc, const char **argv) return cmd_record(i, rec_argv, NULL); } +/**************************************************************************/ + +enum task_action_type { + TA_END, + TA_BURN, + TA_SLEEP, + TA_SPAWN, + TA_EXIT, + TA_CLONE_PARENT, + TA_CLONE_CHILD, + TA_WAIT_ID, + TA_SIGNAL_ID +}; + +struct task_action { + enum task_action_type action; + union { + struct { u64 nsecs; } burn; + struct { u64 nsecs; } sleep; + struct { u64 nsecs; } spawn; + struct { int ret; } exit; + struct { int child_pid; } clone_parent; + struct { int parent_pid; } clone_child; + struct { unsigned long id; } wait_id; + struct { unsigned long id; } signal_id; + } u; +}; + +#define T(x) ((u64)((double)(x) * 1E9)) + +struct task { + const char *name; + int pid; + const struct task_action *actions; +}; + +void *work_area; +struct playback *playback; + +struct select_list_entry { + struct list_head node; + int pid; + char *name; +}; + +LIST_HEAD(select_list); + +struct task_run { + struct playback *playback; /* point back to the playback */ + const struct task *task; /* task */ + const struct task_action *current_action; + char name[BUFSIZ]; /* lots of space for a name */ + int id; + pid_t real_pid; + pid_t parent_pid; /* when created by CLONE_PARENT */ + u64 start_time; /* time when the current task starts */ + u64 local_time; /* local time (beginning from 0) */ + int children_count; + + int futex_int; +}; + +struct playback_info { + int task_cnt; + int futex_cnt; + int task_off; + int futex_off; + int work_area_size; +}; + +struct playback { + struct playback_info info; + + u64 origin_time; /* time when the simulation starts */ + int task_count; + struct task_run *task_runs; + int futex_count; + int *futexes; + + unsigned int flags; /* flags */ +#define PF_ABSOLUTE_TIME 1 +#define PF_RELATIVE_TIME 2 +#define PF_TIME_MASK 3 + + int debug_level; + + int spawn_count; /* left to spawn */ + int exited_count; /* left to exit */ + int forked_count; /* initially forked */ +}; + +static inline int playback_absolute_time(const struct playback *p) +{ + return ((p->flags & PF_TIME_MASK) == PF_ABSOLUTE_TIME); +} + +static inline int playback_relative_time(const struct playback *p) +{ + return ((p->flags & PF_TIME_MASK) == PF_RELATIVE_TIME); +} + +static inline void playback_set_absolute_time(struct playback *p) +{ + p->flags = (p->flags & ~PF_TIME_MASK) | PF_ABSOLUTE_TIME; +} + +static inline void playback_set_relative_time(struct playback *p) +{ + p->flags = (p->flags & ~PF_TIME_MASK) | PF_ABSOLUTE_TIME; +} + +static inline void playback_set_debug_level(struct playback *p, int debug_level) +{ + p->debug_level = debug_level; +} + +static const char *task_action_str(u64 local_ts, const struct task_action *ta); +const struct task_action *task_action_current(struct task_run *tr); +void task_action_advance(struct task_run *tr); + +void vtprintf(struct task_run *tr, const char *fmt, va_list ap); +void tprintf(struct task_run *tr, const char *fmt, ...) __attribute__((__format__(__printf__,2,3))); + +void vtprintf(struct task_run *tr, const char *fmt, va_list ap) +{ + struct playback *p = tr->playback; + int n, size; + char *str = NULL; + + if (p->debug_level <= 0) + return; + + size = 256; /* start with 80 chars (40 * 2) */ + do { + size *= 2; + str = realloc(str, size); + BUG_ON(str == NULL); + /* Try to print in the allocated space. */ + n = vsnprintf(str, size, fmt, ap); + } while (n < 0 || n >= size); + + printf("#%2d [%16" PRIu64 " %16" PRIu64 "] %s", tr->id, + tr->local_time, + get_nsecs() - p->origin_time, + str); + free(str); +} + +void tprintf(struct task_run *tr, const char *fmt, ...) +{ + struct playback *p; + va_list ap; + + p = tr->playback; + + if (p->debug_level <= 0) + return; + + va_start(ap, fmt); + vtprintf(tr, fmt, ap); + va_end(ap); +} + + +const struct task_action *task_action_current(struct task_run *tr) +{ + if (tr->current_action->action == TA_END) + return NULL; + return tr->current_action; +} + +void task_action_advance(struct task_run *tr) +{ + if (tr->current_action->action != TA_END) + tr->current_action++; +} + +struct task_action_desc { + unsigned int id; + const char *name; + void (*execute)(struct task_run *tr, const struct task_action *ta); +}; + +static void execute_task(struct task_run *tr) __attribute__((__noreturn__)); + +/* END is not processed */ + +static void execute_spawn(struct task_run *tr __used, const struct task_action *ta __used) +{ + /* we do nothing */ +} + +static void execute_burn(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + u64 t1, dt, target; + + t1 = get_nsecs(); + dt = ta->u.burn.nsecs; + + if (playback_absolute_time(p)) + target = p->origin_time + tr->local_time + dt; + else + target = t1 + dt; + + if (target <= t1) { + tprintf(tr, "%s %" PRIu64 " - SKIPPED; slip of %" PRIu64 "ns\n", + "BURN", ta->u.burn.nsecs, t1 - target); + return; + } + + tprintf(tr, "%s %" PRIu64 ":%" PRIi64 "\n", "BURN", + ta->u.burn.nsecs, (int64_t)(target - t1)); + + if (!bogoburn) + burn_nsecs(target - t1); + else + bogoburn_nsecs(target - t1); + + tr->local_time += dt; +} + +static void execute_sleep(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + u64 t1, dt, target; + + t1 = get_nsecs(); + dt = ta->u.sleep.nsecs; + + if (playback_absolute_time(p)) + target = p->origin_time + tr->local_time + dt; + else + target = t1 + dt; + + if (target <= t1) { + tprintf(tr, "%s %" PRIu64 " - SKIPPED; slip of %" PRIu64 "ns\n", + "SLEEP", ta->u.sleep.nsecs, t1 - target); + return; + } + + tprintf(tr, "%s %" PRIu64 ":%" PRIi64 "\n", "SLEEP", + ta->u.burn.nsecs, (int64_t)(target - t1)); + + sleep_nsecs(target - t1); /* bogomips */ + + tr->local_time += dt; +} + +static void execute_exit(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + struct task_run *tr1; + pid_t real_pid; + int i, status; + + tprintf(tr, "%s\n", "EXIT"); + + while (tr->children_count > 0) { + real_pid = wait(&status); + if (real_pid < 0) { + fprintf(stderr, "panto-perf: Wait failed: %s:%d\n", __func__, __LINE__); + exit(EXIT_FAILURE); + } + if (!WIFEXITED(status)) + continue; + + /* locate child */ + for (i = 0, tr1 = p->task_runs; i < p->task_count; i++, tr1++) + if (real_pid == tr1->real_pid) + break; + assert(i < p->task_count); + + tr->children_count--; + } + exit(ta->u.exit.ret); +} + +static void execute_clone_parent(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + struct task_run *tr1; + int i, pid; + pid_t real_pid; + + pid = ta->u.clone_parent.child_pid; + + tprintf(tr, "%s %d\n", "CLONE_PARENT", ta->u.clone_parent.child_pid); + + /* locate child */ + for (i = 0, tr1 = p->task_runs; i < p->task_count; i++, tr1++) + if (pid == tr1->task->pid) + break; + assert(i < p->task_count); + + tr1->parent_pid = getpid(); + + /* all clones are for now forks */ + real_pid = fork(); + assert(real_pid >= 0); + + if (real_pid == 0) { /* child */ + tr1->real_pid = getpid(); + tr1->start_time = tr->start_time + tr->local_time; + tr1->local_time = tr->local_time; + execute_task(tr1); + } + tr1->real_pid = real_pid; + tr->children_count++; +} + +static void execute_clone_child(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + + (void)p; + + /* nothing */ + tprintf(tr, "%s %d\n", "CLONE_CHILD", ta->u.clone_child.parent_pid); +} + +static void execute_wait_id(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + int *futex_ptr; + u64 t1, dt; + int ret; + + t1 = get_nsecs(); + + tprintf(tr, "%s %lu\n", "WAIT_ID", ta->u.wait_id.id); + + /* wait for someone to wake us up */ + futex_ptr = p->futexes + ta->u.wait_id.id; + while (!__sync_bool_compare_and_swap(futex_ptr, 1, 0)) { + do { + ret = syscall(SYS_futex, futex_ptr, FUTEX_WAIT, 0, NULL, NULL, 0); + } while (ret == EINTR); + assert(ret == 0 || ret == EWOULDBLOCK); + } + + dt = get_nsecs() - t1; + + tr->local_time += dt; +} + +static void execute_signal_id(struct task_run *tr, const struct task_action *ta) +{ + struct playback *p = tr->playback; + int *futex_ptr; + int ret; + + tprintf(tr, "%s %lu\n", "SIGNAL_ID", ta->u.signal_id.id); + + /* signal wake up for the other thread */ + futex_ptr = p->futexes + ta->u.signal_id.id; + if (__sync_bool_compare_and_swap(futex_ptr, 0, 1)) { + ret = syscall(SYS_futex, futex_ptr, FUTEX_WAKE, 0, NULL, NULL, 0); + assert(ret >= 0); + } + + /* local time does not advance */ +} + +static const struct task_action_desc action_table[] = { + [TA_END] = { + .id = TA_END, + .name = "END", + }, + [TA_SPAWN] = { + .id = TA_SPAWN, + .name = "SPAWN", + .execute = execute_spawn, + }, + [TA_BURN] = { + .id = TA_BURN, + .name = "BURN", + .execute = execute_burn, + }, + [TA_SLEEP] = { + .id = TA_SLEEP, + .name = "SLEEP", + .execute = execute_sleep, + }, + [TA_EXIT] = { + .id = TA_EXIT, + .name = "EXIT", + .execute = execute_exit, + }, + [TA_CLONE_PARENT] = { + .id = TA_CLONE_PARENT, + .name = "CLONE_PARENT", + .execute = execute_clone_parent, + }, + [TA_CLONE_CHILD] = { + .id = TA_CLONE_CHILD, + .name = "CLONE_CHILD", + .execute = execute_clone_child, + }, + [TA_WAIT_ID] = { + .id = TA_WAIT_ID, + .name = "WAIT_ID", + .execute = execute_wait_id, + }, + [TA_SIGNAL_ID] = { + .id = TA_SIGNAL_ID, + .name = "SIGNAL_ID", + .execute = execute_signal_id, + }, +}; + +static void execute_task(struct task_run *tr) +{ + const struct task *t; + const struct task_action *ta; + struct playback *p; + + t = tr->task; + assert(t != NULL); + + p = tr->playback; + assert(p != NULL); + + /* copy name from the task and inform */ + snprintf(tr->name, sizeof(tr->name) - 1, ":%s-%d", t->name, t->pid); + tr->name[sizeof(tr->name) - 1] = '\0'; + prctl(PR_SET_NAME, tr->name); + + /* skip over spawn */ + while ((ta = task_action_current(tr)) != NULL && ta->action == TA_SPAWN) { + tr->local_time += ta->u.spawn.nsecs; + task_action_advance(tr); + } + + tprintf(tr, "start: name %s, pid %d\n", t->name, t->pid); + + while ((ta = task_action_current(tr)) != NULL) { + /* verify size */ + assert((unsigned int)ta->action < ARRAY_SIZE(action_table)); + (*action_table[ta->action].execute)(tr, ta); + + task_action_advance(tr); + } + + exit(0); +} + +static void dump_task(const struct task *t) +{ + const struct task_action *ta; + u64 ts; + + printf("TASK: name %s, pid %d\n", t->name, t->pid); + + ts = 0; + for (ta = t->actions; ; ta++) { + + if (ta->action == TA_SPAWN) + ts = ta->u.spawn.nsecs; + + printf("%s\n", task_action_str(ts, ta)); + if (ta->action == TA_END) + break; + + if (ta->action == TA_BURN) + ts += ta->u.burn.nsecs; + else if (ta->action == TA_SLEEP) + ts += ta->u.sleep.nsecs; + } +} + +static void generate_task(const struct task *t) +{ + const struct task_action *ta; + + printf("[%s/%d]\n", t->name, t->pid); + + for (ta = t->actions; ; ta++) { + + switch (ta->action) { + case TA_BURN: + printf("\tburn %" PRIu64 "\n", ta->u.burn.nsecs); + break; + case TA_SLEEP: + printf("\tsleep %" PRIu64 "\n", ta->u.sleep.nsecs); + break; + case TA_SPAWN: + printf("\tspawn %" PRIu64 "\n", ta->u.spawn.nsecs); + break; + case TA_CLONE_PARENT: + printf("\tclone-parent %d\n", ta->u.clone_parent.child_pid); + break; + case TA_CLONE_CHILD: + printf("\tclone-child %d\n", ta->u.clone_child.parent_pid); + break; + case TA_WAIT_ID: + printf("\twait-id %lu\n", ta->u.wait_id.id); + break; + case TA_SIGNAL_ID: + printf("\tsignal-id %lu\n", ta->u.signal_id.id); + break; + case TA_EXIT: + printf("\texit %d\n", ta->u.exit.ret); + break; + case TA_END: + printf("\tend\n"); + default: + break; + } + + if (ta->action == TA_END) + break; + } +} + +static void fill_playback_info(struct playback_info *pi, const struct task * const * tasks_array) +{ + const struct task *t; + const struct task * const *tt; + const struct task_action *ta; + unsigned long id, maxid; + int pagesize; + + /* get page size */ + pagesize = getpagesize(); + + /* make sure it's a power of two */ + assert((pagesize & (pagesize - 1)) == 0); + + /* align work area to 16 bytes */ + pi->work_area_size = ALIGN(sizeof(struct playback), 16); + pi->task_off = pi->work_area_size; + + /* count number of tasks (and find maximum futex id) */ + maxid = -1; + pi->task_cnt = 0; + tt = tasks_array; + while ((t = *tt++) != NULL) { + pi->task_cnt++; + for (ta = t->actions; ta != NULL && ta->action != TA_END; ta++) { + if (ta->action == TA_WAIT_ID) + id = ta->u.wait_id.id; + else if (ta->action == TA_SIGNAL_ID) + id = ta->u.signal_id.id; + else + id = -1; + + if (id > maxid) + maxid = id; + } + } + pi->futex_cnt = maxid + 1; + + /* add the size of the task structs */ + pi->work_area_size += ALIGN(sizeof(struct task_run) * pi->task_cnt, 16); + pi->futex_off = pi->work_area_size; + + /* add the size of the futexes */ + pi->work_area_size += ALIGN(sizeof(int) * pi->futex_cnt, 16); + + /* now align to the pagesize (we have verified that pagesize is a power of two) */ + pi->work_area_size = ALIGN(pi->work_area_size, pagesize); + + printf("#%d tasks, #%d futexes, total work area size %d\n", + pi->task_cnt, pi->futex_cnt, pi->work_area_size); + +} + +static struct playback *playback_create(const struct task * const *tasks_array) +{ + struct playback_info pi; + const struct task * const *tt; + struct task_run *tr; + int i; + struct playback *p; + + assert(tasks_array != NULL); + + /* layout playback structure according to inputs */ + fill_playback_info(&pi, tasks_array); + + /* map the shared memory across all processes */ + work_area = mmap(NULL, pi.work_area_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); + assert(work_area != MAP_FAILED); + + memset(work_area, 0, pi.work_area_size); + + playback = work_area; + + p = playback; + memcpy(&p->info, &pi, sizeof(pi)); + + p->origin_time = 0; + p->task_count = pi.task_cnt; + p->task_runs = work_area + pi.task_off; + p->futex_count = pi.futex_cnt; + p->futexes = work_area + pi.futex_off; + + p->spawn_count = 0; + p->exited_count = 0; + p->forked_count = 0; + + for (i = 0, tr = p->task_runs, tt = tasks_array; i < p->task_count; + i++, tr++, tt++) { + + tr->playback = p; + tr->task = *tt; + tr->current_action = tr->task->actions; + tr->id = i; + tr->real_pid = -1; + tr->start_time = 0; + tr->local_time = 0; + tr->futex_int = 0; + } + + return p; +} + +static void playback_destroy(struct playback *p) +{ + assert(p != NULL); + + munmap(p, p->info.work_area_size); +} + +static void playback_run(struct playback *p) +{ + struct task_run *tr, *tr1; + const struct task *t; + const struct task_action *ta; + int i, status; + pid_t real_pid; + uint64_t curr_nsecs; + + p->origin_time = get_nsecs(); + + p->spawn_count = 0; + for (i = 0; i < p->task_count; i++) { + tr = &p->task_runs[i]; + t = tr->task; + if (t->actions->action == TA_SPAWN) { + p->spawn_count++; + continue; + } + + /* don't bother with children */ + if (t->actions->action == TA_CLONE_CHILD) + continue; + + real_pid = fork(); + assert(real_pid >= 0); + + if (real_pid == 0) { + /* fill up for the child */ + tr->start_time = get_nsecs(); + tr->real_pid = getpid(); + execute_task(tr); + } + tr->real_pid = real_pid; + + p->forked_count++; + } + + curr_nsecs = 0; + while (p->spawn_count > 0) { + tr1 = NULL; + for (i = 0; i < p->task_count; i++) { + tr = &p->task_runs[i]; + t = tr->task; + ta = t->actions; + + if (tr->real_pid >= 0 || ta->action != TA_SPAWN) + continue; + + if (tr1 == NULL || tr1->task->actions->u.spawn.nsecs > ta->u.spawn.nsecs) + tr1 = tr; + } + assert(tr1 != NULL); + + tr = tr1; + ta = t->actions; + sleep_nsecs(ta->u.spawn.nsecs - curr_nsecs); + + real_pid = fork(); + assert(real_pid >= 0); + + if (real_pid == 0) { + /* fill up for the child */ + tr->start_time = get_nsecs(); + tr->real_pid = getpid(); + execute_task(tr); + } + tr->real_pid = real_pid; + + curr_nsecs += (ta->u.spawn.nsecs - curr_nsecs); + p->spawn_count--; + p->forked_count++; + } + + while (p->forked_count > 0) { + real_pid = wait(&status); + if (real_pid < 0) { + fprintf(stderr, "panto-perf: Wait failed: %s:%d\n", __func__, __LINE__); + exit(EXIT_FAILURE); + } + if (!WIFEXITED(status)) + continue; + + for (i = 0; i < p->task_count; i++) { + tr = &p->task_runs[i]; + t = tr->task; + if (tr->real_pid == real_pid) + break; + } + + assert(i < p->task_count); + tr->real_pid = -1; + p->forked_count--; + } +} + +static const char * const spr_replay_usage[] = { + "perf sched spr-replay []", + NULL +}; + +static int parse_select_option(const struct option *opt, const char *str, + int unset __used) +{ + struct list_head *lh = (struct list_head *)opt->value; + struct select_list_entry *sle; + + sle = zalloc(sizeof(*sle)); + BUG_ON(sle == NULL); + + INIT_LIST_HEAD(&sle->node); + if (isdigit(*str)) { + sle->pid = atoi(str); + } else { + sle->pid = -1; + sle->name = strdup(str); + BUG_ON(sle->name == NULL); + } + + list_add_tail(&sle->node, lh); + + return 0; +} + +static const struct option spr_replay_options[] = { + OPT_CALLBACK('s', "select", &select_list, "select", + "Number selects pid, name matches comm name", + parse_select_option), + OPT_UINTEGER('r', "repeat", &replay_repeat, + "repeat the workload replay N times (-1: infinite)"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, + "dump raw trace in ASCII"), + OPT_BOOLEAN('p', "preserve-time", &preserve_time, + "Preserve time (do not erase sleeps while in running state)"), + OPT_BOOLEAN('n', "dry-run", &dry_run, + "do not execute"), + OPT_INCR('d', "debug", &debug, + "be more verbose"), + OPT_BOOLEAN('g', "generate", &generate, + "generate an spr program at stdout"), + OPT_STRING('f', "spr-filename", &spr_filename, "file", + "spr file name (instead of trace)"), + OPT_BOOLEAN('l', "list", &spr_list, + "list tasks & pids"), + OPT_BOOLEAN('b', "bogoburn", &bogoburn, + "burn time using a bogo-mips like busy loop"), + OPT_U64('B', "bogoloops", &bogoloops, + "set bogoloops value directly without re-calculating"), + OPT_END() +}; + +static const char *task_state_str(unsigned int task_state) +{ + static const char * const task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ + }; + const char * const *p = &task_state_array[0]; + unsigned int state = task_state & 511; + + while (state) { + p++; + state >>= 1; + } + return *p; +} + +static void calculate_replay_start_time(void) +{ + struct sched_atom *atom; + struct task_desc *task; + unsigned long i; + + replay_start_time = (u64)-1; /* maximum possible */ + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + + if (task->nr_events == 0) + continue; + + atom = task->atoms[0]; + + if (atom->timestamp < replay_start_time) + replay_start_time = atom->timestamp; + + } + if (replay_start_time == (u64)-1) + replay_start_time = 0; +} + +static void calculate_replay_end_time(void) +{ + struct sched_atom *atom; + struct task_desc *task; + unsigned long i; + u64 ts_end; + + replay_end_time = 0; /* minimum */ + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + + if (task->nr_events == 0) + continue; + + atom = task->atoms[task->nr_events - 1]; + + ts_end = atom->timestamp; + if (atom->type == SCHED_EVENT_RUN) + ts_end += atom->duration; + + if (ts_end > replay_end_time) + replay_end_time = ts_end; + } +} + +static const char *sched_atom_str(const struct sched_atom *atom) +{ + static char buf[BUFSIZ]; + char *s, *e; + unsigned int i; + + if (replay_start_time == 0) + calculate_replay_start_time(); + + if (replay_end_time == 0) + calculate_replay_end_time(); + + s = buf; + e = &buf[ARRAY_SIZE(buf)]; + + snprintf(s, e - s, "\t%014" PRIu64 " ", atom->timestamp - replay_start_time); + e[-1] = '\0'; + s += strlen(s); + + switch (atom->type) { + case SCHED_EVENT_RUN: + snprintf(s, e - s, "%-10s%014" PRIu64, "RUN", + atom->duration); + e[-1] = '\0'; + s += strlen(s); + break; + case SCHED_EVENT_SLEEP: + snprintf(s, e - s, "%-10s %s %d", "SLEEP", + task_state_str(atom->task_state), + atom->waker_count); + e[-1] = '\0'; + s += strlen(s); + for (i = 0; i < atom->waker_count; i++) { + snprintf(s, e - s, " [%ld@%014" PRIu64 "]", + atom->wakers[i]->pid, + atom->waker_events[i]->timestamp - replay_start_time); + e[-1] = '\0'; + s += strlen(s); + } + break; + case SCHED_EVENT_WAKEUP: + snprintf(s, e - s, "%-10s", "WAKEUP"); + e[-1] = '\0'; + s += strlen(s); + if (atom->wakee != NULL && atom->wakee_event != NULL) { + snprintf(s, e - s, " [%ld@%014" PRIu64 "]", + atom->wakee->pid, + atom->wakee_event->timestamp - replay_start_time); + e[-1] = '\0'; + s += strlen(s); + } + break; + case SCHED_EVENT_MIGRATION: + snprintf(s, e - s, "%-10s", "MIGRATION"); + e[-1] = '\0'; + s += strlen(s); + break; + case SCHED_EVENT_EXIT: + snprintf(s, e - s, "%-10s", "EXIT"); + e[-1] = '\0'; + s += strlen(s); + break; + case SCHED_EVENT_FORK_PARENT: + snprintf(s, e - s, "%-10s %d", "FORK_PARENT", atom->pid); + e[-1] = '\0'; + s += strlen(s); + break; + case SCHED_EVENT_FORK_CHILD: + snprintf(s, e - s, "%-10s %d", "FORK_CHILD", atom->pid); + e[-1] = '\0'; + s += strlen(s); + break; + default: + /* GCC whines about missing default case without this one */ + break; + } + + if (debug > 2 && atom->msg) { + snprintf(s, e - s, " | %s", atom->msg); + e[-1] = '\0'; + s += strlen(s); + } + + return buf; +} + +static void dump_sched_atom_task(struct task_desc *task) +{ + struct sched_atom *atom; + unsigned long j; + unsigned int exited; + + exited = 0; + for (j = 0; j < task->nr_events && !exited; j++) { + + atom = task->atoms[j]; + + printf("%s\n", sched_atom_str(atom)); + + /* exited = atom->type == SCHED_EVENT_EXIT; */ + } +} + +static const char *task_action_str(u64 local_ts, const struct task_action *ta) +{ + static char buf[BUFSIZ]; + char *s, *e; + + s = buf; + e = &buf[ARRAY_SIZE(buf)]; + + snprintf(s, e - s, "[%014" PRIu64 "] %-12s", local_ts, action_table[ta->action].name); + e[-1] = '\0'; + s += strlen(s); + + switch (ta->action) { + case TA_BURN: + snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.burn.nsecs); + break; + case TA_SLEEP: + snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.sleep.nsecs); + break; + case TA_SPAWN: + snprintf(s, e - s, "%014" PRIu64 "ns", ta->u.spawn.nsecs); + break; + case TA_CLONE_PARENT: + snprintf(s, e - s, "%d", ta->u.clone_parent.child_pid); + break; + case TA_CLONE_CHILD: + snprintf(s, e - s, "%d", ta->u.clone_child.parent_pid); + break; + case TA_WAIT_ID: + snprintf(s, e - s, "%lu", ta->u.wait_id.id); + break; + case TA_SIGNAL_ID: + snprintf(s, e - s, "%lu", ta->u.signal_id.id); + break; + case TA_EXIT: + case TA_END: + *s = '\0'; + break; + default: + /* GCC is whining about missing default case */ + break; + } + e[-1] = '\0'; + s += strlen(s); + + return buf; +} + +static struct task *generate_spr_program(struct task_desc *task, unsigned long *map_id_fwd, unsigned long *next_opt_id) +{ + struct sched_atom *atom, *atom_last; + unsigned long eventnr; + unsigned int exited; + unsigned int blocked; + unsigned int last_blocked; + unsigned int last_was_runnable; + unsigned int selected_wakers; + unsigned int exited_wakers; + struct task *t; + struct task_action *ta_alloc; + int ta_count, ta_max; + int task_state; + u64 replay_ts, replay_ts_new; + u64 local_ts; + u64 delta_ts; + u64 start_run_ts; + int pending_count; + struct task_action ta_work; + unsigned int i; + u64 burn_run_accum; + + BUG_ON(task == NULL); + + /* make sure there are events there */ + if (task->nr_events == 0) + return NULL; + + if (replay_start_time == 0) + calculate_replay_start_time(); + + if (replay_end_time == 0) + calculate_replay_end_time(); + + t = zalloc(sizeof(*t)); + BUG_ON(t == NULL); + t->pid = task->pid; + t->name = strdup(task->comm); + BUG_ON(t->name == NULL); + + ta_count = 0; + ta_max = 256; /* starting point */ + ta_alloc = zalloc(sizeof(*ta_alloc) * ta_max); + BUG_ON(ta_alloc == NULL); + t->actions = ta_alloc; + +#undef APPEND_TA_WORK +#define APPEND_TA_WORK() \ + do { \ + if (ta_count >= ta_max) { \ + ta_alloc = realloc(ta_alloc, ta_max * 2 * (sizeof(*t->actions))); \ + BUG_ON(ta_alloc == NULL); \ + ta_max *= 2; \ + t->actions = ta_alloc; \ + } \ + ta_alloc[ta_count++] = ta_work; \ + if (debug > 2) \ + printf("\t\t%4d: GEN: %4d: [%012" PRIu64 "] %s\n", __LINE__, \ + pending_count, start_run_ts, \ + task_action_str(start_run_ts, &ta_work)); \ + } while (0) + +#undef GET_OPTIMIZED_ID +#define GET_OPTIMIZED_ID(x) \ + ({ \ + unsigned long _x = (x); \ + BUG_ON(_x >= (next_wake_id + 1)); \ + if (map_id_fwd[_x] == 0) { \ + map_id_fwd[_x] = ++(*next_opt_id); \ + BUG_ON(*next_opt_id == 0); \ + } \ + map_id_fwd[_x]; \ + }) + + exited = 0; + replay_ts = 0; + local_ts = 0; + task_state = -1; + atom = NULL; + start_run_ts = 0; + pending_count = 0; + last_was_runnable = 0; + last_blocked = 0; + burn_run_accum = 0; + + memset(&ta_work, 0, sizeof(ta_work)); + ta_work.action = TA_END; + + for (eventnr = 0; eventnr < task->nr_events && !exited; eventnr++) { + + atom_last = atom; + + atom = task->atoms[eventnr]; + + exited = atom->type == SCHED_EVENT_EXIT; + + blocked = 0; + selected_wakers = 0; + exited_wakers = 0; + + if (atom->type == SCHED_EVENT_SLEEP) { + + /* check for exited wakers */ + selected_wakers = 0; + exited_wakers = 0; + for (i = 0; i < atom->waker_count; i++) { + if (atom->wakers[i]->selected) { + selected_wakers++; + if (atom->waker_events[i]->exited) + exited_wakers++; + } + } + + if (!preserve_time) { + blocked = (atom->task_state & 511) != 0 /* && + selected_wakers > 0 && selected_wakers > exited_wakers */ ; + } else + blocked = 1; + } + + replay_ts_new = atom->timestamp - replay_start_time; + + /* first one */ + if (eventnr == 0) + replay_ts = replay_ts_new; + + if (last_was_runnable && !preserve_time) /* we don't count the time we were runnable */ + delta_ts = 0; + else + delta_ts = replay_ts_new - replay_ts; + last_was_runnable = 0; + replay_ts = replay_ts_new; + + if (debug > 2) + printf("[%012" PRIu64 " %012" PRIu64 "] %s\n", + replay_ts, delta_ts, sched_atom_str(atom)); + + /* first one is special (for non-fork children assume we run until the point of start) */ + if (eventnr == 0 && atom->type != SCHED_EVENT_FORK_CHILD) { + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = replay_ts; + start_run_ts = 0; + pending_count = 1; + burn_run_accum = 0; + } + + if (eventnr > 0 && pending_count > 0 && blocked != last_blocked) { + + BUG_ON(ta_work.action != TA_BURN && ta_work.action != TA_SLEEP); + + if (ta_work.action == TA_BURN) { + ta_work.u.burn.nsecs += burn_run_accum + delta_ts; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + } else if (ta_work.action == TA_SLEEP) { + ta_work.u.sleep.nsecs += delta_ts; + if (ta_work.u.sleep.nsecs > 0) + APPEND_TA_WORK(); + } + start_run_ts = local_ts; + pending_count = 0; + } + + switch (atom->type) { + case SCHED_EVENT_RUN: + task_state = 0; + if (pending_count == 0) { + ta_work.action = TA_BURN; + if (atom->duration < delta_ts) + start_run_ts = local_ts - atom->duration; + else + start_run_ts = local_ts - delta_ts; + } else { + BUG_ON(ta_work.action != TA_BURN); + } + ta_work.u.burn.nsecs = 0; /* always reset */ + burn_run_accum += atom->duration; + pending_count++; + break; + + case SCHED_EVENT_SLEEP: + task_state = atom->task_state & 511; + + /* set when we were running, and switched out */ + last_was_runnable = task_state == 0; + + if (pending_count > 0) { + BUG_ON(ta_work.action != TA_BURN); + ta_work.u.burn.nsecs += delta_ts; + if (blocked) { + ta_work.u.burn.nsecs += burn_run_accum; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + pending_count = 0; + burn_run_accum = 0; + } + } + + /* if we're not blocked, we skip */ + if (!blocked) + break; + + if (selected_wakers > 0 && selected_wakers > exited_wakers) { + ta_work.action = TA_WAIT_ID; + ta_work.u.wait_id.id = GET_OPTIMIZED_ID(atom->wake_id); + start_run_ts = local_ts; + APPEND_TA_WORK(); + pending_count = 0; + } else { + ta_work.action = TA_SLEEP; + ta_work.u.sleep.nsecs = 0; + start_run_ts = local_ts; + pending_count = 1; + } + + break; + + case SCHED_EVENT_WAKEUP: + /* we were blocked? */ + if (pending_count > 0 && last_blocked) { + BUG_ON(ta_work.action != TA_SLEEP); + ta_work.u.sleep.nsecs += delta_ts; + if (ta_work.u.sleep.nsecs > 0) + APPEND_TA_WORK(); + start_run_ts = local_ts; + pending_count = 0; + } + + /* no target; ignore */ + if (atom->wakee == NULL || atom->wakee_event == NULL || + !atom->wakee->selected) { + if (pending_count > 0) { + BUG_ON(ta_work.action != TA_BURN); + ta_work.u.burn.nsecs += delta_ts; + } else { + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = 0; + start_run_ts = local_ts; + burn_run_accum = 0; + } + pending_count++; + break; + } + + /* target; need to finish the run */ + if (pending_count > 0) { + BUG_ON(ta_work.action != TA_BURN); + ta_work.u.burn.nsecs += burn_run_accum + delta_ts; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + pending_count = 0; + } + + ta_work.action = TA_SIGNAL_ID; + ta_work.u.signal_id.id = GET_OPTIMIZED_ID(atom->wakee_event->wake_id); + start_run_ts = local_ts; + APPEND_TA_WORK(); + + /* and we're running */ + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = 0; + start_run_ts = local_ts; + pending_count = 1; + burn_run_accum = 0; + + break; + + case SCHED_EVENT_MIGRATION: + break; + case SCHED_EVENT_EXIT: + if (pending_count > 0) { + BUG_ON(ta_work.action != TA_BURN); + ta_work.u.burn.nsecs += burn_run_accum + delta_ts; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + pending_count = 0; + } + + ta_work.action = TA_EXIT; + ta_work.u.exit.ret = 0; + start_run_ts = local_ts; + APPEND_TA_WORK(); + break; + case SCHED_EVENT_FORK_PARENT: + BUG_ON(atom->child == NULL); + + if (pending_count > 0) { + BUG_ON(ta_work.action != TA_BURN); + ta_work.u.burn.nsecs += delta_ts; + + if (atom->child->selected) { + ta_work.u.burn.nsecs += burn_run_accum; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + pending_count = 0; + } else + pending_count++; + } + + if (!atom->child->selected) { + if (verbose) + printf("Forking parent (%s/%ld) but child (%s/%ld) not selected; ignoring\n", + task->comm, task->pid, atom->child->comm, atom->child->pid); + break; + } + + ta_work.action = TA_CLONE_PARENT; + ta_work.u.clone_parent.child_pid = atom->pid; + start_run_ts = local_ts; + APPEND_TA_WORK(); + + /* and we're running */ + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = 0; + start_run_ts = local_ts; + pending_count = 1; + burn_run_accum = 0; + + task_state = 0; /* running */ + + break; + case SCHED_EVENT_FORK_CHILD: + BUG_ON(eventnr != 0); /* this _must_ be the first one */ + BUG_ON(atom->parent == NULL); + + if (!atom->parent->selected) { + if (verbose) + printf("Forking child (%s/%ld) but parent (%s/%ld) not selected; converting to run\n", + task->comm, task->pid, atom->parent->comm, atom->parent->pid); + + task_state = 0; + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = 0; + start_run_ts = local_ts; + pending_count = 1; + burn_run_accum = 0; + break; + } + + ta_work.action = TA_CLONE_CHILD; + ta_work.u.clone_child.parent_pid = atom->pid; + start_run_ts = local_ts; + APPEND_TA_WORK(); + + /* and we're running */ + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = 0; + start_run_ts = local_ts; + pending_count = 1; + burn_run_accum = 0; + + task_state = 0; /* running */ + + break; + default: + break; + } + + if (preserve_time || !last_was_runnable) + local_ts += delta_ts; + + last_blocked = blocked; + } + + /* stop any runs */ + if (pending_count > 0) { + + BUG_ON(ta_work.action != TA_BURN && ta_work.action != TA_SLEEP); + + /* if the process was running extend till end */ + delta_ts = !exited ? (replay_end_time - (replay_ts + replay_start_time)) : 0; + + if (ta_work.action == TA_BURN) { + ta_work.u.burn.nsecs += burn_run_accum + delta_ts; + if (ta_work.u.burn.nsecs > 0) + APPEND_TA_WORK(); + } else if (ta_work.action == TA_SLEEP) { + ta_work.u.sleep.nsecs += delta_ts; + if (ta_work.u.sleep.nsecs > 0) + APPEND_TA_WORK(); + } + pending_count = 0; + } + + ta_work.action = TA_END; + ta_work.u.exit.ret = 0; + start_run_ts = local_ts; + APPEND_TA_WORK(); + + return t; +} + +static int read_spr_program(const char *file, struct task ***ttt) +{ + FILE *fp; + char buf[BUFSIZ]; + char orig_buf[BUFSIZ]; + char *name; + struct task *t, **tt; + struct task_action ta_work, *ta_alloc; + int task_count, task_max; + int ta_count, ta_max; + char *s, *e, *start, *end; + int line, pid; + +#undef APPEND_TASK +#define APPEND_TASK() \ + do { \ + if (task_count >= task_max) { \ + tt = realloc(tt, (task_max + 32) * (sizeof(*tt))); \ + BUG_ON(tt == NULL); \ + task_max += 32; \ + } \ + tt[task_count++] = t; \ + } while (0) + +#undef APPEND_TA_WORK +#define APPEND_TA_WORK() \ + do { \ + if (ta_count >= ta_max) { \ + ta_alloc = realloc(ta_alloc, (ta_max + 4096) * (sizeof(*ta_alloc))); \ + BUG_ON(ta_alloc == NULL); \ + ta_max += 4096; \ + } \ + ta_alloc[ta_count++] = ta_work; \ + } while (0) + + + fp = fopen(file, "ra"); + if (fp == NULL) { + fprintf(stderr, "Could not open file '%s'\n", file); + return -1; + } + + task_count = 0; + task_max = 0; + tt = NULL; + ta_count = 0; + ta_max = 0; + ta_alloc = NULL; + pid = -1; + name = NULL; + + t = NULL; + line = 0; + while (fgets(buf, sizeof(buf) - 1, fp) != NULL) { + line++; + + buf[sizeof(buf) - 1] = '\0'; + + /* remove trailing newline */ + s = strrchr(buf, '\n'); + if (s != NULL) + *s = '\0'; + + strcpy(orig_buf, buf); + + /* remove comments */ + s = strchr(buf, '#'); + if (s != NULL) + *s = '\0'; + + /* remove trailing spaces */ + s = buf + strlen(buf) - 1; + while (s > buf && isspace(*s)) + *s-- = '\0'; + + /* skip over spaces in the front */ + s = buf; + while (isspace(*s)) + s++; + start = s; + end = s + strlen(s); + + /* empty line */ + if (start >= end - 1) + continue; + + /* waiting for task */ + if (t == NULL) { + if (*start != '[' || end[-1] != ']') + goto syntax_error; + s = start + 1; + e = strrchr(s, '/'); + if (e == NULL) + goto syntax_error; + if (e - s < 1) + goto syntax_error; + + name = malloc(e - s + 1); + BUG_ON(name == NULL); + + memcpy(name, s, e - s); + name[e - s] = '\0'; + + pid = atoi(e + 1); + if (pid < 0) + goto syntax_error; + + ta_count = 0; + ta_max = 0; + ta_alloc = NULL; + + /* allocate but don't do anything with it */ + t = malloc(sizeof(*t)); + BUG_ON(t == NULL); + + } else { + s = start; + e = start; + while (!isspace(*e)) + e++; + *e++ = '\0'; + if (strcmp(s, "burn") == 0) { + ta_work.action = TA_BURN; + ta_work.u.burn.nsecs = strtoull(e, NULL, 10); + } else if (strcmp(s, "sleep") == 0) { + ta_work.action = TA_SLEEP; + ta_work.u.sleep.nsecs = strtoull(e, NULL, 10); + } else if (strcmp(s, "spawn") == 0) { + ta_work.action = TA_SPAWN; + ta_work.u.spawn.nsecs = strtoull(e, NULL, 10); + } else if (strcmp(s, "clone-parent") == 0) { + ta_work.action = TA_CLONE_PARENT; + ta_work.u.clone_parent.child_pid = (int)strtoul(e, NULL, 10); + } else if (strcmp(s, "clone-child") == 0) { + ta_work.action = TA_CLONE_CHILD; + ta_work.u.clone_child.parent_pid = (int)strtoul(e, NULL, 10); + } else if (strcmp(s, "wait-id") == 0) { + ta_work.action = TA_WAIT_ID; + ta_work.u.wait_id.id = strtoul(e, NULL, 10); + } else if (strcmp(s, "signal-id") == 0) { + ta_work.action = TA_SIGNAL_ID; + ta_work.u.signal_id.id = strtoul(e, NULL, 10); + } else if (strcmp(s, "exit") == 0) { + ta_work.action = TA_EXIT; + ta_work.u.exit.ret = (int)strtoul(e, NULL, 10); + } else if (strcmp(s, "end") == 0) { + ta_work.action = TA_END; + } else + goto syntax_error; + + APPEND_TA_WORK(); + + if (ta_work.action == TA_END) { + BUG_ON(t == NULL); + BUG_ON(name == NULL); + BUG_ON(pid < 0); + + if (ta_count == 0) + goto syntax_error; + + t->name = name; + t->pid = pid; + t->actions = ta_alloc; + + APPEND_TASK(); + t = NULL; + } + } + } + t = NULL; + APPEND_TASK(); + + fclose(fp); + + /* no program */ + if (tt == NULL) + return -1; + + *ttt = tt; + return 0; + +syntax_error: + fprintf(stderr, "syntax error at line %d: %s\n", line, orig_buf); + *ttt = NULL; + fclose(fp); + return -1; +} + +static void __cmd_spr_replay(void) +{ + struct task_desc *task; + struct task **tt; + const struct task * const *task_array; + struct select_list_entry *sle; + unsigned long i, j; + int selected_task_count; + struct playback *p; + unsigned long *map_id_fwd; + unsigned long next_opt_id; + int ret; + + /* mark that we're not the standard replay */ + spr_replay = true; + + if (!dry_run && !spr_list) { + calibrate_run_measurement_overhead(); + calibrate_sleep_measurement_overhead(); + + test_calibrations(); + } + + if (spr_filename == NULL) { + + read_events(true, NULL); + + if (debug > 0) { + printf("nr_run_events: %ld\n", nr_run_events); + printf("nr_sleep_events: %ld\n", nr_sleep_events); + printf("nr_wakeup_events: %ld\n", nr_wakeup_events); + + if (targetless_wakeups) + printf("target-less wakeups: %ld\n", targetless_wakeups); + if (multitarget_wakeups) + printf("multi-target wakeups: %ld\n", multitarget_wakeups); + if (nr_run_events_optimized) + printf("run atoms optimized: %ld\n", + nr_run_events_optimized); + } + + calculate_replay_start_time(); + calculate_replay_end_time(); + + selected_task_count = 0; + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + + /* if no entries, then everything is selected */ + if (!list_empty(&select_list)) { + list_for_each_entry(sle, &select_list, node) { + task->selected = sle->pid == -1 ? + strcmp(sle->name, task->comm) == 0 : + sle->pid == (int)task->pid; + if (task->selected) + break; + } + } else + task->selected = 1; + selected_task_count += task->selected; + } + + tt = zalloc((selected_task_count + 1) * sizeof(*tt)); + BUG_ON(tt == NULL); + + if (spr_list) { + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + printf("[%s/%ld]\n", + task->comm, task->pid); + } + } + + if (debug > 1) { + printf("Dump of scheduling atoms\n"); + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + + if (!task->selected) + continue; + + printf("task %6ld (%20s:%10ld), nr_events: %ld\n", + task->nr, task->comm, task->pid, task->nr_events); + + dump_sched_atom_task(task); + printf("\n"); + } + printf("\n"); + } + + /* map of a wake id to an optimized wake id */ + map_id_fwd = zalloc((next_wake_id + 1) * sizeof(*map_id_fwd)); + BUG_ON(map_id_fwd == NULL); + + next_opt_id = 0; + + if (verbose) + printf("Generating replay program\n"); + + for (i = 0, j = 0; i < nr_tasks; i++) { + task = tasks[i]; + if (!task->selected) + continue; + + if (verbose) + printf("task %6ld (%20s:%10ld), nr_events: %ld\n", + task->nr, task->comm, task->pid, task->nr_events); + + tt[j] = generate_spr_program(task, map_id_fwd, &next_opt_id); + BUG_ON(tt[j] == NULL); + + if (debug > 1) { + dump_task(tt[j]); + printf("\n"); + } + + j++; + } + + if (generate) { + for (i = 0; tt[i] != NULL; i++) + generate_task(tt[i]); + } + + free(map_id_fwd); + + } else { + ret = read_spr_program(spr_filename, &tt); + BUG_ON(ret != 0); + + if (generate) { + for (i = 0; tt[i] != NULL; i++) + generate_task(tt[i]); + } + + } + + if (!dry_run) { + + if (bogoburn) { + + if (bogoloops == 0) + calculate_bogoloops_value(); + + if (debug > 0) + printf("bogoloops at %" PRIu64 "\n", bogoloops); + } + + task_array = (void *)tt; + + p = playback_create(task_array); + BUG_ON(p == NULL); + + playback_set_debug_level(p, debug); + + if (verbose) + printf("Running...\n"); + playback_run(p); + + if (verbose) + printf("Done...\n"); + + playback_destroy(p); + } + + /* free */ + for (i = 0; tt[i] != NULL; i++) { + free((void *)tt[i]->actions); + free((void *)tt[i]->name); + free(tt[i]); + } + free(tt); +} + int cmd_sched(int argc, const char **argv, const char *prefix __used) { argc = parse_options(argc, argv, sched_options, sched_usage, @@ -1919,6 +4080,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) usage_with_options(replay_usage, replay_options); } __cmd_replay(); + } else if (!strncmp(argv[0], "spr-rep", 7)) { + trace_handler = &replay_ops; + if (argc) { + argc = parse_options(argc, argv, spr_replay_options, spr_replay_usage, 0); + if (argc) + usage_with_options(spr_replay_usage, spr_replay_options); + } + __cmd_spr_replay(); } else { usage_with_options(sched_usage, sched_options); } -- 1.7.1