diff --git a/src/graph.cc b/src/graph.cc index 37844fe..f935d6d 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -282,7 +282,7 @@ void BarrierInstance::compute_transitivity(Weft *weft, bool forward) // Then do the transitive update if (forward) { - latest_before.resize(graph->weft->thread_count(), -1); + latest_before.resize(graph->program->thread_count(), -1); for (std::vector::const_iterator it = participants.begin(); it != participants.end(); it++) { @@ -299,7 +299,7 @@ void BarrierInstance::compute_transitivity(Weft *weft, bool forward) } else { - earliest_after.resize(graph->weft->thread_count(), -1); + earliest_after.resize(graph->program->thread_count(), -1); for (std::vector::const_iterator it = participants.begin(); it != participants.end(); it++) { @@ -438,8 +438,8 @@ void BarrierDependenceGraph::PreceedingBarriers::add_instance( next->update_waiting_threads(arrival_threads); } -BarrierDependenceGraph::BarrierDependenceGraph(Weft *w) - : weft(w), max_num_barriers(w->barrier_upper_bound()) +BarrierDependenceGraph::BarrierDependenceGraph(Weft *w, Program *p) + : weft(w), program(p), max_num_barriers(p->barrier_upper_bound()) { barrier_instances.resize(max_num_barriers); PTHREAD_SAFE_CALL( pthread_mutex_init(&validation_mutex, NULL) ); @@ -447,7 +447,7 @@ BarrierDependenceGraph::BarrierDependenceGraph(Weft *w) BarrierDependenceGraph::BarrierDependenceGraph( const BarrierDependenceGraph &rhs) - : weft(NULL), max_num_barriers(0) + : weft(NULL), program(NULL), max_num_barriers(0) { assert(false); } @@ -496,19 +496,28 @@ void BarrierDependenceGraph::construct_graph( { if (weft->print_detail()) { + char buffer[1024]; + snprintf(buffer, 1023, "DEADLOCK DETECTED IN KERNEL %s! " + "(thread and barrier state reported above)", + program->get_name()); report_state(program_counters, threads, pending_arrives); - weft->report_error(WEFT_ERROR_DEADLOCK, "DEADLOCK DETECTED! " - "(thread and barrier state reported above)"); + weft->report_error(WEFT_ERROR_DEADLOCK, buffer); } else - weft->report_error(WEFT_ERROR_DEADLOCK, "DEADLOCK DETECTED! " - "(run in detailed mode with '-d' to see thread and barrier state)"); + { + char buffer[1024]; + snprintf(buffer, 1023, "DEADLOCK DETECTED IN KERNEL %s! " + "(run in detailed mode with '-d' to see thread and barrier state)", + program->get_name()); + weft->report_error(WEFT_ERROR_DEADLOCK, buffer); + } } else - fprintf(stdout,"WEFT INFO: No deadlocks detected!\n"); + fprintf(stdout,"WEFT INFO: No deadlocks detected in kernel %s!\n", + program->get_name()); if (weft->print_verbose()) - fprintf(stdout,"WEFT INFO: Total barrier instances: %ld\n", - all_barriers.size()); + fprintf(stdout,"WEFT INFO: Total barrier instances in kernel %s: %ld\n", + program->get_name(), all_barriers.size()); } int BarrierDependenceGraph::count_validation_tasks(void) @@ -547,7 +556,8 @@ void BarrierDependenceGraph::check_for_validation_errors(void) // threadpool when we invokee this method if (!failed_validations.empty()) { - fprintf(stderr, "WEFT INFO: BARRIERS NOT PROPERLY RECYCLED!\n"); + fprintf(stderr, "WEFT INFO: BARRIERS NOT PROPERLY RECYCLED " + "IN KERNEL %s!\n", program->get_name()); for (std::vector >::const_iterator it = failed_validations.begin(); it != failed_validations.end(); it++) { @@ -562,7 +572,8 @@ void BarrierDependenceGraph::check_for_validation_errors(void) weft->report_error(WEFT_ERROR_GRAPH_VALIDATION, buffer); } else - fprintf(stdout,"WEFT INFO: Barriers properly recycled!\n"); + fprintf(stdout,"WEFT INFO: Barriers properly recycled in kernel %s!\n", + program->get_name()); } void BarrierDependenceGraph::validate_barrier(int name, int generation) diff --git a/src/graph.h b/src/graph.h index fe8d59c..60b0394 100644 --- a/src/graph.h +++ b/src/graph.h @@ -118,7 +118,7 @@ class BarrierDependenceGraph { std::deque previous; }; public: - BarrierDependenceGraph(Weft *weft); + BarrierDependenceGraph(Weft *weft, Program *p); BarrierDependenceGraph(const BarrierDependenceGraph &rhs); ~BarrierDependenceGraph(void); public: @@ -150,6 +150,7 @@ class BarrierDependenceGraph { void initialize_pending_counts(void); public: Weft *const weft; + Program *const program; const int max_num_barriers; protected: std::vector > barrier_instances; diff --git a/src/program.cc b/src/program.cc index 9de608e..cc30da2 100644 --- a/src/program.cc +++ b/src/program.cc @@ -16,6 +16,7 @@ #include "weft.h" #include "race.h" +#include "graph.h" #include "program.h" #include "instruction.h" @@ -27,10 +28,16 @@ #include #include #include +#include // Demangling -Program::Program(Weft *w) - : weft(w) +Program::Program(Weft *w, std::string &name) + : weft(w), kernel_name(name), + max_num_threads(-1), max_num_barriers(1), + shared_memory(NULL), graph(NULL) { + // Initialize values + weft->initialize_program(block_dim, block_id, grid_dim, warp_synchronous); + max_num_threads = block_dim[0] * block_dim[1] * block_dim[2]; } Program::Program(const Program &rhs) @@ -48,6 +55,22 @@ Program::~Program(void) delete (*it); } ptx_instructions.clear(); + if (shared_memory != NULL) + { + delete shared_memory; + shared_memory = NULL; + } + if (graph != NULL) + { + delete graph; + graph = NULL; + } + for (std::vector::iterator it = threads.begin(); + it != threads.end(); it++) + { + delete (*it); + } + threads.clear(); } Program& Program::operator=(const Program &rhs) @@ -57,89 +80,172 @@ Program& Program::operator=(const Program &rhs) return *this; } -void Program::parse_ptx_file(const char *file_name, int &max_num_threads) +/*static*/ +void Program::parse_ptx_file(const char *file_name, Weft *weft, + std::vector &programs) { + assert(file_name != NULL); + + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Parsing file %s...\n", file_name); + + if (weft->perform_instrumentation()) + weft->start_parsing_instrumentation(); + + Program *current = NULL; std::ifstream file(file_name); - std::vector > lines; std::map source_files; // First, let's get all the lines we care about - if (file.is_open()) + if (!file.is_open()) { - bool start_recording = false; - bool found = false; - std::string line; - int line_num = 1; - std::getline(file, line); - while (!file.eof()) + char buffer[1024]; + snprintf(buffer, 1023, "Unable to open file %s", file_name); + weft->report_error(WEFT_ERROR_FILE_OPEN, buffer); + } + bool found = false; + std::string line, kernel_name; + int line_num = 1; + std::getline(file, line); + int block_dim[3]; + while (!file.eof()) + { + if (current != NULL) + current->add_line(line, line_num); + // Try parsing this as a file location + parse_file_location(line, source_files); + if (line.find(".entry") != std::string::npos) { - if (start_recording) - lines.push_back(std::pair(line,line_num)); - // Try parsing this as a file location - parse_file_location(line, source_files); - if (line.find(".entry") != std::string::npos) + // We should only have one entry kernel, we don't know + // how to do this for more than one kernel at the moment + if (current != NULL) { - // We should only have one entry kernel, we don't know - // how to do this for more than one kernel at the moment - if (start_recording) + if (!found) { char buffer[1024]; - snprintf(buffer, 1023, "Found multiple entry kernels in file %s. " - "Weft currently only supports one kernel " - "per file", file_name); - weft->report_error(WEFT_ERROR_MULTIPLE_KERNELS, buffer); + snprintf(buffer, 1023," Failed to find max number of threads for " + "kernel %s and the value was not set on the command " + "line using the '-n' flag", kernel_name.c_str()); + weft->report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer); + } + else + { + current->set_block_dim(block_dim); + found = false; } - start_recording = true; + programs.push_back(current); } - if (!found && (line.find(".maxntid") != std::string::npos)) + size_t start = line.find(".entry")+7; + if (line.find("(") != std::string::npos) { - int temp = atoi(line.substr(line.find(" "),line.find(",")).c_str()); - if (max_num_threads != -1) + size_t stop = line.find("("); + int status; + char *realname = + abi::__cxa_demangle(line.substr(start,stop-start).c_str(), 0, 0, &status); + std::string full_name(realname); + kernel_name = full_name.substr(0, full_name.find("(")); + current = new Program(weft, kernel_name); + free(realname); + } + else + { + int status; + char *realname = + abi::__cxa_demangle(line.substr(start).c_str(), 0, 0, &status); + std::string full_name(realname); + kernel_name = full_name.substr(0, full_name.find("(")); + current = new Program(weft, kernel_name); + free(realname); + } + } + if (!found && (line.find(".maxntid") != std::string::npos)) + { + found = true; + std::string remaining = line.substr(line.find(".maxntid")+8); + int str_index = 0; + for (int i = 0; i < 3; i++) + { + bool has_dim = false; + char buffer[8]; // No more than 1024 threads per CTA + int buffer_index = 0; + while ((str_index < remaining.size()) && + ((remaining[str_index] < '0') || + (remaining[str_index] > '9'))) + str_index++; + while ((str_index < remaining.size()) && + (remaining[str_index] >= '0') && + (remaining[str_index] <= '9')) { - if (temp != max_num_threads) - { - char buffer[1024]; - snprintf(buffer, 1023, "Found max thread count %d " - "which does not agree with specified count " - "of %d", temp, max_num_threads); - weft->report_error(WEFT_ERROR_THREAD_COUNT_MISMATCH, buffer); - } + buffer[buffer_index] = remaining[str_index]; + str_index++; buffer_index++; + has_dim = true; } + buffer[buffer_index] = '\0'; + if (has_dim) + block_dim[i] = atoi(buffer); else - max_num_threads = temp; - found = true; + block_dim[i] = 1; // Unspecified dimensions are set to one } - if (line.find(".version") != std::string::npos) + } + if (line.find(".version") != std::string::npos) + { + double version = atof(line.substr(line.find(" ")).c_str()); + if (version < 3.2) { - double version = atof(line.substr(line.find(" ")).c_str()); - if (version < 3.2) - { - char buffer[1024]; - snprintf(buffer,1023, "Weft requires PTX version 3.2 (CUDA 5.5) or later! " - "File %s contains PTX version %g", file_name, version); - weft->report_error(WEFT_ERROR_INVALID_PTX_VERSION, buffer); - } + char buffer[1024]; + snprintf(buffer,1023, "Weft requires PTX version 3.2 (CUDA 5.5) or later! " + "File %s contains PTX version %g", file_name, version); + weft->report_error(WEFT_ERROR_INVALID_PTX_VERSION, buffer); } - line_num++; - std::getline(file, line); } + line_num++; + std::getline(file, line); + } + if (current == NULL) + { + char buffer[1024]; + snprintf(buffer,1023, "Weft found no entry point kernels " + "in PTX file %s\n", file_name); + weft->report_error(WEFT_ERROR_NO_KERNELS, buffer); } else + programs.push_back(current); + if (!found) { char buffer[1024]; - snprintf(buffer, 1023, "Unable to open file %s", file_name); - weft->report_error(WEFT_ERROR_FILE_OPEN, buffer); + snprintf(buffer, 1023," Failed to find max number of threads for " + "kernel %s and the value was not set on the command " + "line using the '-n' flag", kernel_name.c_str()); + weft->report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer); } + else + current->set_block_dim(block_dim); // If we didn't find a source file issue a warning if (source_files.empty()) fprintf(stderr,"WEFT WARNING: No line information found! Line numbers from PTX " "will be used!\n\t\tTry re-running nvcc with the '-lineinfo' flag!\n"); // Once we have the lines, then convert them into static PTX instructions - convert_to_instructions(max_num_threads, lines, source_files); + for (std::vector::const_iterator it = programs.begin(); + it != programs.end(); it++) + { + (*it)->convert_to_instructions(source_files); + } + + if (weft->perform_instrumentation()) + weft->stop_parsing_instrumentation(); + + if (weft->print_verbose()) + { + for (std::vector::const_iterator it = programs.begin(); + it != programs.end(); it++) + { + (*it)->report_statistics(); + } + } } void Program::report_statistics(void) { - fprintf(stdout,"WEFT INFO: Program Statistics\n"); + fprintf(stdout,"WEFT INFO: Program Statistics for Kernel %s\n", kernel_name.c_str()); fprintf(stdout," Static Instructions: %ld\n", ptx_instructions.size()); fprintf(stdout," Instruction Counts\n"); unsigned counts[PTX_LAST]; @@ -167,7 +273,7 @@ void Program::report_statistics(const std::vector &threads) { total_count += (*it)->accumulate_instruction_counts(instruction_counts); } - fprintf(stdout,"WEFT INFO: Program Statistics\n"); + fprintf(stdout,"WEFT INFO: Program Statistics for Kernel %s\n", kernel_name.c_str()); fprintf(stdout," Dynamic Instructions: %d\n", total_count); fprintf(stdout," Instruction Counts\n"); for (unsigned idx = 0; idx < PTX_LAST; idx++) @@ -191,6 +297,233 @@ bool Program::has_shuffles(void) const return false; } +void Program::emulate_threads(void) +{ + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Emulating %d GPU threads " + "for kernel %s...\n", + max_num_threads, kernel_name.c_str()); + + if (weft->perform_instrumentation()) + start_instrumentation(EMULATE_THREADS_STAGE); + + assert(shared_memory == NULL); + shared_memory = new SharedMemory(weft, this); + assert(max_num_threads > 0); + assert(max_num_threads == (block_dim[0]*block_dim[1]*block_dim[2])); + threads.resize(max_num_threads, NULL); + // If we are doing warp synchronous execution we + // execute all the threads in a warp together + if (warp_synchronous) + { + assert((max_num_threads % WARP_SIZE) == 0); + weft->initialize_count(max_num_threads/WARP_SIZE); + int tid = 0; + for (int z = 0; z < block_dim[2]; z++) + { + for (int y = 0; y < block_dim[1]; y++) + { + for (int x = 0; x < block_dim[0]; x++) + { + threads[tid] = new Thread(tid, x, y, z, this, shared_memory); + // Increment first + tid++; + // Only kick off a warp once we've generated all the threads + if ((tid % WARP_SIZE) == 0) + { + assert((tid-WARP_SIZE) >= 0); + EmulateWarp *task = + new EmulateWarp(this, &(threads[tid-WARP_SIZE])); + weft->enqueue_task(task); + } + } + } + } + } + else + { + weft->initialize_count(max_num_threads); + int tid = 0; + for (int z = 0; z < block_dim[2]; z++) + { + for (int y = 0; y < block_dim[1]; y++) + { + for (int x = 0; x < block_dim[0]; x++) + { + threads[tid] = new Thread(tid, x, y, z, this, shared_memory); + EmulateThread *task = new EmulateThread(threads[tid]); + weft->enqueue_task(task); + tid++; + } + } + } + } + weft->wait_until_done(); + // Get the maximum barrier ID from all threads + for (int i = 0; i < max_num_threads; i++) + { + int local_max = threads[i]->get_max_barrier_name(); + if ((local_max+1) > max_num_barriers) + max_num_barriers = (local_max+1); + } + if (weft->print_verbose()) + { + fprintf(stdout,"WEFT INFO: Emulation found %d named barriers for kernel %s.\n", + barrier_upper_bound(), kernel_name.c_str()); + report_statistics(); + } + + if (weft->perform_instrumentation()) + stop_instrumentation(EMULATE_THREADS_STAGE); + + // If we want to dump thread-specific files, do that now + // Note that we don't include this in the timing + if (weft->emit_program_files()) + print_files(); +} + +void Program::construct_dependence_graph(void) +{ + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Constructing barrier dependence graph " + "for kernel %s...\n", kernel_name.c_str()); + + if (weft->perform_instrumentation()) + start_instrumentation(CONSTRUCT_BARRIER_GRAPH_STAGE); + + assert(graph == NULL); + graph = new BarrierDependenceGraph(weft, this); + graph->construct_graph(threads); + + // Validate the graph + int total_validation_tasks = graph->count_validation_tasks(); + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Performing %d graph validation checks...\n", + total_validation_tasks); + if (total_validation_tasks > 0) + { + weft->initialize_count(total_validation_tasks); + graph->enqueue_validation_tasks(); + weft->wait_until_done(); + graph->check_for_validation_errors(); + } + + if (weft->perform_instrumentation()) + stop_instrumentation(CONSTRUCT_BARRIER_GRAPH_STAGE); +} + +void Program::compute_happens_relationships(void) +{ + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Computing happens-before/after " + "relationships for kernel %s...\n", kernel_name.c_str()); + + if (weft->perform_instrumentation()) + start_instrumentation(COMPUTE_HAPPENS_RELATIONSHIP_STAGE); + + // First initialize all the data structures + weft->initialize_count(threads.size()); + for (std::vector::const_iterator it = threads.begin(); + it != threads.end(); it++) + weft->enqueue_task( + new InitializationTask(*it, threads.size(), max_num_barriers)); + weft->wait_until_done(); + + // Compute barrier reachability + // There are twice as many tasks as barriers + int total_barriers = graph->count_total_barriers(); + weft->initialize_count(2*total_barriers); + graph->enqueue_reachability_tasks(); + weft->wait_until_done(); + + // Compute latest/earliest happens-before/after tasks + // There are twice as many tasks as barriers + weft->initialize_count(2*total_barriers); + graph->enqueue_transitive_happens_tasks(); + weft->wait_until_done(); + + // Finally update all the happens relationships + weft->initialize_count(threads.size()); + for (std::vector::const_iterator it = threads.begin(); + it != threads.end(); it++) + weft->enqueue_task(new UpdateThreadTask(*it)); + weft->wait_until_done(); + + if (weft->perform_instrumentation()) + stop_instrumentation(COMPUTE_HAPPENS_RELATIONSHIP_STAGE); +} + +void Program::check_for_race_conditions(void) +{ + if (weft->print_verbose()) + fprintf(stdout,"WEFT INFO: Checking for race conditions in " + "kernel %s...\n", kernel_name.c_str()); + + if (weft->perform_instrumentation()) + start_instrumentation(CHECK_FOR_RACES_STAGE); + + weft->initialize_count(shared_memory->count_addresses()); + shared_memory->enqueue_race_checks(); + weft->wait_until_done(); + shared_memory->check_for_races(); + + if (weft->perform_instrumentation()) + stop_instrumentation(CHECK_FOR_RACES_STAGE); +} + +void Program::print_statistics(void) +{ + fprintf(stdout,"WEFT STATISTICS for Kernel %s\n", kernel_name.c_str()); + fprintf(stdout," CTA Thread Count: %15d\n", max_num_threads); + fprintf(stdout," Shared Memory Locations: %15d\n", + shared_memory->count_addresses()); + fprintf(stdout," Physical Named Barriers; %15d\n", max_num_barriers); + fprintf(stdout," Dynamic Barrier Instances: %15d\n", + graph->count_total_barriers()); + fprintf(stdout," Static Instructions: %15d\n", + count_instructions()); + fprintf(stdout," Dynamic Instructions: %15d\n", + count_dynamic_instructions()); + fprintf(stdout," Weft Statements: %15d\n", + count_weft_statements()); + fprintf(stdout," Total Race Tests: %15ld\n", + shared_memory->count_race_tests()); +} + +void Program::print_files(void) +{ + weft->initialize_count(max_num_threads); + for (std::vector::const_iterator it = threads.begin(); + it != threads.end(); it++) + { + DumpThreadTask *dump_task = new DumpThreadTask(*it); + weft->enqueue_task(dump_task); + } + weft->wait_until_done(); +} + +int Program::count_dynamic_instructions(void) +{ + int result = 0; + for (std::vector::const_iterator it = threads.begin(); + it != threads.end(); it++) + { + result += (*it)->count_dynamic_instructions(); + } + return result; +} + +int Program::count_weft_statements(void) +{ + int result = 0; + for (std::vector::const_iterator it = threads.begin(); + it != threads.end(); it++) + { + result += (*it)->count_weft_statements(); + } + return result; +} + int Program::emulate(Thread *thread) { int dynamic_instructions = 0; @@ -262,8 +595,42 @@ void Program::emulate_warp(Thread **threads) threads[i]->set_dynamic_instructions(dynamic_instructions[i]); } -void Program::convert_to_instructions(int max_num_threads, - const std::vector > &lines, +void Program::get_kernel_prefix(char *buffer, size_t count) +{ + strncpy(buffer, kernel_name.c_str(), count); +} + +void Program::add_line(const std::string &line, int line_num) +{ + lines.push_back(std::pair(line, line_num)); +} + +void Program::set_block_dim(int *array) +{ + for (int i = 0; i < 3; i++) + block_dim[i] = array[i]; + max_num_threads = block_dim[0] * block_dim[1] * block_dim[2]; +} + +void Program::fill_block_dim(int *array) const +{ + for (int i = 0; i < 3; i++) + array[i] = block_dim[i]; +} + +void Program::fill_block_id(int *array) const +{ + for (int i = 0; i < 3; i++) + array[i] = block_id[i]; +} + +void Program::fill_grid_dim(int *array) const +{ + for (int i = 0; i < 3; i++) + array[i] = grid_dim[i]; +} + +void Program::convert_to_instructions( const std::map &source_files) { // Make a first pass and create all the instructions @@ -313,8 +680,20 @@ void Program::convert_to_instructions(int max_num_threads, barrier->update_count(max_num_threads); } } + // Check for shuffles, if we have shuffles then make sure + // that we have enabled warp-synchronous execution + if (!warp_synchronous && has_shuffles()) + { + fprintf(stdout,"WEFT WARNING: Program %s has shuffle instructions " + "but warp-synchronous execution was not assumed!\n" + "Enabling warp-synchronous assumption...\n", + kernel_name.c_str()); + warp_synchronous = true; + } + lines.clear(); } +/*static*/ bool Program::parse_file_location(const std::string &line, std::map &source_files) { @@ -333,6 +712,7 @@ bool Program::parse_file_location(const std::string &line, return false; } +/*static*/ bool Program::parse_source_location(const std::string &line, int &source_file, int &source_line) { @@ -349,6 +729,52 @@ bool Program::parse_source_location(const std::string &line, return false; } +void Program::start_instrumentation(ProgramStage stage) +{ + timing[stage] = weft->get_current_time_in_micros(); +} + +void Program::stop_instrumentation(ProgramStage stage) +{ + unsigned long long stop = weft->get_current_time_in_micros(); + unsigned long long start = timing[stage]; + timing[stage] = stop - start; + memory_usage[stage] = weft->get_memory_usage(); +} + +void Program::report_instrumentation(size_t &accumulated_memory) +{ + const char *stage_names[TOTAL_STAGES] = { "Emulate Threads", + "Construct Barrier Dependence Graph", + "Compute Happens-Before/After Relationships", + "Check for Race Conditions" }; + fprintf(stdout,"WEFT INSTRUMENTATION FOR KERNEL %s\n", kernel_name.c_str()); + unsigned long long total_time = 0; + size_t total_memory = 0; + for (int i = 0; i < TOTAL_STAGES; i++) + { + double time = double(timing[i]) * 1e-3; + size_t memory = memory_usage[i] - accumulated_memory; +#ifdef __MACH__ + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + stage_names[i], time, memory / (1024 * 1024)); +#else + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + stage_names[i], time, memory / 1024); +#endif + total_time += timing[i]; + total_memory += memory; + accumulated_memory += memory; + } +#ifdef __MACH__ + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + "Total", double(total_time) * 1e-3, total_memory / (1024*1024)); +#else + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + "Total", double(total_time) * 1e-3, total_memory / 1024); +#endif +} + Thread::Thread(unsigned tid, int tidx, int tidy, int tidz, Program *p, SharedMemory *m) : thread_id(tid), tid_x(tidx), tid_y(tidy), tid_z(tidz), @@ -380,9 +806,9 @@ void Thread::initialize(void) int block_dim[3]; int block_id[3]; int grid_dim[3]; - program->weft->fill_block_dim(block_dim); - program->weft->fill_block_id(block_id); - program->weft->fill_grid_dim(grid_dim); + program->fill_block_dim(block_dim); + program->fill_block_id(block_id); + program->fill_grid_dim(grid_dim); // Before starting emulation fill in the special // values for particular registers register_store[WEFT_TID_X_REG] = tid_x; @@ -553,7 +979,7 @@ void Thread::dump_weft_thread(void) // Open up a file for this thread and then // print out all of our weft instructions char file_name[1024]; - program->weft->get_file_prefix(file_name, 1024-32); + program->get_kernel_prefix(file_name, 1024-32); char buffer[32]; snprintf(buffer, 31, "_%d_%d_%d.weft", tid_x, tid_y, tid_z); strncat(file_name, buffer, 31); diff --git a/src/program.h b/src/program.h index 34ccb76..8c8329f 100644 --- a/src/program.h +++ b/src/program.h @@ -30,6 +30,14 @@ enum ThreadStatus { THREAD_EXITTED, }; +enum ProgramStage { + EMULATE_THREADS_STAGE, + CONSTRUCT_BARRIER_GRAPH_STAGE, + COMPUTE_HAPPENS_RELATIONSHIP_STAGE, + CHECK_FOR_RACES_STAGE, + TOTAL_STAGES, +}; + class Weft; class Thread; class Happens; @@ -50,32 +58,74 @@ struct ThreadState { class Program { public: - Program(Weft *weft); + Program(Weft *weft, std::string &kernel_name); Program(const Program &rhs); ~Program(void); public: Program& operator=(const Program &rhs); public: - void parse_ptx_file(const char *file_name, int &max_num_threads); + static void parse_ptx_file(const char *file_name, Weft *weft, + std::vector &programs); void report_statistics(void); void report_statistics(const std::vector &threads); bool has_shuffles(void) const; inline int count_instructions(void) const { return ptx_instructions.size(); } + inline int barrier_upper_bound(void) const { return max_num_barriers; } + inline int thread_count(void) const { return max_num_threads; } + inline bool assume_warp_synchronous(void) const { return warp_synchronous; } + inline const char* get_name(void) const { return kernel_name.c_str(); } +public: + void emulate_threads(void); + void construct_dependence_graph(void); + void compute_happens_relationships(void); + void check_for_race_conditions(void); + void print_statistics(void); + void print_files(void); + int count_dynamic_instructions(void); + int count_weft_statements(void); public: int emulate(Thread *thread); void emulate_warp(Thread **threads); + void get_kernel_prefix(char *buffer, size_t count); +public: + void add_line(const std::string &line, int line_num); + void set_block_dim(int *array); + void fill_block_dim(int *array) const; + void fill_block_id(int *array) const; + void fill_grid_dim(int *array) const; protected: - void convert_to_instructions(int max_num_threads, - const std::vector > &lines, - const std::map &source_files); - bool parse_file_location(const std::string &line, - std::map &source_files); - bool parse_source_location(const std::string &line, - int &source_file, int &source_line); + void convert_to_instructions(const std::map &source_files); + static bool parse_file_location(const std::string &line, + std::map &source_files); + static bool parse_source_location(const std::string &line, + int &source_file, int &source_line); +protected: + void start_instrumentation(ProgramStage stage); + void stop_instrumentation(ProgramStage stage); +public: + void report_instrumentation(size_t &accumulated_memory); public: Weft *const weft; protected: + std::string kernel_name; + int max_num_threads; + int max_num_barriers; std::vector ptx_instructions; + std::vector threads; + SharedMemory *shared_memory; +protected: + int block_dim[3]; + int block_id[3]; + int grid_dim[3]; + bool warp_synchronous; +protected: + BarrierDependenceGraph *graph; +protected: + std::vector > lines; +protected: + // Instrumentation + unsigned long long timing[TOTAL_STAGES]; + size_t memory_usage[TOTAL_STAGES]; }; class Thread { diff --git a/src/race.cc b/src/race.cc index e354933..f10ef89 100644 --- a/src/race.cc +++ b/src/race.cc @@ -86,7 +86,7 @@ void Address::add_access(WeftAccess *access) void Address::perform_race_tests(void) { - if (memory->weft->assume_warp_synchronous()) + if (memory->program->assume_warp_synchronous()) { for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++) { @@ -195,7 +195,7 @@ int Address::report_races(std::map< { if (memory->weft->print_detail()) { - fprintf(stderr,"WEFT INFO: Found %d races on adress %d!\n", + fprintf(stderr,"WEFT INFO: Found %d races on address %d!\n", total_races, address); for (std::map,std::set< std::pair > >::const_iterator it = @@ -267,8 +267,8 @@ size_t Address::count_race_tests(void) return ((num_accesses * (num_accesses-1))/2); } -SharedMemory::SharedMemory(Weft *w) - : weft(w) +SharedMemory::SharedMemory(Weft *w, Program *p) + : weft(w), program(p) { PTHREAD_SAFE_CALL( pthread_mutex_init(&memory_lock,NULL) ); } @@ -360,16 +360,19 @@ void SharedMemory::check_for_races(void) one->line_number, two->line_number); } } - fprintf(stderr,"WEFT INFO: Found %d total races!\n" + fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n" " Run with '-d' flag to see detailed per-thread " - "and per-address races\n", total_races); + "and per-address races\n", total_races, program->get_name()); } else - fprintf(stderr,"WEFT INFO: Found %d total races!\n", total_races); - fprintf(stderr,"WEFT INFO: RACES DETECTED!\n"); + fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n", + total_races, program->get_name()); + fprintf(stderr,"WEFT INFO: RACES DETECTED IN KERNEL %s!\n", + program->get_name()); } else - fprintf(stdout,"WEFT INFO: No races detected!\n"); + fprintf(stdout,"WEFT INFO: No races detected in kernel %s!\n", + program->get_name()); } size_t SharedMemory::count_race_tests(void) diff --git a/src/race.h b/src/race.h index cb274a6..39accca 100644 --- a/src/race.h +++ b/src/race.h @@ -26,6 +26,7 @@ class Weft; class Thread; +class Program; class WeftAccess; class WeftBarrier; class SharedMemory; @@ -81,8 +82,8 @@ class Address { class SharedMemory { public: - SharedMemory(Weft *weft); - SharedMemory(const SharedMemory &rhs) : weft(NULL) { assert(false); } + SharedMemory(Weft *weft, Program *program); + SharedMemory(const SharedMemory &rhs) : weft(NULL), program(NULL) { assert(false); } ~SharedMemory(void); public: SharedMemory& operator=(const SharedMemory &rhs) @@ -95,6 +96,7 @@ class SharedMemory { size_t count_race_tests(void); public: Weft *const weft; + Program *const program; protected: pthread_mutex_t memory_lock; std::map addresses; diff --git a/src/weft.cc b/src/weft.cc index c27b537..4c74132 100644 --- a/src/weft.cc +++ b/src/weft.cc @@ -36,11 +36,9 @@ #endif Weft::Weft(int argc, char **argv) - : file_name(NULL), max_num_threads(-1), - thread_pool_size(1), max_num_barriers(1), + : file_name(NULL), thread_pool_size(1), verbose(false), detailed(false), instrument(false), warnings(false), warp_synchronous(false), print_files(false), - program(NULL), shared_memory(NULL), graph(NULL), worker_threads(NULL), pending_count(0) { for (int i = 0; i < 3; i++) @@ -56,39 +54,29 @@ Weft::Weft(int argc, char **argv) Weft::~Weft(void) { stop_threadpool(); - if (program != NULL) - { - delete program; - program = NULL; - } - if (shared_memory != NULL) - { - delete shared_memory; - shared_memory = NULL; - } - if (graph != NULL) - { - delete graph; - graph = NULL; - } - for (std::vector::iterator it = threads.begin(); - it != threads.end(); it++) + for (std::vector::iterator it = programs.begin(); + it != programs.end(); it++) { delete (*it); } - threads.clear(); - if (instrument) - report_instrumentation(); + programs.clear(); } void Weft::verify(void) { - parse_ptx(); - emulate_threads(); - construct_dependence_graph(); - compute_happens_relationships(); - check_for_race_conditions(); - print_statistics(); + Program::parse_ptx_file(file_name, this, programs); + for (std::vector::const_iterator it = programs.begin(); + it != programs.end(); it++) + { + Program *program = *it; + program->emulate_threads(); + program->construct_dependence_graph(); + program->compute_happens_relationships(); + program->check_for_race_conditions(); + program->print_statistics(); + } + if (instrument) + report_instrumentation(); } void Weft::report_error(int error_code, const char *message) @@ -135,9 +123,7 @@ void Weft::parse_inputs(int argc, char **argv) if (!strcmp(argv[i],"-n")) { std::string threads(argv[++i]); - // If we succeeded, compute the max number of threads - if (parse_triple(threads, block_dim, "-n", "CTA size")) - max_num_threads = block_dim[0] * block_dim[1] * block_dim[2]; + parse_triple(threads, block_dim, "-n", "CTA size"); continue; } if (!strcmp(argv[i],"-p")) @@ -268,287 +254,51 @@ void Weft::report_usage(int error, const char *error_str) fprintf(stderr," -s: assume warp-synchronous execution\n"); fprintf(stderr," -t: thread pool size\n"); fprintf(stderr," -v: print verbose output\n"); - fprintf(stderr," -w: report emulation warnings (this will generate considerable output)\n"); + fprintf(stderr," -w: report emulation warnings (this may generate considerable output)\n"); exit(error); } -void Weft::parse_ptx(void) -{ - assert(file_name != NULL); - if (verbose) - fprintf(stdout,"WEFT INFO: Parsing file %s...\n", file_name); - if (instrument) - start_instrumentation(0/*stage*/); - assert(program == NULL); - program = new Program(this); - bool need_update = (max_num_threads == -1); - program->parse_ptx_file(file_name, max_num_threads); - // If we didn't get a block size, make it Nx1x1 - if (need_update) - block_dim[0] = max_num_threads; - if (max_num_threads <= 0) - { - char buffer[1024]; - snprintf(buffer, 1023," Failed to find max number of threads " - "in file %s and the value was not set on the command " - "line using the '-n' flag", file_name); - report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer); - } - // Check for shuffles, if we have shuffles then make sure - // that we have enabled warp-synchronous execution - if (!warp_synchronous && program->has_shuffles()) - { - fprintf(stdout,"WEFT WARNING: Program has shuffle instructions " - "but warp-synchronous execution was not assumed!\n" - "Enabling warp-synchronous assumption...\n"); - warp_synchronous = true; - } - if (instrument) - stop_instrumentation(0/*stage*/); - if (verbose) - program->report_statistics(); -} - -void Weft::emulate_threads(void) -{ - if (verbose) - fprintf(stdout,"WEFT INFO: Emulating %d GPU threads " - "with %d CPU threads...\n", - max_num_threads, thread_pool_size); - if (instrument) - start_instrumentation(1/*stage*/); - assert(shared_memory == NULL); - shared_memory = new SharedMemory(this); - assert(max_num_threads > 0); - assert(max_num_threads == (block_dim[0]*block_dim[1]*block_dim[2])); - threads.resize(max_num_threads, NULL); - // If we are doing warp synchronous execution we - // execute all the threads in a warp together - if (warp_synchronous) - { - assert((max_num_threads % WARP_SIZE) == 0); - initialize_count(max_num_threads/WARP_SIZE); - int tid = 0; - for (int z = 0; z < block_dim[2]; z++) - { - for (int y = 0; y < block_dim[1]; y++) - { - for (int x = 0; x < block_dim[0]; x++) - { - threads[tid] = new Thread(tid, x, y, z, program, shared_memory); - // Increment first - tid++; - // Only kick off a warp once we've generated all the threads - if ((tid % WARP_SIZE) == 0) - { - assert((tid-WARP_SIZE) >= 0); - EmulateWarp *task = - new EmulateWarp(program, &(threads[tid-WARP_SIZE])); - enqueue_task(task); - } - } - } - } - } - else - { - initialize_count(max_num_threads); - int tid = 0; - for (int z = 0; z < block_dim[2]; z++) - { - for (int y = 0; y < block_dim[1]; y++) - { - for (int x = 0; x < block_dim[0]; x++) - { - threads[tid] = new Thread(tid, x, y, z, program, shared_memory); - EmulateThread *task = new EmulateThread(threads[tid]); - enqueue_task(task); - tid++; - } - } - } - } - wait_until_done(); - // Get the maximum barrier ID from all threads - for (int i = 0; i < max_num_threads; i++) - { - int local_max = threads[i]->get_max_barrier_name(); - if ((local_max+1) > max_num_barriers) - max_num_barriers = (local_max+1); - } - if (verbose) - { - fprintf(stdout,"WEFT INFO: Emulation found %d named barriers.\n", - max_num_barriers); - program->report_statistics(threads); - } - - if (instrument) - stop_instrumentation(1/*stage*/); - - // If we want to dump thread-specific files, do that now - // Note that we don't include this in the timing - if (print_files) - { - initialize_count(max_num_threads); - for (std::vector::const_iterator it = threads.begin(); - it != threads.end(); it++) - { - DumpThreadTask *dump_task = new DumpThreadTask(*it); - enqueue_task(dump_task); - } - wait_until_done(); - } -} - -void Weft::construct_dependence_graph(void) +void Weft::initialize_program(int *bdim, int *bid, + int *gdim, bool &synchronous) { - if (verbose) - fprintf(stdout,"WEFT INFO: Constructing barrier dependence graph...\n"); - if (instrument) - start_instrumentation(2/*stage*/); - - assert(graph == NULL); - graph = new BarrierDependenceGraph(this); - graph->construct_graph(threads); - - // Validate the graph - int total_validation_tasks = graph->count_validation_tasks(); - if (verbose) - fprintf(stdout,"WEFT INFO: Performing %d graph validation checks...\n", - total_validation_tasks); - if (total_validation_tasks > 0) - { - initialize_count(total_validation_tasks); - graph->enqueue_validation_tasks(); - wait_until_done(); - graph->check_for_validation_errors(); - } - - if (instrument) - stop_instrumentation(2/*stage*/); -} - -void Weft::compute_happens_relationships(void) -{ - if (verbose) - fprintf(stdout,"WEFT INFO: Computing happens-before/after " - "relationships...\n"); - if (instrument) - start_instrumentation(3/*stage*/); - - // First initialize all the data structures - initialize_count(threads.size()); - for (std::vector::const_iterator it = threads.begin(); - it != threads.end(); it++) - enqueue_task(new InitializationTask(*it, threads.size(), max_num_barriers)); - wait_until_done(); - - // Compute barrier reachability - // There are twice as many tasks as barriers - int total_barriers = graph->count_total_barriers(); - initialize_count(2*total_barriers); - graph->enqueue_reachability_tasks(); - wait_until_done(); - - // Compute latest/earliest happens-before/after tasks - // There are twice as many tasks as barriers - initialize_count(2*total_barriers); - graph->enqueue_transitive_happens_tasks(); - wait_until_done(); - - // Finally update all the happens relationships - initialize_count(threads.size()); - for (std::vector::const_iterator it = threads.begin(); - it != threads.end(); it++) - enqueue_task(new UpdateThreadTask(*it)); - wait_until_done(); - - if (instrument) - stop_instrumentation(3/*stage*/); -} - -void Weft::check_for_race_conditions(void) -{ - if (verbose) - fprintf(stdout,"WEFT INFO: Checking for race conditions...\n"); - if (instrument) - start_instrumentation(4/*stage*/); - - initialize_count(shared_memory->count_addresses()); - shared_memory->enqueue_race_checks(); - wait_until_done(); - shared_memory->check_for_races(); - - if (instrument) - stop_instrumentation(4/*stage*/); + for (int i = 0; i < 3; i++) + bdim[i] = block_dim[i]; + for (int i = 0; i < 3; i++) + bid[i] = block_id[i]; + for (int i = 0; i < 3; i++) + gdim[i] = grid_dim[i]; + synchronous = warp_synchronous; } -void Weft::print_statistics(void) +void Weft::start_parsing_instrumentation(void) { - fprintf(stdout,"WEFT STATISTICS for %s\n", file_name); - fprintf(stdout," CTA Thread Count: %15d\n", max_num_threads); - fprintf(stdout," Shared Memory Locations: %15d\n", - shared_memory->count_addresses()); - fprintf(stdout," Physical Named Barriers; %15d\n", max_num_barriers); - fprintf(stdout," Dynamic Barrier Instances: %15d\n", - graph->count_total_barriers()); - fprintf(stdout," Static Instructions: %15d\n", - program->count_instructions()); - fprintf(stdout," Dynamic Instructions: %15d\n", - count_dynamic_instructions()); - fprintf(stdout," Weft Statements: %15d\n", - count_weft_statements()); - fprintf(stdout," Total Race Tests: %15ld\n", - shared_memory->count_race_tests()); + parsing_time = get_current_time_in_micros(); } -int Weft::count_dynamic_instructions(void) +void Weft::stop_parsing_instrumentation(void) { - int result = 0; - for (std::vector::const_iterator it = threads.begin(); - it != threads.end(); it++) - { - result += (*it)->count_dynamic_instructions(); - } - return result; + unsigned long long stop = get_current_time_in_micros(); + unsigned long long start = parsing_time; + parsing_time = stop - start; + parsing_memory = get_memory_usage(); } -int Weft::count_weft_statements(void) +void Weft::report_instrumentation(void) { - int result = 0; - for (std::vector::const_iterator it = threads.begin(); - it != threads.end(); it++) + fprintf(stdout,"WEFT INSTRUMENTATION FOR PARSING FILE %s\n", file_name); +#ifdef __MACH__ + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / (1024 * 1024)); +#else + fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", + "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / 1024); +#endif + size_t accumulated_memory = parsing_memory; + for (std::vector::const_iterator it = programs.begin(); + it != programs.end(); it++) { - result += (*it)->count_weft_statements(); + (*it)->report_instrumentation(accumulated_memory); } - return result; -} - -void Weft::fill_block_dim(int *array) -{ - for (int i = 0; i < 3; i++) - array[i] = block_dim[i]; -} - -void Weft::fill_block_id(int *array) -{ - for (int i = 0; i < 3; i++) - array[i] = block_id[i]; -} - -void Weft::fill_grid_dim(int *array) -{ - for (int i = 0; i < 3; i++) - array[i] = grid_dim[i]; -} - -void Weft::get_file_prefix(char *buffer, size_t count) -{ - std::string full_name(file_name); - assert(full_name.find(".ptx") != std::string::npos); - std::string base = full_name.substr(0, full_name.find(".ptx")); - strncpy(buffer, base.c_str(), count); } void Weft::start_threadpool(void) @@ -694,51 +444,6 @@ size_t Weft::get_memory_usage(void) return usage.ru_maxrss; } -void Weft::start_instrumentation(int stage) -{ - timing[stage] = get_current_time_in_micros(); -} - -void Weft::stop_instrumentation(int stage) -{ - unsigned long long stop = get_current_time_in_micros(); - unsigned long long start = timing[stage]; - timing[stage] = stop - start; - memory_usage[stage] = get_memory_usage(); -} - -void Weft::report_instrumentation(void) -{ - const char *stage_names[5] = { "Parse PTX", "Emulate Threads", - "Construct Barrier Dependence Graph", - "Compute Happens-Before/After Relationships", - "Check for Race Conditions" }; - fprintf(stdout,"WEFT INSTRUMENTATION\n"); - unsigned long long total_time = 0; - size_t total_memory = 0; - for (int i = 0; i < 5; i++) - { - double time = double(timing[i]) * 1e-3; - size_t memory = memory_usage[i] - total_memory; -#ifdef __MACH__ - fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", - stage_names[i], time, memory / (1024 * 1024)); -#else - fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", - stage_names[i], time, memory / 1024); -#endif - total_time += timing[i]; - total_memory += memory; - } -#ifdef __MACH__ - fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", - "Total", double(total_time) * 1e-3, total_memory / (1024*1024)); -#else - fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", - "Total", double(total_time) * 1e-3, total_memory / 1024); -#endif -} - int main(int argc, char **argv) { Weft weft(argc, argv); diff --git a/src/weft.h b/src/weft.h index 2717862..a9ca244 100644 --- a/src/weft.h +++ b/src/weft.h @@ -39,8 +39,7 @@ enum { WEFT_SUCCESS, WEFT_ERROR_NO_FILE_NAME, WEFT_ERROR_FILE_OPEN, - WEFT_ERROR_MULTIPLE_KERNELS, - WEFT_ERROR_THREAD_COUNT_MISMATCH, + WEFT_ERROR_NO_KERNELS, WEFT_ERROR_NO_THREAD_COUNT, WEFT_ERROR_ARRIVAL_MISMATCH, WEFT_ERROR_TOO_MANY_PARTICIPANTS, @@ -208,32 +207,27 @@ class Weft { void verify(void); void report_error(int error_code, const char *message); inline bool report_warnings(void) const { return warnings; } - inline int thread_count(void) const { return max_num_threads; } - inline int barrier_upper_bound(void) const { return max_num_barriers; } inline bool print_verbose(void) const { return verbose; } inline bool print_detail(void) const { return detailed; } - inline bool assume_warp_synchronous(void) const { return warp_synchronous; } + inline bool perform_instrumentation(void) const { return instrument; } + inline bool emit_program_files(void) const { return print_files; } protected: void parse_inputs(int argc, char **argv); bool parse_triple(const std::string &input, int *array, const char *flag, const char *error_str); void report_usage(int error, const char *error_str); - void parse_ptx(void); - void emulate_threads(void); - void construct_dependence_graph(void); - void compute_happens_relationships(void); - void check_for_race_conditions(void); - void print_statistics(void); - int count_dynamic_instructions(void); - int count_weft_statements(void); -public: - void fill_block_dim(int *array); - void fill_block_id(int *array); - void fill_grid_dim(int *array); - void get_file_prefix(char *buffer, size_t count); + Program* parse_ptx(void); +public: + void initialize_program(int *block_dim, int *block_id, + int *grid_dim, bool &warp_synchronous); + void start_parsing_instrumentation(void); + void stop_parsing_instrumentation(void); +protected: + void report_instrumentation(void); protected: void start_threadpool(void); void stop_threadpool(void); +public: void initialize_count(unsigned count); void wait_until_done(void); public: @@ -244,30 +238,19 @@ class Weft { static void* worker_loop(void *arg); static unsigned long long get_current_time_in_micros(void); static size_t get_memory_usage(void); -protected: - void start_instrumentation(int stage); - void stop_instrumentation(int stage); - void report_instrumentation(void); protected: const char *file_name; int block_dim[3]; // x, y, z int block_id[3]; // x, y, z int grid_dim[3]; // x, y, z - int max_num_threads; int thread_pool_size; - int max_num_barriers; bool verbose; bool detailed; bool instrument; bool warnings; bool warp_synchronous; bool print_files; -protected: - Program *program; - std::vector threads; - SharedMemory *shared_memory; -protected: - BarrierDependenceGraph *graph; + std::vector programs; protected: pthread_t *worker_threads; bool threadpool_finished; @@ -280,9 +263,8 @@ class Weft { pthread_cond_t queue_cond; std::deque queue; protected: - // Instrumentation - unsigned long long timing[5]; - size_t memory_usage[5]; + unsigned long long parsing_time; + size_t parsing_memory; }; #endif // __WEFT_H__