diff --git a/src/graph.cc b/src/graph.cc
index 37844fe..f935d6d 100644
--- a/src/graph.cc
+++ b/src/graph.cc
@@ -282,7 +282,7 @@ void BarrierInstance::compute_transitivity(Weft *weft, bool forward)
   // Then do the transitive update
   if (forward)
   {
-    latest_before.resize(graph->weft->thread_count(), -1);
+    latest_before.resize(graph->program->thread_count(), -1);
     for (std::vector<WeftBarrier*>::const_iterator it = 
           participants.begin(); it != participants.end(); it++)
     {
@@ -299,7 +299,7 @@ void BarrierInstance::compute_transitivity(Weft *weft, bool forward)
   }
   else
   {
-    earliest_after.resize(graph->weft->thread_count(), -1);
+    earliest_after.resize(graph->program->thread_count(), -1);
     for (std::vector<WeftBarrier*>::const_iterator it = 
           participants.begin(); it != participants.end(); it++)
     {
@@ -438,8 +438,8 @@ void BarrierDependenceGraph::PreceedingBarriers::add_instance(
   next->update_waiting_threads(arrival_threads);
 }
 
-BarrierDependenceGraph::BarrierDependenceGraph(Weft *w)
-  : weft(w), max_num_barriers(w->barrier_upper_bound())
+BarrierDependenceGraph::BarrierDependenceGraph(Weft *w, Program *p)
+  : weft(w), program(p), max_num_barriers(p->barrier_upper_bound())
 {
   barrier_instances.resize(max_num_barriers);
   PTHREAD_SAFE_CALL( pthread_mutex_init(&validation_mutex, NULL) );
@@ -447,7 +447,7 @@ BarrierDependenceGraph::BarrierDependenceGraph(Weft *w)
 
 BarrierDependenceGraph::BarrierDependenceGraph(
                         const BarrierDependenceGraph &rhs)
-  : weft(NULL), max_num_barriers(0)
+  : weft(NULL), program(NULL), max_num_barriers(0)
 {
   assert(false);
 }
@@ -496,19 +496,28 @@ void BarrierDependenceGraph::construct_graph(
   {
     if (weft->print_detail())
     {
+      char buffer[1024];
+      snprintf(buffer, 1023, "DEADLOCK DETECTED IN KERNEL %s! "
+                      "(thread and barrier state reported above)",
+                      program->get_name());
       report_state(program_counters, threads, pending_arrives);
-      weft->report_error(WEFT_ERROR_DEADLOCK, "DEADLOCK DETECTED! "
-                      "(thread and barrier state reported above)");
+      weft->report_error(WEFT_ERROR_DEADLOCK, buffer);
     }
     else
-      weft->report_error(WEFT_ERROR_DEADLOCK, "DEADLOCK DETECTED! "
-          "(run in detailed mode with '-d' to see thread and barrier state)");
+    {
+      char buffer[1024];
+      snprintf(buffer, 1023, "DEADLOCK DETECTED IN KERNEL %s! "
+          "(run in detailed mode with '-d' to see thread and barrier state)",
+          program->get_name());
+      weft->report_error(WEFT_ERROR_DEADLOCK, buffer);
+    }
   }
   else
-    fprintf(stdout,"WEFT INFO: No deadlocks detected!\n");
+    fprintf(stdout,"WEFT INFO: No deadlocks detected in kernel %s!\n",
+            program->get_name());
   if (weft->print_verbose())
-    fprintf(stdout,"WEFT INFO: Total barrier instances: %ld\n",
-            all_barriers.size());
+    fprintf(stdout,"WEFT INFO: Total barrier instances in kernel %s: %ld\n",
+            program->get_name(), all_barriers.size());
 }
 
 int BarrierDependenceGraph::count_validation_tasks(void)
@@ -547,7 +556,8 @@ void BarrierDependenceGraph::check_for_validation_errors(void)
   // threadpool when we invokee this method
   if (!failed_validations.empty())
   {
-    fprintf(stderr, "WEFT INFO: BARRIERS NOT PROPERLY RECYCLED!\n");
+    fprintf(stderr, "WEFT INFO: BARRIERS NOT PROPERLY RECYCLED "
+                    "IN KERNEL %s!\n", program->get_name());
     for (std::vector<std::pair<int,int> >::const_iterator it = 
           failed_validations.begin(); it != failed_validations.end(); it++)
     {
@@ -562,7 +572,8 @@ void BarrierDependenceGraph::check_for_validation_errors(void)
     weft->report_error(WEFT_ERROR_GRAPH_VALIDATION, buffer);
   }
   else
-    fprintf(stdout,"WEFT INFO: Barriers properly recycled!\n");
+    fprintf(stdout,"WEFT INFO: Barriers properly recycled in kernel %s!\n",
+            program->get_name());
 }
 
 void BarrierDependenceGraph::validate_barrier(int name, int generation)
diff --git a/src/graph.h b/src/graph.h
index fe8d59c..60b0394 100644
--- a/src/graph.h
+++ b/src/graph.h
@@ -118,7 +118,7 @@ class BarrierDependenceGraph {
     std::deque<BarrierInstance*> previous;
   };
 public:
-  BarrierDependenceGraph(Weft *weft);
+  BarrierDependenceGraph(Weft *weft, Program *p);
   BarrierDependenceGraph(const BarrierDependenceGraph &rhs);
   ~BarrierDependenceGraph(void);
 public:
@@ -150,6 +150,7 @@ class BarrierDependenceGraph {
   void initialize_pending_counts(void);
 public:
   Weft *const weft;
+  Program *const program;
   const int max_num_barriers;
 protected:
   std::vector<std::deque<BarrierInstance*> > barrier_instances;
diff --git a/src/program.cc b/src/program.cc
index 9de608e..cc30da2 100644
--- a/src/program.cc
+++ b/src/program.cc
@@ -16,6 +16,7 @@
 
 #include "weft.h"
 #include "race.h"
+#include "graph.h"
 #include "program.h"
 #include "instruction.h"
 
@@ -27,10 +28,16 @@
 #include <cstring>
 #include <cassert>
 #include <cstdlib>
+#include <cxxabi.h> // Demangling
 
-Program::Program(Weft *w)
-  : weft(w)
+Program::Program(Weft *w, std::string &name)
+  : weft(w), kernel_name(name), 
+    max_num_threads(-1), max_num_barriers(1),
+    shared_memory(NULL), graph(NULL)
 {
+  // Initialize values
+  weft->initialize_program(block_dim, block_id, grid_dim, warp_synchronous);
+  max_num_threads = block_dim[0] * block_dim[1] * block_dim[2];
 }
 
 Program::Program(const Program &rhs)
@@ -48,6 +55,22 @@ Program::~Program(void)
     delete (*it);
   }
   ptx_instructions.clear();
+  if (shared_memory != NULL)
+  {
+    delete shared_memory;
+    shared_memory = NULL;
+  }
+  if (graph != NULL)
+  {
+    delete graph;
+    graph = NULL;
+  }
+  for (std::vector<Thread*>::iterator it = threads.begin();
+        it != threads.end(); it++)
+  {
+    delete (*it);
+  }
+  threads.clear();
 }
 
 Program& Program::operator=(const Program &rhs)
@@ -57,89 +80,172 @@ Program& Program::operator=(const Program &rhs)
   return *this;
 }
 
-void Program::parse_ptx_file(const char *file_name, int &max_num_threads)
+/*static*/
+void Program::parse_ptx_file(const char *file_name, Weft *weft,
+                             std::vector<Program*> &programs)
 {
+  assert(file_name != NULL);
+
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Parsing file %s...\n", file_name);
+
+  if (weft->perform_instrumentation())
+    weft->start_parsing_instrumentation();
+
+  Program *current = NULL;
   std::ifstream file(file_name);
-  std::vector<std::pair<std::string,int> > lines;
   std::map<int,const char*> source_files;
   // First, let's get all the lines we care about
-  if (file.is_open())
+  if (!file.is_open())
   {
-    bool start_recording = false;
-    bool found = false;
-    std::string line;
-    int line_num = 1;
-    std::getline(file, line);
-    while (!file.eof())
+    char buffer[1024];
+    snprintf(buffer, 1023, "Unable to open file %s", file_name);
+    weft->report_error(WEFT_ERROR_FILE_OPEN, buffer);
+  }
+  bool found = false;
+  std::string line, kernel_name;
+  int line_num = 1;
+  std::getline(file, line);
+  int block_dim[3];
+  while (!file.eof())
+  {
+    if (current != NULL)
+      current->add_line(line, line_num);
+    // Try parsing this as a file location
+    parse_file_location(line, source_files);
+    if (line.find(".entry") != std::string::npos)
     {
-      if (start_recording)
-        lines.push_back(std::pair<std::string,int>(line,line_num));
-      // Try parsing this as a file location
-      parse_file_location(line, source_files);
-      if (line.find(".entry") != std::string::npos)
+      // We should only have one entry kernel, we don't know
+      // how to do this for more than one kernel at the moment
+      if (current != NULL)
       {
-        // We should only have one entry kernel, we don't know
-        // how to do this for more than one kernel at the moment
-        if (start_recording)
+        if (!found)
         {
           char buffer[1024];
-          snprintf(buffer, 1023, "Found multiple entry kernels in file %s. "
-                                 "Weft currently only supports one kernel "
-                                 "per file", file_name);
-          weft->report_error(WEFT_ERROR_MULTIPLE_KERNELS, buffer);
+          snprintf(buffer, 1023," Failed to find max number of threads for "
+                         "kernel %s and the value was not set on the command "
+                         "line using the '-n' flag", kernel_name.c_str());
+          weft->report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer);
+        }
+        else
+        {
+          current->set_block_dim(block_dim);
+          found = false;
         }
-        start_recording = true;
+        programs.push_back(current);
       }
-      if (!found && (line.find(".maxntid") != std::string::npos))
+      size_t start = line.find(".entry")+7;
+      if (line.find("(") != std::string::npos)
       {
-        int temp = atoi(line.substr(line.find(" "),line.find(",")).c_str());
-        if (max_num_threads != -1)
+        size_t stop = line.find("(");
+        int status;
+        char *realname = 
+          abi::__cxa_demangle(line.substr(start,stop-start).c_str(), 0, 0, &status);
+        std::string full_name(realname);
+        kernel_name = full_name.substr(0, full_name.find("("));
+        current = new Program(weft, kernel_name);
+        free(realname);
+      }
+      else
+      {
+        int status;
+        char *realname = 
+          abi::__cxa_demangle(line.substr(start).c_str(), 0, 0, &status);
+        std::string full_name(realname);
+        kernel_name = full_name.substr(0, full_name.find("("));
+        current = new Program(weft, kernel_name);  
+        free(realname);
+      }
+    }
+    if (!found && (line.find(".maxntid") != std::string::npos))
+    {
+      found = true;
+      std::string remaining = line.substr(line.find(".maxntid")+8);
+      int str_index = 0;
+      for (int i = 0; i < 3; i++)
+      {
+        bool has_dim = false;
+        char buffer[8]; // No more than 1024 threads per CTA
+        int buffer_index = 0; 
+        while ((str_index < remaining.size()) &&
+               ((remaining[str_index] < '0') || 
+               (remaining[str_index] > '9')))
+          str_index++; 
+        while ((str_index < remaining.size()) &&
+               (remaining[str_index] >= '0') &&
+               (remaining[str_index] <= '9'))
         {
-          if (temp != max_num_threads)
-          {
-            char buffer[1024];
-            snprintf(buffer, 1023, "Found max thread count %d "
-                           "which does not agree with specified count "
-                           "of %d", temp, max_num_threads);
-            weft->report_error(WEFT_ERROR_THREAD_COUNT_MISMATCH, buffer);
-          }
+          buffer[buffer_index] = remaining[str_index];
+          str_index++; buffer_index++;
+          has_dim = true;
         }
+        buffer[buffer_index] = '\0';
+        if (has_dim)
+          block_dim[i] = atoi(buffer);
         else
-          max_num_threads = temp;
-        found = true;
+          block_dim[i] = 1; // Unspecified dimensions are set to one
       }
-      if (line.find(".version") != std::string::npos)
+    }
+    if (line.find(".version") != std::string::npos)
+    {
+      double version = atof(line.substr(line.find(" ")).c_str());
+      if (version < 3.2)
       {
-        double version = atof(line.substr(line.find(" ")).c_str());
-        if (version < 3.2)
-        {
-          char buffer[1024];
-          snprintf(buffer,1023, "Weft requires PTX version 3.2 (CUDA 5.5) or later! "
-                    "File %s contains PTX version %g", file_name, version);
-          weft->report_error(WEFT_ERROR_INVALID_PTX_VERSION, buffer);
-        }
+        char buffer[1024];
+        snprintf(buffer,1023, "Weft requires PTX version 3.2 (CUDA 5.5) or later! "
+                  "File %s contains PTX version %g", file_name, version);
+        weft->report_error(WEFT_ERROR_INVALID_PTX_VERSION, buffer);
       }
-      line_num++;
-      std::getline(file, line);
     }
+    line_num++;
+    std::getline(file, line);
+  }
+  if (current == NULL)
+  {
+    char buffer[1024];
+    snprintf(buffer,1023, "Weft found no entry point kernels "
+                          "in PTX file %s\n", file_name);
+    weft->report_error(WEFT_ERROR_NO_KERNELS, buffer);
   }
   else
+    programs.push_back(current);
+  if (!found)
   {
     char buffer[1024];
-    snprintf(buffer, 1023, "Unable to open file %s", file_name);
-    weft->report_error(WEFT_ERROR_FILE_OPEN, buffer);
+    snprintf(buffer, 1023," Failed to find max number of threads for "
+                   "kernel %s and the value was not set on the command "
+                   "line using the '-n' flag", kernel_name.c_str());
+    weft->report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer);  
   }
+  else
+    current->set_block_dim(block_dim);
   // If we didn't find a source file issue a warning
   if (source_files.empty())
     fprintf(stderr,"WEFT WARNING: No line information found! Line numbers from PTX "
        "will be used!\n\t\tTry re-running nvcc with the '-lineinfo' flag!\n");
   // Once we have the lines, then convert them into static PTX instructions
-  convert_to_instructions(max_num_threads, lines, source_files);
+  for (std::vector<Program*>::const_iterator it = programs.begin();
+        it != programs.end(); it++)
+  {
+    (*it)->convert_to_instructions(source_files);
+  }
+
+  if (weft->perform_instrumentation())
+    weft->stop_parsing_instrumentation();
+
+  if (weft->print_verbose())
+  {
+    for (std::vector<Program*>::const_iterator it = programs.begin();
+          it != programs.end(); it++)
+    {
+      (*it)->report_statistics();
+    }
+  }
 }
 
 void Program::report_statistics(void)
 {
-  fprintf(stdout,"WEFT INFO: Program Statistics\n");
+  fprintf(stdout,"WEFT INFO: Program Statistics for Kernel %s\n", kernel_name.c_str());
   fprintf(stdout,"  Static Instructions: %ld\n", ptx_instructions.size());
   fprintf(stdout,"  Instruction Counts\n");
   unsigned counts[PTX_LAST];
@@ -167,7 +273,7 @@ void Program::report_statistics(const std::vector<Thread*> &threads)
   {
     total_count += (*it)->accumulate_instruction_counts(instruction_counts);
   }
-  fprintf(stdout,"WEFT INFO: Program Statistics\n");
+  fprintf(stdout,"WEFT INFO: Program Statistics for Kernel %s\n", kernel_name.c_str());
   fprintf(stdout,"  Dynamic Instructions: %d\n", total_count);
   fprintf(stdout,"  Instruction Counts\n");
   for (unsigned idx = 0; idx < PTX_LAST; idx++)
@@ -191,6 +297,233 @@ bool Program::has_shuffles(void) const
   return false;
 }
 
+void Program::emulate_threads(void)
+{
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Emulating %d GPU threads "
+                   "for kernel %s...\n",
+                   max_num_threads, kernel_name.c_str());
+  
+  if (weft->perform_instrumentation())
+    start_instrumentation(EMULATE_THREADS_STAGE);
+
+  assert(shared_memory == NULL);
+  shared_memory = new SharedMemory(weft, this);
+  assert(max_num_threads > 0);
+  assert(max_num_threads == (block_dim[0]*block_dim[1]*block_dim[2]));
+  threads.resize(max_num_threads, NULL);
+  // If we are doing warp synchronous execution we 
+  // execute all the threads in a warp together
+  if (warp_synchronous) 
+  {
+    assert((max_num_threads % WARP_SIZE) == 0);
+    weft->initialize_count(max_num_threads/WARP_SIZE);
+    int tid = 0;
+    for (int z = 0; z < block_dim[2]; z++)
+    {
+      for (int y = 0; y < block_dim[1]; y++)
+      {
+        for (int x = 0; x < block_dim[0]; x++)
+        {
+          threads[tid] = new Thread(tid, x, y, z, this, shared_memory);
+          // Increment first
+          tid++;
+          // Only kick off a warp once we've generated all the threads
+          if ((tid % WARP_SIZE) == 0)
+          {
+            assert((tid-WARP_SIZE) >= 0);
+            EmulateWarp *task = 
+              new EmulateWarp(this, &(threads[tid-WARP_SIZE]));
+            weft->enqueue_task(task);
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    weft->initialize_count(max_num_threads);
+    int tid = 0;
+    for (int z = 0; z < block_dim[2]; z++)
+    {
+      for (int y = 0; y < block_dim[1]; y++)
+      {
+        for (int x = 0; x < block_dim[0]; x++)
+        {
+          threads[tid] = new Thread(tid, x, y, z, this, shared_memory); 
+          EmulateThread *task = new EmulateThread(threads[tid]);
+          weft->enqueue_task(task);
+          tid++;
+        }
+      }
+    }
+  }
+  weft->wait_until_done();
+  // Get the maximum barrier ID from all threads
+  for (int i = 0; i < max_num_threads; i++)
+  {
+    int local_max = threads[i]->get_max_barrier_name();
+    if ((local_max+1) > max_num_barriers)
+      max_num_barriers = (local_max+1);
+  }
+  if (weft->print_verbose())
+  {
+    fprintf(stdout,"WEFT INFO: Emulation found %d named barriers for kernel %s.\n",
+                    barrier_upper_bound(), kernel_name.c_str());
+    report_statistics();
+  }
+
+  if (weft->perform_instrumentation())
+    stop_instrumentation(EMULATE_THREADS_STAGE);
+
+  // If we want to dump thread-specific files, do that now
+  // Note that we don't include this in the timing
+  if (weft->emit_program_files())
+    print_files();
+}
+
+void Program::construct_dependence_graph(void)
+{
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Constructing barrier dependence graph "
+                   "for kernel %s...\n", kernel_name.c_str());
+
+  if (weft->perform_instrumentation())
+    start_instrumentation(CONSTRUCT_BARRIER_GRAPH_STAGE);
+
+  assert(graph == NULL);
+  graph = new BarrierDependenceGraph(weft, this);
+  graph->construct_graph(threads);
+
+  // Validate the graph 
+  int total_validation_tasks = graph->count_validation_tasks();
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Performing %d graph validation checks...\n",
+                              total_validation_tasks);
+  if (total_validation_tasks > 0)
+  {
+    weft->initialize_count(total_validation_tasks);
+    graph->enqueue_validation_tasks();
+    weft->wait_until_done();
+    graph->check_for_validation_errors();
+  }
+
+  if (weft->perform_instrumentation())
+    stop_instrumentation(CONSTRUCT_BARRIER_GRAPH_STAGE);
+}
+
+void Program::compute_happens_relationships(void)
+{
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Computing happens-before/after "
+                   "relationships for kernel %s...\n", kernel_name.c_str());
+
+  if (weft->perform_instrumentation())
+    start_instrumentation(COMPUTE_HAPPENS_RELATIONSHIP_STAGE);
+
+  // First initialize all the data structures
+  weft->initialize_count(threads.size());
+  for (std::vector<Thread*>::const_iterator it = threads.begin();
+        it != threads.end(); it++)
+    weft->enqueue_task(
+        new InitializationTask(*it, threads.size(), max_num_barriers));
+  weft->wait_until_done();
+
+  // Compute barrier reachability
+  // There are twice as many tasks as barriers
+  int total_barriers = graph->count_total_barriers();
+  weft->initialize_count(2*total_barriers);
+  graph->enqueue_reachability_tasks();
+  weft->wait_until_done();
+
+  // Compute latest/earliest happens-before/after tasks
+  // There are twice as many tasks as barriers
+  weft->initialize_count(2*total_barriers);
+  graph->enqueue_transitive_happens_tasks();
+  weft->wait_until_done();
+
+  // Finally update all the happens relationships
+  weft->initialize_count(threads.size());
+  for (std::vector<Thread*>::const_iterator it = threads.begin();
+        it != threads.end(); it++)
+    weft->enqueue_task(new UpdateThreadTask(*it));
+  weft->wait_until_done();
+
+  if (weft->perform_instrumentation())
+    stop_instrumentation(COMPUTE_HAPPENS_RELATIONSHIP_STAGE);
+}
+
+void Program::check_for_race_conditions(void)
+{
+  if (weft->print_verbose())
+    fprintf(stdout,"WEFT INFO: Checking for race conditions in "
+                   "kernel %s...\n", kernel_name.c_str());
+
+  if (weft->perform_instrumentation())
+    start_instrumentation(CHECK_FOR_RACES_STAGE);
+
+  weft->initialize_count(shared_memory->count_addresses());
+  shared_memory->enqueue_race_checks();
+  weft->wait_until_done();
+  shared_memory->check_for_races();
+
+  if (weft->perform_instrumentation())
+    stop_instrumentation(CHECK_FOR_RACES_STAGE);
+}
+
+void Program::print_statistics(void)
+{
+  fprintf(stdout,"WEFT STATISTICS for Kernel %s\n", kernel_name.c_str());
+  fprintf(stdout,"  CTA Thread Count:          %15d\n", max_num_threads);
+  fprintf(stdout,"  Shared Memory Locations:   %15d\n", 
+                                    shared_memory->count_addresses());
+  fprintf(stdout,"  Physical Named Barriers;   %15d\n", max_num_barriers);
+  fprintf(stdout,"  Dynamic Barrier Instances: %15d\n", 
+                                    graph->count_total_barriers());
+  fprintf(stdout,"  Static Instructions:       %15d\n", 
+                                    count_instructions());
+  fprintf(stdout,"  Dynamic Instructions:      %15d\n",
+                                    count_dynamic_instructions());
+  fprintf(stdout,"  Weft Statements:           %15d\n",
+                                    count_weft_statements());   
+  fprintf(stdout,"  Total Race Tests:          %15ld\n",
+                                    shared_memory->count_race_tests());
+}
+
+void Program::print_files(void)
+{
+  weft->initialize_count(max_num_threads);
+  for (std::vector<Thread*>::const_iterator it = threads.begin();
+        it != threads.end(); it++)
+  {
+    DumpThreadTask *dump_task = new DumpThreadTask(*it); 
+    weft->enqueue_task(dump_task);
+  }
+  weft->wait_until_done();
+}
+
+int Program::count_dynamic_instructions(void)
+{
+  int result = 0;
+  for (std::vector<Thread*>::const_iterator it = threads.begin();
+        it != threads.end(); it++)
+  {
+    result += (*it)->count_dynamic_instructions();
+  }
+  return result;
+}
+
+int Program::count_weft_statements(void)
+{
+  int result = 0;
+  for (std::vector<Thread*>::const_iterator it = threads.begin();
+        it != threads.end(); it++)
+  {
+    result += (*it)->count_weft_statements();
+  }
+  return result;
+}
+
 int Program::emulate(Thread *thread)
 {
   int dynamic_instructions = 0;
@@ -262,8 +595,42 @@ void Program::emulate_warp(Thread **threads)
     threads[i]->set_dynamic_instructions(dynamic_instructions[i]);
 }
 
-void Program::convert_to_instructions(int max_num_threads,
-                const std::vector<std::pair<std::string,int> > &lines,
+void Program::get_kernel_prefix(char *buffer, size_t count)
+{
+  strncpy(buffer, kernel_name.c_str(), count);
+}
+
+void Program::add_line(const std::string &line, int line_num)
+{
+  lines.push_back(std::pair<std::string,int>(line, line_num));
+}
+
+void Program::set_block_dim(int *array)
+{
+  for (int i = 0; i < 3; i++)
+    block_dim[i] = array[i];
+  max_num_threads = block_dim[0] * block_dim[1] * block_dim[2];
+}
+
+void Program::fill_block_dim(int *array) const
+{
+  for (int i = 0; i < 3; i++)
+    array[i] = block_dim[i];
+}
+
+void Program::fill_block_id(int *array) const
+{
+  for (int i = 0; i < 3; i++)
+    array[i] = block_id[i];
+}
+
+void Program::fill_grid_dim(int *array) const
+{
+  for (int i = 0; i < 3; i++)
+    array[i] = grid_dim[i];
+}
+
+void Program::convert_to_instructions(
                 const std::map<int,const char*> &source_files)
 {
   // Make a first pass and create all the instructions
@@ -313,8 +680,20 @@ void Program::convert_to_instructions(int max_num_threads,
       barrier->update_count(max_num_threads);
     }
   }
+  // Check for shuffles, if we have shuffles then make sure
+  // that we have enabled warp-synchronous execution
+  if (!warp_synchronous && has_shuffles())
+  {
+    fprintf(stdout,"WEFT WARNING: Program %s has shuffle instructions "
+                   "but warp-synchronous execution was not assumed!\n"
+                   "Enabling warp-synchronous assumption...\n",
+                   kernel_name.c_str());
+    warp_synchronous = true;
+  }
+  lines.clear();
 }
 
+/*static*/
 bool Program::parse_file_location(const std::string &line,
                                   std::map<int,const char*> &source_files)
 {
@@ -333,6 +712,7 @@ bool Program::parse_file_location(const std::string &line,
   return false;
 }
 
+/*static*/
 bool Program::parse_source_location(const std::string &line,
                                     int &source_file, int &source_line)
 {
@@ -349,6 +729,52 @@ bool Program::parse_source_location(const std::string &line,
   return false;
 }
 
+void Program::start_instrumentation(ProgramStage stage)
+{
+  timing[stage] = weft->get_current_time_in_micros();
+}
+
+void Program::stop_instrumentation(ProgramStage stage)
+{
+  unsigned long long stop = weft->get_current_time_in_micros();
+  unsigned long long start = timing[stage];
+  timing[stage] = stop - start;
+  memory_usage[stage] = weft->get_memory_usage();
+}
+
+void Program::report_instrumentation(size_t &accumulated_memory)
+{
+  const char *stage_names[TOTAL_STAGES] = { "Emulate Threads",
+                         "Construct Barrier Dependence Graph",
+                         "Compute Happens-Before/After Relationships",
+                         "Check for Race Conditions" };
+  fprintf(stdout,"WEFT INSTRUMENTATION FOR KERNEL %s\n", kernel_name.c_str());
+  unsigned long long total_time = 0;
+  size_t total_memory = 0;
+  for (int i = 0; i < TOTAL_STAGES; i++)
+  {
+    double time = double(timing[i]) * 1e-3;
+    size_t memory = memory_usage[i] - accumulated_memory;
+#ifdef __MACH__
+    fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+            stage_names[i], time, memory / (1024 * 1024));
+#else
+    fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+            stage_names[i], time, memory / 1024);
+#endif
+    total_time += timing[i];
+    total_memory += memory;
+    accumulated_memory += memory;
+  }
+#ifdef __MACH__
+  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+          "Total", double(total_time) * 1e-3, total_memory / (1024*1024));
+#else
+  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+          "Total", double(total_time) * 1e-3, total_memory / 1024);
+#endif
+}
+
 Thread::Thread(unsigned tid, int tidx, int tidy, int tidz,
                Program *p, SharedMemory *m)
   : thread_id(tid), tid_x(tidx), tid_y(tidy), tid_z(tidz),
@@ -380,9 +806,9 @@ void Thread::initialize(void)
   int block_dim[3];
   int block_id[3];
   int grid_dim[3];
-  program->weft->fill_block_dim(block_dim);
-  program->weft->fill_block_id(block_id);
-  program->weft->fill_grid_dim(grid_dim);
+  program->fill_block_dim(block_dim);
+  program->fill_block_id(block_id);
+  program->fill_grid_dim(grid_dim);
   // Before starting emulation fill in the special
   // values for particular registers
   register_store[WEFT_TID_X_REG] = tid_x;
@@ -553,7 +979,7 @@ void Thread::dump_weft_thread(void)
   // Open up a file for this thread and then 
   // print out all of our weft instructions
   char file_name[1024];
-  program->weft->get_file_prefix(file_name, 1024-32);
+  program->get_kernel_prefix(file_name, 1024-32);
   char buffer[32];
   snprintf(buffer, 31, "_%d_%d_%d.weft", tid_x, tid_y, tid_z);
   strncat(file_name, buffer, 31);
diff --git a/src/program.h b/src/program.h
index 34ccb76..8c8329f 100644
--- a/src/program.h
+++ b/src/program.h
@@ -30,6 +30,14 @@ enum ThreadStatus {
   THREAD_EXITTED,
 };
 
+enum ProgramStage {
+  EMULATE_THREADS_STAGE,
+  CONSTRUCT_BARRIER_GRAPH_STAGE,
+  COMPUTE_HAPPENS_RELATIONSHIP_STAGE,
+  CHECK_FOR_RACES_STAGE,
+  TOTAL_STAGES,
+};
+
 class Weft;
 class Thread;
 class Happens;
@@ -50,32 +58,74 @@ struct ThreadState {
 
 class Program {
 public:
-  Program(Weft *weft);
+  Program(Weft *weft, std::string &kernel_name);
   Program(const Program &rhs);
   ~Program(void);
 public:
   Program& operator=(const Program &rhs);
 public:
-  void parse_ptx_file(const char *file_name, int &max_num_threads);
+  static void parse_ptx_file(const char *file_name, Weft *weft,
+                             std::vector<Program*> &programs);
   void report_statistics(void);
   void report_statistics(const std::vector<Thread*> &threads);
   bool has_shuffles(void) const;
   inline int count_instructions(void) const { return ptx_instructions.size(); }
+  inline int barrier_upper_bound(void) const { return max_num_barriers; }
+  inline int thread_count(void) const { return max_num_threads; }
+  inline bool assume_warp_synchronous(void) const { return warp_synchronous; }
+  inline const char* get_name(void) const { return kernel_name.c_str(); }
+public:
+  void emulate_threads(void);
+  void construct_dependence_graph(void);
+  void compute_happens_relationships(void);
+  void check_for_race_conditions(void);
+  void print_statistics(void);
+  void print_files(void);
+  int count_dynamic_instructions(void);
+  int count_weft_statements(void);
 public:
   int emulate(Thread *thread);
   void emulate_warp(Thread **threads);
+  void get_kernel_prefix(char *buffer, size_t count);
+public:
+  void add_line(const std::string &line, int line_num);
+  void set_block_dim(int *array);
+  void fill_block_dim(int *array) const;
+  void fill_block_id(int *array) const;
+  void fill_grid_dim(int *array) const;
 protected:
-  void convert_to_instructions(int max_num_threads,
-          const std::vector<std::pair<std::string,int> > &lines,
-          const std::map<int,const char*> &source_files);
-  bool parse_file_location(const std::string &line,
-                           std::map<int,const char*> &source_files);
-  bool parse_source_location(const std::string &line,
-                             int &source_file, int &source_line);
+  void convert_to_instructions(const std::map<int,const char*> &source_files);
+  static bool parse_file_location(const std::string &line,
+                                  std::map<int,const char*> &source_files);
+  static bool parse_source_location(const std::string &line,
+                                    int &source_file, int &source_line);
+protected:
+  void start_instrumentation(ProgramStage stage);
+  void stop_instrumentation(ProgramStage stage);
+public:
+  void report_instrumentation(size_t &accumulated_memory);
 public:
   Weft *const weft;
 protected:
+  std::string kernel_name;
+  int max_num_threads;
+  int max_num_barriers;
   std::vector<PTXInstruction*> ptx_instructions;
+  std::vector<Thread*> threads;
+  SharedMemory *shared_memory;
+protected:
+  int block_dim[3];
+  int block_id[3];
+  int grid_dim[3];
+  bool warp_synchronous;
+protected:
+  BarrierDependenceGraph *graph;
+protected:
+  std::vector<std::pair<std::string,int> > lines;
+protected:
+  // Instrumentation
+  unsigned long long timing[TOTAL_STAGES];
+  size_t memory_usage[TOTAL_STAGES];
 };
 
 class Thread {
diff --git a/src/race.cc b/src/race.cc
index e354933..f10ef89 100644
--- a/src/race.cc
+++ b/src/race.cc
@@ -86,7 +86,7 @@ void Address::add_access(WeftAccess *access)
 
 void Address::perform_race_tests(void)
 {
-  if (memory->weft->assume_warp_synchronous())
+  if (memory->program->assume_warp_synchronous())
   {
     for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++)
     {
@@ -195,7 +195,7 @@ int Address::report_races(std::map<
   { 
     if (memory->weft->print_detail())
     {
-      fprintf(stderr,"WEFT INFO: Found %d races on adress %d!\n",
+      fprintf(stderr,"WEFT INFO: Found %d races on address %d!\n",
                       total_races, address);
       for (std::map<std::pair<PTXInstruction*,PTXInstruction*>,std::set<
                     std::pair<Thread*,Thread*> > >::const_iterator it = 
@@ -267,8 +267,8 @@ size_t Address::count_race_tests(void)
   return ((num_accesses * (num_accesses-1))/2);
 }
 
-SharedMemory::SharedMemory(Weft *w)
-  : weft(w)
+SharedMemory::SharedMemory(Weft *w, Program *p)
+  : weft(w), program(p)
 {
   PTHREAD_SAFE_CALL( pthread_mutex_init(&memory_lock,NULL) );
 }
@@ -360,16 +360,19 @@ void SharedMemory::check_for_races(void)
                            one->line_number, two->line_number);
         }
       }
-      fprintf(stderr,"WEFT INFO: Found %d total races!\n"
+      fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n"
                      "           Run with '-d' flag to see detailed per-thread "
-                     "and per-address races\n", total_races);
+                     "and per-address races\n", total_races, program->get_name());
     }
     else
-      fprintf(stderr,"WEFT INFO: Found %d total races!\n", total_races);
-    fprintf(stderr,"WEFT INFO: RACES DETECTED!\n");
+      fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n", 
+                     total_races, program->get_name());
+    fprintf(stderr,"WEFT INFO: RACES DETECTED IN KERNEL %s!\n", 
+                     program->get_name());
   }
   else
-    fprintf(stdout,"WEFT INFO: No races detected!\n");
+    fprintf(stdout,"WEFT INFO: No races detected in kernel %s!\n", 
+                    program->get_name());
 }
 
 size_t SharedMemory::count_race_tests(void)
diff --git a/src/race.h b/src/race.h
index cb274a6..39accca 100644
--- a/src/race.h
+++ b/src/race.h
@@ -26,6 +26,7 @@
 
 class Weft;
 class Thread;
+class Program;
 class WeftAccess;
 class WeftBarrier;
 class SharedMemory;
@@ -81,8 +82,8 @@ class Address {
 
 class SharedMemory {
 public:
-  SharedMemory(Weft *weft);
-  SharedMemory(const SharedMemory &rhs) : weft(NULL) { assert(false); }
+  SharedMemory(Weft *weft, Program *program);
+  SharedMemory(const SharedMemory &rhs) : weft(NULL), program(NULL) { assert(false); }
   ~SharedMemory(void);
 public:
   SharedMemory& operator=(const SharedMemory &rhs) 
@@ -95,6 +96,7 @@ class SharedMemory {
   size_t count_race_tests(void);
 public:
   Weft *const weft;
+  Program *const program;
 protected:
   pthread_mutex_t memory_lock;
   std::map<int/*address*/,Address*> addresses;
diff --git a/src/weft.cc b/src/weft.cc
index c27b537..4c74132 100644
--- a/src/weft.cc
+++ b/src/weft.cc
@@ -36,11 +36,9 @@
 #endif
 
 Weft::Weft(int argc, char **argv)
-  : file_name(NULL), max_num_threads(-1), 
-    thread_pool_size(1), max_num_barriers(1),
+  : file_name(NULL), thread_pool_size(1), 
     verbose(false), detailed(false), instrument(false), 
     warnings(false), warp_synchronous(false), print_files(false),
-    program(NULL), shared_memory(NULL), graph(NULL),
     worker_threads(NULL), pending_count(0)
 {
   for (int i = 0; i < 3; i++)
@@ -56,39 +54,29 @@ Weft::Weft(int argc, char **argv)
 Weft::~Weft(void)
 {
   stop_threadpool();
-  if (program != NULL)
-  {
-    delete program;
-    program = NULL;
-  }
-  if (shared_memory != NULL)
-  {
-    delete shared_memory;
-    shared_memory = NULL;
-  }
-  if (graph != NULL)
-  {
-    delete graph;
-    graph = NULL;
-  }
-  for (std::vector<Thread*>::iterator it = threads.begin();
-        it != threads.end(); it++)
+  for (std::vector<Program*>::iterator it = programs.begin();
+        it != programs.end(); it++)
   {
     delete (*it);
   }
-  threads.clear();
-  if (instrument)
-    report_instrumentation();
+  programs.clear();
 }
 
 void Weft::verify(void)
 {
-  parse_ptx();
-  emulate_threads();
-  construct_dependence_graph();
-  compute_happens_relationships();
-  check_for_race_conditions();
-  print_statistics();
+  Program::parse_ptx_file(file_name, this, programs);
+  for (std::vector<Program*>::const_iterator it = programs.begin();
+        it != programs.end(); it++)
+  {
+    Program *program = *it;
+    program->emulate_threads();
+    program->construct_dependence_graph();
+    program->compute_happens_relationships();
+    program->check_for_race_conditions();
+    program->print_statistics();
+  }
+  if (instrument)
+    report_instrumentation();
 }
 
 void Weft::report_error(int error_code, const char *message)
@@ -135,9 +123,7 @@ void Weft::parse_inputs(int argc, char **argv)
     if (!strcmp(argv[i],"-n"))
     {
       std::string threads(argv[++i]);
-      // If we succeeded, compute the max number of threads
-      if (parse_triple(threads, block_dim, "-n", "CTA size"))
-        max_num_threads = block_dim[0] * block_dim[1] * block_dim[2];
+      parse_triple(threads, block_dim, "-n", "CTA size");
       continue;
     }
     if (!strcmp(argv[i],"-p"))
@@ -268,287 +254,51 @@ void Weft::report_usage(int error, const char *error_str)
   fprintf(stderr,"  -s: assume warp-synchronous execution\n");
   fprintf(stderr,"  -t: thread pool size\n");
   fprintf(stderr,"  -v: print verbose output\n");
-  fprintf(stderr,"  -w: report emulation warnings (this will generate considerable output)\n");
+  fprintf(stderr,"  -w: report emulation warnings (this may generate considerable output)\n");
   exit(error);
 }
 
-void Weft::parse_ptx(void)
-{
-  assert(file_name != NULL);
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Parsing file %s...\n", file_name);
-  if (instrument)
-    start_instrumentation(0/*stage*/);
-  assert(program == NULL);
-  program = new Program(this);
-  bool need_update = (max_num_threads == -1);
-  program->parse_ptx_file(file_name, max_num_threads);
-  // If we didn't get a block size, make it Nx1x1
-  if (need_update)
-    block_dim[0] = max_num_threads;
-  if (max_num_threads <= 0)
-  {
-    char buffer[1024];
-    snprintf(buffer, 1023," Failed to find max number of threads "
-                   "in file %s and the value was not set on the command "
-                   "line using the '-n' flag", file_name);
-    report_error(WEFT_ERROR_NO_THREAD_COUNT, buffer);
-  }
-  // Check for shuffles, if we have shuffles then make sure
-  // that we have enabled warp-synchronous execution
-  if (!warp_synchronous && program->has_shuffles())
-  {
-    fprintf(stdout,"WEFT WARNING: Program has shuffle instructions "
-                   "but warp-synchronous execution was not assumed!\n"
-                   "Enabling warp-synchronous assumption...\n");
-    warp_synchronous = true;
-  }
-  if (instrument)
-    stop_instrumentation(0/*stage*/);
-  if (verbose)
-    program->report_statistics();
-}
-
-void Weft::emulate_threads(void)
-{
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Emulating %d GPU threads "
-                   "with %d CPU threads...\n",
-                   max_num_threads, thread_pool_size);
-  if (instrument)
-    start_instrumentation(1/*stage*/);
-  assert(shared_memory == NULL);
-  shared_memory = new SharedMemory(this);
-  assert(max_num_threads > 0);
-  assert(max_num_threads == (block_dim[0]*block_dim[1]*block_dim[2]));
-  threads.resize(max_num_threads, NULL);
-  // If we are doing warp synchronous execution we 
-  // execute all the threads in a warp together
-  if (warp_synchronous) 
-  {
-    assert((max_num_threads % WARP_SIZE) == 0);
-    initialize_count(max_num_threads/WARP_SIZE);
-    int tid = 0;
-    for (int z = 0; z < block_dim[2]; z++)
-    {
-      for (int y = 0; y < block_dim[1]; y++)
-      {
-        for (int x = 0; x < block_dim[0]; x++)
-        {
-          threads[tid] = new Thread(tid, x, y, z, program, shared_memory);
-          // Increment first
-          tid++;
-          // Only kick off a warp once we've generated all the threads
-          if ((tid % WARP_SIZE) == 0)
-          {
-            assert((tid-WARP_SIZE) >= 0);
-            EmulateWarp *task = 
-              new EmulateWarp(program, &(threads[tid-WARP_SIZE]));
-            enqueue_task(task);
-          }
-        }
-      }
-    }
-  }
-  else
-  {
-    initialize_count(max_num_threads);
-    int tid = 0;
-    for (int z = 0; z < block_dim[2]; z++)
-    {
-      for (int y = 0; y < block_dim[1]; y++)
-      {
-        for (int x = 0; x < block_dim[0]; x++)
-        {
-          threads[tid] = new Thread(tid, x, y, z, program, shared_memory); 
-          EmulateThread *task = new EmulateThread(threads[tid]);
-          enqueue_task(task);
-          tid++;
-        }
-      }
-    }
-  }
-  wait_until_done();
-  // Get the maximum barrier ID from all threads
-  for (int i = 0; i < max_num_threads; i++)
-  {
-    int local_max = threads[i]->get_max_barrier_name();
-    if ((local_max+1) > max_num_barriers)
-      max_num_barriers = (local_max+1);
-  }
-  if (verbose)
-  {
-    fprintf(stdout,"WEFT INFO: Emulation found %d named barriers.\n",
-                    max_num_barriers);
-    program->report_statistics(threads);
-  }
-
-  if (instrument)
-    stop_instrumentation(1/*stage*/);
-
-  // If we want to dump thread-specific files, do that now
-  // Note that we don't include this in the timing
-  if (print_files)
-  {
-    initialize_count(max_num_threads);
-    for (std::vector<Thread*>::const_iterator it = threads.begin();
-          it != threads.end(); it++)
-    {
-      DumpThreadTask *dump_task = new DumpThreadTask(*it); 
-      enqueue_task(dump_task);
-    }
-    wait_until_done();
-  }
-}
-
-void Weft::construct_dependence_graph(void)
+void Weft::initialize_program(int *bdim, int *bid,
+                              int *gdim, bool &synchronous)
 {
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Constructing barrier dependence graph...\n");
-  if (instrument)
-    start_instrumentation(2/*stage*/);
-
-  assert(graph == NULL);
-  graph = new BarrierDependenceGraph(this);
-  graph->construct_graph(threads);
-
-  // Validate the graph 
-  int total_validation_tasks = graph->count_validation_tasks();
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Performing %d graph validation checks...\n",
-                              total_validation_tasks);
-  if (total_validation_tasks > 0)
-  {
-    initialize_count(total_validation_tasks);
-    graph->enqueue_validation_tasks();
-    wait_until_done();
-    graph->check_for_validation_errors();
-  }
-
-  if (instrument)
-    stop_instrumentation(2/*stage*/);
-}
-
-void Weft::compute_happens_relationships(void)
-{
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Computing happens-before/after "
-                   "relationships...\n");
-  if (instrument)
-    start_instrumentation(3/*stage*/);
-
-  // First initialize all the data structures
-  initialize_count(threads.size());
-  for (std::vector<Thread*>::const_iterator it = threads.begin();
-        it != threads.end(); it++)
-    enqueue_task(new InitializationTask(*it, threads.size(), max_num_barriers));
-  wait_until_done();
-
-  // Compute barrier reachability
-  // There are twice as many tasks as barriers
-  int total_barriers = graph->count_total_barriers();
-  initialize_count(2*total_barriers);
-  graph->enqueue_reachability_tasks();
-  wait_until_done();
-
-  // Compute latest/earliest happens-before/after tasks
-  // There are twice as many tasks as barriers
-  initialize_count(2*total_barriers);
-  graph->enqueue_transitive_happens_tasks();
-  wait_until_done();
-
-  // Finally update all the happens relationships
-  initialize_count(threads.size());
-  for (std::vector<Thread*>::const_iterator it = threads.begin();
-        it != threads.end(); it++)
-    enqueue_task(new UpdateThreadTask(*it));
-  wait_until_done();
-
-  if (instrument)
-    stop_instrumentation(3/*stage*/);
-}
-
-void Weft::check_for_race_conditions(void)
-{
-  if (verbose)
-    fprintf(stdout,"WEFT INFO: Checking for race conditions...\n");
-  if (instrument)
-    start_instrumentation(4/*stage*/);
-
-  initialize_count(shared_memory->count_addresses());
-  shared_memory->enqueue_race_checks();
-  wait_until_done();
-  shared_memory->check_for_races();
-
-  if (instrument)
-    stop_instrumentation(4/*stage*/);
+  for (int i = 0; i < 3; i++)
+    bdim[i] = block_dim[i];
+  for (int i = 0; i < 3; i++)
+    bid[i] = block_id[i];
+  for (int i = 0; i < 3; i++)
+    gdim[i] = grid_dim[i];
+  synchronous = warp_synchronous;
 }
 
-void Weft::print_statistics(void)
+void Weft::start_parsing_instrumentation(void)
 {
-  fprintf(stdout,"WEFT STATISTICS for %s\n", file_name);
-  fprintf(stdout,"  CTA Thread Count:          %15d\n", max_num_threads);
-  fprintf(stdout,"  Shared Memory Locations:   %15d\n", 
-                                    shared_memory->count_addresses());
-  fprintf(stdout,"  Physical Named Barriers;   %15d\n", max_num_barriers);
-  fprintf(stdout,"  Dynamic Barrier Instances: %15d\n", 
-                                    graph->count_total_barriers());
-  fprintf(stdout,"  Static Instructions:       %15d\n", 
-                                    program->count_instructions());
-  fprintf(stdout,"  Dynamic Instructions:      %15d\n",
-                                    count_dynamic_instructions());
-  fprintf(stdout,"  Weft Statements:           %15d\n",
-                                    count_weft_statements());   
-  fprintf(stdout,"  Total Race Tests:          %15ld\n",
-                                    shared_memory->count_race_tests());
+  parsing_time = get_current_time_in_micros();
 }
 
-int Weft::count_dynamic_instructions(void)
+void Weft::stop_parsing_instrumentation(void)
 {
-  int result = 0;
-  for (std::vector<Thread*>::const_iterator it = threads.begin();
-        it != threads.end(); it++)
-  {
-    result += (*it)->count_dynamic_instructions();
-  }
-  return result;
+  unsigned long long stop = get_current_time_in_micros();
+  unsigned long long start = parsing_time;
+  parsing_time = stop - start;
+  parsing_memory = get_memory_usage();
 }
 
-int Weft::count_weft_statements(void)
+void Weft::report_instrumentation(void)
 {
-  int result = 0;
-  for (std::vector<Thread*>::const_iterator it = threads.begin();
-        it != threads.end(); it++)
+  fprintf(stdout,"WEFT INSTRUMENTATION FOR PARSING FILE %s\n", file_name); 
+#ifdef __MACH__
+  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+          "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / (1024 * 1024));
+#else
+  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
+          "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / 1024);
+#endif
+  size_t accumulated_memory = parsing_memory;
+  for (std::vector<Program*>::const_iterator it = programs.begin();
+        it != programs.end(); it++)
   {
-    result += (*it)->count_weft_statements();
+    (*it)->report_instrumentation(accumulated_memory);
   }
-  return result;
-}
-
-void Weft::fill_block_dim(int *array)
-{
-  for (int i = 0; i < 3; i++)
-    array[i] = block_dim[i];
-}
-
-void Weft::fill_block_id(int *array)
-{
-  for (int i = 0; i < 3; i++)
-    array[i] = block_id[i];
-}
-
-void Weft::fill_grid_dim(int *array)
-{
-  for (int i = 0; i < 3; i++)
-    array[i] = grid_dim[i];
-}
-
-void Weft::get_file_prefix(char *buffer, size_t count)
-{
-  std::string full_name(file_name);
-  assert(full_name.find(".ptx") != std::string::npos);
-  std::string base = full_name.substr(0, full_name.find(".ptx"));
-  strncpy(buffer, base.c_str(), count);
 }
 
 void Weft::start_threadpool(void)
@@ -694,51 +444,6 @@ size_t Weft::get_memory_usage(void)
   return usage.ru_maxrss;
 }
 
-void Weft::start_instrumentation(int stage)
-{
-  timing[stage] = get_current_time_in_micros();
-}
-
-void Weft::stop_instrumentation(int stage)
-{
-  unsigned long long stop = get_current_time_in_micros();
-  unsigned long long start = timing[stage];
-  timing[stage] = stop - start;
-  memory_usage[stage] = get_memory_usage();
-}
-
-void Weft::report_instrumentation(void)
-{
-  const char *stage_names[5] = { "Parse PTX", "Emulate Threads",
-                                 "Construct Barrier Dependence Graph",
-                                 "Compute Happens-Before/After Relationships",
-                                 "Check for Race Conditions" };
-  fprintf(stdout,"WEFT INSTRUMENTATION\n");
-  unsigned long long total_time = 0;
-  size_t total_memory = 0;
-  for (int i = 0; i < 5; i++)
-  {
-    double time = double(timing[i]) * 1e-3;
-    size_t memory = memory_usage[i] - total_memory;
-#ifdef __MACH__
-    fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
-            stage_names[i], time, memory / (1024 * 1024));
-#else
-    fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
-            stage_names[i], time, memory / 1024);
-#endif
-    total_time += timing[i];
-    total_memory += memory;
-  }
-#ifdef __MACH__
-  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
-          "Total", double(total_time) * 1e-3, total_memory / (1024*1024));
-#else
-  fprintf(stdout,"  %50s: %10.3lf ms %12ld MB\n",
-          "Total", double(total_time) * 1e-3, total_memory / 1024);
-#endif
-}
-
 int main(int argc, char **argv)
 {
   Weft weft(argc, argv);
diff --git a/src/weft.h b/src/weft.h
index 2717862..a9ca244 100644
--- a/src/weft.h
+++ b/src/weft.h
@@ -39,8 +39,7 @@ enum {
   WEFT_SUCCESS,
   WEFT_ERROR_NO_FILE_NAME,
   WEFT_ERROR_FILE_OPEN,
-  WEFT_ERROR_MULTIPLE_KERNELS,
-  WEFT_ERROR_THREAD_COUNT_MISMATCH,
+  WEFT_ERROR_NO_KERNELS,
   WEFT_ERROR_NO_THREAD_COUNT,
   WEFT_ERROR_ARRIVAL_MISMATCH,
   WEFT_ERROR_TOO_MANY_PARTICIPANTS,
@@ -208,32 +207,27 @@ class Weft {
   void verify(void);
   void report_error(int error_code, const char *message);
   inline bool report_warnings(void) const { return warnings; }
-  inline int thread_count(void) const { return max_num_threads; }
-  inline int barrier_upper_bound(void) const { return max_num_barriers; }
   inline bool print_verbose(void) const { return verbose; }
   inline bool print_detail(void) const { return detailed; }
-  inline bool assume_warp_synchronous(void) const { return warp_synchronous; }
+  inline bool perform_instrumentation(void) const { return instrument; }
+  inline bool emit_program_files(void) const { return print_files; }
 protected:
   void parse_inputs(int argc, char **argv);
   bool parse_triple(const std::string &input, int *array,
                     const char *flag, const char *error_str);
   void report_usage(int error, const char *error_str);
-  void parse_ptx(void);
-  void emulate_threads(void); 
-  void construct_dependence_graph(void);
-  void compute_happens_relationships(void);
-  void check_for_race_conditions(void);
-  void print_statistics(void);
-  int count_dynamic_instructions(void);
-  int count_weft_statements(void);
-public:
-  void fill_block_dim(int *array);
-  void fill_block_id(int *array);
-  void fill_grid_dim(int *array);
-  void get_file_prefix(char *buffer, size_t count);
+  Program* parse_ptx(void);
+public:
+  void initialize_program(int *block_dim, int *block_id, 
+                          int *grid_dim, bool &warp_synchronous);
+  void start_parsing_instrumentation(void);
+  void stop_parsing_instrumentation(void);
+protected:
+  void report_instrumentation(void);
 protected:
   void start_threadpool(void);
   void stop_threadpool(void);
+public:
   void initialize_count(unsigned count);
   void wait_until_done(void);
 public:
@@ -244,30 +238,19 @@ class Weft {
   static void* worker_loop(void *arg);
   static unsigned long long get_current_time_in_micros(void);
   static size_t get_memory_usage(void);
-protected:
-  void start_instrumentation(int stage);
-  void stop_instrumentation(int stage);
-  void report_instrumentation(void);
 protected:
   const char *file_name;
   int block_dim[3]; // x, y, z
   int block_id[3]; // x, y, z
   int grid_dim[3]; // x, y, z
-  int max_num_threads;
   int thread_pool_size;
-  int max_num_barriers;
   bool verbose;
   bool detailed;
   bool instrument;
   bool warnings;
   bool warp_synchronous;
   bool print_files;
-protected:
-  Program *program;
-  std::vector<Thread*> threads;
-  SharedMemory *shared_memory;
-protected:
-  BarrierDependenceGraph *graph;
+  std::vector<Program*> programs;
 protected:
   pthread_t *worker_threads;
   bool threadpool_finished;
@@ -280,9 +263,8 @@ class Weft {
   pthread_cond_t queue_cond;
   std::deque<WeftTask*> queue;
 protected:
-  // Instrumentation
-  unsigned long long timing[5];
-  size_t memory_usage[5];
+  unsigned long long parsing_time;
+  size_t parsing_memory;
 };
 
 #endif // __WEFT_H__