Commit

update benchmark infer
MrChengmo committed Jan 25, 2021
1 parent ee1caad commit 412521c
Showing 3 changed files with 48 additions and 21 deletions.
17 changes: 13 additions & 4 deletions models/recall/word2vec/benchmark/w2v_infer.py
@@ -86,22 +86,26 @@ def _load_emb(var):
def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
    """ inference function """
    epoch_model_path_list = []
    epoch_model_name_list = []

    for file in os.listdir(model_dir):
        file_path = os.path.join(model_dir, file)
        # hard code for epoch model folder
        if os.path.isdir(file_path) and is_number(file):
            epoch_model_path_list.append(file_path)
            epoch_model_name_list.append(file)

    if len(epoch_model_path_list) == 0:
        return
    epoch_model_path_list = sorted(epoch_model_path_list)
print("Save model len {}".format(len(epoch_model_path_list)))

place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
emb_size = args.emb_size
batch_size = args.batch_size
result_dict = collections.OrderedDict()

result_dict = {}
result_dict["result"] = {}

with fluid.scope_guard(fluid.Scope()):
main_program = fluid.Program()
with fluid.program_guard(main_program):
@@ -153,7 +157,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
print("model: {} \t acc: {} ".format(
model_path, 1.0 * accum_num / accum_num_sum))
epoch_acc = 1.0 * accum_num / accum_num_sum
result_dict[epoch] = epoch_acc
epoch_name = model_path.split("/")[-1]
result_dict["result"][epoch_name] = epoch_acc

print("infer_result_dict: {}".format(result_dict))
with open("./infer_result_dict.txt", 'w+') as f:
f.write(str(result_dict))


def BuildWord_IdMap(dict_path):
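For readers following the w2v_infer.py hunks above, here is a minimal, self-contained sketch of the recording pattern they implement: numeric epoch folders are discovered under a model directory, sorted, and each folder's metric is stored under result_dict["result"] before the whole dict is dumped to a text file. The isdigit-based is_number and the evaluate callback below are illustrative stand-ins, not the repository's actual helpers.

import os


def is_number(name):
    # Stand-in for the repo helper: treat pure-digit folder names as epoch ids.
    return name.isdigit()


def collect_epoch_models(model_dir):
    # Gather the per-epoch model folders (named "0", "1", ...) and sort them
    # numerically; list.sort() works in place and returns None, so its result
    # must never be assigned back to the variable.
    paths = [
        os.path.join(model_dir, name) for name in os.listdir(model_dir)
        if os.path.isdir(os.path.join(model_dir, name)) and is_number(name)
    ]
    paths.sort(key=lambda p: int(os.path.basename(p)))
    return paths


def record_results(model_dir, evaluate, out_path="./infer_result_dict.txt"):
    # Map each epoch folder name to its metric and dump the dict as text,
    # mirroring result_dict["result"][epoch_name] = epoch_acc above.
    result_dict = {"result": {}}
    for model_path in collect_epoch_models(model_dir):
        epoch_name = os.path.basename(model_path)  # same as split("/")[-1]
        result_dict["result"][epoch_name] = evaluate(model_path)
    with open(out_path, 'w+') as f:
        f.write(str(result_dict))
    return result_dict

Sorting with key=int avoids the lexicographic trap where "10" would sort before "2".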
21 changes: 15 additions & 6 deletions tools/static_ps_trainer.py
@@ -13,6 +13,9 @@
# limitations under the License.

from __future__ import print_function
from utils.static_ps.reader_helper import get_reader, get_example_num, get_file_list, get_word_num
from utils.static_ps.program_helper import get_model, get_strategy
from utils.static_ps.common import YamlHelper, is_distributed_env
import argparse
import time
import sys
@@ -25,9 +28,6 @@

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
from utils.static_ps.common import YamlHelper, is_distributed_env
from utils.static_ps.program_helper import get_model, get_strategy
from utils.static_ps.reader_helper import get_reader, get_example_num, get_file_list, get_word_num

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
@@ -59,6 +59,8 @@ def __init__(self, config):
        self.input_data = None
        self.reader = None
        self.exe = None
        self.train_result_dict = {}
        self.train_result_dict["speed"] = []

    def run(self):
        fleet.init()
@@ -68,6 +70,7 @@ def run(self):
        elif fleet.is_worker():
            self.run_worker()
            fleet.stop_worker()
            self.record_result()
        logger.info("Run Success, Exit.")

    def network(self):
@@ -125,6 +128,7 @@ def run_worker(self):
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker(
@@ -239,9 +243,9 @@ def recdataset_train_loop(self, epoch):
" avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} {}/sec".
format(train_reader_cost / print_interval, (
train_reader_cost + train_run_cost) / print_interval,
total_samples / print_interval, total_samples / (
train_reader_cost + train_run_cost),
self.count_method))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
@@ -289,6 +293,11 @@ def heter_train_loop(self, epoch):
                self.reader.reset()
                break

    def record_result(self):
        logger.info("train_result_dict: {}".format(self.train_result_dict))
        with open("./train_result_dict.txt", 'w+') as f:
            f.write(str(self.train_result_dict))


if __name__ == "__main__":
    paddle.enable_static()
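A small sketch, under stated assumptions, of the throughput bookkeeping the trainer hunks above add: each epoch's ips (examples per second) is appended to train_result_dict["speed"], and record_result() writes the dict to train_result_dict.txt when the worker finishes. The train_one_epoch callback and example_num argument are placeholders, not PaddleRec APIs.

import time


class SpeedRecorder:
    """Minimal stand-in for the trainer's train_result_dict bookkeeping."""

    def __init__(self):
        self.train_result_dict = {"speed": []}

    def run_epoch(self, epoch, train_one_epoch, example_num):
        begin = time.time()
        train_one_epoch(epoch)  # caller-supplied training step for one epoch
        epoch_time = time.time() - begin
        epoch_speed = example_num / epoch_time  # ips, examples per second
        self.train_result_dict["speed"].append(epoch_speed)
        return epoch_speed

    def record_result(self, path="./train_result_dict.txt"):
        # Dump the accumulated speeds as plain text, as the trainer does.
        with open(path, 'w+') as f:
            f.write(str(self.train_result_dict))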
31 changes: 20 additions & 11 deletions tools/utils/static_ps/infer.py
@@ -13,6 +13,9 @@
# limitations under the License.

from __future__ import print_function
from reader_helper import get_reader, get_infer_reader, get_example_num, get_file_list, get_word_num
from program_helper import get_model, get_strategy
from common import YamlHelper, is_number
import os
import numpy as np
import warnings
@@ -26,9 +29,6 @@

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
from common import YamlHelper, is_number
from program_helper import get_model, get_strategy
from reader_helper import get_reader, get_infer_reader, get_example_num, get_file_list, get_word_num

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
@@ -61,7 +61,8 @@ def __init__(self, config):
        self.reader = None
        self.exe = None
        self.epoch_model_path_list = []
        self.epoch_model_name_list = []
        self.infer_result_dict = {}
        self.infer_result_dict["result"] = {}

    def run(self):
        self.network()
@@ -76,19 +77,20 @@ def run(self):
            # hard code for epoch model folder
            if os.path.isdir(file_path) and is_number(file):
                self.epoch_model_path_list.append(file_path)
                self.epoch_model_name_list.append(file)

        if len(self.epoch_model_path_list) == 0:
            self.epoch_model_path_list.append(init_model_path)
            self.epoch_model_name_list.append(init_model_path)

        self.epoch_model_path_list.sort()
        self.epoch_model_name_list.sort()
        self.epoch_model_path_list = sorted(self.epoch_model_path_list)

        for idx, model_path in enumerate(self.epoch_model_path_list):
            logger.info("Begin Infer Model {}".format(
                self.epoch_model_name_list[idx]))
            self.run_infer(model_path, self.epoch_model_name_list[idx])
                self.epoch_model_path_list[idx]))
            model_name = model_path.split("/")[-1]
            infer_res = self.run_infer(model_path, model_name)
            self.infer_result_dict["result"][model_name] = infer_res

        self.record_result()
        logger.info("Run Success, Exit.")

    def network(self):
Expand All @@ -102,7 +104,7 @@ def run_infer(self, model_path, model_name):
                dirname=model_path, executor=self.exe))

        self.reset_auc()

        infer_res = []
        for batch_id, data in enumerate(self.reader()):
            results = self.exe.run(inference_program,
                                   feed=data,
@@ -114,8 +116,10 @@
            for var_idx, var_name in enumerate(results):
                metrics_string += "Infer res: {}, ".format(results[
                    var_idx])
                infer_res.append(results[var_idx])
            logger.info("Model: {}, Batch: {}, {}".format(
                model_name, batch_id, metrics_string))
        return np.mean(infer_res)

    def init_reader(self):
        self.reader, self.file_list = get_infer_reader(self.input_data, config)
@@ -146,6 +150,11 @@ def reset_auc(self):
            tensor.set(tensor_array, paddle.CPUPlace())
            logger.info("AUC Reset To Zero: {}".format(name))

    def record_result(self):
        logger.info("infer_result_dict: {}".format(self.infer_result_dict))
        with open("./infer_result_dict.txt", 'w+') as f:
            f.write(str(self.infer_result_dict))


if __name__ == "__main__":
    paddle.enable_static()
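Finally, a hedged sketch of the aggregation that infer.py's new return value enables: per-batch metrics are collected, their mean becomes the entry for that saved model in infer_result_dict["result"], and the dict is written to infer_result_dict.txt. The run_one_batch callback below stands in for the exe.run fetches; it is not a Paddle API.

import numpy as np


def infer_one_model(batches, run_one_batch):
    # Average a per-batch metric over all batches for one saved model,
    # mirroring infer_res.append(...) followed by np.mean(infer_res) above.
    infer_res = [run_one_batch(data) for data in batches]
    return float(np.mean(infer_res))


def infer_all_models(model_paths, batches, run_one_batch,
                     out_path="./infer_result_dict.txt"):
    # Key each result by the epoch folder name, as model_path.split("/")[-1]
    # does in the hunks above, then dump the dict as plain text.
    infer_result_dict = {"result": {}}
    for model_path in model_paths:
        model_name = model_path.rstrip("/").split("/")[-1]
        infer_result_dict["result"][model_name] = infer_one_model(
            batches, run_one_batch)
    with open(out_path, 'w+') as f:
        f.write(str(infer_result_dict))
    return infer_result_dict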
