add accumulate files

meditates · meditates · commit 61b29d52fe4a · 2022-04-05T13:09:51.000-04:00
diff --git a/dataprocess/average128accu.py b/dataprocess/average128accu.py
@@ -0,0 +1,84 @@
+# No delta is computed: only decreasing values of the hardware events and CPU cycles are replaced by linear interpolation.
+# Use the previous value to fill the negative delta; remove the inst_retired events.
+# Special cases: MiniQMC and QMCPack run only 8 ranks on one node,
+# and the Power directory does not have instruction-related events.
+# Delete the biggest value among the 32 ranks and then average.
+import sys
+import numpy as np
+import pandas as pd 
+
+
+def clean_event_from_file(filename, savefile):
+    if "QMC" in filename:
+        rank=8
+    else:
+        rank=32
+
+    Xp = pd.read_csv(filename,header = 0)
+    #sort each node (4 nodes in each file) by time
+    X = Xp.sort_values(['component_id','#Time'])
+    X.reset_index(drop=True,inplace=True)
+
+    # get the hardware events in the file
+    event = [col[:-2] for col in X.columns if (col.endswith('63')and not col.endswith('_63'))]
+    event.remove('Pid')
+    X.drop([column for column in X.columns if column.startswith("Pid")],axis = 1, inplace=True)
+
+    event.remove('CPU_CLK_THREAD_UNHALTED:THREAD_P')
+    # only two groups Instruction_Cache and Instruction_Mix need the inst_retired events
+    # directory Power donot have instruction related events
+    if "Instruction" not in filename and "Power" not in filename:
+        event.remove('INST_RETIRED:ANY_P')
+        event.remove('INST_RETIRED:ALL')
+        X.drop([column for column in X.columns if column.startswith("INST_RETIRED")],axis = 1, inplace=True)
+
+    X_clean = X.copy()
+    #12 calculate delta start from the first events as well as the cpu cycle
+ 
+    # we need to delete the last row of each node because it's strange
+    for id in X['component_id'].unique():
+        index = X_clean.index[X_clean['component_id'] == id][-1:] 
+        X_clean.drop(index, axis=0, inplace=True)
+
+    # remove the ZERO CPU CYCLE
+    before_lenth=X_clean.shape[0]
+    #12+32, 32 columns is cpu cycles
+    X_clean[X_clean.iloc[:, 12:12+rank]==X_clean.iloc[:, 12:12+rank].shift(+1)]=np.nan
+    X_clean=X_clean.dropna(axis=0,how='any')
+
+    after_lenth=X_clean.shape[0]
+    print(filename +" delete cpu cyle=0 rate: {:.2f}".format((before_lenth-after_lenth)/before_lenth))
+
+    # remove the smaller value with the linear interpolate value   
+    X_clean[( X_clean.iloc[:, 12:] <  X_clean.iloc[:, 12:].shift(+1))]=np.nan
+    X_clean = X_clean.replace([np.inf, -np.inf], np.nan)
+    X_clean.iloc[:, 12:]=X_clean.iloc[:, 12:].interpolate( method = 'linear')
+
+    # normalize by cpu cycle 
+    for e in event:
+        for i in range(rank):
+            X_clean[e+str(i)] = X_clean[e+str(i)].div(X_clean['CPU_CLK_THREAD_UNHALTED:THREAD_P'+str(i)], axis=0)
+  
+    # average of 32 process on one rank,delete the biggest values (spike)
+    for e in event:
+        section = X_clean[[e+str(i) for i in range(rank)]]
+        X_clean[e]= (section.sum(axis=1)-section.max(axis=1))/(rank-1)
+
+
+    data_clean= pd.DataFrame(columns=event)
+    for e in event: 
+        p = pd.DataFrame()
+        for id in X_clean['component_id'].unique():
+            q = X_clean[X_clean["component_id"]==id][e].reset_index(drop=True)
+            p = pd.concat((p,q ),axis=1)
+        data_clean[e] = p.mean(axis=1)
+        
+    # data_delta is the final events/cpu cycle
+    data_clean.to_csv(savefile,index=False)
+
+    return 
+
+
+if __name__ == "__main__":
+    args = sys.argv[1:]
+    clean_event_from_file(args[0],args[1])
diff --git a/dataprocess/average5new.py b/dataprocess/average5new.py
@@ -0,0 +1,24 @@
+import sys
+import os
+import numpy as np
+import pandas as pd 
+
+def average_5file(dirname,appname):
+# get the mean of last 5 steps of 5 trials 
+	files = os.listdir(dirname)
+
+	X = pd.DataFrame()
+	for i,f in enumerate(files):
+	    if f.startswith(appname+"_"):
+	        X1 = pd.read_csv(dirname+'/'+f,header = 0)
+	        p = X1.iloc[-5:].mean(axis=0)
+	        X = X.append(p,ignore_index=True)
+	Y = X.mean(axis=0)
+	Y.to_csv("output/"+appname+"_"+dirname+'.csv')
+   
+	return
+
+
+if __name__ == "__main__":
+    args = sys.argv[1:]
+    average_5file(args[0],args[1])
diff --git a/dataprocess/buildcsv.py b/dataprocess/buildcsv.py
@@ -0,0 +1,52 @@
+import sys
+import os
+import numpy as np
+import pandas as pd
+def build_all_csv():
+    allapps = pd.DataFrame()
+    files = os.listdir("./csv")
+    for f in files:
+        X1 = pd.read_csv("./csv/"+f,header=0,index_col=0)
+        allapps = pd.concat([allapps, X1], axis=1)
+    allapps = allapps.loc[:,~allapps.columns.duplicated(keep='first')]    
+    allapps.to_csv("./csv/SKX_allapps.csv")
+
+def build_csv(groupname):
+    allapps = pd.DataFrame()
+    for app in appnames:
+        X = pd.DataFrame()
+        for i,f in enumerate(files):
+            if f.startswith(app+"_") and groupname in f:
+                X1 = pd.read_csv("./output/"+f,header=None, index_col = 0)
+                X = X.append(X1)
+        # drop the duplicated event name
+        X = X[~X.index.duplicated(keep='first')]
+        X = X.rename(columns={1: app})
+        allapps = allapps.append(X.T)
+    allapps.to_csv("./csv/SKX_"+groupname+".csv")
+
+def build_memory_csv():
+    allapps = pd.DataFrame()
+    for app in appnames:
+        X = pd.DataFrame()
+        for i,f in enumerate(files):
+            if f.startswith(app+"_") and "Memory" in f and "Memory_" not in f:
+                X1 = pd.read_csv("./output/"+f,header=None, index_col = 0)
+                X = X.append(X1)
+        # drop the duplicated event name
+        X = X[~X.index.duplicated(keep='first')]
+        X = X.rename(columns={1: app})
+        allapps = allapps.append(X.T)
+    allapps.to_csv("./csv/SKX_Memory.csv")
+
+
+if __name__ == "__main__":
+    files = os.listdir("./output")  # `files` and `appnames` are module globals read by the build functions
+    appnames = ["ExaMiniMD", "LAMMPS", "sw4lite", "sw4", "SWFFT", "HACC", "MiniQMC", "QMCPack", "miniVite", "vite", "Nekbone", "Nek5000", "XSBench", "openmc", "picsarlite", "picsar", "amg2013", "Castro", "Laghos", "pennant", "snap", "hpcc_dgemm", "hpcc_random", "hpcc_streams", "hpcg"]
+    groups = ["Branch","DecodeIssue_Pipeline","Dispatch_Pipeline","Execution_Pipeline",
+    "Frontend","Instruction_Cache","Instruction_Mix","L1_D_Cache",
+    "L2_D_Cache","L3_D_Cache","Memory_Pipeline","Misc","Power","Retirement_Pipeline"]
+    for group in groups:
+        build_csv(group)  # writes ./csv/SKX_<group>.csv
+    build_memory_csv()  # "Memory" is built apart so that "Memory_" (e.g. Memory_Pipeline) files are excluded
+    build_all_csv()  # last: merges whatever is in ./csv at this point
diff --git a/dataprocess/run.sh b/dataprocess/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Pipeline driver: clean every raw SKX_M* trace directory, average the cleaned files
+# per application, then build the summary CSVs.
+#
+# Globs replace the parsing of `ls` output and every expansion is quoted, so names with
+# spaces or glob characters stay intact; `mkdir -p` lets the script be re-run.
+
+path=/research/file_system_traces/attaway_run_Cosine_08132021/
+# Step 1: one local directory of cleaned files per SKX_M* entry under $path.
+for srcdir in "$path"SKX_M*
+do
+    [[ -e $srcdir ]] || continue    # the glob matched nothing
+    dirname=${srcdir##*/}
+    mkdir -p "$dirname"
+    echo "$dirname" >> pathdirs.txt
+
+    for src in "$srcdir"/*
+    do
+        [[ -e $src ]] || continue
+        file=${src##*/}
+        # the cleaned file is named after the input without its first 7 characters
+        python average128accu.py "$src" "$dirname/${file:7}" >> output112.txt
+    done
+done
+
+# Step 2: average the cleaned files of every application in every SKX* entry here.
+mkdir -p output
+appnames="ExaMiniMD LAMMPS sw4lite sw4 SWFFT HACC MiniQMC QMCPack miniVite vite Nekbone Nek5000 XSBench openmc picsarlite picsar amg2013 Castro Laghos pennant snap hpcc_dgemm hpcc_random hpcc_streams hpcg "
+for dirname in SKX*
+do
+    [[ -e $dirname ]] || continue
+    for app in $appnames    # unquoted on purpose: split the list on whitespace
+    do
+        python average5new.py "$dirname" "$app"
+    done
+done
+
+# Step 3: build the CSV tables.
+mkdir -p csv
+python buildcsv.py