Commit 61b29d5

add accumulate files
1 parent 57c7c6b commit 61b29d5

File tree

4 files changed, +200 -0 lines changed

dataprocess/average128accu.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# No delta computation: instead, remove negative increments in the hardware events
# and the CPU cycle counters and replace them with linearly interpolated values
# (use the previous value to fill a negative delta); remove the INST_RETIRED events.
# Special cases: MiniQMC and QMCPack run only 8 ranks on one node,
# and the Power directory has no instruction-related events.
# Drop the largest of the 32 per-rank values, then average the rest.
import sys

import numpy as np
import pandas as pd


def clean_event_from_file(filename, savefile):
    if "QMC" in filename:
        rank = 8
    else:
        rank = 32

    Xp = pd.read_csv(filename, header=0)
    # sort each node (4 nodes in each file) by time
    X = Xp.sort_values(['component_id', '#Time'])
    X.reset_index(drop=True, inplace=True)

    # get the hardware event names in the file: strip the two-digit suffix
    # from the per-rank columns ending in '63'
    event = [col[:-2] for col in X.columns if (col.endswith('63') and not col.endswith('_63'))]
    event.remove('Pid')
    X.drop([column for column in X.columns if column.startswith("Pid")], axis=1, inplace=True)

    event.remove('CPU_CLK_THREAD_UNHALTED:THREAD_P')
    # only the two groups Instruction_Cache and Instruction_Mix need the INST_RETIRED events;
    # the Power directory has no instruction-related events
    if "Instruction" not in filename and "Power" not in filename:
        event.remove('INST_RETIRED:ANY_P')
        event.remove('INST_RETIRED:ALL')
        X.drop([column for column in X.columns if column.startswith("INST_RETIRED")], axis=1, inplace=True)

    X_clean = X.copy()
    # event and CPU cycle columns start at column 12

    # drop the last row of each node because it is unreliable
    for id in X['component_id'].unique():
        index = X_clean.index[X_clean['component_id'] == id][-1:]
        X_clean.drop(index, axis=0, inplace=True)

    # remove rows whose CPU cycle counters did not advance (zero CPU cycles)
    before_length = X_clean.shape[0]
    # columns 12 to 12+rank hold the CPU cycle counters
    X_clean[X_clean.iloc[:, 12:12 + rank] == X_clean.iloc[:, 12:12 + rank].shift(+1)] = np.nan
    X_clean = X_clean.dropna(axis=0, how='any')

    after_length = X_clean.shape[0]
    print(filename + " delete cpu cycle=0 rate: {:.2f}".format((before_length - after_length) / before_length))

    # replace values that decreased (negative increments) with linearly interpolated values
    X_clean[(X_clean.iloc[:, 12:] < X_clean.iloc[:, 12:].shift(+1))] = np.nan
    X_clean = X_clean.replace([np.inf, -np.inf], np.nan)
    X_clean.iloc[:, 12:] = X_clean.iloc[:, 12:].interpolate(method='linear')

    # normalize each event by the CPU cycles of the same rank
    for e in event:
        for i in range(rank):
            X_clean[e + str(i)] = X_clean[e + str(i)].div(X_clean['CPU_CLK_THREAD_UNHALTED:THREAD_P' + str(i)], axis=0)

    # average over the ranks on one node, dropping the largest value (spike)
    for e in event:
        section = X_clean[[e + str(i) for i in range(rank)]]
        X_clean[e] = (section.sum(axis=1) - section.max(axis=1)) / (rank - 1)

    data_clean = pd.DataFrame(columns=event)
    for e in event:
        p = pd.DataFrame()
        for id in X_clean['component_id'].unique():
            q = X_clean[X_clean["component_id"] == id][e].reset_index(drop=True)
            p = pd.concat((p, q), axis=1)
        data_clean[e] = p.mean(axis=1)

    # data_clean holds the final events per CPU cycle
    data_clean.to_csv(savefile, index=False)

    return


if __name__ == "__main__":
    args = sys.argv[1:]
    clean_event_from_file(args[0], args[1])
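For illustration, a minimal sketch of the spike-removal rule used above (drop the largest of the per-rank values, then average the remaining ones), on hypothetical toy data:

import pandas as pd

# three toy samples of one event across 4 ranks (values are made up)
section = pd.DataFrame({'E0': [1.0, 2.0, 1.0],
                        'E1': [1.0, 2.0, 1.0],
                        'E2': [1.0, 2.0, 1.0],
                        'E3': [9.0, 2.0, 5.0]})   # rank 3 carries a spike
rank = 4
# same formula as in clean_event_from_file: drop the max, average the rest
avg = (section.sum(axis=1) - section.max(axis=1)) / (rank - 1)
print(avg.tolist())   # [1.0, 2.0, 1.0]: the spike does not bias the average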

dataprocess/average5new.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import sys
import os

import numpy as np
import pandas as pd


def average_5file(dirname, appname):
    # take the mean of the last 5 steps of each trial, then average over the 5 trials
    files = os.listdir(dirname)

    X = pd.DataFrame()
    for i, f in enumerate(files):
        if f.startswith(appname + "_"):
            X1 = pd.read_csv(dirname + '/' + f, header=0)
            p = X1.iloc[-5:].mean(axis=0)
            X = X.append(p, ignore_index=True)
    Y = X.mean(axis=0)
    Y.to_csv("output/" + appname + "_" + dirname + '.csv')

    return


if __name__ == "__main__":
    args = sys.argv[1:]
    average_5file(args[0], args[1])
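One portability note, not part of the commit: DataFrame.append (used here and in buildcsv.py below) was removed in pandas 2.0. A minimal sketch of an equivalent accumulation with pd.concat, reusing the same names as the function above:

import pandas as pd

rows = []
for f in files:                              # same filtering loop as above
    if f.startswith(appname + "_"):
        X1 = pd.read_csv(dirname + '/' + f, header=0)
        rows.append(X1.iloc[-5:].mean(axis=0))   # mean of the last 5 steps
X = pd.concat(rows, axis=1).T if rows else pd.DataFrame()
Y = X.mean(axis=0)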

dataprocess/buildcsv.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import sys
import os

import numpy as np
import pandas as pd


def build_all_csv():
    # concatenate all per-group CSVs into one table with every event column
    allapps = pd.DataFrame()
    files = os.listdir("./csv")
    for f in files:
        X1 = pd.read_csv("./csv/" + f, header=0, index_col=0)
        allapps = pd.concat([allapps, X1], axis=1)
    # drop event columns that appear in more than one group
    allapps = allapps.loc[:, ~allapps.columns.duplicated(keep='first')]
    allapps.to_csv("./csv/SKX_allapps.csv")


def build_csv(groupname):
    allapps = pd.DataFrame()
    for app in appnames:
        X = pd.DataFrame()
        for i, f in enumerate(files):
            if f.startswith(app + "_") and groupname in f:
                X1 = pd.read_csv("./output/" + f, header=None, index_col=0)
                X = X.append(X1)
        # drop the duplicated event names
        X = X[~X.index.duplicated(keep='first')]
        X = X.rename(columns={1: app})
        allapps = allapps.append(X.T)
    allapps.to_csv("./csv/SKX_" + groupname + ".csv")


def build_memory_csv():
    # select the Memory group while excluding Memory_Pipeline files
    allapps = pd.DataFrame()
    for app in appnames:
        X = pd.DataFrame()
        for i, f in enumerate(files):
            if f.startswith(app + "_") and "Memory" in f and "Memory_" not in f:
                X1 = pd.read_csv("./output/" + f, header=None, index_col=0)
                X = X.append(X1)
        # drop the duplicated event names
        X = X[~X.index.duplicated(keep='first')]
        X = X.rename(columns={1: app})
        allapps = allapps.append(X.T)
    allapps.to_csv("./csv/SKX_Memory.csv")


if __name__ == "__main__":
    files = os.listdir("./output")
    appnames = ["ExaMiniMD", "LAMMPS", "sw4lite", "sw4", "SWFFT", "HACC", "MiniQMC", "QMCPack",
                "miniVite", "vite", "Nekbone", "Nek5000", "XSBench", "openmc", "picsarlite",
                "picsar", "amg2013", "Castro", "Laghos", "pennant", "snap", "hpcc_dgemm",
                "hpcc_random", "hpcc_streams", "hpcg"]
    groups = ["Branch", "DecodeIssue_Pipeline", "Dispatch_Pipeline", "Execution_Pipeline",
              "Frontend", "Instruction_Cache", "Instruction_Mix", "L1_D_Cache",
              "L2_D_Cache", "L3_D_Cache", "Memory_Pipeline", "Misc", "Power", "Retirement_Pipeline"]
    for group in groups:
        build_csv(group)
    build_memory_csv()
    build_all_csv()
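As a side note, the ~X.index.duplicated(keep='first') filter used above keeps only the first row for each repeated event name. A small illustration on toy data (the event names here are hypothetical):

import pandas as pd

X = pd.DataFrame({1: [0.1, 0.2, 0.3]}, index=['BR_MISP', 'BR_MISP', 'L1_MISS'])
X = X[~X.index.duplicated(keep='first')]
print(X)   # keeps the first 'BR_MISP' row (0.1) and the 'L1_MISS' row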

dataprocess/run.sh

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
#!/bin/bash

# Stage 1: clean and accumulate the raw counter files in every SKX_M* measurement directory
path=/research/file_system_traces/attaway_run_Cosine_08132021/
pathdirs=$(ls $path)
for dirname in $pathdirs
do
    if [[ $dirname == SKX_M* ]]
    then
        mkdir $dirname
        echo $dirname >> pathdirs.txt

        filenames=$(ls $path$dirname)
        for file in $filenames
        do
            # ${file:7} strips the first 7 characters of the raw file name
            python average128accu.py $path$dirname"/"$file $dirname"/"${file:7} >> output112.txt
        done
    fi
done

# Stage 2: average the last 5 steps of the 5 trials for every application
mkdir output

path=./
pathdirs=$(ls $path)
appnames="ExaMiniMD LAMMPS sw4lite sw4 SWFFT HACC MiniQMC QMCPack miniVite vite Nekbone Nek5000 XSBench openmc picsarlite picsar amg2013 Castro Laghos pennant snap hpcc_dgemm hpcc_random hpcc_streams hpcg"

for dirname in $pathdirs
do
    if [[ $dirname == SKX* ]]
    then
        for app in $appnames
        do
            python average5new.py $dirname $app
        done
    fi
done

# Stage 3: assemble the per-group and combined CSV tables
mkdir csv
python buildcsv.py
