|
# Clean hardware-event counter samples without computing deltas: any counter
# value that decreases (a negative increment) is removed and refilled by linear
# interpolation; this applies to the hardware events and to the CPU-cycle counter.
# Negative deltas are filled from the previous values; the INST_RETIRED events are removed.
# Special case: miniQMC and QMCPACK runs use only 8 ranks per node,
# and the Power directory does not have instruction-related events.
# The largest value among the 32 ranks is discarded before averaging.
| 6 | +import sys |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | + |
| 10 | + |
def clean_event_from_file(filename, savefile):
    """Clean per-rank hardware counter samples and write events per CPU cycle.

    Reads the CSV ``filename``, orders the samples by node (``component_id``)
    and time (``#Time``), removes unusable samples, normalizes every hardware
    event by the CPU-cycle counter of the same rank, averages over the ranks
    (leaving out the largest value) and then over the nodes, and writes one
    column per event to ``savefile``.

    Args:
        filename: Path of the input CSV. The path itself selects behavior:
            ``"QMC"`` in the path means 8 ranks per node (otherwise 32);
            unless the path contains ``"Instruction"`` or ``"Power"`` the
            ``INST_RETIRED`` events are dropped.
        savefile: Path of the CSV to write (no index column).

    Raises:
        ValueError: If an expected event (``Pid``, the CPU-cycle event, or an
            ``INST_RETIRED`` event that should be dropped) is not in the file.
        KeyError: If a required column is missing.

    Notes:
        Counter columns are located by position: after the ``Pid`` (and
        optionally ``INST_RETIRED``) columns are dropped, the first 12 columns
        are treated as metadata and the next ``rank`` columns as the CPU-cycle
        counters. NOTE(review): this layout is assumed, not checked.

        All "previous sample" comparisons and the interpolation are done
        within a node. Doing them on the whole sorted frame would compare the
        first sample of a node with the last sample of the previous node and
        interpolate between unrelated nodes.
    """
    cycle_event = 'CPU_CLK_THREAD_UNHALTED:THREAD_P'
    n_meta = 12  # leading non-counter columns (see Notes)
    rank = 8 if "QMC" in filename else 32

    raw = pd.read_csv(filename, header=0)
    # Sort each node's samples by time.
    X = raw.sort_values(['component_id', '#Time'])
    X.reset_index(drop=True, inplace=True)

    # Event names are recovered from the columns of hardware thread 63
    # ("<event>63"); columns ending in "_63" are not events.
    event = [col[:-2] for col in X.columns
             if col.endswith('63') and not col.endswith('_63')]
    event.remove('Pid')
    X = X.drop([col for col in X.columns if col.startswith("Pid")], axis=1)

    # The cycle counter is the normalizer, not an output event.
    event.remove(cycle_event)
    # Only the Instruction_* groups need the INST_RETIRED events, and the
    # Power directory does not contain them at all.
    if "Instruction" not in filename and "Power" not in filename:
        event.remove('INST_RETIRED:ANY_P')
        event.remove('INST_RETIRED:ALL')
        X = X.drop([col for col in X.columns
                    if col.startswith("INST_RETIRED")], axis=1)

    # Drop the last sample of every node (the original author found it
    # unreliable).
    is_last_of_node = ~X['component_id'].duplicated(keep='last')
    X_clean = X[~is_last_of_node].copy()

    # Remove samples where a CPU-cycle counter did not advance since the
    # previous sample of the same node, plus any row that contains NaN.
    before_length = X_clean.shape[0]
    node = X_clean['component_id']
    cycles = X_clean[list(X_clean.columns[n_meta:n_meta + rank])]
    previous_cycles = cycles.groupby(node, sort=False).shift(1)
    stalled = cycles.eq(previous_cycles).any(axis=1)
    X_clean = X_clean[~stalled].dropna(axis=0, how='any').copy()
    after_length = X_clean.shape[0]

    # Guard the ratio so an input without usable rows does not crash here.
    dropped = (before_length - after_length) / before_length if before_length else 0.0
    print(filename + " delete cpu cyle=0 rate: {:.2f}".format(dropped))

    # A cumulative counter must not decrease: blank out values smaller than
    # the previous sample of the same node and refill them by linear
    # interpolation inside that node.
    node = X_clean['component_id']
    counter_cols = list(X_clean.columns[n_meta:])
    counters = X_clean[counter_cols]
    decreased = counters.lt(counters.groupby(node, sort=False).shift(1))
    counters = counters.mask(decreased).replace([np.inf, -np.inf], np.nan)
    counters = counters.groupby(node, sort=False).transform(
        lambda group: group.interpolate(method='linear'))
    X_clean[counter_cols] = counters

    # Normalize each event by the cycle counter of the same rank, then
    # average over the ranks while leaving out the largest value (spike).
    averaged = {}
    for e in event:
        ratios = pd.DataFrame({
            i: X_clean[e + str(i)].div(X_clean[cycle_event + str(i)])
            for i in range(rank)
        })
        averaged[e] = (ratios.sum(axis=1) - ratios.max(axis=1)) / (rank - 1)
    per_node = pd.DataFrame(averaged, index=X_clean.index, columns=event)

    # Average the nodes sample-by-sample: the k-th remaining sample of every
    # node contributes to output row k (missing values are skipped).
    position = X_clean.groupby('component_id', sort=False).cumcount()
    data_clean = per_node.groupby(position).mean()

    # data_clean holds the final events per CPU cycle.
    data_clean.to_csv(savefile, index=False)
| 80 | + |
| 81 | + |
if __name__ == "__main__":
    # Command-line entry point: <input_csv> <output_csv>.
    args = sys.argv[1:]
    if len(args) < 2:
        # Fail with a usage message instead of a bare IndexError.
        sys.exit("usage: {} <input_csv> <output_csv>".format(sys.argv[0]))
    clean_event_from_file(args[0], args[1])
0 commit comments