Skip to content

Commit 7617ad0

Browse files
committed
[rdc] with '-rdc true' ggtt has the same performance in cuda, but uses fewer registers (170 instead of 172)
1 parent da08e56 commit 7617ad0

File tree

1 file changed

+44
-44
lines changed

1 file changed

+44
-44
lines changed

epochX/cudacpp/tput/logs_ggtt_manu/log_ggtt_manu_d_inl0_hrd0.txt

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -68,23 +68,23 @@ make[1]: Entering directory `/data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp
6868
make[1]: Nothing to be done for `all.512z_d_inl0_hrd0_hasCurand'.
6969
make[1]: Leaving directory `/data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx'
7070

71-
DATE: 2022-01-11_20:54:44
71+
DATE: 2022-01-17_16:39:57
7272

7373
On itscrd70.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
7474
=========================================================================
7575
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
7676
Process = SIGMA_SM_GG_TTX_CUDA [nvcc 11.1.105 (gcc 10.2.0)] [inlineHel=0] [hardcodeCIPC=0]
7777
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAREF
7878
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
79-
EvtsPerSec[Rmb+ME] (23) = ( 4.474940e+07 ) sec^-1
80-
EvtsPerSec[MatrixElems] (3) = ( 1.285687e+08 ) sec^-1
81-
EvtsPerSec[MECalcOnly] (3a) = ( 1.428806e+08 ) sec^-1
79+
EvtsPerSec[Rmb+ME] (23) = ( 4.102362e+07 ) sec^-1
80+
EvtsPerSec[MatrixElems] (3) = ( 1.254413e+08 ) sec^-1
81+
EvtsPerSec[MECalcOnly] (3a) = ( 1.421700e+08 ) sec^-1
8282
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
83-
TOTAL : 0.804412 sec
84-
1,241,084,391 cycles:u # 1.639 GHz
85-
2,415,012,227 instructions:u # 1.95 insn per cycle
86-
0.877208910 seconds time elapsed
87-
==PROF== Profiling "sigmaKin": launch__registers_per_thread 172
83+
TOTAL : 1.460359 sec
84+
1,750,368,254 cycles:u # 1.148 GHz
85+
3,462,065,234 instructions:u # 1.98 insn per cycle
86+
1.767628095 seconds time elapsed
87+
==PROF== Profiling "sigmaKin": launch__registers_per_thread 170
8888
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
8989
.........................................................................
9090
=========================================================================
@@ -93,14 +93,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 10.2.0] [inlineHel=0] [ha
9393
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAREF
9494
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
9595
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
96-
EvtsPerSec[Rmb+ME] (23) = ( 1.885695e+05 ) sec^-1
97-
EvtsPerSec[MatrixElems] (3) = ( 2.008159e+05 ) sec^-1
98-
EvtsPerSec[MECalcOnly] (3a) = ( 2.008159e+05 ) sec^-1
96+
EvtsPerSec[Rmb+ME] (23) = ( 1.889508e+05 ) sec^-1
97+
EvtsPerSec[MatrixElems] (3) = ( 2.010785e+05 ) sec^-1
98+
EvtsPerSec[MECalcOnly] (3a) = ( 2.010785e+05 ) sec^-1
9999
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
100-
TOTAL : 2.888760 sec
101-
7,553,516,361 cycles:u # 2.604 GHz
102-
22,020,957,578 instructions:u # 2.92 insn per cycle
103-
2.905195425 seconds time elapsed
100+
TOTAL : 2.883996 sec
101+
7,536,098,356 cycles:u # 2.601 GHz
102+
22,020,934,454 instructions:u # 2.92 insn per cycle
103+
2.901081838 seconds time elapsed
104104
=Symbols in CPPProcess.o= (~sse4: 456) (avx2: 0) (512y: 0) (512z: 0)
105105
-------------------------------------------------------------------------
106106
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
@@ -111,14 +111,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 10.2.0] [inlineHel=0] [ha
111111
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXREF
112112
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
113113
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
114-
EvtsPerSec[Rmb+ME] (23) = ( 2.859668e+05 ) sec^-1
115-
EvtsPerSec[MatrixElems] (3) = ( 3.155640e+05 ) sec^-1
116-
EvtsPerSec[MECalcOnly] (3a) = ( 3.155640e+05 ) sec^-1
114+
EvtsPerSec[Rmb+ME] (23) = ( 2.856574e+05 ) sec^-1
115+
EvtsPerSec[MatrixElems] (3) = ( 3.155833e+05 ) sec^-1
116+
EvtsPerSec[MECalcOnly] (3a) = ( 3.155833e+05 ) sec^-1
117117
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
118-
TOTAL : 1.939524 sec
119-
5,017,269,787 cycles:u # 2.569 GHz
120-
12,885,087,717 instructions:u # 2.57 insn per cycle
121-
1.955618653 seconds time elapsed
118+
TOTAL : 1.943074 sec
119+
5,020,219,382 cycles:u # 2.566 GHz
120+
12,885,064,423 instructions:u # 2.57 insn per cycle
121+
1.960308285 seconds time elapsed
122122
=Symbols in CPPProcess.o= (~sse4: 2353) (avx2: 0) (512y: 0) (512z: 0)
123123
-------------------------------------------------------------------------
124124
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
@@ -129,14 +129,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 10.2.0] [inlineHel=0] [ha
129129
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXREF
130130
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
131131
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
132-
EvtsPerSec[Rmb+ME] (23) = ( 4.837448e+05 ) sec^-1
133-
EvtsPerSec[MatrixElems] (3) = ( 5.708976e+05 ) sec^-1
134-
EvtsPerSec[MECalcOnly] (3a) = ( 5.708976e+05 ) sec^-1
132+
EvtsPerSec[Rmb+ME] (23) = ( 4.838378e+05 ) sec^-1
133+
EvtsPerSec[MatrixElems] (3) = ( 5.715632e+05 ) sec^-1
134+
EvtsPerSec[MECalcOnly] (3a) = ( 5.715632e+05 ) sec^-1
135135
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
136-
TOTAL : 1.191260 sec
137-
2,653,373,365 cycles:u # 2.204 GHz
138-
5,512,790,923 instructions:u # 2.08 insn per cycle
139-
1.207423210 seconds time elapsed
136+
TOTAL : 1.191525 sec
137+
2,654,274,999 cycles:u # 2.202 GHz
138+
5,512,768,344 instructions:u # 2.08 insn per cycle
139+
1.208573724 seconds time elapsed
140140
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2185) (512y: 0) (512z: 0)
141141
-------------------------------------------------------------------------
142142
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
@@ -147,14 +147,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 10.2.0] [inlineHel=0] [ha
147147
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXREF
148148
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
149149
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
150-
EvtsPerSec[Rmb+ME] (23) = ( 5.161384e+05 ) sec^-1
151-
EvtsPerSec[MatrixElems] (3) = ( 6.170124e+05 ) sec^-1
152-
EvtsPerSec[MECalcOnly] (3a) = ( 6.170124e+05 ) sec^-1
150+
EvtsPerSec[Rmb+ME] (23) = ( 5.161483e+05 ) sec^-1
151+
EvtsPerSec[MatrixElems] (3) = ( 6.175564e+05 ) sec^-1
152+
EvtsPerSec[MECalcOnly] (3a) = ( 6.175564e+05 ) sec^-1
153153
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
154-
TOTAL : 1.121818 sec
155-
2,500,360,700 cycles:u # 2.203 GHz
156-
5,346,587,691 instructions:u # 2.14 insn per cycle
157-
1.137968023 seconds time elapsed
154+
TOTAL : 1.123995 sec
155+
2,498,424,655 cycles:u # 2.196 GHz
156+
5,346,564,797 instructions:u # 2.14 insn per cycle
157+
1.141241097 seconds time elapsed
158158
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2024) (512y: 115) (512z: 0)
159159
-------------------------------------------------------------------------
160160
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
@@ -165,14 +165,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 10.2.0] [inlineHel=0] [ha
165165
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXREF
166166
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
167167
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
168-
EvtsPerSec[Rmb+ME] (23) = ( 3.361240e+05 ) sec^-1
169-
EvtsPerSec[MatrixElems] (3) = ( 3.763054e+05 ) sec^-1
170-
EvtsPerSec[MECalcOnly] (3a) = ( 3.763054e+05 ) sec^-1
168+
EvtsPerSec[Rmb+ME] (23) = ( 3.370261e+05 ) sec^-1
169+
EvtsPerSec[MatrixElems] (3) = ( 3.778267e+05 ) sec^-1
170+
EvtsPerSec[MECalcOnly] (3a) = ( 3.778267e+05 ) sec^-1
171171
MeanMatrixElemValue = ( 2.085623e+00 +- 4.835084e-03 ) GeV^0
172-
TOTAL : 1.668573 sec
173-
2,764,359,455 cycles:u # 1.644 GHz
174-
3,601,047,206 instructions:u # 1.30 insn per cycle
175-
1.689281266 seconds time elapsed
172+
TOTAL : 1.664563 sec
173+
2,760,255,519 cycles:u # 1.645 GHz
174+
3,601,024,707 instructions:u # 1.30 insn per cycle
175+
1.681761721 seconds time elapsed
176176
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1051) (512y: 85) (512z: 1591)
177177
-------------------------------------------------------------------------
178178
runExe /data/avalassi/GPU2020/madgraph4gpuX/epochX/cudacpp/gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe

0 commit comments

Comments
 (0)