-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathexx_gs.f90
1234 lines (1227 loc) · 48.5 KB
/
exx_gs.f90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
SUBROUTINE exx_gs(nfi, c)
!=======================================================================================
! Code Version 1.0 (Princeton University, September 2014)
!=======================================================================================
! Note: From this code exx_potential is returned after multiplying mixing parameter exxalfa.
! Later the exx_potential is added with GGA potential in forces.f90.
! In the future, full exx_potential should be returned and the mixing parameter exxalfa
! should be multiplied in forces.f90.
!=======================================================================================
!
USE kinds, ONLY : DP
USE constants, ONLY : fpi
USE fft_base, ONLY : dffts, dfftp, dtgs
USE mp, ONLY : mp_barrier, mp_sum
USE mp_global, ONLY : nproc_image, me_image, root_image, intra_image_comm, intra_bgrp_comm, me_bgrp
USE parallel_include
USE io_global, ONLY : stdout
USE cell_base, ONLY : omega, ainv, h
USE cell_base, ONLY : ibrav
USE cell_base, ONLY : isotropic !True if volume option is chosen for cell_dofree
USE electrons_base, ONLY : nbsp, nbspx, nspin
USE gvecw, ONLY : ngw
USE wannier_module, ONLY : wfc
USE exx_module, ONLY : my_nbspx, my_nbsp, my_nxyz, index_my_nbsp,rk_of_obtl, lindex_of_obtl
USE exx_module, ONLY : selfv, pairv, pair_dist
USE exx_module, ONLY : exx_potential
USE exx_module, ONLY : n_exx, sc_fac ! EXX step and the performance scaling factor
USE exx_module, ONLY : coe_1st_derv, coeke, nord1, nord2, fornberg
USE exx_module, ONLY : thdtood, odtothd_in_sp, thdtood_in_sp
USE exx_module, ONLY : np_in_sp_s , np_in_sp_me_s , np_in_sp_p , np_in_sp_me_p
USE exx_module, ONLY : xx_in_sp,yy_in_sp,zz_in_sp,sc_xx_in_sp,sc_yy_in_sp,sc_zz_in_sp
USE energies, ONLY : exx
USE printout_base, ONLY : printout_base_open, printout_base_unit, printout_base_close
USE wannier_base, ONLY : neigh, dis_cutoff
USE mp_wave, ONLY : redistwfr
!
USE time_step, ONLY : tps !md time in picoseconds
USE io_global, ONLY : ionode !logical for I/O node
USE cp_main_variables, ONLY : iprint_stdout !print control
USE printout_base, ONLY : printout_base_close!close print unit
USE printout_base, ONLY : printout_base_open !open print unit
USE printout_base, ONLY : printout_base_unit !printout_base_unit
USE exx_module, ONLY : dexx_dh
USE exx_module, ONLY : exx_energy_cell_derivative
USE exx_module, ONLY : exxalfa
!
IMPLICIT NONE
COMPLEX(DP) c(ngw, nbspx) ! wave functions at time t
#if defined(__MPI)
!
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER ir, ip, i, j,nfi, ierr, nnrtot, nr1s,nr2s,nr3s
INTEGER nj_max, iunit
REAl(DP) sa1,a(3),ha, hb, hc
REAl(DP) hcub, centerx, centery, centerz
REAL(DP) middle(3,neigh/2)
REAL(DP) d_pair(neigh/2) ! pair distance
!
REAL(DP), ALLOCATABLE :: vpsil(:,:)
REAL(DP), ALLOCATABLE :: rhol(:),rho_in_sp(:),vl(:)
REAL(DP), ALLOCATABLE :: psi(:,:)
!
INTEGER iobtl, gindex_of_iobtl, irank, rk_of_obtl_trcv, rk_of_obtl_tbs
INTEGER obtl_tbs, lindex_obtl_tbs, obtl_trcv, lindex_obtl_trcv
REAL(DP) totalenergy, totalenergyg, tot_energy(nbsp)
REAL(DP) total_exx_derv(3,3), total_exx_derv_g(3,3)
REAL(DP) selfe, paire(neigh/2), &
self_dexx_dhab(3,3), pair_dexx_dhab(3,3,neigh/2)
!
INTEGER, allocatable :: isendreq(:),irecvreq(:,:)
INTEGER tran(3), proc, tmp_iobtl, me
!
INTEGER :: k,jj,ii,ia,ib,ic,my_var,my_var2,my_var3,i_fac,va,cgstep
INTEGER :: ndim,nogrp
INTEGER, ALLOCATABLE :: overlap(:,:),njj(:)
REAl(DP), ALLOCATABLE :: wannierc(:,:),wannierc_tmp(:,:)
REAl(DP), ALLOCATABLE :: psil(:),psil_pair_recv(:,:),psil_pair_send(:,:,:)
REAL(DP), ALLOCATABLE :: exx_tmp(:,:),exx_tmp3(:,:)
INTEGER, ALLOCATABLE :: sdispls(:), sendcount(:)
INTEGER, ALLOCATABLE :: rdispls(:), recvcount(:)
INTEGER, ALLOCATABLE :: sdispls1(:), sendcount1(:)
INTEGER, ALLOCATABLE :: rdispls1(:), recvcount1(:)
!
REAL(DP) :: Jim(3,3) ! jacobian [d/d x] [d/d a]
! |d/d y| = [J] |d/d b|
! [d/d z] [d/d c]
!
!=============================================================================================
!print *, 'entering exx_gs', 'n_exx', n_exx
!
CALL start_clock('exx_gs_setup')
!
! make processor index start from 1
!
me = me_image+1
!
! number of real space gird along each lattice parameter directions (a1, a2, a3)
!
nr1s=dfftp%nr1; nr2s=dfftp%nr2; nr3s=dfftp%nr3
!
! the length of each lattice parameters
!
a(1)=DSQRT(h(1,1)*h(1,1)+h(2,1)*h(2,1)+h(3,1)*h(3,1)) ! lattice 1
a(2)=DSQRT(h(1,2)*h(1,2)+h(2,2)*h(2,2)+h(3,2)*h(3,2)) ! lattice 2
a(3)=DSQRT(h(1,3)*h(1,3)+h(2,3)*h(2,3)+h(3,3)*h(3,3)) ! lattice 3
!
! grid spacing in each lattice parameters
!
ha = a(1) / DBLE(nr1s) !grid spacing in Lattice 1 direction
hb = a(2) / DBLE(nr2s) !grid spacing in Lattice 2 direction
hc = a(3) / DBLE(nr3s) !grid spacing in Lattice 3 direction
!
! total number of real space grid points in the global mesh
! and the corresponding volume elements for each grid point
!
nnrtot = nr1s * nr2s * nr3s
hcub = omega / DBLE(nnrtot) !nnrtot in parallel
!
! the x,y,z coordinates of the center of the box (gird center)
! NOTE: center of the box is set to grid point at int(nr1/2), int(nr2/2), int(nr3/2) for every cell
!
centerx = h(1,1)*DBLE(INT(nr1s/2))+h(1,2)*DBLE(INT(nr2s/2))+h(1,3)*DBLE(INT(nr3s/2)) ! r = h s
centery = h(2,1)*DBLE(INT(nr1s/2))+h(2,2)*DBLE(INT(nr2s/2))+h(2,3)*DBLE(INT(nr3s/2)) ! r = h s
centerz = h(3,1)*DBLE(INT(nr1s/2))+h(3,2)*DBLE(INT(nr2s/2))+h(3,3)*DBLE(INT(nr3s/2)) ! r = h s
!
! inverse volume
!
sa1 = 1.0_DP/omega
!
!========================================================================
! Compute distances between grid points and the center of the simulation cell in R space
! This part needs to be done once in constant volume simulation and
! needs to be done every step in variable cell simulationulations ...
!
DO i=1,np_in_sp_me_s
!
xx_in_sp(i)=h(1,1)*sc_xx_in_sp(i)+h(1,2)*sc_yy_in_sp(i)+h(1,3)*sc_zz_in_sp(i) ! r = h s
yy_in_sp(i)=h(2,1)*sc_xx_in_sp(i)+h(2,2)*sc_yy_in_sp(i)+h(2,3)*sc_zz_in_sp(i) ! r = h s
zz_in_sp(i)=h(3,1)*sc_xx_in_sp(i)+h(3,2)*sc_yy_in_sp(i)+h(3,3)*sc_zz_in_sp(i) ! r = h s
!
END DO
!========================================================================
! Compute coeke and renormalize
! This part needs to be done once in constant volume simulation and
! needs to be done every step in variable cell simulationulations ...
!
! get ha*d/da, ha^2*d^2/da^2 stencil and cross coefficients
! 1. for the finite difference coefficients, we follow B. Fornberg in
! Math. Comp. 51 (1988), 699-706
!
CALL fornberg(nord1, nord2,coe_1st_derv(:,1),coeke(:,1,1),coeke(:,1,2),ierr)
!
IF (ierr .ne. 0) THEN
WRITE(stdout,*) ' ERROR: Wrong parameter in CALL of Fornberg'
WRITE(stdout,*) ' STOP in exx_gs'
RETURN
END IF
!
!RENORMALIZE COEKES WITH RESPECT TO THE GRID SPACING
! first derivative coefficients
!
coe_1st_derv(:,3) = coe_1st_derv(:,1)/hc ! d/dc stencil
coe_1st_derv(:,2) = coe_1st_derv(:,1)/hb ! d/db stencil
coe_1st_derv(:,1) = coe_1st_derv(:,1)/ha ! d/da stencil
!
! NOTE: in the second derivatives there is a additional factor of
! -4*pi because we merege that from the Poisson equation
!
! \nabla^2 V = -4*\pi \rho
!
! axial derivatives
!
coeke(:,3,3) = -coeke(:,1,1)/(hc*hc*fpi) ! -d^2/dc^2/4pi stencil
coeke(:,2,2) = -coeke(:,1,1)/(hb*hb*fpi) ! -d^2/db^2/4pi stencil
coeke(:,1,1) = -coeke(:,1,1)/(ha*ha*fpi) ! -d^2/da^2/4pi stencil
!
! cross derivatives
!
coeke(:,2,3) = -coeke(:,1,2)/(hb*hc*fpi) ! -d^2/dbdc/4pi stencil
coeke(:,1,3) = -coeke(:,1,2)/(ha*hc*fpi) ! -d^2/dadc/4pi stencil
coeke(:,1,2) = -coeke(:,1,2)/(ha*hb*fpi) ! -d^2/dadb/4pi stencil
!
! -- Jacobian for the general (non-orthogonal) --
! please see the following reference for details <todo: EXX paper>
!
! J = transpose(ainv).(diag(a))
!
Jim(:,1) = ainv(1,:)*a(1)
Jim(:,2) = ainv(2,:)*a(2) ! i={xyz}, m={abc}
Jim(:,3) = ainv(3,:)*a(3)
!
! -- weigh coeke with the Jacobian --
!
! axial derivatives
!
coeke(:,3,3) = (Jim(1,3)**2+Jim(2,3)**2+Jim(3,3)**2)*coeke(:,3,3)
coeke(:,2,2) = (Jim(1,2)**2+Jim(2,2)**2+Jim(3,2)**2)*coeke(:,2,2)
coeke(:,1,1) = (Jim(1,1)**2+Jim(2,1)**2+Jim(3,1)**2)*coeke(:,1,1)
!
! cross derivatives (needed for non-othogonal grids in the second derivatives)
!
coeke(:,2,3) = 2.0_DP*(Jim(1,2)*Jim(1,3)+Jim(2,2)*Jim(2,3)+Jim(3,2)*Jim(3,3))*coeke(:,2,3)
coeke(:,1,3) = 2.0_DP*(Jim(1,1)*Jim(1,3)+Jim(2,1)*Jim(2,3)+Jim(3,1)*Jim(3,3))*coeke(:,1,3)
coeke(:,1,2) = 2.0_DP*(Jim(1,1)*Jim(1,2)+Jim(2,1)*Jim(2,2)+Jim(3,1)*Jim(3,2))*coeke(:,1,2)
coeke(:,3,2) = coeke(:,2,3) ! symmetry of coeke
coeke(:,2,1) = coeke(:,1,2) ! symmetry of coeke
coeke(:,3,1) = coeke(:,1,3) ! symmetry of coeke
!
! a samall check on the shape of user defined cell (if any)
!
IF ((ibrav.EQ.0).AND.(nfi.EQ.1)) THEN
WRITE(stdout,*) 'EXX info: If you are using an orthogonal cell without its cell vectors&
& aligned to the xyz directions, the EXX calculation may be twice more expensive.'
END IF
!
!========================================================================
!
CALL stop_clock('exx_gs_setup')
!
!-------------------------------------------------------------------------
! Get the Wannier center and compute the pair overlap matrix
!-------------------------------------------------------------------------
!
CALL start_clock('exx_pairs')
!
ndim=MAX(nproc_image, nbsp)
ALLOCATE (wannierc(3,ndim)); wannierc=0.0_DP
ALLOCATE (wannierc_tmp(3,nbsp)); wannierc_tmp=0.0_DP
!
! Adjust Cartesian coordinates of wannier centres according to periodic boundary conditions...
! N.B.: PBC are imposed here in the range [0,1)...
!
DO iobtl=1,nbsp
!
wannierc_tmp(1,iobtl)=ainv(1,1)*wfc(1,iobtl)+ainv(1,2)*wfc(2,iobtl)+ainv(1,3)*wfc(3,iobtl) ! s = h^-1 r
wannierc_tmp(2,iobtl)=ainv(2,1)*wfc(1,iobtl)+ainv(2,2)*wfc(2,iobtl)+ainv(2,3)*wfc(3,iobtl) ! s = h^-1 r
wannierc_tmp(3,iobtl)=ainv(3,1)*wfc(1,iobtl)+ainv(3,2)*wfc(2,iobtl)+ainv(3,3)*wfc(3,iobtl) ! s = h^-1 r
!
wannierc_tmp(1,iobtl)=wannierc_tmp(1,iobtl)-FLOOR(wannierc_tmp(1,iobtl)) ! impose PBC on s in range: [0,1)
wannierc_tmp(2,iobtl)=wannierc_tmp(2,iobtl)-FLOOR(wannierc_tmp(2,iobtl)) ! impose PBC on s in range: [0,1)
wannierc_tmp(3,iobtl)=wannierc_tmp(3,iobtl)-FLOOR(wannierc_tmp(3,iobtl)) ! impose PBC on s in range: [0,1)
!
wannierc(1,iobtl)=h(1,1)*wannierc_tmp(1,iobtl)+h(1,2)*wannierc_tmp(2,iobtl)+h(1,3)*wannierc_tmp(3,iobtl) ! r = h s
wannierc(2,iobtl)=h(2,1)*wannierc_tmp(1,iobtl)+h(2,2)*wannierc_tmp(2,iobtl)+h(2,3)*wannierc_tmp(3,iobtl) ! r = h s
wannierc(3,iobtl)=h(3,1)*wannierc_tmp(1,iobtl)+h(3,2)*wannierc_tmp(2,iobtl)+h(3,3)*wannierc_tmp(3,iobtl) ! r = h s
!
wannierc_tmp(:,iobtl)=wannierc(:,iobtl) ! keep a temporary copy to compute pair indices
!
END DO
!
! make copy of wannier centres when number of processors > number of bands
!
IF(nproc_image.GT.nbsp) THEN
!
DO iobtl=nbsp+1,nproc_image
!
ir=MOD(iobtl,nbsp)
IF(ir.EQ.0 )THEN
wannierc(:,iobtl)=wannierc(:,nbsp)
ELSE
wannierc(:,iobtl)=wannierc(:,ir)
END IF
!
END DO
!
END IF
!
! overlap is the unique neighbor list that for each band or processor image (ndim)
! njj is the number of unique neighbors for each band or processor image (ndim)
!
ALLOCATE (overlap(neigh/2,ndim)); overlap=0
ALLOCATE (njj(ndim)); njj=0
!
! generate the unique neighbor list
!
CALL exx_index_pair(wannierc_tmp, overlap, njj, nj_max, ndim )
!
IF (ALLOCATED(wannierc_tmp)) DEALLOCATE(wannierc_tmp)
!
CALL stop_clock('exx_pairs')
!
!-------------------------------------------------------------------------
!
! Allocate variables to store potentials for 3 steps ....
!
IF (n_exx.EQ.0) THEN
!
! the following variables are used in the extrapolation of exx potentials
ALLOCATE( pair_dist( 3, nj_max*2, my_nbspx), stat=ierr ); pair_dist=0.0_DP
ALLOCATE( selfv( np_in_sp_s, 3, my_nbspx), stat=ierr ); selfv=0.0_DP
ALLOCATE( pairv( np_in_sp_p, 3, nj_max*2, my_nbspx), stat=ierr ); pairv=0.0_DP
!
END IF
!
!-------------------------------------------------------------------------
!
! update exx step ...
!
n_exx = n_exx + 1
!
!=========================================================================
!
! obtain orbitals on each local processor, stored in psi
!
ALLOCATE ( psi( nnrtot, my_nbsp(me ) ) ); psi=0.0_DP
ALLOCATE ( vpsil(nnrtot, my_nbsp(me ) ) ); vpsil=0.0_DP
!
CALL start_clock('r_orbital')
!
! In c variable, the plane wave wavefunction, is distributed in nr3 parts
! and stored in each parallel mpitasks (or processors).
! The exx_psi maps c in to psi, which is in real space grid.
!
! c -- one processor has a part of information from all bands
! psi -- one processor has complete information of one band (or more)
!
CALL exx_psi(c, psi, nnrtot, my_nbsp, my_nxyz, nbsp)
!
CALL stop_clock('r_orbital')
!
!===============================================================================
!
! PAIR AND SELF POTENTIALS ARE CALCULATED IN ONE BIG LOOP
!
!===============================================================================
!
!========================================================================
! THE MOST OUTER LOOP STARTS:
!========================================================================
!
! flag identifying whether the pair communication is done or not
!
ALLOCATE( irecvreq(nj_max,0:nproc_image-1) )
!
! obtain psi in sphere (psil) for neighbors
!
ALLOCATE ( psil_pair_send(np_in_sp_me_p, nj_max, my_nbsp(me )) ); psil_pair_send=0.0_DP
ALLOCATE ( psil_pair_recv(np_in_sp_me_p, nj_max) ); psil_pair_recv=0.0_DP
!
! initialize totalenergy and derivatives
!
totalenergy = 0.0_DP
total_exx_derv(:,:) =0.0_DP
!
! my_var is the maximum of nbsp or nproc_image
my_var = MAX(nproc_image, nbsp)
!
! we should use my_nbspx (maxval(my_nbsp(me))) here
!
DO iobtl = 1, my_nbspx
!
CALL start_clock('send_psi')
!
psil_pair_recv(:,:)=0.0_DP
!
!========================================================================
!
DO j = 1, nj_max ! the neighbor index in a certain mpi task
!
DO irank = 1, nproc_image ! dummy loop over mpi tasks
!
! the global index of the target orbital (iobtl)
gindex_of_iobtl = index_my_nbsp(iobtl, irank)
!
IF ( gindex_of_iobtl .LE. my_var) THEN ! my_var is the maximum of nbsp or nproc_image
!
rk_of_obtl_trcv = irank - 1 ! mpi rank that should receive the target orbital
obtl_tbs=overlap(j, gindex_of_iobtl) ! global band (orbital) index that is paired iobtl (gindex_of_iobtl)
!
IF (obtl_tbs .NE. 0) THEN
!
rk_of_obtl_tbs = rk_of_obtl(obtl_tbs)
lindex_obtl_tbs = lindex_of_obtl(obtl_tbs)
!
IF ((me_image .EQ. rk_of_obtl_trcv) .AND. (me_image .eq. rk_of_obtl_tbs )) THEN
!
!-- internal communication: inside the same mpi task (no mpi is needed) --
!
! calculate mid point of two wannier centers
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_tbs), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_tbs),d_pair(j))
! calculate translation vector from the center of the box
CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle(1, j), tran)
!
! get the localized psi around the mid point of two wannier centers
! note: the psil is centered at the center of the box
! (using the translation vector "tran" from middle of wfc to the center of box)
CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, lindex_obtl_tbs), psil_pair_recv(:,j), tran)
!
ELSEIF( me_image .EQ. rk_of_obtl_tbs )THEN
!
!-- when the mpi task cantains paired orbital with iobtl (this task has to send the obtl_tbs) --
!
! calculate mid point of two wannier centers
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_tbs), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_tbs),d_pair(j))
! calculate translation vector from the center of the box
CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle(1, j), tran)
!
! get the localized psi around the mid point of two wannier centers
! note: the psil is centered at the center of the box
! (using the translation vector "tran" from middle of wfc to the center of box)
CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, lindex_obtl_tbs), psil_pair_send(1,j,lindex_obtl_tbs), tran)
!
#if defined(__MPI)
CALL MPI_SEND( psil_pair_send(1, j, lindex_obtl_tbs), np_in_sp_me_p, MPI_DOUBLE_PRECISION, &
rk_of_obtl_trcv, j*irank, intra_image_comm,ierr )
#endif
!
ELSEIF( me_image .EQ. rk_of_obtl_trcv )THEN
!
!-- when the mpi task matchs with a cetain obtl_tbs (this task has to receive the obtl_tbs) --
!
#if defined(__MPI)
CALL MPI_IRECV( psil_pair_recv(1,j), np_in_sp_me_p, MPI_DOUBLE_PRECISION, &
rk_of_obtl_tbs, j*irank, intra_image_comm, irecvreq(j,me_image),ierr)
#endif
! note: this part will be waited later in the wait loop using irecvreq(j,me_image)
!
ENDIF
!
ENDIF
!
ENDIF
!
ENDDO !irank
!
ENDDO ! j
!
!=======================================================================
CALL start_clock('send_psi_wait')
!
DO j = 1, nj_max
!
DO irank = 1, nproc_image
!
gindex_of_iobtl = index_my_nbsp(iobtl, irank)
!
IF ( gindex_of_iobtl .LE. my_var) THEN
!
rk_of_obtl_trcv = irank - 1
obtl_tbs=overlap(j, gindex_of_iobtl)
!
IF (obtl_tbs .NE. 0) THEN
!
rk_of_obtl_tbs = rk_of_obtl(obtl_tbs)
!
IF ((me_image .EQ. rk_of_obtl_trcv) .AND. (me_image .NE. rk_of_obtl_tbs)) THEN
#if defined(__MPI)
!
! once the paired orbital (obtl_tbs) is received this task can move on to exx calculations
CALL MPI_WAIT(irecvreq(j,me_image), istatus, ierr)
#endif
END IF
!
END IF
!
END IF
!
END DO
!
END DO
!
CALL stop_clock('send_psi_wait')
CALL stop_clock('send_psi')
!
! printout header for the cgsteps
!
IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
!
IF (ionode) THEN
!
iunit=printout_base_unit("ncg")
!
CALL printout_base_open("ncg")
!
WRITE(iunit,'(I8,F16.8)')nfi,tps
!
END IF
!
END IF
!
!=======================================================================
! after this loop ( do irank ), all the processor got all the overlapping orbitals
! for the i_obtl orbital and ready to calculate pair potential
!=======================================================================
!
CALL start_clock('getpairv')
!
middle(:,:)=0.0_DP
gindex_of_iobtl = index_my_nbsp(iobtl, me)
!
IF ( gindex_of_iobtl .LE. my_var) THEN
!
!-- second loop starts: calculate overlapping potential with the j_th orbitals --
!
!CALL start_clock('getpairv')
!
! my_var3 is the unique neighbor number for gindex_of_iobtl
my_var3=njj( gindex_of_iobtl )
!
do j = 1, my_var3
!
! my_var2 is the global index of the unique pair to gindex_of_iobtl
my_var2=overlap(j,gindex_of_iobtl)
!
IF (my_var2 .NE. 0) THEN
!
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,my_var2), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,my_var2),d_pair(j))
!
! calculate translation vector from the center of the box
CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle(1, j), tran)
!
! get the localized psi around the mid point of two wannier centers
! note: the psil is centered at the center of the box
! (using the translation vector "tran" from middle of wfc to the center of box)
ALLOCATE ( psil(np_in_sp_me_p) ); psil=0.0_DP
CALL getpsil( nnrtot, np_in_sp_me_p, psi(1, iobtl), psil(1), tran)
!
! the localized density rhol
ALLOCATE ( rhol(np_in_sp_me_p) ); rhol=0.0_DP
ALLOCATE ( rho_in_sp(np_in_sp_p) ); rho_in_sp=0.0_DP
CALL getrhol( np_in_sp_me_p, np_in_sp_p, psil(1), psil_pair_recv(1, j), rhol, rho_in_sp, tran, sa1)
!
! calculate the exx potential from the pair density by solving Poisson
!
ALLOCATE ( vl(np_in_sp_me_p) ); vl=0.0_DP ! compute potential (vl) in ME sphere
CALL start_clock('getvofr')
CALL getvofr( np_in_sp_me_p, np_in_sp_p, hcub, rho_in_sp, vl,&
pairv(1,1,j,iobtl), pairv(1,2,j,iobtl), pairv(1,3,j,iobtl),&
.FALSE., d_pair(j), pair_dist(1,j,iobtl), pair_dist(2,j,iobtl),&
pair_dist(3,j,iobtl),cgstep)
CALL stop_clock('getvofr')
!
! write cgsteps in the suffix.ncg (unit=44)
!
IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
!
IF (ionode) THEN ! maybe not needed for ionode (if one want more information)
!
WRITE(iunit,'(3X,"(i,j,cgsteps)",3I6)') gindex_of_iobtl, my_var2, cgstep
!
END IF
!
END IF
!
! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
!$omp parallel do private(ir)
DO ip = 1, np_in_sp_me_p
CALL l2goff (ip,ir,tran) ! local is centered at box center; global index is offset by tran
vpsil(ir,iobtl) = vpsil(ir,iobtl) - exxalfa*vl(ip)*psil_pair_recv(ip,j) ! to remain
psil_pair_recv(ip,j) = - exxalfa*vl(ip)*psil(ip) ! to be sent
END DO
!$omp end parallel do
!
CALL vvprod(np_in_sp_me_p, rhol, vl, paire(j)) ! dot product of the rho and vl !HK (todo): do we need to do PS+ME ?? rho_in_sp may be enough
paire(j) = paire(j) * 0.5_DP* hcub ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
totalenergy = totalenergy + 2.0_DP*paire(j) ! the factor of two comes from the identity of ij and ji pair
!
IF (.NOT. (isotropic .AND. (ibrav.EQ.1) )) THEN
!
! EXX cell derivative (note: exxalfa is included in vofrho.f90 when calculate stress)
CALL start_clock('exx_cell_derv')
!
CALL exx_energy_cell_derivative(np_in_sp_me_p, np_in_sp_p, tran,&
vl, ha, hb, hc, rhol, pair_dexx_dhab(:,:,j))
!
! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
!
pair_dexx_dhab(:,:,j) = pair_dexx_dhab(:,:,j)*0.5_DP*hcub
!
! accumulate the derivative from different pair terms
!
total_exx_derv(:,:) = total_exx_derv(:,:) + 2.0_DP*pair_dexx_dhab(:,:,j)
!
CALL stop_clock('exx_cell_derv')
!
! if isotropic => calculate the stress tensor in vofrho.f90
!
END IF
!
IF (ALLOCATED(psil)) DEALLOCATE(psil)
IF (ALLOCATED(rhol)) DEALLOCATE(rhol)
IF (ALLOCATED(rho_in_sp)) DEALLOCATE(rho_in_sp)
IF (ALLOCATED(vl)) DEALLOCATE(vl)
!
END IF
!
END DO !for j
!
!print *, 'pair energy follows '
!write(*,'(5f15.8)')(paire(j),j=1,njj( gindex_of_iobtl ))
!
END IF !gindex_of_iobtl <= nbsp
!
!
!CALL stop_clock('getpairv')
!
!
!=========================================================================
!
! SELF POTENTIAL FOR EACH ORBITAL STARTS HERE
!
!=========================================================================
!
!CALL start_clock('self_v')
!
IF (iobtl.LE.my_nbsp(me)) THEN ! skip when the loop of my_nbspx goes outside of scope
!
IF (me.GT.(nbsp*(sc_fac-1))) THEN ! compatible with more processors than nbsp
!
gindex_of_iobtl = index_my_nbsp(iobtl, me)
!
! calculate translation vector from the center of the box
CALL getsftv(nr1s, nr2s, nr3s, h, ainv, wannierc(1, gindex_of_iobtl), tran)
!
! get the localized psi around the wannier centers
! note: the psil is centered at the center of the box
! (using the translation vector "tran" from the wfc to the center of box)
ALLOCATE ( psil(np_in_sp_me_s) ); psil=0.0_DP
CALL getpsil( nnrtot, np_in_sp_me_s, psi(1, iobtl), psil(1), tran)
!
! get the localized density rhol
ALLOCATE ( rhol(np_in_sp_me_s) ); rhol=0.0_DP
ALLOCATE ( rho_in_sp(np_in_sp_s) ); rho_in_sp=0.0_DP
CALL getrhol( np_in_sp_me_s, np_in_sp_s, psil(1), psil(1), rhol, rho_in_sp, tran, sa1)
!
! calculate the exx potential from the pair density by solving Poisson
!
ALLOCATE ( vl(np_in_sp_me_s) ); vl=0.0_DP ! compute potential (vl) in ME sphere
CALL start_clock('getvofr')
CALL getvofr( np_in_sp_me_s,np_in_sp_s,&
hcub, rho_in_sp, vl, selfv(1,1,iobtl), selfv(1,2,iobtl),&
selfv(1,3,iobtl), .TRUE., 0.0, 0.0, 0.0, 0.0,cgstep)
!
CALL stop_clock('getvofr')
!
! write cgsteps in the suffix.ncg (unit=44)
!
IF ((MOD(nfi,iprint_stdout).EQ.0)) THEN
!
IF (ionode) THEN ! maybe not needed for ionode (if one want more information)
!
WRITE(44,'(3X,"(i,i,cgsteps)",3I6)') gindex_of_iobtl, gindex_of_iobtl, cgstep
!
CALL printout_base_close("ncg")
!
END IF
!
END IF
!
! update vpsil in the global grid (exxalfa is 0.25 for PBE0)
!$omp parallel do private(ir)
DO ip = 1, np_in_sp_me_s
CALL l2goff (ip,ir,tran) ! local is centered at box center; global index is offset by tran
vpsil(ir,iobtl) = vpsil(ir,iobtl) - exxalfa*vl(ip)*psil(ip) ! PBE0
END DO
!$omp end parallel do
!
! compute exchange energy in ME sphere
CALL vvprod(np_in_sp_me_s, rhol, vl, selfe) ! dot product of the rho and vl !HK (todo): do we need to do PS+ME ?? rho_in_sp may be enough
selfe = selfe * 0.5_DP * hcub ! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
!totalenergy = totalenergy + (selfe/DBLE(sc_fac))
totalenergy = totalenergy + selfe
!
!IF(me .GT. nbsp) THEN
! vpsil(:,iobtl) = 0.0_DP
!END IF
!
IF (.NOT. (isotropic .AND. (ibrav.EQ.1))) THEN
!
! EXX cell derivative (note: need to include exxalfa later)
CALL start_clock('exx_cell_derv')
!
CALL exx_energy_cell_derivative(np_in_sp_me_s, np_in_sp_s, tran,&
vl, ha, hb, hc, rhol, self_dexx_dhab(:,:))
!
! volume element hcub and trapezoidal rule prefactor 0.5_DP are included
!
self_dexx_dhab(:,:) = self_dexx_dhab(:,:)*0.5_DP*hcub
!
! combine derivative with pair terms
!
total_exx_derv(:,:) = total_exx_derv(:,:) + self_dexx_dhab(:,:)
!
CALL stop_clock('exx_cell_derv')
!
! if isotropic => calculate the stress tensor in vofrho.f90
!
END IF
!
IF (ALLOCATED(psil)) DEALLOCATE(psil)
IF (ALLOCATED(rhol)) DEALLOCATE(rhol)
IF (ALLOCATED(rho_in_sp)) DEALLOCATE(rho_in_sp)
IF (ALLOCATED(vl)) DEALLOCATE(vl)
!
END IF ! me
!
END IF !iobtl
!
!CALL stop_clock('self_v')
CALL stop_clock('getpairv')
!
!===============================================================================
! After this loop, each processor finished the pair potentials for the
! iobtl orbital, and shall talk to send/recv vpsiforj
!===============================================================================
!
CALL start_clock('sendv')
!
middle(:,:)=0.0_DP
jj = 0
!
DO j = 1, nj_max
!
DO irank = 1, nproc_image
!
gindex_of_iobtl = index_my_nbsp(iobtl, irank)
!
IF ( gindex_of_iobtl .LE. my_var) THEN
!
rk_of_obtl_tbs = irank - 1
obtl_trcv=overlap(j, gindex_of_iobtl)
!
IF ( obtl_trcv .NE. 0) THEN
!
IF(nproc_image .LE. nbsp) THEN
!
rk_of_obtl_trcv = rk_of_obtl(obtl_trcv)
!
ELSE
!
DO i=1,sc_fac-1
!
ib=nbsp*i; ic=nbsp*(i+1)
!
IF(me.LE.ib .AND. irank.LE.ib) THEN
rk_of_obtl_trcv = rk_of_obtl(obtl_trcv)
ELSEIF(me.GT.ib .AND. irank.GT.ib .AND. me.LE.ic .AND. irank.LE.ic) THEN
rk_of_obtl_trcv = rk_of_obtl(obtl_trcv) + ib
ELSE
rk_of_obtl_trcv = nproc_image+1
END IF
!
END DO
!
END IF
!
lindex_obtl_trcv = lindex_of_obtl(obtl_trcv)
!
IF ((me_image .EQ. rk_of_obtl_trcv) .AND. (me_image .EQ. rk_of_obtl_tbs )) THEN
! -- only encountered when nproc < nbsp --
! upadate vpsil PBE0
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_trcv), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_trcv),d_pair(j))
!
! calculate translation vector from the center of the box
CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle(1, j), tran)
!
!$omp parallel do private(ir)
DO ip = 1, np_in_sp_me_p
CALL l2goff (ip,ir,tran)
vpsil(ir,lindex_obtl_trcv) = vpsil(ir,lindex_obtl_trcv) + psil_pair_recv(ip,j)
END DO
!$omp end parallel do
!
ELSE IF ( me_image .EQ. rk_of_obtl_tbs ) THEN
!
#if defined(__MPI)
!
CALL MPI_SEND( psil_pair_recv(1,j), np_in_sp_me_p, MPI_DOUBLE_PRECISION, &
rk_of_obtl_trcv, j*irank, intra_image_comm,ierr)
#endif
!
ELSE IF ( me_image .EQ. rk_of_obtl_trcv) THEN
!
!jj = jj + 1
#if defined(__MPI)
!
CALL mpi_recv( psil_pair_send(1, j, lindex_obtl_trcv), np_in_sp_me_p, mpi_double_precision, &
rk_of_obtl_tbs, j*irank, intra_image_comm, istatus,ierr) ! bs
#endif
!WRITE(stdout,'("recv",3I6)')me,gindex_of_iobtl,j*irank ! DEBUG
!
IF (nproc_image .LE. nbsp) THEN
!
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_trcv), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,obtl_trcv),d_pair(j))
!
ELSE
!
CALL getmiddlewc(wannierc(1,gindex_of_iobtl),wannierc(1,rk_of_obtl_trcv+1), &
& h, ainv, middle(1,j) )
! get pair distance
CALL get_pair_dist(wannierc(1,gindex_of_iobtl),wannierc(1,rk_of_obtl_trcv+1),d_pair(j))
!
END IF
! calculate translation vector from the center of the box
CALL getsftv( nr1s, nr2s, nr3s, h, ainv, middle(1, j), tran)
! upadate vpsil PBE0
!$omp parallel do private(ir)
DO ip = 1, np_in_sp_me_p
CALL l2goff (ip,ir,tran)
vpsil(ir,lindex_obtl_trcv) = vpsil(ir,lindex_obtl_trcv) + psil_pair_send(ip,j,lindex_obtl_trcv)
END DO
!$omp end parallel do
!
END IF
!
END IF
!
END IF ! gindex_of_iobtl <= nbsp
!
END DO ! irank
!
END DO ! j
!
CALL stop_clock('sendv')
!
END DO ! iobtl
!============================================================================
!THE MOST OUTER LOOP FOR PAIR POTENTIAL ENDS HERE
!============================================================================
!
!============================================================================
CALL start_clock('totalenergy')
!
totalenergyg=0.0_DP ! mpi reduction variable initialization
exx=0.0_DP ! exx energy (used to handle the open/closed shell energy)
!
#if defined(__MPI)
! collect the totalenergy of each mpi task to totalenergyg
CALL MPI_ALLREDUCE(totalenergy, totalenergyg, 1, MPI_DOUBLE_PRECISION, &
& MPI_SUM, intra_image_comm, ierr)
!
#endif
exx = totalenergyg
IF (nspin .EQ. 1) exx = exx + totalenergyg ! if closed shell double the totalenergy
!
!WRITE(stdout, '("EXX Energy",2F30.14," step",I7)')exx,totalenergyg*2.0_DP, nfi
!
CALL stop_clock('totalenergy')
!
CALL start_clock('exx_cell_derv')
!
total_exx_derv_g(:,:) = 0.0_DP ! mpi reduction variable initialization
!
#if defined(__MPI)
IF (.NOT. (isotropic .AND. (ibrav.EQ.1))) THEN
! collect the total_exx_derv of each mpi task to total_exx_derv_g
CALL MPI_ALLREDUCE(total_exx_derv(:,:), total_exx_derv_g(:,:), 9, &
MPI_DOUBLE_PRECISION, MPI_SUM, intra_image_comm, ierr)
!
END IF
#endif
!
! for closed shell case inclued spin factor of 2
dexx_dh(:,:) = total_exx_derv_g(:,:)
IF (nspin .EQ. 1) dexx_dh(:,:) = dexx_dh(:,:) + total_exx_derv_g(:,:)
!
CALL stop_clock('exx_cell_derv')
!
! Local to global distribution of EXX potential
! vpsil (local) --> exx_potential (global)
!
CALL start_clock('vl2vg')
exx_potential=0.0_DP
!
IF (nproc_image .LE. nbsp) THEN
!
CALL redistwfr ( exx_potential, vpsil, my_nxyz, my_nbsp, intra_image_comm, -1 )
!
ELSE
!
!-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
!
nogrp = dtgs%nogrp
!
ALLOCATE( sdispls(nproc_image), sendcount(nproc_image) ); sdispls=0; sendcount=0
ALLOCATE( rdispls(nproc_image), recvcount(nproc_image) ); rdispls=0; recvcount=0
ALLOCATE( sdispls1(nogrp), sendcount1(nogrp) ); sdispls1=0; sendcount1=0
ALLOCATE( rdispls1(nogrp), recvcount1(nogrp) ); rdispls1=0; recvcount1=0
!
DO proc = 1, nproc_image
!
IF (me <= nogrp*nr3s) THEN
sendcount(proc) =nr1s*nr2s/nogrp
ELSE
sendcount(proc) = 0
END IF
!proc 1 holds the nr1s*nr2s/nogrp information of 1024 orbital(duplicate)
!proc 640 as well
!however, 641 to 1024 idle
IF (proc <= nogrp*nr3s) THEN
recvcount(proc)=nr1s*nr2s/nogrp
ELSE
recvcount(proc)=0
END IF
!
END DO
!
sdispls(1) = 0
rdispls(1) = 0
!
DO proc = 2, nproc_image
sdispls(proc)= sdispls(proc-1) + sendcount(proc-1)
rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
END DO
!
ALLOCATE(exx_tmp (dffts%nnr,nproc_image/nogrp)); exx_tmp=0.0_DP
ALLOCATE(exx_tmp3(dffts%nnr,nproc_image/nogrp)); exx_tmp3=0.0_DP
!
#if defined(__MPI)
!
CALL mp_barrier( intra_image_comm )
CALL MPI_ALLTOALLV(vpsil(1,1), recvcount,rdispls,MPI_DOUBLE_PRECISION, &
& exx_tmp, sendcount,sdispls, MPI_DOUBLE_PRECISION, &
& intra_image_comm, ierr)
#endif
!
va = dffts%nnr/nogrp
DO j=1,nproc_image/nogrp,2
DO i=1,2*nogrp
ii=((i-1)/2)*va
jj=j+mod(i-1,2)
ia=(i-1-((i-1)/nogrp)*nogrp)*va
ib=j+(i-1)/nogrp
!$omp parallel do
DO ir=1,va
exx_tmp3(ii+ir,jj)=exx_tmp(ia+ir,ib)
END DO
!$omp end parallel do
END DO
END DO
!
DO proc = 1 , nogrp
sendcount1(proc) = dffts%nnr/nogrp
recvcount1(proc) = dffts%nnr/nogrp
END DO
!
rdispls1(1) = 0
sdispls1(1) = 0
!
DO proc = 2, nogrp
sdispls1(proc) = sdispls1(proc-1) + sendcount1(proc-1)
rdispls1(proc) = rdispls1(proc-1) + recvcount1(proc-1)
END DO
!
#if defined(__MPI)
!
DO ir=1,nproc_image/nogrp
CALL mp_barrier( dtgs%ogrp_comm )
CALL MPI_ALLTOALLV(exx_tmp3(1,ir), sendcount1, sdispls1, MPI_DOUBLE_PRECISION, &
& exx_potential(1,ir),recvcount1, rdispls1, MPI_DOUBLE_PRECISION, &
& dtgs%ogrp_comm, ierr)
END DO
#endif
!
DO ir=1,nbsp/nogrp
DO i=1,sc_fac-1
ii=i*nbsp/nogrp
!$omp parallel do
DO ia=1,dffts%nnr
exx_potential(ia,ir)=exx_potential(ia,ir)+exx_potential(ia,ir+ii)
END DO
!$omp end parallel do
END DO
END DO
!
!-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
!
END IF ! vl2vg
!
CALL stop_clock('vl2vg')
!
!==============================================================================
IF (ALLOCATED(vpsil)) DEALLOCATE(vpsil)
IF (ALLOCATED(psi)) DEALLOCATE(psi)
IF (ALLOCATED(irecvreq)) DEALLOCATE(irecvreq)
IF (ALLOCATED(wannierc)) DEALLOCATE(wannierc)
IF (ALLOCATED(overlap)) DEALLOCATE(overlap)
IF (ALLOCATED(njj)) DEALLOCATE(njj)
IF (ALLOCATED(psil_pair_send)) DEALLOCATE(psil_pair_send)
IF (ALLOCATED(psil_pair_recv)) DEALLOCATE(psil_pair_recv)
IF (ALLOCATED(exx_tmp)) DEALLOCATE(exx_tmp)
IF (ALLOCATED(exx_tmp3)) DEALLOCATE(exx_tmp3)
IF (ALLOCATED(sdispls)) DEALLOCATE(sdispls)
IF (ALLOCATED(rdispls)) DEALLOCATE(rdispls)