Open
Description
This issue affects the performance of 627.cam4_s in SPEC CPU 2017.
The following code runs slower when compiled with Flang than when compiled with GFortran.
- main.f90
! Reproducer kernel: times a loop nest of array-section assignments that copy a
! scaled, reversed section of a rank-4 input into a locally allocated rank-3
! buffer, then prints the elapsed CPU time.
!
!   n1, n2, n3 : extents used to allocate the local buffer matrix(n1,n2,n3);
!                n1 is also the trip count of the inner loop and n3 the upper
!                bound of the reversed section taken from dimension 3 of array.
!   array      : rank-4 allocatable input; it is only read here.
subroutine repro(n1,n2,n3,array)
implicit none
integer, intent(in) :: n1
integer, intent(in) :: n2
integer, intent(in) :: n3
real(8), allocatable, intent(in) :: array(:,:,:,:)
real(8), allocatable :: matrix(:,:,:)
integer :: i, j
real :: start_t, end_t
allocate(matrix(n1,n2,n3))
! Only the loop nest below is timed; the allocation above is excluded.
call cpu_time(start_t)
! Outer loop walks dimension 2 of array; inner loop walks dimension 4.
do i=1,size(array,2)
do j=1,n1
! The statement under investigation: the right-hand side takes all of
! dimension 1 and elements n3 down to 1 of dimension 3, multiplied by 50, and
! the left-hand side is a section of matrix with the first index fixed at j.
! The target section does not depend on i, so every outer iteration
! overwrites the same elements of matrix.
matrix(j,:,:) = array(:,i,n3:1:-1,j) * 50.0_8
end do
end do
call cpu_time(end_t)
! Report the elapsed CPU time of the loop nest.
print *, end_t - start_t
! NOTE(review): presumably this external call exists so the compiler cannot
! discard the stores to matrix -- confirm with the reporter.
call dummy(matrix)
deallocate(matrix)
end subroutine repro
! Driver (main program without a program statement): allocates a
! 100 x 100 x 100 x 100 input array (10**8 elements) and runs the kernel once.
real(8), allocatable :: array(:,:,:,:)
! Explicit interface for the external subroutine repro; its dummy argument
! array is an allocatable assumed-shape array, which needs an explicit interface.
interface
subroutine repro(n1,n2,n3,array)
integer, intent(in) :: n1
integer, intent(in) :: n2
integer, intent(in) :: n3
real(8), allocatable, intent(in) :: array(:,:,:,:)
end subroutine
end interface
! NOTE(review): array is allocated but no values are assigned to it before
! repro reads it, so the timed loop operates on undefined data.
allocate(array(100,100,100,100))
call repro(100,100,100,array)
end
- dummy.f90
! Empty external subroutine that accepts an assumed-size real(8) array and does
! nothing with it. It lives in its own source file (dummy.f90), separate from
! the caller.
subroutine dummy(matrix)
real(8) :: matrix(*)
end subroutine
- commands
$ flang -Ofast -mcpu=native main.f90 dummy.f90 && ./a.out # for A64FX/Grace
$ flang -Ofast -march=native main.f90 dummy.f90 && ./a.out # for Xeon
| | Flang [s] | GFortran [s] |
|---|---|---|
A64FX | 3.2565217 | 0.876525998 |
Grace | 0.46003637 | 0.102575004 |
Xeon | 0.7637187 | 0.152457997 |
FYI: main.f90 can be reduced to the following:
! Reduced reproducer kernel: repeatedly assigns a rank-1 input array to a
! stride-2 section of a locally allocated buffer and prints the elapsed CPU time.
!
!   array : rank-1 assumed-shape input; it is only read here.
!   n     : size of the local buffer matrix and trip count of the timed loop.
subroutine repro(array,n)
implicit none
integer, intent(in) :: n
integer :: i
real, intent(in) :: array(:)
real, allocatable :: matrix(:)
real :: start_t, end_t
allocate(matrix(n))
call cpu_time(start_t)
! The same assignment is executed n times; the loop index i is not used in it.
do i=1,n
! The statement under investigation: a copy into every second element of
! matrix (elements 1, 3, 5, ...). array must have as many elements as that
! section for the assignment to be conformable.
matrix(::2) = array
end do
call cpu_time(end_t)
! NOTE(review): presumably this external call exists so the compiler cannot
! discard the stores to matrix -- confirm with the reporter.
call dummy(matrix)
deallocate(matrix)
! Report the elapsed CPU time of the loop.
print *, end_t - start_t
end subroutine repro
! Driver for the reduced reproducer (main program without a program statement).
implicit none
integer, parameter :: n = 10000
! NOTE(review): a is never assigned before repro reads it, so the timed loop
! copies undefined values.
real :: a(n)
! Explicit interface for the external subroutine repro; its dummy argument
! array is assumed-shape, which needs an explicit interface.
interface
subroutine repro(array,n)
integer, intent(in) :: n
real, intent(in) :: array(:)
end subroutine repro
end interface
! Passes n*2 as the buffer size so that the stride-2 section matrix(::2) inside
! repro has n elements, matching the size of a.
call repro(a,n*2)
end