Skip to content

[Flang] Assignments for local non-contiguous allocatable arrays are slower than GFortran #121126

Open
@yus3710-fj

Description

@yus3710-fj

This issue affects the performance of 627.cam4_s in SPEC CPU 2017.
The following code runs slower when compiled with Flang than when compiled with GFortran.

  • main.f90
! Reproducer: times repeated assignments of a scaled, reversed section of
! `array` into non-contiguous sections of a local allocatable `matrix`, and
! prints the elapsed CPU time.
!
! Arguments:
!   n1, n2, n3 - extents used to allocate the local matrix(n1,n2,n3)
!   array      - rank-4 allocatable input; it is only read here
!
! For the assignment below to conform, size(array,1) must equal n2 and
! size(array,3) must be at least n3; the caller shown in this report passes
! 100 for every extent.
subroutine repro(n1,n2,n3,array)
  implicit none
  integer, intent(in) :: n1
  integer, intent(in) :: n2
  integer, intent(in) :: n3
  real(8), allocatable, intent(in) :: array(:,:,:,:)
  real(8), allocatable :: matrix(:,:,:)
  integer :: i, j
  real :: start_t, end_t

  allocate(matrix(n1,n2,n3))

  ! Only the loop nest below is timed; allocation and deallocation are not.
  call cpu_time(start_t)
  do i=1,size(array,2)
    do j=1,n1
      ! Left-hand side: the first index is fixed, so the section matrix(j,:,:)
      ! is strided (non-contiguous) in memory.
      ! Right-hand side: the third dimension of `array` is traversed in
      ! reverse (n3 down to 1) and every element is multiplied by 50.
      ! Each pass of the outer loop overwrites all of `matrix`, so only the
      ! values from the last `i` remain afterwards.
      matrix(j,:,:) = array(:,i,n3:1:-1,j) * 50.0_8
    end do
  end do
  call cpu_time(end_t)

  ! Elapsed CPU time in seconds for the loop nest.
  print *, end_t - start_t
  ! `dummy` is an external procedure with an implicit interface.
  ! NOTE(review): presumably called so the compiler cannot discard the
  ! assignments to `matrix` as dead stores - confirm.
  call dummy(matrix)

  deallocate(matrix)
end subroutine repro

! Driver (unnamed main program): allocates a 100x100x100x100 real(8) array
! and passes it to `repro` with n1 = n2 = n3 = 100.
real(8), allocatable :: array(:,:,:,:)
! An explicit interface is required here because the dummy argument `array`
! of `repro` is allocatable.
interface
subroutine repro(n1,n2,n3,array)
  integer, intent(in) :: n1
  integer, intent(in) :: n2
  integer, intent(in) :: n3
  real(8), allocatable, intent(in) :: array(:,:,:,:)
end subroutine
end interface

! 100**4 = 10**8 elements.
! NOTE(review): `array` is never assigned values before `repro` reads it, so
! the data being copied is undefined; only the timing is of interest here.
allocate(array(100,100,100,100))
call repro(100,100,100,array)
end
  • dummy.f90
! Empty external subroutine that accepts an assumed-size real(8) array and
! does nothing with it.
! NOTE(review): presumably kept in a separate file so that calls to it are
! opaque to the compiler when main.f90 is compiled - confirm.
subroutine dummy(matrix)
  real(8) :: matrix(*)
end subroutine
  • commands
$ flang -Ofast -mcpu=native main.f90 dummy.f90 && ./a.out # for A64FX/Grace
$ flang -Ofast -march=native main.f90 dummy.f90 && ./a.out # for Xeon
| Machine | Flang [s] | GFortran [s] |
|---|---|---|
| A64FX | 3.2565217 | 0.876525998 |
| Grace | 0.46003637 | 0.102575004 |
| Xeon | 0.7637187 | 0.152457997 |

FYI: main.f90 can be reduced as follows:

! Reduced reproducer: times `n` repetitions of assigning `array` to every
! other element of a local allocatable `matrix`, and prints the elapsed CPU
! time.
!
! Arguments:
!   array - rank-1 assumed-shape input; it is only read here
!   n     - size of the local `matrix`, and also the number of repetitions
!
! For the assignment to conform, size(array) must equal the number of
! elements in matrix(1:n:2); the caller shown in this report passes an array
! of 10000 elements with n = 20000.
subroutine repro(array,n)
  implicit none
  integer, intent(in) :: n
  integer :: i
  real, intent(in) :: array(:)
  real, allocatable :: matrix(:)
  real :: start_t, end_t

  allocate(matrix(n))

  ! Only the loop below is timed.
  call cpu_time(start_t)
  do i=1,n
    ! Stride-2 (non-contiguous) section of a local allocatable on the
    ! left-hand side; the same assignment is repeated on every iteration.
    matrix(::2) = array
  end do
  call cpu_time(end_t)

  ! `dummy` is an external procedure with an implicit interface.
  ! NOTE(review): `matrix` is default real here, while the dummy.f90 shown
  ! earlier in this report declares its argument as real(8); if that same
  ! file is reused, the argument kinds do not match - confirm which dummy
  ! is intended.
  call dummy(matrix)
  deallocate(matrix)

  ! Elapsed CPU time in seconds for the loop.
  print *, end_t - start_t
end subroutine repro

! Driver (unnamed main program) for the reduced reproducer.
implicit none
integer, parameter :: n = 10000
! NOTE(review): `a` is never assigned values before it is passed to `repro`,
! so the data being copied is undefined; only the timing is of interest here.
real :: a(n)
! Explicit interface, required because `array` is an assumed-shape dummy.
interface
subroutine repro(array,n)
  integer, intent(in) :: n
  real, intent(in) :: array(:)
end subroutine repro
end interface

! n*2 is passed so that the stride-2 section of the 2*n-element `matrix`
! inside `repro` has exactly n elements, matching size(a).
call repro(a,n*2)
end

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions