!--------------------------------------------
!CAF=/usr/lib/x86_64-linux-gnu/open-coarrays/openmpi/lib    (locate libcaf_mpi) 
!mpifort -fcoarray=lib 05_evs.f90    -lscalapack-openmpi  -L$CAF     -lcaf_mpi -o m ; mpirun -n 4 ./m ; head -n 100 out0*[0-9]
!
! If too many processes, then:   
!  n=4;  mpirun -n $n --host $(hostname):$n ./m | sort 
!--------------------------------------------
program     test_scalapack
 use, intrinsic :: iso_fortran_env
 implicit none
!--------------------------------------------
 integer, parameter       :: dp = real64
 integer                  :: context
 integer                  ::  nprow,  npcol, nprocs
 integer                  :: myprow, mypcol, myrank
 integer                  :: myimg , nimg  , myi   , myj
!--------------------------------------------
 integer, parameter       :: DLEN = 9        ! matrix descriptor size
 integer, dimension(DLEN) :: desca           ! matrix descriptors
 integer, external        :: numroc          ! must provide definitions or interfaces to BLACS functions
 integer                  :: info
!--------------------------------------------
 integer                  :: N,MB,NB,Mloc,Nloc
 integer                  :: i,j ,iloc,jloc
 real(dp), allocatable    :: A    (:,:)[:,:], E    (:,:)[:,:]
 real(dp), allocatable    :: Aglob(:,:)     , Eglob(:,:)     , evs(:) 
!--------------------------------------------
 real(dp), allocatable    :: work(:)        ! PDSYEV variables
 real(dp)                 ::qwork(1)
 integer                  ::lwork
!--------------------------------------------
 character(100)           :: fname
 integer                  :: fout
!--------------------------------------------
 N    = 4
!--------------------------------------------
 call init
!--------------------------------------------
 allocate(A(Nloc,Nloc)[nprow,*],Aglob(N,N))
 allocate(E(Nloc,Nloc)[nprow,*],Eglob(N,N),evs(N)) 
!--------------------------------------------
 myi = this_image(A,1); myj = this_image(A,2)
 write(fout,'(A    )') '00  myimg myrank myprow mypcol     MB  nprow'
 write(fout,'(A,6I7)') '00',myimg,myrank,myprow,mypcol,    MB, nprow
!--------------------------------------------
!Define the global array Aglob: must be symmetric
 if(myimg == 1)then
  call random_number(Aglob) 
  Aglob = 0.5_dp * (Aglob + transpose(Aglob))
 end if
 call co_broadcast(Aglob,1)
!--------------------------------------------
!Define the local array A:
 A     = 0.0_dp
 do i = 1, N ; do j = 1, N                        
  call pdelset(A,i,j,desca, Aglob(i,j))       ! Parallel Double ELement SET = PDELSET  (expensive...)
 end do      ; end do
!--------------------------------------------
! call pdsyev(JOBZ,UPLO,N,A,IA,JA,DESCA,EVS,EVEC,IEVEC,JEVEC,DESCE,WORK,LWORK,INFO)
!
! Alignment conditions: MB_A = NB_A = MB_E, mod(IA-1,MB_A) = 0, mod(JA-1,NB_A) = 0
!--------------------------------------------
!Compute eigenvalues and eigenvectors:
 if( MB /= NB ) error stop             ! pdsyev requires MB=NB
!Query size of work(lwork):
 call pdsyev(                       &
      'V' ,'U',N    ,               &  ! V=eigenvector, N=only eigenvalue, 'U' use upper triangular values of symmetric matrix
      A   ,1,1,desca,               &
      evs ,                         &  ! eigenvalues
      E   ,1,1,desca,               &
      qwork,-1,info)
!--------------------------------------------
 lwork = int(qwork(1)+1.0e-4_dp)       ! take care of truncation errors with a small epsilon...
 allocate(work(lwork))
!--------------------------------------------
!Compute eigenvalues and eigenvectors
 call pdsyev(                       &
      'V' ,'U',N    ,               &  ! V=eigenvector, N=only eigenvalue, 'U' use upper triangular values of symmetric matrix
      A   ,1,1,desca,               &  ! A is destroyed, Aglob must be set before the call to pdsyev
      evs ,                         &  ! eigenvalues
      E   ,1,1,desca,               &
      work,lwork,info)
 if(info /= 0) error stop
!--------------------------------------------
!Print eigenvalues:
 write(fout,'(A)')'-------Eigenvalues------------'
 write(fout,'(A,1000F8.3)') '01', evs  ! evs: available on all processors/images
!--------------------------------------------
!Construct global arrays and test result:
 Eglob=0.0_dp
 do i = 1, N; do j = 1, N                           ! Parallel Double ELement GET
  call pdelget('A',' ',Eglob(i,j),E,i,j,desca)      ! SUBROUTINE PDELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA )
 end do     ; end do
!--------------------------------------------
!Print results:
 write(fout,'(A)')'-------Eigenvectors-----------'
 do j = 1, N
  write(fout,'(A,I3,F8.3,A,1000F8.3)') '02', j, evs(j),' v= ', Eglob(:,j)
 end do
!--------------------------------------------
!Test eigensystem definition:
 write(fout,'(A)')'-------r=||A-lambda v||------'
 do j = 1, N
  write(fout,'(A,I3,F8.3,A,1000G12.3)') '03', j, evs(j),' r= ',sum(abs(matmul(Aglob,Eglob(:,j)) - evs(j)*Eglob(:,j)))/N
 end do

100 continue
contains
!--------------------------------------------
 subroutine     sca_mmult(a,b,c)
  real(dp), intent(in ) :: a(Mloc,Nloc),b(Mloc,Nloc)
  real(dp), intent(out) :: c(Mloc,Nloc)
  real(dp)              :: alpha, beta

  ! C = alpha A . B + beta C
  alpha = 1.0_dp ; beta = 0.0_dp
  call pdgemm(                       &
       'N','N',N,N,N,         alpha, &
       a,1,1,desca,                  &
       b,1,1,desca,           beta , &
       c,1,1,desca)

 end subroutine sca_mmult
!--------------------------------------------
 subroutine     init
  
  call sca_init

  write(fname,'(A,I0.3)') 'out',myimg
  open(newunit=fout,file=fname)
 
  MB   = N/nprow ; NB = N/npcol           ! Fit one block per image/process
  Mloc = numroc(N,MB,myprow,0,nprow) 
  Nloc = numroc(N,NB,mypcol,0,npcol) 
  if(Mloc /= N/nprow) error stop
  if(Nloc /= N/npcol) error stop
  if(Nloc /= Mloc   ) error stop
  !Array descriptor:
  call descinit(desca,N,N,MB,NB,0,0,context,Mloc,info)           !DESCINIT( DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT,LLD, INFO )

  call random_init(.false., .true.) ! random_init(REPEATABLE, IMAGE_DISTINCT)

 end subroutine init
!--------------------------------------------
 subroutine     sca_init
  call blacs_pinfo(myrank,nprocs)
  nprow = sqrt(nprocs+1.0e-6_dp); npcol = nprow
  if(nprocs /= nprow * npcol) error stop
  call blacs_get(-1,0, context)
  call blacs_gridinit(context,'Col-major', nprow, npcol)
  call blacs_gridinfo(context,nprow,npcol,myprow,mypcol)
  myimg = this_image()
  nimg  = num_images()
 end subroutine sca_init
!--------------------------------------------
end program test_scalapack
!--------------------------------------------
