!--------------------------------------------
!CAF=/usr/lib/x86_64-linux-gnu/open-coarrays/openmpi/lib    (locate libcaf_mpi) 
!mpifort -fcoarray=lib 04_mmult.f90  -lscalapack-openmpi  -L$CAF     -lcaf_mpi -o m ; mpirun -n 4 ./m ; head -n 100 out0*[0-9]
!
! If too many processes, then:   
!  n=4;  mpirun -n $n --host $(hostname):$n ./m | sort 
!--------------------------------------------
program     test_scalapack
 use, intrinsic :: iso_fortran_env
 implicit none
!--------------------------------------------
 integer, parameter       :: dp = real64
 integer                  :: context
 integer                  ::  nprow,  npcol, nprocs
 integer                  :: myprow, mypcol, myrank
 integer                  :: myimg , nimg  , myi   , myj
!--------------------------------------------
 integer, parameter       :: DLEN = 9        ! matrix descriptor size
 integer, dimension(DLEN) :: desca           ! matrix descriptors
 integer, external        :: numroc          ! must provide definitions or interfaces to BLACS functions
 integer                  :: info
!--------------------------------------------
 integer                  :: N,MB,NB,Mloc,Nloc
 integer                  :: i,j ,iloc,jloc
 real(dp), allocatable    :: A    (:,:)[:,:], B    (:,:)[:,:], C    (:,:)[:,:]
 real(dp), allocatable    :: Aglob(:,:)     , Bglob(:,:)     , Cglob(:,:)    
 real(dp)                 :: alpha          , beta
!--------------------------------------------
 character(100)           :: fname
 integer                  :: fout
!--------------------------------------------
 N    = 4
!--------------------------------------------
 call init
!--------------------------------------------
 allocate(A(Nloc,Nloc)[nprow,*])
 allocate(B(Nloc,Nloc)[nprow,*])
 allocate(C(Nloc,Nloc)[nprow,*])
!--------------------------------------------
 myi = this_image(A,1); myj = this_image(A,2)
 write(fout,'(A    )') '00  myimg myrank myprow mypcol     MB  nprow'
 write(fout,'(A,6I7)') '00',myimg,myrank,myprow,mypcol,    MB, nprow
!--------------------------------------------
!Define the local arrays: 
 call random_number(A) ; call random_number(B)
!--------------------------------------------
!Multiply arrays C = A . B            
 alpha = 1.0_dp ; beta = 0.0_dp          ! C = alpha A . B + beta C
 call pdgemm(                       &
      'N','N',N,N,N,         alpha, &
      A,1,1,desca,                  &
      B,1,1,desca,           beta , &
      C,1,1,desca)
!--------------------------------------------
!Construct global arrays and test result:
 allocate(Aglob(N,N))
 allocate(Bglob(N,N))
 allocate(Cglob(N,N))
 Aglob=0.0_dp
 do i = 1, N; do j = 1, N                           ! Parallel Double ELement GET
  call pdelget('A',' ',Aglob(i,j),A,i,j,desca)      ! SUBROUTINE PDELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA )
  call pdelget('A',' ',Bglob(i,j),B,i,j,desca) 
  call pdelget('A',' ',Cglob(i,j),C,i,j,desca) 
 end do     ; end do

 write(fout,'(A)')'------------------------------'
 write(fout,'(A,G28.16)') '01', sum(abs( Cglob - matmul(Aglob,Bglob)))/(N*N)

contains
!--------------------------------------------
 subroutine     init
  
  call sca_init

  write(fname,'(A,I0.3)') 'out',myimg
  open(newunit=fout,file=fname)
 
  MB   = N/nprow ; NB = N/npcol           ! Fit one block per image/process
  Mloc = numroc(N,MB,myprow,0,nprow) 
  Nloc = numroc(N,NB,mypcol,0,npcol) 
  if(Mloc /= N/nprow) error stop
  if(Nloc /= N/npcol) error stop
  if(Nloc /= Mloc   ) error stop
  !Array descriptor:
  call descinit(desca,N,N,MB,NB,0,0,context,Mloc,info)           !DESCINIT( DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT,LLD, INFO )

  call random_init(.false., .true.) ! random_init(REPEATABLE, IMAGE_DISTINCT)

 end subroutine init
!--------------------------------------------
 subroutine     sca_init
  call blacs_pinfo(myrank,nprocs)
  nprow = sqrt(nprocs+1.0e-6_dp); npcol = nprow
  if(nprocs /= nprow * npcol) error stop
  call blacs_get(-1,0, context)
  call blacs_gridinit(context,'Col-major', nprow, npcol)
  call blacs_gridinfo(context,nprow,npcol,myprow,mypcol)
  myimg = this_image()
  nimg  = num_images()
 end subroutine sca_init
!--------------------------------------------
end program test_scalapack
!--------------------------------------------
