!--------------------------------------------
!CAF=/usr/lib/x86_64-linux-gnu/open-coarrays/openmpi/lib            (locate libcaf_mpi) 
!           mpifort -fcoarray=lib panharmonic.f90 -lscalapack-openmpi -L$CAF -lcaf_mpi -o m ; echo 4 0.2 |mpirun -n 4 ./m; cat an0*
!
!N=12;l=0.3;mpifort -fcoarray=lib panharmonic.f90 -lscalapack-openmpi -L$CAF -lcaf_mpi -o m ; echo $N $l |mpirun -n 4 ./m; cat an0*
!
! If too many processes, then:   
!  n=16;  mpirun -n $n --host $(hostname):$n ./m | sort 
!--------------------------------------------
program     panharmonic_elevels
 use, intrinsic :: iso_fortran_env
 implicit none
!--------------------------------------------
 integer, parameter       :: dp = real64
 integer                  :: context
 integer                  ::  nprow,  npcol, nprocs
 integer                  :: myprow, mypcol, myrank
 integer                  :: myimg , nimg  , myi   , myj
!--------------------------------------------
 integer, parameter       :: DLEN = 9        ! matrix descriptor size
 integer, dimension(DLEN) :: desca           ! matrix descriptors
 integer, external        :: numroc          ! must provide definitions or interfaces to BLACS functions
 integer                  :: info
!--------------------------------------------
 integer                  :: DIM,N,MB,NB,Mloc,Nloc
 integer                  :: i,j ,iloc,jloc
 real(dp), allocatable    :: H    (:,:)[:,:], X    (:,:)[:,:], X4   (:,:)[:,:], X2   (:,:)[:,:], evec(:,:)[:,:]
 real(dp), allocatable    :: Hglob(:,:)     , V    (:,:)     , E    (:) 
!--------------------------------------------
 real(dp)                 :: lambda
!--------------------------------------------
 character(100)           :: fname
 integer                  :: fout
!--------------------------------------------
 call init
!--------------------------------------------
 call calculate_X4
 call calculate_H
 call calculate_evs
 call test_results

contains
!--------------------------------------------
 subroutine     calculate_evs
  integer               :: i,j

  call     sca_ev(H,evec,E) 

  write(fout,'(A,I8,20000G28.16)') 'EV ',DIM,lambda,E

 end subroutine calculate_evs
!--------------------------------------------
 subroutine     calculate_H
  integer                  :: j

  H(:,:)  = 0.0_dp

  do j    = 1, DIM
   call pdelset(H,j,j,desca, j - 0.5_dp)
  end do

  H(:,:)  = H(:,:) + lambda * X4(:,:)

  Hglob = 0.0_dp
  do i = 1, N; do j = 1, N                           ! Parallel Double ELement GET
   call pdelget('A',' ',Hglob(i,j),H   ,i,j,desca) ! SUBROUTINE PDELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA )
  end do; end do
 end subroutine calculate_H
!--------------------------------------------
 subroutine     calculate_X4
  integer                  :: i, j, n, m  ! N is shadowed
  real(dp), parameter      :: isqrt2=1.0_dp/sqrt(2.0_dp)
  real(dp)                 :: Xij
  
  !Position operator:
  X(:,:) = 0.0_dp
  !Compute nonzero elements:
  do i = 1, DIM
   n=i-1 !indices 0,...,DIM-1
   ! The delta_{n,m+1} term, i.e. m=n-1
   m=n-1 !the energy level n -> i=n+1, m-> j=m+1
   j=m+1
   if(j >= 1  ) call pdelset(X,i,j,desca,isqrt2*sqrt(DBLE(m+1)))
   ! The delta_{n,m-1} term, i.e. m=n+1
   m=n+1
   j=m+1
   if(j <= DIM) call pdelset(X,i,j,desca,isqrt2*sqrt(DBLE(m  )))
  end do
 !------------------------------------------------------
 !X2 = X . X  operator:
  call sca_mmult(X ,X ,X2)
 !X4 = X2. X2 operator:
  call sca_mmult(X2,X2,X4) 

 end subroutine calculate_X4
!--------------------------------------------
 subroutine     init
  
  call sca_init

  if(myimg == 1)then
   print '(A)','# Enter N, lambda:'
   read    *  ,         N, lambda
  end if
  call co_broadcast(N     ,1)
  call co_broadcast(lambda,1)
  DIM = N                             ! use if N is shadowed, or if convenient
  write(fname,'(A,I0.3)') 'an',myimg
  open(newunit=fout,file=fname)
  write(fout,'(A      )') '# -------- ' // trim(fname) // ' ----------'
  write(fout,'(A,I18  )') '# N      = ',N
  write(fout,'(A,F15.3)') '# lambda = ',lambda
  
  MB   = nprow ; NB = npcol           ! Fit one block per image/process
  Mloc = numroc(N,MB,myprow,0,nprow) 
  Nloc = numroc(N,NB,mypcol,0,npcol) 
  if(Mloc /= N/MB) error stop
  if(Nloc /= N/NB) error stop
  !Array descriptor:
  call descinit(desca,N,N,MB,NB,0,0,context,Mloc,info)           !DESCINIT( DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT,LLD, INFO )

  allocate(H   (Nloc,Nloc)[nprow,*]) ; myi = this_image(H,1); myj = this_image(H,2)
  allocate(X   (Nloc,Nloc)[nprow,*]) 
  allocate(X2  (Nloc,Nloc)[nprow,*]) 
  allocate(X4  (Nloc,Nloc)[nprow,*]) 
  allocate(evec(Nloc,Nloc)[nprow,*]) 

  allocate(Hglob(N,N), V(N,N), E(N))

  call random_init(.false., .true.) ! random_init(REPEATABLE, IMAGE_DISTINCT)

 end subroutine init
!--------------------------------------------
 subroutine     test_results
  integer fdat, i, j

! V: Gloval evectors array, Hglob: global Hamiltonian
  V    = 0.0_dp ; Hglob = 0.0_dp
  do i = 1, N; do j = 1, N                           ! Parallel Double ELement GET
   call pdelget('A',' ',V    (i,j),evec,i,j,desca)   ! SUBROUTINE PDELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA )
   call pdelget('A',' ',Hglob(i,j),H   ,i,j,desca) 
  end do; end do

  if(myimg == 1)then
   open(newunit=fdat,file='an.out')
   
   write(fdat,'(A      )') '# ----------- an.out ----------------'
   write(fdat,'(A,I18  )') '# N      = ',N
   write(fdat,'(A,F15.3)') '# lambda = ',lambda

   write(fdat,'(A)')'#--------------H---------------------'
   do i = 1, N
    write(fdat,'(A,          1000F8.3)')'HH ', Hglob(i,:)
   end do
                    
   write(fdat,'(A)')'#--------------E---------------------'
   write(fdat,'(A,           1000F8.3)')'EV ', E

   write(fdat,'(A)')'#--------------V---------------------'
   do j = 1, N
    write(fdat,'(A,I3,F8.3,A,1000F8.3)')'VV ', j,E(j),' V= ',V(:,j)
   end do

   write(fdat,'(A)')'#-------------- |H.V - E V| ---------'
   do j = 1, N
    write(fdat,'(A,I3,F8.3,A,1000G16.3)')'RS ', j,E(j),' r= ',sum(abs(matmul(Hglob,V(:,j)) - E(j)*V(:,j)))/N
   end do

  end if ! if(myimg == 1)then



 end subroutine test_results
!--------------------------------------------
 subroutine     sca_ev(a,e,evs)
  real(dp), intent(in )    ::     a(Mloc,Nloc)
  real(dp), intent(out)    ::     e(Mloc,Nloc),evs(:)
  real(dp)                 :: acopy(Mloc,Nloc)
  integer                  :: N
  real(dp), allocatable    :: work(:)        ! PDSYEV variables
  real(dp)                 ::qwork(1)
  integer                  ::lwork

  N     = size(evs)
  acopy = a             ! acopy is destroyed
!--------------------------------------------
! call pdsyev(JOBZ,UPLO,N,A,IA,JA,DESCA,EVS,EVEC,IEVEC,JEVEC,DESCE,WORK,LWORK,INFO)
!
! Alignment conditions: MB_A = NB_A = MB_E, mod(IA-1,MB_A) = 0, mod(JA-1,NB_A) = 0
!--------------------------------------------
!Compute eigenvalues and eigenvectors:
 if( MB /= NB ) error stop             ! pdsyev requires MB=NB
!Query size of work(lwork):
 call pdsyev(                       &
      'V'  ,'U',N    ,              &  ! V=eigenvector, N=only eigenvalue, 'U' use upper triangular values of symmetric matrix
      acopy,1,1,desca,              &
      evs  ,                        &  ! eigenvalues
      E    ,1,1,desca,              &
      qwork,-1,info)
!--------------------------------------------
 lwork = int(qwork(1)+1.0e-4_dp)       ! take care of truncation errors with a small epsilon...
 allocate(work(lwork))
!--------------------------------------------
!Compute eigenvalues and eigenvectors
 call pdsyev(                       &
      'V'  ,'U',N    ,              &  ! V=eigenvector, N=only eigenvalue, 'U' use upper triangular values of symmetric matrix
      acopy,1,1,desca,              &
      evs  ,                        &  ! eigenvalues
      E    ,1,1,desca,              &
      work ,lwork,info)
 if(info /= 0) error stop

 end subroutine sca_ev
!--------------------------------------------
 subroutine     sca_mmult(a,b,c)
  real(dp), intent(in ) :: a(Mloc,Nloc),b(Mloc,Nloc)
  real(dp), intent(out) :: c(Mloc,Nloc)
  real(dp)              :: alpha, beta
  
 alpha = 1.0_dp ; beta = 0.0_dp
 call pdgemm(                       &
      'N','N',N,N,N,         alpha, &
      a,1,1,desca,                  &
      b,1,1,desca,           beta , &
      c,1,1,desca)

 end subroutine sca_mmult
!--------------------------------------------
 subroutine     sca_init
  call blacs_pinfo(myrank,nprocs)
  nprow = sqrt(nprocs+1.0e-6_dp); npcol = nprow
  if(nprocs /= nprow * npcol) error stop
  call blacs_get(-1,0, context)
  call blacs_gridinit(context,'Col-major', nprow, npcol)
  call blacs_gridinfo(context,nprow,npcol,myprow,mypcol)
  myimg = this_image()
  nimg  = num_images()
 end subroutine sca_init
!--------------------------------------------
end program panharmonic_elevels
!--------------------------------------------
