c Implementation of the Jacobi algorithm for the solution of 
c the Poisson problem on a rectangular domain in 2D
c
c Assumptions for simplicity:
c - domain is a square: 1x1
c - number of intervals / mesh points is the SAME in both directions
c - Dirichlet boundary condition   U = 0
c - F(X,Y) is a POINT SOURCE in the "middle" of the square
c - number of inner mesh points is a multiple of number of processes
c
c Parallelization by 1D partition (of y-direction)

      program OneD_basic

      implicit none

      include "mpif.h"
      double precision, allocatable :: U1(:,:), U2(:,:), F(:,:)
      double precision h,hsq,error
      integer NX,NY,NI,NJ,NJloc,NYloc
      integer i,j,k,NIsource,NJsource,NJlocsource
      double precision locsum,sum
      integer locstart,locend
c Maximum number of iterations 
      integer,parameter :: MAXIT=100000

      integer myid, numprocs, ierr
      double precision t1, t2
      integer nbrtop,nbrbottom
      integer status(MPI_STATUS_SIZE)
      integer,parameter :: nout=55

      call MPI_INIT( ierr )
c NUMPROCS is the number of processes
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
c MYID is the id (0 to numrpocs-1) of THIS process
      call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )


c NI is the number of INNER mesh points in x-direction
      if (myid .eq. 0) NI = 64
C Send NI from MASTER to all processes
      call MPI_BCAST(NI,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)

c the number of INNER points in y-direction is assumed = NI
      NJ = NI
c NX=NY is the number of all mesh points, i.e including boundary
      NX = NI + 2 
      NY = NX

c Assumption NI=NJ is a multiple of numprocs
c otherwise stop program
      if (numprocs*(NJ/numprocs) .ne. NJ) goto 1000

c NJloc is the number of LOCAL rows (INNER mesh points)
      NJloc = NJ / numprocs 
c additional space (2 rows) is needed for BOUNDARY / GHOST POINTS
      NYloc = NJloc + 2

c h is the distance between two mesh points (in both directions)
      h = 1.0 / dble(NI+1)
      hsq = h*h

c Determine the neighbouring processes
      nbrbottom = myid - 1
      nbrtop = myid + 1
      if (nbrbottom .eq. -1) nbrbottom = MPI_PROC_NULL
      if (nbrtop .eq. numprocs) nbrtop = MPI_PROC_NULL

c Allocation of the 2D arrays for U and F
      allocate(U1(0:NX-1,0:NYloc-1))
      allocate(U2(0:NX-1,0:NYloc-1))
      allocate(F(0:NX-1,0:NYloc-1))

c Boundary condition U = 0 and initial GHOST POINTS = 0
      U1(0,0:NYloc-1) = 0.0
      U1(NX-1,0:NYloc-1) = 0.0
      U1(0:NX-1,0) = 0.0
      U1(0:NX-1,NYloc-1) = 0.0
c Choice of initial vector U1 = 0.0
      U1(1:NI,1:NJloc) = 0.0

      U2 = U1


c F(X,Y) is a POINT SOURCE at the "middle" of the mesh
      F = 0.0
      NIsource = int(NI/2.+0.6)
      NJsource = NIsource

c Find processor which is responsible for F(NIsource,NJsource)
      locstart = myid * NJloc + 1 
      locend = (myid+1) * NJloc 
c      write(*,*) myid,locstart,locend
c      goto 1000

      if (NJsource .ge. locstart .and. NJsource .le. locend) then
         
c         write(*,*) "MYID ",myid
         NJlocsource = NJsource - locstart + 1

         F(NIsource,NJlocsource) = 1.E+4

      endif



      call MPI_BARRIER( MPI_COMM_WORLD, ierr )
      t1 = MPI_WTIME()

c Maximum of MAXIT iterations of the Jacobi algorithm
      do k=1,MAXIT
         locsum =0.0
c Calculation of the next iteration for the LOCAL INNER mesh points
         do j=1,NJloc
            do i=1,NI
               U2(i,j) = 0.25* (U1(i,j-1)+U1(i-1,j)+U1(i,j+1)+U1(i+1,j) 
     $                          -hsq*F(i,j))
               locsum = locsum+(U2(i,j)-U1(i,j))**2
            enddo
         enddo
         U1 = U2

c The change in U is calculated locally on all processes;
c the complete error has to be summed up over all processes         
         call MPI_Allreduce(locsum, sum, 1, MPI_DOUBLE_PRECISION, 
     $                      MPI_SUM, MPI_COMM_WORLD, ierr )

         error = sqrt(sum)
c Terminate the Jacobi iteration if the change in U in two steps
c is LESS than 1E-12
         if (error .lt. 1.e-12) exit 

c The ghost points have to be updated before the next iteration 
c is started
c Due to the FORTRAN memory access pattern 
c (2D arrays are stored columnwise) 
c the ghost points are contiguous and can be exchanged at once
c The first MPI_SENDRECV sends one "row" of ghost points to the
c processor "above" and simultaneously ghost points are received 
c from "below"
         call MPI_SENDRECV( 
     &            U1(1,NJloc), NI, MPI_DOUBLE_PRECISION, nbrtop, 0, 
     &            U1(1,0), NI, MPI_DOUBLE_PRECISION, nbrbottom, 0, 
     &            MPI_COMM_WORLD, status, ierr )
c The second MPI_SENDRECV sends one "row" of ghost points to the
c processor "below" and simultaneously ghost points are received 
c from "above"
	  call MPI_SENDRECV( 
     &            U1(1,1), NI, MPI_DOUBLE_PRECISION, nbrbottom, 1, 
     &            U1(1,NJloc+1), NI, MPI_DOUBLE_PRECISION, nbrtop, 1, 
     &            MPI_COMM_WORLD, status, ierr )

      enddo
      t2 = MPI_WTIME()


      if (k .eq. MAXIT+1) then

         if (myid .eq. 0) then
            write(*,*) "No convergence of Jacobi algorithm within ",
     $                 MAXIT," iterations!"
            write(*,*) "The error is = ",error
            write(*,*) "Wall clock time for calculation:",
     $                 (t2-t1)
         endif
            goto 1000
      else

         if (myid .eq. 0) then
            write(*,*) "Convergence achieved after ",k," iterations!"
            write(*,*) "The error is = ",error
            write(*,*) "Wall clock time for calculation:",
     $                 (t2-t1)
         endif

      endif


 1000 continue

      if (allocated(U1)) deallocate(U1)
      if (allocated(U2)) deallocate(U2)
      if (allocated(F))  deallocate(F)      

      call MPI_FINALIZE(ierr)

      end program OneD_basic

