program main
	double precision a(8,8), alocal(4,4)
	integer i, j, r, rank, size, sizeofdbl, ierr
	integer stype, t(2), vtype
	integer displs(2)
	integer blklen(2)
	integer sendcount(4), sdispls(4)
	include 'mpif.h'

	call MPI_Init( ierr )
	call MPI_Comm_rank( MPI_COMM_WORLD, rank, ierr )
	call MPI_Comm_size( MPI_COMM_WORLD, size, ierr )

	if (size .ne. 4) then
	   print *, '"This program requires exactly four processors'
	   call MPI_Abort( MPI_COMM_WORLD, 1, ierr )
	endif

	if (rank .eq. 0) then
C  Initialize the matrix.  Note that C has row-major storage 
	   do 10 j=1,8
	      do 10 i=1,8
 10		 A(i,j) = 1.0 + i / 10.0d0 + j / 100.0d0
C Form the vector type for the submatrix
           call MPI_Type_vector( 4, 4, 8, MPI_DOUBLE_PRECISION, 
     *                           vtype, ierr )
C Set an UB so that we can place this in the matrix 
	   t(1) = vtype
	   t(2) = MPI_UB
	   displs(1) = 0
	   call MPI_Type_size( MPI_DOUBLE_PRECISION, sizeofdbl, ierr )
	   displs(2) = 4 * sizeofdbl
	   blklen(1) = 1
	   blklen(2) = 1
	   call MPI_Type_struct( 2, blklen, displs, t, stype, ierr )
	   call MPI_Type_commit( stype, ierr )
C Setup the Scatter values for the send buffer 
	   sendcount(1) = 1
	   sendcount(2) = 1
	   sendcount(3) = 1
	   sendcount(4) = 1
	   sdispls(1) = 0
	   sdispls(2) = 1
	   sdispls(3) = 8
	   sdispls(4) = 9
	   call MPI_Scatterv( A, sendcount, sdispls, stype, alocal, 4*4, 
     *	              MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr )
	else 
    	    call MPI_Scatterv( MPI_BOTTOM, sendcount, sdispls, stype, 
     *                alocal, 4*4, MPI_DOUBLE_PRECISION, 0, 
     *                MPI_COMM_WORLD, ierr )
	endif

C Everyone can now print their local matrix 
	do r=0, size-1
	   if (rank .eq. r) then
	      print *, "Output for process ", r
	      do i=1,4
		 print *, (alocal(i,j),j=1,4)
	      enddo
	   endif
	   call MPI_Barrier( MPI_COMM_WORLD, ierr )
	enddo

	call MPI_Finalize( ierr )
	stop
	end