My progress bar: Fox algorithm for matrix multiplication in parallel with Fortran90+MPI

I'm now re-reading the book "Parallel Programming with MPI" by Peter S. Pacheco and doing some of the exercises in there. An interesting one is the Programming Assignment n.1 in page. 133, which involves fully implementing the Fox parallel algorithm for multiplying matrixes (see for example).

Below is Fortran90 code (version 1... some things need to be improved...) that does it (a better looking source code is at Pastebin, although I'm not sure for how long it will stay there...)



PROGRAM FOX

  IMPLICIT NONE

  include "mpif.h"



  INTEGER :: procs, rank, error

  INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status



  INTEGER :: g_order, g_side, my_g_row, my_g_column, my_g_rank, comm, row_comm, col_comm, block_mpi_t

  INTEGER :: rows

  REAL, DIMENSION(:,:), ALLOCATABLE :: matrixA, matrixB, matrixC, localA, tempA, localB, localC

  

  CALL MPI_Init ( error )

  CALL MPI_Comm_size ( MPI_COMM_WORLD, procs, error )

  CALL MPI_Comm_rank ( MPI_COMM_WORLD, rank, error )

  

  CALL Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)

  CALL Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)

  CALL Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)

  CALL Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)

  CALL Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)

CALL MPI_Finalize (error)



CONTAINS

  

  SUBROUTINE Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)

    INTEGER, INTENT(IN) :: rank, my_g_rank, g_side, comm, procs

    INTEGER, INTENT(OUT) :: error

    INTEGER, DIMENSION(:), INTENT(OUT) :: status

    REAL, DIMENSION(:,:), INTENT(IN) :: localC

    REAL, DIMENSION(:,:), INTENT(OUT) :: matrixC, tempA



    INTEGER :: grow,gcol,i

    INTEGER,DIMENSION(2) :: coordinates



    IF (rank .EQ. 0 .AND. my_g_rank .NE. 0) PRINT*, "Houston, we have a problem with I/O"



    CALL MPI_SEND(localC,g_side*g_side,MPI_REAL,0,0,comm,error)



    IF (rank .EQ. 0) THEN

       matrixC=0

       DO i=1,procs

          CALL MPI_Recv(tempA,g_side*g_side,MPI_REAL,MPI_ANY_SOURCE,0,comm,status,error)

          CALL MPI_Cart_coords(comm,status(MPI_Source),2,coordinates,error)

          grow = coordinates(1)

          gcol = coordinates(2)

          matrixC(grow*g_side+1:(grow+1)*g_side,gcol*g_side+1:(gcol+1)*g_side) = tempA

       END DO

    PRINT*,"============ Matrix multiplication in parallel ==================="

    CALL Print_Matrix(matrixC)

    END IF

  END SUBROUTINE Print_Last_Result





  SUBROUTINE Print_Matrix(M)

    REAL, DIMENSION(:,:) :: M

    INTEGER :: side,row,column



    side = SIZE(M,1)



    DO row=1,side

       DO column=1,side-1

          WRITE(*,'(F10.3)',ADVANCE='NO'), M(row,column)

       END DO

       WRITE(*,'(F10.3)'), M(row,side)

    END DO

  END SUBROUTINE Print_Matrix





  SUBROUTINE Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)

    INTEGER, INTENT(IN) :: my_g_row,my_g_column,g_order,row_comm,col_comm

    INTEGER, INTENT(OUT) :: error

    INTEGER, DIMENSION(:), INTENT(OUT) :: status

    REAL, DIMENSION(:,:), INTENT(IN) :: localA

    REAL, DIMENSION(:,:), INTENT(INOUT) :: tempA,localB,localC



    INTEGER :: source,destination,stage,bcast_root



    localC = 0

    source = MOD(my_g_row + 1,g_order)

    destination = MOD(my_g_row + g_order - 1,g_order)



    DO stage=0,g_order-1

       bcast_root = MOD(my_g_row + stage,g_order)



       IF (my_g_column .EQ. bcast_root) THEN

          CALL MPI_BCAST(localA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)

          CALL Matrix_Multiply(localA,localB,localC)

       ELSE

          CALL MPI_BCAST(tempA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)

          CALL Matrix_Multiply(tempA,localB,localC)

       END IF

       CALL MPI_Sendrecv_replace(localB,g_side*g_side,MPI_REAL,destination,0,source,0,col_comm,status,error)

    END DO

  END SUBROUTINE Perform_Fox_Algorithm





  SUBROUTINE Matrix_Multiply(A,B,C)

    REAL, DIMENSION(:,:), INTENT(IN) :: A,B

    REAL, DIMENSION(:,:), INTENT(OUT) :: C

    INTEGER :: side,row,column



    side = SIZE(A,1)



    DO row=1,side

       DO column=1,side

          C(row,column) = C(row,column) + SUM(A(row,:)*B(:,column))

       END DO

    END DO

  END SUBROUTINE Matrix_Multiply





  SUBROUTINE Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)

    INTEGER, INTENT(IN) :: g_side,rows,my_g_row,my_g_column

    INTEGER, INTENT(OUT) :: block_mpi_t,error

    REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: localA,localB,localC

    REAL, DIMENSION(:,:), INTENT(INOUT) :: matrixA,matrixB



    CALL MPI_TYPE_VECTOR(g_side,g_side,rows,MPI_REAL,block_mpi_t,error)

    CALL MPI_TYPE_COMMIT(block_mpi_t, error)

    CALL MPI_BCAST(matrixA,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)

    CALL MPI_BCAST(matrixB,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)



    ALLOCATE(localA(g_side,g_side),tempA(g_side,g_side),localB(g_side,g_side),localC(g_side,g_side))

    localA = matrixA(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)

    localB = matrixB(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)

  END SUBROUTINE Distribute_matrixes





  SUBROUTINE Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)

    INTEGER, INTENT(OUT) :: g_order, g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm



    INTEGER,DIMENSION(2) :: dimensions,coordinates

    LOGICAL,DIMENSION(2) :: wrap_around,free_coords



    g_order = SQRT(REAL(procs))

    g_side = rows / g_order



    dimensions = g_order

    wrap_around = .TRUE.

    CALL MPI_Cart_create(MPI_COMM_WORLD,2,dimensions,wrap_around,.TRUE.,comm,error)

    CALL MPI_Comm_rank ( comm, my_g_rank, error )

    CALL MPI_Cart_coords(comm,my_g_rank,2,coordinates,error)

    my_g_row = coordinates(1)

    my_g_column = coordinates(2)



    free_coords(1) = .FALSE.

    free_coords(2) = .TRUE.

    CALL MPI_Cart_sub(comm,free_coords,row_comm,error)



    free_coords(1) = .TRUE.

    free_coords(2) = .FALSE.

    CALL MPI_Cart_sub(comm,free_coords,col_comm,error)

  END SUBROUTINE Setup_grid



  SUBROUTINE Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)

    INTEGER, INTENT(IN) :: rank

    INTEGER, INTENT(OUT) :: rows,error

    INTEGER :: i

    REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: matrixA,matrixB,matrixC



    IF (rank .EQ. 0) THEN

       READ*, rows

       CALL MPI_BCAST(rows,1,MPI_REAL,0,MPI_COMM_WORLD,error)

       ALLOCATE(matrixA(rows,rows),matrixB(rows,rows),matrixC(rows,rows))

       DO i=1,rows

          READ*, matrixA(i,:)

       END DO

       DO i=1,rows

          READ*, matrixB(i,:)

       END DO



       !! Calculate the matrix multiplication for comparison                                                                                                                           

       matrixC = 0

       CALL Matrix_Multiply(matrixA,matrixB,matrixC)



       PRINT*, "CALCULATED IN PROCESS 0"

       PRINT*, "=======MATRIX A=========="

       CALL Print_Matrix(matrixA)

       PRINT*, "=======MATRIX B=========="

       CALL Print_Matrix(matrixB)

       PRINT*, "=======MATRIX C=========="

       CALL Print_Matrix(matrixC)

    ELSE

       CALL MPI_BCAST(rows,1,MPI_REAL,0,MPI_COMM_WORLD,error)

       ALLOCATE(matrixA(rows,rows),matrixB(rows,rows))

    END IF

  END SUBROUTINE Read_Matrix



END PROGRAM FOX

And given the following matrixes:



angelv@vaso:~/fox$ cat fox.in

8

3.4 4.2 2.1 9.3 5.6 7.8 3.4 6.3

6.6 7.6 2.1 7.2 3.5 7.9 2.0 8.3

8.6 4.1 7.5 3.5 7.1 9.6 4.1 8.1

5.8 4.7 8.6 3.6 7.4 4.3 1.2 0.2

9.9 5.3 8.9 3.7 6.5 9.7 3.6 5.3

8.2 4.7 9.3 3.8 6.7 8.4 4.7 6.4

6.8 8.0 2.8 8.9 9.8 3.2 6.8 8.7

6.4 7.6 8.3 4.3 6.5 4.1 3.9 3.8



3.5 5.7 8.6 4.6 7.5 9.0 2.8 6.0

8.9 3.4 7.3 3.3 5.3 8.2 3.6 3.9

8.3 0.8 2.1 1.6 4.7 6.4 3.4 8.8

7.3 4.3 3.3 2.7 4.5 6.6 9.3 5.9

6.7 6.7 8.0 2.8 3.7 5.7 7.7 8.3

7.8 4.0 8.9 2.9 1.4 0.8 7.8 8.2

3.4 7.1 5.7 9.0 6.1 7.9 8.9 7.7

2.6 4.1 6.9 8.2 9.3 5.0 7.1 4.6

We can compile it and run it as:

angelv@vaso:~/fox$ pgf90 -Mmpi=mpich -o fox fox.f90

angelv@vaso:~/fox$ mpirun -stdin fox.in -np 4 ./fox

 CALCULATED IN PROCESS 0

 =======MATRIX A==========

     3.400     4.200     2.100     9.300     5.600     7.800     3.400     6.300

     6.600     7.600     2.100     7.200     3.500     7.900     2.000     8.300

     8.600     4.100     7.500     3.500     7.100     9.600     4.100     8.100

     5.800     4.700     8.600     3.600     7.400     4.300     1.200     0.200

     9.900     5.300     8.900     3.700     6.500     9.700     3.600     5.300

     8.200     4.700     9.300     3.800     6.700     8.400     4.700     6.400

     6.800     8.000     2.800     8.900     9.800     3.200     6.800     8.700

     6.400     7.600     8.300     4.300     6.500     4.100     3.900     3.800

 =======MATRIX B==========

     3.500     5.700     8.600     4.600     7.500     9.000     2.800     6.000

     8.900     3.400     7.300     3.300     5.300     8.200     3.600     3.900

     8.300     0.800     2.100     1.600     4.700     6.400     3.400     8.800

     7.300     4.300     3.300     2.700     4.500     6.600     9.300     5.900

     6.700     6.700     8.000     2.800     3.700     5.700     7.700     8.300

     7.800     4.000     8.900     2.900     1.400     0.800     7.800     8.200

     3.400     7.100     5.700     9.000     6.100     7.900     8.900     7.700

     2.600     4.100     6.900     8.200     9.300     5.000     7.100     4.600

 =======MATRIX C==========

   260.900   194.020   272.070   178.530   210.450   236.380   297.220   275.730

   274.180   199.380   307.390   197.010   245.450   266.250   285.240   277.610

   311.840   232.300   352.690   225.580   277.280   303.160   320.440   360.720

   247.510   147.520   219.820   111.300   167.610   225.640   198.500   256.890

   327.930   227.120   350.150   209.450   269.700   313.690   306.850   365.810

   318.490   224.600   336.210   216.270   271.960   310.980   311.220   361.910

   319.570   268.880   357.800   255.450   309.740   359.100   362.840   349.110

   288.990   190.670   279.080   175.760   235.560   291.560   257.210   301.530

 ============ Matrix multiplication in parallel ===================

   260.900   194.020   272.070   178.530   210.450   236.380   297.220   275.730

   274.180   199.380   307.390   197.010   245.450   266.250   285.240   277.610

   311.840   232.300   352.690   225.580   277.280   303.160   320.440   360.720

   247.510   147.520   219.820   111.300   167.610   225.640   198.500   256.890

   327.930   227.120   350.150   209.450   269.700   313.690   306.850   365.810

   318.490   224.600   336.210   216.270   271.960   310.980   311.220   361.910

   319.570   268.880   357.800   255.450   309.740   359.100   362.840   349.110

   288.990   190.670   279.080   175.760   235.560   291.560   257.210   301.530

angelv@vaso:~/fox$

2 comments:

Unknown said...: How do you chose your matrix dimensions? and thanks a lot for the post!; 19 October 2012 at 13:11
angelv said...: Hi,
not sure what you mean... The matrix dimensions are just the sizes of the matrix you want to multiply, which is obviously problem dependent.

I guess I don't understand what you are really trying to ask.; 22 October 2012 at 08:20

Monday, 19 July 2010

Fox algorithm for matrix multiplication in parallel with Fortran90+MPI

2 comments: