Monday, 19 July 2010

Fox algorithm for matrix multiplication in parallel with Fortran90+MPI

I'm now re-reading the book "Parallel Programming with MPI" by Peter S. Pacheco and doing some of the exercises in there. An interesting one is the Programming Assignment n.1 in page. 133, which involves fully implementing the Fox parallel algorithm for multiplying matrixes (see for example).

Below is Fortran90 code (version 1... some things need to be improved...) that does it (a better looking source code is at Pastebin, although I'm not sure for how long it will stay there...)


PROGRAM FOX
  IMPLICIT NONE
  include "mpif.h"

  INTEGER :: procs, rank, error
  INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status

  INTEGER :: g_order, g_side, my_g_row, my_g_column, my_g_rank, comm, row_comm, col_comm, block_mpi_t
  INTEGER :: rows
  REAL, DIMENSION(:,:), ALLOCATABLE :: matrixA, matrixB, matrixC, localA, tempA, localB, localC
 
  CALL MPI_Init ( error )
  CALL MPI_Comm_size ( MPI_COMM_WORLD, procs, error )
  CALL MPI_Comm_rank ( MPI_COMM_WORLD, rank, error )
 
  CALL Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)
  CALL Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)
  CALL Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)
  CALL Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)
  CALL Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)

  
  CALL MPI_Finalize (error)

CONTAINS
 
  SUBROUTINE Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)
    INTEGER, INTENT(IN) :: rank, my_g_rank, g_side, comm, procs
    INTEGER, INTENT(OUT) :: error
    INTEGER, DIMENSION(:), INTENT(OUT) :: status
    REAL, DIMENSION(:,:), INTENT(IN) :: localC
    REAL, DIMENSION(:,:), INTENT(OUT) :: matrixC, tempA

    INTEGER :: grow,gcol,i
    INTEGER,DIMENSION(2) :: coordinates

    IF (rank .EQ. 0 .AND. my_g_rank .NE. 0) PRINT*, "Houston, we have a problem with I/O"

    CALL MPI_SEND(localC,g_side*g_side,MPI_REAL,0,0,comm,error)

    IF (rank .EQ. 0) THEN
       matrixC=0
       DO i=1,procs
          CALL MPI_Recv(tempA,g_side*g_side,MPI_REAL,MPI_ANY_SOURCE,0,comm,status,error)
          CALL MPI_Cart_coords(comm,status(MPI_Source),2,coordinates,error)
          grow = coordinates(1)
          gcol = coordinates(2)
          matrixC(grow*g_side+1:(grow+1)*g_side,gcol*g_side+1:(gcol+1)*g_side) = tempA
       END DO
    PRINT*,"============ Matrix multiplication in parallel ==================="
    CALL Print_Matrix(matrixC)
    END IF
  END SUBROUTINE Print_Last_Result


  SUBROUTINE Print_Matrix(M)
    REAL, DIMENSION(:,:) :: M
    INTEGER :: side,row,column

    side = SIZE(M,1)

    DO row=1,side
       DO column=1,side-1
          WRITE(*,'(F10.3)',ADVANCE='NO'), M(row,column)
       END DO
       WRITE(*,'(F10.3)'), M(row,side)
    END DO
  END SUBROUTINE Print_Matrix


  SUBROUTINE Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)
    INTEGER, INTENT(IN) :: my_g_row,my_g_column,g_order,row_comm,col_comm
    INTEGER, INTENT(OUT) :: error
    INTEGER, DIMENSION(:), INTENT(OUT) :: status
    REAL, DIMENSION(:,:), INTENT(IN) :: localA
    REAL, DIMENSION(:,:), INTENT(INOUT) :: tempA,localB,localC

    INTEGER :: source,destination,stage,bcast_root

    localC = 0
    source = MOD(my_g_row + 1,g_order)
    destination = MOD(my_g_row + g_order - 1,g_order)

    DO stage=0,g_order-1
       bcast_root = MOD(my_g_row + stage,g_order)

       IF (my_g_column .EQ. bcast_root) THEN
          CALL MPI_BCAST(localA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)
          CALL Matrix_Multiply(localA,localB,localC)
       ELSE
          CALL MPI_BCAST(tempA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)
          CALL Matrix_Multiply(tempA,localB,localC)
       END IF
       CALL MPI_Sendrecv_replace(localB,g_side*g_side,MPI_REAL,destination,0,source,0,col_comm,status,error)
    END DO
  END SUBROUTINE Perform_Fox_Algorithm


  SUBROUTINE Matrix_Multiply(A,B,C)
    REAL, DIMENSION(:,:), INTENT(IN) :: A,B
    REAL, DIMENSION(:,:), INTENT(OUT) :: C
    INTEGER :: side,row,column

    side = SIZE(A,1)

    DO row=1,side
       DO column=1,side
          C(row,column) = C(row,column) + SUM(A(row,:)*B(:,column))
       END DO
    END DO
  END SUBROUTINE Matrix_Multiply


  SUBROUTINE Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)
    INTEGER, INTENT(IN) :: g_side,rows,my_g_row,my_g_column
    INTEGER, INTENT(OUT) :: block_mpi_t,error
    REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: localA,localB,localC
    REAL, DIMENSION(:,:), INTENT(INOUT) :: matrixA,matrixB

    CALL MPI_TYPE_VECTOR(g_side,g_side,rows,MPI_REAL,block_mpi_t,error)
    CALL MPI_TYPE_COMMIT(block_mpi_t, error)
    

    CALL MPI_BCAST(matrixA,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)
    CALL MPI_BCAST(matrixB,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)

    ALLOCATE(localA(g_side,g_side),tempA(g_side,g_side),localB(g_side,g_side),localC(g_side,g_side))
    localA = matrixA(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)
    localB = matrixB(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)
  END SUBROUTINE Distribute_matrixes


  SUBROUTINE Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)
    INTEGER, INTENT(OUT) :: g_order, g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm

    INTEGER,DIMENSION(2) :: dimensions,coordinates
    LOGICAL,DIMENSION(2) :: wrap_around,free_coords

    g_order = SQRT(REAL(procs))
    g_side = rows / g_order

    dimensions = g_order
    wrap_around = .TRUE.
    CALL MPI_Cart_create(MPI_COMM_WORLD,2,dimensions,wrap_around,.TRUE.,comm,error)
    CALL MPI_Comm_rank ( comm, my_g_rank, error )
    CALL MPI_Cart_coords(comm,my_g_rank,2,coordinates,error)
    my_g_row = coordinates(1)
    my_g_column = coordinates(2)

    free_coords(1) = .FALSE.
    free_coords(2) = .TRUE.
    CALL MPI_Cart_sub(comm,free_coords,row_comm,error)

    free_coords(1) = .TRUE.
    free_coords(2) = .FALSE.
    CALL MPI_Cart_sub(comm,free_coords,col_comm,error)
  END SUBROUTINE Setup_grid

  SUBROUTINE Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)
    INTEGER, INTENT(IN) :: rank
    INTEGER, INTENT(OUT) :: rows,error
    INTEGER :: i
    REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: matrixA,matrixB,matrixC

    IF (rank .EQ. 0) THEN
       READ*, rows
       CALL MPI_BCAST(rows,1,MPI_REAL,0,MPI_COMM_WORLD,error)
       ALLOCATE(matrixA(rows,rows),matrixB(rows,rows),matrixC(rows,rows))
       DO i=1,rows
          READ*, matrixA(i,:)
       END DO
       DO i=1,rows
          READ*, matrixB(i,:)
       END DO

       !! Calculate the matrix multiplication for comparison                                                                                                                          
       matrixC = 0
       CALL Matrix_Multiply(matrixA,matrixB,matrixC)

       PRINT*, "CALCULATED IN PROCESS 0"
       PRINT*, "=======MATRIX A=========="
       CALL Print_Matrix(matrixA)
       PRINT*, "=======MATRIX B=========="
       CALL Print_Matrix(matrixB)
       PRINT*, "=======MATRIX C=========="
       CALL Print_Matrix(matrixC)
    ELSE
       CALL MPI_BCAST(rows,1,MPI_REAL,0,MPI_COMM_WORLD,error)
       ALLOCATE(matrixA(rows,rows),matrixB(rows,rows))
    END IF
  END SUBROUTINE Read_Matrix

END PROGRAM FOX



And given the following matrixes:


angelv@vaso:~/fox$ cat fox.in
8
3.4 4.2 2.1 9.3 5.6 7.8 3.4 6.3
6.6 7.6 2.1 7.2 3.5 7.9 2.0 8.3
8.6 4.1 7.5 3.5 7.1 9.6 4.1 8.1
5.8 4.7 8.6 3.6 7.4 4.3 1.2 0.2
9.9 5.3 8.9 3.7 6.5 9.7 3.6 5.3
8.2 4.7 9.3 3.8 6.7 8.4 4.7 6.4
6.8 8.0 2.8 8.9 9.8 3.2 6.8 8.7
6.4 7.6 8.3 4.3 6.5 4.1 3.9 3.8

3.5 5.7 8.6 4.6 7.5 9.0 2.8 6.0
8.9 3.4 7.3 3.3 5.3 8.2 3.6 3.9
8.3 0.8 2.1 1.6 4.7 6.4 3.4 8.8
7.3 4.3 3.3 2.7 4.5 6.6 9.3 5.9
6.7 6.7 8.0 2.8 3.7 5.7 7.7 8.3
7.8 4.0 8.9 2.9 1.4 0.8 7.8 8.2
3.4 7.1 5.7 9.0 6.1 7.9 8.9 7.7
2.6 4.1 6.9 8.2 9.3 5.0 7.1 4.6




We can compile it and run it as:

angelv@vaso:~/fox$ pgf90 -Mmpi=mpich -o fox fox.f90

angelv@vaso:~/fox$ mpirun -stdin fox.in -np 4 ./fox
 CALCULATED IN PROCESS 0
 =======MATRIX A==========
     3.400     4.200     2.100     9.300     5.600     7.800     3.400     6.300
     6.600     7.600     2.100     7.200     3.500     7.900     2.000     8.300
     8.600     4.100     7.500     3.500     7.100     9.600     4.100     8.100
     5.800     4.700     8.600     3.600     7.400     4.300     1.200     0.200
     9.900     5.300     8.900     3.700     6.500     9.700     3.600     5.300
     8.200     4.700     9.300     3.800     6.700     8.400     4.700     6.400
     6.800     8.000     2.800     8.900     9.800     3.200     6.800     8.700
     6.400     7.600     8.300     4.300     6.500     4.100     3.900     3.800
 =======MATRIX B==========
     3.500     5.700     8.600     4.600     7.500     9.000     2.800     6.000
     8.900     3.400     7.300     3.300     5.300     8.200     3.600     3.900
     8.300     0.800     2.100     1.600     4.700     6.400     3.400     8.800
     7.300     4.300     3.300     2.700     4.500     6.600     9.300     5.900
     6.700     6.700     8.000     2.800     3.700     5.700     7.700     8.300
     7.800     4.000     8.900     2.900     1.400     0.800     7.800     8.200
     3.400     7.100     5.700     9.000     6.100     7.900     8.900     7.700
     2.600     4.100     6.900     8.200     9.300     5.000     7.100     4.600
 =======MATRIX C==========
   260.900   194.020   272.070   178.530   210.450   236.380   297.220   275.730
   274.180   199.380   307.390   197.010   245.450   266.250   285.240   277.610
   311.840   232.300   352.690   225.580   277.280   303.160   320.440   360.720
   247.510   147.520   219.820   111.300   167.610   225.640   198.500   256.890
   327.930   227.120   350.150   209.450   269.700   313.690   306.850   365.810
   318.490   224.600   336.210   216.270   271.960   310.980   311.220   361.910
   319.570   268.880   357.800   255.450   309.740   359.100   362.840   349.110
   288.990   190.670   279.080   175.760   235.560   291.560   257.210   301.530
 ============ Matrix multiplication in parallel ===================
   260.900   194.020   272.070   178.530   210.450   236.380   297.220   275.730
   274.180   199.380   307.390   197.010   245.450   266.250   285.240   277.610
   311.840   232.300   352.690   225.580   277.280   303.160   320.440   360.720
   247.510   147.520   219.820   111.300   167.610   225.640   198.500   256.890
   327.930   227.120   350.150   209.450   269.700   313.690   306.850   365.810
   318.490   224.600   336.210   216.270   271.960   310.980   311.220   361.910
   319.570   268.880   357.800   255.450   309.740   359.100   362.840   349.110
   288.990   190.670   279.080   175.760   235.560   291.560   257.210   301.530
angelv@vaso:~/fox$ 

2 comments:

Islem Long said...

How do you chose your matrix dimensions? and thanks a lot for the post!

angelv said...

Hi,
not sure what you mean... The matrix dimensions are just the sizes of the matrix you want to multiply, which is obviously problem dependent.

I guess I don't understand what you are really trying to ask.