Below is the Fortran 90 code (version 1; a few things could still be improved) that does it. A nicer-looking copy of the source is at Pastebin, though I'm not sure how long it will stay there.
PROGRAM FOX
IMPLICIT NONE
include "mpif.h"
INTEGER :: procs, rank, error
INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status
INTEGER :: g_order, g_side, my_g_row, my_g_column, my_g_rank, comm, row_comm, col_comm, block_mpi_t
INTEGER :: rows
REAL, DIMENSION(:,:), ALLOCATABLE :: matrixA, matrixB, matrixC, localA, tempA, localB, localC
CALL MPI_Init ( error )
CALL MPI_Comm_size ( MPI_COMM_WORLD, procs, error )
CALL MPI_Comm_rank ( MPI_COMM_WORLD, rank, error )
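! Overall flow: rank 0 reads the matrices, every rank joins a square
! process grid, each rank receives its local blocks, Fox's algorithm
! runs, and the assembled result is printed from rank 0.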
CALL Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)
CALL Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)
CALL Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)
CALL Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)
CALL Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)
CALL MPI_Finalize (error)
CONTAINS
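! Gather the local C blocks on rank 0 and print the assembled matrix.
! Every process (including rank 0 itself) sends its block; rank 0 receives
! procs blocks in arbitrary order and uses MPI_Cart_coords on the sender's
! rank to place each one. Rank 0's blocking send to itself relies on MPI
! buffering, which is fine for small blocks. The "Houston" check below
! catches the case where MPI_Cart_create reordered rank 0 away from the
! process that did the I/O.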
SUBROUTINE Print_Last_Result(rank, my_g_rank, g_side, comm, procs, error, status, localC, matrixC, tempA)
INTEGER, INTENT(IN) :: rank, my_g_rank, g_side, comm, procs
INTEGER, INTENT(OUT) :: error
INTEGER, DIMENSION(:), INTENT(OUT) :: status
REAL, DIMENSION(:,:), INTENT(IN) :: localC
REAL, DIMENSION(:,:), INTENT(OUT) :: matrixC, tempA
INTEGER :: grow,gcol,i
INTEGER,DIMENSION(2) :: coordinates
IF (rank .EQ. 0 .AND. my_g_rank .NE. 0) PRINT*, "Houston, we have a problem with I/O"
CALL MPI_SEND(localC,g_side*g_side,MPI_REAL,0,0,comm,error)
IF (rank .EQ. 0) THEN
matrixC=0
DO i=1,procs
CALL MPI_Recv(tempA,g_side*g_side,MPI_REAL,MPI_ANY_SOURCE,0,comm,status,error)
CALL MPI_Cart_coords(comm,status(MPI_SOURCE),2,coordinates,error)
grow = coordinates(1)
gcol = coordinates(2)
matrixC(grow*g_side+1:(grow+1)*g_side,gcol*g_side+1:(gcol+1)*g_side) = tempA
END DO
PRINT*,"============ Matrix multiplication in parallel ==================="
CALL Print_Matrix(matrixC)
END IF
END SUBROUTINE Print_Last_Result
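! Print a square matrix, one row per line, one F10.3 field per element.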
SUBROUTINE Print_Matrix(M)
REAL, DIMENSION(:,:) :: M
INTEGER :: side,row,column
side = SIZE(M,1)
DO row=1,side
DO column=1,side-1
WRITE(*,'(F10.3)',ADVANCE='NO') M(row,column) ! no comma after the format spec: that is a nonstandard extension
END DO
WRITE(*,'(F10.3)') M(row,side)
END DO
END SUBROUTINE Print_Matrix
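! Fox's algorithm proper. In stage s the process in grid column
! MOD(my_g_row + s, g_order) broadcasts its A block along its grid row;
! every process multiplies the received block into its running C block,
! and the B blocks are then rolled one step up each grid column with
! MPI_Sendrecv_replace. After g_order stages every C block is complete.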
SUBROUTINE Perform_Fox_Algorithm(my_g_row,my_g_column,g_order,localA,tempA,localB,localC,row_comm,col_comm,status,error)
INTEGER, INTENT(IN) :: my_g_row,my_g_column,g_order,row_comm,col_comm
INTEGER, INTENT(OUT) :: error
INTEGER, DIMENSION(:), INTENT(OUT) :: status
REAL, DIMENSION(:,:), INTENT(IN) :: localA
REAL, DIMENSION(:,:), INTENT(INOUT) :: tempA,localB,localC
INTEGER :: source,destination,stage,bcast_root
localC = 0
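! B circulates upwards along each column: receive from the row below
! (source), send to the row above (destination), with wraparound.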
source = MOD(my_g_row + 1,g_order)
destination = MOD(my_g_row + g_order - 1,g_order)
DO stage=0,g_order-1
bcast_root = MOD(my_g_row + stage,g_order)
IF (my_g_column .EQ. bcast_root) THEN
CALL MPI_BCAST(localA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)
CALL Matrix_Multiply(localA,localB,localC)
ELSE
CALL MPI_BCAST(tempA,g_side*g_side,MPI_REAL,bcast_root,row_comm,error)
CALL Matrix_Multiply(tempA,localB,localC)
END IF
CALL MPI_Sendrecv_replace(localB,g_side*g_side,MPI_REAL,destination,0,source,0,col_comm,status,error)
END DO
END SUBROUTINE Perform_Fox_Algorithm
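! Naive triple-loop multiply that accumulates into C, so the caller
! must zero C before the first call.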
SUBROUTINE Matrix_Multiply(A,B,C)
REAL, DIMENSION(:,:), INTENT(IN) :: A,B
REAL, DIMENSION(:,:), INTENT(INOUT) :: C ! INOUT, not OUT: C is read and accumulated into
INTEGER :: side,row,column
side = SIZE(A,1)
DO row=1,side
DO column=1,side
C(row,column) = C(row,column) + SUM(A(row,:)*B(:,column))
END DO
END DO
END SUBROUTINE Matrix_Multiply
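! Hand each process its local blocks. Version 1 simply broadcasts the
! full matrices and lets each process copy out its own g_side x g_side
! block; the MPI vector type block_mpi_t is built and committed but never
! used yet (presumably for a later version that scatters blocks directly).
! Note that tempA is not a dummy argument: it is the main program's
! variable, reached by host association, and is allocated here too.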
SUBROUTINE Distribute_matrixes(g_side,rows,block_mpi_t,error,matrixA,matrixB,localA,localB,localC,my_g_row,my_g_column)
INTEGER, INTENT(IN) :: g_side,rows,my_g_row,my_g_column
INTEGER, INTENT(OUT) :: block_mpi_t,error
REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: localA,localB,localC
REAL, DIMENSION(:,:), INTENT(INOUT) :: matrixA,matrixB
CALL MPI_TYPE_VECTOR(g_side,g_side,rows,MPI_REAL,block_mpi_t,error)
CALL MPI_TYPE_COMMIT(block_mpi_t, error)
CALL MPI_BCAST(matrixA,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)
CALL MPI_BCAST(matrixB,rows*rows,MPI_REAL,0,MPI_COMM_WORLD,error)
ALLOCATE(localA(g_side,g_side),tempA(g_side,g_side),localB(g_side,g_side),localC(g_side,g_side))
localA = matrixA(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)
localB = matrixB(my_g_row*g_side+1:(my_g_row+1)*g_side,my_g_column*g_side+1:(my_g_column+1)*g_side)
END SUBROUTINE Distribute_matrixes
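! Build a periodic g_order x g_order Cartesian grid over all processes,
! plus one sub-communicator per grid row (for the A broadcasts) and one
! per grid column (for the B shifts).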
SUBROUTINE Setup_grid(g_order,g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm)
INTEGER, INTENT(OUT) :: g_order, g_side,error,comm,my_g_rank,my_g_row,my_g_column,row_comm,col_comm
INTEGER,DIMENSION(2) :: dimensions,coordinates
LOGICAL,DIMENSION(2) :: wrap_around,free_coords
g_order = NINT(SQRT(REAL(procs))) ! assumes procs is a perfect square (4 -> 2x2 grid); NINT avoids truncating 1.999... to 1
g_side = rows / g_order ! assumes rows is divisible by g_order
dimensions = g_order
wrap_around = .TRUE.
CALL MPI_Cart_create(MPI_COMM_WORLD,2,dimensions,wrap_around,.TRUE.,comm,error)
CALL MPI_Comm_rank ( comm, my_g_rank, error )
CALL MPI_Cart_coords(comm,my_g_rank,2,coordinates,error)
my_g_row = coordinates(1)
my_g_column = coordinates(2)
free_coords(1) = .FALSE.
free_coords(2) = .TRUE.
CALL MPI_Cart_sub(comm,free_coords,row_comm,error)
free_coords(1) = .TRUE.
free_coords(2) = .FALSE.
CALL MPI_Cart_sub(comm,free_coords,col_comm,error)
END SUBROUTINE Setup_grid
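! Rank 0 reads the matrix order and both matrices from stdin, broadcasts
! the order, and computes the serial product for later comparison; the
! other ranks just learn the order and allocate their copies.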
SUBROUTINE Read_Matrix(rank,error,rows,matrixA,matrixB,matrixC)
INTEGER, INTENT(IN) :: rank
INTEGER, INTENT(OUT) :: rows,error
INTEGER :: i
REAL, DIMENSION(:,:), ALLOCATABLE, INTENT(OUT) :: matrixA,matrixB,matrixC
IF (rank .EQ. 0) THEN
READ*, rows
CALL MPI_BCAST(rows,1,MPI_INTEGER,0,MPI_COMM_WORLD,error) ! rows is an INTEGER, so MPI_INTEGER, not MPI_REAL
ALLOCATE(matrixA(rows,rows),matrixB(rows,rows),matrixC(rows,rows))
DO i=1,rows
READ*, matrixA(i,:)
END DO
DO i=1,rows
READ*, matrixB(i,:)
END DO
!! Calculate the matrix multiplication for comparison
matrixC = 0
CALL Matrix_Multiply(matrixA,matrixB,matrixC)
PRINT*, "CALCULATED IN PROCESS 0"
PRINT*, "=======MATRIX A=========="
CALL Print_Matrix(matrixA)
PRINT*, "=======MATRIX B=========="
CALL Print_Matrix(matrixB)
PRINT*, "=======MATRIX C=========="
CALL Print_Matrix(matrixC)
ELSE
CALL MPI_BCAST(rows,1,MPI_INTEGER,0,MPI_COMM_WORLD,error) ! datatype must match the root's: MPI_INTEGER
ALLOCATE(matrixA(rows,rows),matrixB(rows,rows),matrixC(rows,rows)) ! matrixC too: it is passed to Print_Last_Result on every rank
END IF
END SUBROUTINE Read_Matrix
END PROGRAM FOX
And given the following input file (the matrix order on the first line, then the rows of A, then the rows of B):
angelv@vaso:~/fox$ cat fox.in
8
3.4 4.2 2.1 9.3 5.6 7.8 3.4 6.3
6.6 7.6 2.1 7.2 3.5 7.9 2.0 8.3
8.6 4.1 7.5 3.5 7.1 9.6 4.1 8.1
5.8 4.7 8.6 3.6 7.4 4.3 1.2 0.2
9.9 5.3 8.9 3.7 6.5 9.7 3.6 5.3
8.2 4.7 9.3 3.8 6.7 8.4 4.7 6.4
6.8 8.0 2.8 8.9 9.8 3.2 6.8 8.7
6.4 7.6 8.3 4.3 6.5 4.1 3.9 3.8
3.5 5.7 8.6 4.6 7.5 9.0 2.8 6.0
8.9 3.4 7.3 3.3 5.3 8.2 3.6 3.9
8.3 0.8 2.1 1.6 4.7 6.4 3.4 8.8
7.3 4.3 3.3 2.7 4.5 6.6 9.3 5.9
6.7 6.7 8.0 2.8 3.7 5.7 7.7 8.3
7.8 4.0 8.9 2.9 1.4 0.8 7.8 8.2
3.4 7.1 5.7 9.0 6.1 7.9 8.9 7.7
2.6 4.1 6.9 8.2 9.3 5.0 7.1 4.6
We can compile it and run it as:
angelv@vaso:~/fox$ pgf90 -Mmpi=mpich -o fox fox.f90
angelv@vaso:~/fox$ mpirun -stdin fox.in -np 4 ./fox
CALCULATED IN PROCESS 0
=======MATRIX A==========
3.400 4.200 2.100 9.300 5.600 7.800 3.400 6.300
6.600 7.600 2.100 7.200 3.500 7.900 2.000 8.300
8.600 4.100 7.500 3.500 7.100 9.600 4.100 8.100
5.800 4.700 8.600 3.600 7.400 4.300 1.200 0.200
9.900 5.300 8.900 3.700 6.500 9.700 3.600 5.300
8.200 4.700 9.300 3.800 6.700 8.400 4.700 6.400
6.800 8.000 2.800 8.900 9.800 3.200 6.800 8.700
6.400 7.600 8.300 4.300 6.500 4.100 3.900 3.800
=======MATRIX B==========
3.500 5.700 8.600 4.600 7.500 9.000 2.800 6.000
8.900 3.400 7.300 3.300 5.300 8.200 3.600 3.900
8.300 0.800 2.100 1.600 4.700 6.400 3.400 8.800
7.300 4.300 3.300 2.700 4.500 6.600 9.300 5.900
6.700 6.700 8.000 2.800 3.700 5.700 7.700 8.300
7.800 4.000 8.900 2.900 1.400 0.800 7.800 8.200
3.400 7.100 5.700 9.000 6.100 7.900 8.900 7.700
2.600 4.100 6.900 8.200 9.300 5.000 7.100 4.600
=======MATRIX C==========
260.900 194.020 272.070 178.530 210.450 236.380 297.220 275.730
274.180 199.380 307.390 197.010 245.450 266.250 285.240 277.610
311.840 232.300 352.690 225.580 277.280 303.160 320.440 360.720
247.510 147.520 219.820 111.300 167.610 225.640 198.500 256.890
327.930 227.120 350.150 209.450 269.700 313.690 306.850 365.810
318.490 224.600 336.210 216.270 271.960 310.980 311.220 361.910
319.570 268.880 357.800 255.450 309.740 359.100 362.840 349.110
288.990 190.670 279.080 175.760 235.560 291.560 257.210 301.530
============ Matrix multiplication in parallel ===================
260.900 194.020 272.070 178.530 210.450 236.380 297.220 275.730
274.180 199.380 307.390 197.010 245.450 266.250 285.240 277.610
311.840 232.300 352.690 225.580 277.280 303.160 320.440 360.720
247.510 147.520 219.820 111.300 167.610 225.640 198.500 256.890
327.930 227.120 350.150 209.450 269.700 313.690 306.850 365.810
318.490 224.600 336.210 216.270 271.960 310.980 311.220 361.910
319.570 268.880 357.800 255.450 309.740 359.100 362.840 349.110
288.990 190.670 279.080 175.760 235.560 291.560 257.210 301.530
angelv@vaso:~/fox$
2 comments:
How do you choose your matrix dimensions? And thanks a lot for the post!
Hi,
the matrix dimensions themselves are problem dependent: they are just the size of the matrices you want to multiply. The only constraints this implementation imposes are that the number of processes is a perfect square (so a square process grid can be formed) and that the matrix order is divisible by the grid order; in the run above, 8x8 matrices on 4 processes give a 2x2 grid of 4x4 blocks. If you meant the input format: the first line of fox.in is the matrix order, followed by the rows of A and then the rows of B.
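By the way, the posted program doesn't verify these constraints, so a bad combination just crashes or produces garbage. A minimal sketch of a guard, placed right after the call to Read_Matrix (check_order is a hypothetical INTEGER you would add to the declarations; it is not in the code above):
check_order = NINT(SQRT(REAL(procs))) ! hypothetical variable, not in the posted code
IF (check_order*check_order .NE. procs .OR. MOD(rows,check_order) .NE. 0) THEN
IF (rank .EQ. 0) PRINT*, "procs must be a perfect square whose root divides the matrix order"
CALL MPI_Abort(MPI_COMM_WORLD,1,error)
END IF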