Commit 2401cd5b authored by Ciarán Ó Rourke

MPI Non-blocking and Collective Communication

Change log:
* add collective communication slides
* add non-blocking exercises
* add collective exercises
* add non-blocking ping-pong solution
* standardise slides templating
* add data decomposition slide
* add non-blocking vs persistent demo
* add blocking vs non-blocking demo
* add non-blocking slides
parent 947c0df4
#include <mpi.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double before_time = MPI_Wtime();

    if (rank == 0) {
        /* simulate 2 units of work */
        sleep(2);
        /* send rank to processor 1 */
        MPI_Ssend(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        /* simulate 4 units of work */
        sleep(4);
        /* send rank to processor 1 again */
        MPI_Ssend(&rank, 1, MPI_INT, 1, 1, MPI_COMM_WORLD);
    } else if (rank == 1) {
        int recv_rank;
        /* simulate 4 units of work */
        sleep(4);
        /* receive rank from processor 0 */
        MPI_Recv(&recv_rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* simulate 2 units of work */
        sleep(2);
        /* receive rank from processor 0 again */
        MPI_Recv(&recv_rank, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    printf("Processor %d elapsed time: %lfs\n", rank, MPI_Wtime() - before_time);

    MPI_Finalize();
    return 0;
}
#include <mpi.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Request req1, req2;

    double before_time = MPI_Wtime();

    const int n = 100000;
    if (rank == 0) {
        for (int i = 0; i < n; i++) {
            /* simulate 10 units of work */
            usleep(10);
            /* send rank to processor 1 */
            MPI_Issend(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req1);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for first communication to finish */
            MPI_Wait(&req1, MPI_STATUS_IGNORE);
            /* send rank to processor 1 again */
            MPI_Issend(&rank, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req2);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for second communication to finish */
            MPI_Wait(&req2, MPI_STATUS_IGNORE);
        }
    } else if (rank == 1) {
        for (int i = 0; i < n; i++) {
            int recv_rank;
            /* simulate 10 units of work */
            usleep(10);
            /* receive rank from processor 0 */
            MPI_Irecv(&recv_rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req1);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for first communication to finish */
            MPI_Wait(&req1, MPI_STATUS_IGNORE);
            /* receive rank from processor 0 again */
            MPI_Irecv(&recv_rank, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &req2);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for second communication to finish */
            MPI_Wait(&req2, MPI_STATUS_IGNORE);
        }
    }

    printf("Processor %d elapsed time: %lfs\n", rank, MPI_Wtime() - before_time);

    MPI_Finalize();
    return 0;
}
#include <mpi.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Request req1, req2;

    double before_time = MPI_Wtime();

    if (rank == 0) {
        /* simulate 2 units of work */
        sleep(2);
        /* send rank to processor 1 */
        MPI_Issend(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req1);
        /* simulate 4 units of work */
        sleep(4);
        /* wait for first communication to finish */
        MPI_Wait(&req1, MPI_STATUS_IGNORE);
        /* send rank to processor 1 again */
        MPI_Issend(&rank, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req2);
        /* wait for second communication to finish */
        MPI_Wait(&req2, MPI_STATUS_IGNORE);
    } else if (rank == 1) {
        int recv_rank;
        /* simulate 4 units of work */
        sleep(4);
        /* receive rank from processor 0 */
        MPI_Irecv(&recv_rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req1);
        /* simulate 2 units of work */
        sleep(2);
        /* wait for first communication to finish */
        MPI_Wait(&req1, MPI_STATUS_IGNORE);
        /* receive rank from processor 0 again */
        MPI_Irecv(&recv_rank, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &req2);
        /* wait for second communication to finish */
        MPI_Wait(&req2, MPI_STATUS_IGNORE);
    }

    printf("Processor %d elapsed time: %lfs\n", rank, MPI_Wtime() - before_time);

    MPI_Finalize();
    return 0;
}
#include <mpi.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Request req1, req2;

    double before_time = MPI_Wtime();

    const int n = 100000;
    if (rank == 0) {
        /* set up the persistent sends once, outside the loop */
        MPI_Send_init(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req1);
        MPI_Send_init(&rank, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req2);
        for (int i = 0; i < n; i++) {
            /* simulate 10 units of work */
            usleep(10);
            /* start the first send to processor 1 */
            MPI_Start(&req1);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for first communication to finish */
            MPI_Wait(&req1, MPI_STATUS_IGNORE);
            /* start the second send to processor 1 */
            MPI_Start(&req2);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for second communication to finish */
            MPI_Wait(&req2, MPI_STATUS_IGNORE);
        }
        /* persistent requests must be freed explicitly */
        MPI_Request_free(&req1);
        MPI_Request_free(&req2);
    } else if (rank == 1) {
        int recv_rank;
        /* set up the persistent receives once, outside the loop */
        MPI_Recv_init(&recv_rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req1);
        MPI_Recv_init(&recv_rank, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &req2);
        for (int i = 0; i < n; i++) {
            /* simulate 10 units of work */
            usleep(10);
            /* start the first receive from processor 0 */
            MPI_Start(&req1);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for first communication to finish */
            MPI_Wait(&req1, MPI_STATUS_IGNORE);
            /* start the second receive from processor 0 */
            MPI_Start(&req2);
            /* simulate 10 units of work */
            usleep(10);
            /* wait for second communication to finish */
            MPI_Wait(&req2, MPI_STATUS_IGNORE);
        }
        /* persistent requests must be freed explicitly */
        MPI_Request_free(&req1);
        MPI_Request_free(&req2);
    }

    printf("Processor %d elapsed time: %lfs\n", rank, MPI_Wtime() - before_time);

    MPI_Finalize();
    return 0;
}
# Non-blocking Communication Exercises
## Ring Exchange
Modify your ring exchange implementation from the previous set of exercises to
use non-blocking communication.
Don't expect (much of) a speedup, since there is no computation with which to
hide the communication latency.
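As a rough guide, a minimal sketch of the non-blocking pattern is given below,
assuming each process passes its rank around the ring; the variable names are
illustrative, not those of the exercise code.

```c
#include <mpi.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* neighbours on the ring */
    int left = (rank - 1 + size) % size;
    int right = (rank + 1) % size;

    int sendbuf = rank, recvbuf = -1;
    MPI_Request reqs[2];

    /* post both transfers, then complete them together */
    MPI_Irecv(&recvbuf, 1, MPI_INT, left, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Isend(&sendbuf, 1, MPI_INT, right, 0, MPI_COMM_WORLD, &reqs[1]);
    /* independent work could overlap with the transfer here */
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

    printf("Processor %d received %d\n", rank, recvbuf);

    MPI_Finalize();
    return 0;
}
```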
## Heat equation
The provided [heat equation code](./heat_equation/main.c) is a working
implementation of the two-dimensional heat equation with periodic boundary
conditions, parallelised in one dimension (the y-dimension). Replace the
blocking communication pattern with a non-blocking one. Think carefully about
where to place completion routines.
Compare the execution time of your implementation with the blocking version.
You should see a speedup, and the grid should still tend towards uniformity.
***Note:*** periodic boundary conditions in the x-dimension are handled
locally. In the y-dimension they are handled via MPI: `rank 0` and `rank N-1`
exchange boundary data with one another.
***Hint:*** To perform communication independent work before completion
routines, the evolve function in [evolve.h](./heat_equation/evolve.h) must be
modified.
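One possible shape for the non-blocking exchange, with the halo-independent
interior update placed between posting and completing the requests, is
sketched below. The function and variable names (`halo_exchange_and_evolve`,
`field`, `nx`, `ny`, `up`, `down`) are assumptions for illustration, not the
names used in the provided code.

```c
#include <mpi.h>

/* Sketch only: `field` holds ny + 2 rows of nx points, rows 0 and ny + 1
 * being the halos; `up`/`down` are the neighbouring ranks (periodic in y).
 * All names are illustrative. */
void halo_exchange_and_evolve(double* field, int nx, int ny, int up, int down) {
    MPI_Request reqs[4];

    /* post receives into the halo rows and sends of the boundary rows */
    MPI_Irecv(&field[0], nx, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(&field[(ny + 1) * nx], nx, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[1]);
    MPI_Isend(&field[1 * nx], nx, MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(&field[ny * nx], nx, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &reqs[3]);

    /* interior rows depend only on local data: update them while the
     * messages are in flight */
    /* evolve_interior(field, nx, ny); */

    /* complete the exchange, then update the rows that need halo data */
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
    /* evolve_boundary_rows(field, nx, ny); */
}
```

This is why the hint suggests splitting the evolve function: the interior
update supplies the communication-independent work that hides the message
latency.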
## Persistent Heat Equation
Further modify the heat equation code to use persistent communication.
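A sketch of the same exchange with persistent requests, set up once before the
time loop and restarted each step, might look like the following; again, every
name here is an assumption.

```c
#include <mpi.h>

/* Sketch only: persistent halo exchange, same layout assumptions as above. */
void evolve_persistent(double* field, int nx, int ny, int up, int down, int nsteps) {
    MPI_Request reqs[4];

    /* bind the halo buffers to persistent requests, once */
    MPI_Recv_init(&field[0], nx, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Recv_init(&field[(ny + 1) * nx], nx, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[1]);
    MPI_Send_init(&field[1 * nx], nx, MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &reqs[2]);
    MPI_Send_init(&field[ny * nx], nx, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &reqs[3]);

    for (int step = 0; step < nsteps; step++) {
        /* restart all four transfers for this step */
        MPI_Startall(4, reqs);
        /* evolve_interior(field, nx, ny); */
        MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
        /* evolve_boundary_rows(field, nx, ny); */
    }

    /* persistent requests must be freed explicitly */
    for (int i = 0; i < 4; i++)
        MPI_Request_free(&reqs[i]);
}
```

Note that persistent requests bind to fixed buffer addresses, so if the
provided code swaps field buffers between steps, a set of requests is needed
for each buffer.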
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    int i, n;
    int myRank, uniSize;
    double time;

    // Must call MPI_Init before any other MPI calls
    MPI_Init(&argc, &argv);

    if (argc != 2) {
        printf("Usage: ./pingPong n\n n is the number of times to run the "
               "simulation\n");
        MPI_Abort(MPI_COMM_WORLD, 10);
    }

    // We take a command line argument to decide the number of loops to do
    n = atoi(argv[1]);

    // Get the size of the communicator
    MPI_Comm_size(MPI_COMM_WORLD, &uniSize);
    // Get our rank in this communicator
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    if (uniSize == 2) {
        // Initialise the message to 10
        int msg = 10;

        // Start the timer, we will take the average for a number of loops
        time = MPI_Wtime();
        // time = omp_get_wtime();

        /* declare requests */
        MPI_Request req[2];
        for (i = 0; i < n; i++) {
            if (myRank == 0) {
                // P 0 increments the message by one and passes it to P 1
                // (ping)
                msg++;
                MPI_Isend(&msg, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req[0]);
                // The send must complete before msg can be reused as the
                // receive buffer
                MPI_Wait(&req[0], MPI_STATUS_IGNORE);
                // P 0 also has to receive the msg back from P 1 (pong)
                MPI_Irecv(&msg, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req[1]);
                MPI_Wait(&req[1], MPI_STATUS_IGNORE);
            }
            if (myRank == 1) {
                // P 1 receives the message, increments it by one and passes
                // it back to P 0
                MPI_Irecv(&msg, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req[1]);
                // The receive must complete before msg can be read or
                // modified
                MPI_Wait(&req[1], MPI_STATUS_IGNORE);
                msg++;
                MPI_Isend(&msg, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &req[0]);
                MPI_Wait(&req[0], MPI_STATUS_IGNORE);
            }
        }
        time = MPI_Wtime() - time;
        // time = omp_get_wtime()-time;

        // Print the value of msg and the average time
        if (myRank == 0) {
            printf("value of msg after %d exchanges is %d\n", n, msg);
            printf("On average it took %lf seconds\n", time / n);
            // printf("Resolution of MPI_Wtime = %lf seconds\n", MPI_Wtick());
        }
    } else if (myRank == 0) {
        printf("Ping pong can be played with exactly 2 processes.\n");
    }

    // Remember to always call MPI_Finalize()
    MPI_Finalize();
    return 0;
}
# Collective Communication Exercises
## Ordered Hello World
The [hello world](./hello_world/main.c) code provided prints a simple hello
world message on each process containing its rank. Modify the code so that the
messages are printed in order of increasing rank.
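One common approach, sketched below, is to let the ranks take turns, separated
by a barrier. This is a sketch rather than the expected solution, and ordering
standard output this way is not strictly guaranteed by MPI, though it usually
works in practice.

```c
#include <mpi.h>
#include <stdio.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* each rank prints on its turn; the barrier orders the turns */
    for (int turn = 0; turn < size; turn++) {
        if (turn == rank)
            printf("Hello world from processor %d\n", rank);
        MPI_Barrier(MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}
```

A fully robust alternative gathers the messages to rank 0 and prints them
there.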
## Find the Deadlock
The [find the deadlock](./find_the_deadlock/main.c) code provided contains a
deadlock. The expected behaviour of the program is:
- processor 0 reads 25 integers from a [file](./find_the_deadlock/values.dat),
- processor 0 broadcasts the integers to the other processors,
- each processor saves the data in separate files.
You should:
- find and fix the deadlock (a sketch of the intended broadcast pattern
  follows this list),
- correct some other errors in the program (the expected wall time of the
  corrected program is less than 30 seconds).
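For reference, the broadcast pattern the fixed program should end up with
looks roughly like the sketch below. `MPI_Bcast` is collective, so every rank
must call it; a classic way to deadlock is to guard the call with
`if (rank == 0)`. The array name here is an assumption.

```c
int values[25];
if (rank == 0) {
    /* ... rank 0 reads the 25 integers from the file ... */
}
/* ALL ranks call the collective; only the root's buffer contents
 * matter on entry */
MPI_Bcast(values, 25, MPI_INT, 0, MPI_COMM_WORLD);
/* ... each rank now writes its own copy to a separate file ... */
```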
## Complex Reduction
The goal of this exercise is to perform a summation of a set of complex numbers
distributed across several processes. The [skeleton
code](./complex_reduction/main.c) provided:
- defines the complex number data type,
- initialises an array of complex numbers on each process,
- implements a function for summing complex numbers,
- registers the complex number data type with MPI, storing its handle as
`MPI_COMPLEX`.
Calculate the sum of the complex numbers using `MPI_Reduce`.
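A self-contained sketch of the user-defined-operation approach is given below;
the type, function, and handle names are invented here for illustration (the
skeleton stores its handle as `MPI_COMPLEX`), so adapt them to the skeleton's
own names.

```c
#include <mpi.h>
#include <stdio.h>

/* illustrative complex number type; the skeleton defines its own */
typedef struct {
    double re, im;
} complex_t;

/* user-defined reduction: element-wise complex sum */
void complex_sum(void *in, void *inout, int *len, MPI_Datatype *dtype) {
    complex_t *a = (complex_t *)in;
    complex_t *b = (complex_t *)inout;
    for (int i = 0; i < *len; i++) {
        b[i].re += a[i].re;
        b[i].im += a[i].im;
    }
}

int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* register the struct as an MPI datatype (two contiguous doubles) */
    MPI_Datatype mpi_complex;
    MPI_Type_contiguous(2, MPI_DOUBLE, &mpi_complex);
    MPI_Type_commit(&mpi_complex);

    /* register the summation function as a commutative MPI operation */
    MPI_Op op_complex_sum;
    MPI_Op_create(complex_sum, 1, &op_complex_sum);

    complex_t local = {1.0 * rank, -1.0 * rank}, total;
    MPI_Reduce(&local, &total, 1, mpi_complex, op_complex_sum, 0,
               MPI_COMM_WORLD);
    if (rank == 0)
        printf("sum = %f + %fi\n", total.re, total.im);

    MPI_Op_free(&op_complex_sum);
    MPI_Type_free(&mpi_complex);
    MPI_Finalize();
    return 0;
}
```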