More OpenMP examples if useful.
🎥 Lecture Video
Example 1: Arrays
OpenMP C program: for.c
#include <stdio.h>
#include <omp.h>

/* Example 1: a parallel-for over an array. Each iteration writes a
 * distinct element a[i], so there is no data race on the array. */
int main() {
    omp_set_num_threads(4);
    int a[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    const int N = sizeof(a)/sizeof(int);

    /* OpenMP divides the N iterations among the 4 threads. */
    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        printf("thread %d, i = %2d\n", omp_get_thread_num(), i);
        a[i] = a[i] + 10 * omp_get_thread_num();
    }

    /* Sequential print of the result: tens digit shows which thread
     * handled each element. */
    for (int i = 0; i < N; i++) {
        printf("%02d ", a[i]);
    }
    printf("\n");
    return 0;
}
Output:
$ gcc -fopenmp -o for for.c
$ ./for
thread 0, i = 0
thread 1, i = 3
thread 2, i = 6
thread 3, i = 8
thread 0, i = 1
thread 1, i = 4
thread 2, i = 7
thread 3, i = 9
thread 0, i = 2
thread 1, i = 5
00 01 02 13 14 15 26 27 38 39

Example 2: Computing π
We can compute $\pi$ with numerical integration[1]:

$$\pi = \int_0^1 \frac{4}{1+x^2}\,dx$$
We can use Riemann’s sum to approximate the integral as a sum of rectangles:

$$\pi \approx \sum_{i=0}^{N-1} \frac{4}{1+x_i^2}\,\Delta x$$

where the $i$-th rectangle has width $\Delta x = 1/N$ and height $\frac{4}{1+x_i^2}$ at the middle $x_i = (i + 0.5)\,\Delta x$ of interval $[i\,\Delta x,\,(i+1)\,\Delta x]$.
Figure 1: Slides version of the code below, with some additional diagrams.
#include <stdio.h>

/* Serial midpoint-rule (Riemann sum) approximation of
 * pi = integral of 4/(1+x^2) over [0, 1]. */
int main(void) {
const long num_steps = 10;
double step = 1.0/((double) num_steps); /* rectangle width (missing in the original listing) */
double sum = 0.0;
for (int i = 0; i < num_steps; i++) {
double x = (i + 0.5) * step;   /* midpoint of the i-th interval */
sum += 4.0 * step/(1.0 + x*x); /* area of the i-th rectangle */
}
printf("pi = %6.12f\n", sum);
return 0;
}
Output:
pi = 3.142425985001

Resembles $\pi$, but not very accurate. Let’s increase num_steps and parallelize.
#include <stdio.h>
#include <omp.h>

/* First (broken) attempt to parallelize the pi computation. */
int main(void) {
const long num_steps = 10;
double step = 1.0/((double) num_steps); /* rectangle width (missing in the original listing) */
double sum = 0.0;
/* NOTE: "#pragma parallel for" is missing the "omp" keyword, so the
 * compiler ignores the pragma and the loop runs sequentially. Even
 * spelled "#pragma omp parallel for", the unsynchronized updates to
 * the shared variable sum would be a data race. */
#pragma parallel for
for (int i = 0; i < num_steps; i++) {
double x = (i + 0.5) * step;
sum += 4.0 * step/(1.0 + x*x);
}
printf("pi = %6.12f\n", sum);
return 0;
}
Problem: Each thread needs access to the shared variable sum. Code runs sequentially!
Compute a sum array, chunked into components of the Riemann’s Sum. Then, add up the elements of the sum array.
#include <stdio.h>
#include <omp.h>
/* Parallel pi: each thread accumulates its share of the Riemann sum into
 * its own slot of sum[]; the main thread then adds up the partial sums. */
int main(void) {
const int NUM_THREADS = 4;
omp_set_num_threads(NUM_THREADS);
/* One partial-sum slot per thread, so no two threads write the same
 * variable. (NOTE: adjacent slots likely share a cache line, so this
 * layout can suffer from false sharing.) */
double sum[NUM_THREADS];
for (int tid = 0; tid < NUM_THREADS; tid++) {
sum[tid] = 0;
}
const long num_steps = 10;
double step = 1.0/((double) num_steps); /* rectangle width */
#pragma omp parallel
{
int tid = omp_get_thread_num();
/* Cyclic distribution: thread tid handles i = tid, tid+T, tid+2T, ... */
for (int i = tid; i < num_steps; i+= NUM_THREADS) {
double x = (i + 0.5) * step; /* midpoint of interval i */
sum[tid] += 4.0 * step/(1.0 + x*x);
printf("i = %3d, tid = %3d\n", i, tid);
}
}
/* Sequential reduction of the per-thread partial sums. */
double pi = 0;
for (int tid = 0; tid < NUM_THREADS; tid++) {
pi += sum[tid];
}
printf("pi = %6.12f\n", pi);
return 0;
}Output:
$ ./pi
i = 1, tid = 1
i = 0, tid = 0
i = 2, tid = 2
i = 3, tid = 3
i = 5, tid = 1
i = 4, tid = 0
i = 6, tid = 2
i = 7, tid = 3
i = 9, tid = 1
i = 8, tid = 0
pi = 3.142425985001

Scale up: num_steps = $10^6$
#include <stdio.h>
#include <omp.h>
/* Same per-thread partial-sum scheme as before, scaled to one million
 * steps; the per-iteration printf is disabled so only the result prints. */
int main(void) {
const int NUM_THREADS = 4;
omp_set_num_threads(NUM_THREADS);
/* One partial-sum slot per thread (see earlier listing). */
double sum[NUM_THREADS];
for (int tid = 0; tid < NUM_THREADS; tid++) {
sum[tid] = 0;
}
const long num_steps = 1000000;
double step = 1.0/((double) num_steps); /* rectangle width */
#pragma omp parallel
{
int tid = omp_get_thread_num();
/* Cyclic distribution of the 10^6 iterations across the threads. */
for (int i = tid; i < num_steps; i+= NUM_THREADS) {
double x = (i + 0.5) * step; /* midpoint of interval i */
sum[tid] += 4.0 * step/(1.0 + x*x);
// printf("i = %3d, tid = %3d\n", i, tid);
}
}
/* Sequential reduction of the per-thread partial sums. */
double pi = 0;
for (int tid = 0; tid < NUM_THREADS; tid++) {
pi += sum[tid];
}
printf("pi = %6.12f\n", pi);
return 0;
}Output:
$ ./pi
pi = 3.141592653590

Verify how many digits are correct!
This example is adapted from the OpenMP “Hands On Tutorial” from SC08.[2] View the original slides PDF.
Example 3: More on the for Directive
This section expands on the OpenMP directive for described in an earlier section. Consider the below program, which uses a loop to assign elements of a giant heap-allocated array:
#define LENGTH (1 << 27)

/* Baseline: a single thread walks the entire array.  The "..." is a
 * placeholder for whatever per-element work is being done. */
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
for (int i = 0; i < LENGTH; i++) {
arr[i] = ...;
}
}
Toggle between the cards below to compare different parallelizations of this program. Assume that OMP_NUM_THREADS on this machine is 12.
#define LENGTH (1 << 27)

/* Code 1: "#pragma omp parallel" alone does NOT divide the loop — every
 * thread executes the entire loop body, duplicating the work. */
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
#pragma omp parallel
{
for(int i = 0; i < LENGTH; i++) {
arr[i] = ...;
}
}
}
#define LENGTH (1 << 27)
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
#pragma omp parallel
{
int tid = omp_get_thread_num();
int num_threads = omp_get_num_threads();
for(int i = tid; i < LENGTH; i+= num_threads) {
arr[i] = j;
}
}
}#define LENGTH (1 << 27)
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
#pragma omp parallel
{
int tid = omp_get_thread_num();
int num_threads = omp_get_num_threads();
int thread_start = tid * LENGTH / num_threads;
int thread_end = (tid+1)*LENGTH / num_threads;
for(int i = thread_start; i < thread_end; i++) {
arr[i] = j;
}
}
}#define LENGTH (1 << 27)
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
#pragma omp parallel
{
#pragma omp for
for(int i = 0; i < LENGTH; i++) {
arr[i] = j;
}
}
}#define LENGTH (1 << 27)
int main(void) {
char *arr = malloc(sizeof(char) * LENGTH);
#pragma omp parallel for
for(int i = 0; i < LENGTH; i++) {
arr[i] = j;
}
}Duplicates work. The for-loop is repeated 12 times, so each array element is assigned 12 times.
“Interweaves” array access between threads. If there are 12 OpenMP threads, then thread 0 accesses elements 0, 12, 24, etc.; thread 1 accesses elements 1, 13, 25, etc. Potentially wasteful memory access pattern.
“Chunks” array sections by thread. If there are 12 OpenMP threads, then thread 0 accesses elements 0 through LENGTH/12 - 1; thread 1 accesses elements LENGTH/12 through 2*LENGTH/12 - 1, etc. Same as Code 4 and Code 5.
Same as Code 3 and Code 5. Here, the #pragma omp for directive specifies loop work-sharing, which is then planned via OpenMP.
Same as Code 3 and Code 4. Like Code 4, the loop work-sharing is planned via OpenMP, though the code is more readable because of the combined directive #pragma omp parallel for.
Tim Mattson and Larry Meadows, SC08 OpenMP “Hands On Tutorial.” 2008. Access on OpenMP website