← Back to list

Stanford CS 149

Introduction

Overall

Machine code

Instruction level parallelism (ILP)

Hardware background info

Power wall

Memory

Efficient processing almost always comes down to accessing data efficiently

Summary

A Modern Multi-Core Processor

Parallelism

// Example: parallelism using C++ threads
// Argument bundle handed to the worker thread: describes the slice of the
// input/output arrays (and series length) one thread should process.
struct my_args {
  int N;      // number of array elements to process
  int terms;  // number of Taylor-series terms to evaluate
  float* x;   // input values
  float* y;   // output values (sin of each input)
};

// Thread entry point: unpacks the my_args struct and runs the serial sinx
// kernel over the slice it describes.
void my_thread_func(my_args* args) {
  sinx(args->N, args->terms, args->x, args->y);
}
void parallel_sinx(int N, int terms, float* x, float* y) {
  std::thread my_thread;
  my_args args;

  args.N = N/2;
  args.terms = terms;
  args.x = x;
  args.y = y;

  // launch thread
  my_thread = std::thread(my_thread_func, &args);
  // on main thread
  sinx(N - args.N, terms, x + args.N, y + args.N);
  // wait for thread to complete
  my_thread.join();
}
// Data-parallel expression
// Taylor's expansion of sin(x) function for each element of an array of N floating-point numbers
// Data-parallel expression (lecture pseudocode):
// Taylor expansion of sin(x) for each element of an array of N floats:
//   sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
void sinx(int N, int terms, float* x, float* y) {
  // 'forall' substitutes for: for (int i = 0; i < N; ++i) { ... }
  // It declares that loop iterations are independent, so a compiler
  // might automatically generate threaded/vectorized code for you.
  forall (int i from 0 to N) {
    float value = x[i];
    float numer = x[i] * x[i] * x[i];  // FIX: was 'numer - ...' (typo); must be '='
    int denom = 6;                     // 3!
    int sign = -1;                     // alternating series sign

    for (int j = 1; j <= terms; j ++) {
      value += sign * numer / denom;
      numer *= x[i] * x[i];                // x^(2j+1) -> x^(2j+3)
      denom *= (2 * j + 2) * (2 * j + 3);  // (2j+1)!  -> (2j+3)!
      sign *= -1;
    }
    y[i] = value;
  }
}
// Vector program (using AVX intrinsics)
# include <immintrin.h>
// Vector program (using AVX intrinsics): process 8 floats per iteration.
// Assumes N % 8 == 0 and that x and y are 32-byte aligned, as required by
// _mm256_load_ps / _mm256_store_ps.
void sinx(int N, int terms, float* x, float* y) {
  float three_fact = 6;  // 3!
  for (int i = 0; i < N; i += 8) {
    __m256 origx = _mm256_load_ps(&x[i]);  // FIX: type is __m256, not _m256
    __m256 value = origx;
    __m256 numer = _mm256_mul_ps(origx, _mm256_mul_ps(origx, origx));  // x^3
    __m256 denom = _mm256_broadcast_ss(&three_fact);
    int sign = -1;

    for (int j = 1; j <= terms; ++j) {
      // value += sign * numer / denom
      // FIX: the intrinsic is _mm256_set1_ps (was '_mm256_set1ps'), and it
      // takes a float, so convert sign explicitly.
      __m256 tmp =
          _mm256_div_ps(_mm256_mul_ps(_mm256_set1_ps((float)sign), numer), denom);
      value = _mm256_add_ps(value, tmp);

      numer = _mm256_mul_ps(numer, _mm256_mul_ps(origx, origx));
      // FIX: _mm256_broadcast_ss takes a float const*, not an int expression;
      // compute the factorial growth factor as a float and splat it.
      float factor = (float)((2 * j + 2) * (2 * j + 3));
      denom = _mm256_mul_ps(denom, _mm256_set1_ps(factor));
      sign *= -1;
    }
    _mm256_store_ps(&y[i], value);
  }
}

Accessing Memory

Parallel Programming Abstractions

// ISPC code
// Interleaved assignment
// ISPC code: interleaved assignment of array elements to program instances.
// A "gang" of programCount (e.g. 8) program instances runs this function in
// SPMD fashion; programIndex distinguishes the instances within the gang.
export void ispc_sinx (
  uniform int N,            // number of elements (e.g. N = 1024)
  uniform int terms,        // number of Taylor-series terms
  uniform float* x,         // FIX: was declared 'X' but the body reads 'x'
  uniform float* result) {
  // Assumes N % programCount == 0.
  // Interleaved assignment: in each outer iteration the gang covers
  // elements i .. i + programCount - 1, one per instance.
  for (uniform int i = 0; i < N; i += programCount) {
    int idx = i + programIndex;             // varying: per-instance index
    float value = x[idx];                   // varying
    float numer = x[idx] * x[idx] * x[idx]; // varying: x^3
    uniform int denom = 6;                  // 3!
    uniform int sign = -1;
    for (uniform int j = 1; j <= terms; ++j) {
      value += sign * numer / denom;
      numer *= x[idx] * x[idx];
      denom *= (2 * j + 2) * (2 * j + 3);
      sign *= -1;
    }
    result[idx] = value;
  }
}
// Blocked assignment of array elements to program instances
// Assign multiple elements to each instance
// ISPC code: blocked assignment of array elements to program instances.
// Each instance processes a contiguous chunk of count = N / programCount
// elements (contrast with the interleaved version above).
export void ispc_sinx_v2 (
  uniform int N,
  uniform int terms,
  uniform float* x,
  uniform float* result) {
  // Assumes N % programCount == 0.
  uniform int count = N / programCount;
  int start = programIndex * count;  // varying: base of this instance's chunk
  for (uniform int i = 0; i < count; ++i) {
    int idx = start + i;
    float value = x[idx];
    float numer = x[idx] * x[idx] * x[idx];  // x^3
    uniform int denom = 6;                   // 3!
    uniform int sign = -1;
    // FIX: was 'j < terms' — computed one term fewer than every other version.
    for (uniform int j = 1; j <= terms; ++j) {
      value += sign * numer / denom;
      numer *= x[idx] * x[idx];
      // FIX: was (j + 3) * (j + 4), which equals (2j+2)(2j+3) only at j == 1;
      // the factorial denominator must grow by (2j+2)(2j+3) each term.
      denom *= (2 * j + 2) * (2 * j + 3);
      sign *= -1;
    }
    result[idx] = value;
  }
}

Parallel Programming Basics

Case study on writing and optimizing a parallel program: data-parallel; shared address space

Creating a parallel program

Problem decomposition

An Example

Decomposition

Assignment

Orchestration

Example:

// Sequential grid solver (lecture pseudocode).
// Repeatedly sweeps the interior of an (n+2)x(n+2) grid, replacing each cell
// with the average of itself and its 4 neighbors, until the average per-cell
// change falls below TOLERANCE.
// NOTE(review): 'A[i, j]' / 'A[i,j]' is slide notation for 2D indexing — in
// real C++ the comma operator would reduce this to A[j]; treat as pseudocode.
const int n;
float* A; // allocate space
void solve(float* A) {
  float diff, prev;
  bool done = false;
  while (!done) {
    diff = 0.f; // accumulated absolute change for this sweep
    for (int i = 1; i < n; ++i) {  // only iterate the non border pixels
      for (int j = 1; j < n; ++j) {
        prev = A[i, j];
        A[i,j] = 0.2f * (A[i,j] + A[i, j - 1] + A[i - 1, j] + A[i, j + 1] + A[i + 1, j]);
        diff += fabs(A[i,j] - prev); // compute amount of change
      }
    }
    if (diff/(n*n) < TOLERANCE) { // converged: average change is small enough
      done = true;
    }
  }
}
// Data-parallel expression of the solver using red-black ordering (lecture
// pseudocode: for_all, red_cells, and reduceAdd are assumed language
// builtins, not real C++). Red cells can all be updated in parallel because
// no two red cells are neighbors.
const int n;
float* A = allocate(n + 2, n + 2);
void solve(float* A) {
  bool done = false;
  float diff = 0.f;
  while (!done) {
    // Decomposition: each red-cell update is an independent unit of work
    for_all (red_cells(i,j)) {
      float prev = A[i,j];
      A[i,j] = 0.2f * (A[i,j] + A[i, j - 1] + A[i - 1, j] + A[i, j + 1] + A[i + 1, j]);
      // Orchestration: builtin communication primitive reduceAdd sums the
      // per-cell changes into diff without explicit locking
      reduceAdd(diff, abs(A[i,j] - prev));
    } // Orchestration: for_all block implicitly waits for all workers before returning to sequential control

    if (diff / (n * n) < TOLERANCE) {
      done = true;
    }
  }
}
 * Shared address space (with SPMD threads) expression of solver:
   * Locks: only one thread in the critical region at a time
   * Barriers: wait for threads to reach this point
// Global variables (shared address space: every spawned thread sees these)
int n; // grid size n x n
bool done = false;
float diff = 0.0;
LOCK myLock;       // pseudocode lock primitive: mutual exclusion
BARRIER myBarrier; // pseudocode barrier primitive: all threads rendezvous here

float* A = allocate(n + 2, n + 2);
void solve(float* A) { // executed by all threads; SPMD-style
  float myDiff; // NOTE(review): shadowed by the declaration inside the while loop; this outer one is never used
  int threadId = getThreadId(); // threadID is different for each SPMD instance
  int myMin = 1 + (threadId * n / NUM_PROCESSORS);
  int myMax = myMin + (n / NUM_PROCESSORS); // each thread computes a contiguous band of rows

  while (!done) {
    float myDiff = 0.f; // per-thread partial sum of absolute change
    diff = 0.f;         // all threads write the same value, so the race is benign here
    barrier(myBarrier, NUM_PROCESSORS); // ensure diff is reset before any thread accumulates into it
    for (j = myMin to myMax) {          // pseudocode: iterate this thread's rows
      for (i = red_cells in this row) { // pseudocode: red-black ordering within the row
        float prev = A[i,j];
        A[i,j] = 0.2f * (A[i,j] + A[i, j - 1] + A[i - 1, j] + A[i, j + 1] + A[i + 1, j]);
        myDiff += abs(A[i,j] - prev);
      }
    }
    lock(myLock);   // critical section: serialize updates to the shared diff
    diff += myDiff;
    unlock(myLock);
    barrier(myBarrier, NUM_PROCESSORS); // all partial sums must be in diff before the convergence test
    if (diff / (n * n) < TOLERANCE) {
      done = true;
    }
    barrier(myBarrier, NUM_PROCESSORS); // prevent a fast thread from resetting diff before others have read it
  }
}