July 2007
Intermediate to advanced
332 pages
10h 4m
English
Example 11-6 shows a SerialMatrixMultiply that makes no use of Threading Building Blocks or any other parallelization, whereas Example 11-7 shows the corresponding ParallelMatrixMultiply that uses a blocked_range2d to specify a two-dimensional iteration space. The functions operate the same as far as the rest of the program is concerned. Obviously, we expect ParallelMatrixMultiply to run faster when on a machine with more than one processor core.
Example 11-6. Matrix multiply serial code
const size_t L = 150;
const size_t M = 225;
const size_t N = 300;
void SerialMatrixMultiply( float c[M][N], float a[M][L], float b[L][N] ) {
for( size_t i=0; i<M; ++i ) {
for( size_t j=0; j<N; ++j ) {
float sum = 0;
for( size_t k=0; k<L; ++k )
sum += a[i][k]*b[k][j];
c[i][j] = sum;
}
}
}Example 11-7. Equivalent matrix multiply with blocked_range2d
#include "tbb/parallel_for.h" #include "tbb/blocked_range2d.h" using namespace tbb; const size_t L = 150; const size_t M = 225; const size_t N = 300; class MatrixMultiplyBody2D { float (*my_a)[L]; float (*my_b)[N]; float (*my_c)[N]; public: void operator()( const blocked_range2d<size_t>& r ) const { float (*a)[L] = my_a; // a,b,c used in example to emphasize float (*b)[N] = my_b; // commonality with serial code float (*c)[N] = my_c; for( size_t i=r.rows().begin(); i!=r.rows().end(); ++i ){ for( size_t j=r.cols().begin(); j!=r.cols().end(); ++j ) { float sum = 0; for( size_t k=0; k<L; ++k ) sum += a[i][k]*b[k][j]; c[i][j] = sum; } } } MatrixMultiplyBody2D( ...