Processing Device Arrays with C++ Metaprogramming

Transcription

Processing Device Arrays with C++ Metaprogramming
Jonathan Cohen (NVIDIA Research)
GTC, San Jose Convention Center, CA | Sept. 20–23, 2010
Motivating Example
struct DArray1D {
int _size;
float *_ptr; // device pointer
DArray1D(int size) :
_size(size), _ptr(0) {…}
~DeviceArray1D() {…}
};
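
The constructor and destructor bodies are elided on the slide; a minimal sketch, assuming the elided {…} bodies are plain cudaMalloc/cudaFree calls, might look like this:

// Minimal sketch; assumes the elided bodies just allocate and free device memory.
#include <cuda_runtime.h>

struct DArray1D {
  int _size;
  float *_ptr;                                        // device pointer
  explicit DArray1D(int size) : _size(size), _ptr(0) {
    cudaMalloc((void **)&_ptr, size * sizeof(float)); // allocate device storage
  }
  ~DArray1D() {
    cudaFree(_ptr);                                   // release device storage
  }
};
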
__global__ void addition_kernel(
int n, float *result, float *a, float *b)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
result[i] = a[i] + b[i];
}
void addition(
DArray1D &result, DArray1D &a, DArray1D &b)
{
int n = result._size;
addition_kernel<<<(n+255)/256, 256>>>(
n, result._ptr, a._ptr, b._ptr);
}
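
A hedged usage sketch (the variable names below are assumptions, not from the talk): each such call launches exactly one kernel over the whole array.

// Hedged usage sketch; names are assumptions, not from the talk.
void example_usage()
{
  int n = 1 << 20;
  DArray1D x(n), y(n), z(n);   // allocate three device arrays
  // … initialize x and y on the device …
  addition(z, x, y);           // one kernel launch: z[i] = x[i] + y[i]
}
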
API Design – Take 1: Small building blocks
- shift(DArray1D &output, DArray1D &input, int amount);
- scale(DArray1D &output, DArray1D &input, float scale);
- add(DArray1D &output, DArray1D &input1, DArray1D &input2);
- Combine these to build complex expressions (a sketch of one building block follows the example below):
shift(a_min1, a, -1);
shift(a_plu1, a, 1);
scale(a_cent, a, -2.0);
add(result, a_min1, a_plu1);
add(result, result, a_cent); // result = Laplacian(a)
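
To make the cost concrete, here is a minimal sketch (the talk does not show this code) of the shift building block: each such routine is its own kernel launch, so every intermediate result makes a round trip through device memory.

// Hedged sketch of one building block; not shown in the talk.
__global__ void shift_kernel(int n, float *output, const float *input, int amount)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    output[i] = input[i + amount];   // caller must guarantee the access is in-bounds
}

void shift(DArray1D &output, DArray1D &input, int amount)
{
  int n = output._size;
  shift_kernel<<<(n+255)/256, 256>>>(n, output._ptr, input._ptr, amount);
}
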
Problem: intermediate storage + bandwidth
[Figure: shift, then add, using intermediate storage]
Shift kernel (SM): R1 = a[i+1]; b[i] = R1
Add kernel (SM):   R1 = a[i]; R2 = b[i]; R3 = R1 + R2; c[i] = R3
Memory: arrays a, b (temporary), c
Cost: 3 reads, 2 writes, 1 extra array
Better: store intermediate results in registers
[Figure: fused shift-then-add]
Fused kernel (SM): R1 = a[i]; R2 = b[i+1]; R3 = R1 + R2; c[i] = R3
Memory: arrays a, b, c
Cost: 2 reads, 1 write; intermediate stored in registers
API Design – Take 2: Fused kernels
- Let's turn this into an API
- Fuse all possible combinations together
- Benefit: efficient execution, minimal storage
__global__ void addition3_kernel(
  int n, float *result, float *a, float *b, float *c)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = a[i] + b[i] + c[i];
}
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c)
{
  int n = result._size;
  addition3_kernel<<<(n+255)/256, 256>>>(
    n, result._ptr, a._ptr, b._ptr, c._ptr);
}

__global__ void addition3_sc_kernel(
  int n, float *result, float *a, float *b, float *c,
  float as, float bs, float cs)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = as*a[i] + bs*b[i] + cs*c[i];
}
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c,
  float as, float bs, float cs)
{
  int n = result._size;
  addition3_sc_kernel<<<(n+255)/256, 256>>>(
    n, result._ptr, a._ptr, b._ptr, c._ptr, as, bs, cs);
}

__global__ void addition3_sc_sh_kernel(
  int n, float *result, float *a, float *b, float *c,
  float as, float bs, float cs, int ash, int bsh, int csh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = as*a[i+ash] + bs*b[i+bsh] + cs*c[i+csh];
}
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c,
  float as, float bs, float cs, int ash, int bsh, int csh)
{
  int n = result._size;
  addition3_sc_sh_kernel<<<(n+255)/256, 256>>>(
    n, result._ptr, a._ptr, b._ptr, c._ptr,
    as, bs, cs, ash, bsh, csh);
}

__global__ void addition3_sc_sh_mult_kernel(
  int n, float *result, float *a, float *b, float *c, float *d,
  float as, float bs, float cs, int ash, int bsh, int csh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = (as*a[i+ash] + bs*b[i+bsh] + cs*c[i+csh]) * d[i];
}
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c, DArray1D &d,
  float as, float bs, float cs, int ash, int bsh, int csh)
{
  int n = result._size;
  addition3_sc_sh_mult_kernel<<<(n+255)/256, 256>>>(
    n, result._ptr, a._ptr, b._ptr, c._ptr, d._ptr,
    as, bs, cs, ash, bsh, csh);
}

__global__ void addition2_sc_sh_mult_kernel(
  int n, float *result, float *a, float *b, float *d,
  float as, float bs, int ash, int bsh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = (as*a[i+ash] + bs*b[i+bsh]) * d[i];
}
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &d,
  float as, float bs, int ash, int bsh)
{
  int n = result._size;
  addition2_sc_sh_mult_kernel<<<(n+255)/256, 256>>>(
    n, result._ptr, a._ptr, b._ptr, d._ptr, as, bs, ash, bsh);
}
All routines do basically the same thing!
And that's just for addition!
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b);
void addition(
  DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c);
void addition_pt_wise_scale(
  DArray1D &result, DArray1D &a, DArray1D &b,
  DArray1D &scale);
void addition_pt_wise_scale(
  DArray1D &result, DArray1D &a, DArray1D &b,
  DArray1D &c, DArray1D &scale);
void addition_pt_wise_scale_shift(
  DArray1D &result, DArray1D &a, DArray1D &b,
  DArray1D &scale, int ash, int bsh);
void addition_pt_wise_scale_scale_shift(
  DArray1D &result, DArray1D &a, DArray1D &b,
  DArray1D &c, DArray1D &d,
  float as, float bs, float cs,
  int ash, int bsh, int csh);
void addition_scale_shift(
  DArray1D &result, DArray1D &a, DArray1D &b,
  DArray1D &c, float as, float bs, float cs,
  int ash, int bsh, int csh);
void addition_scale_shift(
  DArray1D &result, DArray1D &a, DArray1D &b,
  float as, float bs,
  int ash, int bsh);
API Design – Take 3: Emit fused kernels on-demand
- Expression Templates: a C++ technique for performing compile-time
  calculations based on expressions
- nvcc includes robust C++ template support
DArray1D a, result;
result =
  a[-1] + a[1] -
  constant(2.0) * a[0];
Client code
__global__ void auto_generated(
int n, float *result, const float *a)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
result[i] = (a[i-1] + a[i+1] - 2.0f * a[i]);
}
Generated Kernel
Expression Templates Approach
1. Build Abstract Syntax Tree (AST) using C++ types
2. Generate AST nodes automatically using templated functions
3. Provide framework code for calling emitted kernels
4. "Walk" AST to emit kernel
1. Build AST Using C++ Types
template<class PARM>
struct Op {
__device__ static float
exec(int i, const PARM &p) {
/* return some function of p and i */
}
};
template<class OP, class PARM>
struct OpWithParm {
OpWithParm(const PARM &p) : parm(p) { }
PARM parm;
__device__ float exec(int i) const {
return OP::exec(i, parm);
}
};
Abstract Operator
Provide implementation – exec() routine
Input: Index, abstract parameters
Abstract Closure
Bind operator with specific parameter state
5.0
OP = LeafOp<ConstantParm>
PARM = ConstantParm = {5.0}
a[i]
OP = LeafOp<ArrayLookupParm>
PARM = ArrayLookupParm = {a, 0}
template<class PARM>
struct LeafOp {
__device__ static float
exec(int i, const PARM &p) {
return p.value(i);
}
};
struct ConstantParm {
  float _value;
  __device__ float
  value(int i) const {
    return _value;
  }
};

struct ArrayLookupParm {
  const float *_ptr;
  int _shift;
  __device__ float
  value(int i) const {
    return _ptr[i + _shift];
  }
};
+
OP = PlusOp<LeafOp<…>, LeafOp<…>, ConstantParm, ArrayLookupParm>
PARM = ParmPair<ConstantParm, ArrayLookupParm> = {{5.0}, {a, 0}}
template<class LOP, class ROP, class LPARM, class RPARM>
struct PlusOp {
__device__ static float exec(
int i, const ParmPair<LPARM, RPARM> &p) {
return LOP::exec(i,p.left) + ROP::exec(i,p.right);
}
};
template<typename LPARM, typename RPARM>
struct ParmPair {
LPARM left;
RPARM right;
ParmPair(const LPARM &l, const RPARM &r) :
left(l), right(r) { }
};
2. Generate AST nodes from templated functions
- Expression Templates: use templated functions to compute output
  types as well as output values
- T. Veldhuizen, "Expression Templates," C++ Report, pp. 26-31, June 1995.
template<typename A>
computed_type<A> my_function(A) { return computed_type<A>(…); }
my_class a;
my_function(a); // <= return type is computed_type<my_class>
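
A self-contained host-side illustration of this idea may help; the names Wrapped and wrap below are hypothetical, not part of the talk's API:

// Hypothetical host-only example: the return *type* of a templated function
// is computed from its argument type at compile time.
#include <iostream>

template<typename A>
struct Wrapped {                    // plays the role of computed_type<A>
  A value;
  Wrapped(const A &v) : value(v) { }
};

template<typename A>
Wrapped<A> wrap(const A &a) {       // return type chosen by the compiler from A
  return Wrapped<A>(a);
}

int main() {
  Wrapped<float> w = wrap(3.0f);    // wrap(float) => Wrapped<float>
  std::cout << w.value << std::endl;
  return 0;
}
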
5.0
OP = LeafOp<ConstantParm>
PARM = ConstantParm = {5.0}
OpWithParm<LeafOp<ConstantParm>, ConstantParm>
constant(float value) {
return OpWithParm<LeafOp<ConstantParm>, ConstantParm >(
ConstantParm (value));
}
node = constant(5.0)
a[i]
OP = LeafOp<ArrayLookupParm>
PARM = ArrayLookupParm = {a, 0}
OpWithParm<LeafOp<ArrayLookupParm>, ArrayLookupParm>
DArray1D::operator[](int shift) {
return OpWithParm<LeafOp<ArrayLookupParm>,ArrayLookupParm>(
ArrayLookupParm(_ptr, shift));
}
node = a[0];
+
OP = PlusOp<LeafOp<…>, LeafOp<…>, ConstantParm, ArrayLookupParm>
PARM = ParmPair<ConstantParm, ArrayLookupParm> = {{5}, {a, 0}}
template<class LOP, class LPARM, class ROP, class RPARM>
OpWithParm<PlusOp<LOP, ROP, LPARM, RPARM>,
           ParmPair<LPARM, RPARM> >
operator+(
  const OpWithParm<LOP, LPARM> &left,
  const OpWithParm<ROP, RPARM> &right)
{
  return OpWithParm<PlusOp<LOP, ROP, LPARM, RPARM>,
                    ParmPair<LPARM, RPARM> >(
    ParmPair<LPARM, RPARM>(left.parm, right.parm));
}
node = constant(5.0) + a[0];
3. Framework for calling kernels
- OP::exec(index, parm) routine provides per-index evaluation
- OpWithParm binds the parm instance:
  — OP::exec(… , …) => OP::exec(… , parm) // (aka "currying")
- Call OP::exec once per array index <= CUDA kernel
- Assign result to each entry in the result array <= overload
  DArray1D::operator=(OpWithParm)
template <typename OP, typename PARM>
__global__ void kernel_assign(
const OpWithParm<OP,PARM> functor, float *result, int size)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
result[i] = functor.exec(i);
}
}
template<typename OP, typename PARM>
DArray1D &
DArray1D::operator=(const OpWithParm<OP,PARM> &func)
{
kernel_assign<<<(_size+255)/256, 256>>>(func, _ptr, _size);
return *this;
}
4. Walk AST to emit function
- Invocation of the top-level OpWithParm::exec triggers recursive expansion
Example: constant(5.0) + a[0]
- OpWithParm::exec(i) => PlusOp::exec(i, parm) =>
  LeafOp<ConstantParm>::exec(i, parm.left) => parm.left.value(i) => returns 5.0
  LeafOp<ArrayLookupParm>::exec(i, parm.right) => parm.right.value(i) => returns a[i]
DArray1D A(100), B(100), C(100);
A = constant(0.5f) * (B[0] + C[0]);
[AST: * node with children 0.5 and a + node; the + node's children are B[i] and C[i]]
__global__ void generated_kernel_assign(
  generated_OpWithParm ftor,
  float *dst,
  int nx)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nx) {
    dst[i] = ftor.parm.left._value *
      (
        ftor.parm.right.left._ptr[i + ftor.parm.right.left._shift] +
        ftor.parm.right.right._ptr[i + ftor.parm.right.right._shift]
      );
  }
}
Generated kernel: properly fused
[Figure: Add + Scale fused into one kernel]
Fused kernel (SM): R1 = b[i]; R2 = c[i]; R3 = R1 + R2; R4 = 0.5 * R3; a[i] = R4
Memory: arrays a, b, c
Cost: 2 reads, 1 write; intermediate stored in registers
One AST => Many emitted functions
- So far, the code generator builds a per-index exec(int, parm) kernel
- Other useful things (a FLOP-counting sketch follows this list):
  — Validate that all array reads are in-bounds
  — Count the number of FLOPs
  — Count the number of bytes read
  — Etc.
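
As a hedged illustration of the FLOP-counting idea (none of this code appears in the talk, and the names LeafFlops/PlusFlops are hypothetical), the same recursion that drives exec() could drive a static counter:

// Hedged sketch: a second static walker, flops(), alongside exec().
template<class PARM>
struct LeafFlops {
  static int flops() { return 0; }          // a leaf reads a value, no arithmetic
};

template<class LOP, class ROP>
struct PlusFlops {
  static int flops() { return LOP::flops() + ROP::flops() + 1; }  // one add per node
};

// Example: constant(5.0) + a[0]  =>  PlusFlops<LeafFlops<…>, LeafFlops<…> >::flops() == 1
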
Example: bounds checking
- Array padding added to DArray1D:
DArray1D a(8, 1);
[Figure: array of 8 elements with one element of padding on each side; a._ptr points at element 0, indices -1 through 8 are allocated]
a[-1] .. a[8] are valid accesses
Shifted accesses
- Now we can implement the 1D Laplacian:
DArray1D a(n,1), result(n,1);
… // initialize a
result = a[-1] - constant(2.0) * a[0] + a[1];
- BUT, improper padding will generate an out-of-bounds access:
DArray1D a(n,0), result(n,1);
- We would like to catch these errors before kernel launch
Solution: Generate range checker from AST
template<class PARM>
struct Op {
...
static bool validate(
const Range &rng, const PARM &p) {
...
}
};
template<class OP, class PARM>
struct OpWithParm {
...
bool validate(const Range &rng) const {
return OP::validate(rng, parm);
}
};
OP: Validate index range based on any array accesses
OpWithParm: Validate index range for entire tree
template<typename OP, typename PARM>
DArray1D &
DArray1D::operator=(const OpWithParm<OP,PARM> &func)
{
if (!func.validate(this->range())) {
// run-time error
}
kernel_assign<<<(_size+255)/256, 256>>>(func, _ptr, _size);
return *this;
}
Output range validated before kernel launch
Illegal shifts generate a run-time error; bad memory accesses are impossible
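
The validate() bodies are elided above; a hedged sketch of the two flavors, reusing the ParmPair defined earlier (the Range members and bounds fields below are assumptions, not the talk's exact code):

// Hedged sketch of per-node range validation; field names are assumptions.
struct Range { int lo, hi; };                 // inclusive index range being written

// Leaf parameter that knows its legal bounds (by analogy with value(i) above):
struct BoundedLookupParm {
  int _shift;
  int _lo, _hi;                               // indices at which reads are in-bounds
  bool validate(const Range &rng) const {
    // every access at i + _shift, i in [rng.lo, rng.hi], must stay inside [_lo, _hi]
    return rng.lo + _shift >= _lo && rng.hi + _shift <= _hi;
  }
};

// Leaf operator: defer to the parameter, as exec() does with value(i).
template<class PARM>
struct LeafValidate {
  static bool validate(const Range &rng, const PARM &p) { return p.validate(rng); }
};

// Interior node (e.g. PlusOp): valid exactly when both subtrees are valid.
template<class LOP, class ROP, class LPARM, class RPARM>
struct PlusValidate {
  static bool validate(const Range &rng, const ParmPair<LPARM, RPARM> &p) {
    return LOP::validate(rng, p.left) && ROP::validate(rng, p.right);
  }
};
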
Results – 1D Heat Equation – Explicit Euler
Implementation | Lines of code       | Time (speedup), Laptop [1] | Time (speedup), HPC Workstation [2]
SERIAL (CPU)   | 28                  | 5,760.5 ms (1x)            | 3,068.3 ms (1x)
METAPROG       | 18                  | 543.3 ms (11x)             | 36.5 ms (84x)
HAND           | 20 host + 34 device | 523.2 ms (11x)             | 29.5 ms (104x)
HAND-OPT       | 20 host + 54 device | 217.8 ms (26x)             | 30.2 ms (101x)

[1] Laptop: NVIDIA Quadro FX 570M + 2-core Intel T7300 Centrino @ 2 GHz
[2] Workstation: NVIDIA Tesla C2050 w/ ECC + 4-core Intel Core i7 @ 3.07 GHz
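
For context, a hedged sketch (not shown in the talk) of what a METAPROG explicit-Euler update for the 1D heat equation u_t = alpha * u_xx might look like in this API; u, u_new, alpha, dt, and h are assumed names, and u is assumed to carry one element of padding on each side:

// Hedged sketch of one explicit-Euler time step; one statement emits one fused kernel.
void heat_step(DArray1D &u_new, DArray1D &u, float alpha, float dt, float h)
{
  float r = alpha * dt / (h * h);   // explicit-Euler stability requires r <= 0.5
  u_new = u[0] + constant(r) * (u[-1] - constant(2.0f) * u[0] + u[1]);
}
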
Getting Complicated…
- Write routines:
my_array(0,127) = constant(1.0);
- Array slices (a proxy sketch follows this list):
my_array(0,127,2) = my_array[-1] + my_array[+1];
- Array reshaping (like Fortran):
DArray1D from(64);
from = constant(-1.0);
my_array(0,127,2) = from(0,63);
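
A hedged sketch (the talk does not show this code) of the slice proxy implied by the my_array(start, end, stride) syntax; the name ArraySlice and its members are assumptions:

// Hypothetical slice proxy returned by operator()(start, end, stride).
struct ArraySlice {
  float *_ptr;                    // device pointer of the parent array
  int _start, _end, _stride;      // inclusive sub-range and stride
  int count() const { return (_end - _start) / _stride + 1; }
  // template<class OP, class PARM>
  // ArraySlice &operator=(const OpWithParm<OP, PARM> &func);
  //   would launch kernel_assign over count() strided indices
};
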
void restrict_residual(
DArray1D &U_f, const DArray1D &B_f, DArray1D &R_f,
DArray1D &B_c, FP h, int level)
{
int n = R_f._size;
update_bc(U_f, level);
R_f = B_f[0] - constant(1.0/(h*h)) * (U_f[1] + U_f[-1] - constant(2) * U_f[0]);
B_c = constant(.5) * (R_f.read(0, n-2, 2) + R_f.read(1, n-1, 2));
}
void prolong(
DArray1D &U_c, DArray1D &U_f, int level)
{
update_bc(U_c, level+1);
int n_c = U_c._size, n_f = U_f._size;
U_f(-1, n_f-1, 2) = U_f.read(-1, n_f-1, 2) + constant(.75) * U_c.read(-1, n_c-1) +
                                             constant(.25) * U_c.read(0, n_c);
U_f( 0, n_f  , 2) = U_f.read( 0, n_f  , 2) + constant(.25) * U_c.read(-1, n_c-1) +
                                             constant(.75) * U_c.read(0, n_c);
update_bc(U_f, level);
}
Results – 1D Poisson Equation – Multigrid
Implementation | Lines of code (excl. ws, //, /* */) | Time (speedup), Laptop [1], fp32 | Time (speedup), HPC Workstation [2]
SERIAL (CPU)   | 127                                 | 26,973 (5x)                      | 12,819 (26x)
METAPROG       | 119                                 | 5,291 (1x)                       | 501 (1x)

[1] Laptop: NVIDIA Quadro FX 570M + 2-core Intel T7300 Centrino @ 2 GHz
[2] Workstation: NVIDIA Tesla C2050 w/ ECC + 4-core Intel Core i7 @ 3.07 GHz
Future Work
- Multi-GPU backend – infer data movement from access patterns
- More backends: OpenMP, OpenCL, naïve single-threaded, etc.
- More parallel primitives + sophisticated fusion = Copperhead
- Use of shared memory – map work to parallel thread sets, rather than
  parallel threads
For more info
- Forthcoming chapter in GPU Computing Gems Volume 2
- Google Code:
  http://code.google.com/p/cuda-metaprog/
- [email protected]
- http://research.nvidia.com/users/jonathan-cohen