FC2ブログ

スポンサーサイト

上記の広告は1ヶ月以上更新のないブログに表示されています。
新しい記事を書く事で広告が消せます。

OpenCL の Work Group を可視化する

データパラレルコンピューティングでは,処理が複数の演算器で同時多発的に実行されます.同じ演算器で行われる処理では,互いにメモリを共有したり,同期をとったりできるため,処理の割り当てが重要です.OpenCL では Global Dimension と Work Group という概念によって,処理が演算器にどのように割り当てられるかを制御します.本記事では二次元の並列処理がどのように演算器に配置されるかを可視化します.

Keywords: OpenCL, Work Group


※本記事では以前書いた記事に掲載したプログラムを一部変更して利用します.初期化やメモリ確保についてはそちらを参照してください.

1.Kernel プログラム

Work Group に含まれる各 Work Item には Local Index が割り当てられます.Local Index が (0,0) となる位置のみ,入力画像から出力画像へ値をコピーする処理になっています.
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST;

__kernel void CLCopy(read_only image2d_t a, write_only image2d_t b)
{
int i, j;

i = get_local_id(0);
j = get_local_id(1);

if(i == 0 && j == 0) {
int2 pos;

pos.x = get_global_id(0);
pos.y = get_global_id(1);

if(pos.x >= 16 || pos.y >= 16) return;

float4 pix = read_imagef(a, sampler, pos);

write_imagef(b, pos, pix);
}
}


2.実行

Work Group のサイズを 1x1,2x2,4x4,8x8 と変えながら Kernel を実行し,結果を標準出力に表示します.以下の例では size_local 変数が Work Group のサイズになります.
    // Execute
size_t division = 1;
for(int i=0; i<4; i++) {
size_t size_global[2] = {16, 16};
size_t size_local[2] = {division, division};

// Clear Destination Buffer
err = WriteImage2D(command, imdst, 16, 16, 16, (void*)zeros);
CheckError(err);

// Execute Kernel Program
err = ExecuteKernel(command, kernel, size_global, size_local);
CheckError(err);

// Read Result Image
unsigned char result[256];
err = ReadImage2D(command, imdst, 16, 16, 16, (void*)result);
if(err == CL_SUCCESS) {
printf("[global (%d, %d), local (%d %d)]\n",
size_global[0], size_global[1],
size_local[0], size_local[1]);
Print(result);
printf("\n");
}

division *= 2;
}
出力は以下のようになります.画像がどのように Work Group に分割されているかがわかると思います.
OpenCL Work Group Visualization Sample

[global (16, 16), local (1 1)]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255

[global (16, 16), local (2 2)]
0 0 2 0 4 0 6 0 8 0 10 0 12 0 14 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
32 0 34 0 36 0 38 0 40 0 42 0 44 0 46 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
64 0 66 0 68 0 70 0 72 0 74 0 76 0 78 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
96 0 98 0 100 0 102 0 104 0 106 0 108 0 110 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
128 0 130 0 132 0 134 0 136 0 138 0 140 0 142 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
160 0 162 0 164 0 166 0 168 0 170 0 172 0 174 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
192 0 194 0 196 0 198 0 200 0 202 0 204 0 206 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
224 0 226 0 228 0 230 0 232 0 234 0 236 0 238 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

[global (16, 16), local (4 4)]
0 0 0 0 4 0 0 0 8 0 0 0 12 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
64 0 0 0 68 0 0 0 72 0 0 0 76 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
128 0 0 0 132 0 0 0 136 0 0 0 140 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
192 0 0 0 196 0 0 0 200 0 0 0 204 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

[global (16, 16), local (8 8)]
0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
128 0 0 0 0 0 0 0 136 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


付録

CLWorkGroup.cpp
#include <stdio.h>
#include <stdlib.h>

#include <cl/cl.h>
#include <cl/cl_ext.h>


static cl_platform_id *platform_ids = NULL;
static cl_device_id *device_ids = NULL;

static cl_context context = NULL;
static cl_command_queue command = NULL;
static cl_program program = NULL;
static cl_kernel kernel = NULL;

static cl_mem imsrc = NULL;
static cl_mem imdst = NULL;


static const unsigned char zeros[256] = {0};

static const unsigned char sample[] = {
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};

static const char source_code[] =
"const sampler_t sampler = "
" CLK_NORMALIZED_COORDS_FALSE |"
" CLK_ADDRESS_CLAMP_TO_EDGE |"
" CLK_FILTER_NEAREST;"
""
"__kernel void CLCopy(read_only image2d_t a, write_only image2d_t b)"
"{"
" int i, j;"
" "
" i = get_local_id(0);"
" j = get_local_id(1);"
" "
" if(i == 0 && j == 0) {"
" int2 pos;"
" "
" pos.x = get_global_id(0);"
" pos.y = get_global_id(1);"
" "
" if(pos.x >= 16 || pos.y >= 16) return;"
" "
" float4 pix = read_imagef(a, sampler, pos);"
" "
" write_imagef(b, pos, pix);"
" }"
"}";


template<typename T>
void SafeRelease(T *pointer, cl_int (CL_API_CALL *func)(T*))
{
if(pointer != NULL) func(pointer);
}

template<typename T>
void SafeFree(T **pointer)
{
free(*pointer);
*pointer = NULL;
}


static cl_uint GetNumPlatforms()
{
cl_uint numPlatforms = 0;
cl_int result = clGetPlatformIDs(0, NULL, &numPlatforms);
if(result != CL_SUCCESS) {
printf("Failed getting number of available platforms\n");
}
return numPlatforms;
}

static cl_platform_id* GetPlatformIDs(cl_uint numPlatforms)
{
size_t allocateSize = sizeof(cl_platform_id) * numPlatforms;

cl_platform_id *ids = (cl_platform_id*)malloc(allocateSize);
if(ids != NULL) {
cl_int result = clGetPlatformIDs(numPlatforms, ids, NULL);
if(result != CL_SUCCESS) {
printf("Failed getting platform IDs\n");
SafeFree(&ids);
}
}
return ids;
}

static cl_platform_id* GetPlatformIDs()
{
cl_platform_id *ids = NULL;
cl_uint numPlatforms = GetNumPlatforms();
if(numPlatforms > 0) {
ids = GetPlatformIDs(numPlatforms);
}
return ids;
}


static cl_uint GetNumDevices(cl_platform_id id)
{
cl_uint numDevices = 0;
cl_int result = clGetDeviceIDs(
id, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices
);
if(result != CL_SUCCESS) {
printf("Failed getting number of available devices\n");
}
return numDevices;
}

static cl_device_id* GetDeviceIDs(cl_platform_id id, cl_uint numDevices)
{
size_t allocateSize = numDevices * sizeof(cl_device_id);
cl_device_id* ids = (cl_device_id*)malloc(allocateSize);
if(ids != NULL) {
cl_int result = clGetDeviceIDs(
id, CL_DEVICE_TYPE_ALL, numDevices, ids, NULL
);
if(result != CL_SUCCESS) {
printf("Failed getting device IDs\n");
SafeFree(ids);
}
}
return ids;
}

static cl_device_id* GetDeviceIDs(cl_platform_id id)
{
cl_device_id *ids = NULL;
cl_uint numDevices = GetNumDevices(id);
if(numDevices > 0) {
ids = GetDeviceIDs(id, numDevices);
}
return ids;
}


static cl_int BuildProgram(
const char *program_code,
const size_t program_length,
cl_context context,
cl_program *program
)
{
cl_int err;
cl_program p = clCreateProgramWithSource(
context,
1,
&program_code,
&program_length,
&err
);
if(err != CL_SUCCESS) {
printf("Failed creating program object from source\n", err);
return err;
}

*program = p;

err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed building program object\n", err);
return err;
}

return CL_SUCCESS;
}


static cl_int CreateImage2D(
cl_context context,
size_t image_width,
size_t image_height,
cl_mem *image
)
{
cl_image_format image_format;
image_format.image_channel_order = CL_INTENSITY;
image_format.image_channel_data_type = CL_UNORM_INT8;

cl_int err;
cl_mem m = clCreateImage2D(
context,
CL_MEM_READ_WRITE,
&image_format,
image_width,
image_height,
0,
NULL,
&err
);
if(err != CL_SUCCESS) {
printf("Failed creating Image2D memory object\n");
return err;
}

*image = m;

return CL_SUCCESS;
}


static cl_int WriteImage2D(
cl_command_queue command,
cl_mem image,
size_t image_width,
size_t image_height,
size_t image_row_pitch,
const void *host_ptr
)
{
size_t origin[3] = {0};
size_t region[3] = {image_width, image_height, 1};
cl_int err = clEnqueueWriteImage(
command,
image,
CL_TRUE,
origin,
region,
image_row_pitch,
0,
host_ptr,
0,
NULL,
NULL
);
if(err != CL_SUCCESS) {
printf("Failed writing data to device\n");
return err;
}

return CL_SUCCESS;
}


static cl_int ReadImage2D(
cl_command_queue command,
cl_mem image,
size_t image_width,
size_t image_height,
size_t image_row_pitch,
void *host_ptr
)
{
size_t origin[3] = {0};
size_t region[3] = {image_width, image_height, 1};
cl_int err = clEnqueueReadImage(
command,
image,
CL_TRUE,
origin,
region,
image_row_pitch,
0,
host_ptr,
0,
NULL,
NULL
);
if(err != CL_SUCCESS) {
printf("Failed reading data from device\n");
return err;
}

return CL_SUCCESS;
}


static cl_int ExecuteKernel(
cl_command_queue command,
cl_kernel kernel,
const size_t global_size[2],
const size_t local_size[2]
)
{
cl_int err = clEnqueueNDRangeKernel(
command,
kernel,
2,
NULL,
global_size,
local_size,
0,
NULL,
NULL
);
if(err != CL_SUCCESS) {
printf("Failed executing kernel on device\n");
return err;
}

return CL_SUCCESS;
}


static void Shutdown()
{
SafeRelease(imsrc, clReleaseMemObject);
SafeRelease(imdst, clReleaseMemObject);
SafeRelease(kernel, clReleaseKernel);
SafeRelease(program, clReleaseProgram);
SafeRelease(command, clReleaseCommandQueue);
SafeRelease(context, clReleaseContext);
SafeFree(&device_ids);
SafeFree(&platform_ids);
exit(0);
}


static void CheckError(const cl_int error)
{
if(error != CL_SUCCESS) Shutdown();
}


static void Print(unsigned char image[256])
{
int i, j;
for(j=0; j<16; j++) {
for(i=0; i<16; i++) {
printf("%3d ", image[i + j * 16]);
}
printf("\n");
}
}


int main(int, char**) {
cl_int err;

printf("OpenCL Work Group Visualization Sample\n");
printf("\n");

platform_ids = GetPlatformIDs();
if(platform_ids == NULL) {
Shutdown();
}

device_ids = GetDeviceIDs(platform_ids[0]);
if(device_ids == NULL) {
Shutdown();
}

// OpenCL Context
context = clCreateContext(NULL, 1, device_ids, NULL, NULL, &err);
CheckError(err);

// OpenCL Command Queue
command = clCreateCommandQueue(context, device_ids[0], 0, &err);
CheckError(err);

// OpenCL Program Build
err = BuildProgram(source_code, sizeof(source_code), context, &program);
CheckError(err);

// OpenCL Kernel Object
kernel = clCreateKernel(program, "CLCopy", &err);
CheckError(err);

// Memory Allocation for Image
err = CreateImage2D(context, 16, 16, &imsrc);
err |= CreateImage2D(context, 16, 16, &imdst);
CheckError(err);

err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&imsrc);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&imdst);
CheckError(err);

// Write Image Data to Device
err = WriteImage2D(command, imsrc, 16, 16, 16, (void*)sample);
CheckError(err);

// Execute
size_t division = 1;
for(int i=0; i<4; i++) {
size_t size_global[2] = {16, 16};
size_t size_local[2] = {division, division};

// Clear Destination Buffer
err = WriteImage2D(command, imdst, 16, 16, 16, (void*)zeros);
CheckError(err);

// Execute Kernel Program
err = ExecuteKernel(command, kernel, size_global, size_local);
CheckError(err);

// Read Result Image
unsigned char result[256];
err = ReadImage2D(command, imdst, 16, 16, 16, (void*)result);
if(err == CL_SUCCESS) {
printf("[global (%d, %d), local (%d %d)]\n",
size_global[0], size_global[1],
size_local[0], size_local[1]);
Print(result);
printf("\n");
}

division *= 2;
}

Shutdown();
}
スポンサーサイト

テーマ : プログラミング
ジャンル : コンピュータ

コメントの投稿

非公開コメント

プロフィール

Ishida Akihiko

Author:Ishida Akihiko
FC2ブログへようこそ!

免責事項
当サイトに掲載する記事内容は,必ずしも正確性,信頼性,妥当性,有用性,完成度などを保証しません.記事の利用はすべて自己責任でお願いします.当サイトに掲載された内容によって発生したいかなる損害に対しても,管理人は一切の責任を負いかねます.
最新記事
最新コメント
最新トラックバック
月別アーカイブ
カテゴリ
アクセスカウンター
検索フォーム
RSSリンクの表示
リンク
ブロとも申請フォーム

この人とブロともになる

QRコード
QR
上記広告は1ヶ月以上更新のないブログに表示されています。新しい記事を書くことで広告を消せます。