複数のGPUでマンデルブロ集合を並列計算

範囲(-0.005, -0.005, 0.005, 0.005)のマンデルブロ集合を描画サイズ4,800x4,800、繰り返しの上限10,000として計算させてみた。C++でコードを書いてXeon X5650 1コアで走らせたところ942.97秒かかった(SIMD最適化はしていない)。TBBを利用した12論理コアの並列計算では88.68秒だった。次いでCUDAを使ってTeslaで走らせてみたが7.45秒まで短縮された。Teslaパない。そして贅沢にも2枚のTeslaをスレッドで並列化して使ってみたら4.26秒で計算できた。実にXeon 1コアの220倍の速度が出たわけだ。カリカリチューンをしなくてもこの程度の高速化ができるということが重要だ。

因みにマンデルブロ集合の上記の範囲は数値が発散しない集合部分であり、描画させてみても真っ黒なので面白くはない。これは並列化した際に計算が偏らないようにしたかったのでこのような範囲を指定している。プログラムで出力させたファイルは色の生データなのでそのままでは表示できない。一応、画像表示するためのPythonコードも付け加えておく。

以下に、GPU2枚を利用してマンデルブロ集合を計算するCUDAコードを示しておく。

mandelbrot_thread.cu

// Mandelbrot set using GPGPU by nox, 2011.02.12 // nvcc -lcutil_x86_64 -arch sm_13 -use_fast_math -prec-sqrt=false -keep -L ~/NVIDIA_GPU_Computing_SDK/C/lib -I ~/NVIDIA_GPU_Computing_SDK/C/common/inc -g mandelbrot_thread.cu -o mandelbrot_thread #include <iostream> #include <fstream> #include <cutil_inline.h> #include <multithreading.h> using namespace std; const int BLOCK_SIZE_X = 16; const int BLOCK_SIZE_Y = 16; struct MandelbrotParam { int dev; // GPU device id uchar4* h_color; // image buffer float t, l, w, h; // top, left, width, height int sw, sh; // screen width, screen height int max_iter; // max number of iterators float th; // threshold }; __device__ uchar4 coloring(int n, int b) { int d = n % b; d *= 256 / b; int m = static_cast<int>(d / 42.667f); switch (m) { case 0: return make_uchar4(0, 6 * d, 255, 0); case 1: return make_uchar4(0, 255, 255 - 6 * (d - 43), 0); case 2: return make_uchar4(6 * (d - 86), 255, 0, 0); case 3: return make_uchar4(255, 255 - 6 * (d - 129), 0, 0); case 4: return make_uchar4(255, 0, 6 * (d - 171), 0); case 5: return make_uchar4(255 - 6 * (d - 214), 0, 255, 0); default: return make_uchar4(0, 0, 0, 0); } } __global__ void mandelbrot(float t, float l, float w, float h, int sw, int sh, int max_iter, float th, uchar4* d_color) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= sw || iy >= sh) return; float ci = t + (static_cast<float>(iy) / sh) * h; float cr = l + (static_cast<float>(ix) / sw) * w; float zi = 0.0f; float zr = 0.0f; float zrzi, zr2, zi2; for (int i = 0; i < max_iter; i++) { zrzi = zr * zi; zr2 = zr * zr; zi2 = zi * zi; zr = zr2 - zi2 + cr; zi = zrzi + zrzi + ci; if (zi2 + zr2 >= th) { d_color[ix*sh+iy] = coloring(i, 256); return; } } d_color[ix*sh+iy] = make_uchar4(0, 0, 0, 0); } CUT_THREADPROC mandelbrot_thread(void* args) { MandelbrotParam* param = (MandelbrotParam*)args; cudaSetDevice(param->dev); uchar4* h_color = param->h_color; float t = param->t; float l = param->l; float w = param->w; float h = param->h; int sw = param->sw; int sh = param->sh; int max_iter = param->max_iter; float th = param->th; dim3 num_blocks((sw - 1) / BLOCK_SIZE_X + 1, (sh - 1) / BLOCK_SIZE_Y + 1, 1); dim3 num_threads(BLOCK_SIZE_X, BLOCK_SIZE_Y, 1); uchar4* d_color; cudaMalloc((void**)&d_color, sizeof(uchar4) * sw * sh); mandelbrot<<<num_blocks, num_threads>>>(t, l, w, h, sw, sh, max_iter, th, d_color); cudaThreadSynchronize(); cudaMemcpy(h_color, d_color, sizeof(uchar4) * sw * sh, cudaMemcpyDeviceToHost); cudaFree(d_color); CUT_THREADEND; } // outfile : output filename // t : top position // l : left position // w : width // h : height // sw : screen width // sh : screen height // max_iter : max number of iterations [256] // th : threshold [2.0] void write_mandelbrot(const string outfile, float t, float l, float w, float h, int sw, int sh, int max_iter=256, float th=2.0f) { MandelbrotParam param[2]; uchar4* h_color; cudaMallocHost((void**)&h_color, sizeof(uchar4) * sw * sh); // using 2 GPUs for (int i = 0; i < 2; i++) { param[i].dev = i; param[i].h_color = h_color; param[i].t = t; param[i].l = l; param[i].w = w / 2.0f; param[i].h = h; param[i].sw = sw / 2; param[i].sh = sh; param[i].max_iter = max_iter; param[i].th = th; } param[1].h_color = h_color + sw * sh / 2; param[1].l = l + w / 2.0f; unsigned int timer; cutCreateTimer(&timer); cutStartTimer(timer); CUTThread threads[2]; for (int i = 0; i < 2; i++) threads[i] = cutStartThread((CUT_THREADROUTINE)mandelbrot_thread, (void*)&param[i]); cutWaitForThreads(threads, 2); cutStopTimer(timer); cout << "Time: " << cutGetTimerValue(timer) / 1000 << endl; fstream fs(outfile.c_str(), ios_base::out); fs << sw << endl; fs << sh << endl; for (int i = 0; i < sw * sh; i++) fs << h_color[i].x << h_color[i].y << h_color[i].z; fs.close(); cudaFreeHost(h_color); } int main(int argc, char* argv[]) { string outfile("mandelbrot.dat"); if (argc >= 2) outfile = argv[1]; // write_mandelbrot(output filename, // top position, // left position, // width, // height, // screen width, // screen height, // max number of iterations [256] // threshold [2.0]); //write_mandelbrot(outfile, -1.0f, -1.5f, 2.0f, 2.0f, 480, 480, 100); //write_mandelbrot(outfile, 0.57f, 0.34f, 0.036f, 0.027f, 6400, 4800, 1000); write_mandelbrot(outfile, -0.005f, -0.005f, 0.01f, 0.01f, 4800, 4800, 10000); return 0; }

表示用のPythonコード。

read_mandelbrot.py

#!/usr/bin/env python import sys, os, Image def main(args): if len(args) < 2: print >>sys.stderr, "%s datafile [imagefile]" % os.path.basename(args[0]) sys.exit() datafile = args[1] imagefile = args[2] if len(args) > 2 else "" f = file(datafile) sw = int(f.readline()) sh = int(f.readline()) data = map(ord, f.read()) im = Image.new("RGB", (sw, sh)) for x in range(sw): for y in range(sh): pos = (x * sh + y) * 3; im.putpixel((x, sh - y - 1), tuple(data[pos:pos+3])) if imagefile: im.save(imagefile) im.show() if __name__ == "__main__": main(sys.argv)

良いもの。悪いもの。

このブログを検索

複数のGPUでマンデルブロ集合を並列計算

ラベル

コメント