euler3d.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. /********************************************************************
  2. euler3d.cpp
  3. : parallelized code of CFD
  4. - original code from the AIAA-2009-4001 by Andrew Corrigan, acorriga@gmu.edu
  5. - parallelization with OpenCL API has been applied by
  6. Jianbin Fang - j.fang@tudelft.nl
  7. Delft University of Technology
  8. Faculty of Electrical Engineering, Mathematics and Computer Science
  9. Department of Software Technology
  10. Parallel and Distributed Systems Group
  11. on 24/03/2011
  12. ********************************************************************/
  13. #include <iostream>
  14. #include <fstream>
  15. #include <math.h>
  16. #include "CLHelper.h"
  17. /*
  18. * Options
  19. *
  20. */
  21. #define GAMMA 1.4f
  22. #define iterations 2000
  23. #ifndef block_length
  24. #define block_length 192
  25. #endif
  26. #define NDIM 3
  27. #define NNB 4
  28. #define RK 3 // 3rd order RK
  29. #define ff_mach 1.2f
  30. #define deg_angle_of_attack 0.0f
  31. /*
  32. * not options
  33. */
  34. #if block_length > 128
  35. #warning "the kernels may fail too launch on some systems if the block length is too large"
  36. #endif
  37. #define VAR_DENSITY 0
  38. #define VAR_MOMENTUM 1
  39. #define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
  40. #define NVAR (VAR_DENSITY_ENERGY+1)
  41. //self-defined user type
  42. typedef struct{
  43. float x;
  44. float y;
  45. float z;
  46. } float3;
  47. /*
  48. * Generic functions
  49. */
  50. template <typename T>
  51. cl_mem alloc(int N){
  52. cl_mem mem_d = _clMalloc(sizeof(T)*N);
  53. return mem_d;
  54. }
  55. template <typename T>
  56. void dealloc(cl_mem array){
  57. _clFree(array);
  58. }
  59. template <typename T>
  60. void copy(cl_mem dst, cl_mem src, int N){
  61. _clMemcpyD2D(dst, src, N*sizeof(T));
  62. }
  63. template <typename T>
  64. void upload(cl_mem dst, T* src, int N){
  65. _clMemcpyH2D(dst, src, N*sizeof(T));
  66. }
  67. template <typename T>
  68. void download(T* dst, cl_mem src, int N){
  69. _clMemcpyD2H(dst, src, N*sizeof(T));
  70. }
  71. void dump(cl_mem variables, int nel, int nelr){
  72. float* h_variables = new float[nelr*NVAR];
  73. download(h_variables, variables, nelr*NVAR);
  74. {
  75. std::ofstream file("density");
  76. file << nel << " " << nelr << std::endl;
  77. for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY*nelr] << std::endl;
  78. }
  79. {
  80. std::ofstream file("momentum");
  81. file << nel << " " << nelr << std::endl;
  82. for(int i = 0; i < nel; i++)
  83. {
  84. for(int j = 0; j != NDIM; j++)
  85. file << h_variables[i + (VAR_MOMENTUM+j)*nelr] << " ";
  86. file << std::endl;
  87. }
  88. }
  89. {
  90. std::ofstream file("density_energy");
  91. file << nel << " " << nelr << std::endl;
  92. for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY_ENERGY*nelr] << std::endl;
  93. }
  94. delete[] h_variables;
  95. }
  96. void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string){
  97. int work_items = nelr;
  98. int work_group_size = BLOCK_SIZE_1;
  99. int kernel_id = 1;
  100. int arg_idx = 0;
  101. _clSetArgs(kernel_id, arg_idx++, variables);
  102. _clSetArgs(kernel_id, arg_idx++, ff_variable);
  103. _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
  104. _clInvokeKernel(kernel_id, work_items, work_group_size);
  105. }
  106. void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors){
  107. int work_items = nelr;
  108. int work_group_size = BLOCK_SIZE_2;
  109. int kernel_id = 2;
  110. int arg_idx = 0;
  111. _clSetArgs(kernel_id, arg_idx++, variables);
  112. _clSetArgs(kernel_id, arg_idx++, areas);
  113. _clSetArgs(kernel_id, arg_idx++, step_factors);
  114. _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
  115. _clInvokeKernel(kernel_id, work_items, work_group_size);
  116. }
  117. void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable, \
  118. cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
  119. cl_mem ff_flux_contribution_momentum_x,
  120. cl_mem ff_flux_contribution_momentum_y,
  121. cl_mem ff_flux_contribution_momentum_z){
  122. int work_items = nelr;
  123. int work_group_size = BLOCK_SIZE_3;
  124. int kernel_id = 3;
  125. int arg_idx = 0;
  126. _clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
  127. _clSetArgs(kernel_id, arg_idx++, normals);
  128. _clSetArgs(kernel_id, arg_idx++, variables);
  129. _clSetArgs(kernel_id, arg_idx++, ff_variable);
  130. _clSetArgs(kernel_id, arg_idx++, fluxes);
  131. _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
  132. _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
  133. _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
  134. _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
  135. _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
  136. _clInvokeKernel(kernel_id, work_items, work_group_size);
  137. }
  138. void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes){
  139. int work_items = nelr;
  140. int work_group_size = BLOCK_SIZE_4;
  141. int kernel_id = 4;
  142. int arg_idx = 0;
  143. _clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
  144. _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
  145. _clSetArgs(kernel_id, arg_idx++, old_variables);
  146. _clSetArgs(kernel_id, arg_idx++, variables);
  147. _clSetArgs(kernel_id, arg_idx++, step_factors);
  148. _clSetArgs(kernel_id, arg_idx++, fluxes);
  149. _clInvokeKernel(kernel_id, work_items, work_group_size);
  150. }
  151. inline void compute_flux_contribution(float& density, float3& momentum, float& density_energy, float& pressure, float3& velocity, float3& fc_momentum_x, float3& fc_momentum_y, float3& fc_momentum_z, float3& fc_density_energy)
  152. {
  153. fc_momentum_x.x = velocity.x*momentum.x + pressure;
  154. fc_momentum_x.y = velocity.x*momentum.y;
  155. fc_momentum_x.z = velocity.x*momentum.z;
  156. fc_momentum_y.x = fc_momentum_x.y;
  157. fc_momentum_y.y = velocity.y*momentum.y + pressure;
  158. fc_momentum_y.z = velocity.y*momentum.z;
  159. fc_momentum_z.x = fc_momentum_x.z;
  160. fc_momentum_z.y = fc_momentum_y.z;
  161. fc_momentum_z.z = velocity.z*momentum.z + pressure;
  162. float de_p = density_energy+pressure;
  163. fc_density_energy.x = velocity.x*de_p;
  164. fc_density_energy.y = velocity.y*de_p;
  165. fc_density_energy.z = velocity.z*de_p;
  166. }
  167. /*
  168. * Main function
  169. */
  170. int main(int argc, char** argv){
  171. printf("WG size of kernel:initialize = %d, WG size of kernel:compute_step_factor = %d, WG size of kernel:compute_flux = %d, WG size of kernel:time_step = %d\n", BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
  172. if (argc < 2){
  173. std::cout << "specify data file name and [platform id] [device id] [device type] " << std::endl;
  174. return 0;
  175. }
  176. const char* data_file_name = argv[1];
  177. _clCmdParams(argc, argv);
  178. cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
  179. cl_mem areas, elements_surrounding_elements, normals;
  180. cl_mem variables, old_variables, fluxes, step_factors;
  181. float h_ff_variable[NVAR];
  182. try{
  183. _clInit(device_type, device_id);
  184. // set far field conditions and load them into constant memory on the gpu
  185. {
  186. //float h_ff_variable[NVAR];
  187. const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
  188. h_ff_variable[VAR_DENSITY] = float(1.4);
  189. float ff_pressure = float(1.0f);
  190. float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
  191. float ff_speed = float(ff_mach)*ff_speed_of_sound;
  192. float3 ff_velocity;
  193. ff_velocity.x = ff_speed*float(cos((float)angle_of_attack));
  194. ff_velocity.y = ff_speed*float(sin((float)angle_of_attack));
  195. ff_velocity.z = 0.0f;
  196. h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
  197. h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
  198. h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
  199. h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*(float(0.5f)*(ff_speed*ff_speed)) + (ff_pressure / float(GAMMA-1.0f));
  200. float3 h_ff_momentum;
  201. h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
  202. h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
  203. h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
  204. float3 h_ff_flux_contribution_momentum_x;
  205. float3 h_ff_flux_contribution_momentum_y;
  206. float3 h_ff_flux_contribution_momentum_z;
  207. float3 h_ff_flux_contribution_density_energy;
  208. compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
  209. // copy far field conditions to the gpu
  210. //cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
  211. ff_variable = _clMalloc(NVAR*sizeof(float));
  212. ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
  213. ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
  214. ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
  215. ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
  216. _clMemcpyH2D(ff_variable, h_ff_variable, NVAR*sizeof(float));
  217. _clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
  218. _clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
  219. _clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));
  220. _clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
  221. _clFinish();
  222. }
  223. int nel;
  224. int nelr;
  225. // read in domain geometry
  226. //float* areas;
  227. //int* elements_surrounding_elements;
  228. //float* normals;
  229. {
  230. std::ifstream file(data_file_name);
  231. if(file==NULL){
  232. throw(string("can not find/open file!"));
  233. }
  234. file >> nel;
  235. nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));
  236. std::cout<<"--cambine: nel="<<nel<<", nelr="<<nelr<<std::endl;
  237. float* h_areas = new float[nelr];
  238. int* h_elements_surrounding_elements = new int[nelr*NNB];
  239. float* h_normals = new float[nelr*NDIM*NNB];
  240. // read in data
  241. for(int i = 0; i < nel; i++)
  242. {
  243. file >> h_areas[i];
  244. for(int j = 0; j < NNB; j++)
  245. {
  246. file >> h_elements_surrounding_elements[i + j*nelr];
  247. if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
  248. h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering
  249. for(int k = 0; k < NDIM; k++)
  250. {
  251. file >> h_normals[i + (j + k*NNB)*nelr];
  252. h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
  253. }
  254. }
  255. }
  256. // fill in remaining data
  257. int last = nel-1;
  258. for(int i = nel; i < nelr; i++)
  259. {
  260. h_areas[i] = h_areas[last];
  261. for(int j = 0; j < NNB; j++)
  262. {
  263. // duplicate the last element
  264. h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
  265. for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
  266. }
  267. }
  268. areas = alloc<float>(nelr);
  269. upload<float>(areas, h_areas, nelr);
  270. elements_surrounding_elements = alloc<int>(nelr*NNB);
  271. upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);
  272. normals = alloc<float>(nelr*NDIM*NNB);
  273. upload<float>(normals, h_normals, nelr*NDIM*NNB);
  274. delete[] h_areas;
  275. delete[] h_elements_surrounding_elements;
  276. delete[] h_normals;
  277. }
  278. // Create arrays and set initial conditions
  279. variables = alloc<float>(nelr*NVAR);
  280. int tp = 0;
  281. initialize_variables(nelr, variables, ff_variable);
  282. old_variables = alloc<float>(nelr*NVAR);
  283. fluxes = alloc<float>(nelr*NVAR);
  284. step_factors = alloc<float>(nelr);
  285. // make sure all memory is floatly allocated before we start timing
  286. initialize_variables(nelr, old_variables, ff_variable);
  287. initialize_variables(nelr, fluxes, ff_variable);
  288. _clMemset(step_factors, 0, sizeof(float)*nelr);
  289. // make sure CUDA isn't still doing something before we start timing
  290. _clFinish();
  291. // these need to be computed the first time in order to compute time step
  292. std::cout << "Starting..." << std::endl;
  293. // Begin iterations
  294. for(int i = 0; i < iterations; i++){
  295. copy<float>(old_variables, variables, nelr*NVAR);
  296. // for the first iteration we compute the time step
  297. compute_step_factor(nelr, variables, areas, step_factors);
  298. for(int j = 0; j < RK; j++){
  299. compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, \
  300. ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
  301. time_step(j, nelr, old_variables, variables, step_factors, fluxes);
  302. }
  303. }
  304. _clFinish();
  305. std::cout << "Saving solution..." << std::endl;
  306. dump(variables, nel, nelr);
  307. std::cout << "Saved solution..." << std::endl;
  308. _clStatistics();
  309. std::cout << "Cleaning up..." << std::endl;
  310. //--release resources
  311. _clFree(ff_variable);
  312. _clFree(ff_flux_contribution_momentum_x);
  313. _clFree(ff_flux_contribution_momentum_y);
  314. _clFree(ff_flux_contribution_momentum_z);
  315. _clFree(ff_flux_contribution_density_energy);
  316. _clFree(areas);
  317. _clFree(elements_surrounding_elements);
  318. _clFree(normals);
  319. _clFree(variables);
  320. _clFree(old_variables);
  321. _clFree(fluxes);
  322. _clFree(step_factors);
  323. _clRelease();
  324. std::cout << "Done..." << std::endl;
  325. }
  326. catch(string msg){
  327. std::cout<<"--cambine:( an exception catched in main body ->"<<msg<<std::endl;
  328. _clFree(ff_variable);
  329. _clFree(ff_flux_contribution_momentum_x);
  330. _clFree(ff_flux_contribution_momentum_y);
  331. _clFree(ff_flux_contribution_momentum_z);
  332. _clFree(ff_flux_contribution_density_energy);
  333. _clFree(areas);
  334. _clFree(elements_surrounding_elements);
  335. _clFree(normals);
  336. _clFree(variables);
  337. _clFree(old_variables);
  338. _clFree(fluxes);
  339. _clFree(step_factors);
  340. _clRelease();
  341. }
  342. catch(...){
  343. std::cout<<"--cambine:( unknow exceptions in main body..."<<std::endl;
  344. _clFree(ff_variable);
  345. _clFree(ff_flux_contribution_momentum_x);
  346. _clFree(ff_flux_contribution_momentum_y);
  347. _clFree(ff_flux_contribution_momentum_z);
  348. _clFree(ff_flux_contribution_density_energy);
  349. _clFree(areas);
  350. _clFree(elements_surrounding_elements);
  351. _clFree(normals);
  352. _clFree(variables);
  353. _clFree(old_variables);
  354. _clFree(fluxes);
  355. _clFree(step_factors);
  356. _clRelease();
  357. }
  358. return 0;
  359. }