main.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. // #ifdef __cplusplus
  2. // extern "C" {
  3. // #endif
  4. //========================================================================================================================================================================================================200
  5. //======================================================================================================================================================150
  6. //====================================================================================================100
  7. //==================================================50
  8. //========================================================================================================================================================================================================200
  9. // UPDATE
  10. //========================================================================================================================================================================================================200
  11. // Lukasz G. Szafaryn 24 JAN 09
  12. //========================================================================================================================================================================================================200
  13. // DESCRIPTION
  14. //========================================================================================================================================================================================================200
  15. // Myocyte application models cardiac myocyte (heart muscle cell) and simulates its behavior according to the work by Saucerman and Bers [8]. The model integrates
  16. // cardiac myocyte electrical activity with the calcineurin pathway, which is a key aspect of the development of heart failure. The model spans large number of temporal
  17. // scales to reflect how changes in heart rate as observed during exercise or stress contribute to calcineurin pathway activation, which ultimately leads to the expression
  18. // of numerous genes that remodel the heart’s structure. It can be used to identify potential therapeutic targets that may be useful for the treatment of heart failure.
  19. // Biochemical reactions, ion transport and electrical activity in the cell are modeled with 91 ordinary differential equations (ODEs) that are determined by more than 200
  20. // experimentally validated parameters. The model is simulated by solving this group of ODEs for a specified time interval. The process of ODE solving is based on the
  21. // causal relationship between values of ODEs at different time steps, thus it is mostly sequential. At every dynamically determined time step, the solver evaluates the
  22. // model consisting of a set of 91 ODEs and 480 supporting equations to determine behavior of the system at that particular time instance. If evaluation results are not
  23. // within the expected tolerance at a given time step (usually as a result of incorrect determination of the time step), another calculation attempt is made at a modified
  24. // (usually reduced) time step. Since the ODEs are stiff (exhibit fast rate of change within short time intervals), they need to be simulated at small time scales with an
  25. // adaptive step size solver.
  26. // 1) The original version of the current solver code was obtained from: Mathematics Source Library (http://mymathlib.webtrellis.net/index.html). The solver has been
  27. // somewhat modified to tailor it to our needs. However, it can be reverted back to original form or modified to suit other simulations.
  28. // 2) This solver and particular solving algorithm used with it (embedded_fehlberg_7_8) were adapted to work with a set of equations, not just one like in original version.
  29. // 3) In order for solver to provide deterministic number of steps (needed for particular amount of memore previousely allocated for results), every next step is
  30. // incremented by 1 time unit (h_init).
  31. // 4) Function assumes that simulation starts at some point of time (whatever time the initial values are provided for) and runs for the number of miliseconds (xmax)
  32. // specified by the uses as a parameter on command line.
  33. // 5) The appropriate amount of memory is previousely allocated for that range (y).
  34. // 6) This setup in 3) - 5) allows solver to adjust the step ony from current time instance to current time instance + 0.9. The next time instance is current time instance + 1;
  35. // 7) The original solver cannot handle cases when equations return NAN and INF values due to discontinuities and /0. That is why equations provided by user need to
  36. // make sure that no NAN and INF are returned.
  37. // 8) Application reads initial data and parameters from text files: y.txt and params.txt respectively that need to be located in the same folder as source files.
  38. // For simplicity and testing purposes only, when multiple number of simulation instances is specified, application still reads initial data from the same input files. That
  39. // can be modified in this source code.
  40. //========================================================================================================================================================================================================200
  41. // IMPLEMENTATION-SPECIFIC DESCRIPTION (CUDA)
  42. //========================================================================================================================================================================================================200
  43. // This is the CUDA version of Myocyte code.
  44. // The original single-threaded code was written in MATLAB and used MATLAB ode45 ODE solver. In the process of accelerating this code, we arrived with the
  45. // intermediate versions that used single-threaded Sundials CVODE solver which evaluated model parallelized with CUDA at each time step. In order to convert entire
  46. // solver to CUDA code (to remove some of the operational overheads such as kernel launches and data transfer in CUDA) we used a simpler solver, from Mathematics
  47. // Source Library, and tailored it to our needs. The parallelism in the cardiac myocyte model is on a very fine-grained level, close to that of ILP, therefore it is very hard
  48. // to exploit as DLP or TLB in CUDA code. We were able to divide the model into 4 individual groups that run in parallel. However, even that is not enough work to
  49. // compensate for some of the CUDA thread launch and data transfer overheads which resulted in performance worse than that of single-threaded C code. Speedup in
  50. // this code could be achieved only if a customizable accelerator such as FPGA was used for evaluation of the model itself. We also approached the application from
  51. // another angle and allowed it to run several concurrent simulations, thus turning it into an embarrassingly parallel problem. This version of the code is also useful for
  52. // scientists who want to run the same simulation with different sets of input parameters. Speedup achieved with CUDA code is variable on the other hand. It depends on
  53. // the number of concurrent simulations and it saturates around 300 simulations with the speedup of about 10x.
  54. // Speedup numbers reported in the description of this application were obtained on the machine with: Intel Quad Core CPU, 4GB of RAM, Nvidia GTX280 GPU.
  55. // 1) When running with parallelization inside each simulation instance (value of 3rd command line parameter equal to 0), performance is bad because:
  56. // a) underutilization of GPU (only 1 out of 32 threads in each block)
  57. // b) significant CPU-GPU memory copy overhead
  58. // c) kernel launch overhead (kernel needs to be finished every time model is evaluated as it is the only way to synchronize threads in different blocks)
  59. // 2) When running with parallelization across simulation instances, code gets continues speedup with the increasing number of simulation insances which saturates
  60. // around 240 instances on GTX280 (roughly corresponding to the number of multiprocessorsXprocessors in GTX280), with the speedup of around 10x compared
  61. // to serial C version of code. Limited performance is explained mainly by:
  62. // a) significant CPU-GPU memory copy overhead
  63. // b) increasingly uncoalesced memory accesses with the increasing number of workloads
  64. // c) lack of cache compared to CPU, or no use of shared memory to compensate
  65. // d) frequency of GPU shader being lower than that of CPU core
  66. // 3) GPU version has an issue with memory allocation that has not been resolved yet. For certain simulation ranges memory allocation fails, or pointers incorrectly overlap
  67. // causeing value trashing.
  68. // The following are the command parameters to the application:
  69. // 1) Simulation time interval which is the number of miliseconds to simulate. Needs to be integer > 0
  70. // Example:
  71. // ./a.out -time 100
  72. //========================================================================================================================================================================================================200
  73. // DEFINE / INCLUDE
  74. //========================================================================================================================================================================================================200
  75. //======================================================================================================================================================150
  76. // DEFINE
  77. //======================================================================================================================================================150
  78. //======================================================================================================================================================150
  79. // COMMON
  80. //======================================================================================================================================================150
  81. #include "./common.h" // (in directory)
  82. //======================================================================================================================================================150
  83. // DEFINE
  84. //======================================================================================================================================================150
  85. //======================================================================================================================================================150
  86. // UTILITIES
  87. //======================================================================================================================================================150
  88. #include "./util/file/file.h" // (in directory)
  89. #include "./util/timer/timer.h" // (in directory)
  90. #include "./util/num/num.h" // (in directory)
  91. //======================================================================================================================================================150
  92. // KERNEL
  93. //======================================================================================================================================================150
  94. #include "./kernel/kernel_gpu_opencl_wrapper.h" // (in directory)
  95. //======================================================================================================================================================150
  96. // LIBRARIES
  97. //======================================================================================================================================================150
  98. #include <stdio.h> // (in path known to compiler) needed by printf
  99. #include <stdlib.h> // (in path known to compiler) meeded by malloc, free
  100. //======================================================================================================================================================150
  101. // HEADER
  102. //======================================================================================================================================================150
  103. #include "./main.h" // (in directory)
  104. //======================================================================================================================================================150
  105. // END
  106. //======================================================================================================================================================150
  107. //========================================================================================================================================================================================================200
  108. // MAIN FUNCTION
  109. //========================================================================================================================================================================================================200
  110. int
  111. main( int argc,
  112. char *argv []){
  113. //======================================================================================================================================================150
  114. // INPUT ARGUMENTS
  115. //======================================================================================================================================================150
  116. // assing default values
  117. int cur_arg;
  118. int xmax;
  119. int workload = 1;
  120. // go through arguments
  121. if(argc==3){
  122. for(cur_arg=1; cur_arg<argc; cur_arg++){
  123. // check if -time
  124. if(strcmp(argv[cur_arg], "-time")==0){
  125. // check if value provided
  126. if(argc>=cur_arg+1){
  127. // check if value is a number
  128. if(isInteger(argv[cur_arg+1])==1){
  129. xmax = atoi(argv[cur_arg+1]);
  130. if(xmax<0){
  131. printf("ERROR: Wrong value to -time argument, cannot be <=0\n");
  132. return 0;
  133. }
  134. cur_arg = cur_arg+1;
  135. }
  136. // value is not a number
  137. else{
  138. printf("ERROR: Value to -time argument in not a number\n");
  139. return 0;
  140. }
  141. }
  142. // value not provided
  143. else{
  144. printf("ERROR: Missing value to -time argument\n");
  145. return 0;
  146. }
  147. }
  148. // unknown
  149. else{
  150. printf("ERROR: Unknown argument\n");
  151. return 0;
  152. }
  153. }
  154. // Print configuration
  155. //printf("Configuration used: arch = %d, cores = %d, time = %d\n", arch_arg, cores_arg, xmax);
  156. }
  157. else{
  158. printf("Provide time argument, example: -time 100");
  159. return 0;
  160. }
  161. //======================================================================================================================================================150
  162. // EXECUTION IF THERE IS 1 WORKLOAD, PARALLELIZE INSIDE 1 WORKLOAD
  163. //======================================================================================================================================================150
  164. if(workload == 1){
  165. //====================================================================================================100
  166. // VARIABLES
  167. //====================================================================================================100
  168. int i,j;
  169. //====================================================================================================100
  170. // MEMORY CHECK
  171. //====================================================================================================100
  172. long long memory;
  173. memory = workload*(xmax+1)*EQUATIONS*4;
  174. if(memory>1000000000){
  175. printf("ERROR: trying to allocate more than 1.0GB of memory, decrease workload and span parameters or change memory parameter\n");
  176. return 0;
  177. }
  178. //====================================================================================================100
  179. // ALLOCATE ARRAYS
  180. //====================================================================================================100
  181. fp*** y;
  182. y = (fp ***) malloc(workload* sizeof(fp **));
  183. for(i=0; i<workload; i++){
  184. y[i] = (fp**)malloc((1+xmax)*sizeof(fp*));
  185. for(j=0; j<(1+xmax); j++){
  186. y[i][j]= (fp *) malloc(EQUATIONS* sizeof(fp));
  187. }
  188. }
  189. fp** x;
  190. x = (fp **) malloc(workload * sizeof(fp *));
  191. for (i= 0; i<workload; i++){
  192. x[i]= (fp *)malloc((1+xmax) *sizeof(fp));
  193. }
  194. fp** params;
  195. params = (fp **) malloc(workload * sizeof(fp *));
  196. for (i= 0; i<workload; i++){
  197. params[i]= (fp *)malloc(PARAMETERS * sizeof(fp));
  198. }
  199. fp* com;
  200. com = (fp*)malloc(3 * sizeof(fp));
  201. //====================================================================================================100
  202. // INITIAL VALUES
  203. //====================================================================================================100
  204. // y
  205. for(i=0; i<workload; i++){
  206. read_file( "../../data/myocyte/y.txt",
  207. y[i][0],
  208. EQUATIONS,
  209. 1,
  210. 0);
  211. }
  212. // params
  213. for(i=0; i<workload; i++){
  214. read_file("../../data/myocyte/params.txt",
  215. params[i],
  216. PARAMETERS,
  217. 1,
  218. 0);
  219. }
  220. //====================================================================================================100
  221. // COMPUTATION
  222. //====================================================================================================100
  223. kernel_gpu_opencl_wrapper( xmax, // span
  224. workload, // # of workloads
  225. y,
  226. x,
  227. params,
  228. com);
  229. FILE * pFile;
  230. pFile = fopen ("output.txt","w");
  231. if (pFile==NULL)
  232. {
  233. fputs ("fopen example",pFile);
  234. return -1;
  235. }
  236. // print results
  237. int k;
  238. for(i=0; i<workload; i++){
  239. fprintf(pFile, "WORKLOAD %d:\n", i);
  240. for(j=0; j<(xmax+1); j++){
  241. fprintf(pFile, "\tTIME %d:\n", j);
  242. for(k=0; k<EQUATIONS; k++){
  243. fprintf(pFile, "\t\ty[%d][%d][%d]=%10.7e\n", i, j, k, y[i][j][k]);
  244. }
  245. }
  246. }
  247. fclose (pFile);
  248. //====================================================================================================100
  249. // FREE SYSTEM MEMORY
  250. //====================================================================================================100
  251. // y values
  252. for (i= 0; i< workload; i++){
  253. for (j= 0; j< (1+xmax); j++){
  254. free(y[i][j]);
  255. }
  256. free(y[i]);
  257. }
  258. free(y);
  259. // x values
  260. for (i= 0; i< workload; i++){
  261. free(x[i]);
  262. }
  263. free(x);
  264. // parameters
  265. for (i= 0; i< workload; i++){
  266. free(params[i]);
  267. }
  268. free(params);
  269. // com
  270. free(com);
  271. //====================================================================================================100
  272. // END
  273. //====================================================================================================100
  274. }
  275. //======================================================================================================================================================150
  276. // RETURN
  277. //======================================================================================================================================================150
  278. return 0;
  279. //======================================================================================================================================================150
  280. // END
  281. //======================================================================================================================================================150
  282. }
  283. //========================================================================================================================================================================================================200
  284. // END
  285. //========================================================================================================================================================================================================200
  286. // #ifdef __cplusplus
  287. // }
  288. // #endif