extras.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. /**************************************************************************
  2. **
  3. ** Copyright (C) 1993 David E. Steward & Zbigniew Leyk, all rights reserved.
  4. **
  5. ** Meschach Library
  6. **
  7. ** This Meschach Library is provided "as is" without any express
  8. ** or implied warranty of any kind with respect to this software.
  9. ** In particular the authors shall not be liable for any direct,
  10. ** indirect, special, incidental or consequential damages arising
  11. ** in any way from use of the software.
  12. **
  13. ** Everyone is granted permission to copy, modify and redistribute this
  14. ** Meschach Library, provided:
  15. ** 1. All copies contain this copyright notice.
  16. ** 2. All modified copies shall carry a notice stating who
  17. ** made the last modification and the date of such modification.
  18. ** 3. No charge is made for this software or works derived from it.
  19. ** This clause shall not be construed as constraining other software
  20. ** distributed on the same medium as this software, nor is a
  21. ** distribution fee considered a charge.
  22. **
  23. ***************************************************************************/
  24. /*
  25. Memory port routines: MEM_COPY and MEM_ZERO
  26. */
  27. /* For BSD 4.[23] environments: using bcopy() and bzero() */
  28. #include "machine.h"
  29. #ifndef MEM_COPY
  30. void MEM_COPY(from,to,len)
  31. char *from, *to;
  32. int len;
  33. {
  34. int i;
  35. if ( from < to )
  36. {
  37. for ( i = 0; i < len; i++ )
  38. *to++ = *from++;
  39. }
  40. else
  41. {
  42. from += len; to += len;
  43. for ( i = 0; i < len; i++ )
  44. *(--to) = *(--from);
  45. }
  46. }
  47. #endif
  48. #ifndef MEM_ZERO
  49. void MEM_ZERO(ptr,len)
  50. char *ptr;
  51. int len;
  52. {
  53. int i;
  54. for ( i = 0; i < len; i++ )
  55. *(ptr++) = '\0';
  56. }
  57. #endif
  58. /*
  59. This file contains versions of something approximating the well-known
  60. BLAS routines in C, suitable for Meschach (hence the `m').
  61. These are "vanilla" implementations, at least with some consideration
  62. of the effects of caching and paging, and maybe some loop unrolling
  63. for register-rich machines
  64. */
  65. /*
  66. Organisation of matrices: it is assumed that matrices are represented
  67. by Real **'s. To keep flexibility, there is also an "initial
  68. column" parameter j0, so that the actual elements used are
  69. A[0][j0], A[0][j0+1], ..., A[0][j0+n-1]
  70. A[1][j0], A[1][j0+1], ..., A[1][j0+n-1]
  71. .. .. ... ..
  72. A[m-1][j0], A[m-1][j0+1], ..., A[m-1][j0+n-1]
  73. */
  74. static char rcsid[] = "$Id: extras.c,v 1.4 1995/06/08 15:13:15 des Exp $";
  75. #include <math.h>
  76. #define REGISTER_RICH 1
  77. /* mblar-1 routines */
  78. /* Mscale -- sets x <- alpha.x */
  79. void Mscale(len,alpha,x)
  80. int len;
  81. double alpha;
  82. Real *x;
  83. {
  84. register int i;
  85. for ( i = 0; i < len; i++ )
  86. x[i] *= alpha;
  87. }
  88. /* Mswap -- swaps x and y */
  89. void Mswap(len,x,y)
  90. int len;
  91. Real *x, *y;
  92. {
  93. register int i;
  94. register Real tmp;
  95. for ( i = 0; i < len; i++ )
  96. {
  97. tmp = x[i];
  98. x[i] = y[i];
  99. y[i] = tmp;
  100. }
  101. }
  102. /* Mcopy -- copies x to y */
  103. void Mcopy(len,x,y)
  104. int len;
  105. Real *x, *y;
  106. {
  107. register int i;
  108. for ( i = 0; i < len; i++ )
  109. y[i] = x[i];
  110. }
  111. /* Maxpy -- y <- y + alpha.x */
  112. void Maxpy(len,alpha,x,y)
  113. int len;
  114. double alpha;
  115. Real *x, *y;
  116. {
  117. register int i, len4;
  118. /****************************************
  119. for ( i = 0; i < len; i++ )
  120. y[i] += alpha*x[i];
  121. ****************************************/
  122. #ifdef REGISTER_RICH
  123. len4 = len / 4;
  124. len = len % 4;
  125. for ( i = 0; i < len4; i++ )
  126. {
  127. y[4*i] += alpha*x[4*i];
  128. y[4*i+1] += alpha*x[4*i+1];
  129. y[4*i+2] += alpha*x[4*i+2];
  130. y[4*i+3] += alpha*x[4*i+3];
  131. }
  132. x += 4*len4; y += 4*len4;
  133. #endif
  134. for ( i = 0; i < len; i++ )
  135. y[i] += alpha*x[i];
  136. }
  137. /* Mdot -- returns x'.y */
  138. double Mdot(len,x,y)
  139. int len;
  140. Real *x, *y;
  141. {
  142. register int i, len4;
  143. register Real sum;
  144. #ifndef REGISTER_RICH
  145. sum = 0.0;
  146. #endif
  147. #ifdef REGISTER_RICH
  148. register Real sum0, sum1, sum2, sum3;
  149. sum0 = sum1 = sum2 = sum3 = 0.0;
  150. len4 = len / 4;
  151. len = len % 4;
  152. for ( i = 0; i < len4; i++ )
  153. {
  154. sum0 += x[4*i ]*y[4*i ];
  155. sum1 += x[4*i+1]*y[4*i+1];
  156. sum2 += x[4*i+2]*y[4*i+2];
  157. sum3 += x[4*i+3]*y[4*i+3];
  158. }
  159. sum = sum0 + sum1 + sum2 + sum3;
  160. x += 4*len4; y += 4*len4;
  161. #endif
  162. for ( i = 0; i < len; i++ )
  163. sum += x[i]*y[i];
  164. return sum;
  165. }
  166. #ifndef ABS
  167. #define ABS(x) ((x) >= 0 ? (x) : -(x))
  168. #endif
  169. /* Mnorminf -- returns ||x||_inf */
  170. double Mnorminf(len,x)
  171. int len;
  172. Real *x;
  173. {
  174. register int i;
  175. register Real tmp, max_val;
  176. max_val = 0.0;
  177. for ( i = 0; i < len; i++ )
  178. {
  179. tmp = ABS(x[i]);
  180. if ( max_val < tmp )
  181. max_val = tmp;
  182. }
  183. return max_val;
  184. }
  185. /* Mnorm1 -- returns ||x||_1 */
  186. double Mnorm1(len,x)
  187. int len;
  188. Real *x;
  189. {
  190. register int i;
  191. register Real sum;
  192. sum = 0.0;
  193. for ( i = 0; i < len; i++ )
  194. sum += ABS(x[i]);
  195. return sum;
  196. }
  197. /* Mnorm2 -- returns ||x||_2 */
  198. double Mnorm2(len,x)
  199. int len;
  200. Real *x;
  201. {
  202. register int i;
  203. register Real norm, invnorm, sum, tmp;
  204. norm = Mnorminf(len,x);
  205. if ( norm == 0.0 )
  206. return 0.0;
  207. invnorm = 1.0/norm;
  208. sum = 0.0;
  209. for ( i = 0; i < len; i++ )
  210. {
  211. tmp = x[i]*invnorm;
  212. sum += tmp*tmp;
  213. }
  214. return sum/invnorm;
  215. }
  216. /* mblar-2 routines */
  217. /* Mmv -- y <- alpha.A.x + beta.y */
  218. void Mmv(m,n,alpha,A,j0,x,beta,y)
  219. int m, n, j0;
  220. double alpha, beta;
  221. Real **A, *x, *y;
  222. {
  223. register int i, j, m4, n4;
  224. register Real sum0, sum1, sum2, sum3, tmp0, tmp1, tmp2, tmp3;
  225. register Real *dp0, *dp1, *dp2, *dp3;
  226. /****************************************
  227. for ( i = 0; i < m; i++ )
  228. y[i] += alpha*Mdot(n,&(A[i][j0]),x);
  229. ****************************************/
  230. m4 = n4 = 0;
  231. #ifdef REGISTER_RICH
  232. m4 = m / 4;
  233. m = m % 4;
  234. n4 = n / 4;
  235. n = n % 4;
  236. for ( i = 0; i < m4; i++ )
  237. {
  238. sum0 = sum1 = sum2 = sum3 = 0.0;
  239. dp0 = &(A[4*i ][j0]);
  240. dp1 = &(A[4*i+1][j0]);
  241. dp2 = &(A[4*i+2][j0]);
  242. dp3 = &(A[4*i+3][j0]);
  243. for ( j = 0; j < n4; j++ )
  244. {
  245. tmp0 = x[4*j ];
  246. tmp1 = x[4*j+1];
  247. tmp2 = x[4*j+2];
  248. tmp3 = x[4*j+3];
  249. sum0 = sum0 + dp0[j]*tmp0 + dp0[j+1]*tmp1 +
  250. dp0[j+2]*tmp2 + dp0[j+3]*tmp3;
  251. sum1 = sum1 + dp1[j]*tmp0 + dp1[j+1]*tmp1 +
  252. dp1[j+2]*tmp2 + dp1[j+3]*tmp3;
  253. sum2 = sum2 + dp2[j]*tmp0 + dp2[j+1]*tmp1 +
  254. dp2[j+2]*tmp2 + dp2[j+3]*tmp3;
  255. sum3 = sum3 + dp3[j]*tmp0 + dp3[j+1]*tmp2 +
  256. dp3[j+2]*tmp2 + dp3[j+3]*tmp3;
  257. }
  258. for ( j = 0; j < n; j++ )
  259. {
  260. sum0 += dp0[4*n4+j]*x[4*n4+j];
  261. sum1 += dp1[4*n4+j]*x[4*n4+j];
  262. sum2 += dp2[4*n4+j]*x[4*n4+j];
  263. sum3 += dp3[4*n4+j]*x[4*n4+j];
  264. }
  265. y[4*i ] = beta*y[4*i ] + alpha*sum0;
  266. y[4*i+1] = beta*y[4*i+1] + alpha*sum1;
  267. y[4*i+2] = beta*y[4*i+2] + alpha*sum2;
  268. y[4*i+3] = beta*y[4*i+3] + alpha*sum3;
  269. }
  270. #endif
  271. for ( i = 0; i < m; i++ )
  272. y[4*m4+i] = beta*y[i] + alpha*Mdot(4*n4+n,&(A[4*m4+i][j0]),x);
  273. }
  274. /* Mvm -- y <- alpha.A^T.x + beta.y */
  275. void Mvm(m,n,alpha,A,j0,x,beta,y)
  276. int m, n, j0;
  277. double alpha, beta;
  278. Real **A, *x, *y;
  279. {
  280. register int i, j, m4, n2;
  281. register Real *Aref;
  282. register Real tmp;
  283. #ifdef REGISTER_RICH
  284. register Real *Aref0, *Aref1;
  285. register Real tmp0, tmp1;
  286. register Real yval0, yval1, yval2, yval3;
  287. #endif
  288. if ( beta != 1.0 )
  289. Mscale(m,beta,y);
  290. /****************************************
  291. for ( j = 0; j < n; j++ )
  292. Maxpy(m,alpha*x[j],&(A[j][j0]),y);
  293. ****************************************/
  294. m4 = n2 = 0;
  295. m4 = m / 4;
  296. m = m % 4;
  297. #ifdef REGISTER_RICH
  298. n2 = n / 2;
  299. n = n % 2;
  300. for ( j = 0; j < n2; j++ )
  301. {
  302. tmp0 = alpha*x[2*j];
  303. tmp1 = alpha*x[2*j+1];
  304. Aref0 = &(A[2*j ][j0]);
  305. Aref1 = &(A[2*j+1][j0]);
  306. for ( i = 0; i < m4; i++ )
  307. {
  308. yval0 = y[4*i ] + tmp0*Aref0[4*i ];
  309. yval1 = y[4*i+1] + tmp0*Aref0[4*i+1];
  310. yval2 = y[4*i+2] + tmp0*Aref0[4*i+2];
  311. yval3 = y[4*i+3] + tmp0*Aref0[4*i+3];
  312. y[4*i ] = yval0 + tmp1*Aref1[4*i ];
  313. y[4*i+1] = yval1 + tmp1*Aref1[4*i+1];
  314. y[4*i+2] = yval2 + tmp1*Aref1[4*i+2];
  315. y[4*i+3] = yval3 + tmp1*Aref1[4*i+3];
  316. }
  317. y += 4*m4; Aref0 += 4*m4; Aref1 += 4*m4;
  318. for ( i = 0; i < m; i++ )
  319. y[i] += tmp0*Aref0[i] + tmp1*Aref1[i];
  320. }
  321. #endif
  322. for ( j = 0; j < n; j++ )
  323. {
  324. tmp = alpha*x[2*n2+j];
  325. Aref = &(A[2*n2+j][j0]);
  326. for ( i = 0; i < m4; i++ )
  327. {
  328. y[4*i ] += tmp*Aref[4*i ];
  329. y[4*i+1] += tmp*Aref[4*i+1];
  330. y[4*i+2] += tmp*Aref[4*i+2];
  331. y[4*i+3] += tmp*Aref[4*i+3];
  332. }
  333. y += 4*m4; Aref += 4*m4;
  334. for ( i = 0; i < m; i++ )
  335. y[i] += tmp*Aref[i];
  336. }
  337. }
  338. /* Mupdate -- A <- A + alpha.x.y^T */
  339. void Mupdate(m,n,alpha,x,y,A,j0)
  340. int m, n, j0;
  341. double alpha;
  342. Real **A, *x, *y;
  343. {
  344. register int i, j, n4;
  345. register Real *Aref;
  346. register Real tmp;
  347. /****************************************
  348. for ( i = 0; i < m; i++ )
  349. Maxpy(n,alpha*x[i],y,&(A[i][j0]));
  350. ****************************************/
  351. n4 = n / 4;
  352. n = n % 4;
  353. for ( i = 0; i < m; i++ )
  354. {
  355. tmp = alpha*x[i];
  356. Aref = &(A[i][j0]);
  357. for ( j = 0; j < n4; j++ )
  358. {
  359. Aref[4*j ] += tmp*y[4*j ];
  360. Aref[4*j+1] += tmp*y[4*j+1];
  361. Aref[4*j+2] += tmp*y[4*j+2];
  362. Aref[4*j+3] += tmp*y[4*j+3];
  363. }
  364. Aref += 4*n4; y += 4*n4;
  365. for ( j = 0; j < n; j++ )
  366. Aref[j] += tmp*y[j];
  367. }
  368. }
  369. /* mblar-3 routines */
  370. /* Mmm -- C <- C + alpha.A.B */
  371. void Mmm(m,n,p,alpha,A,Aj0,B,Bj0,C,Cj0)
  372. int m, n, p; /* C is m x n */
  373. double alpha;
  374. Real **A, **B, **C;
  375. int Aj0, Bj0, Cj0;
  376. {
  377. register int i, j, k;
  378. /* register Real tmp, sum; */
  379. /****************************************
  380. for ( i = 0; i < m; i++ )
  381. for ( k = 0; k < p; k++ )
  382. Maxpy(n,alpha*A[i][Aj0+k],&(B[k][Bj0]),&(C[i][Cj0]));
  383. ****************************************/
  384. for ( i = 0; i < m; i++ )
  385. Mvm(p,n,alpha,B,Bj0,&(A[i][Aj0]),1.0,&(C[i][Cj0]));
  386. }
  387. /* Mmtrm -- C <- C + alpha.A^T.B */
  388. void Mmtrm(m,n,p,alpha,A,Aj0,B,Bj0,C,Cj0)
  389. int m, n, p; /* C is m x n */
  390. double alpha;
  391. Real **A, **B, **C;
  392. int Aj0, Bj0, Cj0;
  393. {
  394. register int i, j, k;
  395. /****************************************
  396. for ( i = 0; i < m; i++ )
  397. for ( k = 0; k < p; k++ )
  398. Maxpy(n,alpha*A[k][Aj0+i],&(B[k][Bj0]),&(C[i][Cj0]));
  399. ****************************************/
  400. for ( k = 0; k < p; k++ )
  401. Mupdate(m,n,alpha,&(A[k][Aj0]),&(B[k][Bj0]),C,Cj0);
  402. }
  403. /* Mmmtr -- C <- C + alpha.A.B^T */
  404. void Mmmtr(m,n,p,alpha,A,Aj0,B,Bj0,C,Cj0)
  405. int m, n, p; /* C is m x n */
  406. double alpha;
  407. Real **A, **B, **C;
  408. int Aj0, Bj0, Cj0;
  409. {
  410. register int i, j, k;
  411. /****************************************
  412. for ( i = 0; i < m; i++ )
  413. for ( j = 0; j < n; j++ )
  414. C[i][Cj0+j] += alpha*Mdot(p,&(A[i][Aj0]),&(B[j][Bj0]));
  415. ****************************************/
  416. for ( i = 0; i < m; i++ )
  417. Mmv(n,p,alpha,B,Bj0,&(A[i][Aj0]),1.0,&(C[i][Cj0]));
  418. }
  419. /* Mmtrmtr -- C <- C + alpha.A^T.B^T */
  420. void Mmtrmtr(m,n,p,alpha,A,Aj0,B,Bj0,C,Cj0)
  421. int m, n, p; /* C is m x n */
  422. double alpha;
  423. Real **A, **B, **C;
  424. int Aj0, Bj0, Cj0;
  425. {
  426. register int i, j, k;
  427. for ( i = 0; i < m; i++ )
  428. for ( j = 0; j < n; j++ )
  429. for ( k = 0; k < p; k++ )
  430. C[i][Cj0+j] += A[i][Aj0+k]*B[k][Bj0+j];
  431. }