machine.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. /**************************************************************************
  2. **
  3. ** Copyright (C) 1993 David E. Stewart & Zbigniew Leyk, all rights reserved.
  4. **
  5. ** Meschach Library
  6. **
  7. ** This Meschach Library is provided "as is" without any express
  8. ** or implied warranty of any kind with respect to this software.
  9. ** In particular the authors shall not be liable for any direct,
  10. ** indirect, special, incidental or consequential damages arising
  11. ** in any way from use of the software.
  12. **
  13. ** Everyone is granted permission to copy, modify and redistribute this
  14. ** Meschach Library, provided:
  15. ** 1. All copies contain this copyright notice.
  16. ** 2. All modified copies shall carry a notice stating who
  17. ** made the last modification and the date of such modification.
  18. ** 3. No charge is made for this software or works derived from it.
  19. ** This clause shall not be construed as constraining other software
  20. ** distributed on the same medium as this software, nor is a
  21. ** distribution fee considered a charge.
  22. **
  23. ***************************************************************************/
  24. /*
  25. This file contains basic routines which are used by the functions
  26. in matrix.a etc.
  27. These are the routines that should be modified in order to take
  28. full advantage of specialised architectures (pipelining, vector
  29. processors etc).
  30. */
  31. static char *rcsid = "$Header: /usr/local/home/des/meschach/meschach/RCS/machine.c,v 1.3 1991/08/29 06:42:11 des Exp $";
  32. #include "machine.h"
  33. /* __ip__ -- inner product */
  34. double __ip__(dp1,dp2,len)
  35. register double *dp1, *dp2;
  36. int len;
  37. {
  38. register int len4;
  39. register int i;
  40. register double sum0, sum1, sum2, sum3;
  41. sum0 = sum1 = sum2 = sum3 = 0.0;
  42. len4 = len / 4;
  43. len = len % 4;
  44. for ( i = 0; i < len4; i++ )
  45. {
  46. sum0 += dp1[4*i]*dp2[4*i];
  47. sum1 += dp1[4*i+1]*dp2[4*i+1];
  48. sum2 += dp1[4*i+2]*dp2[4*i+2];
  49. sum3 += dp1[4*i+3]*dp2[4*i+3];
  50. }
  51. sum0 += sum1 + sum2 + sum3;
  52. dp1 += 4*len4; dp2 += 4*len4;
  53. for ( i = 0; i < len; i++ )
  54. sum0 += (*dp1++)*(*dp2++);
  55. return sum0;
  56. }
  57. /* __mltadd__ -- scalar multiply and add c.f. v_mltadd() */
  58. void __mltadd__(dp1,dp2,s,len)
  59. register double *dp1, *dp2, s;
  60. register int len;
  61. {
  62. register int i, len4;
  63. len4 = len / 4;
  64. len = len % 4;
  65. for ( i = 0; i < len4; i++ )
  66. {
  67. dp1[4*i] += s*dp2[4*i];
  68. dp1[4*i+1] += s*dp2[4*i+1];
  69. dp1[4*i+2] += s*dp2[4*i+2];
  70. dp1[4*i+3] += s*dp2[4*i+3];
  71. }
  72. dp1 += 4*len4; dp2 += 4*len4;
  73. for ( i = 0; i < len; i++ )
  74. (*dp1++) += s*(*dp2++);
  75. }
  76. /* __smlt__ scalar multiply array c.f. sv_mlt() */
  77. void __smlt__(dp,s,out,len)
  78. register double *dp, s, *out;
  79. register int len;
  80. {
  81. register int i;
  82. for ( i = 0; i < len; i++ )
  83. (*out++) = s*(*dp++);
  84. }
  85. /* __add__ -- add arrays c.f. v_add() */
  86. void __add__(dp1,dp2,out,len)
  87. register double *dp1, *dp2, *out;
  88. register int len;
  89. {
  90. register int i;
  91. for ( i = 0; i < len; i++ )
  92. (*out++) = (*dp1++) + (*dp2++);
  93. }
  94. /* __sub__ -- subtract arrays c.f. v_sub() */
  95. void __sub__(dp1,dp2,out,len)
  96. register double *dp1, *dp2, *out;
  97. register int len;
  98. {
  99. register int i;
  100. for ( i = 0; i < len; i++ )
  101. (*out++) = (*dp1++) - (*dp2++);
  102. }
  103. /* __zero__ -- zeros an array of double precision numbers */
  104. void __zero__(dp,len)
  105. register double *dp;
  106. register int len;
  107. {
  108. /* if a double precision zero is equivalent to a string of nulls */
  109. MEM_ZERO((char *)dp,len*sizeof(double));
  110. /* else, need to zero the array entry by entry */
  111. /*************************************************
  112. while ( len-- )
  113. *dp++ = 0.0;
  114. *************************************************/
  115. }
  116. /***********************************************************************
  117. ****** Faster versions ********
  118. ***********************************************************************/
  119. /* __ip4__ -- compute 4 inner products in one go */
  120. void __ip4__(v0,v1,v2,v3,w,out,len)
  121. double *v0, *v1, *v2, *v3, *w;
  122. double out[4];
  123. int len;
  124. {
  125. register int i, len2;
  126. register double sum00, sum10, sum20, sum30, w_val0;
  127. register double sum01, sum11, sum21, sum31, w_val1;
  128. len2 = len / 2;
  129. len = len % 2;
  130. sum00 = sum10 = sum20 = sum30 = 0.0;
  131. sum01 = sum11 = sum21 = sum31 = 0.0;
  132. for ( i = 0; i < len2; i++ )
  133. {
  134. w_val0 = w[2*i];
  135. w_val1 = w[2*i+1];
  136. sum00 += v0[2*i] *w_val0;
  137. sum01 += v0[2*i+1]*w_val1;
  138. sum10 += v1[2*i] *w_val0;
  139. sum11 += v1[2*i+1]*w_val1;
  140. sum20 += v2[2*i] *w_val0;
  141. sum21 += v2[2*i+1]*w_val1;
  142. sum30 += v3[2*i] *w_val0;
  143. sum31 += v3[2*i+1]*w_val1;
  144. }
  145. w += 2*len2;
  146. v0 += 2*len2;
  147. v1 += 2*len2;
  148. v2 += 2*len2;
  149. v3 += 2*len2;
  150. for ( i = 0; i < len; i++ )
  151. {
  152. w_val0 = w[i];
  153. sum00 += v0[i]*w_val0;
  154. sum10 += v1[i]*w_val0;
  155. sum20 += v2[i]*w_val0;
  156. sum30 += v3[i]*w_val0;
  157. }
  158. out[0] = sum00 + sum01;
  159. out[1] = sum10 + sum11;
  160. out[2] = sum20 + sum21;
  161. out[3] = sum30 + sum31;
  162. }
  163. /* __lc4__ -- linear combinations: w <- w+a[0]*v0+ ... + a[3]*v3 */
  164. void __lc4__(v0,v1,v2,v3,w,a,len)
  165. double *v0, *v1, *v2, *v3, *w;
  166. double a[4];
  167. int len;
  168. {
  169. register int i, len2;
  170. register double a0, a1, a2, a3, tmp0, tmp1;
  171. len2 = len / 2;
  172. len = len % 2;
  173. a0 = a[0]; a1 = a[1];
  174. a2 = a[2]; a3 = a[3];
  175. for ( i = 0; i < len2; i++ )
  176. {
  177. tmp0 = w[2*i] + a0*v0[2*i];
  178. tmp1 = w[2*i+1] + a0*v0[2*i+1];
  179. tmp0 += a1*v1[2*i];
  180. tmp1 += a1*v1[2*i+1];
  181. tmp0 += a2*v2[2*i];
  182. tmp1 += a2*v2[2*i+1];
  183. tmp0 += a3*v3[2*i];
  184. tmp1 += a3*v3[2*i+1];
  185. w[2*i] = tmp0;
  186. w[2*i+1] = tmp1;
  187. }
  188. w += 2*len2;
  189. v0 += 2*len2;
  190. v1 += 2*len2;
  191. v2 += 2*len2;
  192. v3 += 2*len2;
  193. for ( i = 0; i < len; i++ )
  194. w[i] += a0*v0[i] + a1*v1[i] + a2*v2[i] + a3*v3[i];
  195. }
  196. /* __ma4__ -- multiply and add with 4 vectors: vi <- vi + ai*w */
  197. void __ma4__(v0,v1,v2,v3,w,a,len)
  198. double *v0, *v1, *v2, *v3, *w;
  199. double a[4];
  200. int len;
  201. {
  202. register int i;
  203. register double a0, a1, a2, a3, w0, w1, w2, w3;
  204. a0 = a[0]; a1 = a[1];
  205. a2 = a[2]; a3 = a[3];
  206. for ( i = 0; i < len; i++ )
  207. {
  208. w0 = w[i];
  209. v0[i] += a0*w0;
  210. v1[i] += a1*w0;
  211. v2[i] += a2*w0;
  212. v3[i] += a3*w0;
  213. }
  214. }