File indexing completed on 2024-12-22 04:09:37

0001 /////////////////////////////////////////////////////////////////////////////
0002 //  einspline:  a library for creating and evaluating B-splines            //
0003 //  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
0004 //                                                                         //
0005 //  This program is free software; you can redistribute it and/or modify   //
0006 //  it under the terms of the GNU General Public License as published by   //
0007 //  the Free Software Foundation; either version 2 of the License, or      //
0008 //  (at your option) any later version.                                    //
0009 //                                                                         //
0010 //  This program is distributed in the hope that it will be useful,        //
0011 //  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
0012 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
0013 //  GNU General Public License for more details.                           //
0014 //                                                                         //
0015 //  You should have received a copy of the GNU General Public License      //
0016 //  along with this program; if not, write to the Free Software            //
0017 //  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
0018 //  Boston, MA  02110-1301  USA                                            //
0019 /////////////////////////////////////////////////////////////////////////////
0020 
0021 //#include "config.h"
0022 
0023 #include "local_definitions.h"
0024 
0025 /*****************/
0026 /*   SSE Data    */
0027 /*****************/
0028 
0029 #ifdef _XOPEN_SOURCE
0030 #undef _XOPEN_SOURCE
0031 #endif 
0032 
0033 #define _XOPEN_SOURCE 600
0034 
0035 #ifndef __USE_XOPEN2K
0036   #define __USE_XOPEN2K
0037 #endif
0038 #include <stdlib.h>
0039 
0040 #ifdef HAVE_SSE
0041 #include <xmmintrin.h>
0042 
0043 // Single-precision version of matrices
0044 __m128 *restrict A_s = (__m128 *)0;
0045 // There is a problem with alignment of global variables in shared
0046 // libraries on 32-bit machines.
0047 // __m128  A0, A1, A2, A3, dA0, dA1, dA2, dA3, d2A0, d2A1, d2A2, d2A3;
0048 #endif
0049 
0050 #ifdef HAVE_SSE2
0051 // Double-precision version of matrices
0052 #include <emmintrin.h>
0053 __m128d *restrict A_d = (__m128d *)0;
0054 
0055 // There is a problem with alignment of global variables in shared
0056 // libraries on 32-bit machines.
0057 //__m128d A0_01, A0_23, A1_01, A1_23, A2_01, A2_23, A3_01, A3_23,
0058 //  dA0_01, dA0_23, dA1_01, dA1_23, dA2_01, dA2_23, dA3_01, dA3_23,
0059 //  d2A0_01, d2A0_23, d2A1_01, d2A1_23, d2A2_01, d2A2_23, d2A3_01, d2A3_23;
0060 #endif 
0061 
0062 void init_sse_data()
0063 {
0064 #ifdef HAVE_SSE
0065   if (A_s == 0) {
0066     posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12));
0067     A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
0068     A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );      
0069     A_s[1]  = _mm_setr_ps ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );      
0070     A_s[2]  = _mm_setr_ps ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );      
0071     A_s[3]  = _mm_setr_ps ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );      
0072     A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5, 0.0  );         
0073     A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5, 0.0  );         
0074     A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5, 0.0  );         
0075     A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5, 0.0  );         
0076     A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0, 0.0  );         
0077     A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0, 0.0  );         
0078     A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0, 0.0  );         
0079     A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0, 0.0  );                  
0080   }
0081                  
0082 #endif
0083 #ifdef HAVE_SSE2
0084   if (A_d == 0) {
0085     posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24));
0086     A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );      
0087     A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );      
0088     A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );      
0089     A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );      
0090     A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );      
0091     A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );      
0092     A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );      
0093     A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );      
0094     A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );          
0095     A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );          
0096     A_d[10] = _mm_setr_pd (  1.5,  0.0 );          
0097     A_d[11] = _mm_setr_pd (  0.0, -2.0 );          
0098     A_d[12] = _mm_setr_pd ( -1.5,  0.0 );          
0099     A_d[13] = _mm_setr_pd (  0.5,  1.0 );          
0100     A_d[14] = _mm_setr_pd (  0.5,  0.0 );          
0101     A_d[15] = _mm_setr_pd (  0.0,  0.0 );          
0102     A_d[16] = _mm_setr_pd (  0.0,  0.0 );          
0103     A_d[17] = _mm_setr_pd (  1.0, -1.0 );          
0104     A_d[18] = _mm_setr_pd (  0.0,  0.0 );          
0105     A_d[19] = _mm_setr_pd ( -2.0,  3.0 );          
0106     A_d[20] = _mm_setr_pd (  0.0,  0.0 );          
0107     A_d[21] = _mm_setr_pd (  1.0, -3.0 );          
0108     A_d[22] = _mm_setr_pd (  0.0,  0.0 );          
0109     A_d[23] = _mm_setr_pd (  0.0,  1.0 );   
0110   }                
0111 #endif
0112 }
0113 
0114 
0115 #ifdef USE_ALTIVEC
0116 vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0);
0117 vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0);
0118 vector float A2   = (vector float) ( -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0);
0119 vector float A3   = (vector float) (  1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0);
0120 /* vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0); */
0121 /* vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  3.0/6.0, 0.0/6.0); */
0122 /* vector float A2   = (vector float) ( -3.0/6.0,  0.0/6.0,  3.0/6.0, 0.0/6.0); */
0123 /* vector float A3   = (vector float) (  1.0/6.0,  4.0/6.0,  1.0/6.0, 0.0/6.0); */
0124 /* vector float A0   = (vector float) ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0); */
0125 /* vector float A1   = (vector float) ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0); */
0126 /* vector float A2   = (vector float) ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0); */
0127 /* vector float A3   = (vector float) ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0); */
0128 vector float dA0  = (vector float) ( 0.0, -0.5,  1.0, -0.5 );
0129 vector float dA1  = (vector float) ( 0.0,  1.5, -2.0,  0.0 );
0130 vector float dA2  = (vector float) ( 0.0, -1.5,  1.0,  0.5 );
0131 vector float dA3  = (vector float) ( 0.0,  0.5,  0.0,  0.0 );
0132 vector float d2A0 = (vector float) ( 0.0,  0.0, -1.0,  1.0 );
0133 vector float d2A1 = (vector float) ( 0.0,  0.0,  3.0, -2.0 );
0134 vector float d2A2 = (vector float) ( 0.0,  0.0, -3.0,  1.0 );
0135 vector float d2A3 = (vector float) ( 0.0,  0.0,  1.0,  0.0 );
0136 #endif
0137 
0138 /*****************/
0139 /* Standard Data */
0140 /*****************/
0141 
0142 //////////////////////
0143 // Single precision //
0144 //////////////////////
0145 const float A44f[16] = 
0146   { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
0147      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
0148     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
0149      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
0150 const float* restrict Af = A44f;
0151 
0152 const float dA44f[16] =
0153   {  0.0, -0.5,  1.0, -0.5,
0154      0.0,  1.5, -2.0,  0.0,
0155      0.0, -1.5,  1.0,  0.5,
0156      0.0,  0.5,  0.0,  0.0 };
0157 const float* restrict dAf = dA44f;
0158 
0159 const float d2A44f[16] = 
0160   {  0.0, 0.0, -1.0,  1.0,
0161      0.0, 0.0,  3.0, -2.0,
0162      0.0, 0.0, -3.0,  1.0,
0163      0.0, 0.0,  1.0,  0.0 };
0164 const float* restrict d2Af = d2A44f;
0165 
0166 const float d3A44f[16] =
0167   {  0.0, 0.0,  0.0, -1.0,
0168      0.0, 0.0,  0.0,  3.0,
0169      0.0, 0.0,  0.0, -3.0,
0170      0.0, 0.0,  0.0,  1.0};
0171 const float* restrict d3Af = d3A44f;
0172 
0173 //////////////////////
0174 // Double precision //
0175 //////////////////////
0176 const double A44d[16] = 
0177   { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
0178      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
0179     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
0180      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
0181 const double* restrict Ad = A44d;
0182 
0183 const double dA44d[16] =
0184   {  0.0, -0.5,  1.0, -0.5,
0185      0.0,  1.5, -2.0,  0.0,
0186      0.0, -1.5,  1.0,  0.5,
0187      0.0,  0.5,  0.0,  0.0 };
0188 const double* restrict dAd = dA44d;
0189 
0190 const double d2A44d[16] = 
0191   {  0.0, 0.0, -1.0,  1.0,
0192      0.0, 0.0,  3.0, -2.0,
0193      0.0, 0.0, -3.0,  1.0,
0194      0.0, 0.0,  1.0,  0.0 };
0195 const double* restrict d2Ad = d2A44d;
0196 
0197 const double d3A44d[16] =
0198   {  0.0, 0.0,  0.0, -1.0,
0199      0.0, 0.0,  0.0,  3.0,
0200      0.0, 0.0,  0.0, -3.0,
0201      0.0, 0.0,  0.0,  1.0};
0202 const double* restrict d3Ad = d3A44d;