File indexing completed on 2024-12-22 04:09:37
0001 ///////////////////////////////////////////////////////////////////////////// 0002 // einspline: a library for creating and evaluating B-splines // 0003 // Copyright (C) 2007 Kenneth P. Esler, Jr. // 0004 // // 0005 // This program is free software; you can redistribute it and/or modify // 0006 // it under the terms of the GNU General Public License as published by // 0007 // the Free Software Foundation; either version 2 of the License, or // 0008 // (at your option) any later version. // 0009 // // 0010 // This program is distributed in the hope that it will be useful, // 0011 // but WITHOUT ANY WARRANTY; without even the implied warranty of // 0012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // 0013 // GNU General Public License for more details. // 0014 // // 0015 // You should have received a copy of the GNU General Public License // 0016 // along with this program; if not, write to the Free Software // 0017 // Foundation, Inc., 51 Franklin Street, Fifth Floor, // 0018 // Boston, MA 02110-1301 USA // 0019 ///////////////////////////////////////////////////////////////////////////// 0020 0021 //#include "config.h" 0022 0023 #include "local_definitions.h" 0024 0025 /*****************/ 0026 /* SSE Data */ 0027 /*****************/ 0028 0029 #ifdef _XOPEN_SOURCE 0030 #undef _XOPEN_SOURCE 0031 #endif 0032 0033 #define _XOPEN_SOURCE 600 0034 0035 #ifndef __USE_XOPEN2K 0036 #define __USE_XOPEN2K 0037 #endif 0038 #include <stdlib.h> 0039 0040 #ifdef HAVE_SSE 0041 #include <xmmintrin.h> 0042 0043 // Single-precision version of matrices 0044 __m128 *restrict A_s = (__m128 *)0; 0045 // There is a problem with alignment of global variables in shared 0046 // libraries on 32-bit machines. 0047 // __m128 A0, A1, A2, A3, dA0, dA1, dA2, dA3, d2A0, d2A1, d2A2, d2A3; 0048 #endif 0049 0050 #ifdef HAVE_SSE2 0051 // Double-precision version of matrices 0052 #include <emmintrin.h> 0053 __m128d *restrict A_d = (__m128d *)0; 0054 0055 // There is a problem with alignment of global variables in shared 0056 // libraries on 32-bit machines. 0057 //__m128d A0_01, A0_23, A1_01, A1_23, A2_01, A2_23, A3_01, A3_23, 0058 // dA0_01, dA0_23, dA1_01, dA1_23, dA2_01, dA2_23, dA3_01, dA3_23, 0059 // d2A0_01, d2A0_23, d2A1_01, d2A1_23, d2A2_01, d2A2_23, d2A3_01, d2A3_23; 0060 #endif 0061 0062 void init_sse_data() 0063 { 0064 #ifdef HAVE_SSE 0065 if (A_s == 0) { 0066 posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12)); 0067 A_s[0] = _mm_setr_ps ( 1.0/6.0, -3.0/6.0, 3.0/6.0, -1.0/6.0 ); 0068 A_s[0] = _mm_setr_ps ( 1.0/6.0, -3.0/6.0, 3.0/6.0, -1.0/6.0 ); 0069 A_s[1] = _mm_setr_ps ( 4.0/6.0, 0.0/6.0, -6.0/6.0, 3.0/6.0 ); 0070 A_s[2] = _mm_setr_ps ( 1.0/6.0, 3.0/6.0, 3.0/6.0, -3.0/6.0 ); 0071 A_s[3] = _mm_setr_ps ( 0.0/6.0, 0.0/6.0, 0.0/6.0, 1.0/6.0 ); 0072 A_s[4] = _mm_setr_ps ( -0.5, 1.0, -0.5, 0.0 ); 0073 A_s[5] = _mm_setr_ps ( 0.0, -2.0, 1.5, 0.0 ); 0074 A_s[6] = _mm_setr_ps ( 0.5, 1.0, -1.5, 0.0 ); 0075 A_s[7] = _mm_setr_ps ( 0.0, 0.0, 0.5, 0.0 ); 0076 A_s[8] = _mm_setr_ps ( 1.0, -1.0, 0.0, 0.0 ); 0077 A_s[9] = _mm_setr_ps ( -2.0, 3.0, 0.0, 0.0 ); 0078 A_s[10] = _mm_setr_ps ( 1.0, -3.0, 0.0, 0.0 ); 0079 A_s[11] = _mm_setr_ps ( 0.0, 1.0, 0.0, 0.0 ); 0080 } 0081 0082 #endif 0083 #ifdef HAVE_SSE2 0084 if (A_d == 0) { 0085 posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24)); 0086 A_d[ 0] = _mm_setr_pd ( 3.0/6.0, -1.0/6.0 ); 0087 A_d[ 1] = _mm_setr_pd ( 1.0/6.0, -3.0/6.0 ); 0088 A_d[ 2] = _mm_setr_pd ( -6.0/6.0, 3.0/6.0 ); 0089 A_d[ 3] = _mm_setr_pd ( 4.0/6.0, 0.0/6.0 ); 0090 A_d[ 4] = _mm_setr_pd ( 3.0/6.0, -3.0/6.0 ); 0091 A_d[ 5] = _mm_setr_pd ( 1.0/6.0, 3.0/6.0 ); 0092 A_d[ 6] = _mm_setr_pd ( 0.0/6.0, 1.0/6.0 ); 0093 A_d[ 7] = _mm_setr_pd ( 0.0/6.0, 0.0/6.0 ); 0094 A_d[ 8] = _mm_setr_pd ( -0.5, 0.0 ); 0095 A_d[ 9] = _mm_setr_pd ( -0.5, 1.0 ); 0096 A_d[10] = _mm_setr_pd ( 1.5, 0.0 ); 0097 A_d[11] = _mm_setr_pd ( 0.0, -2.0 ); 0098 A_d[12] = _mm_setr_pd ( -1.5, 0.0 ); 0099 A_d[13] = _mm_setr_pd ( 0.5, 1.0 ); 0100 A_d[14] = _mm_setr_pd ( 0.5, 0.0 ); 0101 A_d[15] = _mm_setr_pd ( 0.0, 0.0 ); 0102 A_d[16] = _mm_setr_pd ( 0.0, 0.0 ); 0103 A_d[17] = _mm_setr_pd ( 1.0, -1.0 ); 0104 A_d[18] = _mm_setr_pd ( 0.0, 0.0 ); 0105 A_d[19] = _mm_setr_pd ( -2.0, 3.0 ); 0106 A_d[20] = _mm_setr_pd ( 0.0, 0.0 ); 0107 A_d[21] = _mm_setr_pd ( 1.0, -3.0 ); 0108 A_d[22] = _mm_setr_pd ( 0.0, 0.0 ); 0109 A_d[23] = _mm_setr_pd ( 0.0, 1.0 ); 0110 } 0111 #endif 0112 } 0113 0114 0115 #ifdef USE_ALTIVEC 0116 vector float A0 = (vector float) ( -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0); 0117 vector float A1 = (vector float) ( 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0); 0118 vector float A2 = (vector float) ( -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0); 0119 vector float A3 = (vector float) ( 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0); 0120 /* vector float A0 = (vector float) ( -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0); */ 0121 /* vector float A1 = (vector float) ( 3.0/6.0, -6.0/6.0, 3.0/6.0, 0.0/6.0); */ 0122 /* vector float A2 = (vector float) ( -3.0/6.0, 0.0/6.0, 3.0/6.0, 0.0/6.0); */ 0123 /* vector float A3 = (vector float) ( 1.0/6.0, 4.0/6.0, 1.0/6.0, 0.0/6.0); */ 0124 /* vector float A0 = (vector float) ( 1.0/6.0, -3.0/6.0, 3.0/6.0, -1.0/6.0); */ 0125 /* vector float A1 = (vector float) ( 4.0/6.0, 0.0/6.0, -6.0/6.0, 3.0/6.0); */ 0126 /* vector float A2 = (vector float) ( 1.0/6.0, 3.0/6.0, 3.0/6.0, -3.0/6.0); */ 0127 /* vector float A3 = (vector float) ( 0.0/6.0, 0.0/6.0, 0.0/6.0, 1.0/6.0); */ 0128 vector float dA0 = (vector float) ( 0.0, -0.5, 1.0, -0.5 ); 0129 vector float dA1 = (vector float) ( 0.0, 1.5, -2.0, 0.0 ); 0130 vector float dA2 = (vector float) ( 0.0, -1.5, 1.0, 0.5 ); 0131 vector float dA3 = (vector float) ( 0.0, 0.5, 0.0, 0.0 ); 0132 vector float d2A0 = (vector float) ( 0.0, 0.0, -1.0, 1.0 ); 0133 vector float d2A1 = (vector float) ( 0.0, 0.0, 3.0, -2.0 ); 0134 vector float d2A2 = (vector float) ( 0.0, 0.0, -3.0, 1.0 ); 0135 vector float d2A3 = (vector float) ( 0.0, 0.0, 1.0, 0.0 ); 0136 #endif 0137 0138 /*****************/ 0139 /* Standard Data */ 0140 /*****************/ 0141 0142 ////////////////////// 0143 // Single precision // 0144 ////////////////////// 0145 const float A44f[16] = 0146 { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0, 0147 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0, 0148 -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0, 0149 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0 }; 0150 const float* restrict Af = A44f; 0151 0152 const float dA44f[16] = 0153 { 0.0, -0.5, 1.0, -0.5, 0154 0.0, 1.5, -2.0, 0.0, 0155 0.0, -1.5, 1.0, 0.5, 0156 0.0, 0.5, 0.0, 0.0 }; 0157 const float* restrict dAf = dA44f; 0158 0159 const float d2A44f[16] = 0160 { 0.0, 0.0, -1.0, 1.0, 0161 0.0, 0.0, 3.0, -2.0, 0162 0.0, 0.0, -3.0, 1.0, 0163 0.0, 0.0, 1.0, 0.0 }; 0164 const float* restrict d2Af = d2A44f; 0165 0166 const float d3A44f[16] = 0167 { 0.0, 0.0, 0.0, -1.0, 0168 0.0, 0.0, 0.0, 3.0, 0169 0.0, 0.0, 0.0, -3.0, 0170 0.0, 0.0, 0.0, 1.0}; 0171 const float* restrict d3Af = d3A44f; 0172 0173 ////////////////////// 0174 // Double precision // 0175 ////////////////////// 0176 const double A44d[16] = 0177 { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0, 0178 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0, 0179 -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0, 0180 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0 }; 0181 const double* restrict Ad = A44d; 0182 0183 const double dA44d[16] = 0184 { 0.0, -0.5, 1.0, -0.5, 0185 0.0, 1.5, -2.0, 0.0, 0186 0.0, -1.5, 1.0, 0.5, 0187 0.0, 0.5, 0.0, 0.0 }; 0188 const double* restrict dAd = dA44d; 0189 0190 const double d2A44d[16] = 0191 { 0.0, 0.0, -1.0, 1.0, 0192 0.0, 0.0, 3.0, -2.0, 0193 0.0, 0.0, -3.0, 1.0, 0194 0.0, 0.0, 1.0, 0.0 }; 0195 const double* restrict d2Ad = d2A44d; 0196 0197 const double d3A44d[16] = 0198 { 0.0, 0.0, 0.0, -1.0, 0199 0.0, 0.0, 0.0, 3.0, 0200 0.0, 0.0, 0.0, -3.0, 0201 0.0, 0.0, 0.0, 1.0}; 0202 const double* restrict d3Ad = d3A44d;