File indexing completed on 2024-05-12 15:27:10
0001 /*************************************************************************** 0002 File : nsl_stats.c 0003 Project : LabPlot 0004 Description : NSL statistics functions 0005 -------------------------------------------------------------------- 0006 Copyright : (C) 2016-2017 by Stefan Gerlach (stefan.gerlach@uni.kn) 0007 0008 ***************************************************************************/ 0009 0010 /*************************************************************************** 0011 * * 0012 * This program is free software; you can redistribute it and/or modify * 0013 * it under the terms of the GNU General Public License as published by * 0014 * the Free Software Foundation; either version 2 of the License, or * 0015 * (at your option) any later version. * 0016 * * 0017 * This program is distributed in the hope that it will be useful, * 0018 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0020 * GNU General Public License for more details. * 0021 * * 0022 * You should have received a copy of the GNU General Public License * 0023 * along with this program; if not, write to the Free Software * 0024 * Foundation, Inc., 51 Franklin Street, Fifth Floor, * 0025 * Boston, MA 02110-1301 USA * 0026 * * 0027 ***************************************************************************/ 0028 0029 #include "nsl_stats.h" 0030 #include <math.h> 0031 #include <float.h> 0032 #include <gsl/gsl_sort.h> 0033 #include <gsl/gsl_cdf.h> 0034 0035 double nsl_stats_minimum(const double data[], const size_t n, size_t *index) { 0036 size_t i; 0037 0038 double min = data[0]; 0039 if (index != NULL) 0040 *index = 0; 0041 0042 for (i = 1; i < n; i++) { 0043 if (data[i] < min) { 0044 min = data[i]; 0045 if (index != NULL) 0046 *index = i; 0047 } 0048 } 0049 0050 return min; 0051 } 0052 0053 double nsl_stats_maximum(const double data[], const size_t n, size_t *index) { 0054 size_t i; 0055 0056 double max = data[0]; 0057 if (index != NULL) 0058 *index = 0; 0059 0060 for (i = 1; i < n; i++) { 0061 if (data[i] > max) { 0062 max = data[i]; 0063 if (index != NULL) 0064 *index = i; 0065 } 0066 } 0067 0068 return max; 0069 } 0070 0071 double nsl_stats_median(double data[], size_t stride, size_t n, nsl_stats_quantile_type type) { 0072 gsl_sort(data, stride, n); 0073 return nsl_stats_median_sorted(data, stride, n, type); 0074 } 0075 0076 double nsl_stats_median_sorted(const double sorted_data[], size_t stride, size_t n, nsl_stats_quantile_type type) { 0077 return nsl_stats_quantile_sorted(sorted_data, stride, n, 0.5, type); 0078 } 0079 0080 double nsl_stats_median_from_sorted_data(const double sorted_data[], size_t stride, size_t n) { 0081 // default method is number 7 0082 return nsl_stats_median_sorted(sorted_data, stride, n, nsl_stats_quantile_type7); 0083 } 0084 0085 double nsl_stats_quantile(double data[], size_t stride, size_t n, double p, nsl_stats_quantile_type type) { 0086 gsl_sort(data, stride, n); 0087 return nsl_stats_quantile_sorted(data, stride, n, p, type); 0088 } 0089 0090 double nsl_stats_quantile_sorted(const double d[], size_t stride, size_t n, double p, nsl_stats_quantile_type type) { 0091 0092 switch(type) { 0093 case nsl_stats_quantile_type1: // h = Np + 1/2, x[ceil(h – 1/2)] 0094 if (p == 0.0) 0095 return d[0]; 0096 else 0097 return d[((int)ceil(n*p)-1)*stride]; 0098 case nsl_stats_quantile_type2: 0099 if (p == 0.0) 0100 return d[0]; 0101 else if (p == 1.0) 0102 return d[(n-1)*stride]; 0103 else 0104 return (d[((int)ceil(n*p)-1)*stride]+d[((int)ceil(n*p+1)-1)*stride])/2.; 0105 case nsl_stats_quantile_type3: 0106 if(p <= 0.5/n) 0107 return d[0]; 0108 else 0109 #ifdef _WIN32 0110 return d[((int)round(n*p)-1)*stride]; 0111 #else 0112 return d[(lrint(n*p)-1)*stride]; 0113 #endif 0114 case nsl_stats_quantile_type4: 0115 if(p < 1./n) 0116 return d[0]; 0117 else if (p == 1.0) 0118 return d[(n-1)*stride]; 0119 else { 0120 int i = (int)floor(n*p); 0121 return d[(i-1)*stride]+(n*p-i)*(d[i*stride]-d[(i-1)*stride]); 0122 } 0123 case nsl_stats_quantile_type5: 0124 if(p < 0.5/n) 0125 return d[0]; 0126 else if (p >= (n-0.5)/n) 0127 return d[(n-1)*stride]; 0128 else { 0129 int i = (int)floor(n*p+0.5); 0130 return d[(i-1)*stride]+(n*p+0.5-i)*(d[i*stride]-d[(i-1)*stride]); 0131 } 0132 case nsl_stats_quantile_type6: 0133 if(p < 1./(n+1.)) 0134 return d[0]; 0135 else if (p > n/(n+1.)) 0136 return d[(n-1)*stride]; 0137 else { 0138 int i = (int)floor((n+1)*p); 0139 return d[(i-1)*stride]+((n+1)*p-i)*(d[i*stride]-d[(i-1)*stride]); 0140 } 0141 case nsl_stats_quantile_type7: // = gsl_stats_quantile_from_sorted_data(d, stride, n, p); 0142 if (p == 1.0) 0143 return d[(n-1)*stride]; 0144 else { 0145 int i = (int)floor((n-1)*p+1); 0146 return d[(i-1)*stride]+((n-1)*p+1-i)*(d[i*stride]-d[(i-1)*stride]); 0147 } 0148 case nsl_stats_quantile_type8: 0149 if (p < 2./3./(n+1./3.)) 0150 return d[0]; 0151 else if (p >= (n-1./3.)/(n+1./3.)) 0152 return d[(n-1)*stride]; 0153 else { 0154 int i = (int)floor((n+1./3.)*p+1./3.); 0155 return d[(i-1)*stride]+((n+1./3.)*p+1./3.-i)*(d[i*stride]-d[(i-1)*stride]); 0156 } 0157 case nsl_stats_quantile_type9: 0158 if (p < 5./8./(n+1./4.)) 0159 return d[0]; 0160 else if (p >= (n-3./8.)/(n+1./4.)) 0161 return d[(n-1)*stride]; 0162 else { 0163 int i = (int)floor((n+1./4.)*p+3./8.); 0164 return d[(i-1)*stride]+((n+1./4.)*p+3./8.-i)*(d[i*stride]-d[(i-1)*stride]); 0165 } 0166 } 0167 0168 return 0; 0169 } 0170 0171 double nsl_stats_quantile_from_sorted_data(const double sorted_data[], size_t stride, size_t n, double p) { 0172 return nsl_stats_quantile_sorted(sorted_data, stride, n, p, nsl_stats_quantile_type7); 0173 } 0174 0175 /* R^2 and adj. R^2 */ 0176 double nsl_stats_rsquare(double sse, double sst) { 0177 return 1. - sse/sst; 0178 } 0179 double nsl_stats_rsquareAdj(double rsquare, size_t np, size_t dof, int version) { 0180 size_t n = np + dof; 0181 // see https://stats.stackexchange.com/questions/48703/what-is-the-adjusted-r-squared-formula-in-lm-in-r-and-how-should-it-be-interpret 0182 switch (version) { 0183 case 2: 0184 return 1. - (1. - rsquare) * (n - 1.)/dof; 0185 default: 0186 return 1. - (1. - rsquare) * (n - 1.)/(dof - 1.); 0187 } 0188 } 0189 0190 /* t distribution */ 0191 double nsl_stats_tdist_t(double parameter, double error) { 0192 if (error > 0) 0193 return parameter/error; 0194 else 0195 return DBL_MAX; 0196 } 0197 0198 double nsl_stats_tdist_p(double t, double dof) { 0199 double p = 2. * gsl_cdf_tdist_Q(fabs(t), dof); 0200 if (p < 1.e-9) 0201 p = 0; 0202 return p; 0203 } 0204 double nsl_stats_tdist_margin(double alpha, double dof, double error) { 0205 return gsl_cdf_tdist_Pinv(1. - alpha/2., dof) * error; 0206 } 0207 0208 /* chi^2 distribution */ 0209 double nsl_stats_chisq_p(double t, double dof) { 0210 double p = gsl_cdf_chisq_Q(t, dof); 0211 if (p < 1.e-9) 0212 p = 0; 0213 return p; 0214 } 0215 0216 /* F distribution */ 0217 double nsl_stats_fdist_F(double rsquare, size_t np, size_t dof) { 0218 // (sst/sse - 1.) * dof/(p-1) = dof/(p-1)/(1./R^2 - 1) 0219 if (np < 2) 0220 np = 2; 0221 return dof/(np - 1.)/(1./rsquare - 1.); 0222 } 0223 double nsl_stats_fdist_p(double F, size_t np, double dof) { 0224 double p = gsl_cdf_fdist_Q(F, (double)np, dof); 0225 if (p < 1.e-9) 0226 p = 0; 0227 return p; 0228 } 0229 0230 /* log-likelihood */ 0231 double nsl_stats_logLik(double sse, size_t n) { 0232 double ll = -(double)n/2.*log(sse/n) - n/2.*log(2*M_PI) -n/2.; 0233 return ll; 0234 } 0235 0236 /* Akaike information criterion */ 0237 double nsl_stats_aic(double sse, size_t n, size_t np, int version) { 0238 switch (version) { 0239 case 2: 0240 return n * log(sse/n) + 2. * np; // reduced formula 0241 case 3: { 0242 double aic = n * log(sse/n) + 2. * np; 0243 if (n < 40 * np) // bias correction 0244 aic += 2. * np * (np + 1.)/(n - np - 1.); 0245 return aic; 0246 } 0247 default: 0248 return n * log(sse/n) + 2. * (np+1) + n*log(2.*M_PI) + n; // complete formula used in R 0249 } 0250 } 0251 double nsl_stats_aicc(double sse, size_t n, size_t np, int version) { 0252 double aic; 0253 switch (version) { 0254 case 2: 0255 aic = n * log(sse/n) + 2. * np; 0256 break; 0257 default: 0258 aic = n * log(sse/n) + 2. * (np+1) + n*log(2.*M_PI) + n; 0259 } 0260 return aic + 2. * np * (np + 1.)/(n - np - 1.); 0261 } 0262 0263 /* Bayasian information criterion */ 0264 double nsl_stats_bic(double sse, size_t n, size_t np, int version) { 0265 switch (version) { 0266 case 2: 0267 return n * log(sse/n) + np * log((double)n); // reduced formula 0268 default: 0269 return n * log(sse/n) + (np+1) * log((double)n) + n + n*log(2.*M_PI); // complete formula used in R 0270 } 0271 }