File indexing completed on 2024-04-28 03:42:46
0001 /* 0002 SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #include "robuststatistics.h" 0008 #include <cmath> 0009 0010 namespace Mathematics::RobustStatistics 0011 { 0012 using namespace Mathematics::GSLHelpers; 0013 0014 // int64 code is currently deactiovated by GSLHELPERS_INT64. It doesn't compile on Mac becuase it 0015 // won't cast a long long * to a long * even though they are both 64bit pointers. 0016 // On 32bit systems this would be an issue because they are not the same. 0017 // If the int64 versions are required in future then this "problem" will need to 0018 // be resolved. 0019 0020 0021 template<typename Base = double> 0022 double Pn_from_sorted_data(const Base sorted_data[], 0023 const size_t stride, 0024 const size_t n, 0025 Base work[], 0026 int work_int[]); 0027 0028 template<typename Base> 0029 double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0030 const std::vector<Base> &data, 0031 const size_t stride) 0032 { 0033 auto const size = data.size(); 0034 auto const theData = data.data(); 0035 switch (scaleMethod) 0036 { 0037 // Compute biweight midvariance 0038 case SCALE_BWMV: 0039 { 0040 auto work = std::make_unique<double[]>(size); 0041 auto const adjustedMad = 9.0 * gslMAD(theData, stride, size, work.get()); 0042 auto const median = gslMedianFromSortedData(theData, stride, size); 0043 0044 auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride); 0045 auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride); 0046 0047 auto ys = std::vector<Base>(size / stride); 0048 std::transform(begin, end, ys.begin(), [ = ](Base x) 0049 { 0050 return (x - median) / adjustedMad; 0051 }); 0052 auto const indicator = [](Base y) 0053 { 0054 return std::fabs(y) < 1 ? 1 : 0; 0055 }; 0056 auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{}, 0057 [ = ](Base x, Base y) 0058 { 0059 return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4); 0060 }); 0061 auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{}, 0062 [ = ](Base y) 0063 { 0064 return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2)); 0065 }); 0066 // The -1 is for Bessel's correction. 0067 auto const bottom = bottomSum * (bottomSum - 1); 0068 0069 return (size / stride) * (top / bottom); 0070 0071 } 0072 case SCALE_MAD: 0073 { 0074 auto work = std::make_unique<double[]>(size); 0075 return gslMAD(theData, stride, size, work.get()); 0076 } 0077 case SCALE_SESTIMATOR: 0078 { 0079 auto work = std::make_unique<Base[]>(size); 0080 return gslSnFromSortedData(theData, stride, size, work.get()); 0081 } 0082 case SCALE_QESTIMATOR: 0083 { 0084 auto work = std::make_unique<Base[]>(3 * size); 0085 auto intWork = std::make_unique<int[]>(5 * size); 0086 0087 return gslQnFromSortedData(theData, stride, size, work.get(), intWork.get()); 0088 } 0089 case SCALE_PESTIMATOR: 0090 { 0091 auto work = std::make_unique<Base[]>(3 * size); 0092 auto intWork = std::make_unique<int[]>(5 * size); 0093 0094 return Pn_from_sorted_data(theData, stride, size, work.get(), intWork.get()); 0095 } 0096 case SCALE_VARIANCE: 0097 [[fallthrough]]; 0098 default: 0099 return gslVariance(theData, stride, size); 0100 } 0101 } 0102 0103 0104 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0105 const std::vector<double> &data, 0106 const size_t stride); 0107 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0108 const std::vector<float> &data, 0109 const size_t stride); 0110 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0111 const std::vector<uint8_t> &data, 0112 const size_t stride); 0113 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0114 const std::vector<uint16_t> &data, 0115 const size_t stride); 0116 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0117 const std::vector<int16_t> &data, 0118 const size_t stride); 0119 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0120 const std::vector<uint32_t> &data, 0121 const size_t stride); 0122 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0123 const std::vector<int32_t> &data, 0124 const size_t stride); 0125 #ifdef GSLHELPERS_INT64 0126 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0127 const std::vector<int64_t> &data, 0128 const size_t stride); 0129 #endif 0130 0131 template<typename Base> double ComputeLocationFromSortedData( 0132 const LocationCalculation locationMethod, 0133 const std::vector<Base> &data, 0134 double trimAmount, const size_t stride) 0135 { 0136 auto const size = data.size(); 0137 auto const theData = data.data(); 0138 0139 switch (locationMethod) 0140 { 0141 case LOCATION_MEDIAN: 0142 { 0143 return gslMedianFromSortedData(theData, stride, size); 0144 } 0145 case LOCATION_TRIMMEDMEAN: 0146 { 0147 return gslTrimmedMeanFromSortedData(trimAmount, theData, stride, size); 0148 } 0149 case LOCATION_GASTWIRTH: 0150 { 0151 return gslGastwirthMeanFromSortedData(theData, stride, size); 0152 } 0153 // TODO: Parameterise 0154 case LOCATION_SIGMACLIPPING: 0155 { 0156 auto const median = gslMedianFromSortedData(theData, stride, size); 0157 if (data.size() > 3) 0158 { 0159 auto const stddev = gslStandardDeviation(theData, stride, size); 0160 auto begin = GSLHelpers::make_strided_iter(data.cbegin(), stride); 0161 auto end = GSLHelpers::make_strided_iter(data.cend(), stride); 0162 0163 // Remove samples over 2 standard deviations away. 0164 auto lower = std::lower_bound(begin, end, (median - stddev * trimAmount)); 0165 auto upper = std::upper_bound(lower, end, (median + stddev * trimAmount)); 0166 0167 // New mean 0168 auto sum = std::reduce(lower, upper); 0169 const int num_remaining = std::distance(lower, upper); 0170 if (num_remaining > 0) return sum / num_remaining; 0171 } 0172 return median; 0173 } 0174 case LOCATION_MEAN: 0175 [[fallthrough]]; 0176 default: 0177 return gslMean(theData, stride, size); 0178 0179 } 0180 } 0181 0182 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0183 const std::vector<double> &data, 0184 double trimAmount, const size_t stride); 0185 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0186 const std::vector<float> &data, 0187 double trimAmount, const size_t stride); 0188 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0189 const std::vector<uint8_t> &data, 0190 double trimAmount, const size_t stride); 0191 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0192 const std::vector<uint16_t> &data, 0193 double trimAmount, const size_t stride); 0194 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0195 const std::vector<int16_t> &data, 0196 double trimAmount, const size_t stride); 0197 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0198 const std::vector<uint32_t> &data, 0199 double trimAmount, const size_t stride); 0200 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0201 const std::vector<int32_t> &data, 0202 double trimAmount, const size_t stride); 0203 #ifdef GSLHELPERS_INT64 0204 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod, 0205 const std::vector<int64_t> &data, 0206 double trimAmount, const size_t stride); 0207 #endif 0208 0209 0210 SampleStatistics ComputeSampleStatistics(std::vector<double> data, 0211 const RobustStatistics::LocationCalculation locationMethod, 0212 const RobustStatistics::ScaleCalculation scaleMethod, 0213 double trimAmount, 0214 const size_t stride) 0215 { 0216 std::sort(data.begin(), data.end()); 0217 double location = RobustStatistics::ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride); 0218 double scale = RobustStatistics::ComputeScaleFromSortedData(scaleMethod, data, stride); 0219 double weight = RobustStatistics::ConvertScaleToWeight(scaleMethod, scale); 0220 return RobustStatistics::SampleStatistics{location, scale, weight}; 0221 } 0222 0223 /* 0224 Algorithm to compute the weighted high median in O(n) time. 0225 The whimed is defined as the smallest a[j] such that the sum 0226 of the weights of all a[i] <= a[j] is strictly greater than 0227 half of the total weight. 0228 Arguments: 0229 a: double array containing the observations 0230 n: number of observations 0231 w: array of (int/double) weights of the observations. 0232 */ 0233 0234 template<typename Base = double> Base whimed(Base* a, int * w, int n, Base* a_cand, Base* a_srt, int * w_cand) 0235 { 0236 int n2, i, kcand; 0237 /* sum of weights: `int' do overflow when n ~>= 1e5 */ 0238 int64_t wleft, wmid, wright, w_tot, wrest; 0239 0240 Base trial; 0241 0242 w_tot = 0; 0243 for (i = 0; i < n; ++i) 0244 w_tot += w[i]; 0245 0246 wrest = 0; 0247 0248 /* REPEAT : */ 0249 do 0250 { 0251 for (i = 0; i < n; ++i) 0252 a_srt[i] = a[i]; 0253 0254 n2 = n / 2; /* =^= n/2 +1 with 0-indexing */ 0255 0256 gslSort(a_srt, 1, n); 0257 0258 trial = a_srt[n2]; 0259 0260 wleft = 0; 0261 wmid = 0; 0262 wright = 0; 0263 0264 for (i = 0; i < n; ++i) 0265 { 0266 if (a[i] < trial) 0267 wleft += w[i]; 0268 else if (a[i] > trial) 0269 wright += w[i]; 0270 else 0271 wmid += w[i]; 0272 } 0273 0274 /* wleft = sum_{i; a[i] < trial} w[i] 0275 * wmid = sum_{i; a[i] == trial} w[i] at least one 'i' since trial is one a[]! 0276 * wright= sum_{i; a[i] > trial} w[i] 0277 */ 0278 kcand = 0; 0279 if (2 * (wrest + wleft) > w_tot) 0280 { 0281 for (i = 0; i < n; ++i) 0282 { 0283 if (a[i] < trial) 0284 { 0285 a_cand[kcand] = a[i]; 0286 w_cand[kcand] = w[i]; 0287 ++kcand; 0288 } 0289 } 0290 } 0291 else if (2 * (wrest + wleft + wmid) <= w_tot) 0292 { 0293 for (i = 0; i < n; ++i) 0294 { 0295 if (a[i] > trial) 0296 { 0297 a_cand[kcand] = a[i]; 0298 w_cand[kcand] = w[i]; 0299 ++kcand; 0300 } 0301 } 0302 0303 wrest += wleft + wmid; 0304 } 0305 else 0306 { 0307 return trial; 0308 } 0309 0310 n = kcand; 0311 for (i = 0; i < n; ++i) 0312 { 0313 a[i] = a_cand[i]; 0314 w[i] = w_cand[i]; 0315 } 0316 } 0317 while(1); 0318 } 0319 0320 0321 /* 0322 gsl_stats_Qn0_from_sorted_data() 0323 Efficient algorithm for the scale estimator: 0324 Q_n0 = { |x_i - x_j|; i<j }_(k) [ = Qn without scaling ] 0325 i.e. the k-th order statistic of the |x_i - x_j|, where: 0326 k = (floor(n/2) + 1 choose 2) 0327 Inputs: sorted_data - sorted array containing the observations 0328 stride - stride 0329 n - length of 'sorted_data' 0330 work - workspace of length 3n of type BASE 0331 work_int - workspace of length 5n of type int 0332 Return: Q_n statistic (without scale/correction factor); same type as input data 0333 */ 0334 0335 template<typename Base = double> double Pn0k_from_sorted_data(const Base sorted_data[], 0336 const double k, 0337 const size_t stride, 0338 const size_t n, 0339 Base work[], 0340 int work_int[]) 0341 { 0342 const int ni = (int) n; 0343 Base * a_srt = &work[n]; 0344 Base * a_cand = &work[2 * n]; 0345 0346 int *left = &work_int[0]; 0347 int *right = &work_int[n]; 0348 int *p = &work_int[2 * n]; 0349 int *q = &work_int[3 * n]; 0350 int *weight = &work_int[4 * n]; 0351 0352 Base trial = (Base)0.0; 0353 int found = 0; 0354 0355 int h, i, j, jh; 0356 0357 /* following should be `long long int' : they can be of order n^2 */ 0358 int64_t knew, nl, nr, sump, sumq; 0359 0360 /* check for quick return */ 0361 if (n < 2) 0362 return (Base)0.0; 0363 0364 h = n / 2 + 1; 0365 0366 for (i = 0; i < ni; ++i) 0367 { 0368 left[i] = ni - i + 1; 0369 right[i] = (i <= h) ? ni : ni - (i - h); 0370 0371 /* the n - (i-h) is from the paper; original code had `n' */ 0372 } 0373 0374 nl = (int64_t)n * (n + 1) / 2; 0375 nr = (int64_t)n * n; 0376 knew = k + nl;/* = k + (n+1 \over 2) */ 0377 0378 /* L200: */ 0379 while (!found && nr - nl > ni) 0380 { 0381 j = 0; 0382 /* Truncation to float : try to make sure that the same values are got later (guard bits !) */ 0383 for (i = 1; i < ni; ++i) 0384 { 0385 if (left[i] <= right[i]) 0386 { 0387 weight[j] = right[i] - left[i] + 1; 0388 jh = left[i] + weight[j] / 2; 0389 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jh) * stride]) / 2 ; 0390 ++j; 0391 } 0392 } 0393 0394 trial = whimed(work, weight, j, a_cand, a_srt, /*iw_cand*/ p); 0395 0396 j = 0; 0397 for (i = ni - 1; i >= 0; --i) 0398 { 0399 while (j < ni && ((double)(sorted_data[i * stride] + sorted_data[(ni - j - 1) * stride])) / 2 < trial) 0400 ++j; 0401 0402 p[i] = j; 0403 } 0404 0405 j = ni + 1; 0406 for (i = 0; i < ni; ++i) 0407 { 0408 while ((double)(sorted_data[i * stride] + sorted_data[(ni - j + 1) * stride]) / 2 > trial) 0409 --j; 0410 0411 q[i] = j; 0412 } 0413 0414 sump = 0; 0415 sumq = 0; 0416 0417 for (i = 0; i < ni; ++i) 0418 { 0419 sump += p[i]; 0420 sumq += q[i] - 1; 0421 } 0422 0423 if (knew <= sump) 0424 { 0425 for (i = 0; i < ni; ++i) 0426 right[i] = p[i]; 0427 0428 nr = sump; 0429 } 0430 else if (knew > sumq) 0431 { 0432 for (i = 0; i < ni; ++i) 0433 left[i] = q[i]; 0434 0435 nl = sumq; 0436 } 0437 else /* sump < knew <= sumq */ 0438 { 0439 found = 1; 0440 } 0441 } /* while */ 0442 0443 if (found) 0444 { 0445 return trial; 0446 } 0447 else 0448 { 0449 j = 0; 0450 for (i = 1; i < ni; ++i) 0451 { 0452 int jj; 0453 0454 for (jj = left[i]; jj <= right[i]; ++jj) 0455 { 0456 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jj) * stride]) / 2; 0457 j++; 0458 }/* j will be = sum_{i=2}^n ((right[i] + left[i] + 1)_{+})/2 */ 0459 } 0460 0461 /* return pull(work, j - 1, knew - nl) : */ 0462 knew -= (nl + 1); /* -1: 0-indexing */ 0463 0464 /* sort work array */ 0465 gslSort(work, 1, j); 0466 0467 return (work[knew]); 0468 } 0469 } 0470 0471 0472 template<typename Base> 0473 double Pn_from_sorted_data(const Base sorted_data[], 0474 const size_t stride, 0475 const size_t n, 0476 Base work[], 0477 int work_int[]) 0478 { 0479 if (n <= 2) 0480 return sqrt(gslVariance(sorted_data, stride, n)); 0481 0482 double upper = Pn0k_from_sorted_data(sorted_data, (3.0 / 4.0), stride, n, work, work_int); 0483 double lower = Pn0k_from_sorted_data(sorted_data, (1.0 / 4.0), stride, n, work, work_int); 0484 0485 const double asymptoticCorrection = 1 / 0.9539; 0486 0487 auto const asymptoticEstimate = asymptoticCorrection * (upper - lower); 0488 0489 static double correctionFactors[] = {1.128, 1.303, 1.109, 1.064, 1.166, 1.103, 1.087, 1.105, 1.047, 1.063, 1.057, 0490 1.040, 1.061, 1.047, 1.043, 1.048, 1.031, 1.037, 1.035, 1.028, 1.036, 1.030, 0491 1.029, 1.032, 1.023, 1.025, 1.024, 1.021, 1.026, 1.022, 1.021, 1.023, 1.018, 0492 1.020, 1.019, 1.017, 1.020, 1.018, 1.017, 1.018, 1.015, 1.016, 1.016, 1.014, 0493 1.016, 1.015, 1.014, 1.015 0494 }; 0495 0496 if (n <= 42) 0497 { 0498 return asymptoticEstimate * correctionFactors[n - 2]; 0499 } 0500 else 0501 { 0502 return asymptoticEstimate * (n / (n - 0.7)); 0503 } 0504 0505 } 0506 0507 0508 } // namespace Mathematics