File indexing completed on 2024-04-28 03:42:46
0001 /* 0002 SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 // The purpose of Robust Statistics is to provide a more robust way of calculating typical statistical measures 0008 // such as averages. More information is available here: https://en.wikipedia.org/wiki/Robust_statistics 0009 // The basic idea is that traditional measures of "location" such as the mean are susceptible to outliers in the 0010 // distribution. More information on location is available here: https://en.wikipedia.org/wiki/Location_parameter 0011 // The same applies to measures of scale such as variance (more information available here: 0012 // https://en.wikipedia.org/wiki/Scale_parameter) 0013 // 0014 // Robust Statistics uses underlying GNU Science Library (GSL) routines and provides a wrapper around these routines. 0015 // See the Statistics section of the GSL documentation: https://www.gnu.org/software/gsl/doc/html/statistics.html 0016 // 0017 // Currently the following are provided: 0018 // Location: LOCATION_MEAN - calculate the mean of input data 0019 // LOCATION_MEDIAN - calculate the median 0020 // LOCATION_TRIMMEDMEAN - discard a specified fraction of high/low values before calculating the mean 0021 // LOCATION_GASTWIRTH - use the Gastwirth estimator based on combining different quantiles 0022 // see https://www.gnu.org/software/gsl/doc/html/statistics.html#gastwirth-estimator 0023 // LOCATION_SIGMACLIPPING - single step sigma clipping routine to sigma clip outliers fron the 0024 // input data and calculate the mean from the remaining data 0025 // 0026 // Scale: SCALE_VARIANCE - variance 0027 // SCALE_MAD - median absolute deviation is the median of absolute differences between each 0028 // datapoint and the median. 0029 // SCALE_BWMV - biweight midvariance is a more complex calculate detailed here 0030 // https://en.wikipedia.org/wiki/Robust_measures_of_scale#The_biweight_midvariance 0031 // implemenated internally - not provided by GSL 0032 // SCALE_SESTIMATOR - Qn and Sn estimators are pairwise alternative to MAD by Rousseeuw and Croux 0033 // SCALE_QESTIMATOR https://www.researchgate.net/publication/221996720_Alternatives_to_Median_Absolute_Deviation 0034 // SCALE_PESTIMATOR - Pairwise mean scale estimator (Pn). This is the interquartile range of the 0035 // pairwise means. Implemented internally, not provided by GSL. 0036 // 0037 // Where necessary data is sorted by the routines and functionality to use a user selected array sride is included. 0038 // C++ Templates are used to provide access to the GSL routines based on the datatype of the input data. 0039 0040 #pragma once 0041 0042 #include <limits> 0043 #include <vector> 0044 #include <cmath> 0045 #include <QObject> 0046 #include <QVector> 0047 0048 #include "gslhelpers.h" 0049 0050 namespace Mathematics::RobustStatistics 0051 { 0052 0053 template<typename Base> 0054 double Pn_from_sorted_data(const Base sorted_data[], 0055 const size_t stride, 0056 const size_t n, 0057 Base work[], 0058 int work_int[]); 0059 0060 enum ScaleCalculation { SCALE_VARIANCE, SCALE_BWMV, SCALE_SESTIMATOR, SCALE_QESTIMATOR, SCALE_MAD, SCALE_PESTIMATOR }; 0061 0062 0063 enum LocationCalculation { LOCATION_MEAN, LOCATION_MEDIAN, LOCATION_TRIMMEDMEAN, LOCATION_GASTWIRTH, 0064 LOCATION_SIGMACLIPPING 0065 }; 0066 0067 struct SampleStatistics 0068 { 0069 constexpr SampleStatistics() : location(0), scale(0), weight(std::numeric_limits<double>::infinity()) {} 0070 constexpr SampleStatistics(const double location, const double scale, const double weight) : 0071 location(location), scale(scale), weight(weight) {} 0072 double location; 0073 double scale; 0074 double weight; 0075 }; 0076 0077 //template<typename Base=double> 0078 //struct Estimator 0079 //{ 0080 // virtual double Estimate(std::vector<Base> data, const size_t stride = 1) 0081 // { 0082 // std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride), 0083 // Mathematics::GSLHelpers::make_strided_iter(data.end(), stride)); 0084 // return EstimateFromSortedData(data, stride); 0085 // } 0086 // virtual double Estimate(const std::vector<Base> &data, const size_t stride = 1) 0087 // { 0088 // auto sorted = data; 0089 // std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride), 0090 // Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride)); 0091 // return EstimateFromSortedData(sorted, stride); 0092 // } 0093 0094 // virtual double EstimateFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0; 0095 //}; 0096 0097 template<typename Base> struct LocationEstimator; 0098 0099 template<typename Base = double> 0100 struct ScaleEstimator //: public virtual Estimator<Base> 0101 { 0102 LocationEstimator<Base> locationEstimator; 0103 virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1) 0104 { 0105 std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride), 0106 Mathematics::GSLHelpers::make_strided_iter(data.end(), stride)); 0107 return EstimateScaleFromSortedData(data, stride); 0108 } 0109 virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1) 0110 { 0111 auto sorted = data; 0112 std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride), 0113 Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride)); 0114 return EstimateScaleFromSortedData(sorted, stride); 0115 } 0116 0117 virtual double ConvertScaleToWeight(const double scale) 0118 { 0119 return 1 / (scale * scale); 0120 } 0121 0122 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0; 0123 }; 0124 0125 template <typename Base = double> 0126 struct MAD : public virtual ScaleEstimator<Base> 0127 { 0128 virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1) override 0129 { 0130 auto const size = data.size(); 0131 auto work = std::make_unique<double[]>(size / stride); 0132 return Mathematics::GSLHelpers::gslMAD(data.data(), stride, size, work.get()); 0133 } 0134 virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1) override 0135 { 0136 return EstimateScale(data, stride); 0137 } 0138 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0139 { 0140 return EstimateScale(data, stride); 0141 } 0142 }; 0143 template <typename Base = double> 0144 struct Variance : public virtual ScaleEstimator<Base> 0145 { 0146 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0147 { 0148 return Mathematics::GSLHelpers::gslVariance(data.data(), stride, data.size()); 0149 } 0150 virtual double ConvertScaleToWeight(const double variance) override 0151 { 0152 return 1 / variance; 0153 } 0154 }; 0155 0156 template <typename Base = double> 0157 struct SnEstimator : public virtual ScaleEstimator<Base> 0158 { 0159 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0160 { 0161 auto size = data.size(); 0162 auto work = std::make_unique<Base[]>(size); 0163 return Mathematics::GSLHelpers::gslSnFromSortedData(data.data(), stride, size, work.get()); 0164 } 0165 }; 0166 0167 template <typename Base = double> 0168 struct QnEstimator : public virtual ScaleEstimator<Base> 0169 { 0170 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0171 { 0172 auto size = data.size(); 0173 auto work = std::make_unique<Base[]>(3 * size); 0174 auto intWork = std::make_unique<Base[]>(5 * size); 0175 return Mathematics::GSLHelpers::gslQnFromSortedData(data.data(), stride, size, work.get(), intWork.get()); 0176 } 0177 }; 0178 0179 template <typename Base = double> 0180 struct PnEstimator : public virtual ScaleEstimator<Base> 0181 { 0182 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0183 { 0184 auto size = data.size(); 0185 auto work = std::make_unique<Base[]>(3 * size); 0186 auto intWork = std::make_unique<Base[]>(5 * size); 0187 return Pn_from_sorted_data(data.data(), stride, size, work.get(), intWork.get()); 0188 } 0189 }; 0190 0191 0192 template <typename Base = double> 0193 struct BiweightMidvariance : public virtual ScaleEstimator<Base> 0194 { 0195 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0196 { 0197 auto const size = data.size(); 0198 auto const theData = data.data(); 0199 auto work = std::make_unique<double[]>(size / stride); 0200 auto const adjustedMad = 9.0 * Mathematics::GSLHelpers::gslMAD(theData, stride, size, work.get()); 0201 auto const median = this->locationEstimator.EstimateLocationFromSortedData(theData, 0202 stride); //gslMedianFromSortedData(theData, stride, size); 0203 0204 auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride); 0205 auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride); 0206 0207 auto ys = std::vector<Base>(size / stride); 0208 std::transform(begin, end, ys.begin(), [ = ](Base x) 0209 { 0210 return (x - median) / adjustedMad; 0211 }); 0212 auto const indicator = [](Base y) 0213 { 0214 return std::fabs(y) < 1 ? 1 : 0; 0215 }; 0216 auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{}, 0217 [ = ](Base x, Base y) 0218 { 0219 return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4); 0220 }); 0221 auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{}, 0222 [ = ](Base y) 0223 { 0224 return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2)); 0225 }); 0226 // The -1 is for Bessel's correction. 0227 auto const bottom = bottomSum * (bottomSum - 1); 0228 0229 return (size / stride) * (top / bottom); 0230 } 0231 0232 virtual double ConvertScaleToWeight(const double variance) override 0233 { 0234 return 1 / variance; 0235 } 0236 }; 0237 0238 template<typename Base = double> 0239 struct LocationEstimator //: public virtual Estimator<Base> 0240 { 0241 ScaleEstimator<Base> scaleEstimator; 0242 virtual double EstimateLocation(std::vector<Base> data, const size_t stride = 1) 0243 { 0244 std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride), 0245 Mathematics::GSLHelpers::make_strided_iter(data.end(), stride)); 0246 return EstimateLocationFromSortedData(data, stride); 0247 } 0248 virtual double EstimateLocation(const std::vector<Base> &data, const size_t stride = 1) 0249 { 0250 auto sorted = data; 0251 std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride), 0252 Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride)); 0253 return EstimateLocationFromSortedData(sorted, stride); 0254 } 0255 virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0; 0256 }; 0257 0258 template<typename Base = double> 0259 struct StatisticalCoEstimator : public virtual LocationEstimator<Base>, public virtual ScaleEstimator<Base> 0260 { 0261 private: 0262 LocationEstimator<Base> locationEstimator; 0263 ScaleEstimator<Base> scaleEstimator; 0264 public: 0265 StatisticalCoEstimator(LocationEstimator<Base> locationEstimator, ScaleEstimator<Base> scaleEstimator) 0266 : locationEstimator(locationEstimator), scaleEstimator(scaleEstimator) {} 0267 virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0268 { 0269 auto const stats = EstimateStatisticsFromSortedData(data, stride); 0270 return stats.location; 0271 } 0272 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override 0273 { 0274 auto const stats = EstimateStatisticsFromSortedData(data, stride); 0275 return stats.scale; 0276 } 0277 virtual SampleStatistics EstimateStatisticsFromSortedData(const std::vector<Base> &data, const size_t stride = 1) 0278 { 0279 auto const location = EstimateLocationFromSortedData(data, stride); 0280 auto const scale = EstimateScaleFromSortedData(data, stride); 0281 auto const weight = ConvertScaleToWeight(scale); 0282 0283 return SampleStatistics{location, scale, weight}; 0284 } 0285 }; 0286 0287 /** 0288 * @short Computes a estimate of the statistical scale of the input sample. 0289 * 0290 * @param scaleMethod The estimator to use. 0291 * @param data The sample to estimate the scale of. 0292 * @param stride The stide of the data. 0293 */ 0294 template<typename Base = double> 0295 double ComputeScale(const ScaleCalculation scaleMethod, std::vector<Base> data, 0296 const size_t stride = 1) 0297 { 0298 if (scaleMethod != SCALE_VARIANCE) 0299 std::sort(data.begin(), data.end()); 0300 return ComputeScaleFromSortedData(scaleMethod, data, stride); 0301 } 0302 //[[using gnu : pure]] 0303 template<typename Base = double> 0304 double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod, 0305 const std::vector<Base> &data, 0306 const size_t stride = 1); 0307 /** 0308 * @short Computes a estimate of the statistical location of the input sample. 0309 * 0310 * @param scaleMethod The estimator to use. 0311 * @param data The sample to estimate the location of. 0312 * @param sortedData The sorted data. 0313 * @param trimAmount When using the trimmed mean estimate, the percentage quartile to remove either side. 0314 * When using sigma clipping, the number of standard deviations a sample has to be from the mean 0315 * to be considered an outlier. 0316 * @param stride The stide of the data. Note that in cases other than the simple mean, the stride will only apply 0317 * AFTER sorting. 0318 */ 0319 template<typename Base = double> 0320 double ComputeLocation(const LocationCalculation locationMethod, std::vector<Base> data, 0321 const double trimAmount = 0.25, const size_t stride = 1) 0322 { 0323 if (locationMethod != LOCATION_MEAN) 0324 std::sort(data.begin(), data.end()); 0325 return ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride); 0326 } 0327 0328 //[[using gnu : pure]] 0329 template<typename Base = double> 0330 double ComputeLocationFromSortedData(const LocationCalculation locationMethod, 0331 const std::vector<Base> &data, const double trimAmount = 0.25, const size_t stride = 1); 0332 0333 /** 0334 * @short Computes a weight for use in regression. 0335 * 0336 * @param scaleMethod The estimator to use. 0337 * @param data The sample to estimate the scale of. 0338 * @param sortedData The sorted data. 0339 * @param stride The stide of the data. Note that in cases other than the simple variance, the stride will only 0340 * apply AFTER sorting. 0341 */ 0342 template<typename Base = double> 0343 double ComputeWeight(const ScaleCalculation scaleMethod, std::vector<Base> data, const size_t stride = 1) 0344 { 0345 if (scaleMethod != SCALE_VARIANCE) 0346 std::sort(data.begin(), data.end()); 0347 return ComputeWeightFromSortedData(scaleMethod, data, stride); 0348 } 0349 //[[using gnu : pure]] 0350 template<typename Base = double> 0351 double ComputeWeightFromSortedData(const ScaleCalculation scaleMethod, const std::vector<Base> &data, 0352 const size_t stride = 1) 0353 { 0354 auto const scale = ComputeScaleFromSortedData(scaleMethod, data, stride); 0355 return ConvertScaleToWeight(scaleMethod, scale); 0356 } 0357 0358 SampleStatistics ComputeSampleStatistics(std::vector<double> data, 0359 const LocationCalculation locationMethod = LOCATION_TRIMMEDMEAN, 0360 const ScaleCalculation scaleMethod = SCALE_QESTIMATOR, 0361 double trimAmount = 0.25, 0362 const size_t stride = 1); 0363 0364 [[using gnu : pure]] 0365 constexpr double ConvertScaleToWeight(const ScaleCalculation scaleMethod, double scale) 0366 { 0367 // If the passed in scale is zero or near zero return a very small weight rather than infinity. 0368 // This fixes the situation where, e.g. there is a single datapoint so scale is zero and 0369 // weighting this datapoint with infinite weight is incorrect (and breaks the LM solver) 0370 switch (scaleMethod) 0371 { 0372 // Variance, biweight midvariance are variances. 0373 case SCALE_VARIANCE: 0374 [[fallthrough]]; 0375 case SCALE_BWMV: 0376 return (scale < 1e-10) ? 1e-10 : 1 / scale; 0377 // MAD, Sn, Qn and Pn estimators are all calibrated to estimate the standard deviation of Gaussians. 0378 case SCALE_MAD: 0379 [[fallthrough]]; 0380 case SCALE_SESTIMATOR: 0381 [[fallthrough]]; 0382 case SCALE_QESTIMATOR: 0383 [[fallthrough]]; 0384 case SCALE_PESTIMATOR: 0385 return (scale < 1e-10) ? 1e-10 : 1 / (scale * scale); 0386 } 0387 return -1; 0388 } 0389 0390 0391 } // namespace Mathematics