File indexing completed on 2024-04-28 03:42:46

0001 /*
0002     SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe>
0003 
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006 
0007 // The purpose of Robust Statistics is to provide a more robust way of calculating typical statistical measures
0008 // such as averages. More information is available here: https://en.wikipedia.org/wiki/Robust_statistics
0009 // The basic idea is that traditional measures of "location" such as the mean are susceptible to outliers in the
0010 // distribution. More information on location is available here: https://en.wikipedia.org/wiki/Location_parameter
0011 // The same applies to measures of scale such as variance (more information available here:
0012 // https://en.wikipedia.org/wiki/Scale_parameter)
0013 //
0014 // Robust Statistics uses underlying GNU Science Library (GSL) routines and provides a wrapper around these routines.
0015 // See the Statistics section of the GSL documentation: https://www.gnu.org/software/gsl/doc/html/statistics.html
0016 //
0017 // Currently the following are provided:
0018 // Location: LOCATION_MEAN          - calculate the mean of input data
0019 //           LOCATION_MEDIAN        - calculate the median
0020 //           LOCATION_TRIMMEDMEAN   - discard a specified fraction of high/low values before calculating the mean
0021 //           LOCATION_GASTWIRTH     - use the Gastwirth estimator based on combining different quantiles
0022 //                                    see https://www.gnu.org/software/gsl/doc/html/statistics.html#gastwirth-estimator
0023 //           LOCATION_SIGMACLIPPING - single step sigma clipping routine to sigma clip outliers fron the
0024 //                                    input data and calculate the mean from the remaining data
0025 //
0026 // Scale:    SCALE_VARIANCE         - variance
0027 //           SCALE_MAD              - median absolute deviation is the median of absolute differences between each
0028 //                                    datapoint and the median.
0029 //           SCALE_BWMV             - biweight midvariance is a more complex calculate detailed here
0030 //                                    https://en.wikipedia.org/wiki/Robust_measures_of_scale#The_biweight_midvariance
0031 //                                    implemenated internally - not provided by GSL
0032 //           SCALE_SESTIMATOR       - Qn and Sn estimators are pairwise alternative to MAD by Rousseeuw and Croux
0033 //           SCALE_QESTIMATOR         https://www.researchgate.net/publication/221996720_Alternatives_to_Median_Absolute_Deviation
0034 //           SCALE_PESTIMATOR       - Pairwise mean scale estimator (Pn). This is the interquartile range of the
0035 //                                    pairwise means. Implemented internally, not provided by GSL.
0036 //
0037 // Where necessary data is sorted by the routines and functionality to use a user selected array sride is included.
0038 // C++ Templates are used to provide access to the GSL routines based on the datatype of the input data.
0039 
0040 #pragma once
0041 
0042 #include <limits>
0043 #include <vector>
0044 #include <cmath>
0045 #include <QObject>
0046 #include <QVector>
0047 
0048 #include "gslhelpers.h"
0049 
0050 namespace Mathematics::RobustStatistics
0051 {
0052 
0053 template<typename Base>
0054 double Pn_from_sorted_data(const Base sorted_data[],
0055                            const size_t stride,
0056                            const size_t n,
0057                            Base work[],
0058                            int work_int[]);
0059 
0060 enum ScaleCalculation { SCALE_VARIANCE, SCALE_BWMV, SCALE_SESTIMATOR, SCALE_QESTIMATOR, SCALE_MAD, SCALE_PESTIMATOR };
0061 
0062 
0063 enum LocationCalculation { LOCATION_MEAN, LOCATION_MEDIAN, LOCATION_TRIMMEDMEAN, LOCATION_GASTWIRTH,
0064                            LOCATION_SIGMACLIPPING
0065                          };
0066 
0067 struct SampleStatistics
0068 {
0069     constexpr SampleStatistics() : location(0), scale(0), weight(std::numeric_limits<double>::infinity()) {}
0070     constexpr SampleStatistics(const double location, const double scale, const double weight) :
0071         location(location), scale(scale), weight(weight) {}
0072     double location;
0073     double scale;
0074     double weight;
0075 };
0076 
0077 //template<typename Base=double>
0078 //struct Estimator
0079 //{
0080 //    virtual double Estimate(std::vector<Base> data, const size_t stride = 1)
0081 //    {
0082 //        std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
0083 //                  Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
0084 //        return EstimateFromSortedData(data, stride);
0085 //    }
0086 //    virtual double Estimate(const std::vector<Base> &data, const size_t stride = 1)
0087 //    {
0088 //        auto sorted = data;
0089 //        std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
0090 //                  Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
0091 //        return EstimateFromSortedData(sorted, stride);
0092 //    }
0093 
0094 //    virtual double EstimateFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
0095 //};
0096 
0097 template<typename Base> struct LocationEstimator;
0098 
0099 template<typename Base = double>
0100 struct ScaleEstimator //: public virtual Estimator<Base>
0101 {
0102     LocationEstimator<Base> locationEstimator;
0103     virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1)
0104     {
0105         std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
0106                   Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
0107         return EstimateScaleFromSortedData(data, stride);
0108     }
0109     virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1)
0110     {
0111         auto sorted = data;
0112         std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
0113                   Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
0114         return EstimateScaleFromSortedData(sorted, stride);
0115     }
0116 
0117     virtual double ConvertScaleToWeight(const double scale)
0118     {
0119         return 1 / (scale * scale);
0120     }
0121 
0122     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
0123 };
0124 
0125 template <typename Base = double>
0126 struct MAD : public virtual ScaleEstimator<Base>
0127 {
0128     virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1) override
0129     {
0130         auto const size = data.size();
0131         auto work = std::make_unique<double[]>(size / stride);
0132         return Mathematics::GSLHelpers::gslMAD(data.data(), stride, size, work.get());
0133     }
0134     virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1) override
0135     {
0136         return EstimateScale(data, stride);
0137     }
0138     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0139     {
0140         return EstimateScale(data, stride);
0141     }
0142 };
0143 template <typename Base = double>
0144 struct Variance : public virtual ScaleEstimator<Base>
0145 {
0146     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0147     {
0148         return Mathematics::GSLHelpers::gslVariance(data.data(), stride, data.size());
0149     }
0150     virtual double ConvertScaleToWeight(const double variance) override
0151     {
0152         return 1 / variance;
0153     }
0154 };
0155 
0156 template <typename Base = double>
0157 struct SnEstimator : public virtual ScaleEstimator<Base>
0158 {
0159     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0160     {
0161         auto size = data.size();
0162         auto work = std::make_unique<Base[]>(size);
0163         return Mathematics::GSLHelpers::gslSnFromSortedData(data.data(), stride, size, work.get());
0164     }
0165 };
0166 
0167 template <typename Base = double>
0168 struct QnEstimator : public virtual ScaleEstimator<Base>
0169 {
0170     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0171     {
0172         auto size = data.size();
0173         auto work = std::make_unique<Base[]>(3 * size);
0174         auto intWork = std::make_unique<Base[]>(5 * size);
0175         return Mathematics::GSLHelpers::gslQnFromSortedData(data.data(), stride, size, work.get(), intWork.get());
0176     }
0177 };
0178 
0179 template <typename Base = double>
0180 struct PnEstimator : public virtual ScaleEstimator<Base>
0181 {
0182     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0183     {
0184         auto size = data.size();
0185         auto work = std::make_unique<Base[]>(3 * size);
0186         auto intWork = std::make_unique<Base[]>(5 * size);
0187         return Pn_from_sorted_data(data.data(), stride, size, work.get(), intWork.get());
0188     }
0189 };
0190 
0191 
0192 template <typename Base = double>
0193 struct BiweightMidvariance : public virtual ScaleEstimator<Base>
0194 {
0195     virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0196     {
0197         auto const size = data.size();
0198         auto const theData = data.data();
0199         auto work = std::make_unique<double[]>(size / stride);
0200         auto const adjustedMad = 9.0 * Mathematics::GSLHelpers::gslMAD(theData, stride, size, work.get());
0201         auto const median = this->locationEstimator.EstimateLocationFromSortedData(theData,
0202                             stride); //gslMedianFromSortedData(theData, stride, size);
0203 
0204         auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride);
0205         auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride);
0206 
0207         auto ys = std::vector<Base>(size / stride);
0208         std::transform(begin, end, ys.begin(), [ = ](Base x)
0209         {
0210             return (x - median) / adjustedMad;
0211         });
0212         auto const indicator = [](Base y)
0213         {
0214             return std::fabs(y) < 1 ? 1 : 0;
0215         };
0216         auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{},
0217                                                [ = ](Base x, Base y)
0218         {
0219             return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4);
0220         });
0221         auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{},
0222                                [ = ](Base y)
0223         {
0224             return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2));
0225         });
0226         // The -1 is for Bessel's correction.
0227         auto const bottom = bottomSum * (bottomSum - 1);
0228 
0229         return (size / stride) * (top / bottom);
0230     }
0231 
0232     virtual double ConvertScaleToWeight(const double variance) override
0233     {
0234         return 1 / variance;
0235     }
0236 };
0237 
0238 template<typename Base = double>
0239 struct LocationEstimator //: public virtual Estimator<Base>
0240 {
0241     ScaleEstimator<Base> scaleEstimator;
0242     virtual double EstimateLocation(std::vector<Base> data, const size_t stride = 1)
0243     {
0244         std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
0245                   Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
0246         return EstimateLocationFromSortedData(data, stride);
0247     }
0248     virtual double EstimateLocation(const std::vector<Base> &data, const size_t stride = 1)
0249     {
0250         auto sorted = data;
0251         std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
0252                   Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
0253         return EstimateLocationFromSortedData(sorted, stride);
0254     }
0255     virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
0256 };
0257 
0258 template<typename Base = double>
0259 struct StatisticalCoEstimator : public virtual LocationEstimator<Base>, public virtual ScaleEstimator<Base>
0260 {
0261     private:
0262         LocationEstimator<Base> locationEstimator;
0263         ScaleEstimator<Base> scaleEstimator;
0264     public:
0265         StatisticalCoEstimator(LocationEstimator<Base> locationEstimator, ScaleEstimator<Base> scaleEstimator)
0266             : locationEstimator(locationEstimator), scaleEstimator(scaleEstimator) {}
0267         virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0268         {
0269             auto const stats = EstimateStatisticsFromSortedData(data, stride);
0270             return stats.location;
0271         }
0272         virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
0273         {
0274             auto const stats = EstimateStatisticsFromSortedData(data, stride);
0275             return stats.scale;
0276         }
0277         virtual SampleStatistics EstimateStatisticsFromSortedData(const std::vector<Base> &data, const size_t stride = 1)
0278         {
0279             auto const location = EstimateLocationFromSortedData(data, stride);
0280             auto const scale = EstimateScaleFromSortedData(data, stride);
0281             auto const weight = ConvertScaleToWeight(scale);
0282 
0283             return SampleStatistics{location, scale, weight};
0284         }
0285 };
0286 
0287 /**
0288  * @short Computes a estimate of the statistical scale of the input sample.
0289  *
0290  * @param scaleMethod The estimator to use.
0291  * @param data The sample to estimate the scale of.
0292  * @param stride The stide of the data.
0293  */
0294 template<typename Base = double>
0295 double ComputeScale(const ScaleCalculation scaleMethod, std::vector<Base> data,
0296                     const size_t stride = 1)
0297 {
0298     if (scaleMethod != SCALE_VARIANCE)
0299         std::sort(data.begin(), data.end());
0300     return ComputeScaleFromSortedData(scaleMethod, data, stride);
0301 }
0302 //[[using gnu : pure]]
0303 template<typename Base = double>
0304 double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0305                                   const std::vector<Base> &data,
0306                                   const size_t stride = 1);
0307 /**
0308  * @short Computes a estimate of the statistical location of the input sample.
0309  *
0310  * @param scaleMethod The estimator to use.
0311  * @param data The sample to estimate the location of.
0312  * @param sortedData The sorted data.
0313  * @param trimAmount When using the trimmed mean estimate, the percentage quartile to remove either side.
0314  *                   When using sigma clipping, the number of standard deviations a sample has to be from the mean
0315  *                   to be considered an outlier.
0316  * @param stride The stide of the data. Note that in cases other than the simple mean, the stride will only apply
0317  *               AFTER sorting.
0318  */
0319 template<typename Base = double>
0320 double ComputeLocation(const LocationCalculation locationMethod, std::vector<Base> data,
0321                        const double trimAmount = 0.25, const size_t stride = 1)
0322 {
0323     if (locationMethod != LOCATION_MEAN)
0324         std::sort(data.begin(), data.end());
0325     return ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride);
0326 }
0327 
0328 //[[using gnu : pure]]
0329 template<typename Base = double>
0330 double ComputeLocationFromSortedData(const LocationCalculation locationMethod,
0331                                      const std::vector<Base> &data, const double trimAmount = 0.25, const size_t stride = 1);
0332 
0333 /**
0334  * @short Computes a weight for use in regression.
0335  *
0336  * @param scaleMethod The estimator to use.
0337  * @param data The sample to estimate the scale of.
0338  * @param sortedData The sorted data.
0339  * @param stride The stide of the data. Note that in cases other than the simple variance, the stride will only
0340  *               apply AFTER sorting.
0341  */
0342 template<typename Base = double>
0343 double ComputeWeight(const ScaleCalculation scaleMethod, std::vector<Base> data, const size_t stride = 1)
0344 {
0345     if (scaleMethod != SCALE_VARIANCE)
0346         std::sort(data.begin(), data.end());
0347     return ComputeWeightFromSortedData(scaleMethod, data, stride);
0348 }
0349 //[[using gnu : pure]]
0350 template<typename Base = double>
0351 double ComputeWeightFromSortedData(const ScaleCalculation scaleMethod, const std::vector<Base> &data,
0352                                    const size_t stride = 1)
0353 {
0354     auto const scale = ComputeScaleFromSortedData(scaleMethod, data, stride);
0355     return ConvertScaleToWeight(scaleMethod, scale);
0356 }
0357 
0358 SampleStatistics ComputeSampleStatistics(std::vector<double> data,
0359         const LocationCalculation locationMethod = LOCATION_TRIMMEDMEAN,
0360         const ScaleCalculation scaleMethod = SCALE_QESTIMATOR,
0361         double trimAmount = 0.25,
0362         const size_t stride = 1);
0363 
0364 [[using gnu : pure]]
0365 constexpr double ConvertScaleToWeight(const ScaleCalculation scaleMethod, double scale)
0366 {
0367     // If the passed in scale is zero or near zero return a very small weight rather than infinity.
0368     // This fixes the situation where, e.g. there is a single datapoint so scale is zero and
0369     // weighting this datapoint with infinite weight is incorrect (and breaks the LM solver)
0370     switch (scaleMethod)
0371     {
0372         // Variance, biweight midvariance are variances.
0373         case SCALE_VARIANCE:
0374             [[fallthrough]];
0375         case SCALE_BWMV:
0376             return (scale < 1e-10) ? 1e-10 : 1 / scale;
0377         // MAD, Sn, Qn and Pn estimators are all calibrated to estimate the standard deviation of Gaussians.
0378         case SCALE_MAD:
0379             [[fallthrough]];
0380         case SCALE_SESTIMATOR:
0381             [[fallthrough]];
0382         case SCALE_QESTIMATOR:
0383             [[fallthrough]];
0384         case SCALE_PESTIMATOR:
0385             return (scale < 1e-10) ? 1e-10 : 1 / (scale * scale);
0386     }
0387     return -1;
0388 }
0389 
0390 
0391 } // namespace Mathematics