// File indexing completed on 2024-04-28 03:42:46

0001 /*
0002     SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe>
0003 
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006 
0007 #include "robuststatistics.h"
0008 #include <cmath>
0009 
0010 namespace Mathematics::RobustStatistics
0011 {
0012 using namespace Mathematics::GSLHelpers;
0013 
// int64 code is currently deactivated by GSLHELPERS_INT64. It doesn't compile on Mac because it
// won't cast a long long * to a long * even though they are both 64bit pointers.
// On 32bit systems this would be an issue because they are not the same.
// If the int64 versions are required in future then this "problem" will need to
// be resolved.
0019 
0020 
0021 template<typename Base = double>
0022 double Pn_from_sorted_data(const Base sorted_data[],
0023                            const size_t stride,
0024                            const size_t n,
0025                            Base work[],
0026                            int work_int[]);
0027 
0028 template<typename Base>
0029 double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0030                                   const std::vector<Base> &data,
0031                                   const size_t stride)
0032 {
0033     auto const size = data.size();
0034     auto const theData = data.data();
0035     switch (scaleMethod)
0036     {
0037         // Compute biweight midvariance
0038         case SCALE_BWMV:
0039         {
0040             auto work = std::make_unique<double[]>(size);
0041             auto const adjustedMad = 9.0 * gslMAD(theData, stride, size, work.get());
0042             auto const median = gslMedianFromSortedData(theData, stride, size);
0043 
0044             auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride);
0045             auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride);
0046 
0047             auto ys = std::vector<Base>(size / stride);
0048             std::transform(begin, end, ys.begin(), [ = ](Base x)
0049             {
0050                 return (x - median) / adjustedMad;
0051             });
0052             auto const indicator = [](Base y)
0053             {
0054                 return std::fabs(y) < 1 ? 1 : 0;
0055             };
0056             auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{},
0057                                                    [ = ](Base x, Base y)
0058             {
0059                 return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4);
0060             });
0061             auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{},
0062                                    [ = ](Base y)
0063             {
0064                 return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2));
0065             });
0066             // The -1 is for Bessel's correction.
0067             auto const bottom = bottomSum * (bottomSum - 1);
0068 
0069             return (size / stride) * (top / bottom);
0070 
0071         }
0072         case SCALE_MAD:
0073         {
0074             auto work = std::make_unique<double[]>(size);
0075             return gslMAD(theData, stride, size, work.get());
0076         }
0077         case SCALE_SESTIMATOR:
0078         {
0079             auto work = std::make_unique<Base[]>(size);
0080             return gslSnFromSortedData(theData, stride, size, work.get());
0081         }
0082         case SCALE_QESTIMATOR:
0083         {
0084             auto work = std::make_unique<Base[]>(3 * size);
0085             auto intWork = std::make_unique<int[]>(5 * size);
0086 
0087             return gslQnFromSortedData(theData, stride, size, work.get(), intWork.get());
0088         }
0089         case SCALE_PESTIMATOR:
0090         {
0091             auto work = std::make_unique<Base[]>(3 * size);
0092             auto intWork = std::make_unique<int[]>(5 * size);
0093 
0094             return Pn_from_sorted_data(theData, stride, size, work.get(), intWork.get());
0095         }
0096         case SCALE_VARIANCE:
0097             [[fallthrough]];
0098         default:
0099             return gslVariance(theData, stride, size);
0100     }
0101 }
0102 
0103 
0104 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0105         const std::vector<double> &data,
0106         const size_t stride);
0107 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0108         const std::vector<float> &data,
0109         const size_t stride);
0110 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0111         const std::vector<uint8_t> &data,
0112         const size_t stride);
0113 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0114         const std::vector<uint16_t> &data,
0115         const size_t stride);
0116 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0117         const std::vector<int16_t> &data,
0118         const size_t stride);
0119 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0120         const std::vector<uint32_t> &data,
0121         const size_t stride);
0122 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0123         const std::vector<int32_t> &data,
0124         const size_t stride);
0125 #ifdef GSLHELPERS_INT64
0126 template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
0127         const std::vector<int64_t> &data,
0128         const size_t stride);
0129 #endif
0130 
0131 template<typename Base> double ComputeLocationFromSortedData(
0132     const LocationCalculation locationMethod,
0133     const std::vector<Base> &data,
0134     double trimAmount, const size_t stride)
0135 {
0136     auto const size = data.size();
0137     auto const theData = data.data();
0138 
0139     switch (locationMethod)
0140     {
0141         case LOCATION_MEDIAN:
0142         {
0143             return gslMedianFromSortedData(theData, stride, size);
0144         }
0145         case LOCATION_TRIMMEDMEAN:
0146         {
0147             return gslTrimmedMeanFromSortedData(trimAmount, theData, stride, size);
0148         }
0149         case  LOCATION_GASTWIRTH:
0150         {
0151             return gslGastwirthMeanFromSortedData(theData, stride, size);
0152         }
0153         // TODO: Parameterise
0154         case LOCATION_SIGMACLIPPING:
0155         {
0156             auto const median = gslMedianFromSortedData(theData, stride, size);
0157             if (data.size() > 3)
0158             {
0159                 auto const stddev = gslStandardDeviation(theData, stride, size);
0160                 auto begin = GSLHelpers::make_strided_iter(data.cbegin(), stride);
0161                 auto end = GSLHelpers::make_strided_iter(data.cend(), stride);
0162 
0163                 // Remove samples over 2 standard deviations away.
0164                 auto lower = std::lower_bound(begin, end, (median - stddev * trimAmount));
0165                 auto upper = std::upper_bound(lower, end, (median + stddev * trimAmount));
0166 
0167                 // New mean
0168                 auto sum = std::reduce(lower, upper);
0169                 const int num_remaining = std::distance(lower, upper);
0170                 if (num_remaining > 0) return sum / num_remaining;
0171             }
0172             return median;
0173         }
0174         case LOCATION_MEAN:
0175             [[fallthrough]];
0176         default:
0177             return gslMean(theData, stride, size);
0178 
0179     }
0180 }
0181 
0182 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0183         const std::vector<double> &data,
0184         double trimAmount, const size_t stride);
0185 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0186         const std::vector<float> &data,
0187         double trimAmount, const size_t stride);
0188 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0189         const std::vector<uint8_t> &data,
0190         double trimAmount, const size_t stride);
0191 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0192         const std::vector<uint16_t> &data,
0193         double trimAmount, const size_t stride);
0194 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0195         const std::vector<int16_t> &data,
0196         double trimAmount, const size_t stride);
0197 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0198         const std::vector<uint32_t> &data,
0199         double trimAmount, const size_t stride);
0200 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0201         const std::vector<int32_t> &data,
0202         double trimAmount, const size_t stride);
0203 #ifdef GSLHELPERS_INT64
0204 template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
0205         const std::vector<int64_t> &data,
0206         double trimAmount, const size_t stride);
0207 #endif
0208 
0209 
0210 SampleStatistics ComputeSampleStatistics(std::vector<double> data,
0211         const RobustStatistics::LocationCalculation locationMethod,
0212         const RobustStatistics::ScaleCalculation scaleMethod,
0213         double trimAmount,
0214         const size_t stride)
0215 {
0216     std::sort(data.begin(), data.end());
0217     double location = RobustStatistics::ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride);
0218     double scale = RobustStatistics::ComputeScaleFromSortedData(scaleMethod, data, stride);
0219     double weight = RobustStatistics::ConvertScaleToWeight(scaleMethod, scale);
0220     return RobustStatistics::SampleStatistics{location, scale, weight};
0221 }
0222 
0223 /*
0224   Algorithm to compute the weighted high median in O(n) time.
0225   The whimed is defined as the smallest a[j] such that the sum
0226   of the weights of all a[i] <= a[j] is strictly greater than
0227   half of the total weight.
0228   Arguments:
0229   a: double array containing the observations
0230   n: number of observations
0231   w: array of (int/double) weights of the observations.
0232 */
0233 
0234 template<typename Base = double> Base whimed(Base* a, int * w, int n, Base* a_cand, Base* a_srt, int * w_cand)
0235 {
0236     int n2, i, kcand;
0237     /* sum of weights: `int' do overflow when  n ~>= 1e5 */
0238     int64_t wleft, wmid, wright, w_tot, wrest;
0239 
0240     Base trial;
0241 
0242     w_tot = 0;
0243     for (i = 0; i < n; ++i)
0244         w_tot += w[i];
0245 
0246     wrest = 0;
0247 
0248     /* REPEAT : */
0249     do
0250     {
0251         for (i = 0; i < n; ++i)
0252             a_srt[i] = a[i];
0253 
0254         n2 = n / 2; /* =^= n/2 +1 with 0-indexing */
0255 
0256         gslSort(a_srt, 1, n);
0257 
0258         trial = a_srt[n2];
0259 
0260         wleft = 0;
0261         wmid = 0;
0262         wright = 0;
0263 
0264         for (i = 0; i < n; ++i)
0265         {
0266             if (a[i] < trial)
0267                 wleft += w[i];
0268             else if (a[i] > trial)
0269                 wright += w[i];
0270             else
0271                 wmid += w[i];
0272         }
0273 
0274         /* wleft = sum_{i; a[i]  < trial}  w[i]
0275          * wmid  = sum_{i; a[i] == trial}  w[i] at least one 'i' since trial is one a[]!
0276          * wright= sum_{i; a[i]  > trial}  w[i]
0277          */
0278         kcand = 0;
0279         if (2 * (wrest + wleft) > w_tot)
0280         {
0281             for (i = 0; i < n; ++i)
0282             {
0283                 if (a[i] < trial)
0284                 {
0285                     a_cand[kcand] = a[i];
0286                     w_cand[kcand] = w[i];
0287                     ++kcand;
0288                 }
0289             }
0290         }
0291         else if (2 * (wrest + wleft + wmid) <= w_tot)
0292         {
0293             for (i = 0; i < n; ++i)
0294             {
0295                 if (a[i] > trial)
0296                 {
0297                     a_cand[kcand] = a[i];
0298                     w_cand[kcand] = w[i];
0299                     ++kcand;
0300                 }
0301             }
0302 
0303             wrest += wleft + wmid;
0304         }
0305         else
0306         {
0307             return trial;
0308         }
0309 
0310         n = kcand;
0311         for (i = 0; i < n; ++i)
0312         {
0313             a[i] = a_cand[i];
0314             w[i] = w_cand[i];
0315         }
0316     }
0317     while(1);
0318 }
0319 
0320 
0321 /*
0322 gsl_stats_Qn0_from_sorted_data()
0323   Efficient algorithm for the scale estimator:
0324     Q_n0 = { |x_i - x_j|; i<j }_(k) [ = Qn without scaling ]
0325 i.e. the k-th order statistic of the |x_i - x_j|, where:
0326 k = (floor(n/2) + 1 choose 2)
0327 Inputs: sorted_data - sorted array containing the observations
0328         stride      - stride
0329         n           - length of 'sorted_data'
0330         work        - workspace of length 3n of type BASE
0331         work_int    - workspace of length 5n of type int
0332 Return: Q_n statistic (without scale/correction factor); same type as input data
0333 */
0334 
0335 template<typename Base = double> double Pn0k_from_sorted_data(const Base sorted_data[],
0336         const double k,
0337         const size_t stride,
0338         const size_t n,
0339         Base work[],
0340         int work_int[])
0341 {
0342     const int ni = (int) n;
0343     Base * a_srt = &work[n];
0344     Base * a_cand = &work[2 * n];
0345 
0346     int *left = &work_int[0];
0347     int *right = &work_int[n];
0348     int *p = &work_int[2 * n];
0349     int *q = &work_int[3 * n];
0350     int *weight = &work_int[4 * n];
0351 
0352     Base trial = (Base)0.0;
0353     int found = 0;
0354 
0355     int h, i, j, jh;
0356 
0357     /* following should be `long long int' : they can be of order n^2 */
0358     int64_t knew, nl, nr, sump, sumq;
0359 
0360     /* check for quick return */
0361     if (n < 2)
0362         return (Base)0.0;
0363 
0364     h = n / 2 + 1;
0365 
0366     for (i = 0; i < ni; ++i)
0367     {
0368         left[i] = ni - i + 1;
0369         right[i] = (i <= h) ? ni : ni - (i - h);
0370 
0371         /* the n - (i-h) is from the paper; original code had `n' */
0372     }
0373 
0374     nl = (int64_t)n * (n + 1) / 2;
0375     nr = (int64_t)n * n;
0376     knew = k + nl;/* = k + (n+1 \over 2) */
0377 
0378     /* L200: */
0379     while (!found && nr - nl > ni)
0380     {
0381         j = 0;
0382         /* Truncation to float : try to make sure that the same values are got later (guard bits !) */
0383         for (i = 1; i < ni; ++i)
0384         {
0385             if (left[i] <= right[i])
0386             {
0387                 weight[j] = right[i] - left[i] + 1;
0388                 jh = left[i] + weight[j] / 2;
0389                 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jh) * stride]) / 2 ;
0390                 ++j;
0391             }
0392         }
0393 
0394         trial = whimed(work, weight, j, a_cand, a_srt, /*iw_cand*/ p);
0395 
0396         j = 0;
0397         for (i = ni - 1; i >= 0; --i)
0398         {
0399             while (j < ni && ((double)(sorted_data[i * stride] + sorted_data[(ni - j - 1) * stride])) / 2 < trial)
0400                 ++j;
0401 
0402             p[i] = j;
0403         }
0404 
0405         j = ni + 1;
0406         for (i = 0; i < ni; ++i)
0407         {
0408             while ((double)(sorted_data[i * stride] + sorted_data[(ni - j + 1) * stride]) / 2 > trial)
0409                 --j;
0410 
0411             q[i] = j;
0412         }
0413 
0414         sump = 0;
0415         sumq = 0;
0416 
0417         for (i = 0; i < ni; ++i)
0418         {
0419             sump += p[i];
0420             sumq += q[i] - 1;
0421         }
0422 
0423         if (knew <= sump)
0424         {
0425             for (i = 0; i < ni; ++i)
0426                 right[i] = p[i];
0427 
0428             nr = sump;
0429         }
0430         else if (knew > sumq)
0431         {
0432             for (i = 0; i < ni; ++i)
0433                 left[i] = q[i];
0434 
0435             nl = sumq;
0436         }
0437         else /* sump < knew <= sumq */
0438         {
0439             found = 1;
0440         }
0441     } /* while */
0442 
0443     if (found)
0444     {
0445         return trial;
0446     }
0447     else
0448     {
0449         j = 0;
0450         for (i = 1; i < ni; ++i)
0451         {
0452             int jj;
0453 
0454             for (jj = left[i]; jj <= right[i]; ++jj)
0455             {
0456                 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jj) * stride]) / 2;
0457                 j++;
0458             }/* j will be = sum_{i=2}^n ((right[i] + left[i] + 1)_{+})/2  */
0459         }
0460 
0461         /* return pull(work, j - 1, knew - nl)  : */
0462         knew -= (nl + 1); /* -1: 0-indexing */
0463 
0464         /* sort work array */
0465         gslSort(work, 1, j);
0466 
0467         return (work[knew]);
0468     }
0469 }
0470 
0471 
0472 template<typename Base>
0473 double Pn_from_sorted_data(const Base sorted_data[],
0474                            const size_t stride,
0475                            const size_t n,
0476                            Base work[],
0477                            int work_int[])
0478 {
0479     if (n <= 2)
0480         return sqrt(gslVariance(sorted_data, stride, n));
0481 
0482     double upper = Pn0k_from_sorted_data(sorted_data, (3.0 / 4.0), stride, n, work, work_int);
0483     double lower = Pn0k_from_sorted_data(sorted_data, (1.0 / 4.0), stride, n, work, work_int);
0484 
0485     const double asymptoticCorrection = 1 / 0.9539;
0486 
0487     auto const asymptoticEstimate = asymptoticCorrection * (upper - lower);
0488 
0489     static double correctionFactors[] = {1.128, 1.303, 1.109, 1.064, 1.166, 1.103, 1.087, 1.105, 1.047, 1.063, 1.057,
0490                                          1.040, 1.061, 1.047, 1.043, 1.048, 1.031, 1.037, 1.035, 1.028, 1.036, 1.030,
0491                                          1.029, 1.032, 1.023, 1.025, 1.024, 1.021, 1.026, 1.022, 1.021, 1.023, 1.018,
0492                                          1.020, 1.019, 1.017, 1.020, 1.018, 1.017, 1.018, 1.015, 1.016, 1.016, 1.014,
0493                                          1.016, 1.015, 1.014, 1.015
0494                                         };
0495 
0496     if (n <= 42)
0497     {
0498         return  asymptoticEstimate * correctionFactors[n - 2];
0499     }
0500     else
0501     {
0502         return asymptoticEstimate * (n / (n - 0.7));
0503     }
0504 
0505 }
0506 
0507 
0508 } // namespace Mathematics