File indexing completed on 2024-05-12 03:47:20

0001 /*
0002     File                 : VariableParser.h
0003     Project              : LabPlot
0004     Description          : Variable parser for different CAS backends
0005     --------------------------------------------------------------------
0006     SPDX-FileCopyrightText: 2015 Garvit Khatri <garvitdelhi@gmail.com>
0007     SPDX-FileCopyrightText: 2016 Alexander Semke <alexander.semke@web.de>
0008     SPDX-License-Identifier: GPL-2.0-or-later
0009 */
0010 
0011 #include "VariableParser.h"
0012 #include "backend/lib/trace.h"
0013 #include <QRegularExpressionMatch>
0014 #include <QStringList>
0015 #include <cmath> // NAN
0016 
0017 VariableParser::VariableParser(QString name, QString value)
0018     : m_backendName(std::move(name))
0019     , m_string(std::move(value)) {
0020     PERFTRACE(QLatin1String("parsing variable"));
0021     if (m_backendName.compare(QStringLiteral("Maxima"), Qt::CaseInsensitive) == 0)
0022         parseMaximaValues();
0023     else if ((m_backendName.compare(QStringLiteral("Python"), Qt::CaseInsensitive) == 0)
0024              || (m_backendName.compare(QStringLiteral("Python 3"), Qt::CaseInsensitive) == 0)
0025              || (m_backendName.compare(QStringLiteral("Python 2"), Qt::CaseInsensitive) == 0))
0026         parsePythonValues();
0027     else if (m_backendName.compare(QStringLiteral("Sage"), Qt::CaseInsensitive) == 0)
0028         parsePythonValues();
0029     else if (m_backendName.compare(QStringLiteral("R"), Qt::CaseInsensitive) == 0)
0030         parseRValues();
0031     else if (m_backendName.compare(QStringLiteral("Julia"), Qt::CaseInsensitive) == 0)
0032         parsePythonValues();
0033     else if (m_backendName.compare(QStringLiteral("Octave"), Qt::CaseInsensitive) == 0)
0034         parseOctaveValues();
0035 }
0036 
0037 QVector<int>& VariableParser::integers() {
0038     return *static_cast<QVector<int>*>(m_values);
0039 }
0040 
0041 QVector<qint64>& VariableParser::bigInt() {
0042     return *static_cast<QVector<qint64>*>(m_values);
0043 }
0044 
0045 QVector<double>& VariableParser::doublePrecision() {
0046     return *static_cast<QVector<double>*>(m_values);
0047 }
0048 
0049 QVector<QDateTime>& VariableParser::dateTime() {
0050     return *static_cast<QVector<QDateTime>*>(m_values);
0051 }
0052 
0053 QVector<QString>& VariableParser::text() {
0054     return *static_cast<QVector<QString>*>(m_values);
0055 }
0056 
0057 void VariableParser::parseMaximaValues() {
0058     if (m_string.count(QStringLiteral("[")) > 2)
0059         return;
0060 
0061     Datatype dataType = Datatype::float64;
0062     if (m_string.startsWith(QLatin1String("[\""))) // Maxime uses " to quote string values in the output
0063         dataType = Datatype::text;
0064 
0065     m_string = m_string.replace(QStringLiteral("["), QString());
0066     m_string = m_string.replace(QStringLiteral("]"), QString());
0067     m_string = m_string.trimmed();
0068 
0069     const QStringList valueStringList = m_string.split(QStringLiteral(","));
0070     parseValues(valueStringList, dataType);
0071 }
0072 
0073 /*!
0074  * Python containers that can be parsed:
0075  * * List (a collection which is ordered and changeable, allows duplicate members)
0076  * * Tuple (collection which is ordered and unchangeable, allows duplicate members)
0077  * * Set (collection which is unordered, unchangeable and unindexed, no duplicate members)
0078  * * Numpy's array (with and without the explicit specification of the data type)
0079  * */
0080 void VariableParser::parsePythonValues() {
0081     QStringList valueStringList;
0082     QString dataType = QStringLiteral("float64");
0083     m_string = m_string.trimmed();
0084     if (m_string.startsWith(QLatin1String("array"))) {
0085         // parse numpy arrays, string representation like array([1,2,3,4,5]) or
0086         //  array([1, 2,3], dtype=uint32)
0087 
0088         // we don't handle array of arrays, e.g. the output of 'np.ones((2,2), dtype=np.int16)'
0089         // which is 'array([[1, 1], [1, 1]], dtype=int16)'
0090         if (m_string.count(QStringLiteral("[")) > 2)
0091             return;
0092 
0093         QRegularExpressionMatch match;
0094         auto numpyDatatypeRegex = QStringLiteral("\\s*,\\s*dtype='{0,1}[a-zA-Z0-9\\[\\]]*'{0,1}");
0095         m_string.indexOf(QRegularExpression(numpyDatatypeRegex), 0, &match);
0096         if (match.isValid() && match.captured() != QString())
0097             dataType = match.captured().replace(QStringLiteral("'"), QString()).replace(QStringLiteral(", dtype="), QString());
0098         m_string = m_string.replace(QStringLiteral("array(["), QString());
0099         m_string = m_string.replace(QRegExp(numpyDatatypeRegex), QString());
0100         m_string = m_string.replace(QStringLiteral("])"), QString());
0101     } else if (m_string.startsWith(QStringLiteral("["))) {
0102         // parse python's lists
0103         if (m_string.startsWith(QLatin1String("['"))) // python uses ' to quote string values in the output
0104             dataType = QStringLiteral("text");
0105         m_string = m_string.replace(QStringLiteral("["), QString());
0106         m_string = m_string.replace(QStringLiteral("]"), QString());
0107     } else if (m_string.startsWith(QStringLiteral("("))) {
0108         // parse python's tuples
0109         if (m_string.startsWith(QLatin1String("('")))
0110             dataType = QStringLiteral("text");
0111         m_string = m_string.replace(QStringLiteral("("), QString());
0112         m_string = m_string.replace(QStringLiteral(")"), QString());
0113     } else if (m_string.startsWith(QStringLiteral("{"))) {
0114         // parse python's sets
0115         if (m_string.startsWith(QLatin1String("{'")))
0116             dataType = QStringLiteral("text");
0117         m_string = m_string.replace(QStringLiteral("{"), QString());
0118         m_string = m_string.replace(QStringLiteral("}"), QString());
0119     } else {
0120         return;
0121     }
0122 
0123     // Fast method to determine the separator. It is assumed if at least one
0124     // commas exist, the comma is the separator
0125     if (m_string.indexOf(QLatin1Char(',')) != -1)
0126         valueStringList = m_string.split(QStringLiteral(","));
0127     else
0128         valueStringList = m_string.split(QStringLiteral(" "));
0129 
0130     parseValues(valueStringList, convertNumpyDatatype(dataType));
0131 }
0132 
0133 void VariableParser::parseRValues() {
0134     m_string = m_string.trimmed();
0135     const QStringList valueStringList = m_string.split(QStringLiteral(", "));
0136     parseValues(valueStringList);
0137 }
0138 
0139 void VariableParser::parseOctaveValues() {
0140     m_string = m_string.trimmed();
0141 
0142     QStringList valueStringList;
0143     const QStringList tempStringList = m_string.split(QLatin1Char('\n'));
0144     if (m_string.indexOf(QStringLiteral("; ")) != -1) { // parse column vectors
0145         for (const QString& values : tempStringList)
0146             valueStringList << values.split(QStringLiteral("; "));
0147     } else {
0148         for (const QString& values : tempStringList) { // parse row vectors
0149             // TODO: in newer version of Cantor the rows with "Columns..." were removed already.
0150             // we can stop looking for this substring in some point in time later.
0151             if (!values.isEmpty() && !values.trimmed().startsWith(QStringLiteral("Columns")))
0152                 valueStringList << values.split(QLatin1Char(' '));
0153         }
0154     }
0155 
0156     valueStringList.removeAll(QString());
0157     parseValues(valueStringList);
0158 }
0159 
0160 bool VariableParser::isParsed() {
0161     return m_parsed;
0162 }
0163 
0164 void VariableParser::clearValues() {
0165     switch (m_dataType) {
0166     case AbstractColumn::ColumnMode::Integer:
0167         delete static_cast<QVector<int>*>(m_values);
0168         break;
0169     case AbstractColumn::ColumnMode::BigInt:
0170         delete static_cast<QVector<qlonglong>*>(m_values);
0171         break;
0172     case AbstractColumn::ColumnMode::Double:
0173         delete static_cast<QVector<qreal>*>(m_values);
0174         break;
0175     case AbstractColumn::ColumnMode::Day:
0176     case AbstractColumn::ColumnMode::Month:
0177     case AbstractColumn::ColumnMode::DateTime:
0178         delete static_cast<QVector<QDateTime>*>(m_values);
0179         break;
0180     case AbstractColumn::ColumnMode::Text:
0181         delete static_cast<QVector<QString>*>(m_values);
0182         break;
0183     }
0184 }
0185 
0186 VariableParser::Datatype VariableParser::convertNumpyDatatype(const QString& d) {
0187     if (d == QStringLiteral("uint8"))
0188         return Datatype::uint8;
0189     else if (d == QStringLiteral("int8"))
0190         return Datatype::int8;
0191     else if (d == QStringLiteral("uint16"))
0192         return Datatype::uint16;
0193     else if (d == QStringLiteral("int16"))
0194         return Datatype::int16;
0195     else if (d == QStringLiteral("uint32"))
0196         return Datatype::uint32;
0197     else if (d == QStringLiteral("int32"))
0198         return Datatype::int32;
0199     else if (d == QStringLiteral("uint64"))
0200         return Datatype::uint64;
0201     else if (d == QStringLiteral("int64"))
0202         return Datatype::int64;
0203     else if (d == QStringLiteral("float32"))
0204         return Datatype::float32;
0205     else if (d == QStringLiteral("float64"))
0206         return Datatype::float64;
0207     else if (d == QStringLiteral("datetime64[ms]"))
0208         return Datatype::datetime64_ms;
0209     else if (d == QStringLiteral("datetime64[s]"))
0210         return Datatype::datetime64_s;
0211     else if (d == QStringLiteral("datetime64[m]"))
0212         return Datatype::datetime64_m;
0213     else if (d == QStringLiteral("datetime64[h]"))
0214         return Datatype::datetime64_h;
0215     else if (d == QStringLiteral("datetime64[D]") || d == QStringLiteral("datetime64"))
0216         return Datatype::datetime64_D;
0217 
0218     // as default text is used
0219     return Datatype::text;
0220 }
0221 
0222 void VariableParser::parseValues(const QStringList& values, VariableParser::Datatype dataType) {
0223     PERFTRACE(QStringLiteral("parsing variable values string list"));
0224     switch (dataType) {
0225     case Datatype::uint8:
0226     case Datatype::int8:
0227     case Datatype::uint16:
0228     case Datatype::int16:
0229     case Datatype::int32:
0230         m_values = new QVector<int>(values.size());
0231         m_dataType = AbstractColumn::ColumnMode::Integer;
0232         break;
0233     case Datatype::uint32:
0234     case Datatype::int64:
0235         m_values = new QVector<qint64>(values.size());
0236         m_dataType = AbstractColumn::ColumnMode::BigInt;
0237         break;
0238     case Datatype::uint64: // larger than qint64!
0239     case Datatype::float32:
0240     case Datatype::float64:
0241         m_values = new QVector<double>(values.size());
0242         m_dataType = AbstractColumn::ColumnMode::Double;
0243         break;
0244     case Datatype::datetime64_D:
0245     case Datatype::datetime64_h:
0246     case Datatype::datetime64_m:
0247     case Datatype::datetime64_s:
0248     case Datatype::datetime64_ms:
0249         m_values = new QVector<QDateTime>(values.size());
0250         m_dataType = AbstractColumn::ColumnMode::DateTime;
0251         break;
0252     case Datatype::text:
0253         m_values = new QVector<QString>(values.size());
0254         m_dataType = AbstractColumn::ColumnMode::Text;
0255     }
0256 
0257     int i = 0;
0258     bool isNumber = false;
0259     switch (dataType) {
0260     case Datatype::uint8:
0261     case Datatype::int8:
0262     case Datatype::uint16:
0263     case Datatype::int16:
0264     case Datatype::int32: {
0265         for (const auto& v : values) {
0266             int value = v.trimmed().toUInt(&isNumber);
0267 
0268             // accept the variable only if there is at least one numerical value in the array.
0269             if (isNumber) {
0270                 if (!m_parsed)
0271                     m_parsed = true;
0272             } else
0273                 value = 0;
0274 
0275             integers()[i] = value;
0276             i++;
0277         }
0278         break;
0279     }
0280     case Datatype::uint32:
0281     case Datatype::int64: {
0282         for (const auto& v : values) {
0283             qint64 value = v.trimmed().toLongLong(&isNumber);
0284             if (isNumber) {
0285                 if (!m_parsed)
0286                     m_parsed = true;
0287             } else
0288                 value = 0;
0289 
0290             bigInt()[i] = value;
0291             i++;
0292         }
0293         break;
0294     }
0295     case Datatype::uint64:
0296     case Datatype::float32:
0297     case Datatype::float64: {
0298         // use the first value in the vector to check whether we need to consider
0299         // the locale specific representation of floats (for example, R's output is locale specific)
0300         bool useLocale = false;
0301         if (!values.isEmpty()) {
0302             values.constFirst().trimmed().toDouble(&isNumber);
0303             if (!isNumber)
0304                 useLocale = true; // direct conversion has failed, use QLocale to parse the strings further below
0305         }
0306 
0307         if (!useLocale) {
0308             for (const auto& v : values) {
0309                 double value = v.trimmed().toDouble(&isNumber);
0310                 if (isNumber) {
0311                     if (!m_parsed)
0312                         m_parsed = true;
0313                 } else
0314                     value = NAN;
0315 
0316                 doublePrecision()[i] = value;
0317                 i++;
0318             }
0319         } else {
0320             QLocale locale;
0321             for (const auto& v : values) {
0322                 double value = locale.toDouble(v.trimmed(), &isNumber);
0323                 if (isNumber) {
0324                     if (!m_parsed)
0325                         m_parsed = true;
0326                 } else
0327                     value = NAN;
0328 
0329                 doublePrecision()[i] = value;
0330                 i++;
0331             }
0332         }
0333 
0334         break;
0335     }
0336     // Adding timezone indicator "Z" is necessary, because specific dates like
0337     // 2017-03-26T02:14:34.000 are not available in different timezones.
0338     // https://forum.qt.io/topic/133181/qdatetime-fromstring-returns-invalid-datetime
0339     case Datatype::datetime64_D:
0340         for (const auto& v : values) {
0341 #if (QT_VERSION >= QT_VERSION_CHECK(5, 14, 0))
0342             dateTime()[i] = QDate::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"), Qt::ISODate).startOfDay(Qt::UTC);
0343 #else
0344             dateTime()[i] = QDateTime(QDate::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"), Qt::ISODate));
0345 #endif
0346             m_parsed = true;
0347             i++;
0348         }
0349         break;
0350     case Datatype::datetime64_h:
0351         for (const auto& v : values) {
0352             dateTime()[i] = QDateTime::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"),
0353                                                   QStringLiteral("yyyy-MM-ddThht")); // last t is important. It is the timezone
0354             m_parsed = true;
0355             i++;
0356         }
0357         break;
0358     case Datatype::datetime64_m:
0359         for (const auto& v : values) {
0360             dateTime()[i] = QDateTime::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"),
0361                                                   QStringLiteral("yyyy-MM-ddThh:mmt")); // last t is important. It is the timezone
0362             m_parsed = true;
0363             i++;
0364         }
0365         break;
0366     case Datatype::datetime64_s:
0367         for (const auto& v : values) {
0368             dateTime()[i] = QDateTime::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"), Qt::ISODate);
0369             m_parsed = true;
0370             i++;
0371         }
0372         break;
0373     case Datatype::datetime64_ms:
0374         for (const auto& v : values) {
0375             dateTime()[i] = QDateTime::fromString(v.trimmed().replace(QStringLiteral("'"), QString()) + QStringLiteral("Z"), Qt::ISODateWithMs);
0376             m_parsed = true;
0377             i++;
0378         }
0379         break;
0380     case Datatype::text:
0381         for (const auto& v : values) {
0382             text()[i] = v;
0383             m_parsed = true;
0384             i++;
0385         }
0386         break;
0387     }
0388 }