File indexing completed on 2024-04-21 03:41:36

0001 /*
0002     SPDX-FileCopyrightText: 2005 Inge Wallin <inge@lysator.liu.se>
0003     SPDX-License-Identifier: GPL-2.0-or-later
0004 */
0005 
0006 #include "moleculeparser.h"
0007 
0008 #include <cctype>
0009 
0010 #include "kalzium_libscience_debug.h"
0011 #include <QFile>
0012 #include <QStandardPaths>
0013 
0014 // ================================================================
0015 //                    class ElementCountMap
0016 
0017 ElementCountMap::ElementCountMap()
0018 {
0019     m_map.clear();
0020 }
0021 
0022 ElementCountMap::~ElementCountMap() = default;
0023 
0024 ElementCount *ElementCountMap::search(Element *_element)
0025 {
0026     for (ElementCount *c : std::as_const(m_map)) {
0027         if (c->element() == _element) {
0028             return c;
0029         }
0030     }
0031 
0032     return nullptr;
0033 }
0034 
0035 void ElementCountMap::add(ElementCountMap &_map)
0036 {
0037     for (ElementCount *c : std::as_const(_map.m_map)) {
0038         add(c->m_element, c->m_count);
0039     }
0040 }
0041 
0042 QList<Element *> ElementCountMap::elements() const
0043 {
0044     QList<Element *> list;
0045 
0046     for (ElementCount *c : std::as_const(m_map)) {
0047         Element *e = c->m_element;
0048         if (!list.contains(e)) {
0049             list << e;
0050         }
0051     }
0052 
0053     return list;
0054 }
0055 
0056 void ElementCountMap::add(Element *_element, int _count)
0057 {
0058     ElementCount *elemCount;
0059 
0060     elemCount = search(_element);
0061     if (elemCount) {
0062         elemCount->m_count += _count;
0063     } else {
0064         m_map.append(new ElementCount(_element, _count));
0065     }
0066 }
0067 
0068 void ElementCountMap::multiply(int _factor)
0069 {
0070     for (ElementCount *count : std::as_const(m_map)) {
0071         count->multiply(_factor);
0072     }
0073 }
0074 
0075 // ================================================================
0076 //                    class MoleculeParser
0077 
0078 MoleculeParser::MoleculeParser(const QList<Element *> &list)
0079     : Parser()
0080 {
0081     m_elementList = list;
0082     m_aliasList = new QSet<QString>;
0083 }
0084 
0085 MoleculeParser::MoleculeParser(const QString &_str)
0086     : Parser(_str)
0087 {
0088     m_aliasList = new QSet<QString>;
0089 }
0090 
0091 MoleculeParser::~MoleculeParser()
0092 {
0093     delete m_aliasList;
0094 }
0095 
0096 // ----------------------------------------------------------------
0097 //                            public methods
0098 
0099 // Try to parse the molecule and get the weight of it.
0100 //
0101 // This method also acts as the main loop.
0102 
0103 bool MoleculeParser::weight(const QString &_shortMoleculeString, double *_resultMass, ElementCountMap *_resultMap)
0104 {
0105     if (_shortMoleculeString.isEmpty()) {
0106         return false;
0107     }
0108     // Clear the list of aliases and start filling it again.
0109 
0110     m_aliasList->clear();
0111     QString _moleculeString;
0112     // Clear the result variables and set m_error to false
0113     _resultMap->clear();
0114     m_error = false;
0115     *_resultMass = 0.0;
0116 
0117     // Expand the molecule string
0118     // Example : MeOH -> (CH3)OH
0119     qCDebug(KALZIUM_LIBSCIENCE_LOG) << _shortMoleculeString << "is going to be expanded";
0120     _moleculeString = expandFormula(_shortMoleculeString);
0121     qCDebug(KALZIUM_LIBSCIENCE_LOG) << _moleculeString << "is the expanded string";
0122 
0123     // Now set the expanded string
0124     // Initialize the parsing process, and parse te molecule.
0125     start(_moleculeString);
0126     parseSubmolecule(_resultMass, _resultMap);
0127 
0128     if (nextToken() != -1) {
0129         return false;
0130     }
0131 
0132     if (m_error) { // there was an error in the input...
0133         return false;
0134     }
0135 
0136     return true;
0137 }
0138 
0139 QSet<QString> MoleculeParser::aliasList()
0140 {
0141     return *m_aliasList;
0142 }
0143 // ----------------------------------------------------------------
0144 //            helper methods for the public methods
0145 
0146 // Parse a submolecule.  This is a list of terms.
0147 //
0148 
0149 bool MoleculeParser::parseSubmolecule(double *_resultMass, ElementCountMap *_resultMap)
0150 {
0151     double subMass = 0.0;
0152     ElementCountMap subMap;
0153 
0154     *_resultMass = 0.0;
0155     _resultMap->clear();
0156     while (parseTerm(&subMass, &subMap)) {
0157         // qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Parsed a term, weight = " << subresult;
0158 
0159         // Add the mass and composition of the submolecule to the total.
0160         *_resultMass += subMass;
0161         _resultMap->add(subMap);
0162     }
0163 
0164     return true;
0165 }
0166 
0167 // Parse a term within the molecule, i.e. a single atom or a
0168 // submolecule within parenthesis followed by an optional number.
0169 // Examples: Bk, Mn2, (COOH)2
0170 //
0171 // Return true if correct, otherwise return false.
0172 
0173 // If correct, the mass of the term is returned in *_resultMass, and
0174 // the flattened composition of the molecule in *_resultMap.
0175 //
0176 
0177 bool MoleculeParser::parseTerm(double *_resultMass, ElementCountMap *_resultMap)
0178 {
0179     *_resultMass = 0.0;
0180     _resultMap->clear();
0181 
0182     if (nextToken() == ELEMENT_TOKEN) {
0183         // qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Parsed an element: " << m_elementVal->symbol();
0184         *_resultMass = m_elementVal->dataAsVariant(ChemicalDataObject::mass).toDouble();
0185         _resultMap->add(m_elementVal, 1);
0186 
0187         getNextToken();
0188     } else if (nextToken() == '(') {
0189         // A submolecule.
0190 
0191         getNextToken();
0192         parseSubmolecule(_resultMass, _resultMap);
0193 
0194         // Must end in a ")".
0195         if (nextToken() == ')') {
0196             // qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Parsed a submolecule. weight = " << *_result;
0197             getNextToken();
0198         } else {
0199             return false;
0200         }
0201     } else {
0202         // Neither an element nor a list within ().
0203         return false;
0204     }
0205 
0206     // Optional number.
0207     if (nextToken() == INT_TOKEN) {
0208         // qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Parsed a number: " << intVal();
0209 
0210         *_resultMass *= intVal();
0211         _resultMap->multiply(intVal());
0212 
0213         getNextToken();
0214     }
0215 
0216     qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Weight of term = " << *_resultMass;
0217     return true;
0218 }
0219 
0220 // ----------------------------------------------------------------
0221 //                           protected methods
0222 
0223 // Extend Parser::getNextToken with elements.
0224 
0225 int MoleculeParser::getNextToken()
0226 {
0227     QString name;
0228 
0229 #if 0
0230     qCDebug(KALZIUM_LIBSCIENCE_LOG) << "getNextToken(): Next character = "
0231           << nextChar() << endl;
0232 #endif
0233 
0234     // Check if the token is an element name.
0235     if ('A' <= nextChar() && nextChar() <= 'Z') {
0236         name = char(nextChar());
0237         getNextChar();
0238 
0239         if ('a' <= nextChar() && nextChar() <= 'z') {
0240             name.append(char(nextChar()));
0241             getNextChar();
0242         }
0243 
0244         // Look up the element from the name..
0245         m_elementVal = lookupElement(name);
0246         if (m_elementVal) {
0247             m_nextToken = ELEMENT_TOKEN;
0248         } else {
0249             m_nextToken = -1;
0250         }
0251     } else {
0252         return Parser::getNextToken();
0253     }
0254 
0255     return m_nextToken;
0256 }
0257 
0258 // ----------------------------------------------------------------
0259 //                          private methods
0260 
0261 Element *MoleculeParser::lookupElement(const QString &_name)
0262 {
0263     qCDebug(KALZIUM_LIBSCIENCE_LOG) << "looking up " << _name;
0264 
0265     for (Element *e : std::as_const(m_elementList)) {
0266         if (e->dataAsVariant(ChemicalDataObject::symbol) == _name) {
0267             qCDebug(KALZIUM_LIBSCIENCE_LOG) << "Found element " << _name;
0268             return e;
0269         }
0270     }
0271 
0272     // if there is an error make m_error true.
0273     m_error = true;
0274 
0275     qCDebug(KALZIUM_LIBSCIENCE_LOG) << "no such element!: " << _name;
0276 
0277     return nullptr;
0278 }
0279 
0280 QString MoleculeParser::expandFormula(const QString &_shortString)
0281 {
0282     QString _fullString; // the expanded string that will be returned
0283     QString::const_iterator i; // iterator
0284     QString temp; // a temporary string that will contain a single element/group
0285     QString expandedTerm; // expansion of a particular term
0286 
0287     // Go through all letters in the string.
0288     for (i = _shortString.constBegin(); i != _shortString.constEnd();) {
0289         temp = QLatin1String("");
0290 
0291         // If a capital letter was found
0292         if ((*i).category() == QChar::Letter_Uppercase) {
0293             temp += (*i);
0294             ++i;
0295 
0296             // A small letter following a capital letter
0297             if (i != _shortString.end() && (*i).category() == QChar::Letter_Lowercase) {
0298                 temp += (*i);
0299                 ++i;
0300             }
0301 
0302             // If element is found, append it
0303             if (lookupElement(temp)) {
0304                 _fullString += temp;
0305             } else if (!((expandedTerm = expandTerm(temp)).isEmpty())) {
0306                 // If an expansion was made, return the expansion
0307                 qCDebug(KALZIUM_LIBSCIENCE_LOG) << "expanded" << temp << "to" << expandedTerm;
0308                 _fullString += '(' + expandedTerm + ')';
0309             } else { // invalid term, append it. (Validation is done later anyway.)
0310                 _fullString += temp;
0311             }
0312         } else if (*i == '(') { // Return parenthesis as and when found
0313             _fullString += '(';
0314             ++i;
0315         } else if (*i == ')') {
0316             _fullString += ')';
0317             ++i;
0318         } else if (*i == '#') { // If # is found, we have a short-form eg #EDTA#
0319             ++i; // go to the next character
0320             // Get the term between # and #
0321             while (*i != '#' && i != _shortString.constEnd()) {
0322                 temp += *i;
0323                 ++i;
0324             }
0325             // If the string ended, just add the part that comes after #
0326             if (i == _shortString.constEnd()) {
0327                 _fullString += temp;
0328                 break;
0329             } else if (!temp.isEmpty()) { // else expand the term between # and #
0330                 // if alias is not found, just add without expanding the term
0331                 if ((expandedTerm = expandTerm(temp)).isEmpty()) {
0332                     _fullString += temp;
0333                 } else { // else add the expanded term
0334                     _fullString += expandedTerm;
0335                 }
0336             }
0337             ++i;
0338         } else if ((*i).category() == QChar::Number_DecimalDigit) { // If number was found, return it
0339             _fullString += *i;
0340             ++i;
0341         } else { // invalid character, return it, validation is done again later
0342             _fullString += *i;
0343             ++i;
0344             qCDebug(KALZIUM_LIBSCIENCE_LOG) << *i << "invalid character!";
0345         }
0346     }
0347 
0348     // Reset all "element not found" errors.
0349     m_error = false;
0350     return _fullString;
0351 }
0352 
0353 QString MoleculeParser::expandTerm(const QString &_group)
0354 {
0355     QString shortForm, fullForm; // short form (symbol) and full form (expansion)
0356     QString temp; // A temporary QString used in Regular expressions
0357 
0358     // Search in User defined aliases.
0359     QString fileName = QStandardPaths::locate(QStandardPaths::GenericDataLocation, QStringLiteral("libkdeedu/data/symbols2.csv"));
0360     QFile file(fileName);
0361 
0362     // Check file validity
0363     if (!(!file.open(QIODevice::ReadOnly | QIODevice::Text))) {
0364         qCDebug(KALZIUM_LIBSCIENCE_LOG) << fileName << " opened";
0365         QTextStream in(&file);
0366 
0367         // Get all shortForms and fullForms in the file.
0368         while (!in.atEnd()) {
0369             QString line = in.readLine();
0370             shortForm = line.section(',', 0, 0);
0371             shortForm.remove(QChar('\"'));
0372             fullForm = line.section(',', 1, 1);
0373             fullForm.remove(QChar('\"'));
0374 
0375             // If short term is found, return fullForm
0376             if (shortForm == _group) {
0377                 *m_aliasList << (_group + " : " + fullForm);
0378                 return (fullForm);
0379             }
0380         }
0381     } else {
0382         qCDebug(KALZIUM_LIBSCIENCE_LOG) << fileName << " could not be opened!";
0383     }
0384 
0385     // Find the system defined aliases
0386     // Open the file
0387     fileName = QStandardPaths::locate(QStandardPaths::GenericDataLocation, QStringLiteral("libkdeedu/data/symbols.csv"));
0388     QFile file2(fileName);
0389 
0390     // Check file validity
0391     if (file2.open(QIODevice::ReadOnly | QIODevice::Text)) {
0392         qCDebug(KALZIUM_LIBSCIENCE_LOG) << fileName << " opened";
0393         QTextStream in(&file2);
0394 
0395         // Get all shortForms and fullForms in the file.
0396         while (!in.atEnd()) {
0397             QString line = in.readLine();
0398             shortForm = line.section(',', 0, 0);
0399             shortForm.remove(QChar('\"'));
0400             fullForm = line.section(',', 1, 1);
0401             fullForm.remove(QChar('\"'));
0402 
0403             if (shortForm == _group) {
0404                 *m_aliasList << (_group + " : " + fullForm);
0405                 return (fullForm);
0406             }
0407         }
0408     } else {
0409         qCDebug(KALZIUM_LIBSCIENCE_LOG) << fileName << " could not be opened!";
0410     }
0411 
0412     // Sample expansions, work even when file is not found, testing purposes
0413     if (_group == QLatin1String("Me")) {
0414         return ("CH3");
0415     } else if (_group == QLatin1String("Et")) {
0416         return ("C2H5");
0417     } else { // If not found return an empty string.
0418         return ("");
0419     }
0420 }