File indexing completed on 2024-05-12 05:10:13

0001 /***************************************************************************
0002     Copyright (C) 2003-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "tellico_xml.h"
0026 #include "../tellico_debug.h"
0027 
0028 #include <libxml/parserInternals.h> // needed for IS_LETTER
0029 #include <libxml/parser.h> // has to be before valid.h
0030 #include <libxml/valid.h>
0031 
0032 #include <QRegularExpression>
0033 
// XML namespace URI for XSL Transformations
const QString Tellico::XML::nsXSL = QStringLiteral("http://www.w3.org/1999/XSL/Transform");
// XML namespace URI for BibTeXML documents
const QString Tellico::XML::nsBibtexml = QStringLiteral("http://bibtexml.sf.net/");
// file name of the BibTeXML DTD
const QString Tellico::XML::dtdBibtexml = QStringLiteral("bibtexml.dtd");
0037 
/*
 * History of the Tellico XML syntax versions:
 *
 * VERSION 2 added namespaces, changed to multiple elements,
 * and changed the "keywords" field to "keyword"
 *
 * VERSION 3 broke out the formatType, and changed NoComplete to AllowCompletion
 *
 * VERSION 4 added a bibtex-field name for Bibtex collections, element name was
 * changed to 'entry', field elements changed to 'field', and boolean fields are now "true"
 *
 * VERSION 5 moved the bibtex-field and any other extended field property to property elements
 * inside the field element, and added the image element.
 *
 * VERSION 6 added id, i18n attributes, and year, month, day elements in date fields with a calendar name
 * attribute.
 *
 * VERSION 7 changed the application name to Tellico, renamed unitTitle to entryTitle, and made the id permanent.
 *
 * VERSION 8 added loans and saved filters.
 *
 * VERSION 9 changed music collections to always have three columns by default, with title/artist/length and
 * added file catalog collection.
 *
 * VERSION 10 added the game board collection.
 *
 * VERSION 11 removed ReadOnly and Dependent fields, and added appropriate FieldFlags. An ID field was added by default.
 *
 * VERSION 12 added new filter rules: before and after, less than and greater than. But only use v12 when needed
 */
// most recent syntax version described in the history above
const uint Tellico::XML::syntaxVersion = 12;
// XML namespace URI for Tellico documents
const QString Tellico::XML::nsTellico = QStringLiteral("http://periapsis.org/tellico/");

// presumably the namespace used before the application was renamed to Tellico (see VERSION 7)
const QString Tellico::XML::nsBookcase = QStringLiteral("http://periapsis.org/bookcase/");
// XML namespace URI for Dublin Core elements, version 1.1
const QString Tellico::XML::nsDublinCore = QStringLiteral("http://purl.org/dc/elements/1.1/");
// XML namespace URI for the Library of Congress "zing" SRW schema
const QString Tellico::XML::nsZing = QStringLiteral("http://www.loc.gov/zing/srw/");
// XML namespace URI for SRW diagnostics
const QString Tellico::XML::nsZingDiag = QStringLiteral("http://www.loc.gov/zing/srw/diagnostic/");
// XML namespace URI for Atom feeds
const QString Tellico::XML::nsAtom = QStringLiteral("http://www.w3.org/2005/Atom");
// XML namespace URI for OpenSearch 1.1
const QString Tellico::XML::nsOpenSearch = QStringLiteral("http://a9.com/-/spec/opensearch/1.1/");
0075 
0076 QString Tellico::XML::pubTellico(int version) {
0077  return QStringLiteral("-//Robby Stephenson/DTD Tellico V%1.0//EN").arg(version);
0078 }
0079 
0080 QString Tellico::XML::dtdTellico(int version) {
0081   return QStringLiteral("http://periapsis.org/tellico/dtd/v%1/tellico.dtd").arg(version);
0082 }
0083 
0084 bool Tellico::XML::validXMLElementName(const QString& name_) {
0085   return xmlValidateNCName((xmlChar *)name_.toUtf8().data(), 0) == 0;
0086 }
0087 
0088 QString Tellico::XML::elementName(const QString& name_) {
0089   static const QRegularExpression whitespace(QStringLiteral("\\s+"));
0090   QString name = name_;
0091   // change white space to dashes
0092   name.replace(whitespace, QStringLiteral("-"));
0093   // first cut, if it passes, we're done
0094   if(XML::validXMLElementName(name)) {
0095     return name;
0096   }
0097 
0098   // next check first characters IS_DIGIT is defined in libxml/vali.d
0099   for(int i = 0; i < name.length() && (!IS_LETTER(name[i].unicode()) || name[i] == QLatin1Char('_')); ++i) {
0100     name = name.mid(1);
0101   }
0102   if(name.isEmpty() || XML::validXMLElementName(name)) {
0103     return name; // empty names are handled later
0104   }
0105 
0106   // now brute-force it, one character at a time
0107   int i = 0;
0108   while(i < name.length()) {
0109     if(!XML::validXMLElementName(name.left(i+1))) {
0110       name.remove(i, 1); // remember it's zero-indexed
0111     } else {
0112       // character is ok, increment i
0113       ++i;
0114     }
0115   }
0116   return name;
0117 }
0118 
0119 QByteArray Tellico::XML::recoverFromBadXMLName(const QByteArray& data_) {
0120   // this is going to be ugly (Bug 418067)
0121   // Do a rough parse of the data, grab the field names, determine which ones are invalid
0122   // then search/replace to recover. Let's assume the XML format is as written directly from Tellico
0123   // so don't worry about attribute order within the field elements
0124   const int fieldsEnd = data_.indexOf("</fields>");
0125   if(fieldsEnd == -1) {
0126 //    myDebug() << "no fields end";
0127     return data_;
0128   }
0129 
0130   QByteArray newData = data_;
0131 
0132   typedef QPair<QByteArray, QByteArray> ByteArrayPair;
0133   // keep a list of pairs to replace
0134   QList<ByteArrayPair> badNames;
0135   // an expensive conversion, but have to convert to a string
0136   const QString fieldsSection = QString::fromUtf8(data_.left(fieldsEnd));
0137   QString newFieldsSection = fieldsSection;
0138   QRegularExpression fieldNameRX(QStringLiteral("<field .*?name=\"(.+?)\".*?>"));
0139   QRegularExpressionMatchIterator i = fieldNameRX.globalMatch(fieldsSection);
0140   while(i.hasNext()) {
0141     QRegularExpressionMatch match = i.next();
0142     const QString fieldName = match.captured(1);
0143     if(!validXMLElementName(fieldName)) {
0144       const QString newName = elementName(fieldName);
0145       if(newName.isEmpty()) {
0146         return data_;
0147       }
0148       myDebug() << "Bad name is" << fieldName << "; Good name is" << newName;
0149       badNames += qMakePair(fieldName.toUtf8().prepend('<').append('>'),
0150                             newName.toUtf8().prepend('<').append('>'));
0151       badNames += qMakePair(fieldName.toUtf8().prepend("</").append('>'),
0152                             newName.toUtf8().prepend("</").append('>'));
0153       // also have to check for plurals
0154       badNames += qMakePair(fieldName.toUtf8().prepend('<').append("s>"),
0155                             newName.toUtf8().prepend('<').append("s>"));
0156       badNames += qMakePair(fieldName.toUtf8().prepend("</").append("s>"),
0157                             newName.toUtf8().prepend("</").append("s>"));
0158       // the bad name might be in the description attribute which is fine, leave it alone
0159       newFieldsSection.replace(QStringLiteral("name=\"") + fieldName,
0160                                QStringLiteral("name=\"") + newName);
0161     }
0162   }
0163 
0164   // if there are no fields to replace, we're done
0165   if(badNames.isEmpty()) {
0166     return data_;
0167   }
0168 
0169   // swap out the new fields header
0170   newData.replace(0, fieldsEnd, newFieldsSection.toUtf8());
0171 
0172   foreach(const ByteArrayPair& ii, badNames) {
0173 //    myDebug() << "Replacing" << ii.first << "with" << ii.second;
0174     newData.replace(ii.first, ii.second);
0175   }
0176   return newData;
0177 }
0178 
0179 QByteArray Tellico::XML::removeInvalidXml(const QByteArray& data_) {
0180   const uint len = data_.length();
0181   QByteArray result;
0182   result.reserve(len);
0183   for(uint i = 0; i < len; ++i) {
0184     auto c = data_.at(i);
0185     // for now, stick with anything below #x20 except for #x9 | #xA | #xD
0186     if(c >= 0x20 || c == 0x09 || c == 0x0A || c == 0x0D) {
0187       result.append(c);
0188     }
0189   }
0190   result.squeeze();
0191   return result;
0192 }