File indexing completed on 2024-05-12 05:10:13
0001 /*************************************************************************** 0002 Copyright (C) 2003-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "tellico_xml.h" 0026 #include "../tellico_debug.h" 0027 0028 #include <libxml/parserInternals.h> // needed for IS_LETTER 0029 #include <libxml/parser.h> // has to be before valid.h 0030 #include <libxml/valid.h> 0031 0032 #include <QRegularExpression> 0033 0034 const QString Tellico::XML::nsXSL = QStringLiteral("http://www.w3.org/1999/XSL/Transform"); 0035 const QString Tellico::XML::nsBibtexml = QStringLiteral("http://bibtexml.sf.net/"); 0036 const QString Tellico::XML::dtdBibtexml = QStringLiteral("bibtexml.dtd"); 0037 0038 /* 0039 * VERSION 2 added namespaces, changed to multiple elements, 0040 * and changed the "keywords" field to "keyword" 0041 * 0042 * VERSION 3 broke out the formatType, and changed NoComplete to AllowCompletion 0043 * 0044 * VERSION 4 added a bibtex-field name for Bibtex collections, element name was 0045 * changed to 'entry', field elements changed to 'field', and boolean fields are now "true" 0046 * 0047 * VERSION 5 moved the bibtex-field and any other extended field property to property elements 0048 * inside the field element, and added the image element. 0049 * 0050 * VERSION 6 added id, i18n attributes, and year, month, day elements in date fields with a calendar name 0051 * attribute. 0052 * 0053 * VERSION 7 changed the application name to Tellico, renamed unitTitle to entryTitle, and made the id permanent. 0054 * 0055 * VERSION 8 added loans and saved filters. 0056 * 0057 * VERSION 9 changed music collections to always have three columns by default, with title/artist/length and 0058 * added file catalog collection. 0059 * 0060 * VERSION 10 added the game board collection. 0061 * 0062 * VERSION 11 remove ReadOnly and Dependent fields, and added appropriate FieldFlags. An ID field was added by default. 0063 * 0064 * VERSION 12 added new filter rules: before and after, less than and greater than. But only use v12 when needed 0065 */ 0066 const uint Tellico::XML::syntaxVersion = 12; 0067 const QString Tellico::XML::nsTellico = QStringLiteral("http://periapsis.org/tellico/"); 0068 0069 const QString Tellico::XML::nsBookcase = QStringLiteral("http://periapsis.org/bookcase/"); 0070 const QString Tellico::XML::nsDublinCore = QStringLiteral("http://purl.org/dc/elements/1.1/"); 0071 const QString Tellico::XML::nsZing = QStringLiteral("http://www.loc.gov/zing/srw/"); 0072 const QString Tellico::XML::nsZingDiag = QStringLiteral("http://www.loc.gov/zing/srw/diagnostic/"); 0073 const QString Tellico::XML::nsAtom = QStringLiteral("http://www.w3.org/2005/Atom"); 0074 const QString Tellico::XML::nsOpenSearch = QStringLiteral("http://a9.com/-/spec/opensearch/1.1/"); 0075 0076 QString Tellico::XML::pubTellico(int version) { 0077 return QStringLiteral("-//Robby Stephenson/DTD Tellico V%1.0//EN").arg(version); 0078 } 0079 0080 QString Tellico::XML::dtdTellico(int version) { 0081 return QStringLiteral("http://periapsis.org/tellico/dtd/v%1/tellico.dtd").arg(version); 0082 } 0083 0084 bool Tellico::XML::validXMLElementName(const QString& name_) { 0085 return xmlValidateNCName((xmlChar *)name_.toUtf8().data(), 0) == 0; 0086 } 0087 0088 QString Tellico::XML::elementName(const QString& name_) { 0089 static const QRegularExpression whitespace(QStringLiteral("\\s+")); 0090 QString name = name_; 0091 // change white space to dashes 0092 name.replace(whitespace, QStringLiteral("-")); 0093 // first cut, if it passes, we're done 0094 if(XML::validXMLElementName(name)) { 0095 return name; 0096 } 0097 0098 // next check first characters IS_DIGIT is defined in libxml/vali.d 0099 for(int i = 0; i < name.length() && (!IS_LETTER(name[i].unicode()) || name[i] == QLatin1Char('_')); ++i) { 0100 name = name.mid(1); 0101 } 0102 if(name.isEmpty() || XML::validXMLElementName(name)) { 0103 return name; // empty names are handled later 0104 } 0105 0106 // now brute-force it, one character at a time 0107 int i = 0; 0108 while(i < name.length()) { 0109 if(!XML::validXMLElementName(name.left(i+1))) { 0110 name.remove(i, 1); // remember it's zero-indexed 0111 } else { 0112 // character is ok, increment i 0113 ++i; 0114 } 0115 } 0116 return name; 0117 } 0118 0119 QByteArray Tellico::XML::recoverFromBadXMLName(const QByteArray& data_) { 0120 // this is going to be ugly (Bug 418067) 0121 // Do a rough parse of the data, grab the field names, determine which ones are invalid 0122 // then search/replace to recover. Let's assume the XML format is as written directly from Tellico 0123 // so don't worry about attribute order within the field elements 0124 const int fieldsEnd = data_.indexOf("</fields>"); 0125 if(fieldsEnd == -1) { 0126 // myDebug() << "no fields end"; 0127 return data_; 0128 } 0129 0130 QByteArray newData = data_; 0131 0132 typedef QPair<QByteArray, QByteArray> ByteArrayPair; 0133 // keep a list of pairs to replace 0134 QList<ByteArrayPair> badNames; 0135 // an expensive conversion, but have to convert to a string 0136 const QString fieldsSection = QString::fromUtf8(data_.left(fieldsEnd)); 0137 QString newFieldsSection = fieldsSection; 0138 QRegularExpression fieldNameRX(QStringLiteral("<field .*?name=\"(.+?)\".*?>")); 0139 QRegularExpressionMatchIterator i = fieldNameRX.globalMatch(fieldsSection); 0140 while(i.hasNext()) { 0141 QRegularExpressionMatch match = i.next(); 0142 const QString fieldName = match.captured(1); 0143 if(!validXMLElementName(fieldName)) { 0144 const QString newName = elementName(fieldName); 0145 if(newName.isEmpty()) { 0146 return data_; 0147 } 0148 myDebug() << "Bad name is" << fieldName << "; Good name is" << newName; 0149 badNames += qMakePair(fieldName.toUtf8().prepend('<').append('>'), 0150 newName.toUtf8().prepend('<').append('>')); 0151 badNames += qMakePair(fieldName.toUtf8().prepend("</").append('>'), 0152 newName.toUtf8().prepend("</").append('>')); 0153 // also have to check for plurals 0154 badNames += qMakePair(fieldName.toUtf8().prepend('<').append("s>"), 0155 newName.toUtf8().prepend('<').append("s>")); 0156 badNames += qMakePair(fieldName.toUtf8().prepend("</").append("s>"), 0157 newName.toUtf8().prepend("</").append("s>")); 0158 // the bad name might be in the description attribute which is fine, leave it alone 0159 newFieldsSection.replace(QStringLiteral("name=\"") + fieldName, 0160 QStringLiteral("name=\"") + newName); 0161 } 0162 } 0163 0164 // if there are no fields to replace, we're done 0165 if(badNames.isEmpty()) { 0166 return data_; 0167 } 0168 0169 // swap out the new fields header 0170 newData.replace(0, fieldsEnd, newFieldsSection.toUtf8()); 0171 0172 foreach(const ByteArrayPair& ii, badNames) { 0173 // myDebug() << "Replacing" << ii.first << "with" << ii.second; 0174 newData.replace(ii.first, ii.second); 0175 } 0176 return newData; 0177 } 0178 0179 QByteArray Tellico::XML::removeInvalidXml(const QByteArray& data_) { 0180 const uint len = data_.length(); 0181 QByteArray result; 0182 result.reserve(len); 0183 for(uint i = 0; i < len; ++i) { 0184 auto c = data_.at(i); 0185 // for now, stick with anything below #x20 except for #x9 | #xA | #xD 0186 if(c >= 0x20 || c == 0x09 || c == 0x0A || c == 0x0D) { 0187 result.append(c); 0188 } 0189 } 0190 result.squeeze(); 0191 return result; 0192 }