File indexing completed on 2024-05-12 16:46:38

0001 /***************************************************************************
0002     Copyright (C) 2010-2020 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "xmlhandler.h"
0026 #include "../tellico_debug.h"
0027 
0028 #include <QRegularExpression>
0029 #include <QTextStream>
0030 #include <QXmlStreamReader>
0031 #include <QTextCodec>
0032 
0033 using Tellico::XMLHandler;
0034 
0035 bool XMLHandler::setUtf8XmlEncoding(QString& text_) {
0036   static const QRegularExpression rx(QLatin1String("encoding\\s*=\\s*\"([\\w-]+)\""));
0037   QTextStream stream(&text_);
0038   // the xml header might still indicate an encoding other than utf-8
0039   // so read the first line and ensure it is set to utf-8
0040   QString firstLine = stream.readLine();
0041   QRegularExpressionMatch match = rx.match(firstLine);
0042   if(match.hasMatch() &&
0043      match.capturedRef(1).compare(QLatin1String("utf-8"), Qt::CaseInsensitive) != 0) {
0044     firstLine.replace(rx, QStringLiteral("encoding=\"utf-8\""));
0045     text_ = firstLine + QLatin1Char('\n') + stream.readAll();
0046     return true;
0047   }
0048   return false;
0049 }
0050 
0051 QString XMLHandler::readXMLData(const QByteArray& data_) {
0052   // need to recognize encoding from the data, like QXmlInputSource::fromRawData() used to do
0053   QXmlStreamReader reader(data_);
0054   while(!reader.isStartDocument() && !reader.atEnd()) {
0055     reader.readNext();
0056   }
0057   QStringRef enc = reader.documentEncoding();
0058   if(enc.isEmpty() || enc.compare(QLatin1String("utf-8"), Qt::CaseInsensitive) == 0) {
0059     // default to utf8 and no need to parse to change embedded encoding
0060     return QString::fromUtf8(data_);
0061   }
0062 
0063   QTextCodec* codec = QTextCodec::codecForName(enc.toUtf8());
0064   if(!codec) {
0065     return QString::fromUtf8(data_);
0066   }
0067   QString text = codec->toUnicode(data_);
0068   // since we always process XML files as utf-8, make sure the embedded encoding is set to utf-8
0069   if(!setUtf8XmlEncoding(text)) {
0070     myDebug() << "Found non utf-8 encoding but did not change the embedded declaration" << enc;
0071     // output up to first 100 characters for debugging purposes
0072     myDebug() << text.left(100);
0073   }
0074   return text;
0075 }