File indexing completed on 2024-05-19 04:49:31
0001 /**************************************************************************************** 0002 * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org> * 0003 * 2009 Mathias Panzenböck <grosser.meister.morti@gmx.net> * 0004 * 2013 Ralf Engels <ralf-engels@gmx.de> * 0005 * * 0006 * This program is free software; you can redistribute it and/or modify it under * 0007 * the terms of the GNU General Public License as published by the Free Software * 0008 * Foundation; either version 2 of the License, or (at your option) any later * 0009 * version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, but WITHOUT ANY * 0012 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * 0013 * PARTICULAR PURPOSE. See the GNU General Public License for more details. * 0014 * * 0015 * You should have received a copy of the GNU General Public License along with * 0016 * this program. If not, see <http://www.gnu.org/licenses/>. * 0017 ****************************************************************************************/ 0018 0019 #ifndef PODCASTREADER_H 0020 #define PODCASTREADER_H 0021 0022 #include "core/podcasts/PodcastProvider.h" 0023 #include "core/podcasts/PodcastMeta.h" 0024 0025 #include <QDateTime> 0026 #include <QXmlStreamReader> 0027 #include <QObject> 0028 #include <QStack> 0029 #include <QRegExp> 0030 0031 #include <KIO/TransferJob> 0032 0033 class QUrl; 0034 class KJob; 0035 0036 namespace Podcasts { 0037 0038 /** Class that parses a podcast xml file and provides the results to a PodcastProvider. 0039 0040 @author Bart Cerneels <bart.cerneels@kde.org> 0041 Mathias Panzenböck <grooser.meister.morti@gmx.net> 0042 */ 0043 class AMAROKCORE_EXPORT PodcastReader : public QObject 0044 { 0045 Q_OBJECT 0046 public: 0047 /** Create a new PodcastReader that delivers the result to the podcastProvider. 0048 Note: the PodcastProvider pointer is not owned by the PodcastReader and 0049 must remain valid throughout the lifetime of this object. 0050 */ 0051 explicit PodcastReader( PodcastProvider *podcastProvider, QObject *parent = nullptr ); 0052 ~PodcastReader() override; 0053 0054 bool read( QIODevice *device ); 0055 bool read( const QUrl &url ); 0056 bool update(const PodcastChannelPtr &channel ); 0057 QUrl & url() { return m_url; } 0058 0059 Podcasts::PodcastChannelPtr channel() { return m_channel; } 0060 0061 QXmlStreamReader::Error error () const { return m_xmlReader.error(); } 0062 QString errorString () const { return m_xmlReader.errorString(); } 0063 0064 Q_SIGNALS: 0065 void finished( PodcastReader *podcastReader ); 0066 void statusBarErrorMessage( const QString &message ); 0067 void statusBarNewProgressOperation( KIO::TransferJob *, const QString &, Podcasts::PodcastReader* ); 0068 0069 public Q_SLOTS: 0070 virtual void slotAbort(); 0071 0072 private Q_SLOTS: 0073 void slotRedirection( KIO::Job *job, const QUrl &url ); 0074 void slotPermanentRedirection ( KIO::Job * job, const QUrl &fromUrl, 0075 const QUrl &toUrl ); 0076 void slotAddData( KIO::Job *, const QByteArray & data ); 0077 0078 void downloadResult( KJob * ); 0079 0080 private: 0081 /** these are the keys used by the automata */ 0082 enum ElementType 0083 { 0084 Unknown = 0, 0085 Any, 0086 Document, 0087 CharacterData, 0088 Rss, 0089 Rdf, 0090 Feed, 0091 Channel, 0092 Item, 0093 NewFeedUrl, 0094 Image, 0095 Link, 0096 Author, 0097 ItunesAuthor, 0098 Url, 0099 Title, 0100 EnclosureElement, 0101 Guid, 0102 PubDate, 0103 Description, 0104 Body, 0105 Html, 0106 Entry, 0107 Subtitle, 0108 ItunesSubtitle, 0109 Updated, 0110 Published, 0111 Summary, 0112 ItunesSummary, 0113 Keywords, 0114 ItunesKeywords, 0115 Content, 0116 SupportedContent, 0117 Name, 0118 Id, 0119 Logo, 0120 Icon, 0121 Creator, 0122 Encoded 0123 }; 0124 0125 class Action; 0126 typedef void (PodcastReader::*ActionCallback)(); 0127 typedef QHash<ElementType, Action*> ActionMap; 0128 0129 class Action 0130 { 0131 public: 0132 explicit Action( ActionMap &actionMap ) 0133 : m_actionMap( actionMap ) 0134 , m_begin( nullptr ) 0135 , m_end( nullptr ) 0136 , m_characters( nullptr ) {} 0137 0138 Action(ActionMap &actionMap, ActionCallback begin) 0139 : m_actionMap( actionMap ) 0140 , m_begin( begin ) 0141 , m_end( nullptr ) 0142 , m_characters( nullptr ) {} 0143 0144 Action(ActionMap &actionMap, ActionCallback begin, ActionCallback end) 0145 : m_actionMap( actionMap ) 0146 , m_begin( begin ) 0147 , m_end( end ) 0148 , m_characters( nullptr ) {} 0149 0150 Action(ActionMap &actionMap, ActionCallback begin, 0151 ActionCallback end, ActionCallback characters) 0152 : m_actionMap( actionMap ) 0153 , m_begin( begin ) 0154 , m_end( end ) 0155 , m_characters( characters ) {} 0156 0157 void begin(PodcastReader *podcastReader) const; 0158 void end(PodcastReader *podcastReader) const; 0159 void characters(PodcastReader *podcastReader) const; 0160 0161 const ActionMap &actionMap() const { return m_actionMap; } 0162 0163 private: 0164 ActionMap &m_actionMap; 0165 ActionCallback m_begin; 0166 ActionCallback m_end; 0167 ActionCallback m_characters; 0168 }; 0169 0170 static bool mightBeHtml( const QString& text ); 0171 0172 ElementType elementType() const; 0173 bool read(); 0174 bool continueRead(); 0175 void createChannel(); 0176 0177 // callback methods for feed parsing: 0178 void beginRss(); 0179 void beginRdf(); 0180 void beginFeed(); 0181 void beginHtml(); 0182 void beginUnknownFeedType(); 0183 void beginEnclosure(); 0184 void beginText(); 0185 void beginChannel(); 0186 void beginItem(); 0187 void beginImage(); 0188 void beginXml(); 0189 void beginNoElement(); 0190 void beginAtomText(); 0191 void beginAtomFeedLink(); 0192 void beginAtomEntryLink(); 0193 void beginAtomTextChild(); 0194 0195 void endDocument(); 0196 void endTitle(); 0197 void endSubtitle(); 0198 void endDescription(); 0199 void endEncoded(); 0200 void endBody(); 0201 void endLink(); 0202 void endGuid(); 0203 void endPubDate(); 0204 void endItem(); 0205 void endImageUrl(); 0206 void endKeywords(); 0207 void endNewFeedUrl(); 0208 void endAuthor(); 0209 void endCreator(); 0210 void endXml(); 0211 void endAtomLogo(); 0212 void endAtomIcon(); 0213 void endAtomTitle(); 0214 void endAtomSubtitle(); 0215 void endAtomPublished(); 0216 void endAtomUpdated(); 0217 void endAtomSummary(); 0218 void endAtomContent(); 0219 void endAtomTextChild(); 0220 0221 // TODO: maybe I can remove readCharacters() and readEscapedCharacters() 0222 // and use readAtomTextCharacters() plus setting m_contentType even 0223 // in Rss 1.0/2.0 parsers instead. 0224 void readCharacters(); 0225 void readNoCharacters(); 0226 void readEscapedCharacters(); 0227 void readAtomTextCharacters(); 0228 0229 QDateTime parsePubDate( const QString &datestring ); 0230 0231 void stopWithError(const QString &message); 0232 0233 static QString unescape( const QString &text ); 0234 static QString textToHtml( const QString &text ); 0235 0236 QString atomTextAsText(); 0237 QString atomTextAsHtml(); 0238 0239 QStringRef attribute(const char *namespaceUri, const char *name) const; 0240 bool hasAttribute(const char *namespaceUri, const char *name) const; 0241 0242 void setDescription(const QString &description); 0243 void setSummary(const QString &description); 0244 0245 /** podcastEpisodeCheck 0246 * Check if this PodcastEpisode has been fetched before. Uses a scoring algorithm. 0247 * @return A pointer to a PodcastEpisode that has been fetched before or the \ 0248 * same pointer as the argument. 0249 */ 0250 Podcasts::PodcastEpisodePtr podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode ); 0251 0252 // TODO: move this to PodcastMeta and add a field 0253 // descriptionType to PodcastCommonMeta. 0254 enum ContentType 0255 { 0256 TextContent, 0257 HtmlContent, 0258 XHtmlContent 0259 }; 0260 0261 class Enclosure 0262 { 0263 public: 0264 Enclosure(const QUrl &url, int filesize, const QString& mimeType) 0265 : m_url( url ), m_filesize( filesize ), m_mimeType( mimeType ) {} 0266 0267 const QUrl &url() const { return m_url; } 0268 int fileSize() const { return m_filesize; } 0269 const QString &mimeType() const { return m_mimeType; } 0270 0271 private: 0272 QUrl m_url; 0273 int m_filesize; 0274 QString m_mimeType; 0275 }; 0276 0277 class StaticData { 0278 public: 0279 StaticData(); 0280 0281 // This here basically builds an automata. 0282 // This way feed parsing can be paused after any token, 0283 // thus enabling paralell download and parsing of multiple 0284 // feeds without the need for threads. 0285 0286 QHash<QString, ElementType> knownElements; 0287 QRegExp removeScripts; 0288 QRegExp mightBeHtml; 0289 QRegExp linkify; 0290 0291 // Actions 0292 Action startAction; 0293 0294 Action docAction; 0295 Action xmlAction; 0296 Action skipAction; 0297 Action noContentAction; 0298 0299 Action rdfAction; // RSS 1.0 0300 Action rssAction; // RSS 2.0 0301 Action feedAction; // Atom 0302 Action htmlAction; 0303 Action unknownFeedTypeAction; 0304 0305 // RSS 1.0+2.0 0306 Action rss10ChannelAction; 0307 Action rss20ChannelAction; 0308 0309 Action titleAction; 0310 Action subtitleAction; 0311 Action descriptionAction; 0312 Action encodedAction; 0313 Action bodyAction; 0314 Action linkAction; 0315 Action imageAction; 0316 Action itemAction; 0317 Action urlAction; 0318 Action authorAction; 0319 Action creatorAction; 0320 Action enclosureAction; 0321 Action guidAction; 0322 Action pubDateAction; 0323 Action keywordsAction; 0324 Action newFeedUrlAction; 0325 0326 // Atom 0327 Action atomLogoAction; 0328 Action atomIconAction; 0329 Action atomEntryAction; 0330 Action atomTitleAction; 0331 Action atomSubtitleAction; 0332 Action atomAuthorAction; 0333 Action atomFeedLinkAction; 0334 Action atomEntryLinkAction; 0335 Action atomIdAction; 0336 Action atomPublishedAction; 0337 Action atomUpdatedAction; 0338 Action atomSummaryAction; 0339 Action atomContentAction; 0340 Action atomTextAction; 0341 0342 // ActionMaps 0343 ActionMap rootMap; 0344 ActionMap skipMap; 0345 ActionMap noContentMap; 0346 ActionMap xmlMap; 0347 0348 ActionMap docMap; 0349 ActionMap rssMap; 0350 ActionMap rdfMap; 0351 ActionMap feedMap; 0352 0353 ActionMap rss10ChannelMap; 0354 ActionMap rss20ChannelMap; 0355 ActionMap imageMap; 0356 ActionMap itemMap; 0357 ActionMap textMap; 0358 0359 ActionMap atomEntryMap; 0360 ActionMap atomAuthorMap; 0361 ActionMap atomTextMap; 0362 }; 0363 0364 static const StaticData sd; 0365 0366 QXmlStreamReader m_xmlReader; 0367 0368 QUrl m_url; 0369 PodcastProvider *m_podcastProvider; 0370 KIO::TransferJob *m_transferJob; 0371 Podcasts::PodcastChannelPtr m_channel; 0372 Podcasts::PodcastEpisodePtr m_item; 0373 0374 /** This points to the data of the current channel or (if parsing an item) 0375 the data of the current item */ 0376 Podcasts::PodcastMetaCommon *m_current; 0377 0378 // this somewhat emulates a callstack (without local variables): 0379 QStack<const Action*> m_actionStack; 0380 0381 ContentType m_contentType; 0382 QString m_buffer; 0383 QList<Enclosure> m_enclosures; 0384 0385 }; 0386 0387 } //namespace Podcasts 0388 0389 #endif