File indexing completed on 2024-05-19 04:49:31

0001 /****************************************************************************************
0002  * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org>                             *
0003  *               2009 Mathias Panzenböck <grosser.meister.morti@gmx.net>                *
0004  *               2013 Ralf Engels <ralf-engels@gmx.de>                                  *
0005  *                                                                                      *
0006  * This program is free software; you can redistribute it and/or modify it under        *
0007  * the terms of the GNU General Public License as published by the Free Software        *
0008  * Foundation; either version 2 of the License, or (at your option) any later           *
0009  * version.                                                                             *
0010  *                                                                                      *
0011  * This program is distributed in the hope that it will be useful, but WITHOUT ANY      *
0012  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A      *
0013  * PARTICULAR PURPOSE. See the GNU General Public License for more details.             *
0014  *                                                                                      *
0015  * You should have received a copy of the GNU General Public License along with         *
0016  * this program.  If not, see <http://www.gnu.org/licenses/>.                           *
0017  ****************************************************************************************/
0018 
0019 #ifndef PODCASTREADER_H
0020 #define PODCASTREADER_H
0021 
0022 #include "core/podcasts/PodcastProvider.h"
0023 #include "core/podcasts/PodcastMeta.h"
0024 
0025 #include <QDateTime>
0026 #include <QXmlStreamReader>
0027 #include <QObject>
0028 #include <QStack>
0029 #include <QRegExp>
0030 
0031 #include <KIO/TransferJob>
0032 
0033 class QUrl; // NOTE(review): this header also stores QUrl by value (PodcastReader::m_url, Enclosure::m_url), which needs the complete type; presumably it arrives through one of the includes above -- confirm
0034 class KJob; // only used as a pointer parameter in this header (downloadResult), so a forward declaration is enough here
0035 
0036 namespace Podcasts {
0037 
0038 /** Class that parses a podcast xml file and provides the results to a PodcastProvider.
0039 
         Parsing is table driven: the nested StaticData class holds the Action
         objects and ActionMaps that form an automaton, and m_actionStack emulates
         a call stack of Action pointers (see the comments on those members).

0040     @author Bart Cerneels <bart.cerneels@kde.org>
0041             Mathias Panzenböck <grosser.meister.morti@gmx.net>
0042 */
0043 class AMAROKCORE_EXPORT PodcastReader : public QObject
0044 {
0045     Q_OBJECT
0046     public:
0047         /** Create a new PodcastReader that delivers the result to the podcastProvider.
0048             Note: the PodcastProvider pointer is not owned by the PodcastReader and
0049                   must remain valid throughout the lifetime of this object.
0050         */
0051         explicit PodcastReader( PodcastProvider *podcastProvider, QObject *parent = nullptr );
0052         ~PodcastReader() override;
0053 
             /** Parsing entry points: one overload takes a QIODevice, the other a QUrl.
                 NOTE(review): the meaning of the bool result, and whether the QUrl
                 overload completes asynchronously (see finished()), is decided in the
                 implementation file -- confirm there.
             */
0054         bool read( QIODevice *device );
0055         bool read( const QUrl &url );
0056         bool update(const PodcastChannelPtr &channel ); // NOTE(review): presumably re-reads the feed of an existing channel -- confirm
0057         QUrl & url() { return m_url; } // hands out a mutable reference to m_url, so callers can change the stored url through it
0058 
0059         Podcasts::PodcastChannelPtr channel() { return m_channel; } // the channel pointer currently held in m_channel
0060 
             // Both accessors simply forward the state of the internal QXmlStreamReader.
0061         QXmlStreamReader::Error error () const { return m_xmlReader.error(); }
0062         QString errorString () const { return m_xmlReader.errorString(); }
0063 
0064     Q_SIGNALS:
             // NOTE(review): the emit sites live in the implementation file. Judging by the
             //               parameter name, finished() passes the emitting reader -- confirm.
0065         void finished( PodcastReader *podcastReader );
0066         void statusBarErrorMessage( const QString &message );
0067         void statusBarNewProgressOperation( KIO::TransferJob *, const QString &, Podcasts::PodcastReader* );
0068 
0069     public Q_SLOTS:
0070         virtual void slotAbort(); // virtual, so subclasses may override the abort behaviour
0071 
0072     private Q_SLOTS:
             // Slots taking KIO::Job / KJob arguments.
             // NOTE(review): presumably connected to the signals of m_transferJob -- confirm.
0073         void slotRedirection( KIO::Job *job, const QUrl &url );
0074         void slotPermanentRedirection ( KIO::Job * job, const QUrl &fromUrl,
0075                 const QUrl &toUrl );
0076         void slotAddData( KIO::Job *, const QByteArray & data );
0077 
0078         void downloadResult( KJob * );
0079 
0080     private:
0081         /** these are the keys used by the automaton: the key type of ActionMap and
                 the value type of StaticData::knownElements.
                 Unknown is fixed at 0; the following enumerators count up from there. */
0082         enum ElementType
0083         {
0084             Unknown = 0,
0085             Any,
0086             Document,
0087             CharacterData,
0088             Rss,
0089             Rdf,
0090             Feed,
0091             Channel,
0092             Item,
0093             NewFeedUrl,
0094             Image,
0095             Link,
0096             Author,
0097             ItunesAuthor,
0098             Url,
0099             Title,
0100             EnclosureElement,
0101             Guid,
0102             PubDate,
0103             Description,
0104             Body,
0105             Html,
0106             Entry,
0107             Subtitle,
0108             ItunesSubtitle,
0109             Updated,
0110             Published,
0111             Summary,
0112             ItunesSummary,
0113             Keywords,
0114             ItunesKeywords,
0115             Content,
0116             SupportedContent,
0117             Name,
0118             Id,
0119             Logo,
0120             Icon,
0121             Creator,
0122             Encoded
0123         };
0124 
0125         class Action;
0126         typedef void (PodcastReader::*ActionCallback)(); // pointer to a PodcastReader member function taking no arguments
0127         typedef QHash<ElementType, Action*> ActionMap; // values are raw Action pointers; ownership is not expressed by the type
0128 
             /** One entry of the parsing tables: up to three optional callbacks
                 (begin, end, characters) plus a reference to an ActionMap.
                 Callbacks that are not passed to the constructor are stored as nullptr.
                 The ActionMap is held by reference, not copied, so it has to outlive
                 the Action.
                 NOTE(review): presumably the map lists the actions for the elements
                 nested inside the one this Action handles -- confirm in the .cpp.
             */
0129         class Action
0130         {
0131             public:
0132                 explicit Action( ActionMap &actionMap )
0133                     : m_actionMap( actionMap )
0134                     , m_begin( nullptr )
0135                     , m_end( nullptr )
0136                     , m_characters( nullptr ) {}
0137 
0138                 Action(ActionMap &actionMap, ActionCallback begin)
0139                     : m_actionMap( actionMap )
0140                     , m_begin( begin )
0141                     , m_end( nullptr )
0142                     , m_characters( nullptr ) {}
0143 
0144                 Action(ActionMap &actionMap, ActionCallback begin, ActionCallback end)
0145                     : m_actionMap( actionMap )
0146                     , m_begin( begin )
0147                     , m_end( end )
0148                     , m_characters( nullptr ) {}
0149 
0150                 Action(ActionMap &actionMap, ActionCallback begin,
0151                         ActionCallback end, ActionCallback characters)
0152                     : m_actionMap( actionMap )
0153                     , m_begin( begin )
0154                     , m_end( end )
0155                     , m_characters( characters ) {}
0156 
                     // NOTE(review): defined in the .cpp; presumably each one invokes the
                     //               matching callback on podcastReader when it is set -- confirm.
0157                 void begin(PodcastReader *podcastReader) const;
0158                 void end(PodcastReader *podcastReader) const;
0159                 void characters(PodcastReader *podcastReader) const;
0160 
0161                 const ActionMap &actionMap() const { return m_actionMap; } // read-only view of the referenced map
0162 
0163             private:
0164                 ActionMap        &m_actionMap;
0165                 ActionCallback    m_begin;
0166                 ActionCallback    m_end;
0167                 ActionCallback    m_characters;
0168         };
0169 
0170         static bool mightBeHtml( const QString& text );
0171 
0172         ElementType elementType() const;
0173         bool read();
0174         bool continueRead();
0175         void createChannel();
0176         
0177         // callback methods for feed parsing (their signature matches ActionCallback):
0178         void beginRss();
0179         void beginRdf();
0180         void beginFeed();
0181         void beginHtml();
0182         void beginUnknownFeedType();
0183         void beginEnclosure();
0184         void beginText();
0185         void beginChannel();
0186         void beginItem();
0187         void beginImage();
0188         void beginXml();
0189         void beginNoElement();
0190         void beginAtomText();
0191         void beginAtomFeedLink();
0192         void beginAtomEntryLink();
0193         void beginAtomTextChild();
0194 
0195         void endDocument();
0196         void endTitle();
0197         void endSubtitle();
0198         void endDescription();
0199         void endEncoded();
0200         void endBody();
0201         void endLink();
0202         void endGuid();
0203         void endPubDate();
0204         void endItem();
0205         void endImageUrl();
0206         void endKeywords();
0207         void endNewFeedUrl();
0208         void endAuthor();
0209         void endCreator();
0210         void endXml();
0211         void endAtomLogo();
0212         void endAtomIcon();
0213         void endAtomTitle();
0214         void endAtomSubtitle();
0215         void endAtomPublished();
0216         void endAtomUpdated();
0217         void endAtomSummary();
0218         void endAtomContent();
0219         void endAtomTextChild();
0220 
0221         // TODO: maybe I can remove readCharacters() and readEscapedCharacters()
0222         //       and use readAtomTextCharacters() plus setting m_contentType even
0223         //       in Rss 1.0/2.0 parsers instead.
0224         void readCharacters();
0225         void readNoCharacters();
0226         void readEscapedCharacters();
0227         void readAtomTextCharacters();
0228 
0229         QDateTime parsePubDate( const QString &datestring );
0230 
0231         void stopWithError(const QString &message);
0232 
0233         static QString unescape( const QString &text );
0234         static QString textToHtml( const QString &text );
0235 
0236         QString atomTextAsText();
0237         QString atomTextAsHtml();
0238 
0239         QStringRef attribute(const char *namespaceUri, const char *name) const;
0240         bool hasAttribute(const char *namespaceUri, const char *name) const;
0241 
0242         void setDescription(const QString &description);
0243         void setSummary(const QString &description);
0244 
0245         /** podcastEpisodeCheck
0246         * Check if this PodcastEpisode has been fetched before. Uses a scoring algorithm.
0247         * @return A pointer to a PodcastEpisode that has been fetched before or the
0248         *   same pointer as the argument.
0249         */
0250         Podcasts::PodcastEpisodePtr podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode );
0251 
0252         // TODO: move this to PodcastMeta and add a field
0253         //       descriptionType to PodcastCommonMeta.
             // Type of the m_contentType member below.
0254         enum ContentType
0255         {
0256             TextContent,
0257             HtmlContent,
0258             XHtmlContent
0259         };
0260 
             /** Plain value holder for one enclosure: url, file size and mime type,
                 set once in the constructor and exposed through read-only accessors.
                 NOTE(review): the file size is an int, so sizes beyond the range of int
                 cannot be represented -- confirm whether that matters for large media files.
             */
0261         class Enclosure
0262         {
0263             public:
0264                 Enclosure(const QUrl &url, int filesize, const QString& mimeType)
0265                     : m_url( url ), m_filesize( filesize ), m_mimeType( mimeType ) {}
0266 
0267                 const QUrl &url() const { return m_url; }
0268                 int fileSize() const { return m_filesize; }
0269                 const QString &mimeType() const { return m_mimeType; }
0270 
0271             private:
0272                 QUrl    m_url;
0273                 int     m_filesize;
0274                 QString m_mimeType;
0275         };
0276 
             /** Holder for the parsing tables; the single instance is the static const
                 member sd declared after this class. */
0277         class StaticData {
0278             public:
0279                 StaticData();
0280 
0281                 // This here basically builds an automaton.
0282                 // This way feed parsing can be paused after any token,
0283                 // thus enabling parallel download and parsing of multiple
0284                 // feeds without the need for threads.
                     //
                     // Every Action below keeps a reference to one of the ActionMaps
                     // (see Action::m_actionMap). All Action members are declared before
                     // all ActionMap members and C++ initializes members in declaration
                     // order, so each Action is constructed before the map it refers to.
                     // The Action constructors only store the reference, which is fine.
                     // NOTE(review): confirm that StaticData() fills the maps in its body
                     //               and never reads a map from the initializer list.
0285 
0286                 QHash<QString, ElementType> knownElements; // NOTE(review): presumably keyed by XML element name -- confirm
0287                 QRegExp removeScripts;
0288                 QRegExp mightBeHtml;
0289                 QRegExp linkify;
0290                 
0291                 // Actions
0292                 Action startAction;
0293                 
0294                 Action docAction;
0295                 Action xmlAction;
0296                 Action skipAction;
0297                 Action noContentAction;
0298 
0299                 Action rdfAction;  // RSS 1.0
0300                 Action rssAction;  // RSS 2.0
0301                 Action feedAction; // Atom
0302                 Action htmlAction;
0303                 Action unknownFeedTypeAction;
0304 
0305                 // RSS 1.0+2.0
0306                 Action rss10ChannelAction;
0307                 Action rss20ChannelAction;
0308 
0309                 Action titleAction;
0310                 Action subtitleAction;
0311                 Action descriptionAction;
0312                 Action encodedAction;
0313                 Action bodyAction;
0314                 Action linkAction;
0315                 Action imageAction;
0316                 Action itemAction;
0317                 Action urlAction;
0318                 Action authorAction;
0319                 Action creatorAction;
0320                 Action enclosureAction;
0321                 Action guidAction;
0322                 Action pubDateAction;
0323                 Action keywordsAction;
0324                 Action newFeedUrlAction;
0325 
0326                 // Atom
0327                 Action atomLogoAction;
0328                 Action atomIconAction;
0329                 Action atomEntryAction;
0330                 Action atomTitleAction;
0331                 Action atomSubtitleAction;
0332                 Action atomAuthorAction;
0333                 Action atomFeedLinkAction;
0334                 Action atomEntryLinkAction;
0335                 Action atomIdAction;
0336                 Action atomPublishedAction;
0337                 Action atomUpdatedAction;
0338                 Action atomSummaryAction;
0339                 Action atomContentAction;
0340                 Action atomTextAction;
0341                 
0342                 // ActionMaps
0343                 ActionMap rootMap;
0344                 ActionMap skipMap;
0345                 ActionMap noContentMap;
0346                 ActionMap xmlMap;
0347 
0348                 ActionMap docMap;
0349                 ActionMap rssMap;
0350                 ActionMap rdfMap;
0351                 ActionMap feedMap;
0352 
0353                 ActionMap rss10ChannelMap;
0354                 ActionMap rss20ChannelMap;
0355                 ActionMap imageMap;
0356                 ActionMap itemMap;
0357                 ActionMap textMap;
0358 
0359                 ActionMap atomEntryMap;
0360                 ActionMap atomAuthorMap;
0361                 ActionMap atomTextMap;
0362         };
0363 
0364         static const StaticData sd; // static and const: one read-only set of tables shared by every PodcastReader
0365 
0366         QXmlStreamReader m_xmlReader; // source of the values returned by error() and errorString()
0367 
0368         QUrl m_url;
0369         PodcastProvider *m_podcastProvider; // not owned, see the constructor documentation
0370         KIO::TransferJob *m_transferJob; // NOTE(review): ownership and lifetime are handled in the .cpp -- confirm
0371         Podcasts::PodcastChannelPtr m_channel;
0372         Podcasts::PodcastEpisodePtr m_item;
0373 
0374         /** This points to the data of the current channel or (if parsing an item) 
0375             the data of the current item */
0376         Podcasts::PodcastMetaCommon *m_current;
0377 
0378         // this somewhat emulates a callstack (without local variables):
0379         QStack<const Action*> m_actionStack;
0380 
0381         ContentType m_contentType;
0382         QString m_buffer; // NOTE(review): presumably collects the character data of the element being read -- confirm
0383         QList<Enclosure> m_enclosures;
0384 
0385 };
0386 
0387 } //namespace Podcasts
0388 
0389 #endif