File indexing completed on 2024-05-19 04:49:31
0001 /**************************************************************************************** 0002 * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org> * 0003 * 2009 Mathias Panzenböck <grosser.meister.morti@gmx.net> * 0004 * 2013 Ralf Engels <ralf-engels@gmx.de> * 0005 * * 0006 * This program is free software; you can redistribute it and/or modify it under * 0007 * the terms of the GNU General Public License as published by the Free Software * 0008 * Foundation; either version 2 of the License, or (at your option) any later * 0009 * version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, but WITHOUT ANY * 0012 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * 0013 * PARTICULAR PURPOSE. See the GNU General Public License for more details. * 0014 * * 0015 * You should have received a copy of the GNU General Public License along with * 0016 * this program. If not, see <http://www.gnu.org/licenses/>. * 0017 ****************************************************************************************/ 0018 0019 #include "core/podcasts/PodcastReader.h" 0020 0021 #include "core/support/Amarok.h" 0022 #include "core/support/Components.h" 0023 #include "core/support/Debug.h" 0024 #include "core/meta/support/MetaUtility.h" 0025 0026 #include <QUrl> 0027 0028 #include <QDate> 0029 #include <QSet> 0030 0031 #include <algorithm> 0032 0033 using namespace Podcasts; 0034 0035 #define ITUNES_NS "http://www.itunes.com/dtds/podcast-1.0.dtd" 0036 #define RDF_NS "http://www.w3.org/1999/02/22-rdf-syntax-ns#" 0037 #define RSS10_NS "http://purl.org/rss/1.0/" 0038 #define RSS20_NS "" 0039 #define ATOM_NS "http://www.w3.org/2005/Atom" 0040 #define ENC_NS "http://purl.oclc.org/net/rss_2.0/enc#" 0041 #define CONTENT_NS "http://purl.org/rss/1.0/modules/content" 0042 #define DC_NS "http://purl.org/dc/elements/1.1/" 0043 0044 // regular expressions for linkification: 0045 #define RE_USER "[-+_%\\.\\w]+" 0046 #define RE_PASSWD RE_USER 0047 #define RE_DOMAIN "[-a-zA-Z0-9]+(?:\\.[-a-zA-Z0-9]+)*" 0048 #define RE_PROT "[a-zA-Z]+://" 0049 #define RE_URL RE_PROT "(?:" RE_USER "(?::" RE_PASSWD ")?@)?" RE_DOMAIN \ 0050 "(?::\\d+)?(?:/[-\\w\\?&=%+.,;:_#~/!@]*)?" 0051 #define RE_MAIL RE_USER "@" RE_DOMAIN 0052 0053 const PodcastReader::StaticData PodcastReader::sd; 0054 0055 PodcastReader::PodcastReader( PodcastProvider *podcastProvider, QObject *parent ) 0056 : QObject( parent ) 0057 , m_xmlReader() 0058 , m_podcastProvider( podcastProvider ) 0059 , m_transferJob( ) 0060 , m_current( nullptr ) 0061 , m_actionStack() 0062 , m_contentType( TextContent ) 0063 , m_buffer() 0064 {} 0065 0066 void 0067 PodcastReader::Action::begin( PodcastReader *podcastReader ) const 0068 { 0069 if( m_begin ) 0070 (( *podcastReader ).*m_begin )(); 0071 } 0072 0073 void 0074 PodcastReader::Action::end( PodcastReader *podcastReader ) const 0075 { 0076 if( m_end ) 0077 (( *podcastReader ).*m_end )(); 0078 } 0079 0080 void 0081 PodcastReader::Action::characters( PodcastReader *podcastReader ) const 0082 { 0083 if( m_characters ) 0084 (( *podcastReader ).*m_characters )(); 0085 } 0086 0087 // initialization of the feed parser automata: 0088 PodcastReader::StaticData::StaticData() 0089 : removeScripts( QStringLiteral("<script[^<]*</script>|<script[^>]*>"), Qt::CaseInsensitive ) 0090 , mightBeHtml( "<\\?xml[^>]*\\?>|<br[^>]*>|<p[^>]*>|<|>|&|"|" 0091 "<([-:\\w\\d]+)[^>]*(/>|>.*</\\1>)|<hr[>]*>|&#\\d+;|&#x[a-fA-F\\d]+;", Qt::CaseInsensitive ) 0092 , linkify( "\\b(" RE_URL ")|\\b(" RE_MAIL ")|(\n)" ) 0093 0094 , startAction( rootMap ) 0095 0096 , docAction( 0097 docMap, 0098 nullptr, 0099 &PodcastReader::endDocument ) 0100 , xmlAction( 0101 xmlMap, 0102 &PodcastReader::beginXml, 0103 &PodcastReader::endXml, 0104 &PodcastReader::readEscapedCharacters ) 0105 , skipAction( skipMap ) 0106 , noContentAction( 0107 noContentMap, 0108 &PodcastReader::beginNoElement, 0109 nullptr, 0110 &PodcastReader::readNoCharacters ) 0111 0112 , rdfAction( 0113 rdfMap, 0114 &PodcastReader::beginRdf ) 0115 , rssAction( 0116 rssMap, 0117 &PodcastReader::beginRss ) 0118 , feedAction( 0119 feedMap, 0120 &PodcastReader::beginFeed ) 0121 , htmlAction( 0122 skipMap, 0123 &PodcastReader::beginHtml ) 0124 , unknownFeedTypeAction( 0125 skipMap, 0126 &PodcastReader::beginUnknownFeedType ) 0127 0128 // RSS 1.0+2.0 0129 , rss10ChannelAction( 0130 rss10ChannelMap, 0131 &PodcastReader::beginChannel ) 0132 , rss20ChannelAction( 0133 rss20ChannelMap, 0134 &PodcastReader::beginChannel ) 0135 0136 , titleAction( 0137 textMap, 0138 &PodcastReader::beginText, 0139 &PodcastReader::endTitle, 0140 &PodcastReader::readCharacters ) 0141 , subtitleAction( 0142 textMap, 0143 &PodcastReader::beginText, 0144 &PodcastReader::endSubtitle, 0145 &PodcastReader::readCharacters ) 0146 , descriptionAction( 0147 textMap, 0148 &PodcastReader::beginText, 0149 &PodcastReader::endDescription, 0150 &PodcastReader::readCharacters ) 0151 , encodedAction( 0152 textMap, 0153 &PodcastReader::beginText, 0154 &PodcastReader::endEncoded, 0155 &PodcastReader::readCharacters ) 0156 , bodyAction( 0157 xmlMap, 0158 &PodcastReader::beginText, 0159 &PodcastReader::endBody, 0160 &PodcastReader::readEscapedCharacters ) 0161 , linkAction( 0162 textMap, 0163 &PodcastReader::beginText, 0164 &PodcastReader::endLink, 0165 &PodcastReader::readCharacters ) 0166 , imageAction( imageMap, 0167 &PodcastReader::beginImage ) 0168 , itemAction( 0169 itemMap, 0170 &PodcastReader::beginItem, 0171 &PodcastReader::endItem ) 0172 , urlAction( 0173 textMap, 0174 &PodcastReader::beginText, 0175 &PodcastReader::endImageUrl, 0176 &PodcastReader::readCharacters ) 0177 , authorAction( 0178 textMap, 0179 &PodcastReader::beginText, 0180 &PodcastReader::endAuthor, 0181 &PodcastReader::readCharacters ) 0182 , creatorAction( 0183 textMap, 0184 &PodcastReader::beginText, 0185 &PodcastReader::endCreator, 0186 &PodcastReader::readCharacters ) 0187 , enclosureAction( 0188 noContentMap, 0189 &PodcastReader::beginEnclosure ) 0190 , guidAction( 0191 textMap, 0192 &PodcastReader::beginText, 0193 &PodcastReader::endGuid, 0194 &PodcastReader::readCharacters ) 0195 , pubDateAction( 0196 textMap, 0197 &PodcastReader::beginText, 0198 &PodcastReader::endPubDate, 0199 &PodcastReader::readCharacters ) 0200 , keywordsAction( 0201 textMap, 0202 &PodcastReader::beginText, 0203 &PodcastReader::endKeywords, 0204 &PodcastReader::readCharacters ) 0205 , newFeedUrlAction( 0206 textMap, 0207 &PodcastReader::beginText, 0208 &PodcastReader::endNewFeedUrl, 0209 &PodcastReader::readCharacters ) 0210 0211 // Atom 0212 , atomLogoAction( 0213 textMap, 0214 &PodcastReader::beginText, 0215 &PodcastReader::endImageUrl, 0216 &PodcastReader::readCharacters ) 0217 , atomIconAction( 0218 textMap, 0219 &PodcastReader::beginText, 0220 &PodcastReader::endAtomIcon, 0221 &PodcastReader::readCharacters ) 0222 , atomEntryAction( 0223 atomEntryMap, 0224 &PodcastReader::beginItem, 0225 &PodcastReader::endItem ) 0226 , atomTitleAction( 0227 atomTextMap, 0228 &PodcastReader::beginAtomText, 0229 &PodcastReader::endAtomTitle, 0230 &PodcastReader::readAtomTextCharacters ) 0231 , atomSubtitleAction( 0232 atomTextMap, 0233 &PodcastReader::beginAtomText, 0234 &PodcastReader::endAtomSubtitle, 0235 &PodcastReader::readAtomTextCharacters ) 0236 , atomAuthorAction( 0237 atomAuthorMap ) 0238 , atomFeedLinkAction( 0239 noContentMap, 0240 &PodcastReader::beginAtomFeedLink, 0241 nullptr, 0242 &PodcastReader::readNoCharacters ) 0243 , atomEntryLinkAction( 0244 noContentMap, 0245 &PodcastReader::beginAtomEntryLink, 0246 nullptr, 0247 &PodcastReader::readNoCharacters ) 0248 , atomIdAction( 0249 textMap, 0250 &PodcastReader::beginText, 0251 &PodcastReader::endGuid, 0252 &PodcastReader::readCharacters ) 0253 , atomPublishedAction( 0254 textMap, 0255 &PodcastReader::beginText, 0256 &PodcastReader::endAtomPublished, 0257 &PodcastReader::readCharacters ) 0258 , atomUpdatedAction( 0259 textMap, 0260 &PodcastReader::beginText, 0261 &PodcastReader::endAtomUpdated, 0262 &PodcastReader::readCharacters ) 0263 , atomSummaryAction( 0264 atomTextMap, 0265 &PodcastReader::beginAtomText, 0266 &PodcastReader::endAtomSummary, 0267 &PodcastReader::readAtomTextCharacters ) 0268 , atomContentAction( 0269 atomTextMap, 0270 &PodcastReader::beginAtomText, 0271 &PodcastReader::endAtomContent, 0272 &PodcastReader::readAtomTextCharacters ) 0273 , atomTextAction( 0274 atomTextMap, 0275 &PodcastReader::beginAtomTextChild, 0276 &PodcastReader::endAtomTextChild, 0277 &PodcastReader::readAtomTextCharacters ) 0278 { 0279 // known elements: 0280 knownElements[ QStringLiteral("rss") ] = Rss; 0281 knownElements[ QStringLiteral("RDF") ] = Rdf; 0282 knownElements[ QStringLiteral("feed") ] = Feed; 0283 knownElements[ QStringLiteral("channel") ] = Channel; 0284 knownElements[ QStringLiteral("item") ] = Item; 0285 knownElements[ QStringLiteral("image") ] = Image; 0286 knownElements[ QStringLiteral("link") ] = Link; 0287 knownElements[ QStringLiteral("url") ] = Url; 0288 knownElements[ QStringLiteral("title") ] = Title; 0289 knownElements[ QStringLiteral("author") ] = Author; 0290 knownElements[ QStringLiteral("enclosure") ] = EnclosureElement; 0291 knownElements[ QStringLiteral("guid") ] = Guid; 0292 knownElements[ QStringLiteral("pubDate") ] = PubDate; 0293 knownElements[ QStringLiteral("description") ] = Description; 0294 knownElements[ QStringLiteral("summary") ] = Summary; 0295 knownElements[ QStringLiteral("body") ] = Body; 0296 knownElements[ QStringLiteral("entry") ] = Entry; 0297 knownElements[ QStringLiteral("content") ] = Content; 0298 knownElements[ QStringLiteral("name") ] = Name; 0299 knownElements[ QStringLiteral("id") ] = Id; 0300 knownElements[ QStringLiteral("subtitle") ] = Subtitle; 0301 knownElements[ QStringLiteral("updated") ] = Updated; 0302 knownElements[ QStringLiteral("published") ] = Published; 0303 knownElements[ QStringLiteral("logo") ] = Logo; 0304 knownElements[ QStringLiteral("icon") ] = Icon; 0305 knownElements[ QStringLiteral("encoded") ] = Encoded; 0306 knownElements[ QStringLiteral("creator") ] = Creator; 0307 knownElements[ QStringLiteral("keywords") ] = Keywords; 0308 knownElements[ QStringLiteral("new-feed-url") ] = NewFeedUrl; 0309 knownElements[ QStringLiteral("html") ] = Html; 0310 knownElements[ QStringLiteral("HTML") ] = Html; 0311 0312 // before start document/after end document 0313 rootMap.insert( Document, &docAction ); 0314 0315 // parse document 0316 docMap.insert( Rss, &rssAction ); 0317 docMap.insert( Html, &htmlAction ); 0318 docMap.insert( Rdf, &rdfAction ); 0319 docMap.insert( Feed, &feedAction ); 0320 docMap.insert( Any, &unknownFeedTypeAction ); 0321 0322 // parse <rss> "RSS 2.0" 0323 rssMap.insert( Channel, &rss20ChannelAction ); 0324 0325 // parse <RDF> "RSS 1.0" 0326 rdfMap.insert( Channel, &rss10ChannelAction ); 0327 rdfMap.insert( Item, &itemAction ); 0328 0329 // parse <channel> "RSS 2.0" 0330 rss20ChannelMap.insert( Title, &titleAction ); 0331 rss20ChannelMap.insert( ItunesSubtitle, &subtitleAction ); 0332 rss20ChannelMap.insert( ItunesAuthor, &authorAction ); 0333 rss20ChannelMap.insert( Creator, &creatorAction ); 0334 rss20ChannelMap.insert( Description, &descriptionAction ); 0335 rss20ChannelMap.insert( Encoded, &encodedAction ); 0336 rss20ChannelMap.insert( ItunesSummary, &descriptionAction ); 0337 rss20ChannelMap.insert( Body, &bodyAction ); 0338 rss20ChannelMap.insert( Link, &linkAction ); 0339 rss20ChannelMap.insert( Image, &imageAction ); 0340 rss20ChannelMap.insert( ItunesKeywords, &keywordsAction ); 0341 rss20ChannelMap.insert( NewFeedUrl, &newFeedUrlAction ); 0342 rss20ChannelMap.insert( Item, &itemAction ); 0343 0344 // parse <channel> "RSS 1.0" 0345 rss10ChannelMap.insert( Title, &titleAction ); 0346 rss10ChannelMap.insert( ItunesSubtitle, &subtitleAction ); 0347 rss10ChannelMap.insert( ItunesAuthor, &authorAction ); 0348 rss10ChannelMap.insert( Creator, &creatorAction ); 0349 rss10ChannelMap.insert( Description, &descriptionAction ); 0350 rss10ChannelMap.insert( Encoded, &encodedAction ); 0351 rss10ChannelMap.insert( ItunesSummary, &descriptionAction ); 0352 rss10ChannelMap.insert( Body, &bodyAction ); 0353 rss10ChannelMap.insert( Link, &linkAction ); 0354 rss10ChannelMap.insert( Image, &imageAction ); 0355 rss10ChannelMap.insert( ItunesKeywords, &keywordsAction ); 0356 rss10ChannelMap.insert( NewFeedUrl, &newFeedUrlAction ); 0357 0358 // parse <image> 0359 imageMap.insert( Title, &skipAction ); 0360 imageMap.insert( Link, &skipAction ); 0361 imageMap.insert( Url, &urlAction ); 0362 0363 // parse <item> 0364 itemMap.insert( Title, &titleAction ); 0365 itemMap.insert( ItunesSubtitle, &subtitleAction ); 0366 itemMap.insert( Author, &authorAction ); 0367 itemMap.insert( ItunesAuthor, &authorAction ); 0368 itemMap.insert( Creator, &creatorAction ); 0369 itemMap.insert( Description, &descriptionAction ); 0370 itemMap.insert( Encoded, &encodedAction ); 0371 itemMap.insert( ItunesSummary, &descriptionAction ); 0372 itemMap.insert( Body, &bodyAction ); 0373 itemMap.insert( EnclosureElement, &enclosureAction ); 0374 itemMap.insert( Guid, &guidAction ); 0375 itemMap.insert( PubDate, &pubDateAction ); 0376 itemMap.insert( ItunesKeywords, &keywordsAction ); 0377 // TODO: move the link field from PodcastChannel to PodcastMetaCommon 0378 // itemMap.insert( Link, &linkAction ); 0379 0380 // parse <feed> "Atom" 0381 feedMap.insert( Title, &atomTitleAction ); 0382 feedMap.insert( Subtitle, &atomSubtitleAction ); 0383 feedMap.insert( Icon, &atomIconAction ); 0384 feedMap.insert( Logo, &atomLogoAction ); 0385 feedMap.insert( Author, &atomAuthorAction ); 0386 feedMap.insert( Link, &atomFeedLinkAction ); 0387 feedMap.insert( Entry, &atomEntryAction ); 0388 0389 // parse <entry> "Atom" 0390 atomEntryMap.insert( Title, &atomTitleAction ); 0391 atomEntryMap.insert( Subtitle, &atomSubtitleAction ); 0392 atomEntryMap.insert( Author, &atomAuthorAction ); 0393 atomEntryMap.insert( Id, &atomIdAction ); 0394 atomEntryMap.insert( Published, &atomPublishedAction ); 0395 atomEntryMap.insert( Updated, &atomUpdatedAction ); 0396 atomEntryMap.insert( Summary, &atomSummaryAction ); 0397 atomEntryMap.insert( Link, &atomEntryLinkAction ); 0398 atomEntryMap.insert( SupportedContent, &atomContentAction ); 0399 0400 // parse <author> "Atom" 0401 atomAuthorMap.insert( Name, &authorAction ); 0402 0403 // parse atom text 0404 atomTextMap.insert( Any, &atomTextAction ); 0405 0406 // parse arbitrary xml 0407 xmlMap.insert( Any, &xmlAction ); 0408 0409 // skip elements 0410 skipMap.insert( Any, &skipAction ); 0411 } 0412 0413 PodcastReader::~PodcastReader() 0414 { 0415 DEBUG_BLOCK 0416 } 0417 0418 bool 0419 PodcastReader::mightBeHtml( const QString& text ) //Static 0420 { 0421 return sd.mightBeHtml.indexIn( text ) != -1; 0422 } 0423 0424 bool PodcastReader::read( QIODevice *device ) 0425 { 0426 DEBUG_BLOCK 0427 0428 m_xmlReader.setDevice( device ); 0429 return read(); 0430 } 0431 0432 bool 0433 PodcastReader::read( const QUrl &url ) 0434 { 0435 DEBUG_BLOCK 0436 0437 m_url = url; 0438 0439 m_transferJob = KIO::get( m_url, KIO::Reload, KIO::HideProgressInfo ); 0440 0441 connect( m_transferJob, &KIO::TransferJob::data, 0442 this, &PodcastReader::slotAddData ); 0443 0444 connect( m_transferJob, &KIO::TransferJob::result, 0445 this, &PodcastReader::downloadResult ); 0446 0447 connect( m_transferJob, &KIO::TransferJob::redirection, 0448 this, &PodcastReader::slotRedirection ); 0449 0450 connect( m_transferJob, &KIO::TransferJob::permanentRedirection, 0451 this, &PodcastReader::slotPermanentRedirection ); 0452 0453 QString description = i18n( "Importing podcast channel from %1", url.url() ); 0454 if( m_channel ) 0455 { 0456 description = m_channel->title().isEmpty() 0457 ? i18n( "Updating podcast channel" ) 0458 : i18n( "Updating \"%1\"", m_channel->title() ); 0459 } 0460 0461 Q_EMIT statusBarNewProgressOperation( m_transferJob, description, this ); 0462 0463 // parse data 0464 return read(); 0465 } 0466 0467 void 0468 PodcastReader::slotAbort() 0469 { 0470 DEBUG_BLOCK 0471 } 0472 0473 bool 0474 PodcastReader::update( const PodcastChannelPtr &channel ) 0475 { 0476 DEBUG_BLOCK 0477 m_channel = channel; 0478 0479 return read( m_channel->url() ); 0480 } 0481 0482 void 0483 PodcastReader::slotAddData( KIO::Job *job, const QByteArray &data ) 0484 { 0485 DEBUG_BLOCK 0486 Q_UNUSED( job ) 0487 0488 m_xmlReader.addData( data ); 0489 0490 // parse more data 0491 continueRead(); 0492 } 0493 0494 void 0495 PodcastReader::downloadResult( KJob * job ) 0496 { 0497 DEBUG_BLOCK 0498 0499 // parse more data 0500 continueRead(); 0501 0502 KIO::TransferJob *transferJob = dynamic_cast<KIO::TransferJob *>( job ); 0503 if( transferJob && transferJob->isErrorPage() ) 0504 { 0505 QString errorMessage = 0506 i18n( "Importing podcast from %1 failed with error:\n", m_url.url() ); 0507 if( m_channel ) 0508 { 0509 errorMessage = m_channel->title().isEmpty() 0510 ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() ) 0511 : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() ); 0512 } 0513 errorMessage = errorMessage.append( job->errorString() ); 0514 0515 Q_EMIT statusBarErrorMessage( errorMessage ); 0516 } 0517 else if( job->error() ) 0518 { 0519 QString errorMessage = 0520 i18n( "Importing podcast from %1 failed with error:\n", m_url.url() ); 0521 if( m_channel ) 0522 { 0523 errorMessage = m_channel->title().isEmpty() 0524 ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() ) 0525 : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() ); 0526 } 0527 errorMessage = errorMessage.append( job->errorString() ); 0528 0529 Q_EMIT statusBarErrorMessage( errorMessage ); 0530 } 0531 0532 m_transferJob = nullptr; 0533 } 0534 0535 PodcastReader::ElementType 0536 PodcastReader::elementType() const 0537 { 0538 if( m_xmlReader.isEndDocument() || m_xmlReader.isStartDocument() ) 0539 return Document; 0540 0541 if( m_xmlReader.isCDATA() || m_xmlReader.isCharacters() ) 0542 return CharacterData; 0543 0544 ElementType elementType = sd.knownElements[ m_xmlReader.name().toString()]; 0545 0546 // This is a bit hacky because my automata does not support conditions. 0547 // Therefore I put the decision logic in here and declare some pseudo elements. 0548 // I don't think it is worth it to extend the automata to support such conditions. 0549 switch( elementType ) 0550 { 0551 case Summary: 0552 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 0553 { 0554 elementType = ItunesSummary; 0555 } 0556 break; 0557 0558 case Subtitle: 0559 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 0560 { 0561 elementType = ItunesSubtitle; 0562 } 0563 break; 0564 0565 case Author: 0566 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 0567 { 0568 elementType = ItunesAuthor; 0569 } 0570 break; 0571 0572 case Keywords: 0573 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 0574 { 0575 elementType = ItunesKeywords; 0576 } 0577 break; 0578 0579 case Content: 0580 if( m_xmlReader.namespaceUri() == ATOM_NS && 0581 // ignore atom:content elements that do not 0582 // have content but only refer to some url: 0583 !hasAttribute( ATOM_NS, "src" ) ) 0584 { 0585 // Atom supports arbitrary Base64 encoded content. 0586 // Because we can only something with text/html/xhtml I ignore 0587 // anything else. 0588 // See: 0589 // http://tools.ietf.org/html/rfc4287#section-4.1.3 0590 if( hasAttribute( ATOM_NS, "type" ) ) 0591 { 0592 QStringRef type( attribute( ATOM_NS, "type" ) ); 0593 0594 if( type == "text" || type == "html" || type == "xhtml" ) 0595 { 0596 elementType = SupportedContent; 0597 } 0598 } 0599 else 0600 { 0601 elementType = SupportedContent; 0602 } 0603 } 0604 break; 0605 0606 default: 0607 break; 0608 } 0609 0610 return elementType; 0611 } 0612 0613 bool 0614 PodcastReader::read() 0615 { 0616 DEBUG_BLOCK 0617 0618 m_current = nullptr; 0619 m_item = nullptr; 0620 m_contentType = TextContent; 0621 m_buffer.clear(); 0622 m_actionStack.clear(); 0623 m_actionStack.push( &( PodcastReader::sd.startAction ) ); 0624 m_xmlReader.setNamespaceProcessing( true ); 0625 0626 return continueRead(); 0627 } 0628 0629 bool 0630 PodcastReader::continueRead() 0631 { 0632 // this is some kind of pushdown automata 0633 // with this it should be possible to parse feeds in parallel 0634 // without using threads 0635 DEBUG_BLOCK 0636 0637 while( !m_xmlReader.atEnd() && m_xmlReader.error() != QXmlStreamReader::CustomError ) 0638 { 0639 QXmlStreamReader::TokenType token = m_xmlReader.readNext(); 0640 0641 if( m_xmlReader.error() == QXmlStreamReader::PrematureEndOfDocumentError && m_transferJob ) 0642 { 0643 return true; 0644 } 0645 0646 if( m_xmlReader.hasError() ) 0647 { 0648 Q_EMIT finished( this ); 0649 return false; 0650 } 0651 0652 if( m_actionStack.isEmpty() ) 0653 { 0654 debug() << "expected element on stack!"; 0655 return false; 0656 } 0657 0658 const Action* action = m_actionStack.top(); 0659 const Action* subAction = nullptr; 0660 0661 switch( token ) 0662 { 0663 case QXmlStreamReader::Invalid: 0664 return false; 0665 0666 case QXmlStreamReader::StartDocument: 0667 case QXmlStreamReader::StartElement: 0668 subAction = action->actionMap()[ elementType()]; 0669 0670 if( !subAction ) 0671 subAction = action->actionMap()[ Any ]; 0672 0673 if( !subAction ) 0674 subAction = &( PodcastReader::sd.skipAction ); 0675 0676 m_actionStack.push( subAction ); 0677 0678 subAction->begin( this ); 0679 break; 0680 0681 case QXmlStreamReader::EndDocument: 0682 case QXmlStreamReader::EndElement: 0683 action->end( this ); 0684 0685 if( m_actionStack.pop() != action ) 0686 { 0687 debug() << "popped other element than expected!"; 0688 } 0689 break; 0690 0691 case QXmlStreamReader::Characters: 0692 if( !m_xmlReader.isWhitespace() || m_xmlReader.isCDATA() ) 0693 { 0694 action->characters( this ); 0695 } 0696 break; 0697 // ignorable whitespaces 0698 case QXmlStreamReader::Comment: 0699 case QXmlStreamReader::EntityReference: 0700 case QXmlStreamReader::ProcessingInstruction: 0701 case QXmlStreamReader::DTD: 0702 case QXmlStreamReader::NoToken: 0703 // ignore 0704 break; 0705 } 0706 } 0707 0708 return !m_xmlReader.hasError(); 0709 } 0710 0711 void 0712 PodcastReader::stopWithError( const QString &message ) 0713 { 0714 m_xmlReader.raiseError( message ); 0715 0716 if( m_transferJob ) 0717 { 0718 m_transferJob->kill(KJob::EmitResult); 0719 m_transferJob = nullptr; 0720 } 0721 0722 Q_EMIT finished( this ); 0723 } 0724 0725 void 0726 PodcastReader::beginText() 0727 { 0728 m_buffer.clear(); 0729 } 0730 0731 void 0732 PodcastReader::endTitle() 0733 { 0734 m_current->setTitle( m_buffer.trimmed() ); 0735 } 0736 0737 void 0738 PodcastReader::endSubtitle() 0739 { 0740 m_current->setSubtitle( m_buffer.trimmed() ); 0741 } 0742 0743 QString 0744 PodcastReader::atomTextAsText() 0745 { 0746 switch( m_contentType ) 0747 { 0748 case HtmlContent: 0749 case XHtmlContent: 0750 // TODO: strip tags (there should not be any non-xml entities here) 0751 return unescape( m_buffer ); 0752 0753 case TextContent: 0754 default: 0755 return m_buffer; 0756 } 0757 } 0758 0759 QString 0760 PodcastReader::atomTextAsHtml() 0761 { 0762 switch( m_contentType ) 0763 { 0764 case HtmlContent: 0765 case XHtmlContent: 0766 // strip <script> elements 0767 // This will work because there aren't <![CDATA[ ]]> sections 0768 // in m_buffer, because we have (re)escape the code manually. 0769 // XXX: But it does not remove event handlers like onclick="..." 0770 // and JavaScript links like href="javascript:..." 0771 return m_buffer.remove( sd.removeScripts ); 0772 0773 case TextContent: 0774 default: 0775 return textToHtml( m_buffer ); 0776 } 0777 } 0778 0779 QString 0780 PodcastReader::unescape( const QString &text ) 0781 { 0782 // TODO: resolve predefined html entities 0783 QString buf; 0784 0785 for ( int i = 0; i < text.size(); ++ i ) 0786 { 0787 QChar c( text[ i ] ); 0788 0789 if( c == '&' ) 0790 { 0791 int endIndex = text.indexOf( QLatin1Char(';'), i ); 0792 0793 if( endIndex == -1 ) 0794 { 0795 // fix invalid input 0796 buf += c; 0797 } 0798 else if( text[ i + 1 ] == '#' ) 0799 { 0800 int num = 0; 0801 bool ok = false; 0802 if( text[ i + 2 ] == 'x' ) 0803 { 0804 QString entity( text.mid( i + 3, endIndex - i - 3 ) ); 0805 num = entity.toInt( &ok, 16 ); 0806 } 0807 else 0808 { 0809 QString entity( text.mid( i + 2, endIndex - i - 2 ) ); 0810 num = entity.toInt( &ok, 10 ); 0811 } 0812 0813 if( !ok || num < 0 ) 0814 { 0815 // fix invalid input 0816 buf += c; 0817 } 0818 else 0819 { 0820 buf += QChar( num ); 0821 i = endIndex; 0822 } 0823 } 0824 else 0825 { 0826 QString entity( text.mid( i + 1, endIndex - i - 1 ) ); 0827 0828 if( entity == QLatin1String("lt") ) 0829 { 0830 buf += QLatin1Char('<'); 0831 i = endIndex; 0832 } 0833 else if( entity == QLatin1String("gt") ) 0834 { 0835 buf += QLatin1Char('>'); 0836 i = endIndex; 0837 } 0838 else if( entity == QLatin1String("amp") ) 0839 { 0840 buf += QLatin1Char('&'); 0841 i = endIndex; 0842 } 0843 else if( entity == QLatin1String("apos") ) 0844 { 0845 buf += QLatin1Char('\''); 0846 i = endIndex; 0847 } 0848 else if( entity == QLatin1String("quot") ) 0849 { 0850 buf += QLatin1Char('"'); 0851 i = endIndex; 0852 } 0853 else 0854 { 0855 // fix invalid input 0856 buf += c; 0857 } 0858 } 0859 } 0860 else 0861 { 0862 buf += c; 0863 } 0864 } 0865 0866 return buf; 0867 } 0868 0869 void 0870 PodcastReader::setSummary( const QString &description ) 0871 { 0872 if( m_current->summary().size() < description.size() ) 0873 { 0874 m_current->setSummary( description ); 0875 } 0876 } 0877 0878 void 0879 PodcastReader::setDescription( const QString &description ) 0880 { 0881 // The content of the <description>, <itunes:summary> or <body> 0882 // elements might be assigned to the field description, unless 0883 // there is already longer data in it. Then it will be assigned 0884 // to summary, unless summary depending on whether there 0885 // already is some (longer) information in the description 0886 // field. 0887 // If there is already data in the description field, instead of 0888 // overwriting, it will be moved to the summary field, unless 0889 // there is already longer data there. 0890 if( m_current->description().size() < description.size() ) 0891 { 0892 setSummary( m_current->description() ); 0893 m_current->setDescription( description ); 0894 } 0895 else 0896 { 0897 setSummary( description ); 0898 } 0899 } 0900 0901 void 0902 PodcastReader::endDescription() 0903 { 0904 QString description( m_buffer.trimmed() ); 0905 0906 if( !mightBeHtml( description ) ) 0907 { 0908 // content type is plain text 0909 description = textToHtml( description ); 0910 } 0911 // else: content type is html 0912 setDescription( description ); 0913 } 0914 0915 QString 0916 PodcastReader::textToHtml( const QString &text ) 0917 { 0918 QString buf; 0919 QRegExp re( sd.linkify ); 0920 int index = 0; 0921 0922 for(;;) 0923 { 0924 int next = re.indexIn( text, index ); 0925 0926 if( next == -1 ) 0927 break; 0928 0929 if( next != index ) 0930 { 0931 buf += text.mid( index, next - index ).toHtmlEscaped(); 0932 } 0933 0934 QString s; 0935 0936 if( !(s = re.cap( 1 )).isEmpty() ) 0937 { 0938 if( s.startsWith( QLatin1String( "javascript:" ), Qt::CaseInsensitive ) || 0939 s.startsWith( QLatin1String( "exec:" ), Qt::CaseInsensitive ) ) 0940 { 0941 buf += s.toHtmlEscaped(); 0942 } 0943 else 0944 { 0945 buf += QStringLiteral( "<a href=\"%1\">%1</a>" ) 0946 .arg( s.toHtmlEscaped() ); 0947 } 0948 } 0949 else if( !(s = re.cap( 2 )).isEmpty() ) 0950 { 0951 buf += QStringLiteral( "<a href=\"mailto:%1\">%1</a>" ) 0952 .arg( s.toHtmlEscaped() ); 0953 } 0954 else if( !re.cap( 3 ).isEmpty() ) 0955 { 0956 buf += QLatin1String("<br/>\n"); 0957 } 0958 0959 index = re.pos() + re.matchedLength(); 0960 } 0961 0962 buf += text.mid( index ).toHtmlEscaped(); 0963 0964 return buf; 0965 } 0966 0967 void 0968 PodcastReader::endEncoded() 0969 { 0970 // content type is html 0971 setDescription( m_buffer.trimmed() ); 0972 } 0973 0974 void 0975 PodcastReader::endBody() 0976 { 0977 // content type is xhtml 0978 // always prefer <body>, because it's likely to 0979 // contain nice html formatted information 0980 setSummary( m_current->description() ); 0981 m_current->setDescription( m_buffer.trimmed() ); 0982 } 0983 0984 void 0985 PodcastReader::endLink() 0986 { 0987 // TODO: change to m_current->... when the field 0988 // is moved to the PodcastMetaCommon class. 0989 m_channel->setWebLink( QUrl( m_buffer ) ); 0990 } 0991 0992 void 0993 PodcastReader::beginHtml() 0994 { 0995 stopWithError( i18n( "While parsing %1, a feed was expected but an HTML page was received." 0996 "\nDid you enter the correct URL?", m_url.url() ) ); 0997 } 0998 0999 void 1000 PodcastReader::beginUnknownFeedType() 1001 { 1002 stopWithError( i18n( "Feed has an unknown type: %1", m_url.url() ) ); 1003 } 1004 1005 void 1006 PodcastReader::beginRss() 1007 { 1008 if( m_xmlReader.attributes().value( QStringLiteral("version") ) != "2.0" ) 1009 { 1010 // TODO: change this string once we support more 1011 stopWithError( i18n( "%1 is not an RSS version 2.0 feed.", m_url.url() ) ); 1012 } 1013 } 1014 1015 void 1016 PodcastReader::beginRdf() 1017 { 1018 bool ok = true; 1019 if( m_xmlReader.namespaceUri() != RDF_NS ) 1020 { 1021 ok = false; 1022 } 1023 1024 if( ok ) 1025 { 1026 bool found = false; 1027 foreach( const QXmlStreamNamespaceDeclaration &nsdecl, m_xmlReader.namespaceDeclarations() ) 1028 { 1029 if( nsdecl.namespaceUri() == RSS10_NS ) 1030 { 1031 found = true; 1032 break; 1033 } 1034 } 1035 1036 if( !found ) 1037 ok = false; 1038 } 1039 1040 if( !ok ) 1041 stopWithError( i18n( "%1 is not a valid RSS version 1.0 feed.", m_url.url() ) ); 1042 } 1043 1044 void 1045 PodcastReader::beginFeed() 1046 { 1047 if( m_xmlReader.namespaceUri() != ATOM_NS ) 1048 { 1049 stopWithError( i18n( "%1 is not a valid Atom feed.", m_url.url() ) ); 1050 } 1051 else 1052 { 1053 beginChannel(); 1054 } 1055 } 1056 1057 void 1058 PodcastReader::endDocument() 1059 { 1060 debug() << "successfully parsed feed: " << m_url.url(); 1061 Q_EMIT finished( this ); 1062 } 1063 1064 void 1065 PodcastReader::createChannel() 1066 { 1067 if( !m_channel ) 1068 { 1069 debug() << "new channel"; 1070 1071 Podcasts::PodcastChannelPtr channel( new Podcasts::PodcastChannel() ); 1072 channel->setUrl( m_url ); 1073 channel->setSubscribeDate( QDate::currentDate() ); 1074 /* add this new channel to the provider, we get a pointer to a 1075 * PodcastChannelPtr of the correct type which we will use from now on. 1076 */ 1077 m_channel = m_podcastProvider->addChannel( channel ); 1078 } 1079 } 1080 1081 void 1082 PodcastReader::beginChannel() 1083 { 1084 createChannel(); 1085 1086 m_current = m_channel.data(); 1087 1088 // Because the summary and description fields are read from several elements 1089 // they only get changed when longer information is read as there is stored in 1090 // the appropriate field already. In order to still be able to correctly update 1091 // the feed's description/summary I set it here to the empty string: 1092 m_channel->setDescription( QLatin1String("") ); 1093 m_channel->setSummary( QLatin1String("") ); 1094 m_channel->setKeywords( QStringList() ); 1095 } 1096 1097 void 1098 PodcastReader::beginItem() 1099 { 1100 // theoretically it is possible that an ugly RSS 1.0 feed has 1101 // first the <item> elements followed by the <channel> element: 1102 createChannel(); 1103 1104 m_item = new Podcasts::PodcastEpisode( m_channel ); 1105 m_current = m_item.data(); 1106 1107 m_enclosures.clear(); 1108 } 1109 1110 void 1111 PodcastReader::endItem() 1112 { 1113 // TODO: change superclass of PodcastEpisode to MultiTrack 1114 1115 /* some feeds contain normal blogposts without 1116 enclosures alongside of podcasts */ 1117 1118 if( !m_enclosures.isEmpty() ) 1119 { 1120 // just take the first enclosure on multi 1121 m_item->setUidUrl( m_enclosures[ 0 ].url() ); 1122 m_item->setFilesize( m_enclosures[ 0 ].fileSize() ); 1123 m_item->setMimeType( m_enclosures[ 0 ].mimeType() ); 1124 1125 m_enclosures.removeAt( 0 ); 1126 1127 // append alternative enclosures to description 1128 if( !m_enclosures.isEmpty() ) 1129 { 1130 QString description( m_item->description() ); 1131 description += QLatin1String("\n<p><b>"); 1132 description += i18n( "Alternative Enclosures:" ); 1133 description += QLatin1String("</b><br/>\n<ul>"); 1134 1135 foreach( const Enclosure& enclosure, m_enclosures ) 1136 { 1137 description += QStringLiteral( "<li><a href=\"%1\">%2</a> (%3, %4)</li>" ) 1138 .arg( enclosure.url().url().toHtmlEscaped(), 1139 enclosure.url().fileName().toHtmlEscaped(), 1140 Meta::prettyFilesize( enclosure.fileSize() ), 1141 enclosure.mimeType().isEmpty() ? 1142 i18n( "unknown type" ) : 1143 enclosure.mimeType().toHtmlEscaped() ); 1144 } 1145 1146 description += QLatin1String("</ul></p>"); 1147 m_item->setDescription( description ); 1148 } 1149 1150 Podcasts::PodcastEpisodePtr episode; 1151 QString guid = m_item->guid(); 1152 if( guid.isEmpty() ) 1153 { 1154 episode = Podcasts::PodcastEpisodePtr::dynamicCast( 1155 m_podcastProvider->trackForUrl( QUrl::fromUserInput(m_item->uidUrl()) ) 1156 ); 1157 } 1158 else 1159 { 1160 episode = m_podcastProvider->episodeForGuid( guid ); 1161 } 1162 1163 //make sure that the episode is not a bogus match. The channel has to be correct. 1164 // See https://bugs.kde.org/show_bug.cgi?id=227515 1165 if( !episode.isNull() && episode->channel() == m_channel ) 1166 { 1167 debug() << "updating episode: " << episode->title(); 1168 1169 episode->setTitle( m_item->title() ); 1170 episode->setSubtitle( m_item->subtitle() ); 1171 episode->setSummary( m_item->summary() ); 1172 episode->setDescription( m_item->description() ); 1173 episode->setAuthor( m_item->author() ); 1174 episode->setUidUrl( QUrl::fromUserInput(m_item->uidUrl()) ); 1175 episode->setFilesize( m_item->filesize() ); 1176 episode->setMimeType( m_item->mimeType() ); 1177 episode->setPubDate( m_item->pubDate() ); 1178 episode->setKeywords( m_item->keywords() ); 1179 1180 // set the guid in case it was empty (for some buggy reason): 1181 episode->setGuid( m_item->guid() ); 1182 } 1183 else 1184 { 1185 debug() << "new episode: " << m_item->title(); 1186 1187 episode = m_channel->addEpisode( m_item ); 1188 // also let the provider know an episode has been added 1189 // TODO: change into a signal 1190 m_podcastProvider->addEpisode( episode ); 1191 } 1192 } 1193 1194 m_current = m_channel.data(); 1195 m_item = nullptr; 1196 } 1197 1198 void 1199 PodcastReader::beginEnclosure() 1200 { 1201 // This should read both, RSS 2.0 and RSS 1.0 with mod_enclosure 1202 // <enclosure> elements. 1203 // See: 1204 // http://www.rssboard.org/rss-specification 1205 // http://www.xs4all.nl/~foz/mod_enclosure.html 1206 QStringRef str; 1207 1208 str = m_xmlReader.attributes().value( QStringLiteral("url") ); 1209 1210 if( str.isEmpty() ) 1211 str = attribute( RDF_NS, "about" ); 1212 1213 if( str.isEmpty() ) 1214 { 1215 debug() << "invalid enclosure containing no/empty url"; 1216 return; 1217 } 1218 1219 QUrl url( str.toString() ); 1220 1221 str = m_xmlReader.attributes().value( QStringLiteral("length") ); 1222 1223 if( str.isEmpty() ) 1224 str = attribute( ENC_NS, "length" ); 1225 1226 int length = str.toString().toInt(); 1227 1228 str = m_xmlReader.attributes().value( QStringLiteral("type") ); 1229 1230 if( str.isEmpty() ) 1231 str = attribute( ENC_NS, "type" ); 1232 1233 QString mimeType( str.toString().trimmed() ); 1234 1235 m_enclosures.append( Enclosure( url, length, mimeType ) ); 1236 } 1237 1238 void 1239 PodcastReader::endGuid() 1240 { 1241 m_item->setGuid( m_buffer ); 1242 } 1243 1244 void 1245 PodcastReader::endPubDate() 1246 { 1247 QDateTime pubDate( parsePubDate( m_buffer ) ); 1248 1249 if( !pubDate.isValid() ) 1250 { 1251 debug() << "invalid podcast episode pubDate: " << m_buffer; 1252 return; 1253 } 1254 1255 m_item->setPubDate( pubDate ); 1256 } 1257 1258 void 1259 PodcastReader::beginImage() 1260 { 1261 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 1262 { 1263 m_channel->setImageUrl( QUrl( m_xmlReader.attributes().value( QStringLiteral("href") ).toString() ) ); 1264 } 1265 } 1266 1267 void 1268 PodcastReader::endImageUrl() 1269 { 1270 // TODO save image data 1271 m_channel->setImageUrl( QUrl( m_buffer ) ); 1272 } 1273 1274 void 1275 PodcastReader::endKeywords() 1276 { 1277 QList<QString> keywords( m_current->keywords() ); 1278 1279 foreach( const QString &keyword, m_buffer.split( QLatin1Char(',') ) ) 1280 { 1281 QString kwd( keyword.simplified() ); 1282 if( !kwd.isEmpty() && !keywords.contains( kwd ) ) 1283 keywords.append( kwd ); 1284 } 1285 1286 std::sort( keywords.begin(), keywords.end() ); 1287 m_current->setKeywords( keywords ); 1288 1289 } 1290 1291 void 1292 PodcastReader::endNewFeedUrl() 1293 { 1294 if( m_xmlReader.namespaceUri() == ITUNES_NS ) 1295 { 1296 m_url = QUrl( m_buffer.trimmed() ); 1297 1298 if( m_channel && m_channel->url() != m_url ) 1299 { 1300 debug() << "feed url changed to: " << m_url.url(); 1301 m_channel->setUrl( m_url ); 1302 } 1303 } 1304 } 1305 1306 void 1307 PodcastReader::endAuthor() 1308 { 1309 m_current->setAuthor( m_buffer.trimmed() ); 1310 } 1311 1312 void 1313 PodcastReader::endCreator() 1314 { 1315 // there are funny people that do not use <author> but <dc:creator> 1316 if( m_xmlReader.namespaceUri() == DC_NS ) 1317 { 1318 endAuthor(); 1319 } 1320 } 1321 1322 void 1323 PodcastReader::beginXml() 1324 { 1325 m_buffer += '<'; 1326 m_buffer += m_xmlReader.name().toString(); 1327 1328 foreach( const QXmlStreamAttribute &attr, m_xmlReader.attributes() ) 1329 { 1330 m_buffer += QStringLiteral( " %1=\"%2\"" ) 1331 .arg( attr.name().toString(), 1332 attr.value().toString().toHtmlEscaped() ); 1333 } 1334 1335 m_buffer += '>'; 1336 } 1337 1338 void 1339 PodcastReader::beginNoElement() 1340 { 1341 DEBUG_BLOCK 1342 debug() << "no element expected here, but got element: " 1343 << m_xmlReader.name(); 1344 } 1345 1346 void 1347 PodcastReader::beginAtomText() 1348 { 1349 if( hasAttribute( ATOM_NS, "type" ) ) 1350 { 1351 QStringRef type( attribute( ATOM_NS, "type" ) ); 1352 1353 if( type == "text" ) 1354 { 1355 m_contentType = TextContent; 1356 } 1357 else if( type == "html" ) 1358 { 1359 m_contentType = HtmlContent; 1360 } 1361 else if( type == "xhtml" ) 1362 { 1363 m_contentType = XHtmlContent; 1364 } 1365 else 1366 { 1367 // this should not happen, see elementType() 1368 debug() << "unsupported atom:content type: " << type.toString(); 1369 m_contentType = TextContent; 1370 } 1371 } 1372 else 1373 { 1374 m_contentType = TextContent; 1375 } 1376 1377 m_buffer.clear(); 1378 } 1379 1380 void 1381 PodcastReader::beginAtomTextChild() 1382 { 1383 switch( m_contentType ) 1384 { 1385 case XHtmlContent: 1386 beginXml(); 1387 break; 1388 1389 case HtmlContent: 1390 case TextContent: 1391 // stripping illegal tags 1392 debug() << "read unexpected open tag in atom text: " << m_xmlReader.name(); 1393 1394 default: 1395 break; 1396 } 1397 } 1398 1399 void 1400 PodcastReader::endAtomTextChild() 1401 { 1402 switch( m_contentType ) 1403 { 1404 case XHtmlContent: 1405 endXml(); 1406 break; 1407 1408 case HtmlContent: 1409 case TextContent: 1410 // stripping illegal tags 1411 debug() << "read unexpected close tag in atom text: " << m_xmlReader.name(); 1412 1413 default: 1414 break; 1415 } 1416 } 1417 1418 void 1419 PodcastReader::readAtomTextCharacters() 1420 { 1421 switch( m_contentType ) 1422 { 1423 case XHtmlContent: 1424 m_buffer += m_xmlReader.text().toString().toHtmlEscaped(); 1425 break; 1426 1427 case HtmlContent: 1428 m_buffer += m_xmlReader.text(); 1429 break; 1430 1431 case TextContent: 1432 m_buffer += m_xmlReader.text(); 1433 1434 default: 1435 break; 1436 } 1437 } 1438 1439 void 1440 PodcastReader::beginAtomFeedLink() 1441 { 1442 if( !hasAttribute( ATOM_NS, "rel" ) || 1443 attribute( ATOM_NS, "rel" ) == "alternate" ) 1444 { 1445 m_channel->setWebLink( QUrl( attribute( ATOM_NS, "href" ).toString() ) ); 1446 } 1447 else if( attribute( ATOM_NS, "rel" ) == "self" ) 1448 { 1449 m_url = QUrl( attribute( ATOM_NS, "href" ).toString() ); 1450 1451 if( m_channel && m_channel->url() != m_url ) 1452 { 1453 debug() << "feed url changed to: " << m_url.url(); 1454 m_channel->setUrl( m_url ); 1455 } 1456 } 1457 } 1458 1459 void 1460 PodcastReader::beginAtomEntryLink() 1461 { 1462 if( attribute( ATOM_NS, "rel" ) == "enclosure" ) 1463 { 1464 QUrl url( attribute( ATOM_NS, "href" ).toString() ); 1465 int filesize = 0; 1466 QString mimeType; 1467 1468 if( hasAttribute( ATOM_NS, "length" ) ) 1469 { 1470 filesize = attribute( ATOM_NS, "length" ).toString().toInt(); 1471 } 1472 1473 if( hasAttribute( ATOM_NS, "type" ) ) 1474 { 1475 mimeType = attribute( ATOM_NS, "type" ).toString(); 1476 } 1477 1478 m_enclosures.append( Enclosure( url, filesize, mimeType ) ); 1479 } 1480 } 1481 1482 void 1483 PodcastReader::endAtomIcon() 1484 { 1485 if( !m_channel->hasImage() ) 1486 { 1487 endImageUrl(); 1488 } 1489 } 1490 1491 void 1492 PodcastReader::endAtomTitle() 1493 { 1494 // TODO: don't convert text but store m_contentType 1495 m_current->setTitle( atomTextAsText().trimmed() ); 1496 } 1497 1498 void 1499 PodcastReader::endAtomSubtitle() 1500 { 1501 // TODO: don't convert text but store m_contentType 1502 m_current->setSubtitle( atomTextAsText().trimmed() ); 1503 } 1504 1505 void 1506 PodcastReader::endAtomSummary() 1507 { 1508 // TODO: don't convert text but store m_contentType 1509 m_current->setSummary( atomTextAsHtml().trimmed() ); 1510 } 1511 1512 void 1513 PodcastReader::endAtomContent() 1514 { 1515 // TODO: don't convert text but store m_contentType 1516 m_current->setDescription( atomTextAsHtml() ); 1517 } 1518 1519 void 1520 PodcastReader::endAtomPublished() 1521 { 1522 QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate ); 1523 1524 if( !date.isValid() ) 1525 { 1526 debug() << "invalid podcast episode atom:published date: " << m_buffer; 1527 return; 1528 } 1529 1530 if( !m_item->pubDate().isValid() || m_item->pubDate() < date ) 1531 { 1532 m_item->setPubDate( date ); 1533 } 1534 } 1535 1536 void 1537 PodcastReader::endAtomUpdated() 1538 { 1539 QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate ); 1540 1541 if( !date.isValid() ) 1542 { 1543 debug() << "invalid podcast episode atom:updated date: " << m_buffer; 1544 return; 1545 } 1546 1547 if( !m_item->pubDate().isValid() || m_item->pubDate() < date ) 1548 { 1549 // TODO: add field updatedDate and use this (throughout amarok) 1550 m_item->setPubDate( date ); 1551 } 1552 } 1553 1554 void 1555 PodcastReader::readNoCharacters() 1556 { 1557 DEBUG_BLOCK 1558 debug() << "no characters expected here"; 1559 } 1560 1561 void 1562 PodcastReader::endXml() 1563 { 1564 m_buffer += QLatin1String("</"); 1565 m_buffer += m_xmlReader.name().toString(); 1566 m_buffer += '>'; 1567 } 1568 1569 void 1570 PodcastReader::readCharacters() 1571 { 1572 m_buffer += m_xmlReader.text(); 1573 } 1574 1575 void 1576 PodcastReader::readEscapedCharacters() 1577 { 1578 m_buffer += m_xmlReader.text().toString().toHtmlEscaped() ; 1579 } 1580 1581 QStringRef 1582 PodcastReader::attribute( const char *namespaceUri, const char *name ) const 1583 { 1584 // workaround, because Qt seems to have a bug: 1585 // when the default namespace is used attributes 1586 // aren't inside this namespace for some reason 1587 if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) ) 1588 return m_xmlReader.attributes().value( namespaceUri, name ); 1589 else 1590 return m_xmlReader.attributes().value( QString(), name ); 1591 } 1592 1593 bool 1594 PodcastReader::hasAttribute( const char *namespaceUri, const char *name ) const 1595 { 1596 // see PodcastReader::attribute() 1597 if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) ) 1598 return true; 1599 else 1600 return m_xmlReader.attributes().hasAttribute( QString(), name ); 1601 } 1602 1603 QDateTime 1604 PodcastReader::parsePubDate( const QString &dateString ) 1605 { 1606 DEBUG_BLOCK 1607 QString parseInput = dateString; 1608 debug() << "Parsing pubdate: " << parseInput; 1609 1610 QRegExp rfcDateDayRegex( QStringLiteral("^[A-Z]{1}[a-z]{2}\\s*,\\s*(.*)") ); 1611 if( rfcDateDayRegex.indexIn( parseInput ) != -1 ) 1612 { 1613 parseInput = rfcDateDayRegex.cap(1); 1614 } 1615 //Hack around a to strict RFCDate implementation in KDateTime. 1616 //See https://bugs.kde.org/show_bug.cgi?id=231062 1617 QRegExp rfcMonthLowercase( QStringLiteral("^\\d+\\s+\\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\b") ); 1618 if( rfcMonthLowercase.indexIn( parseInput ) != -1 ) 1619 { 1620 QString lowerMonth = rfcMonthLowercase.cap( 1 ); 1621 QString upperMonth = lowerMonth; 1622 upperMonth.replace( 0, 1, lowerMonth.at( 0 ).toUpper() ); 1623 parseInput.replace( lowerMonth, upperMonth ); 1624 } 1625 1626 QDateTime pubDate = QDateTime::fromString( parseInput, Qt::RFC2822Date ); 1627 1628 debug() << "result: " << pubDate.toString(); 1629 return pubDate; 1630 } 1631 1632 void 1633 PodcastReader::slotRedirection( KIO::Job * job, const QUrl &url ) 1634 { 1635 DEBUG_BLOCK 1636 Q_UNUSED( job ); 1637 debug() << "redirected to: " << url.url(); 1638 } 1639 1640 void 1641 PodcastReader::slotPermanentRedirection( KIO::Job * job, const QUrl &fromUrl, 1642 const QUrl &toUrl ) 1643 { 1644 DEBUG_BLOCK 1645 Q_UNUSED( job ); 1646 Q_UNUSED( fromUrl ); 1647 debug() << "permanently redirected to: " << toUrl.url(); 1648 m_url = toUrl; 1649 /* change the url for existing feeds as well. Permanent redirection means the old one 1650 might disappear soon. */ 1651 if( m_channel ) 1652 m_channel->setUrl( m_url ); 1653 } 1654 1655 Podcasts::PodcastEpisodePtr 1656 PodcastReader::podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode ) 1657 { 1658 // DEBUG_BLOCK 1659 Podcasts::PodcastEpisodePtr episodeMatch = episode; 1660 Podcasts::PodcastEpisodeList episodes = m_channel->episodes(); 1661 1662 // debug() << "episode title: " << episode->title(); 1663 // debug() << "episode url: " << episode->prettyUrl(); 1664 // debug() << "episode guid: " << episode->guid(); 1665 1666 foreach( PodcastEpisodePtr match, episodes ) 1667 { 1668 // debug() << "match title: " << match->title(); 1669 // debug() << "match url: " << match->prettyUrl(); 1670 // debug() << "match guid: " << match->guid(); 1671 1672 int score = 0; 1673 if( !episode->title().isEmpty() && episode->title() == match->title() ) 1674 score += 1; 1675 if( !episode->prettyUrl().isEmpty() && episode->prettyUrl() == match->prettyUrl() ) 1676 score += 3; 1677 if( !episode->guid().isEmpty() && episode->guid() == match->guid() ) 1678 score += 3; 1679 1680 // debug() << "score: " << score; 1681 if( score >= 3 ) 1682 { 1683 episodeMatch = match; 1684 break; 1685 } 1686 } 1687 1688 return episodeMatch; 1689 } 1690