File indexing completed on 2024-05-19 04:56:25

0001 /**
0002  * \file amazonimporter.cpp
0003  * Amazon database importer.
0004  *
0005  * \b Project: Kid3
0006  * \author Urs Fleisch
0007  * \date 13 Dec 2009
0008  *
0009  * Copyright (C) 2009-2024  Urs Fleisch
0010  *
0011  * This file is part of Kid3.
0012  *
0013  * Kid3 is free software; you can redistribute it and/or modify
0014  * it under the terms of the GNU General Public License as published by
0015  * the Free Software Foundation; either version 2 of the License, or
0016  * (at your option) any later version.
0017  *
0018  * Kid3 is distributed in the hope that it will be useful,
0019  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0020  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0021  * GNU General Public License for more details.
0022  *
0023  * You should have received a copy of the GNU General Public License
0024  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
0025  */
0026 
0027 #include "amazonimporter.h"
0028 #include <QRegularExpression>
0029 #include "trackdatamodel.h"
0030 #include "amazonconfig.h"
0031 
0032 namespace {
0033 
0034 /**
0035  * Remove " [Explicit]" suffix from end of string.
0036  * @param str string to modify
0037  * @return modified string.
0038  */
0039 QString removeExplicit(QString str)
0040 {
0041   if (str.endsWith(QLatin1String(" [Explicit]"))) {
0042     str.truncate(str.length() - 11);
0043   }
0044   return str;
0045 }
0046 
0047 }
0048 
0049 
0050 /**
0051  * Constructor.
0052  *
0053  * @param netMgr network access manager
0054  * @param trackDataModel track data to be filled with imported values
0055  */
0056 AmazonImporter::AmazonImporter(
0057   QNetworkAccessManager* netMgr,
0058   TrackDataModel* trackDataModel)
0059   : ServerImporter(netMgr, trackDataModel)
0060 {
0061   setObjectName(QLatin1String("AmazonImporter"));
0062   m_headers["User-Agent"] =
0063       "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) "
0064       "Gecko/20090729 Firefox/3.5.2 GTB5";
0065 }
0066 
0067 /**
0068  * Name of import source.
0069  * @return name.
0070  */
0071 const char* AmazonImporter::name() const {
0072   return QT_TRANSLATE_NOOP("@default", "Amazon");
0073 }
0074 
0075 /** NULL-terminated array of server strings, 0 if not used */
0076 const char** AmazonImporter::serverList() const
0077 {
0078   static const char* servers[] = {
0079     // Parsing only works with English text
0080     "www.amazon.com",
0081     "www.amazon.co.uk",
0082     nullptr            // end of StrList
0083   };
0084   return servers;
0085 }
0086 
0087 /** default server, 0 to disable */
0088 const char* AmazonImporter::defaultServer() const { return "www.amazon.com"; }
0089 
0090 /** anchor to online help, 0 to disable */
0091 const char* AmazonImporter::helpAnchor() const { return "import-amazon"; }
0092 
0093 /** configuration, 0 if not used */
0094 ServerImporterConfig* AmazonImporter::config() const { return &AmazonConfig::instance(); }
0095 
0096 /** additional tags option, false if not used */
0097 bool AmazonImporter::additionalTags() const { return true; }
0098 
0099 /**
0100  * Process finished findCddbAlbum request.
0101  *
0102  * @param searchStr search data received
0103  */
0104 void AmazonImporter::parseFindResults(const QByteArray& searchStr)
0105 {
0106   /* products have the following format:
0107 <a class="a-link-normal s-access-detail-page  a-text-normal" title="The Avenger" href="http://www.amazon.com/Avenger-AMON-AMARTH/dp/B001VROVHO/ref=sr_1_1?s=music&amp;ie=UTF8&amp;qid=1426338609&amp;sr=1-1">
0108 (..)>by </span>(..)<a class="a-link-normal a-text-normal" href="/Amon-Amarth/e/B000APIBHO/ref=sr_ntt_srch_lnk_1?qid=1426338609&sr=1-1">Amon Amarth</a>
0109    */
0110   QString str = QString::fromUtf8(searchStr);
0111   QRegularExpression catIdTitleRe(
0112         QLatin1String(R"(href="[^"]+/(dp|ASIN|images|product|-)/([A-Z0-9]+))"
0113             R"([^"]+">.*<span[^>]*>([^<]+)</span>)"
0114             R"((?:[\s\n]*(?:</a>|</h2>|<div[^>]*>|<span[^>]*>))*by </span>)"
0115             R"([\s\n]*<(?:a|span)[^>]*>([^<]+)</)"));
0116 
0117   str.remove(QLatin1Char('\r'));
0118   m_albumListModel->clear();
0119   auto it = catIdTitleRe.globalMatch(str);
0120   while (it.hasNext()) {
0121     auto match = it.next();
0122     QString category = match.captured(1);
0123     QString id = match.captured(2);
0124     QString artistTitle = replaceHtmlEntities(
0125           match.captured(4).trimmed() + QLatin1String(" - ") +
0126           removeExplicit(match.captured(3).trimmed()));
0127     m_albumListModel->appendItem(artistTitle, category, id);
0128   }
0129 }
0130 
0131 /**
0132  * Parse result of album request and populate m_trackDataModel with results.
0133  *
0134  * @param albumStr album data received
0135  */
0136 void AmazonImporter::parseAlbumResults(const QByteArray& albumStr)
0137 {
0138   /*
0139 <span id="productTitle" class="a-size-large product-title-word-break">        The Avenger         </span>
0140 <span class="author(..)<a class="a-link-normal" href="/Amon-Amarth/e/B000APIBHO/ref=dp_byline_cont_music_1">Amon Amarth</a>
0141 
0142 <h2>Track Listings</h2>(..)<tr> <td>1</td> <td>Bleed for Ancient Gods</td> </tr>
0143   <tr> <td>2</td> <td>The Last with Pagan Blood</td>(..)
0144 
0145 <h2>Product details</h2>(..)
0146 <span class="a-text-bold">Manufacturer(..)</span> <span>Metal Blade</span>
0147 <span class="a-text-bold">Date First Available(..)</span> <span>April 4, 2009</span>
0148    */
0149   QString str = QString::fromUtf8(albumStr);
0150   FrameCollection framesHdr;
0151   const bool standardTags = getStandardTags();
0152   // search for 'dmusicProductTitle', next element after '>' until ' [' or '<' => album
0153   int end = 0;
0154   int start = str.indexOf(
0155       QLatin1String("<span id=\"productTitle\""));
0156   if (start >= 0 && standardTags) {
0157     start = str.indexOf(QLatin1Char('>'), start + 23);
0158     if (start >= 0) {
0159       end = str.indexOf(QLatin1Char('<'), start);
0160       if (end > start) {
0161         if (int bracketPos = str.indexOf(QLatin1String(" ["), start);
0162             bracketPos >= 0 && bracketPos < end) {
0163           end = bracketPos;
0164         }
0165         framesHdr.setAlbum(
0166               replaceHtmlEntities(str.mid(start + 1, end - start - 1)
0167                                   .trimmed()));
0168         // next 'ArtistLinkSection'
0169         start = str.indexOf(QLatin1String("<span class=\"author"), end);
0170         if (start >= 0) {
0171           end = str.indexOf(QLatin1Char('>'), start);
0172           if (end > start) {
0173 
0174             // next '<a', text after '>' until '<' => artist
0175             start = str.indexOf(QLatin1String("<a"), end);
0176             if (start >= 0) {
0177               start = str.indexOf(QLatin1Char('>'), start);
0178               if (start >= 0) {
0179                 end = str.indexOf(QLatin1Char('<'), start);
0180                 if (end > start) {
0181                   framesHdr.setArtist(
0182                       replaceHtmlEntities(str.mid(start + 1, end - start - 1)
0183                                           .trimmed()));
0184                 }
0185               }
0186             }
0187 
0188           }
0189         }
0190       }
0191     }
0192   }
0193 
0194   // search for >Product Details<, >Original Release Date:<, >Label:<
0195   const bool additionalTags = getAdditionalTags();
0196   QString albumArtist;
0197   start = str.indexOf(QLatin1String(">Product details<"));
0198   if (start >= 0) {
0199     QRegularExpression yearRe(
0200           QLatin1String(R"(>Date First Available.*?)"
0201                         R"(<span>[^<]*(\d{4})[^<]*</span>)"),
0202           QRegularExpression::DotMatchesEverythingOption);
0203     QRegularExpression labelRe(
0204           QLatin1String(R"(>Manufacturer.*?<span>([^<]+)</span>)"),
0205           QRegularExpression::DotMatchesEverythingOption);
0206     if (additionalTags) {
0207       QRegularExpressionMatch match = yearRe.match(str, start);
0208       if (match.hasMatch()) {
0209         framesHdr.setYear(match.captured(1).toInt());
0210       }
0211       match = labelRe.match(str, start);
0212       if (match.hasMatch()) {
0213         framesHdr.setValue(Frame::FT_Publisher, removeHtml(match.captured(1)));
0214       }
0215     }
0216   }
0217 
0218   ImportTrackDataVector trackDataVector(m_trackDataModel->getTrackData());
0219   trackDataVector.setCoverArtUrl(QUrl());
0220   if (getCoverArt()) {
0221     QRegularExpression imgSrcRe(
0222           QLatin1String("id=\"imgTagWrapperId\"[^>]*>\\s*"
0223                         "<img[^>]*src=\"([^\"]+)\""),
0224           QRegularExpression::DotMatchesEverythingOption);
0225     if (auto match = imgSrcRe.match(str); match.hasMatch()) {
0226       trackDataVector.setCoverArtUrl(QUrl(match.captured(1)));
0227     }
0228   }
0229 
0230   start = str.indexOf(QLatin1String("<h2>Track Listings</h2>"));
0231   if (start >= 0) {
0232     QRegularExpression trackNumberTitleRe(
0233           QLatin1String(R"(<td>(\d+)</td>\s*<td>([^<]+?)(?:\s*\[?(\d+):(\d+)\]?\s*)?</td>)"));
0234     FrameCollection frames(framesHdr);
0235     auto it = trackDataVector.begin();
0236     bool atTrackDataListEnd = it == trackDataVector.end();
0237     while (start >= 0) {
0238       start = str.indexOf(QLatin1String("<tr"), start);
0239       if (start >= 0) {
0240         end = str.indexOf(QLatin1String("</tr>"), start);
0241         if (end > start) {
0242           QString trackRow = str.mid(start, end - start);
0243           start = end + 5;
0244           QString title;
0245           int trackNr = 0;
0246           int duration = 0;
0247           if (auto match = trackNumberTitleRe.match(trackRow);
0248               match.hasMatch()) {
0249             trackNr = match.captured(1).toInt();
0250             title = match.captured(2).remove(QLatin1String("[*]")).trimmed();
0251             duration = match.captured(3).toInt() * 60 +
0252               match.captured(4).toInt();
0253           }
0254           if (!title.isEmpty()) {
0255             if (standardTags) {
0256               frames.setTitle(removeExplicit(replaceHtmlEntities(title)));
0257               frames.setTrack(trackNr);
0258             }
0259             if (atTrackDataListEnd) {
0260               ImportTrackData trackData;
0261               trackData.setFrameCollection(frames);
0262               trackData.setImportDuration(duration);
0263               trackDataVector.push_back(trackData);
0264             } else {
0265               while (!atTrackDataListEnd && !it->isEnabled()) {
0266                 ++it;
0267                 atTrackDataListEnd = it == trackDataVector.end();
0268               }
0269               if (!atTrackDataListEnd) {
0270                 it->setFrameCollection(frames);
0271                 it->setImportDuration(duration);
0272                 ++it;
0273                 atTrackDataListEnd = it == trackDataVector.end();
0274               }
0275             }
0276             frames = framesHdr;
0277           }
0278         }
0279       }
0280     }
0281 
0282     // handle redundant tracks
0283     frames.clear();
0284     while (!atTrackDataListEnd) {
0285       if (it->isEnabled()) {
0286         if (it->getFileDuration() == 0) {
0287           it = trackDataVector.erase(it);
0288         } else {
0289           it->setFrameCollection(frames);
0290           it->setImportDuration(0);
0291           ++it;
0292         }
0293       } else {
0294         ++it;
0295       }
0296       atTrackDataListEnd = it == trackDataVector.end();
0297     }
0298   } else if (!framesHdr.empty()) {
0299     // if there are no track data, fill frame header data
0300     for (auto it = trackDataVector.begin(); it != trackDataVector.end(); ++it) {
0301       if (it->isEnabled()) {
0302         it->setFrameCollection(framesHdr);
0303       }
0304     }
0305   }
0306   m_trackDataModel->setTrackData(trackDataVector);
0307 }
0308 
0309 /**
0310  * Send a query command to search on the server.
0311  *
0312  * @param cfg      import source configuration
0313  * @param artist   artist to search
0314  * @param album    album to search
0315  */
0316 void AmazonImporter::sendFindQuery(
0317   const ServerImporterConfig* cfg,
0318   const QString& artist, const QString& album)
0319 {
0320   // If an URL is entered in the first search field, its result will be directly
0321   // available in the album results list.
0322   if (artist.startsWith(QLatin1String("https://www.amazon.com/"))) {
0323     constexpr int catBegin = 23;
0324     if (int catEnd = artist.indexOf(QLatin1Char('/'), catBegin);
0325         catEnd > catBegin) {
0326       m_albumListModel->clear();
0327       m_albumListModel->appendItem(
0328             artist,
0329             artist.mid(catBegin, catEnd - catBegin),
0330             artist.mid(catEnd + 1));
0331       return;
0332     }
0333   }
0334   /*
0335    * Query looks like this:
0336    * http://www.amazon.com/gp/search/ref=sr_adv_m_pop/?search-alias=popular&field-artist=amon+amarth&field-title=the+avenger
0337    */
0338   sendRequest(cfg->server(),
0339               QLatin1String("/s?i=music-intl-ship&k=") +
0340               encodeUrlQuery(artist + QLatin1Char(' ') + album),
0341               QLatin1String("https"), m_headers);
0342 }
0343 
0344 /**
0345  * Send a query command to fetch the track list
0346  * from the server.
0347  *
0348  * @param cfg      import source configuration
0349  * @param cat      category
0350  * @param id       ID
0351  */
0352 void AmazonImporter::sendTrackListQuery(
0353   const ServerImporterConfig* cfg, const QString& cat, const QString& id)
0354 {
0355   /*
0356    * Query looks like this:
0357    * http://www.amazon.com/dp/B001VROVHO
0358    */
0359   sendRequest(cfg->server(), QLatin1Char('/') + cat + QLatin1Char('/') + id,
0360               QLatin1String("https"), m_headers);
0361 }