File indexing completed on 2024-05-19 04:56:25
0001 /** 0002 * \file amazonimporter.cpp 0003 * Amazon database importer. 0004 * 0005 * \b Project: Kid3 0006 * \author Urs Fleisch 0007 * \date 13 Dec 2009 0008 * 0009 * Copyright (C) 2009-2024 Urs Fleisch 0010 * 0011 * This file is part of Kid3. 0012 * 0013 * Kid3 is free software; you can redistribute it and/or modify 0014 * it under the terms of the GNU General Public License as published by 0015 * the Free Software Foundation; either version 2 of the License, or 0016 * (at your option) any later version. 0017 * 0018 * Kid3 is distributed in the hope that it will be useful, 0019 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0020 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 0021 * GNU General Public License for more details. 0022 * 0023 * You should have received a copy of the GNU General Public License 0024 * along with this program. If not, see <http://www.gnu.org/licenses/>. 0025 */ 0026 0027 #include "amazonimporter.h" 0028 #include <QRegularExpression> 0029 #include "trackdatamodel.h" 0030 #include "amazonconfig.h" 0031 0032 namespace { 0033 0034 /** 0035 * Remove " [Explicit]" suffix from end of string. 0036 * @param str string to modify 0037 * @return modified string. 0038 */ 0039 QString removeExplicit(QString str) 0040 { 0041 if (str.endsWith(QLatin1String(" [Explicit]"))) { 0042 str.truncate(str.length() - 11); 0043 } 0044 return str; 0045 } 0046 0047 } 0048 0049 0050 /** 0051 * Constructor. 0052 * 0053 * @param netMgr network access manager 0054 * @param trackDataModel track data to be filled with imported values 0055 */ 0056 AmazonImporter::AmazonImporter( 0057 QNetworkAccessManager* netMgr, 0058 TrackDataModel* trackDataModel) 0059 : ServerImporter(netMgr, trackDataModel) 0060 { 0061 setObjectName(QLatin1String("AmazonImporter")); 0062 m_headers["User-Agent"] = 0063 "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) " 0064 "Gecko/20090729 Firefox/3.5.2 GTB5"; 0065 } 0066 0067 /** 0068 * Name of import source. 0069 * @return name. 0070 */ 0071 const char* AmazonImporter::name() const { 0072 return QT_TRANSLATE_NOOP("@default", "Amazon"); 0073 } 0074 0075 /** NULL-terminated array of server strings, 0 if not used */ 0076 const char** AmazonImporter::serverList() const 0077 { 0078 static const char* servers[] = { 0079 // Parsing only works with English text 0080 "www.amazon.com", 0081 "www.amazon.co.uk", 0082 nullptr // end of StrList 0083 }; 0084 return servers; 0085 } 0086 0087 /** default server, 0 to disable */ 0088 const char* AmazonImporter::defaultServer() const { return "www.amazon.com"; } 0089 0090 /** anchor to online help, 0 to disable */ 0091 const char* AmazonImporter::helpAnchor() const { return "import-amazon"; } 0092 0093 /** configuration, 0 if not used */ 0094 ServerImporterConfig* AmazonImporter::config() const { return &AmazonConfig::instance(); } 0095 0096 /** additional tags option, false if not used */ 0097 bool AmazonImporter::additionalTags() const { return true; } 0098 0099 /** 0100 * Process finished findCddbAlbum request. 0101 * 0102 * @param searchStr search data received 0103 */ 0104 void AmazonImporter::parseFindResults(const QByteArray& searchStr) 0105 { 0106 /* products have the following format: 0107 <a class="a-link-normal s-access-detail-page a-text-normal" title="The Avenger" href="http://www.amazon.com/Avenger-AMON-AMARTH/dp/B001VROVHO/ref=sr_1_1?s=music&ie=UTF8&qid=1426338609&sr=1-1"> 0108 (..)>by </span>(..)<a class="a-link-normal a-text-normal" href="/Amon-Amarth/e/B000APIBHO/ref=sr_ntt_srch_lnk_1?qid=1426338609&sr=1-1">Amon Amarth</a> 0109 */ 0110 QString str = QString::fromUtf8(searchStr); 0111 QRegularExpression catIdTitleRe( 0112 QLatin1String(R"(href="[^"]+/(dp|ASIN|images|product|-)/([A-Z0-9]+))" 0113 R"([^"]+">.*<span[^>]*>([^<]+)</span>)" 0114 R"((?:[\s\n]*(?:</a>|</h2>|<div[^>]*>|<span[^>]*>))*by </span>)" 0115 R"([\s\n]*<(?:a|span)[^>]*>([^<]+)</)")); 0116 0117 str.remove(QLatin1Char('\r')); 0118 m_albumListModel->clear(); 0119 auto it = catIdTitleRe.globalMatch(str); 0120 while (it.hasNext()) { 0121 auto match = it.next(); 0122 QString category = match.captured(1); 0123 QString id = match.captured(2); 0124 QString artistTitle = replaceHtmlEntities( 0125 match.captured(4).trimmed() + QLatin1String(" - ") + 0126 removeExplicit(match.captured(3).trimmed())); 0127 m_albumListModel->appendItem(artistTitle, category, id); 0128 } 0129 } 0130 0131 /** 0132 * Parse result of album request and populate m_trackDataModel with results. 0133 * 0134 * @param albumStr album data received 0135 */ 0136 void AmazonImporter::parseAlbumResults(const QByteArray& albumStr) 0137 { 0138 /* 0139 <span id="productTitle" class="a-size-large product-title-word-break"> The Avenger </span> 0140 <span class="author(..)<a class="a-link-normal" href="/Amon-Amarth/e/B000APIBHO/ref=dp_byline_cont_music_1">Amon Amarth</a> 0141 0142 <h2>Track Listings</h2>(..)<tr> <td>1</td> <td>Bleed for Ancient Gods</td> </tr> 0143 <tr> <td>2</td> <td>The Last with Pagan Blood</td>(..) 0144 0145 <h2>Product details</h2>(..) 0146 <span class="a-text-bold">Manufacturer(..)</span> <span>Metal Blade</span> 0147 <span class="a-text-bold">Date First Available(..)</span> <span>April 4, 2009</span> 0148 */ 0149 QString str = QString::fromUtf8(albumStr); 0150 FrameCollection framesHdr; 0151 const bool standardTags = getStandardTags(); 0152 // search for 'dmusicProductTitle', next element after '>' until ' [' or '<' => album 0153 int end = 0; 0154 int start = str.indexOf( 0155 QLatin1String("<span id=\"productTitle\"")); 0156 if (start >= 0 && standardTags) { 0157 start = str.indexOf(QLatin1Char('>'), start + 23); 0158 if (start >= 0) { 0159 end = str.indexOf(QLatin1Char('<'), start); 0160 if (end > start) { 0161 if (int bracketPos = str.indexOf(QLatin1String(" ["), start); 0162 bracketPos >= 0 && bracketPos < end) { 0163 end = bracketPos; 0164 } 0165 framesHdr.setAlbum( 0166 replaceHtmlEntities(str.mid(start + 1, end - start - 1) 0167 .trimmed())); 0168 // next 'ArtistLinkSection' 0169 start = str.indexOf(QLatin1String("<span class=\"author"), end); 0170 if (start >= 0) { 0171 end = str.indexOf(QLatin1Char('>'), start); 0172 if (end > start) { 0173 0174 // next '<a', text after '>' until '<' => artist 0175 start = str.indexOf(QLatin1String("<a"), end); 0176 if (start >= 0) { 0177 start = str.indexOf(QLatin1Char('>'), start); 0178 if (start >= 0) { 0179 end = str.indexOf(QLatin1Char('<'), start); 0180 if (end > start) { 0181 framesHdr.setArtist( 0182 replaceHtmlEntities(str.mid(start + 1, end - start - 1) 0183 .trimmed())); 0184 } 0185 } 0186 } 0187 0188 } 0189 } 0190 } 0191 } 0192 } 0193 0194 // search for >Product Details<, >Original Release Date:<, >Label:< 0195 const bool additionalTags = getAdditionalTags(); 0196 QString albumArtist; 0197 start = str.indexOf(QLatin1String(">Product details<")); 0198 if (start >= 0) { 0199 QRegularExpression yearRe( 0200 QLatin1String(R"(>Date First Available.*?)" 0201 R"(<span>[^<]*(\d{4})[^<]*</span>)"), 0202 QRegularExpression::DotMatchesEverythingOption); 0203 QRegularExpression labelRe( 0204 QLatin1String(R"(>Manufacturer.*?<span>([^<]+)</span>)"), 0205 QRegularExpression::DotMatchesEverythingOption); 0206 if (additionalTags) { 0207 QRegularExpressionMatch match = yearRe.match(str, start); 0208 if (match.hasMatch()) { 0209 framesHdr.setYear(match.captured(1).toInt()); 0210 } 0211 match = labelRe.match(str, start); 0212 if (match.hasMatch()) { 0213 framesHdr.setValue(Frame::FT_Publisher, removeHtml(match.captured(1))); 0214 } 0215 } 0216 } 0217 0218 ImportTrackDataVector trackDataVector(m_trackDataModel->getTrackData()); 0219 trackDataVector.setCoverArtUrl(QUrl()); 0220 if (getCoverArt()) { 0221 QRegularExpression imgSrcRe( 0222 QLatin1String("id=\"imgTagWrapperId\"[^>]*>\\s*" 0223 "<img[^>]*src=\"([^\"]+)\""), 0224 QRegularExpression::DotMatchesEverythingOption); 0225 if (auto match = imgSrcRe.match(str); match.hasMatch()) { 0226 trackDataVector.setCoverArtUrl(QUrl(match.captured(1))); 0227 } 0228 } 0229 0230 start = str.indexOf(QLatin1String("<h2>Track Listings</h2>")); 0231 if (start >= 0) { 0232 QRegularExpression trackNumberTitleRe( 0233 QLatin1String(R"(<td>(\d+)</td>\s*<td>([^<]+?)(?:\s*\[?(\d+):(\d+)\]?\s*)?</td>)")); 0234 FrameCollection frames(framesHdr); 0235 auto it = trackDataVector.begin(); 0236 bool atTrackDataListEnd = it == trackDataVector.end(); 0237 while (start >= 0) { 0238 start = str.indexOf(QLatin1String("<tr"), start); 0239 if (start >= 0) { 0240 end = str.indexOf(QLatin1String("</tr>"), start); 0241 if (end > start) { 0242 QString trackRow = str.mid(start, end - start); 0243 start = end + 5; 0244 QString title; 0245 int trackNr = 0; 0246 int duration = 0; 0247 if (auto match = trackNumberTitleRe.match(trackRow); 0248 match.hasMatch()) { 0249 trackNr = match.captured(1).toInt(); 0250 title = match.captured(2).remove(QLatin1String("[*]")).trimmed(); 0251 duration = match.captured(3).toInt() * 60 + 0252 match.captured(4).toInt(); 0253 } 0254 if (!title.isEmpty()) { 0255 if (standardTags) { 0256 frames.setTitle(removeExplicit(replaceHtmlEntities(title))); 0257 frames.setTrack(trackNr); 0258 } 0259 if (atTrackDataListEnd) { 0260 ImportTrackData trackData; 0261 trackData.setFrameCollection(frames); 0262 trackData.setImportDuration(duration); 0263 trackDataVector.push_back(trackData); 0264 } else { 0265 while (!atTrackDataListEnd && !it->isEnabled()) { 0266 ++it; 0267 atTrackDataListEnd = it == trackDataVector.end(); 0268 } 0269 if (!atTrackDataListEnd) { 0270 it->setFrameCollection(frames); 0271 it->setImportDuration(duration); 0272 ++it; 0273 atTrackDataListEnd = it == trackDataVector.end(); 0274 } 0275 } 0276 frames = framesHdr; 0277 } 0278 } 0279 } 0280 } 0281 0282 // handle redundant tracks 0283 frames.clear(); 0284 while (!atTrackDataListEnd) { 0285 if (it->isEnabled()) { 0286 if (it->getFileDuration() == 0) { 0287 it = trackDataVector.erase(it); 0288 } else { 0289 it->setFrameCollection(frames); 0290 it->setImportDuration(0); 0291 ++it; 0292 } 0293 } else { 0294 ++it; 0295 } 0296 atTrackDataListEnd = it == trackDataVector.end(); 0297 } 0298 } else if (!framesHdr.empty()) { 0299 // if there are no track data, fill frame header data 0300 for (auto it = trackDataVector.begin(); it != trackDataVector.end(); ++it) { 0301 if (it->isEnabled()) { 0302 it->setFrameCollection(framesHdr); 0303 } 0304 } 0305 } 0306 m_trackDataModel->setTrackData(trackDataVector); 0307 } 0308 0309 /** 0310 * Send a query command to search on the server. 0311 * 0312 * @param cfg import source configuration 0313 * @param artist artist to search 0314 * @param album album to search 0315 */ 0316 void AmazonImporter::sendFindQuery( 0317 const ServerImporterConfig* cfg, 0318 const QString& artist, const QString& album) 0319 { 0320 // If an URL is entered in the first search field, its result will be directly 0321 // available in the album results list. 0322 if (artist.startsWith(QLatin1String("https://www.amazon.com/"))) { 0323 constexpr int catBegin = 23; 0324 if (int catEnd = artist.indexOf(QLatin1Char('/'), catBegin); 0325 catEnd > catBegin) { 0326 m_albumListModel->clear(); 0327 m_albumListModel->appendItem( 0328 artist, 0329 artist.mid(catBegin, catEnd - catBegin), 0330 artist.mid(catEnd + 1)); 0331 return; 0332 } 0333 } 0334 /* 0335 * Query looks like this: 0336 * http://www.amazon.com/gp/search/ref=sr_adv_m_pop/?search-alias=popular&field-artist=amon+amarth&field-title=the+avenger 0337 */ 0338 sendRequest(cfg->server(), 0339 QLatin1String("/s?i=music-intl-ship&k=") + 0340 encodeUrlQuery(artist + QLatin1Char(' ') + album), 0341 QLatin1String("https"), m_headers); 0342 } 0343 0344 /** 0345 * Send a query command to fetch the track list 0346 * from the server. 0347 * 0348 * @param cfg import source configuration 0349 * @param cat category 0350 * @param id ID 0351 */ 0352 void AmazonImporter::sendTrackListQuery( 0353 const ServerImporterConfig* cfg, const QString& cat, const QString& id) 0354 { 0355 /* 0356 * Query looks like this: 0357 * http://www.amazon.com/dp/B001VROVHO 0358 */ 0359 sendRequest(cfg->server(), QLatin1Char('/') + cat + QLatin1Char('/') + id, 0360 QLatin1String("https"), m_headers); 0361 }