File indexing completed on 2024-12-29 04:50:00
0001 /* 0002 SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "genericboardingpassextractor.h" 0008 #include "flightutil_p.h" 0009 #include "locationutil.h" 0010 #include "logging.h" 0011 #include "stringutil.h" 0012 0013 #include "knowledgedb/airportdb.h" 0014 #include "knowledgedb/airportnametokenizer_p.h" 0015 #include "pdf/pdfdocument.h" 0016 #include "text/terminalfinder_p.h" 0017 #include "text/timefinder_p.h" 0018 0019 #include <KItinerary/ExtractorDocumentNode> 0020 #include <KItinerary/ExtractorResult> 0021 #include <KItinerary/Flight> 0022 #include <KItinerary/Reservation> 0023 0024 #include <QDebug> 0025 #include <QTimeZone> 0026 0027 #include <unordered_map> 0028 0029 using namespace KItinerary; 0030 0031 GenericBoardingPassExtractor::GenericBoardingPassExtractor() 0032 { 0033 m_filter.setMimeType(QStringLiteral("internal/iata-bcbp")); 0034 m_filter.setScope(ExtractorFilter::Descendants); 0035 } 0036 0037 GenericBoardingPassExtractor::~GenericBoardingPassExtractor() = default; 0038 0039 QString GenericBoardingPassExtractor::name() const 0040 { 0041 return QStringLiteral("<Generic PDF Boarding Pass>"); 0042 } 0043 0044 bool GenericBoardingPassExtractor::canHandle(const ExtractorDocumentNode &node) const 0045 { 0046 return node.content<PdfDocument*>() && m_filter.matches(node); 0047 } 0048 0049 static void mergeOrAppend(QStringList &l, QStringView s) 0050 { 0051 for (auto &n : l) { 0052 if (n.compare(s, Qt::CaseInsensitive) == 0) { 0053 n = StringUtil::betterString(n, s).toString(); 0054 return; 0055 } 0056 } 0057 l.push_back(s.toString()); 0058 } 0059 0060 static int airportDistance(KnowledgeDb::IataCode from, KnowledgeDb::IataCode to) 0061 { 0062 const auto fromCoord = KnowledgeDb::coordinateForAirport(from); 0063 const auto toCoord = KnowledgeDb::coordinateForAirport(to); 0064 if (!fromCoord.isValid() || !toCoord.isValid()) { 0065 return std::numeric_limits<int>::max(); 0066 } 0067 return LocationUtil::distance({fromCoord.latitude, fromCoord.longitude}, {toCoord.latitude, toCoord.longitude}); 0068 } 0069 0070 static bool isPlausibleBoardingTime(const QDateTime &boarding, const QDateTime &departure) 0071 { 0072 return boarding < departure && boarding.secsTo(departure) <= 3600; 0073 } 0074 0075 static bool isPlausibleFlightTime(const QDateTime &fromTime, const QDateTime &toTime, KnowledgeDb::IataCode from, KnowledgeDb::IataCode to) 0076 { 0077 const auto distance = airportDistance(from, to); 0078 0079 // times are local, so convert them to the right timezone first 0080 auto fromDt = fromTime; 0081 fromDt.setTimeZone(KnowledgeDb::timezoneForAirport(from)); 0082 auto toDt = toTime; 0083 toDt.setTimeZone(KnowledgeDb::timezoneForAirport(to)); 0084 0085 const auto flightDuration = fromDt.secsTo(toDt); 0086 if (flightDuration < 3600) { 0087 return false; 0088 } 0089 return fromDt < toDt && FlightUtil::isPlausibleDistanceForDuration(distance, flightDuration); 0090 } 0091 0092 [[nodiscard]] static qint64 flightDuration(const QDateTime &fromTime, const QDateTime &toTime, KnowledgeDb::IataCode from, KnowledgeDb::IataCode to) 0093 { 0094 // times are local, so convert them to the right timezone first 0095 auto fromDt = fromTime; 0096 fromDt.setTimeZone(KnowledgeDb::timezoneForAirport(from)); 0097 auto toDt = toTime; 0098 toDt.setTimeZone(KnowledgeDb::timezoneForAirport(to)); 0099 return fromDt.secsTo(toDt) / 60; 0100 } 0101 0102 static bool conflictIfSet(const QDateTime &lhs, const QDateTime &rhs) 0103 { 0104 return lhs.isValid() && rhs.isValid() && lhs != rhs; 0105 } 0106 0107 static void applyFlightTimes(QList<QVariant> &result, const QDateTime &boarding, const QDateTime &dep, const QDateTime &arr) 0108 { 0109 for (auto &res : result) { 0110 auto flightRes = res.value<FlightReservation>(); 0111 auto flight = flightRes.reservationFor().value<Flight>(); 0112 0113 // check if already set times match, otherwise discard the entire set 0114 if (conflictIfSet(flight.boardingTime(), boarding) || conflictIfSet(flight.departureTime(), dep) || conflictIfSet(flight.arrivalTime(), arr)) { 0115 continue; 0116 } 0117 0118 // apply not yet set times 0119 if (!flight.boardingTime().isValid() && boarding.isValid()) { 0120 flight.setBoardingTime(boarding); 0121 } 0122 if (!flight.departureTime().isValid() && dep.isValid()) { 0123 flight.setDepartureTime(dep); 0124 } 0125 if (!flight.arrivalTime().isValid() && arr.isValid()) { 0126 flight.setArrivalTime(arr); 0127 } 0128 flightRes.setReservationFor(flight); 0129 res = flightRes; 0130 } 0131 } 0132 0133 ExtractorResult GenericBoardingPassExtractor::extract(const ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const 0134 { 0135 static TerminalFinder terminalFinder(u"^", u"(?=\\b|\\s|$)"); 0136 0137 QList<QVariant> fullResult; 0138 0139 const auto pdf = node.content<PdfDocument*>(); 0140 0141 std::vector<ExtractorDocumentNode> bcbpNodes; 0142 m_filter.allMatches(node, bcbpNodes); 0143 std::remove_if(bcbpNodes.begin(), bcbpNodes.end(), [](const auto &node) { 0144 return node.location().userType() != QMetaType::Int || node.result().isEmpty(); 0145 }); 0146 std::sort(bcbpNodes.begin(), bcbpNodes.end(), [](const auto &lhs, const auto &rhs) { return lhs.location().toInt() < rhs.location().toInt(); }); 0147 0148 for (auto it = bcbpNodes.begin(); it != bcbpNodes.end(); ++it) { 0149 QDate departureDay; 0150 KnowledgeDb::IataCode from, to; 0151 QList<QVariant> result; 0152 0153 // 1 determine which airports we need to look for on the same page 0154 const auto pageNum = (*it).location().toInt(); 0155 std::unordered_map<KnowledgeDb::IataCode, QStringList> airportNames; 0156 std::unordered_map<KnowledgeDb::IataCode, QString> terminalNames; 0157 for (auto it2 = it; it2 != bcbpNodes.end() && (*it2).location().toInt() == pageNum; ++it2) { 0158 const auto flightReservations = (*it).result().result(); 0159 for (const auto &flightRes : flightReservations) { 0160 const auto flight = flightRes.value<FlightReservation>().reservationFor().value<Flight>(); 0161 if (!flight.departureAirport().iataCode().isEmpty()) { 0162 from = KnowledgeDb::IataCode{flight.departureAirport().iataCode()}; 0163 airportNames[from] = QStringList(); 0164 terminalNames[from] = QString(); 0165 } 0166 if (!flight.arrivalAirport().iataCode().isEmpty()) { 0167 to = KnowledgeDb::IataCode{flight.arrivalAirport().iataCode()}; 0168 airportNames[to] = QStringList(); 0169 terminalNames[to] = QString(); 0170 } 0171 departureDay = flight.departureDay(); 0172 } 0173 } 0174 0175 // 2 tokenize the page and scan for airport names 0176 const auto page = pdf->page(pageNum); 0177 qCDebug(Log) << "scanning page" << pageNum << "for airport names"; 0178 const auto pageText = page.text(); 0179 AirportNameTokenizer tokenizer(pageText); 0180 while (tokenizer.hasNext()) { 0181 const auto s = tokenizer.next(); 0182 if (s.compare(QLatin1StringView("international"), 0183 Qt::CaseInsensitive) == 0) { 0184 qCDebug(Log) << " ignoring" << s; 0185 continue; 0186 } 0187 0188 // IATA code of one of the airports 0189 if (const auto code = KnowledgeDb::IataCode(s); !s.isNull() && airportNames.find(KnowledgeDb::IataCode{s}) != airportNames.end()) { 0190 // also look for terminal information after the IATA code itself 0191 const auto offset = s.size() + s.data() - pageText.data(); 0192 const auto res = terminalFinder.find(QStringView(pageText).mid(offset)); 0193 if (res.hasResult() && res.name != s.toString()) { 0194 terminalNames[code] = res.name; 0195 } 0196 0197 qCDebug(Log) << " found own IATA code" << s; 0198 continue; 0199 } 0200 0201 const auto iataCodes = KnowledgeDb::iataCodesFromName(s); 0202 for (const auto code : iataCodes) { 0203 auto it2 = airportNames.find(code); 0204 if (it2 != airportNames.end()) { 0205 qCDebug(Log) << " found candidate:" << s << iataCodes; 0206 mergeOrAppend((*it2).second, s); 0207 0208 // look for a following terminal name at the position after s 0209 const auto offset = s.size() + s.data() - pageText.data(); 0210 const auto res = terminalFinder.find(QStringView(pageText).mid(offset)); 0211 if (res.hasResult() && res.name != code.toString()) { 0212 terminalNames[(*it2).first] = res.name; 0213 } 0214 } 0215 } 0216 } 0217 0218 // 3 augment the results with what we found 0219 const auto flightReservations = (*it).result().result(); 0220 for (const auto &res : flightReservations) { 0221 auto flightRes = res.value<FlightReservation>(); 0222 auto flight = flightRes.reservationFor().value<Flight>(); 0223 auto airport = flight.departureAirport(); 0224 airport.setName(airportNames[KnowledgeDb::IataCode{airport.iataCode()}].join(QLatin1Char(' '))); 0225 flight.setDepartureAirport(airport); 0226 flight.setDepartureTerminal(terminalNames[KnowledgeDb::IataCode{airport.iataCode()}]); 0227 airport = flight.arrivalAirport(); 0228 airport.setName(airportNames[KnowledgeDb::IataCode{airport.iataCode()}].join(QLatin1Char(' '))); 0229 flight.setArrivalAirport(airport); 0230 flight.setArrivalTerminal(terminalNames[KnowledgeDb::IataCode{airport.iataCode()}]); 0231 flightRes.setReservationFor(flight); 0232 result.push_back(std::move(flightRes)); 0233 } 0234 0235 // 4 if there's only a single leg on this page, try to see if we can determine times 0236 if (airportNames.size() == 2) { 0237 TimeFinder timeFinder; 0238 timeFinder.find(pageText); 0239 std::vector<QDateTime> times; 0240 for (const auto &res : timeFinder.results()) { 0241 switch (res.dateTime.userType()) { 0242 case QMetaType::QTime: 0243 times.push_back(QDateTime(departureDay, res.dateTime.toTime())); 0244 break; 0245 case QMetaType::QDateTime: 0246 if (res.dateTime.toDateTime().date() == departureDay) { 0247 times.push_back(res.dateTime.toDateTime()); 0248 } 0249 break; 0250 case QMetaType::QDate: 0251 default: 0252 break; 0253 } 0254 } 0255 std::sort(times.begin(), times.end()); 0256 times.erase(std::unique(times.begin(), times.end()), times.end()); 0257 if (times.size() == 2) { 0258 // boarding/departure only, and on the same day 0259 if (isPlausibleBoardingTime(times[0], times[1]) && !isPlausibleFlightTime(times[0], times[1], from, to)) { 0260 applyFlightTimes(result, times[0], times[1], {}); 0261 } 0262 } else if (times.size() == 3) { 0263 // boarding/departure/arrival on the same day 0264 if (isPlausibleBoardingTime(times[0], times[1]) && isPlausibleFlightTime(times[1], times[2], from, to)) { 0265 applyFlightTimes(result, times[0], times[1], times[2]); 0266 // boarding/departure on the same day, arrival on the next day 0267 } else if (isPlausibleBoardingTime(times[1], times[2]) && isPlausibleFlightTime(times[2], times[0].addDays(1), from, to)) { 0268 applyFlightTimes(result, times[1], times[2], times[0].addDays(1)); 0269 } 0270 // TODO handle boarding before midnight 0271 // departure/arrival/duration 0272 else if (isPlausibleFlightTime(times[1], times[2], from, to) && flightDuration(times[1], times[2], from, to) == (times[0].time().hour() * 60 + times[0].time().minute())) { 0273 applyFlightTimes(result, {}, times[1], times[2]); 0274 } 0275 } 0276 } 0277 0278 fullResult += result; 0279 } 0280 0281 return fullResult; 0282 }