File indexing completed on 2024-11-24 04:45:06

0001 /*
0002     SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "trainstationdbgenerator.h"
0008 #include "codegen.h"
0009 #include "util.h"
0010 #include "wikidata.h"
0011 
0012 #include <QDebug>
0013 #include <QIODevice>
0014 #include <QJsonArray>
0015 #include <QJsonObject>
0016 
0017 using namespace KItinerary::Generator;
0018 
0019 namespace KItinerary {
0020 namespace Generator {
0021 
0022 static bool operator<(const TrainStationDbGenerator::Station &lhs, const TrainStationDbGenerator::Station &rhs)
0023 {
0024     return lhs.uri < rhs.uri;
0025 }
0026 static bool operator<(const TrainStationDbGenerator::Station &lhs, const QUrl &rhs)
0027 {
0028     return lhs.uri < rhs;
0029 }
0030 
0031 }
0032 }
0033 
0034 bool TrainStationDbGenerator::generate(QIODevice *out)
0035 {
0036     // retrieve content from Wikidata
0037     if (!fetch("P954", "ibnr", m_ibnrMap)
0038      || !fetch("P722", "uic", m_uicMap)
0039      || !fetch("P8181", "sncf", m_sncfIdMap)
0040      || !fetch("P8448", "benerail", m_benerailIdMap)
0041      || !fetch("P238", "iata", m_iataMap)
0042      || !fetch("P4803", "amtrak", m_amtrakMap)
0043      || !fetch("P10653", "viarail", m_viaRailMap)
0044      || !fetch("P4755", "uk", m_ukMap)
0045      || !fetchIndianRailwaysStationCode()
0046      || !fetchFinishStationCodes()
0047     ) {
0048         return false;
0049     }
0050     if (!fetchCountryInformation()) {
0051          return false;
0052      }
0053 
0054     // filtering out stations without useful information
0055     processStations();
0056 
0057     // code generation
0058     CodeGen::writeLicenseHeaderWikidata(out);
0059     out->write(R"(
0060 #include "knowledgedb.h"
0061 #include "trainstationdb.h"
0062 
0063 namespace KItinerary {
0064 namespace KnowledgeDb {
0065 )");
0066     writeStationData(out);
0067     writeIdMap(out, m_ibnrMap, "ibnr", "IBNR");
0068     writeIdMap(out, m_uicMap, "uic", "UICStation");
0069     writeIdMap(out, m_sncfIdMap, "sncfStationId", "SncfStationId");
0070     writeIdMap(out, m_benerailIdMap, "benerail", "BenerailStationId");
0071     writeIdMap(out, m_iataMap, "iata", "IataCode");
0072     writeIdMap(out, m_amtrakMap, "amtrak", "AmtrakStationCode");
0073     writeIdMap(out, m_viaRailMap, "viarail", "ViaRailStationCode");
0074     writeIdMap(out, m_ukMap, "uk", "UKRailwayStationCode");
0075     writeIndianRailwaysMap(out);
0076     writeVRMap(out);
0077     out->write(R"(
0078 }
0079 }
0080 )");
0081 
0082     printSummary();
0083     return true;
0084 }
0085 
0086 template<typename Id>
0087 bool TrainStationDbGenerator::fetch(const char *prop, const char *name, std::map<Id, QUrl> &idMap)
0088 {
0089   const auto stationArray =
0090       WikiData::query(QLatin1StringView(R"(
0091         SELECT DISTINCT ?station ?stationLabel ?id ?coord ?replacedBy ?dateOfOfficialClosure WHERE {
0092             ?station (wdt:P31/wdt:P279*) wd:Q55488.
0093             ?station wdt:)") +
0094                           QString::fromUtf8(prop) + QLatin1StringView(R"( ?id.
0095             OPTIONAL { ?station wdt:P625 ?coord. }
0096             OPTIONAL { ?station wdt:P1366 ?replacedBy. }
0097             OPTIONAL { ?station wdt:P3999 ?dateOfOfficialClosure. }
0098             SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
0099         } ORDER BY (?station))"),
0100                       QLatin1StringView("wikidata_trainstation_") +
0101                           QString::fromUtf8(name) + QLatin1String(".json"));
0102   if (stationArray.isEmpty()) {
0103     qWarning() << "Empty query result!";
0104     return false;
0105     }
0106 
0107     for (const auto &stationData : stationArray) {
0108         const auto stationObj = stationData.toObject();
0109         if (stationObj.contains(QLatin1StringView("replacedBy")) ||
0110             stationObj.contains(QLatin1String("dateOfOfficialClosure"))) {
0111           continue;
0112         }
0113 
0114         const auto uri = insertOrMerge(stationObj);
0115         const auto idStr = stationObj.value(QLatin1StringView("id"))
0116                                .toObject()
0117                                .value(QLatin1String("value"))
0118                                .toString()
0119                                .toUpper();
0120         const auto id = Id(idStr);
0121         if (!id.isValid()) {
0122             ++m_idFormatViolations;
0123             qWarning() << name << "format violation" << idStr << uri;
0124             continue;
0125         }
0126 
0127         const auto it = idMap.find(id);
0128         if (it != idMap.end() && (*it).second != uri) {
0129             ++m_idConflicts;
0130             qWarning() << "Conflict on" << name << idStr << uri << idMap[id];
0131         } else {
0132             idMap[id] = uri;
0133         }
0134     }
0135 
0136     return true;
0137 }
0138 
0139 bool TrainStationDbGenerator::fetchIndianRailwaysStationCode()
0140 {
0141     const auto stationArray = WikiData::query(R"(
0142         SELECT DISTINCT ?station ?stationLabel ?irId ?coord WHERE {
0143             ?station (wdt:P31/wdt:P279*) wd:Q55488.
0144             ?station wdt:P5696 ?irId.
0145             OPTIONAL { ?station wdt:P625 ?coord. }
0146             SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
0147         } ORDER BY (?station))", "wikidata_trainstation_indian_railways.json");
0148     if (stationArray.isEmpty()) {
0149         qWarning() << "Empty query result!";
0150         return false;
0151     }
0152 
0153     for (const auto &stationData : stationArray) {
0154         const auto stationObj = stationData.toObject();
0155         const auto uri = insertOrMerge(stationObj);
0156 
0157         const auto id = stationObj.value(QLatin1StringView("irId"))
0158                             .toObject()
0159                             .value(QLatin1String("value"))
0160                             .toString()
0161                             .toUpper();
0162         const auto it = m_indianRailwaysMap.find(id);
0163         if (it != m_indianRailwaysMap.end() && (*it).second != uri) {
0164             ++m_idConflicts;
0165             qWarning() << "Conflict on Indian Railwaiys station code" << id << uri << m_indianRailwaysMap[id];
0166         } else {
0167             m_indianRailwaysMap[id] = uri;
0168         }
0169     }
0170 
0171     return true;
0172 }
0173 
0174 bool TrainStationDbGenerator::fetchFinishStationCodes()
0175 {
0176     const auto stationArray = WikiData::query(R"(
0177         SELECT DISTINCT ?station ?stationLabel ?code ?coord ?ref WHERE {
0178             ?station (wdt:P31/wdt:P279*) wd:Q55488.
0179             ?station p:P296 ?codeStmt.
0180             ?codeStmt ps:P296 ?code.
0181             ?codeStmt prov:wasDerivedFrom ?refnode.
0182             ?refnode pr:P854 ?ref.
0183             OPTIONAL { ?station wdt:P625 ?coord. }
0184             SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
0185         } ORDER BY (?station))", "wikidata_trainstation_vrfi.json");
0186     if (stationArray.isEmpty()) {
0187         qWarning() << "Empty query result!";
0188         return false;
0189     }
0190 
0191     for (const auto &stationData : stationArray) {
0192         const auto stationObj = stationData.toObject();
0193         const auto ref = stationObj.value(QLatin1StringView("ref"))
0194                              .toObject()
0195                              .value(QLatin1String("value"))
0196                              .toString();
0197         if (!ref.contains(QLatin1StringView("rata.digitraffic.fi"),
0198                           Qt::CaseInsensitive)) {
0199           continue;
0200         }
0201         const auto uri = insertOrMerge(stationObj);
0202 
0203         // TODO this filters 'Ä' and 'Ö' too, which seem to occur in a few cases?
0204         const auto idStr = stationObj.value(QLatin1StringView("code"))
0205                                .toObject()
0206                                .value(QLatin1String("value"))
0207                                .toString()
0208                                .toUpper();
0209         const auto id = KnowledgeDb::VRStationCode(idStr);
0210         if (!id.isValid()) {
0211             ++m_idFormatViolations;
0212             qWarning() << "VR (Finland) station id format violation" << idStr << uri;
0213             continue;
0214         }
0215 
0216         const auto it = m_vrfiMap.find(id);
0217         if (it != m_vrfiMap.end() && (*it).second != uri) {
0218             ++m_idConflicts;
0219             qWarning() << "Conflict on VR (Finland) station code" << idStr << uri << m_vrfiMap[id];
0220         } else {
0221             m_vrfiMap[id] = uri;
0222         }
0223     }
0224 
0225     return true;
0226 }
0227 
0228 bool TrainStationDbGenerator::fetchCountryInformation()
0229 {
0230     const auto stationArray = WikiData::query(R"(
0231         SELECT DISTINCT ?station ?isoCode WHERE {
0232             ?station (wdt:P31/wdt:P279*) wd:Q55488.
0233             ?station wdt:P17 ?country.
0234             ?country p:P297 [ ps:P297 ?isoCode ].
0235         } ORDER BY (?station))", "wikidata_trainstation_country.json");
0236     if (stationArray.isEmpty()) {
0237         qWarning() << "Empty query result!";
0238         return false;
0239     }
0240 
0241     for (const auto &stationData : stationArray) {
0242         const auto uri = insertOrMerge(stationData.toObject(), true);
0243         Q_UNUSED(uri)
0244     }
0245 
0246     return true;
0247 }
0248 
0249 QUrl TrainStationDbGenerator::insertOrMerge(const QJsonObject &obj, bool mergeOnly)
0250 {
0251     if (obj.isEmpty()) {
0252         return {};
0253     }
0254 
0255     Station s;
0256     s.uri = QUrl(obj.value(QLatin1StringView("station"))
0257                      .toObject()
0258                      .value(QLatin1String("value"))
0259                      .toString());
0260     s.name = obj.value(QLatin1StringView("stationLabel"))
0261                  .toObject()
0262                  .value(QLatin1String("value"))
0263                  .toString();
0264     s.coord = WikiData::parseCoordinate(obj.value(QLatin1StringView("coord"))
0265                                             .toObject()
0266                                             .value(QLatin1String("value"))
0267                                             .toString());
0268     s.isoCode = obj.value(QLatin1StringView("isoCode"))
0269                     .toObject()
0270                     .value(QLatin1String("value"))
0271                     .toString();
0272 
0273     const auto it = std::lower_bound(m_stations.begin(), m_stations.end(), s);
0274     if (it != m_stations.end() && (*it).uri == s.uri) {
0275         if ((*it).name.isEmpty()) {
0276             (*it).name = s.name;
0277         }
0278         // check for coordinate conflicts
0279         if (s.coord.isValid() && (*it).coord.isValid()) {
0280             if (std::abs(s.coord.latitude - (*it).coord.latitude) > 0.2f || std::abs(s.coord.longitude - (*it).coord.longitude) > 0.2f) {
0281                 ++m_coordinateConflicts;
0282                 qWarning() << s.uri << "has multiple conflicting coordinates";
0283             }
0284             // pick always the same independent of the input order, so stabilize generated output
0285             (*it).coord.latitude = std::min((*it).coord.latitude, s.coord.latitude);
0286             (*it).coord.longitude = std::min((*it).coord.longitude, s.coord.longitude);
0287         }
0288         if ((*it).isoCode != s.isoCode && !s.isoCode.isEmpty()) {
0289             if (!(*it).isoCode.isEmpty()) {
0290                 ++m_countryConflicts;
0291                 qWarning() << s.uri << (*it).name << "has multiple country codes";
0292             } else {
0293                 (*it).isoCode = s.isoCode;
0294             }
0295         }
0296 
0297         return s.uri;
0298     }
0299 
0300     if (!mergeOnly) {
0301         m_stations.insert(it, s);
0302     }
0303     return s.uri;
0304 }
0305 
0306 void TrainStationDbGenerator::processStations()
0307 {
0308     for (auto it = m_stations.begin(); it != m_stations.end();) {
0309         if (!(*it).coord.isValid()) {
0310             qDebug() << "Station has no geo coordinates:" << (*it).name << (*it).uri;
0311         }
0312 
0313         if (!(*it).coord.isValid() && (*it).isoCode.isEmpty()) { // no useful information
0314             it = m_stations.erase(it);
0315         } else {
0316             ++it;
0317         }
0318     }
0319 }
0320 
0321 void TrainStationDbGenerator::writeStationData(QIODevice *out)
0322 {
0323     out->write("static const TrainStation trainstation_table[] = {\n");
0324     for (const auto &station : m_stations) {
0325         out->write("    {");
0326         CodeGen::writeCoordinate(out, station.coord);
0327         out->write(", ");
0328         CodeGen::writeCountryIsoCode(out, station.isoCode);
0329         out->write("}, // ");
0330         out->write(station.name.toUtf8());
0331         out->write("\n");
0332     }
0333     out->write("};\n\n");
0334 }
0335 
0336 template<typename Id>
0337 void TrainStationDbGenerator::writeIdMap(QIODevice *out, const std::map<Id, QUrl> &idMap, const char *tabName, const char *typeName) const
0338 {
0339     out->write("static constexpr const TrainStationIdIndex<");
0340     out->write(typeName);
0341     out->write("> ");
0342     out->write(tabName);
0343     out->write("_table[] = {\n");
0344     for (const auto &it : idMap) {
0345         const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
0346         if (station == m_stations.end() || (*station).uri != it.second) {
0347             continue;
0348         }
0349         out->write("    { ");
0350         out->write(typeName);
0351         out->write("{");
0352         out->write(encodeId(it.first));
0353         out->write("}, TrainStationIndex{");
0354         out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
0355         out->write("} }, // ");
0356         out->write((*station).name.toUtf8());
0357         out->write("\n");
0358     }
0359     out->write("};\n\n");
0360 }
0361 
0362 void TrainStationDbGenerator::writeIndianRailwaysMap(QIODevice *out)
0363 {
0364     // variable length identifiers, so we need a string table
0365     std::vector<uint16_t> offsets;
0366     offsets.reserve(m_indianRailwaysMap.size());
0367     uint16_t offset = 0;
0368 
0369     out->write("static constexpr const char indianRailwaysSationCode_stringtable[] =\n");
0370     for (const auto &it : m_indianRailwaysMap) {
0371         const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
0372         if (station == m_stations.end() || (*station).uri != it.second) {
0373             continue;
0374         }
0375 
0376         offsets.push_back(offset);
0377         out->write("    \"");
0378         out->write(it.first.toUtf8());
0379         out->write("\\0\" // ");
0380         out->write((*station).name.toUtf8());
0381         out->write("\n");
0382 
0383         offset += it.first.toUtf8().size() + 1; // +1 for the terminating null byte
0384     }
0385     out->write(";\n\n");
0386 
0387     out->write(
0388 R"(static constexpr const struct {
0389     uint16_t offset;
0390     TrainStationIndex stationIndex;
0391 } indianRailwaysSationCode_index[] = {
0392 )");
0393     int offsetIdx = 0;
0394     for (const auto &it : m_indianRailwaysMap) {
0395         const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
0396         if (station == m_stations.end() || (*station).uri != it.second) {
0397             continue;
0398         }
0399 
0400         out->write("    { ");
0401         out->write(QByteArray::number(offsets[offsetIdx++]));
0402         out->write(", TrainStationIndex{");
0403         out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
0404         out->write("} }, // ");
0405         out->write(it.first.toUtf8());
0406         out->write("\n");
0407     }
0408     out->write("};\n\n");
0409 }
0410 
0411 void TrainStationDbGenerator::writeVRMap(QIODevice *out)
0412 {
0413     out->write("static constexpr const TrainStationIdIndex<VRStationCode> vrfiConnexionsId_table[] = {\n");
0414     for (const auto &it : m_vrfiMap) {
0415         const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
0416         if (station == m_stations.end() || (*station).uri != it.second) {
0417             continue;
0418         }
0419         out->write("    { VRStationCode{\"");
0420         out->write(it.first.toString().toUtf8());
0421         for (int i = 0; i < 4 - it.first.toString().toUtf8().size(); ++i) {
0422             out->write("\\0");
0423         }
0424         out->write("\"}, TrainStationIndex{");
0425         out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
0426         out->write("} }, // ");
0427         out->write((*station).name.toUtf8());
0428         out->write("\n");
0429     }
0430     out->write("};\n\n");
0431 }
0432 
0433 void TrainStationDbGenerator::printSummary()
0434 {
0435     qDebug() << "Generated database containing" << m_stations.size() << "train stations";
0436     qDebug() << "IBNR index:" << m_ibnrMap.size() << "elements";
0437     qDebug() << "UIC index:" << m_uicMap.size() << "elements";
0438     qDebug() << "SNCF station code index:" << m_sncfIdMap.size() << "elements";
0439     qDebug() << "Benerail station code index:" << m_benerailIdMap.size() << "elements";
0440     qDebug() << "Indian Railwaiys station code index:" << m_indianRailwaysMap.size() << "elements";
0441     qDebug() << "VR (Finland) station code index:" << m_vrfiMap.size() << "elements";
0442     qDebug() << "IATA location code index:" << m_iataMap.size() << "elements";
0443     qDebug() << "Amtrak station code index:" << m_amtrakMap.size() << "elements";
0444     qDebug() << "Via Rail station code index:" << m_viaRailMap.size() << "elements";
0445     qDebug() << "UK railway station code index:" << m_ukMap.size() << "elements";
0446     qDebug() << "Identifier collisions:" << m_idConflicts;
0447     qDebug() << "Identifier format violations:" << m_idFormatViolations;
0448     qDebug() << "Coordinate conflicts:" << m_coordinateConflicts;
0449     qDebug() << "Country ISO code conflicts: " << m_countryConflicts;
0450 }