File indexing completed on 2024-11-24 04:45:06
0001 /* 0002 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "trainstationdbgenerator.h" 0008 #include "codegen.h" 0009 #include "util.h" 0010 #include "wikidata.h" 0011 0012 #include <QDebug> 0013 #include <QIODevice> 0014 #include <QJsonArray> 0015 #include <QJsonObject> 0016 0017 using namespace KItinerary::Generator; 0018 0019 namespace KItinerary { 0020 namespace Generator { 0021 0022 static bool operator<(const TrainStationDbGenerator::Station &lhs, const TrainStationDbGenerator::Station &rhs) 0023 { 0024 return lhs.uri < rhs.uri; 0025 } 0026 static bool operator<(const TrainStationDbGenerator::Station &lhs, const QUrl &rhs) 0027 { 0028 return lhs.uri < rhs; 0029 } 0030 0031 } 0032 } 0033 0034 bool TrainStationDbGenerator::generate(QIODevice *out) 0035 { 0036 // retrieve content from Wikidata 0037 if (!fetch("P954", "ibnr", m_ibnrMap) 0038 || !fetch("P722", "uic", m_uicMap) 0039 || !fetch("P8181", "sncf", m_sncfIdMap) 0040 || !fetch("P8448", "benerail", m_benerailIdMap) 0041 || !fetch("P238", "iata", m_iataMap) 0042 || !fetch("P4803", "amtrak", m_amtrakMap) 0043 || !fetch("P10653", "viarail", m_viaRailMap) 0044 || !fetch("P4755", "uk", m_ukMap) 0045 || !fetchIndianRailwaysStationCode() 0046 || !fetchFinishStationCodes() 0047 ) { 0048 return false; 0049 } 0050 if (!fetchCountryInformation()) { 0051 return false; 0052 } 0053 0054 // filtering out stations without useful information 0055 processStations(); 0056 0057 // code generation 0058 CodeGen::writeLicenseHeaderWikidata(out); 0059 out->write(R"( 0060 #include "knowledgedb.h" 0061 #include "trainstationdb.h" 0062 0063 namespace KItinerary { 0064 namespace KnowledgeDb { 0065 )"); 0066 writeStationData(out); 0067 writeIdMap(out, m_ibnrMap, "ibnr", "IBNR"); 0068 writeIdMap(out, m_uicMap, "uic", "UICStation"); 0069 writeIdMap(out, m_sncfIdMap, "sncfStationId", "SncfStationId"); 0070 writeIdMap(out, m_benerailIdMap, "benerail", "BenerailStationId"); 0071 writeIdMap(out, m_iataMap, "iata", "IataCode"); 0072 writeIdMap(out, m_amtrakMap, "amtrak", "AmtrakStationCode"); 0073 writeIdMap(out, m_viaRailMap, "viarail", "ViaRailStationCode"); 0074 writeIdMap(out, m_ukMap, "uk", "UKRailwayStationCode"); 0075 writeIndianRailwaysMap(out); 0076 writeVRMap(out); 0077 out->write(R"( 0078 } 0079 } 0080 )"); 0081 0082 printSummary(); 0083 return true; 0084 } 0085 0086 template<typename Id> 0087 bool TrainStationDbGenerator::fetch(const char *prop, const char *name, std::map<Id, QUrl> &idMap) 0088 { 0089 const auto stationArray = 0090 WikiData::query(QLatin1StringView(R"( 0091 SELECT DISTINCT ?station ?stationLabel ?id ?coord ?replacedBy ?dateOfOfficialClosure WHERE { 0092 ?station (wdt:P31/wdt:P279*) wd:Q55488. 0093 ?station wdt:)") + 0094 QString::fromUtf8(prop) + QLatin1StringView(R"( ?id. 0095 OPTIONAL { ?station wdt:P625 ?coord. } 0096 OPTIONAL { ?station wdt:P1366 ?replacedBy. } 0097 OPTIONAL { ?station wdt:P3999 ?dateOfOfficialClosure. } 0098 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 0099 } ORDER BY (?station))"), 0100 QLatin1StringView("wikidata_trainstation_") + 0101 QString::fromUtf8(name) + QLatin1String(".json")); 0102 if (stationArray.isEmpty()) { 0103 qWarning() << "Empty query result!"; 0104 return false; 0105 } 0106 0107 for (const auto &stationData : stationArray) { 0108 const auto stationObj = stationData.toObject(); 0109 if (stationObj.contains(QLatin1StringView("replacedBy")) || 0110 stationObj.contains(QLatin1String("dateOfOfficialClosure"))) { 0111 continue; 0112 } 0113 0114 const auto uri = insertOrMerge(stationObj); 0115 const auto idStr = stationObj.value(QLatin1StringView("id")) 0116 .toObject() 0117 .value(QLatin1String("value")) 0118 .toString() 0119 .toUpper(); 0120 const auto id = Id(idStr); 0121 if (!id.isValid()) { 0122 ++m_idFormatViolations; 0123 qWarning() << name << "format violation" << idStr << uri; 0124 continue; 0125 } 0126 0127 const auto it = idMap.find(id); 0128 if (it != idMap.end() && (*it).second != uri) { 0129 ++m_idConflicts; 0130 qWarning() << "Conflict on" << name << idStr << uri << idMap[id]; 0131 } else { 0132 idMap[id] = uri; 0133 } 0134 } 0135 0136 return true; 0137 } 0138 0139 bool TrainStationDbGenerator::fetchIndianRailwaysStationCode() 0140 { 0141 const auto stationArray = WikiData::query(R"( 0142 SELECT DISTINCT ?station ?stationLabel ?irId ?coord WHERE { 0143 ?station (wdt:P31/wdt:P279*) wd:Q55488. 0144 ?station wdt:P5696 ?irId. 0145 OPTIONAL { ?station wdt:P625 ?coord. } 0146 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 0147 } ORDER BY (?station))", "wikidata_trainstation_indian_railways.json"); 0148 if (stationArray.isEmpty()) { 0149 qWarning() << "Empty query result!"; 0150 return false; 0151 } 0152 0153 for (const auto &stationData : stationArray) { 0154 const auto stationObj = stationData.toObject(); 0155 const auto uri = insertOrMerge(stationObj); 0156 0157 const auto id = stationObj.value(QLatin1StringView("irId")) 0158 .toObject() 0159 .value(QLatin1String("value")) 0160 .toString() 0161 .toUpper(); 0162 const auto it = m_indianRailwaysMap.find(id); 0163 if (it != m_indianRailwaysMap.end() && (*it).second != uri) { 0164 ++m_idConflicts; 0165 qWarning() << "Conflict on Indian Railwaiys station code" << id << uri << m_indianRailwaysMap[id]; 0166 } else { 0167 m_indianRailwaysMap[id] = uri; 0168 } 0169 } 0170 0171 return true; 0172 } 0173 0174 bool TrainStationDbGenerator::fetchFinishStationCodes() 0175 { 0176 const auto stationArray = WikiData::query(R"( 0177 SELECT DISTINCT ?station ?stationLabel ?code ?coord ?ref WHERE { 0178 ?station (wdt:P31/wdt:P279*) wd:Q55488. 0179 ?station p:P296 ?codeStmt. 0180 ?codeStmt ps:P296 ?code. 0181 ?codeStmt prov:wasDerivedFrom ?refnode. 0182 ?refnode pr:P854 ?ref. 0183 OPTIONAL { ?station wdt:P625 ?coord. } 0184 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 0185 } ORDER BY (?station))", "wikidata_trainstation_vrfi.json"); 0186 if (stationArray.isEmpty()) { 0187 qWarning() << "Empty query result!"; 0188 return false; 0189 } 0190 0191 for (const auto &stationData : stationArray) { 0192 const auto stationObj = stationData.toObject(); 0193 const auto ref = stationObj.value(QLatin1StringView("ref")) 0194 .toObject() 0195 .value(QLatin1String("value")) 0196 .toString(); 0197 if (!ref.contains(QLatin1StringView("rata.digitraffic.fi"), 0198 Qt::CaseInsensitive)) { 0199 continue; 0200 } 0201 const auto uri = insertOrMerge(stationObj); 0202 0203 // TODO this filters 'Ä' and 'Ö' too, which seem to occur in a few cases? 0204 const auto idStr = stationObj.value(QLatin1StringView("code")) 0205 .toObject() 0206 .value(QLatin1String("value")) 0207 .toString() 0208 .toUpper(); 0209 const auto id = KnowledgeDb::VRStationCode(idStr); 0210 if (!id.isValid()) { 0211 ++m_idFormatViolations; 0212 qWarning() << "VR (Finland) station id format violation" << idStr << uri; 0213 continue; 0214 } 0215 0216 const auto it = m_vrfiMap.find(id); 0217 if (it != m_vrfiMap.end() && (*it).second != uri) { 0218 ++m_idConflicts; 0219 qWarning() << "Conflict on VR (Finland) station code" << idStr << uri << m_vrfiMap[id]; 0220 } else { 0221 m_vrfiMap[id] = uri; 0222 } 0223 } 0224 0225 return true; 0226 } 0227 0228 bool TrainStationDbGenerator::fetchCountryInformation() 0229 { 0230 const auto stationArray = WikiData::query(R"( 0231 SELECT DISTINCT ?station ?isoCode WHERE { 0232 ?station (wdt:P31/wdt:P279*) wd:Q55488. 0233 ?station wdt:P17 ?country. 0234 ?country p:P297 [ ps:P297 ?isoCode ]. 0235 } ORDER BY (?station))", "wikidata_trainstation_country.json"); 0236 if (stationArray.isEmpty()) { 0237 qWarning() << "Empty query result!"; 0238 return false; 0239 } 0240 0241 for (const auto &stationData : stationArray) { 0242 const auto uri = insertOrMerge(stationData.toObject(), true); 0243 Q_UNUSED(uri) 0244 } 0245 0246 return true; 0247 } 0248 0249 QUrl TrainStationDbGenerator::insertOrMerge(const QJsonObject &obj, bool mergeOnly) 0250 { 0251 if (obj.isEmpty()) { 0252 return {}; 0253 } 0254 0255 Station s; 0256 s.uri = QUrl(obj.value(QLatin1StringView("station")) 0257 .toObject() 0258 .value(QLatin1String("value")) 0259 .toString()); 0260 s.name = obj.value(QLatin1StringView("stationLabel")) 0261 .toObject() 0262 .value(QLatin1String("value")) 0263 .toString(); 0264 s.coord = WikiData::parseCoordinate(obj.value(QLatin1StringView("coord")) 0265 .toObject() 0266 .value(QLatin1String("value")) 0267 .toString()); 0268 s.isoCode = obj.value(QLatin1StringView("isoCode")) 0269 .toObject() 0270 .value(QLatin1String("value")) 0271 .toString(); 0272 0273 const auto it = std::lower_bound(m_stations.begin(), m_stations.end(), s); 0274 if (it != m_stations.end() && (*it).uri == s.uri) { 0275 if ((*it).name.isEmpty()) { 0276 (*it).name = s.name; 0277 } 0278 // check for coordinate conflicts 0279 if (s.coord.isValid() && (*it).coord.isValid()) { 0280 if (std::abs(s.coord.latitude - (*it).coord.latitude) > 0.2f || std::abs(s.coord.longitude - (*it).coord.longitude) > 0.2f) { 0281 ++m_coordinateConflicts; 0282 qWarning() << s.uri << "has multiple conflicting coordinates"; 0283 } 0284 // pick always the same independent of the input order, so stabilize generated output 0285 (*it).coord.latitude = std::min((*it).coord.latitude, s.coord.latitude); 0286 (*it).coord.longitude = std::min((*it).coord.longitude, s.coord.longitude); 0287 } 0288 if ((*it).isoCode != s.isoCode && !s.isoCode.isEmpty()) { 0289 if (!(*it).isoCode.isEmpty()) { 0290 ++m_countryConflicts; 0291 qWarning() << s.uri << (*it).name << "has multiple country codes"; 0292 } else { 0293 (*it).isoCode = s.isoCode; 0294 } 0295 } 0296 0297 return s.uri; 0298 } 0299 0300 if (!mergeOnly) { 0301 m_stations.insert(it, s); 0302 } 0303 return s.uri; 0304 } 0305 0306 void TrainStationDbGenerator::processStations() 0307 { 0308 for (auto it = m_stations.begin(); it != m_stations.end();) { 0309 if (!(*it).coord.isValid()) { 0310 qDebug() << "Station has no geo coordinates:" << (*it).name << (*it).uri; 0311 } 0312 0313 if (!(*it).coord.isValid() && (*it).isoCode.isEmpty()) { // no useful information 0314 it = m_stations.erase(it); 0315 } else { 0316 ++it; 0317 } 0318 } 0319 } 0320 0321 void TrainStationDbGenerator::writeStationData(QIODevice *out) 0322 { 0323 out->write("static const TrainStation trainstation_table[] = {\n"); 0324 for (const auto &station : m_stations) { 0325 out->write(" {"); 0326 CodeGen::writeCoordinate(out, station.coord); 0327 out->write(", "); 0328 CodeGen::writeCountryIsoCode(out, station.isoCode); 0329 out->write("}, // "); 0330 out->write(station.name.toUtf8()); 0331 out->write("\n"); 0332 } 0333 out->write("};\n\n"); 0334 } 0335 0336 template<typename Id> 0337 void TrainStationDbGenerator::writeIdMap(QIODevice *out, const std::map<Id, QUrl> &idMap, const char *tabName, const char *typeName) const 0338 { 0339 out->write("static constexpr const TrainStationIdIndex<"); 0340 out->write(typeName); 0341 out->write("> "); 0342 out->write(tabName); 0343 out->write("_table[] = {\n"); 0344 for (const auto &it : idMap) { 0345 const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second); 0346 if (station == m_stations.end() || (*station).uri != it.second) { 0347 continue; 0348 } 0349 out->write(" { "); 0350 out->write(typeName); 0351 out->write("{"); 0352 out->write(encodeId(it.first)); 0353 out->write("}, TrainStationIndex{"); 0354 out->write(QByteArray::number((int)std::distance(m_stations.begin(), station))); 0355 out->write("} }, // "); 0356 out->write((*station).name.toUtf8()); 0357 out->write("\n"); 0358 } 0359 out->write("};\n\n"); 0360 } 0361 0362 void TrainStationDbGenerator::writeIndianRailwaysMap(QIODevice *out) 0363 { 0364 // variable length identifiers, so we need a string table 0365 std::vector<uint16_t> offsets; 0366 offsets.reserve(m_indianRailwaysMap.size()); 0367 uint16_t offset = 0; 0368 0369 out->write("static constexpr const char indianRailwaysSationCode_stringtable[] =\n"); 0370 for (const auto &it : m_indianRailwaysMap) { 0371 const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second); 0372 if (station == m_stations.end() || (*station).uri != it.second) { 0373 continue; 0374 } 0375 0376 offsets.push_back(offset); 0377 out->write(" \""); 0378 out->write(it.first.toUtf8()); 0379 out->write("\\0\" // "); 0380 out->write((*station).name.toUtf8()); 0381 out->write("\n"); 0382 0383 offset += it.first.toUtf8().size() + 1; // +1 for the terminating null byte 0384 } 0385 out->write(";\n\n"); 0386 0387 out->write( 0388 R"(static constexpr const struct { 0389 uint16_t offset; 0390 TrainStationIndex stationIndex; 0391 } indianRailwaysSationCode_index[] = { 0392 )"); 0393 int offsetIdx = 0; 0394 for (const auto &it : m_indianRailwaysMap) { 0395 const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second); 0396 if (station == m_stations.end() || (*station).uri != it.second) { 0397 continue; 0398 } 0399 0400 out->write(" { "); 0401 out->write(QByteArray::number(offsets[offsetIdx++])); 0402 out->write(", TrainStationIndex{"); 0403 out->write(QByteArray::number((int)std::distance(m_stations.begin(), station))); 0404 out->write("} }, // "); 0405 out->write(it.first.toUtf8()); 0406 out->write("\n"); 0407 } 0408 out->write("};\n\n"); 0409 } 0410 0411 void TrainStationDbGenerator::writeVRMap(QIODevice *out) 0412 { 0413 out->write("static constexpr const TrainStationIdIndex<VRStationCode> vrfiConnexionsId_table[] = {\n"); 0414 for (const auto &it : m_vrfiMap) { 0415 const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second); 0416 if (station == m_stations.end() || (*station).uri != it.second) { 0417 continue; 0418 } 0419 out->write(" { VRStationCode{\""); 0420 out->write(it.first.toString().toUtf8()); 0421 for (int i = 0; i < 4 - it.first.toString().toUtf8().size(); ++i) { 0422 out->write("\\0"); 0423 } 0424 out->write("\"}, TrainStationIndex{"); 0425 out->write(QByteArray::number((int)std::distance(m_stations.begin(), station))); 0426 out->write("} }, // "); 0427 out->write((*station).name.toUtf8()); 0428 out->write("\n"); 0429 } 0430 out->write("};\n\n"); 0431 } 0432 0433 void TrainStationDbGenerator::printSummary() 0434 { 0435 qDebug() << "Generated database containing" << m_stations.size() << "train stations"; 0436 qDebug() << "IBNR index:" << m_ibnrMap.size() << "elements"; 0437 qDebug() << "UIC index:" << m_uicMap.size() << "elements"; 0438 qDebug() << "SNCF station code index:" << m_sncfIdMap.size() << "elements"; 0439 qDebug() << "Benerail station code index:" << m_benerailIdMap.size() << "elements"; 0440 qDebug() << "Indian Railwaiys station code index:" << m_indianRailwaysMap.size() << "elements"; 0441 qDebug() << "VR (Finland) station code index:" << m_vrfiMap.size() << "elements"; 0442 qDebug() << "IATA location code index:" << m_iataMap.size() << "elements"; 0443 qDebug() << "Amtrak station code index:" << m_amtrakMap.size() << "elements"; 0444 qDebug() << "Via Rail station code index:" << m_viaRailMap.size() << "elements"; 0445 qDebug() << "UK railway station code index:" << m_ukMap.size() << "elements"; 0446 qDebug() << "Identifier collisions:" << m_idConflicts; 0447 qDebug() << "Identifier format violations:" << m_idFormatViolations; 0448 qDebug() << "Coordinate conflicts:" << m_coordinateConflicts; 0449 qDebug() << "Country ISO code conflicts: " << m_countryConflicts; 0450 }