File indexing completed on 2025-01-26 06:47:13
0001 // SPDX-License-Identifier: LGPL-2.1-or-later 0002 // 0003 // SPDX-FileCopyrightText: 2016 Dennis Nienhüser <nienhueser@kde.org> 0004 // 0005 0006 #include "SpellChecker.h" 0007 #include "MarbleMath.h" 0008 #include "OsmPlacemarkData.h" 0009 #include "MarbleDirs.h" 0010 #include "MarbleModel.h" 0011 #include "ParsingRunnerManager.h" 0012 #include "GeoSceneMercatorTileProjection.h" 0013 #include "TileId.h" 0014 0015 #include <QSet> 0016 #include <QDebug> 0017 #include <QFile> 0018 0019 namespace Marble { 0020 0021 SpellChecker::SpellChecker(const QString &citiesFile) : 0022 m_tileLevel(10), 0023 m_tileHash(parseCities(citiesFile)), 0024 m_verbose(false) 0025 { 0026 // nothing to do 0027 } 0028 0029 void SpellChecker::correctPlaceLabels(const QVector<GeoDataPlacemark*> &placemarks) 0030 { 0031 auto places = cityPlaces(placemarks); 0032 double const maxDistance = 5000.0 / EARTH_RADIUS; 0033 int hits = 0; 0034 int validated = 0; 0035 int misses = 0; 0036 for (auto place: places) { 0037 auto const places = candidatesFor(place); 0038 bool hasMatch = false; 0039 bool isValid = false; 0040 QString const placeName = place->name(); 0041 if (!places.isEmpty()) { 0042 auto match = places.first(); 0043 if (match->name() == place->name()) { 0044 ++validated; 0045 isValid = true; 0046 } else { 0047 if (match->coordinate().sphericalDistanceTo(place->coordinate()) < maxDistance) { 0048 if (levenshteinDistance(places.first()->name(), placeName) < 6) { 0049 if (m_verbose) { 0050 qDebug() << "Correcting" << placeName << "to" << match->name(); 0051 } 0052 place->setName(match->name()); 0053 place->osmData().removeTag("name"); 0054 place->osmData().addTag("name", match->name()); 0055 hasMatch = true; 0056 } 0057 } 0058 0059 if (m_verbose && !hasMatch) { 0060 qDebug() << "No match for " << placeName << ", candidates: "; 0061 for (auto candidate: places) { 0062 qDebug() << candidate->coordinate().sphericalDistanceTo(place->coordinate()) * EARTH_RADIUS << " m, " 0063 << "levenshtein distance " << levenshteinDistance(placeName, candidate->name()) << ":" << candidate->name(); 0064 } 0065 } 0066 } 0067 } else if (m_verbose) { 0068 qDebug() << "No match for " << placeName << " at " << place->coordinate().toString(GeoDataCoordinates::Decimal) << " and no candidates for replacement"; 0069 } 0070 hits += hasMatch ? 1 : 0; 0071 misses += (hasMatch || isValid) ? 0 : 1; 0072 } 0073 if (m_verbose) { 0074 qDebug() << "In total there are " << hits << " corrections, " << validated << " validations and " << misses << " misses"; 0075 } 0076 } 0077 0078 void SpellChecker::setVerbose(bool verbose) 0079 { 0080 m_verbose = verbose; 0081 } 0082 0083 QVector<GeoDataPlacemark *> SpellChecker::cityPlaces(const QVector<GeoDataPlacemark *> &placemarks) const 0084 { 0085 QSet<GeoDataPlacemark::GeoDataVisualCategory> categories; 0086 categories << GeoDataPlacemark::PlaceCity; 0087 categories << GeoDataPlacemark::PlaceCityCapital; 0088 categories << GeoDataPlacemark::PlaceCityNationalCapital; 0089 categories << GeoDataPlacemark::PlaceSuburb; 0090 categories << GeoDataPlacemark::PlaceHamlet; 0091 categories << GeoDataPlacemark::PlaceLocality; 0092 categories << GeoDataPlacemark::PlaceTown; 0093 categories << GeoDataPlacemark::PlaceTownCapital; 0094 categories << GeoDataPlacemark::PlaceTownNationalCapital; 0095 categories << GeoDataPlacemark::PlaceVillage; 0096 categories << GeoDataPlacemark::PlaceVillageCapital; 0097 categories << GeoDataPlacemark::PlaceVillageNationalCapital; 0098 0099 QVector<GeoDataPlacemark*> places; 0100 std::copy_if(placemarks.begin(), placemarks.end(), std::back_inserter(places), 0101 [categories] (GeoDataPlacemark* placemark) { 0102 return categories.contains(placemark->visualCategory()); }); 0103 return places; 0104 } 0105 0106 QHash<TileId, QVector<GeoDataPlacemark *> > SpellChecker::parseCities(const QString &filename) const 0107 { 0108 QHash<TileId, QVector<GeoDataPlacemark*> > placeLabels; 0109 QFile file(filename); 0110 if (!file.open(QIODevice::ReadOnly)) { 0111 qDebug() << "Cannot open " << filename << ":" << file.errorString(); 0112 return placeLabels; 0113 } 0114 0115 while (!file.atEnd()) { 0116 QByteArray line = file.readLine(); 0117 auto const values = line.split('\t'); 0118 if (values.size() > 15) { 0119 0120 GeoDataPlacemark* city = new GeoDataPlacemark; 0121 city->setName(values[1]); 0122 bool ok; 0123 double const lon = values[5].toDouble(&ok); 0124 if (!ok) { 0125 qDebug() << values[5] << " is no longitude"; 0126 continue; 0127 } 0128 double const lat = values[4].toDouble(&ok); 0129 if (!ok) { 0130 qDebug() << values[4] << " is no latitude"; 0131 continue; 0132 } 0133 double const ele = values[15].toDouble(); 0134 auto const coordinate = GeoDataCoordinates(lon, lat, ele, GeoDataCoordinates::Degree); 0135 city->setCoordinate(coordinate); 0136 0137 auto const tile = TileId::fromCoordinates(coordinate, m_tileLevel); 0138 placeLabels[tile] << city; 0139 } 0140 } 0141 return placeLabels; 0142 } 0143 0144 int SpellChecker::levenshteinDistance(const QString &a, const QString &b) 0145 { 0146 // From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance 0147 unsigned int const len1 = a.size(), len2 = b.size(); 0148 std::vector<std::vector<unsigned int>> distance(len1 + 1, std::vector<unsigned int>(len2 + 1)); 0149 0150 distance[0][0] = 0; 0151 for(unsigned int i = 1; i <= len1; ++i) { 0152 distance[i][0] = i; 0153 } 0154 for(unsigned int i = 1; i <= len2; ++i) { 0155 distance[0][i] = i; 0156 } 0157 0158 for(unsigned int i = 1; i <= len1; ++i) { 0159 for(unsigned int j = 1; j <= len2; ++j) { 0160 distance[i][j] = std::min({ distance[i - 1][j] + 1, distance[i][j - 1] + 1, distance[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1) }); 0161 } 0162 } 0163 return distance[len1][len2]; 0164 } 0165 0166 QVector<GeoDataPlacemark *> SpellChecker::candidatesFor(GeoDataPlacemark *placemark) const 0167 { 0168 int const N = pow(2, m_tileLevel); 0169 auto const tile = TileId::fromCoordinates(placemark->coordinate(), m_tileLevel); 0170 QVector<GeoDataPlacemark *> places; 0171 for (int x=qMax(0, tile.x()-1); x<qMin(N-1, tile.x()+1); ++x) { 0172 for (int y=qMax(0, tile.y()-1); y<qMin(N-1, tile.y()+1); ++y) { 0173 places << m_tileHash[TileId(0, m_tileLevel, x, y)]; 0174 } 0175 } 0176 QString const placeName = placemark->name(); 0177 std::sort(places.begin(), places.end(), 0178 [placeName] (GeoDataPlacemark* a, GeoDataPlacemark* b) { 0179 return levenshteinDistance(a->name(), placeName) < levenshteinDistance(b->name(), placeName); 0180 }); 0181 return places; 0182 } 0183 0184 }