File indexing completed on 2024-04-14 03:48:58

0001 // SPDX-License-Identifier: LGPL-2.1-or-later
0002 //
0003 // SPDX-FileCopyrightText: 2016 Dennis Nienhüser <nienhueser@kde.org>
0004 //
0005 
0006 #include "SpellChecker.h"
0007 #include "MarbleMath.h"
0008 #include "OsmPlacemarkData.h"
0009 #include "MarbleDirs.h"
0010 #include "MarbleModel.h"
0011 #include "ParsingRunnerManager.h"
0012 #include "GeoSceneMercatorTileProjection.h"
0013 #include "TileId.h"
0014 
0015 #include <QSet>
0016 #include <QDebug>
0017 #include <QFile>
0018 
0019 namespace Marble {
0020 
0021 SpellChecker::SpellChecker(const QString &citiesFile) :
0022     m_tileLevel(10),
0023     m_tileHash(parseCities(citiesFile)),
0024     m_verbose(false)
0025 {
0026     // nothing to do
0027 }
0028 
0029 void SpellChecker::correctPlaceLabels(const QVector<GeoDataPlacemark*> &placemarks)
0030 {
0031     auto places = cityPlaces(placemarks);
0032     double const maxDistance = 5000.0 / EARTH_RADIUS;
0033     int hits = 0;
0034     int validated = 0;
0035     int misses = 0;
0036     for (auto place: places) {
0037         auto const places = candidatesFor(place);
0038         bool hasMatch = false;
0039         bool isValid = false;
0040         QString const placeName = place->name();
0041         if (!places.isEmpty()) {
0042             auto match = places.first();
0043             if (match->name() == place->name()) {
0044                 ++validated;
0045                 isValid = true;
0046             } else {
0047                 if (match->coordinate().sphericalDistanceTo(place->coordinate()) < maxDistance) {
0048                     if (levenshteinDistance(places.first()->name(), placeName) < 6) {
0049                         if (m_verbose) {
0050                             qDebug() << "Correcting" << placeName << "to" << match->name();
0051                         }
0052                         place->setName(match->name());
0053                         place->osmData().removeTag("name");
0054                         place->osmData().addTag("name", match->name());
0055                         hasMatch = true;
0056                     }
0057                 }
0058 
0059                 if (m_verbose && !hasMatch) {
0060                     qDebug() << "No match for " << placeName << ", candidates: ";
0061                     for (auto candidate: places) {
0062                         qDebug() << candidate->coordinate().sphericalDistanceTo(place->coordinate()) * EARTH_RADIUS << " m, "
0063                                  << "levenshtein distance " << levenshteinDistance(placeName, candidate->name()) << ":" << candidate->name();
0064                     }
0065                 }
0066             }
0067         } else if (m_verbose) {
0068             qDebug() << "No match for " << placeName << " at " << place->coordinate().toString(GeoDataCoordinates::Decimal) << " and no candidates for replacement";
0069         }
0070         hits += hasMatch ? 1 : 0;
0071         misses += (hasMatch || isValid) ? 0 : 1;
0072     }
0073     if (m_verbose) {
0074         qDebug() << "In total there are " << hits << " corrections, " << validated << " validations and " << misses << " misses";
0075     }
0076 }
0077 
0078 void SpellChecker::setVerbose(bool verbose)
0079 {
0080     m_verbose = verbose;
0081 }
0082 
0083 QVector<GeoDataPlacemark *> SpellChecker::cityPlaces(const QVector<GeoDataPlacemark *> &placemarks) const
0084 {
0085     QSet<GeoDataPlacemark::GeoDataVisualCategory> categories;
0086     categories << GeoDataPlacemark::PlaceCity;
0087     categories << GeoDataPlacemark::PlaceCityCapital;
0088     categories << GeoDataPlacemark::PlaceCityNationalCapital;
0089     categories << GeoDataPlacemark::PlaceSuburb;
0090     categories << GeoDataPlacemark::PlaceHamlet;
0091     categories << GeoDataPlacemark::PlaceLocality;
0092     categories << GeoDataPlacemark::PlaceTown;
0093     categories << GeoDataPlacemark::PlaceTownCapital;
0094     categories << GeoDataPlacemark::PlaceTownNationalCapital;
0095     categories << GeoDataPlacemark::PlaceVillage;
0096     categories << GeoDataPlacemark::PlaceVillageCapital;
0097     categories << GeoDataPlacemark::PlaceVillageNationalCapital;
0098 
0099     QVector<GeoDataPlacemark*> places;
0100     std::copy_if(placemarks.begin(), placemarks.end(), std::back_inserter(places),
0101                  [categories] (GeoDataPlacemark* placemark) {
0102         return categories.contains(placemark->visualCategory()); });
0103     return places;
0104 }
0105 
0106 QHash<TileId, QVector<GeoDataPlacemark *> > SpellChecker::parseCities(const QString &filename) const
0107 {
0108     QHash<TileId, QVector<GeoDataPlacemark*> > placeLabels;
0109     QFile file(filename);
0110     if (!file.open(QIODevice::ReadOnly)) {
0111         qDebug() << "Cannot open " << filename << ":" << file.errorString();
0112         return placeLabels;
0113     }
0114 
0115     while (!file.atEnd()) {
0116         QByteArray line = file.readLine();
0117         auto const values = line.split('\t');
0118         if (values.size() > 15) {
0119 
0120             GeoDataPlacemark* city = new GeoDataPlacemark;
0121             city->setName(values[1]);
0122             bool ok;
0123             double const lon = values[5].toDouble(&ok);
0124             if (!ok) {
0125                 qDebug() << values[5] << " is no longitude";
0126                 continue;
0127             }
0128             double const lat = values[4].toDouble(&ok);
0129             if (!ok) {
0130                 qDebug() << values[4] << " is no latitude";
0131                 continue;
0132             }
0133             double const ele = values[15].toDouble();
0134             auto const coordinate = GeoDataCoordinates(lon, lat, ele, GeoDataCoordinates::Degree);
0135             city->setCoordinate(coordinate);
0136 
0137             auto const tile = TileId::fromCoordinates(coordinate, m_tileLevel);
0138             placeLabels[tile] << city;
0139         }
0140     }
0141     return placeLabels;
0142 }
0143 
0144 int SpellChecker::levenshteinDistance(const QString &a, const QString &b)
0145 {
0146     // From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
0147     unsigned int const len1 = a.size(), len2 = b.size();
0148     std::vector<std::vector<unsigned int>> distance(len1 + 1, std::vector<unsigned int>(len2 + 1));
0149 
0150     distance[0][0] = 0;
0151     for(unsigned int i = 1; i <= len1; ++i) {
0152         distance[i][0] = i;
0153     }
0154     for(unsigned int i = 1; i <= len2; ++i) {
0155         distance[0][i] = i;
0156     }
0157 
0158     for(unsigned int i = 1; i <= len1; ++i) {
0159         for(unsigned int j = 1; j <= len2; ++j) {
0160             distance[i][j] = std::min({ distance[i - 1][j] + 1, distance[i][j - 1] + 1, distance[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1) });
0161         }
0162     }
0163     return distance[len1][len2];
0164 }
0165 
0166 QVector<GeoDataPlacemark *> SpellChecker::candidatesFor(GeoDataPlacemark *placemark) const
0167 {
0168     int const N = pow(2, m_tileLevel);
0169     auto const tile = TileId::fromCoordinates(placemark->coordinate(), m_tileLevel);
0170     QVector<GeoDataPlacemark *> places;
0171     for (int x=qMax(0, tile.x()-1); x<qMin(N-1, tile.x()+1); ++x) {
0172         for (int y=qMax(0, tile.y()-1); y<qMin(N-1, tile.y()+1); ++y) {
0173             places << m_tileHash[TileId(0, m_tileLevel, x, y)];
0174         }
0175     }
0176     QString const placeName = placemark->name();
0177     std::sort(places.begin(), places.end(),
0178               [placeName] (GeoDataPlacemark* a, GeoDataPlacemark* b) {
0179         return levenshteinDistance(a->name(), placeName) < levenshteinDistance(b->name(), placeName);
0180     });
0181     return places;
0182 }
0183 
0184 }