File indexing completed on 2024-12-29 04:49:58

0001 /*
0002    SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
0003 
0004    SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "extractorfilter.h"
0008 #include "extractordocumentnode.h"
0009 #include "extractordocumentprocessor.h"
0010 #include "extractorresult.h"
0011 #include "logging.h"
0012 
0013 #include <QJsonObject>
0014 #include <QJSValue>
0015 #include <QMetaEnum>
0016 #include <QRegularExpression>
0017 
0018 using namespace KItinerary;
0019 
0020 namespace KItinerary {
0021 class ExtractorFilterPrivate : public QSharedData
0022 {
0023 public:
0024     QString m_mimeType;
0025     QString m_fieldName;
0026     QRegularExpression m_exp;
0027     ExtractorFilter::Scope m_scope = ExtractorFilter::Current;
0028 };
0029 }
0030 
0031 ExtractorFilter::ExtractorFilter()
0032     : d(new ExtractorFilterPrivate)
0033 {
0034 }
0035 
0036 ExtractorFilter::ExtractorFilter(const ExtractorFilter&) = default;
0037 ExtractorFilter::ExtractorFilter(ExtractorFilter&&) noexcept = default;
0038 ExtractorFilter::~ExtractorFilter() = default;
0039 ExtractorFilter& ExtractorFilter::operator=(const ExtractorFilter&) = default;
0040 ExtractorFilter& ExtractorFilter::operator=(ExtractorFilter&&) = default;
0041 
0042 QString ExtractorFilter::mimeType() const
0043 {
0044     return d->m_mimeType;
0045 }
0046 
0047 void ExtractorFilter::setMimeType(const QString &mimeType)
0048 {
0049     d.detach();
0050     d->m_mimeType = mimeType;
0051 }
0052 
0053 QString ExtractorFilter::fieldName() const
0054 {
0055     return d->m_fieldName;
0056 }
0057 
0058 void ExtractorFilter::setFieldName(const QString &fieldName)
0059 {
0060     d.detach();
0061     d->m_fieldName = fieldName;
0062 }
0063 
0064 bool ExtractorFilter::matches(const QString &data) const
0065 {
0066     if (!d->m_exp.isValid()) {
0067         qCDebug(Log) << d->m_exp.errorString() << d->m_exp.pattern();
0068     }
0069     return d->m_exp.match(data).hasMatch();
0070 }
0071 
0072 static bool needsFieldName(const QString &mimeType)
0073 {
0074   return mimeType != QLatin1StringView("text/plain") &&
0075          mimeType != QLatin1String("application/octet-stream");
0076 }
0077 
0078 template <typename T>
0079 static T readEnum(const QJsonValue &v, T defaultValue = {})
0080 {
0081     if (!v.isString()) {
0082         return defaultValue;
0083     }
0084 
0085     const auto me = QMetaEnum::fromType<T>();
0086     bool success = false;
0087     const auto result = static_cast<T>(me.keyToValue(v.toString().toUtf8().constData(), &success));
0088     return success ? result : defaultValue;
0089 }
0090 
0091 bool ExtractorFilter::load(const QJsonObject &obj)
0092 {
0093     d.detach();
0094     d->m_mimeType = obj.value(QLatin1StringView("mimeType")).toString();
0095     if (d->m_mimeType.isEmpty()) {
0096         qCDebug(Log) << "unspecified filter MIME type";
0097     }
0098     d->m_fieldName = obj.value(QLatin1StringView("field")).toString();
0099     d->m_exp.setPattern(obj.value(QLatin1StringView("match")).toString());
0100     d->m_scope = readEnum<ExtractorFilter::Scope>(
0101         obj.value(QLatin1StringView("scope")), ExtractorFilter::Current);
0102     return !d->m_mimeType.isEmpty() && (!d->m_fieldName.isEmpty() || !needsFieldName(d->m_mimeType)) && d->m_exp.isValid();
0103 }
0104 
0105 QJsonObject ExtractorFilter::toJson() const
0106 {
0107     QJsonObject obj;
0108     obj.insert(QLatin1StringView("mimeType"), d->m_mimeType);
0109     if (needsFieldName(d->m_mimeType)) {
0110       obj.insert(QLatin1StringView("field"), d->m_fieldName);
0111     }
0112     obj.insert(QLatin1StringView("match"), pattern());
0113     obj.insert(
0114         QLatin1StringView("scope"),
0115         QLatin1String(QMetaEnum::fromType<ExtractorFilter::Scope>().valueToKey(
0116             d->m_scope)));
0117     return obj;
0118 }
0119 
0120 QString ExtractorFilter::pattern() const
0121 {
0122     return d->m_exp.pattern();
0123 }
0124 
0125 void ExtractorFilter::setPattern(const QString &pattern)
0126 {
0127     d.detach();
0128     d->m_exp.setPattern(pattern);
0129 }
0130 
0131 ExtractorFilter::Scope ExtractorFilter::scope() const
0132 {
0133     return d->m_scope;
0134 }
0135 
0136 void ExtractorFilter::setScope(Scope scope)
0137 {
0138     d.detach();
0139     d->m_scope = scope;
0140 }
0141 
0142 static QString valueForJsonPath(const QJsonObject &obj, const QString &path)
0143 {
0144     const auto pathSections = QStringView(path).split(QLatin1Char('.'));
0145     QJsonValue v(obj);
0146     for (const auto &pathSection : pathSections) {
0147         if (!v.isObject()) {
0148             return {};
0149         }
0150         v = v.toObject().value(pathSection.toString());
0151     }
0152     return v.toString();
0153 }
0154 
0155 enum MatchMode { Any, All };
0156 
0157 static bool filterMachesNode(const ExtractorFilter &filter, ExtractorFilter::Scope scope, const ExtractorDocumentNode &node,
0158                              std::vector<ExtractorDocumentNode> &matches, MatchMode matchMode)
0159 {
0160     if (node.isNull()) {
0161         return false;
0162     }
0163 
0164     // filter without field/pattern always match, if the mimetype does
0165     if (filter.mimeType() == node.mimeType() && ((filter.fieldName().isEmpty() && filter.pattern().isEmpty()) || node.processor()->matches(filter, node))) {
0166         if (matchMode == All) {
0167             matches.push_back(node);
0168         }
0169         return true;
0170     }
0171 
0172     if (scope != ExtractorFilter::Ancestors &&
0173         filter.mimeType() == QLatin1StringView("application/ld+json") &&
0174         !node.result().isEmpty()) {
0175       // when collecting all matches for results, we only want the "leaf-most"
0176       // ones, not those along the path
0177       if (matchMode == All && scope == ExtractorFilter::Descendants) {
0178         bool descendantsMatched = false;
0179         for (const auto &child : node.childNodes()) {
0180           descendantsMatched |= filterMachesNode(
0181               filter, ExtractorFilter::Descendants, child, matches, matchMode);
0182         }
0183         if (descendantsMatched) {
0184           return true;
0185         }
0186       }
0187 
0188       const auto res = node.result().jsonLdResult();
0189       for (const auto &elem : res) {
0190         const auto property =
0191             valueForJsonPath(elem.toObject(), filter.fieldName());
0192         if (filter.matches(property)) {
0193           if (matchMode == All) {
0194             matches.push_back(node);
0195           } else {
0196             return true;
0197           }
0198         }
0199       }
0200     }
0201 
0202     if (scope == ExtractorFilter::Ancestors) {
0203         return filterMachesNode(filter, scope, node.parent(), matches, matchMode);
0204     }
0205     if (scope == ExtractorFilter::Descendants) {
0206         for (const auto &child : node.childNodes()) {
0207             const auto m = filterMachesNode(filter, ExtractorFilter::Descendants, child, matches, matchMode);
0208             if (m && matchMode == Any) {
0209                 return true;
0210             }
0211         }
0212     }
0213 
0214     return !matches.empty();
0215 }
0216 
0217 bool ExtractorFilter::matches(const ExtractorDocumentNode &node) const
0218 {
0219     std::vector<ExtractorDocumentNode> matches;
0220     switch (d->m_scope) {
0221         case ExtractorFilter::Current:
0222             return filterMachesNode(*this, ExtractorFilter::Current, node, matches, Any);
0223         case ExtractorFilter::Parent:
0224             return filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, Any);
0225         case ExtractorFilter::Ancestors:
0226             return filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, Any);
0227         case ExtractorFilter::Children:
0228         case ExtractorFilter::Descendants:
0229             for (const auto &child : node.childNodes()) {
0230                 if (filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, Any)) {
0231                     return true;
0232                 }
0233             }
0234     }
0235     return false;
0236 }
0237 
0238 void ExtractorFilter::allMatches(const ExtractorDocumentNode &node, std::vector<ExtractorDocumentNode>& matches) const
0239 {
0240     switch (d->m_scope) {
0241         case ExtractorFilter::Current:
0242             filterMachesNode(*this, ExtractorFilter::Current, node, matches, All);
0243             return;
0244         case ExtractorFilter::Parent:
0245             filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, All);
0246             return;
0247         case ExtractorFilter::Ancestors:
0248             filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, All);
0249             return;
0250         case ExtractorFilter::Children:
0251         case ExtractorFilter::Descendants:
0252             for (const auto &child : node.childNodes()) {
0253                 filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, All);
0254             }
0255             return;
0256     }
0257 }
0258 
0259 ExtractorFilter ExtractorFilter::fromJSValue(const QJSValue &js)
0260 {
0261     ExtractorFilter f;
0262     f.setMimeType(js.property(QLatin1StringView("mimeType")).toString());
0263     const auto fieldName = js.property(QLatin1StringView("field"));
0264     if (fieldName.isString()) {
0265         f.setFieldName(fieldName.toString());
0266     }
0267     const auto match = js.property(QLatin1StringView("match"));
0268     if (match.isString()) {
0269         f.setPattern(match.toString());
0270     }
0271     f.setScope(readEnum<ExtractorFilter::Scope>(
0272         js.property(QLatin1StringView("scope")).toString(),
0273         ExtractorFilter::Current));
0274     return f;
0275 }
0276 
0277 #include "moc_extractorfilter.cpp"