File indexing completed on 2024-12-29 04:49:58
0001 /* 0002 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "extractorfilter.h" 0008 #include "extractordocumentnode.h" 0009 #include "extractordocumentprocessor.h" 0010 #include "extractorresult.h" 0011 #include "logging.h" 0012 0013 #include <QJsonObject> 0014 #include <QJSValue> 0015 #include <QMetaEnum> 0016 #include <QRegularExpression> 0017 0018 using namespace KItinerary; 0019 0020 namespace KItinerary { 0021 class ExtractorFilterPrivate : public QSharedData 0022 { 0023 public: 0024 QString m_mimeType; 0025 QString m_fieldName; 0026 QRegularExpression m_exp; 0027 ExtractorFilter::Scope m_scope = ExtractorFilter::Current; 0028 }; 0029 } 0030 0031 ExtractorFilter::ExtractorFilter() 0032 : d(new ExtractorFilterPrivate) 0033 { 0034 } 0035 0036 ExtractorFilter::ExtractorFilter(const ExtractorFilter&) = default; 0037 ExtractorFilter::ExtractorFilter(ExtractorFilter&&) noexcept = default; 0038 ExtractorFilter::~ExtractorFilter() = default; 0039 ExtractorFilter& ExtractorFilter::operator=(const ExtractorFilter&) = default; 0040 ExtractorFilter& ExtractorFilter::operator=(ExtractorFilter&&) = default; 0041 0042 QString ExtractorFilter::mimeType() const 0043 { 0044 return d->m_mimeType; 0045 } 0046 0047 void ExtractorFilter::setMimeType(const QString &mimeType) 0048 { 0049 d.detach(); 0050 d->m_mimeType = mimeType; 0051 } 0052 0053 QString ExtractorFilter::fieldName() const 0054 { 0055 return d->m_fieldName; 0056 } 0057 0058 void ExtractorFilter::setFieldName(const QString &fieldName) 0059 { 0060 d.detach(); 0061 d->m_fieldName = fieldName; 0062 } 0063 0064 bool ExtractorFilter::matches(const QString &data) const 0065 { 0066 if (!d->m_exp.isValid()) { 0067 qCDebug(Log) << d->m_exp.errorString() << d->m_exp.pattern(); 0068 } 0069 return d->m_exp.match(data).hasMatch(); 0070 } 0071 0072 static bool needsFieldName(const QString &mimeType) 0073 { 0074 return mimeType != QLatin1StringView("text/plain") && 0075 mimeType != QLatin1String("application/octet-stream"); 0076 } 0077 0078 template <typename T> 0079 static T readEnum(const QJsonValue &v, T defaultValue = {}) 0080 { 0081 if (!v.isString()) { 0082 return defaultValue; 0083 } 0084 0085 const auto me = QMetaEnum::fromType<T>(); 0086 bool success = false; 0087 const auto result = static_cast<T>(me.keyToValue(v.toString().toUtf8().constData(), &success)); 0088 return success ? result : defaultValue; 0089 } 0090 0091 bool ExtractorFilter::load(const QJsonObject &obj) 0092 { 0093 d.detach(); 0094 d->m_mimeType = obj.value(QLatin1StringView("mimeType")).toString(); 0095 if (d->m_mimeType.isEmpty()) { 0096 qCDebug(Log) << "unspecified filter MIME type"; 0097 } 0098 d->m_fieldName = obj.value(QLatin1StringView("field")).toString(); 0099 d->m_exp.setPattern(obj.value(QLatin1StringView("match")).toString()); 0100 d->m_scope = readEnum<ExtractorFilter::Scope>( 0101 obj.value(QLatin1StringView("scope")), ExtractorFilter::Current); 0102 return !d->m_mimeType.isEmpty() && (!d->m_fieldName.isEmpty() || !needsFieldName(d->m_mimeType)) && d->m_exp.isValid(); 0103 } 0104 0105 QJsonObject ExtractorFilter::toJson() const 0106 { 0107 QJsonObject obj; 0108 obj.insert(QLatin1StringView("mimeType"), d->m_mimeType); 0109 if (needsFieldName(d->m_mimeType)) { 0110 obj.insert(QLatin1StringView("field"), d->m_fieldName); 0111 } 0112 obj.insert(QLatin1StringView("match"), pattern()); 0113 obj.insert( 0114 QLatin1StringView("scope"), 0115 QLatin1String(QMetaEnum::fromType<ExtractorFilter::Scope>().valueToKey( 0116 d->m_scope))); 0117 return obj; 0118 } 0119 0120 QString ExtractorFilter::pattern() const 0121 { 0122 return d->m_exp.pattern(); 0123 } 0124 0125 void ExtractorFilter::setPattern(const QString &pattern) 0126 { 0127 d.detach(); 0128 d->m_exp.setPattern(pattern); 0129 } 0130 0131 ExtractorFilter::Scope ExtractorFilter::scope() const 0132 { 0133 return d->m_scope; 0134 } 0135 0136 void ExtractorFilter::setScope(Scope scope) 0137 { 0138 d.detach(); 0139 d->m_scope = scope; 0140 } 0141 0142 static QString valueForJsonPath(const QJsonObject &obj, const QString &path) 0143 { 0144 const auto pathSections = QStringView(path).split(QLatin1Char('.')); 0145 QJsonValue v(obj); 0146 for (const auto &pathSection : pathSections) { 0147 if (!v.isObject()) { 0148 return {}; 0149 } 0150 v = v.toObject().value(pathSection.toString()); 0151 } 0152 return v.toString(); 0153 } 0154 0155 enum MatchMode { Any, All }; 0156 0157 static bool filterMachesNode(const ExtractorFilter &filter, ExtractorFilter::Scope scope, const ExtractorDocumentNode &node, 0158 std::vector<ExtractorDocumentNode> &matches, MatchMode matchMode) 0159 { 0160 if (node.isNull()) { 0161 return false; 0162 } 0163 0164 // filter without field/pattern always match, if the mimetype does 0165 if (filter.mimeType() == node.mimeType() && ((filter.fieldName().isEmpty() && filter.pattern().isEmpty()) || node.processor()->matches(filter, node))) { 0166 if (matchMode == All) { 0167 matches.push_back(node); 0168 } 0169 return true; 0170 } 0171 0172 if (scope != ExtractorFilter::Ancestors && 0173 filter.mimeType() == QLatin1StringView("application/ld+json") && 0174 !node.result().isEmpty()) { 0175 // when collecting all matches for results, we only want the "leaf-most" 0176 // ones, not those along the path 0177 if (matchMode == All && scope == ExtractorFilter::Descendants) { 0178 bool descendantsMatched = false; 0179 for (const auto &child : node.childNodes()) { 0180 descendantsMatched |= filterMachesNode( 0181 filter, ExtractorFilter::Descendants, child, matches, matchMode); 0182 } 0183 if (descendantsMatched) { 0184 return true; 0185 } 0186 } 0187 0188 const auto res = node.result().jsonLdResult(); 0189 for (const auto &elem : res) { 0190 const auto property = 0191 valueForJsonPath(elem.toObject(), filter.fieldName()); 0192 if (filter.matches(property)) { 0193 if (matchMode == All) { 0194 matches.push_back(node); 0195 } else { 0196 return true; 0197 } 0198 } 0199 } 0200 } 0201 0202 if (scope == ExtractorFilter::Ancestors) { 0203 return filterMachesNode(filter, scope, node.parent(), matches, matchMode); 0204 } 0205 if (scope == ExtractorFilter::Descendants) { 0206 for (const auto &child : node.childNodes()) { 0207 const auto m = filterMachesNode(filter, ExtractorFilter::Descendants, child, matches, matchMode); 0208 if (m && matchMode == Any) { 0209 return true; 0210 } 0211 } 0212 } 0213 0214 return !matches.empty(); 0215 } 0216 0217 bool ExtractorFilter::matches(const ExtractorDocumentNode &node) const 0218 { 0219 std::vector<ExtractorDocumentNode> matches; 0220 switch (d->m_scope) { 0221 case ExtractorFilter::Current: 0222 return filterMachesNode(*this, ExtractorFilter::Current, node, matches, Any); 0223 case ExtractorFilter::Parent: 0224 return filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, Any); 0225 case ExtractorFilter::Ancestors: 0226 return filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, Any); 0227 case ExtractorFilter::Children: 0228 case ExtractorFilter::Descendants: 0229 for (const auto &child : node.childNodes()) { 0230 if (filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, Any)) { 0231 return true; 0232 } 0233 } 0234 } 0235 return false; 0236 } 0237 0238 void ExtractorFilter::allMatches(const ExtractorDocumentNode &node, std::vector<ExtractorDocumentNode>& matches) const 0239 { 0240 switch (d->m_scope) { 0241 case ExtractorFilter::Current: 0242 filterMachesNode(*this, ExtractorFilter::Current, node, matches, All); 0243 return; 0244 case ExtractorFilter::Parent: 0245 filterMachesNode(*this, ExtractorFilter::Current, node.parent(), matches, All); 0246 return; 0247 case ExtractorFilter::Ancestors: 0248 filterMachesNode(*this, ExtractorFilter::Ancestors, node.parent(), matches, All); 0249 return; 0250 case ExtractorFilter::Children: 0251 case ExtractorFilter::Descendants: 0252 for (const auto &child : node.childNodes()) { 0253 filterMachesNode(*this, d->m_scope == ExtractorFilter::Descendants ? d->m_scope : ExtractorFilter::Current, child, matches, All); 0254 } 0255 return; 0256 } 0257 } 0258 0259 ExtractorFilter ExtractorFilter::fromJSValue(const QJSValue &js) 0260 { 0261 ExtractorFilter f; 0262 f.setMimeType(js.property(QLatin1StringView("mimeType")).toString()); 0263 const auto fieldName = js.property(QLatin1StringView("field")); 0264 if (fieldName.isString()) { 0265 f.setFieldName(fieldName.toString()); 0266 } 0267 const auto match = js.property(QLatin1StringView("match")); 0268 if (match.isString()) { 0269 f.setPattern(match.toString()); 0270 } 0271 f.setScope(readEnum<ExtractorFilter::Scope>( 0272 js.property(QLatin1StringView("scope")).toString(), 0273 ExtractorFilter::Current)); 0274 return f; 0275 } 0276 0277 #include "moc_extractorfilter.cpp"