File indexing completed on 2024-12-29 04:49:58
0001 /* 0002 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #pragma once 0008 0009 #include "abstractextractor.h" 0010 0011 #include <memory> 0012 #include <vector> 0013 0014 class QJsonObject; 0015 class QString; 0016 0017 namespace KItinerary { 0018 class ExtractorFilter; 0019 class ScriptExtractorPrivate; 0020 0021 /** A single unstructured data extraction rule set. 0022 * 0023 * These rules are loaded from JSON meta-data files in a compiled-in qrc file, 0024 * or from $XDG_DATA_DIRS/kitinerary/extractors. 0025 * 0026 * @section extractor_metadata Meta Data Format 0027 * 0028 * The meta-data files either contain a single JSON object or an array of JSON objects 0029 * with the following content: 0030 * - \c mimeType: The MIME type of the extractor, \c text if not specified. 0031 * - \c filter: An array of filters that are used to select this extractor for a given input file. 0032 * - \c script: A JavaScript file to execute. 0033 * - \c function: The entry point in the above mentioned script, @c main if not specified. 0034 * 0035 * The following extractor types are supported: 0036 * - \c text/plain: plain text, the argument to the script function is a single string. 0037 * - \c text/html: HTML documents, the argument to the script function is a KItinerary::HtmlDocument instance. 0038 * - \c application/pdf: PDF documents, the argument to the script function is a KItinerary::PdfDocument instance. 0039 * - \c application/vnd.apple.pkpass: Apple Wallet passes, the argument to the script function is a KPkPass::Pass instance. 0040 * - \c internal/event: iCalendar events, the argument to the script function is a KCalendarCore::Event instance. 0041 * 0042 * Filter definitions have the following field: 0043 * - \c mimeType: The MIME type of the document part this filter can match against. 0044 * - \c field: The name of the field to match against. This can be a field id in a Apple Wallet pass, 0045 * A MIME message header name, a property on a Json-LD object or an iCal calendar or event. 0046 * For plain text or binary content, this is ignored. 0047 * - \c match: A regular expression that is matched against the specified value (see QRegularExpression). 0048 * - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted. 0049 * One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default). 0050 * 0051 * Example: 0052 * @code 0053 * [ 0054 * { 0055 * "mimeType": "application/pdf", 0056 * "filter": [ { "field": "From", "match": "@swiss.com", "mimeType": "message/rfc822", "scope": "Ancestors" } ], 0057 * "script": "swiss.js", 0058 * "function": "parsePdf" 0059 * }, 0060 * { 0061 * "mimeType": "application/vnd.apple.pkpass", 0062 * "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "mimeType": "application/vnd.apple.pkpass", "scope": "Current" } ], 0063 * "script": "swiss.js", 0064 * "function": "parsePkPass" 0065 * } 0066 * ] 0067 * @endcode 0068 * 0069 * @section extractor_development Development 0070 * 0071 * For development it's convenient to symlink the extractors source folder to 0072 * $XDG_DATA_DIRS/kitinerary/extractors, so you can re-run a changed extractor 0073 * script without recompiling or restarting the application. 0074 * 0075 */ 0076 class KITINERARY_EXPORT ScriptExtractor : public AbstractExtractor 0077 { 0078 public: 0079 explicit ScriptExtractor(); 0080 ~ScriptExtractor(); 0081 0082 QString name() const override; 0083 bool canHandle(const ExtractorDocumentNode &node) const override; 0084 ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override; 0085 0086 /** The JS script containing the code of the extractor. */ 0087 QString scriptFileName() const; 0088 /** The JS function entry point for this extractor, @c main if empty. */ 0089 QString scriptFunction() const; 0090 /** Mime type this script extractor supports. */ 0091 QString mimeType() const; 0092 /** Returns the filters deciding whether this extractor should be applied. */ 0093 const std::vector<ExtractorFilter> &filters() const; 0094 0095 ///@cond internal 0096 /** Load meta data from the given JSON object. */ 0097 bool load(const QJsonObject &obj, const QString &fileName, int index = -1); 0098 /** Save extractor meta data to a JSON object. */ 0099 QJsonObject toJson() const; 0100 0101 /** Source file name. */ 0102 QString fileName() const; 0103 0104 void setMimeType(const QString &mimeType); 0105 void setScriptFileName(const QString &script); 0106 void setScriptFunction(const QString &func); 0107 void setFilters(std::vector<ExtractorFilter> &&filters); 0108 void setFilters(const std::vector<ExtractorFilter> &filters); 0109 ///@endcond 0110 0111 private: 0112 std::unique_ptr<ScriptExtractorPrivate> d; 0113 }; 0114 0115 } 0116