File indexing completed on 2024-12-29 04:49:58

0001 /*
0002    SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
0003 
0004    SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #pragma once
0008 
0009 #include "abstractextractor.h"
0010 
0011 #include <memory>
0012 #include <vector>
0013 
0014 class QJsonObject;
0015 class QString;
0016 
0017 namespace KItinerary {
0018 class ExtractorFilter;
0019 class ScriptExtractorPrivate;
0020 
0021 /** A single unstructured data extraction rule set.
0022  *
0023  * These rules are loaded from JSON meta-data files in a compiled-in qrc file,
0024  * or from $XDG_DATA_DIRS/kitinerary/extractors.
0025  *
0026  * @section extractor_metadata Meta Data Format
0027  *
0028  * The meta-data files either contain a single JSON object or an array of JSON objects
0029  * with the following content:
0030  * - \c mimeType: The MIME type of the extractor, \c text if not specified.
0031  * - \c filter: An array of filters that are used to select this extractor for a given input file.
0032  * - \c script: A JavaScript file to execute.
0033  * - \c function: The entry point in the above mentioned script, @c main if not specified.
0034  *
0035  * The following extractor types are supported:
0036  * - \c text/plain: plain text, the argument to the script function is a single string.
0037  * - \c text/html: HTML documents, the argument to the script function is a KItinerary::HtmlDocument instance.
0038  * - \c application/pdf: PDF documents, the argument to the script function is a KItinerary::PdfDocument instance.
0039  * - \c application/vnd.apple.pkpass: Apple Wallet passes, the argument to the script function is a KPkPass::Pass instance.
0040  * - \c internal/event: iCalendar events, the argument to the script function is a KCalendarCore::Event instance.
0041  *
0042  * Filter definitions have the following fields:
0043  * - \c mimeType: The MIME type of the document part this filter can match against.
0044  * - \c field: The name of the field to match against. This can be a field id in an Apple Wallet pass,
0045  *      a MIME message header name, a property on a JSON-LD object or an iCal calendar or event.
0046  *      For plain text or binary content, this is ignored.
0047  * - \c match: A regular expression that is matched against the specified value (see QRegularExpression).
0048  * - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted.
0049  *      One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default).
0050  *
0051  * Example:
0052  * @code
0053  * [
0054  *   {
0055  *     "mimeType": "application/pdf",
0056  *     "filter": [ { "field": "From", "match": "@swiss.com", "mimeType": "message/rfc822", "scope": "Ancestors" } ],
0057  *     "script": "swiss.js",
0058  *     "function": "parsePdf"
0059  *   },
0060  *   {
0061  *     "mimeType": "application/vnd.apple.pkpass",
0062  *     "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "mimeType": "application/vnd.apple.pkpass", "scope": "Current" } ],
0063  *     "script": "swiss.js",
0064  *     "function": "parsePkPass"
0065  *   }
0066  * ]
0067  * @endcode
0068  *
0069  * @section extractor_development Development
0070  *
0071  * For development it's convenient to symlink the extractors source folder to
0072  * $XDG_DATA_DIRS/kitinerary/extractors, so you can re-run a changed extractor
0073  * script without recompiling or restarting the application.
0074  *
0075  */
0076 class KITINERARY_EXPORT ScriptExtractor : public AbstractExtractor
0077 {
0078 public:
0079     explicit ScriptExtractor();
0080     ~ScriptExtractor(); // NOTE(review): presumably defined out-of-line, since ScriptExtractorPrivate is only forward-declared in this header
0081     /* The next three members override virtual functions declared by AbstractExtractor. */
0082     QString name() const override;
0083     bool canHandle(const ExtractorDocumentNode &node) const override;
0084     ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
0085 
0086     /** File name of the JS script containing the code of this extractor. */
0087     QString scriptFileName() const;
0088     /** Name of the JS function used as entry point of this extractor, @c main if empty. */
0089     QString scriptFunction() const;
0090     /** MIME type this script extractor supports. */
0091     QString mimeType() const;
0092     /** Returns the filters deciding whether this extractor should be applied. */
0093     const std::vector<ExtractorFilter> &filters() const;
0094 
0095     ///@cond internal
0096     /** Load meta data from the JSON object @p obj; @p fileName and @p index are passed alongside it. NOTE(review): presumably the return value signals success and @p index is the position inside a JSON array (-1 if not applicable) - confirm against the implementation. */
0097     bool load(const QJsonObject &obj, const QString &fileName, int index = -1);
0098     /** Save the extractor meta data to a JSON object and return it. */
0099     QJsonObject toJson() const;
0100 
0101     /** Source file name. NOTE(review): presumably the @p fileName handed to load() - confirm. */
0102     QString fileName() const;
0103     /* Setters. NOTE(review): presumably the counterparts of mimeType(), scriptFileName(), scriptFunction() and filters() - confirm. */
0104     void setMimeType(const QString &mimeType);
0105     void setScriptFileName(const QString &script);
0106     void setScriptFunction(const QString &func);
0107     void setFilters(std::vector<ExtractorFilter> &&filters); // overload taking an rvalue reference
0108     void setFilters(const std::vector<ExtractorFilter> &filters); // overload taking a const lvalue reference
0109     ///@endcond
0110 
0111 private:
0112     std::unique_ptr<ScriptExtractorPrivate> d; // uniquely owned opaque private data; its type is only forward-declared in this header
0113 };
0114 
0115 }
0116