File indexing completed on 2024-11-24 04:44:39

0001 /*
0002  * SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
0003  * SPDX-License-Identifier: LGPL-2.0-or-later
0004  */
0005 
0006 #include "rdf_p.h"
0007 
0008 #include <QBuffer>
0009 #include <QCryptographicHash>
0010 #include <QHash>
0011 #include <QMap>
0012 #include <QIODevice>
0013 
0014 bool Rdf::Term::operator<(const Rdf::Term& other) const
0015 {
0016     if (type == other.type) {
0017         if (value == other.value) {
0018             return literalType < other.literalType;
0019         }
0020         return value < other.value;
0021     }
0022     return type < other.type;
0023 }
0024 
0025 bool Rdf::Term::operator==(const Rdf::Term& other) const
0026 {
0027     return type == other.type && value == other.value && literalType == other.literalType;
0028 }
0029 
0030 bool Rdf::Quad::operator<(const Rdf::Quad &other) const
0031 {
0032     if (subject == other.subject) {
0033         if (predicate == other.predicate) {
0034             return object < other.object;
0035         }
0036         return predicate < other.predicate;
0037     }
0038     return subject < other.subject;
0039 }
0040 
0041 // see https://json-ld.github.io/rdf-dataset-canonicalization/spec/#hash-first-degree-quads
0042 static QByteArray hashFirstDegreeQuads(const std::vector<Rdf::Quad> &quads, const QString &refBlankNode)
0043 {
0044     const auto renameBlankNode = [&refBlankNode](Rdf::Term &term) {
0045         if (term.type == Rdf::Term::BlankNode) {
0046             if (term.value == refBlankNode) {
0047                 term.value = QStringLiteral("a");
0048             } else {
0049                 term.value = QStringLiteral("z");
0050             }
0051         }
0052     };
0053 
0054     std::vector<Rdf::Quad> toHash;
0055     for (auto quad : quads) {
0056         renameBlankNode(quad.subject);
0057         renameBlankNode(quad.predicate);
0058         renameBlankNode(quad.object);
0059         toHash.push_back(std::move(quad));
0060     }
0061 
0062     std::sort(toHash.begin(), toHash.end());
0063     return QCryptographicHash::hash(serialize(toHash), QCryptographicHash::Sha256).toHex();
0064 }
0065 
0066 QByteArray Rdf::serialize(const std::vector<Rdf::Quad>& quads)
0067 {
0068     QByteArray out;
0069     QBuffer buffer(&out);
0070     buffer.open(QIODevice::WriteOnly);
0071     Rdf::serialize(&buffer, quads);
0072     buffer.close();
0073     return out;
0074 }
0075 
0076 void Rdf::normalize(std::vector<Rdf::Quad>& quads)
0077 {
0078     // see https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm
0079     QHash<QString, std::vector<Rdf::Quad>> blankNodeToQuadMap;
0080     for (const auto &quad : quads) {
0081         // ignores predicates and the same blank nodes used multiple times in a quad, as that doesn't happen for us
0082         if (quad.subject.type == Rdf::Term::BlankNode) {
0083             blankNodeToQuadMap[quad.subject.value].push_back(quad);
0084         }
0085         if (quad.object.type == Rdf::Term::BlankNode) {
0086             blankNodeToQuadMap[quad.object.value].push_back(quad);
0087         }
0088     }
0089 
0090     QMap<QByteArray, QString> hashToBlankNodeMap;
0091     for (auto it = blankNodeToQuadMap.begin(); it != blankNodeToQuadMap.end(); ++it) {
0092         hashToBlankNodeMap.insert(hashFirstDegreeQuads(it.value(), it.key()), it.key());
0093     }
0094 
0095     int c14nIdCounter = 0;
0096     QHash<QString, QString> blankNodeC14nMap;
0097     for (auto it = hashToBlankNodeMap.begin(); it != hashToBlankNodeMap.end(); ++it) {
0098         blankNodeC14nMap.insert(it.value(), QLatin1String("c14n") + QString::number(c14nIdCounter++));
0099     }
0100 
0101     const auto translateBlankNode = [&blankNodeC14nMap](Rdf::Term &term) {
0102         if (term.type == Rdf::Term::BlankNode) {
0103             const auto it = blankNodeC14nMap.constFind(term.value);
0104             if (it != blankNodeC14nMap.constEnd()) {
0105                 term.value = it.value();
0106             }
0107         }
0108     };
0109     for (auto &quad : quads) {
0110         translateBlankNode(quad.subject);
0111         translateBlankNode(quad.predicate);
0112         translateBlankNode(quad.object);
0113     }
0114 
0115     std::sort(quads.begin(), quads.end());
0116     quads.erase(std::unique(quads.begin(), quads.end(), [](const auto &lhs, const auto &rhs) {
0117         return lhs.subject == rhs.subject && lhs.predicate == rhs.predicate && lhs.object == rhs.object;
0118     }), quads.end());
0119 }
0120 
0121 void Rdf::serialize(QIODevice *out, const std::vector<Rdf::Quad> &quads)
0122 {
0123     for (const auto &quad : quads) {
0124         serialize(out, quad);
0125     }
0126 }
0127 
0128 void Rdf::serialize(QIODevice *out, const Rdf::Quad &quad)
0129 {
0130     serialize(out, quad.subject);
0131     out->write(" ");
0132     serialize(out, quad.predicate);
0133     out->write(" ");
0134     serialize(out, quad.object);
0135     out->write(" .\n");
0136 }
0137 
0138 void Rdf::serialize(QIODevice* out, const Rdf::Term &term)
0139 {
0140     switch (term.type) {
0141         case Term::IRI:
0142             out->write("<");
0143             out->write(term.value.toUtf8());
0144             out->write(">");
0145             break;
0146         case Term::BlankNode:
0147             out->write("_:");
0148             out->write(term.value.toUtf8());
0149             break;
0150         case Term::Literal:
0151             out->write("\"");
0152             out->write(term.value.toUtf8());
0153             out->write("\"");
0154             if (!term.literalType.isEmpty()) {
0155                 out->write("^^<");
0156                 out->write(term.literalType.toUtf8());
0157                 out->write(">");
0158             }
0159             break;
0160         case Term::Undefined:
0161             out->write(term.value.toUtf8());
0162             break;
0163     }
0164 }