lib/DictKanjidic/entrykanjidic.cpp

0001 /*
0002     This file is part of Kiten, a KDE Japanese Reference Tool
0003     SPDX-FileCopyrightText: 2001 Jason Katz-Brown <jason@katzbrown.com>
0004     SPDX-FileCopyrightText: 2006 Joseph Kerian <jkerian@gmail.com>
0005     SPDX-FileCopyrightText: 2006 Eric Kjeldergaard <kjelderg@gmail.com>
0006     SPDX-FileCopyrightText: 2011 Daniel E. Moctezuma <democtezuma@gmail.com>
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010
0011 #include "entrykanjidic.h"
0012
0013 #include "dictfilekanjidic.h"
0014 #include "kitenmacros.h"
0015
0016 #include <KLocalizedString>
0017
0018 #define QSTRINGLISTCHECK(x) (x == NULL ? QStringList() : *x)
0019
0020 using namespace Qt::StringLiterals;
0021
0022 EntryKanjidic::EntryKanjidic(const EntryKanjidic &dict)
0023     : Entry(dict)
0024 {
0025 }
0026
0027 EntryKanjidic::EntryKanjidic(const QString &dict)
0028     : Entry(dict)
0029 {
0030 }
0031
0032 EntryKanjidic::EntryKanjidic(const QString &dict, const QString &entry)
0033     : Entry(dict)
0034 {
0035     loadEntry(entry);
0036 }
0037
0038 QString EntryKanjidic::addReadings(const QStringList &list) const
0039 {
0040     QString readings;
0041     for (const QString &reading : list) {
0042         readings.append(makeReadingLink(reading) + outputListDelimiter);
0043     }
0044
0045     return readings;
0046 }
0047
0048 Entry *EntryKanjidic::clone() const
0049 {
0050     return new EntryKanjidic(*this);
0051 }
0052
0053 /**
0054  * This reproduces a kanjidic-formatted line from the Entry.
0055  *  Look at the above parser to see how the format works.
0056  */
0057 QString EntryKanjidic::dumpEntry() const
0058 {
0059     /* Loop over the ExtendedInfo to add it to the line we produce */
0060     QString dumpExtendedInfo;
0061     QHash<QString, QString>::const_iterator it;
0062     for (it = ExtendedInfo.constBegin(); it != ExtendedInfo.constEnd(); ++it) {
0063         dumpExtendedInfo += ' '_L1 + it.key() + it.value();
0064     }
0065
0066     return QStringLiteral("%1 %2%3").arg(Word).arg(Readings.join(QLatin1Char(' '))).arg(dumpExtendedInfo);
0067 }
0068
0069 bool EntryKanjidic::extendedItemCheck(const QString &key, const QString &value) const
0070 {
0071     if (key == QLatin1String("common")) {
0072         return !getExtendedInfoItem(QStringLiteral("G")).isEmpty();
0073     }
0074
0075     return Entry::extendedItemCheck(key, value);
0076 }
0077
0078 QString EntryKanjidic::getAsRadicalReadings() const
0079 {
0080     return AsRadicalReadings.join(outputListDelimiter);
0081 }
0082
0083 QStringList EntryKanjidic::getAsRadicalReadingsList() const
0084 {
0085     return AsRadicalReadings;
0086 }
0087
0088 QString EntryKanjidic::getDictionaryType() const
0089 {
0090     return KANJIDIC;
0091 }
0092
0093 QString EntryKanjidic::getInNamesReadings() const
0094 {
0095     return InNamesReadings.join(outputListDelimiter);
0096 }
0097
0098 QStringList EntryKanjidic::getInNamesReadingsList() const
0099 {
0100     return InNamesReadings;
0101 }
0102
0103 QString EntryKanjidic::getKanjiGrade() const
0104 {
0105     return getExtendedInfoItem(QStringLiteral("G"));
0106 }
0107
0108 QString EntryKanjidic::getKunyomiReadings() const
0109 {
0110     return KunyomiReadings.join(outputListDelimiter);
0111 }
0112
0113 QStringList EntryKanjidic::getKunyomiReadingsList() const
0114 {
0115     return KunyomiReadings;
0116 }
0117
0118 QString EntryKanjidic::getOnyomiReadings() const
0119 {
0120     return OnyomiReadings.join(outputListDelimiter);
0121 }
0122
0123 QStringList EntryKanjidic::getOnyomiReadingsList() const
0124 {
0125     return OnyomiReadings;
0126 }
0127
0128 QString EntryKanjidic::getStrokesCount() const
0129 {
0130     return getExtendedInfoItem(QStringLiteral("S"));
0131 }
0132
0133 QString EntryKanjidic::HTMLExtendedInfo(const QString &field) const
0134 {
0135     // qDebug() << field;
0136     return QStringLiteral("<span class=\"ExtendedInfo\">%1: %2</span>").arg(field).arg(ExtendedInfo[field]);
0137 }
0138
0139 /**
0140  * Prepares Readings for output as HTML
0141  */
0142 QString EntryKanjidic::HTMLReadings() const
0143 {
0144     QString htmlReadings;
0145     htmlReadings += addReadings(originalReadings);
0146
0147     if (InNamesReadings.count() > 0) {
0148         htmlReadings += i18n("In names: ");
0149         htmlReadings += addReadings(InNamesReadings);
0150     }
0151
0152     if (AsRadicalReadings.count() > 0) {
0153         htmlReadings += i18n("As radical: ");
0154         htmlReadings += addReadings(AsRadicalReadings);
0155     }
0156
0157     // get rid of last ,
0158     htmlReadings.truncate(htmlReadings.length() - outputListDelimiter.length());
0159     return QStringLiteral("<span class=\"Readings\">%1</span>").arg(htmlReadings);
0160 }
0161
0162 QString EntryKanjidic::HTMLWord() const
0163 {
0164     return QStringLiteral("<span class=\"Word\">%1</span>").arg(makeLink(Word));
0165 }
0166
0167 /**
0168  * Fill the fields of our Entry object appropriate to the given
0169  * entry line from Kanjidic.
0170  */
0171 /* TODO: Error checking */
0172 bool EntryKanjidic::loadEntry(const QString &entryLine)
0173 {
0174     unsigned int length = entryLine.length();
0175
0176     /* The loop would be a bit faster if we first grabbed the kanji (2 bytes) and then the
0177        space that follows, etc. for the fixed-space portion of the entries let's try that.
0178        First the first 2 bytes are guaranteed to be our kanji.  The 3rd byte is a space.
0179        The 4th through 7th are an ascii representation of the JIS code.  One more space
0180        Currently, kana are not detected so readings are anything that is not otherwise
0181        in the 8th position. */
0182     Word = entryLine.left(1);
0183     //  QString strjis = raw.mid( 2, 4 );
0184
0185     /* variables for the loop */
0186     QChar ichar;
0187     QString curString;
0188
0189 /* we would need to do these exact things ... many times so here now. */
0190 #define INCI                                                                                                                                                   \
0191     if (i < length) {                                                                                                                                          \
0192         i++;                                                                                                                                                   \
0193         ichar = entryLine.at(i);                                                                                                                               \
0194     }
0195 #define LOADSTRING(stringToLoad)                                                                                                                               \
0196     while (entryLine.at(i) != ' '_L1) {                                                                                                                        \
0197         stringToLoad += entryLine.at(i);                                                                                                                       \
0198         if (i < length)                                                                                                                                        \
0199             i++;                                                                                                                                               \
0200         else                                                                                                                                                   \
0201             break;                                                                                                                                             \
0202     }
0203
0204     //  qDebug() << "LOADSTRING: '" << stringToLoad << "'";
0205
0206     /* We can start looping at 8 because we have guarantees about the initial
0207        data.  This loop is used because the kanjidic format allows the data
0208        to be in any order until the end of the line.  The format was designed
0209        such that the data can be identified by the first byte. */
0210     for (unsigned int i = 7; i < length - 1; i++) {
0211         ichar = entryLine.at(i);
0212
0213         curString = QLatin1String("");
0214         switch (ichar.unicode()) {
0215         case ' ':
0216             /* as far as I can tell, there is no real rule forcing only 1 space so
0217                     there's not really any significance to them.  This block is not
0218                     reached in kanjidic itself. */
0219             break;
0220         case 'B':
0221             /* the radical, or busyu, number */
0222         case 'C':
0223             /* the classical radical number, usually doesn't differ from busyu number */
0224         case 'E':
0225             /* Henshell's "A Guide To Remembering Japanese Characters" index number */
0226         case 'F':
0227             /* frequency ranking */
0228         case 'G':
0229             /* grade level Jouyou 1 - 6 or 8 for common use or 9 for Jinmeiyou */
0230         case 'H':
0231             /* number from Halpern's New Japanese-English Character Dictionary */
0232         case 'K':
0233             /* Gakken Kanji Dictionary index */
0234         case 'L':
0235             /* Heisig's "Remembering The Kanji" index */
0236         case 'N':
0237             /* number from Nelson's Modern Reader's Japanese-English Character Dictionary */
0238         case 'O':
0239             /* O'Neill's "Japanese Names" index number */
0240         case 'P':
0241             /* SKIP code ... #-#-# format */
0242         case 'Q':
0243             /* Four Corner codes, it seems, can be multiple though I'm tempted just to take the last one. */
0244         case 'U':
0245             /* unicode which we are ignoring as it is found in another way */
0246         case 'V':
0247             /* number from Haig's New Nelson Japanese-English Character Dictionary */
0248         case 'W':
0249             /* korean reading */
0250         case 'X':
0251             /* I don't entirely understand this field. */
0252         case 'Y':
0253             /* Pinyin reading */
0254         case 'Z':
0255             /* SKIP misclassifications */
0256
0257             /* All of the above are of the format <Char><Data> where <Char> is
0258                     exactly 1 character. */
0259             i++;
0260             LOADSTRING(curString)
0261             ExtendedInfo.insert(QString(ichar), curString);
0262             break;
0263         case 'I':
0264             /* index codes for Spahn & Hadamitzky reference books we need the next
0265                     char to know what to do with it. */
0266             INCI if (ichar == 'N'_L1)
0267             {
0268                 /* a Kanji & Kana book number */
0269                 LOADSTRING(curString)
0270             }
0271             else {/* The Kanji Dictionary number, we need the current ichar. */
0272                   LOADSTRING(curString)} ExtendedInfo.insert('I'_L1 + QString(ichar), curString);
0273             break;
0274         case 'M':
0275             /* index and page numbers for Morohashi's Daikanwajiten 2 fields possible */
0276             INCI if (ichar == 'N'_L1)
0277             {
0278                 LOADSTRING(curString)
0279                 /* index number */
0280             }
0281             else if (ichar == 'P'_L1){
0282                 LOADSTRING(curString)
0283                 /* page number in volume.page format */
0284             } ExtendedInfo.insert('M'_L1 + QString(ichar), curString);
0285             break;
0286         case 'S':
0287             /* stroke count: may be multiple.  In that case, first is actual, others common
0288                     miscounts */
0289             i++;
0290             if (!ExtendedInfo.contains(QStringLiteral("S"))) {
0291                 LOADSTRING(curString)
0292                 ExtendedInfo.insert(QString(ichar), curString);
0293             } else {
0294                 LOADSTRING(curString)
0295                 ExtendedInfo.insert('_'_L1 + QString(ichar), curString);
0296             }
0297             break;
0298         case 'D':
0299             /* dictionary codes */
0300             INCI LOADSTRING(curString) ExtendedInfo.insert('D'_L1 + QString(ichar), curString);
0301             break;
0302         case '{':
0303             /* This should be starting with the first '{' character of a meaning section.
0304                     Let us get take it to the last. */
0305             INCI while (ichar != '}'_L1)
0306             {
0307                 curString += ichar;
0308                 /* sanity */
0309                 if (i < length) {
0310                     i++;
0311                 } else {
0312                     break;
0313                 }
0314                 ichar = entryLine.at(i);
0315             }
0316             INCI
0317                 //           qDebug() << "Meaning's curString: '" << curString << "'";
0318                 Meanings.append(curString);
0319             break;
0320         case 'T': /* a reading that is used in names for T1, radical names for T2 */
0321         {
0322             i++;
0323             LOADSTRING(curString)
0324             // Get the type number (1 for T1, 2 for T2).
0325             int type = curString.toInt();
0326             bool finished = false;
0327             while (!finished) {
0328                 // Skip all whitespaces.
0329                 INCI while (ichar == ' '_L1)
0330                 {
0331                     INCI
0332                 }
0333                 // Check if the current character is Kana.
0334                 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF) {
0335                     // Reset our variable and load it with
0336                     // all available kana until we find a whitespace.
0337                     curString = QLatin1String("");
0338                     LOADSTRING(curString)
0339                     switch (type) {
0340                     case 1: // Special reading used in names.
0341                         InNamesReadings.append(curString);
0342                         break;
0343                     case 2: // Reading as radical.
0344                         AsRadicalReadings.append(curString);
0345                         break;
0346                     }
0347                 } else {
0348                     // There are not more kana characters,
0349                     // so we finish this loop for now.
0350                     finished = true;
0351                 }
0352             }
0353             // Now 'i' points to a '{' character. We decrease its value
0354             // so in the next loop we can reach the "case '{'" section.
0355             i--;
0356         } break;
0357         case '-':
0358             /* a reading that is only in postposition */
0359             /* any of those 2 signals a reading is to ensue. */
0360             LOADSTRING(curString)
0361             originalReadings.append(curString);
0362
0363             // If it is Hiragana (Kunyomi)
0364             if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F) {
0365                 KunyomiReadings.append(curString);
0366             }
0367             // If it is Katakana (Onyomi)
0368             else if (0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF) {
0369                 OnyomiReadings.append(curString);
0370             }
0371
0372             curString = curString.remove('-'_L1).remove('.'_L1);
0373             Readings.append(curString);
0374             break;
0375         default:
0376             /* either a character we don't address or a problem...we should ignore it */
0377             //    qDebug() << "hit default in kanji parser.  Unicode: '" << ichar.unicode() << "'";
0378
0379             /* This should detect unicode kana */
0380             // Hiragana 0x3040 - 0x309F, Katakana: 0x30A0 - 0x30FF
0381             if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF) {
0382                 LOADSTRING(curString)
0383                 originalReadings.append(curString);
0384
0385                 // If it is Hiragana (Kunyomi)
0386                 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F) {
0387                     KunyomiReadings.append(curString);
0388                 }
0389                 // If it is Katakana (Onyomi)
0390                 else if (0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF) {
0391                     OnyomiReadings.append(curString);
0392                 }
0393
0394                 curString = curString.remove('-'_L1).remove('.'_L1);
0395                 Readings.append(curString);
0396                 break;
0397             }
0398             /* if it's not a kana reading ... it is something unhandled ...
0399                possibly a new field in kanjidic.  Let's treat it as the
0400                oh-so-common <char><data> type of entry.  It could be hotly
0401                debated what we should actually do about these. */
0402             i++;
0403             LOADSTRING(curString)
0404             ExtendedInfo.insert(QString(ichar), curString);
0405
0406             break;
0407         }
0408     }
0409     //   qDebug() << "Parsed: '"<<Word<<"' ("<<Readings.join("^")<<") \""<<
0410     //   Meanings.join("|")<<"\ and " <<ExtendedInfo.keys() << " from :"<<entryLine<<endl;
0411
0412     return true;
0413 }
0414
0415 QString EntryKanjidic::makeReadingLink(const QString &inReading) const
0416 {
0417     QString reading = inReading;
0418     return QStringLiteral("<a href=\"%1\">%2</a>").arg(reading.remove('.'_L1).remove('-'_L1)).arg(inReading);
0419 }
0420
0421 /**
0422  * Returns a HTML version of an Entry
0423  */
0424 QString EntryKanjidic::toHTML() const
0425 {
0426     QString result = QStringLiteral("<div class=\"KanjidicBrief\">");
0427
0428     for (const QString &field : QSTRINGLISTCHECK(DictFileKanjidic::displayFields)) {
0429         // qDebug() << "Display: "<<field;
0430         if (field == QLatin1String("--NewLine--"))
0431             result += QLatin1String("<br>");
0432         else if (field == QLatin1String("Word/Kanji"))
0433             result += HTMLWord() + ' '_L1;
0434         else if (field == QLatin1String("Meaning"))
0435             result += HTMLMeanings() + ' '_L1;
0436         else if (field == QLatin1String("Reading"))
0437             result += HTMLReadings() + ' '_L1;
0438         else if (ExtendedInfo.contains(field))
0439             result += HTMLExtendedInfo(field) + ' '_L1;
0440     }
0441
0442     result += QLatin1String("</div>");
0443     return result;
0444 }