File indexing completed on 2024-04-28 07:29:21
0001 /* 0002 This file is part of Kiten, a KDE Japanese Reference Tool 0003 SPDX-FileCopyrightText: 2001 Jason Katz-Brown <jason@katzbrown.com> 0004 SPDX-FileCopyrightText: 2006 Joseph Kerian <jkerian@gmail.com> 0005 SPDX-FileCopyrightText: 2006 Eric Kjeldergaard <kjelderg@gmail.com> 0006 SPDX-FileCopyrightText: 2011 Daniel E. Moctezuma <democtezuma@gmail.com> 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 0011 #include "entrykanjidic.h" 0012 0013 #include "dictfilekanjidic.h" 0014 #include "kitenmacros.h" 0015 0016 #include <KLocalizedString> 0017 0018 #define QSTRINGLISTCHECK(x) (x == NULL ? QStringList() : *x) 0019 0020 using namespace Qt::StringLiterals; 0021 0022 EntryKanjidic::EntryKanjidic(const EntryKanjidic &dict) 0023 : Entry(dict) 0024 { 0025 } 0026 0027 EntryKanjidic::EntryKanjidic(const QString &dict) 0028 : Entry(dict) 0029 { 0030 } 0031 0032 EntryKanjidic::EntryKanjidic(const QString &dict, const QString &entry) 0033 : Entry(dict) 0034 { 0035 loadEntry(entry); 0036 } 0037 0038 QString EntryKanjidic::addReadings(const QStringList &list) const 0039 { 0040 QString readings; 0041 for (const QString &reading : list) { 0042 readings.append(makeReadingLink(reading) + outputListDelimiter); 0043 } 0044 0045 return readings; 0046 } 0047 0048 Entry *EntryKanjidic::clone() const 0049 { 0050 return new EntryKanjidic(*this); 0051 } 0052 0053 /** 0054 * This reproduces a kanjidic-formatted line from the Entry. 0055 * Look at the above parser to see how the format works. 0056 */ 0057 QString EntryKanjidic::dumpEntry() const 0058 { 0059 /* Loop over the ExtendedInfo to add it to the line we produce */ 0060 QString dumpExtendedInfo; 0061 QHash<QString, QString>::const_iterator it; 0062 for (it = ExtendedInfo.constBegin(); it != ExtendedInfo.constEnd(); ++it) { 0063 dumpExtendedInfo += ' '_L1 + it.key() + it.value(); 0064 } 0065 0066 return QStringLiteral("%1 %2%3").arg(Word).arg(Readings.join(QLatin1Char(' '))).arg(dumpExtendedInfo); 0067 } 0068 0069 bool EntryKanjidic::extendedItemCheck(const QString &key, const QString &value) const 0070 { 0071 if (key == QLatin1String("common")) { 0072 return !getExtendedInfoItem(QStringLiteral("G")).isEmpty(); 0073 } 0074 0075 return Entry::extendedItemCheck(key, value); 0076 } 0077 0078 QString EntryKanjidic::getAsRadicalReadings() const 0079 { 0080 return AsRadicalReadings.join(outputListDelimiter); 0081 } 0082 0083 QStringList EntryKanjidic::getAsRadicalReadingsList() const 0084 { 0085 return AsRadicalReadings; 0086 } 0087 0088 QString EntryKanjidic::getDictionaryType() const 0089 { 0090 return KANJIDIC; 0091 } 0092 0093 QString EntryKanjidic::getInNamesReadings() const 0094 { 0095 return InNamesReadings.join(outputListDelimiter); 0096 } 0097 0098 QStringList EntryKanjidic::getInNamesReadingsList() const 0099 { 0100 return InNamesReadings; 0101 } 0102 0103 QString EntryKanjidic::getKanjiGrade() const 0104 { 0105 return getExtendedInfoItem(QStringLiteral("G")); 0106 } 0107 0108 QString EntryKanjidic::getKunyomiReadings() const 0109 { 0110 return KunyomiReadings.join(outputListDelimiter); 0111 } 0112 0113 QStringList EntryKanjidic::getKunyomiReadingsList() const 0114 { 0115 return KunyomiReadings; 0116 } 0117 0118 QString EntryKanjidic::getOnyomiReadings() const 0119 { 0120 return OnyomiReadings.join(outputListDelimiter); 0121 } 0122 0123 QStringList EntryKanjidic::getOnyomiReadingsList() const 0124 { 0125 return OnyomiReadings; 0126 } 0127 0128 QString EntryKanjidic::getStrokesCount() const 0129 { 0130 return getExtendedInfoItem(QStringLiteral("S")); 0131 } 0132 0133 QString EntryKanjidic::HTMLExtendedInfo(const QString &field) const 0134 { 0135 // qDebug() << field; 0136 return QStringLiteral("<span class=\"ExtendedInfo\">%1: %2</span>").arg(field).arg(ExtendedInfo[field]); 0137 } 0138 0139 /** 0140 * Prepares Readings for output as HTML 0141 */ 0142 QString EntryKanjidic::HTMLReadings() const 0143 { 0144 QString htmlReadings; 0145 htmlReadings += addReadings(originalReadings); 0146 0147 if (InNamesReadings.count() > 0) { 0148 htmlReadings += i18n("In names: "); 0149 htmlReadings += addReadings(InNamesReadings); 0150 } 0151 0152 if (AsRadicalReadings.count() > 0) { 0153 htmlReadings += i18n("As radical: "); 0154 htmlReadings += addReadings(AsRadicalReadings); 0155 } 0156 0157 // get rid of last , 0158 htmlReadings.truncate(htmlReadings.length() - outputListDelimiter.length()); 0159 return QStringLiteral("<span class=\"Readings\">%1</span>").arg(htmlReadings); 0160 } 0161 0162 QString EntryKanjidic::HTMLWord() const 0163 { 0164 return QStringLiteral("<span class=\"Word\">%1</span>").arg(makeLink(Word)); 0165 } 0166 0167 /** 0168 * Fill the fields of our Entry object appropriate to the given 0169 * entry line from Kanjidic. 0170 */ 0171 /* TODO: Error checking */ 0172 bool EntryKanjidic::loadEntry(const QString &entryLine) 0173 { 0174 unsigned int length = entryLine.length(); 0175 0176 /* The loop would be a bit faster if we first grabbed the kanji (2 bytes) and then the 0177 space that follows, etc. for the fixed-space portion of the entries let's try that. 0178 First the first 2 bytes are guaranteed to be our kanji. The 3rd byte is a space. 0179 The 4th through 7th are an ascii representation of the JIS code. One more space 0180 Currently, kana are not detected so readings are anything that is not otherwise 0181 in the 8th position. */ 0182 Word = entryLine.left(1); 0183 // QString strjis = raw.mid( 2, 4 ); 0184 0185 /* variables for the loop */ 0186 QChar ichar; 0187 QString curString; 0188 0189 /* we would need to do these exact things ... many times so here now. */ 0190 #define INCI \ 0191 if (i < length) { \ 0192 i++; \ 0193 ichar = entryLine.at(i); \ 0194 } 0195 #define LOADSTRING(stringToLoad) \ 0196 while (entryLine.at(i) != ' '_L1) { \ 0197 stringToLoad += entryLine.at(i); \ 0198 if (i < length) \ 0199 i++; \ 0200 else \ 0201 break; \ 0202 } 0203 0204 // qDebug() << "LOADSTRING: '" << stringToLoad << "'"; 0205 0206 /* We can start looping at 8 because we have guarantees about the initial 0207 data. This loop is used because the kanjidic format allows the data 0208 to be in any order until the end of the line. The format was designed 0209 such that the data can be identified by the first byte. */ 0210 for (unsigned int i = 7; i < length - 1; i++) { 0211 ichar = entryLine.at(i); 0212 0213 curString = QLatin1String(""); 0214 switch (ichar.unicode()) { 0215 case ' ': 0216 /* as far as I can tell, there is no real rule forcing only 1 space so 0217 there's not really any significance to them. This block is not 0218 reached in kanjidic itself. */ 0219 break; 0220 case 'B': 0221 /* the radical, or busyu, number */ 0222 case 'C': 0223 /* the classical radical number, usually doesn't differ from busyu number */ 0224 case 'E': 0225 /* Henshell's "A Guide To Remembering Japanese Characters" index number */ 0226 case 'F': 0227 /* frequency ranking */ 0228 case 'G': 0229 /* grade level Jouyou 1 - 6 or 8 for common use or 9 for Jinmeiyou */ 0230 case 'H': 0231 /* number from Halpern's New Japanese-English Character Dictionary */ 0232 case 'K': 0233 /* Gakken Kanji Dictionary index */ 0234 case 'L': 0235 /* Heisig's "Remembering The Kanji" index */ 0236 case 'N': 0237 /* number from Nelson's Modern Reader's Japanese-English Character Dictionary */ 0238 case 'O': 0239 /* O'Neill's "Japanese Names" index number */ 0240 case 'P': 0241 /* SKIP code ... #-#-# format */ 0242 case 'Q': 0243 /* Four Corner codes, it seems, can be multiple though I'm tempted just to take the last one. */ 0244 case 'U': 0245 /* unicode which we are ignoring as it is found in another way */ 0246 case 'V': 0247 /* number from Haig's New Nelson Japanese-English Character Dictionary */ 0248 case 'W': 0249 /* korean reading */ 0250 case 'X': 0251 /* I don't entirely understand this field. */ 0252 case 'Y': 0253 /* Pinyin reading */ 0254 case 'Z': 0255 /* SKIP misclassifications */ 0256 0257 /* All of the above are of the format <Char><Data> where <Char> is 0258 exactly 1 character. */ 0259 i++; 0260 LOADSTRING(curString) 0261 ExtendedInfo.insert(QString(ichar), curString); 0262 break; 0263 case 'I': 0264 /* index codes for Spahn & Hadamitzky reference books we need the next 0265 char to know what to do with it. */ 0266 INCI if (ichar == 'N'_L1) 0267 { 0268 /* a Kanji & Kana book number */ 0269 LOADSTRING(curString) 0270 } 0271 else {/* The Kanji Dictionary number, we need the current ichar. */ 0272 LOADSTRING(curString)} ExtendedInfo.insert('I'_L1 + QString(ichar), curString); 0273 break; 0274 case 'M': 0275 /* index and page numbers for Morohashi's Daikanwajiten 2 fields possible */ 0276 INCI if (ichar == 'N'_L1) 0277 { 0278 LOADSTRING(curString) 0279 /* index number */ 0280 } 0281 else if (ichar == 'P'_L1){ 0282 LOADSTRING(curString) 0283 /* page number in volume.page format */ 0284 } ExtendedInfo.insert('M'_L1 + QString(ichar), curString); 0285 break; 0286 case 'S': 0287 /* stroke count: may be multiple. In that case, first is actual, others common 0288 miscounts */ 0289 i++; 0290 if (!ExtendedInfo.contains(QStringLiteral("S"))) { 0291 LOADSTRING(curString) 0292 ExtendedInfo.insert(QString(ichar), curString); 0293 } else { 0294 LOADSTRING(curString) 0295 ExtendedInfo.insert('_'_L1 + QString(ichar), curString); 0296 } 0297 break; 0298 case 'D': 0299 /* dictionary codes */ 0300 INCI LOADSTRING(curString) ExtendedInfo.insert('D'_L1 + QString(ichar), curString); 0301 break; 0302 case '{': 0303 /* This should be starting with the first '{' character of a meaning section. 0304 Let us get take it to the last. */ 0305 INCI while (ichar != '}'_L1) 0306 { 0307 curString += ichar; 0308 /* sanity */ 0309 if (i < length) { 0310 i++; 0311 } else { 0312 break; 0313 } 0314 ichar = entryLine.at(i); 0315 } 0316 INCI 0317 // qDebug() << "Meaning's curString: '" << curString << "'"; 0318 Meanings.append(curString); 0319 break; 0320 case 'T': /* a reading that is used in names for T1, radical names for T2 */ 0321 { 0322 i++; 0323 LOADSTRING(curString) 0324 // Get the type number (1 for T1, 2 for T2). 0325 int type = curString.toInt(); 0326 bool finished = false; 0327 while (!finished) { 0328 // Skip all whitespaces. 0329 INCI while (ichar == ' '_L1) 0330 { 0331 INCI 0332 } 0333 // Check if the current character is Kana. 0334 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF) { 0335 // Reset our variable and load it with 0336 // all available kana until we find a whitespace. 0337 curString = QLatin1String(""); 0338 LOADSTRING(curString) 0339 switch (type) { 0340 case 1: // Special reading used in names. 0341 InNamesReadings.append(curString); 0342 break; 0343 case 2: // Reading as radical. 0344 AsRadicalReadings.append(curString); 0345 break; 0346 } 0347 } else { 0348 // There are not more kana characters, 0349 // so we finish this loop for now. 0350 finished = true; 0351 } 0352 } 0353 // Now 'i' points to a '{' character. We decrease its value 0354 // so in the next loop we can reach the "case '{'" section. 0355 i--; 0356 } break; 0357 case '-': 0358 /* a reading that is only in postposition */ 0359 /* any of those 2 signals a reading is to ensue. */ 0360 LOADSTRING(curString) 0361 originalReadings.append(curString); 0362 0363 // If it is Hiragana (Kunyomi) 0364 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F) { 0365 KunyomiReadings.append(curString); 0366 } 0367 // If it is Katakana (Onyomi) 0368 else if (0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF) { 0369 OnyomiReadings.append(curString); 0370 } 0371 0372 curString = curString.remove('-'_L1).remove('.'_L1); 0373 Readings.append(curString); 0374 break; 0375 default: 0376 /* either a character we don't address or a problem...we should ignore it */ 0377 // qDebug() << "hit default in kanji parser. Unicode: '" << ichar.unicode() << "'"; 0378 0379 /* This should detect unicode kana */ 0380 // Hiragana 0x3040 - 0x309F, Katakana: 0x30A0 - 0x30FF 0381 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x30FF) { 0382 LOADSTRING(curString) 0383 originalReadings.append(curString); 0384 0385 // If it is Hiragana (Kunyomi) 0386 if (0x3040 <= ichar.unicode() && ichar.unicode() <= 0x309F) { 0387 KunyomiReadings.append(curString); 0388 } 0389 // If it is Katakana (Onyomi) 0390 else if (0x30A0 <= ichar.unicode() && ichar.unicode() <= 0x30FF) { 0391 OnyomiReadings.append(curString); 0392 } 0393 0394 curString = curString.remove('-'_L1).remove('.'_L1); 0395 Readings.append(curString); 0396 break; 0397 } 0398 /* if it's not a kana reading ... it is something unhandled ... 0399 possibly a new field in kanjidic. Let's treat it as the 0400 oh-so-common <char><data> type of entry. It could be hotly 0401 debated what we should actually do about these. */ 0402 i++; 0403 LOADSTRING(curString) 0404 ExtendedInfo.insert(QString(ichar), curString); 0405 0406 break; 0407 } 0408 } 0409 // qDebug() << "Parsed: '"<<Word<<"' ("<<Readings.join("^")<<") \""<< 0410 // Meanings.join("|")<<"\ and " <<ExtendedInfo.keys() << " from :"<<entryLine<<endl; 0411 0412 return true; 0413 } 0414 0415 QString EntryKanjidic::makeReadingLink(const QString &inReading) const 0416 { 0417 QString reading = inReading; 0418 return QStringLiteral("<a href=\"%1\">%2</a>").arg(reading.remove('.'_L1).remove('-'_L1)).arg(inReading); 0419 } 0420 0421 /** 0422 * Returns a HTML version of an Entry 0423 */ 0424 QString EntryKanjidic::toHTML() const 0425 { 0426 QString result = QStringLiteral("<div class=\"KanjidicBrief\">"); 0427 0428 for (const QString &field : QSTRINGLISTCHECK(DictFileKanjidic::displayFields)) { 0429 // qDebug() << "Display: "<<field; 0430 if (field == QLatin1String("--NewLine--")) 0431 result += QLatin1String("<br>"); 0432 else if (field == QLatin1String("Word/Kanji")) 0433 result += HTMLWord() + ' '_L1; 0434 else if (field == QLatin1String("Meaning")) 0435 result += HTMLMeanings() + ' '_L1; 0436 else if (field == QLatin1String("Reading")) 0437 result += HTMLReadings() + ' '_L1; 0438 else if (ExtendedInfo.contains(field)) 0439 result += HTMLExtendedInfo(field) + ' '_L1; 0440 } 0441 0442 result += QLatin1String("</div>"); 0443 return result; 0444 }