File indexing completed on 2024-05-19 05:05:32
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2022 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "encoderlatex.h" 0021 0022 #include <QString> 0023 #include <QStack> 0024 0025 #include "logging_io.h" 0026 0027 inline bool isAsciiLetter(const QChar c) { 0028 static const ushort upperCaseLetterA = QLatin1Char('A').unicode(); 0029 static const ushort upperCaseLetterZ = QLatin1Char('Z').unicode(); 0030 static const ushort lowerCaseLetterA = QLatin1Char('a').unicode(); 0031 static const ushort lowerCaseLetterZ = QLatin1Char('z').unicode(); 0032 const ushort unicode = c.unicode(); 0033 return (unicode >= upperCaseLetterA && unicode <= upperCaseLetterZ) || (unicode >= lowerCaseLetterA && unicode <= lowerCaseLetterZ); 0034 } 0035 0036 inline int asciiLetterOrDigitToPos(const QChar c) { 0037 static const ushort upperCaseLetterA = QLatin1Char('A').unicode(); 0038 static const ushort upperCaseLetterZ = QLatin1Char('Z').unicode(); 0039 static const ushort lowerCaseLetterA = QLatin1Char('a').unicode(); 0040 static const ushort lowerCaseLetterZ = QLatin1Char('z').unicode(); 0041 static const ushort digit0 = QLatin1Char('0').unicode(); 0042 static const ushort digit9 = QLatin1Char('9').unicode(); 0043 const ushort unicode = c.unicode(); 0044 if (unicode >= upperCaseLetterA && unicode <= upperCaseLetterZ) return unicode - upperCaseLetterA; 0045 else if (unicode >= lowerCaseLetterA && unicode <= lowerCaseLetterZ) return unicode + 26 - lowerCaseLetterA; 0046 else if (unicode >= digit0 && unicode <= digit9) return unicode + 52 - digit0; 0047 else return -1; 0048 } 0049 0050 inline bool isIJ(const QChar c) { 0051 static const QChar upperCaseLetterI = QLatin1Char('I'); 0052 static const QChar upperCaseLetterJ = QLatin1Char('J'); 0053 static const QChar lowerCaseLetterI = QLatin1Char('i'); 0054 static const QChar lowerCaseLetterJ = QLatin1Char('j'); 0055 return c == upperCaseLetterI || c == upperCaseLetterJ || c == lowerCaseLetterI || c == lowerCaseLetterJ; 0056 } 0057 0058 enum EncoderLaTeXCommandDirection { 0059 DirectionCommandToUnicode = 1, //< A mapping between command and unicode value may be used in the direction from command to unicode value 0060 DirectionUnicodeToCommand = 2, //< A mapping between command and unicode value may be used in the direction from unicode value to command 0061 DirectionBoth = DirectionCommandToUnicode | DirectionUnicodeToCommand 0062 }; 0063 0064 /** 0065 * General documentation on this topic: 0066 * https://www.latex-project.org/help/documentation/encguide.pdf 0067 * https://mirror.hmc.edu/ctan/macros/xetex/latex/xecjk/xunicode-symbols.pdf 0068 * ftp://ftp.dante.de/tex-archive/biblio/biber/documentation/utf8-macro-map.html 0069 */ 0070 0071 /** 0072 * This structure contains information how escaped characters 0073 * such as \"a are translated to an Unicode character and back. 0074 * The structure is a table with three columns: (1) the modifier 0075 * (in the example before the quotation mark) (2) the ASCII 0076 * character ((in the example before the 'a') (3) the Unicode 0077 * character described by a hexcode. 0078 * This data structure is used both directly and indirectly via 0079 * the LookupTable structure which is initialized when the 0080 * EncoderLaTeX object is created. 0081 */ 0082 static const struct EncoderLaTeXEscapedCharacter { 0083 const QChar modifier; 0084 const QChar letter; 0085 const ushort unicode; 0086 const EncoderLaTeXCommandDirection direction; 0087 } 0088 encoderLaTeXEscapedCharacters[] = { 0089 {QLatin1Char('`'), QLatin1Char('A'), 0x00C0, DirectionBoth}, 0090 {QLatin1Char('\''), QLatin1Char('A'), 0x00C1, DirectionBoth}, 0091 {QLatin1Char('^'), QLatin1Char('A'), 0x00C2, DirectionBoth}, 0092 {QLatin1Char('~'), QLatin1Char('A'), 0x00C3, DirectionBoth}, 0093 {QLatin1Char('"'), QLatin1Char('A'), 0x00C4, DirectionBoth}, 0094 {QLatin1Char('r'), QLatin1Char('A'), 0x00C5, DirectionBoth}, 0095 /** 0x00C6: see EncoderLaTeXCharacterCommand */ 0096 {QLatin1Char('c'), QLatin1Char('C'), 0x00C7, DirectionBoth}, 0097 {QLatin1Char('`'), QLatin1Char('E'), 0x00C8, DirectionBoth}, 0098 {QLatin1Char('\''), QLatin1Char('E'), 0x00C9, DirectionBoth}, 0099 {QLatin1Char('^'), QLatin1Char('E'), 0x00CA, DirectionBoth}, 0100 {QLatin1Char('"'), QLatin1Char('E'), 0x00CB, DirectionBoth}, 0101 {QLatin1Char('`'), QLatin1Char('I'), 0x00CC, DirectionBoth}, 0102 {QLatin1Char('\''), QLatin1Char('I'), 0x00CD, DirectionBoth}, 0103 {QLatin1Char('^'), QLatin1Char('I'), 0x00CE, DirectionBoth}, 0104 {QLatin1Char('"'), QLatin1Char('I'), 0x00CF, DirectionBoth}, 0105 /** 0x00D0: see EncoderLaTeXCharacterCommand */ 0106 {QLatin1Char('~'), QLatin1Char('N'), 0x00D1, DirectionBoth}, 0107 {QLatin1Char('`'), QLatin1Char('O'), 0x00D2, DirectionBoth}, 0108 {QLatin1Char('\''), QLatin1Char('O'), 0x00D3, DirectionBoth}, 0109 {QLatin1Char('^'), QLatin1Char('O'), 0x00D4, DirectionBoth}, 0110 {QLatin1Char('~'), QLatin1Char('O'), 0x00D5, DirectionBoth}, 0111 {QLatin1Char('"'), QLatin1Char('O'), 0x00D6, DirectionBoth}, 0112 /** 0x00D7: see EncoderLaTeXCharacterCommand */ 0113 /** 0x00D8: see EncoderLaTeXCharacterCommand */ 0114 {QLatin1Char('`'), QLatin1Char('U'), 0x00D9, DirectionBoth}, 0115 {QLatin1Char('\''), QLatin1Char('U'), 0x00DA, DirectionBoth}, 0116 {QLatin1Char('^'), QLatin1Char('U'), 0x00DB, DirectionBoth}, 0117 {QLatin1Char('"'), QLatin1Char('U'), 0x00DC, DirectionBoth}, 0118 {QLatin1Char('\''), QLatin1Char('Y'), 0x00DD, DirectionBoth}, 0119 /** 0x00DE: see EncoderLaTeXCharacterCommand */ 0120 {QLatin1Char('"'), QLatin1Char('s'), 0x00DF, DirectionBoth}, 0121 {QLatin1Char('`'), QLatin1Char('a'), 0x00E0, DirectionBoth}, 0122 {QLatin1Char('\''), QLatin1Char('a'), 0x00E1, DirectionBoth}, 0123 {QLatin1Char('^'), QLatin1Char('a'), 0x00E2, DirectionBoth}, 0124 {QLatin1Char('~'), QLatin1Char('a'), 0x00E3, DirectionBoth}, 0125 {QLatin1Char('"'), QLatin1Char('a'), 0x00E4, DirectionBoth}, 0126 {QLatin1Char('r'), QLatin1Char('a'), 0x00E5, DirectionBoth}, 0127 /** 0x00E6: see EncoderLaTeXCharacterCommand */ 0128 {QLatin1Char('c'), QLatin1Char('c'), 0x00E7, DirectionBoth}, 0129 {QLatin1Char('`'), QLatin1Char('e'), 0x00E8, DirectionBoth}, 0130 {QLatin1Char('\''), QLatin1Char('e'), 0x00E9, DirectionBoth}, 0131 {QLatin1Char('^'), QLatin1Char('e'), 0x00EA, DirectionBoth}, 0132 {QLatin1Char('"'), QLatin1Char('e'), 0x00EB, DirectionBoth}, 0133 {QLatin1Char('`'), QLatin1Char('i'), 0x00EC, DirectionBoth}, 0134 {QLatin1Char('\''), QLatin1Char('i'), 0x00ED, DirectionBoth}, 0135 {QLatin1Char('^'), QLatin1Char('i'), 0x00EE, DirectionBoth}, 0136 {QLatin1Char('"'), QLatin1Char('i'), 0x00EF, DirectionBoth}, 0137 /** 0x00F0: see EncoderLaTeXCharacterCommand */ 0138 {QLatin1Char('~'), QLatin1Char('n'), 0x00F1, DirectionBoth}, 0139 {QLatin1Char('`'), QLatin1Char('o'), 0x00F2, DirectionBoth}, 0140 {QLatin1Char('\''), QLatin1Char('o'), 0x00F3, DirectionBoth}, 0141 {QLatin1Char('^'), QLatin1Char('o'), 0x00F4, DirectionBoth}, 0142 {QLatin1Char('~'), QLatin1Char('o'), 0x00F5, DirectionBoth}, 0143 {QLatin1Char('"'), QLatin1Char('o'), 0x00F6, DirectionBoth}, 0144 /** 0x00F7: see EncoderLaTeXCharacterCommand */ 0145 /** 0x00F8: see EncoderLaTeXCharacterCommand */ 0146 {QLatin1Char('`'), QLatin1Char('u'), 0x00F9, DirectionBoth}, 0147 {QLatin1Char('\''), QLatin1Char('u'), 0x00FA, DirectionBoth}, 0148 {QLatin1Char('^'), QLatin1Char('u'), 0x00FB, DirectionBoth}, 0149 {QLatin1Char('"'), QLatin1Char('u'), 0x00FC, DirectionBoth}, 0150 {QLatin1Char('\''), QLatin1Char('y'), 0x00FD, DirectionBoth}, 0151 /** 0x00FE: see EncoderLaTeXCharacterCommand */ 0152 {QLatin1Char('"'), QLatin1Char('y'), 0x00FF, DirectionBoth}, 0153 {QLatin1Char('='), QLatin1Char('A'), 0x0100, DirectionBoth}, 0154 {QLatin1Char('='), QLatin1Char('a'), 0x0101, DirectionBoth}, 0155 {QLatin1Char('u'), QLatin1Char('A'), 0x0102, DirectionBoth}, 0156 {QLatin1Char('u'), QLatin1Char('a'), 0x0103, DirectionBoth}, 0157 {QLatin1Char('k'), QLatin1Char('A'), 0x0104, DirectionBoth}, 0158 {QLatin1Char('k'), QLatin1Char('a'), 0x0105, DirectionBoth}, 0159 {QLatin1Char('\''), QLatin1Char('C'), 0x0106, DirectionBoth}, 0160 {QLatin1Char('\''), QLatin1Char('c'), 0x0107, DirectionBoth}, 0161 {QLatin1Char('^'), QLatin1Char('C'), 0x0108, DirectionBoth}, 0162 {QLatin1Char('^'), QLatin1Char('c'), 0x0109, DirectionBoth}, 0163 {QLatin1Char('.'), QLatin1Char('C'), 0x010A, DirectionBoth}, 0164 {QLatin1Char('.'), QLatin1Char('c'), 0x010B, DirectionBoth}, 0165 {QLatin1Char('v'), QLatin1Char('C'), 0x010C, DirectionBoth}, 0166 {QLatin1Char('v'), QLatin1Char('c'), 0x010D, DirectionBoth}, 0167 {QLatin1Char('v'), QLatin1Char('D'), 0x010E, DirectionBoth}, 0168 {QLatin1Char('v'), QLatin1Char('d'), 0x010F, DirectionBoth}, 0169 {QLatin1Char('B'), QLatin1Char('D'), 0x0110, DirectionCommandToUnicode}, //< 'African D', command provided by package 'fc' (command seems to be the same as \M{D}) 0170 {QLatin1Char('B'), QLatin1Char('d'), 0x0111, DirectionCommandToUnicode}, //< 'African d' (?), command provided by package 'fc' 0171 {QLatin1Char('='), QLatin1Char('E'), 0x0112, DirectionBoth}, 0172 {QLatin1Char('='), QLatin1Char('e'), 0x0113, DirectionBoth}, 0173 {QLatin1Char('u'), QLatin1Char('E'), 0x0114, DirectionBoth}, 0174 {QLatin1Char('u'), QLatin1Char('e'), 0x0115, DirectionBoth}, 0175 {QLatin1Char('.'), QLatin1Char('E'), 0x0116, DirectionBoth}, 0176 {QLatin1Char('.'), QLatin1Char('e'), 0x0117, DirectionBoth}, 0177 {QLatin1Char('k'), QLatin1Char('E'), 0x0118, DirectionBoth}, 0178 {QLatin1Char('k'), QLatin1Char('e'), 0x0119, DirectionBoth}, 0179 {QLatin1Char('v'), QLatin1Char('E'), 0x011A, DirectionBoth}, 0180 {QLatin1Char('v'), QLatin1Char('e'), 0x011B, DirectionBoth}, 0181 {QLatin1Char('^'), QLatin1Char('G'), 0x011C, DirectionBoth}, 0182 {QLatin1Char('^'), QLatin1Char('g'), 0x011D, DirectionBoth}, 0183 {QLatin1Char('u'), QLatin1Char('G'), 0x011E, DirectionBoth}, 0184 {QLatin1Char('u'), QLatin1Char('g'), 0x011F, DirectionBoth}, 0185 {QLatin1Char('.'), QLatin1Char('G'), 0x0120, DirectionBoth}, 0186 {QLatin1Char('.'), QLatin1Char('g'), 0x0121, DirectionBoth}, 0187 {QLatin1Char('c'), QLatin1Char('G'), 0x0122, DirectionBoth}, 0188 {QLatin1Char('c'), QLatin1Char('g'), 0x0123, DirectionBoth}, 0189 {QLatin1Char('^'), QLatin1Char('H'), 0x0124, DirectionBoth}, 0190 {QLatin1Char('^'), QLatin1Char('h'), 0x0125, DirectionBoth}, 0191 {QLatin1Char('B'), QLatin1Char('H'), 0x0126, DirectionCommandToUnicode}, 0192 {QLatin1Char('B'), QLatin1Char('h'), 0x0127, DirectionCommandToUnicode}, 0193 {QLatin1Char('~'), QLatin1Char('I'), 0x0128, DirectionBoth}, 0194 {QLatin1Char('~'), QLatin1Char('i'), 0x0129, DirectionBoth}, 0195 {QLatin1Char('='), QLatin1Char('I'), 0x012A, DirectionBoth}, 0196 {QLatin1Char('='), QLatin1Char('i'), 0x012B, DirectionBoth}, 0197 {QLatin1Char('u'), QLatin1Char('I'), 0x012C, DirectionBoth}, 0198 {QLatin1Char('u'), QLatin1Char('i'), 0x012D, DirectionBoth}, 0199 {QLatin1Char('k'), QLatin1Char('I'), 0x012E, DirectionBoth}, 0200 {QLatin1Char('k'), QLatin1Char('i'), 0x012F, DirectionBoth}, 0201 {QLatin1Char('.'), QLatin1Char('I'), 0x0130, DirectionBoth}, 0202 /** 0x0131: see EncoderLaTeXCharacterCommand */ 0203 /** 0x0132: see EncoderLaTeXCharacterCommand */ 0204 /** 0x0133: see EncoderLaTeXCharacterCommand */ 0205 {QLatin1Char('^'), QLatin1Char('J'), 0x012E, DirectionBoth}, 0206 {QLatin1Char('^'), QLatin1Char('j'), 0x012F, DirectionBoth}, 0207 {QLatin1Char('c'), QLatin1Char('K'), 0x0136, DirectionBoth}, 0208 {QLatin1Char('c'), QLatin1Char('k'), 0x0137, DirectionBoth}, 0209 /** 0x0138: see EncoderLaTeXCharacterCommand */ 0210 {QLatin1Char('\''), QLatin1Char('L'), 0x0139, DirectionBoth}, 0211 {QLatin1Char('\''), QLatin1Char('l'), 0x013A, DirectionBoth}, 0212 {QLatin1Char('c'), QLatin1Char('L'), 0x013B, DirectionBoth}, 0213 {QLatin1Char('c'), QLatin1Char('l'), 0x013C, DirectionBoth}, 0214 {QLatin1Char('v'), QLatin1Char('L'), 0x013D, DirectionBoth}, 0215 {QLatin1Char('v'), QLatin1Char('l'), 0x013E, DirectionBoth}, 0216 {QLatin1Char('.'), QLatin1Char('L'), 0x013F, DirectionBoth}, 0217 {QLatin1Char('.'), QLatin1Char('l'), 0x0140, DirectionBoth}, 0218 {QLatin1Char('B'), QLatin1Char('L'), 0x0141, DirectionCommandToUnicode}, 0219 {QLatin1Char('B'), QLatin1Char('l'), 0x0142, DirectionCommandToUnicode}, 0220 {QLatin1Char('\''), QLatin1Char('N'), 0x0143, DirectionBoth}, 0221 {QLatin1Char('\''), QLatin1Char('n'), 0x0144, DirectionBoth}, 0222 {QLatin1Char('c'), QLatin1Char('n'), 0x0145, DirectionBoth}, 0223 {QLatin1Char('c'), QLatin1Char('n'), 0x0146, DirectionBoth}, 0224 {QLatin1Char('v'), QLatin1Char('N'), 0x0147, DirectionBoth}, 0225 {QLatin1Char('v'), QLatin1Char('n'), 0x0148, DirectionBoth}, 0226 /** 0x0149: TODO n preceded by apostrophe */ 0227 {QLatin1Char('m'), QLatin1Char('N'), 0x014A, DirectionCommandToUnicode}, 0228 {QLatin1Char('m'), QLatin1Char('n'), 0x014B, DirectionCommandToUnicode}, 0229 {QLatin1Char('='), QLatin1Char('O'), 0x014C, DirectionBoth}, 0230 {QLatin1Char('='), QLatin1Char('o'), 0x014D, DirectionBoth}, 0231 {QLatin1Char('u'), QLatin1Char('O'), 0x014E, DirectionBoth}, 0232 {QLatin1Char('u'), QLatin1Char('o'), 0x014F, DirectionBoth}, 0233 {QLatin1Char('H'), QLatin1Char('O'), 0x0150, DirectionBoth}, 0234 {QLatin1Char('H'), QLatin1Char('o'), 0x0151, DirectionBoth}, 0235 /** 0x0152: see EncoderLaTeXCharacterCommand */ 0236 /** 0x0153: see EncoderLaTeXCharacterCommand */ 0237 {QLatin1Char('\''), QLatin1Char('R'), 0x0154, DirectionBoth}, 0238 {QLatin1Char('\''), QLatin1Char('r'), 0x0155, DirectionBoth}, 0239 {QLatin1Char('c'), QLatin1Char('R'), 0x0156, DirectionBoth}, 0240 {QLatin1Char('c'), QLatin1Char('r'), 0x0157, DirectionBoth}, 0241 {QLatin1Char('v'), QLatin1Char('R'), 0x0158, DirectionBoth}, 0242 {QLatin1Char('v'), QLatin1Char('r'), 0x0159, DirectionBoth}, 0243 {QLatin1Char('\''), QLatin1Char('S'), 0x015A, DirectionBoth}, 0244 {QLatin1Char('\''), QLatin1Char('s'), 0x015B, DirectionBoth}, 0245 {QLatin1Char('^'), QLatin1Char('S'), 0x015C, DirectionBoth}, 0246 {QLatin1Char('^'), QLatin1Char('s'), 0x015D, DirectionBoth}, 0247 {QLatin1Char('c'), QLatin1Char('S'), 0x015E, DirectionBoth}, 0248 {QLatin1Char('c'), QLatin1Char('s'), 0x015F, DirectionBoth}, 0249 {QLatin1Char('v'), QLatin1Char('S'), 0x0160, DirectionBoth}, 0250 {QLatin1Char('v'), QLatin1Char('s'), 0x0161, DirectionBoth}, 0251 {QLatin1Char('c'), QLatin1Char('T'), 0x0162, DirectionBoth}, 0252 {QLatin1Char('c'), QLatin1Char('t'), 0x0163, DirectionBoth}, 0253 {QLatin1Char('v'), QLatin1Char('T'), 0x0164, DirectionBoth}, 0254 {QLatin1Char('v'), QLatin1Char('t'), 0x0165, DirectionBoth}, 0255 {QLatin1Char('B'), QLatin1Char('T'), 0x0166, DirectionCommandToUnicode}, 0256 {QLatin1Char('B'), QLatin1Char('t'), 0x0167, DirectionCommandToUnicode}, 0257 {QLatin1Char('~'), QLatin1Char('U'), 0x0168, DirectionBoth}, 0258 {QLatin1Char('~'), QLatin1Char('u'), 0x0169, DirectionBoth}, 0259 {QLatin1Char('='), QLatin1Char('U'), 0x016A, DirectionBoth}, 0260 {QLatin1Char('='), QLatin1Char('u'), 0x016B, DirectionBoth}, 0261 {QLatin1Char('u'), QLatin1Char('U'), 0x016C, DirectionBoth}, 0262 {QLatin1Char('u'), QLatin1Char('u'), 0x016D, DirectionBoth}, 0263 {QLatin1Char('r'), QLatin1Char('U'), 0x016E, DirectionBoth}, 0264 {QLatin1Char('r'), QLatin1Char('u'), 0x016F, DirectionBoth}, 0265 {QLatin1Char('H'), QLatin1Char('U'), 0x0170, DirectionBoth}, 0266 {QLatin1Char('H'), QLatin1Char('u'), 0x0171, DirectionBoth}, 0267 {QLatin1Char('k'), QLatin1Char('U'), 0x0172, DirectionBoth}, 0268 {QLatin1Char('k'), QLatin1Char('u'), 0x0173, DirectionBoth}, 0269 {QLatin1Char('^'), QLatin1Char('W'), 0x0174, DirectionBoth}, 0270 {QLatin1Char('^'), QLatin1Char('w'), 0x0175, DirectionBoth}, 0271 {QLatin1Char('^'), QLatin1Char('Y'), 0x0176, DirectionBoth}, 0272 {QLatin1Char('^'), QLatin1Char('y'), 0x0177, DirectionBoth}, 0273 {QLatin1Char('"'), QLatin1Char('Y'), 0x0178, DirectionBoth}, 0274 {QLatin1Char('\''), QLatin1Char('Z'), 0x0179, DirectionBoth}, 0275 {QLatin1Char('\''), QLatin1Char('z'), 0x017A, DirectionBoth}, 0276 {QLatin1Char('.'), QLatin1Char('Z'), 0x017B, DirectionBoth}, 0277 {QLatin1Char('.'), QLatin1Char('z'), 0x017C, DirectionBoth}, 0278 {QLatin1Char('v'), QLatin1Char('Z'), 0x017D, DirectionBoth}, 0279 {QLatin1Char('v'), QLatin1Char('z'), 0x017E, DirectionBoth}, 0280 /** 0x017F: TODO long s */ 0281 {QLatin1Char('B'), QLatin1Char('b'), 0x0180, DirectionCommandToUnicode}, 0282 {QLatin1Char('m'), QLatin1Char('B'), 0x0181, DirectionCommandToUnicode}, 0283 /** 0x0182 */ 0284 /** 0x0183 */ 0285 /** 0x0184 */ 0286 /** 0x0185 */ 0287 {QLatin1Char('m'), QLatin1Char('O'), 0x0186, DirectionCommandToUnicode}, 0288 {QLatin1Char('m'), QLatin1Char('C'), 0x0187, DirectionCommandToUnicode}, 0289 {QLatin1Char('m'), QLatin1Char('c'), 0x0188, DirectionCommandToUnicode}, 0290 {QLatin1Char('M'), QLatin1Char('D'), 0x0189, DirectionBoth}, //< 'African D', command provided by package 'fc' (command seems to be the same as \B{D}) 0291 {QLatin1Char('m'), QLatin1Char('D'), 0x018A, DirectionCommandToUnicode}, 0292 /** 0x018B */ 0293 /** 0x018C */ 0294 /** 0x018D */ 0295 {QLatin1Char('M'), QLatin1Char('E'), 0x018E, DirectionCommandToUnicode}, 0296 /** 0x018F */ 0297 {QLatin1Char('m'), QLatin1Char('E'), 0x0190, DirectionCommandToUnicode}, 0298 {QLatin1Char('m'), QLatin1Char('F'), 0x0191, DirectionCommandToUnicode}, 0299 {QLatin1Char('m'), QLatin1Char('f'), 0x0192, DirectionCommandToUnicode}, 0300 /** 0x0193 */ 0301 {QLatin1Char('m'), QLatin1Char('G'), 0x0194, DirectionCommandToUnicode}, 0302 /** 0x0195: see EncoderLaTeXCharacterCommand */ 0303 {QLatin1Char('m'), QLatin1Char('I'), 0x0196, DirectionCommandToUnicode}, 0304 {QLatin1Char('B'), QLatin1Char('I'), 0x0197, DirectionCommandToUnicode}, 0305 {QLatin1Char('m'), QLatin1Char('K'), 0x0198, DirectionCommandToUnicode}, 0306 {QLatin1Char('m'), QLatin1Char('k'), 0x0199, DirectionCommandToUnicode}, 0307 {QLatin1Char('B'), QLatin1Char('l'), 0x019A, DirectionCommandToUnicode}, 0308 /** 0x019B */ 0309 /** 0x019C */ 0310 {QLatin1Char('m'), QLatin1Char('J'), 0x019D, DirectionCommandToUnicode}, 0311 /** 0x019E */ 0312 /** 0x019F */ 0313 /** 0x01A0 */ 0314 /** 0x01A1 */ 0315 /** 0x01A2 */ 0316 /** 0x01A3 */ 0317 {QLatin1Char('m'), QLatin1Char('P'), 0x01A4, DirectionCommandToUnicode}, 0318 {QLatin1Char('m'), QLatin1Char('p'), 0x01A5, DirectionCommandToUnicode}, 0319 /** 0x01A6 */ 0320 /** 0x01A7 */ 0321 /** 0x01A8 */ 0322 /** 0x01A9: see EncoderLaTeXCharacterCommand */ 0323 /** 0x01AA */ 0324 /** 0x01AB */ 0325 {QLatin1Char('m'), QLatin1Char('T'), 0x01AC, DirectionCommandToUnicode}, 0326 {QLatin1Char('m'), QLatin1Char('t'), 0x01AD, DirectionCommandToUnicode}, 0327 {QLatin1Char('M'), QLatin1Char('T'), 0x01AE, DirectionCommandToUnicode}, 0328 /** 0x01AF */ 0329 /** 0x01B0 */ 0330 {QLatin1Char('m'), QLatin1Char('U'), 0x01B1, DirectionCommandToUnicode}, 0331 {QLatin1Char('m'), QLatin1Char('V'), 0x01B2, DirectionCommandToUnicode}, 0332 {QLatin1Char('m'), QLatin1Char('Y'), 0x01B3, DirectionCommandToUnicode}, 0333 {QLatin1Char('m'), QLatin1Char('y'), 0x01B4, DirectionCommandToUnicode}, 0334 {QLatin1Char('B'), QLatin1Char('Z'), 0x01B5, DirectionCommandToUnicode}, 0335 {QLatin1Char('B'), QLatin1Char('z'), 0x01B6, DirectionCommandToUnicode}, 0336 {QLatin1Char('m'), QLatin1Char('Z'), 0x01B7, DirectionCommandToUnicode}, 0337 /** 0x01B8 */ 0338 /** 0x01B9 */ 0339 /** 0x01BA */ 0340 {QLatin1Char('B'), QLatin1Char('2'), 0x01BB, DirectionCommandToUnicode}, 0341 /** 0x01BC */ 0342 /** 0x01BD */ 0343 /** 0x01BE */ 0344 /** 0x01BF */ 0345 /** 0x01C0 */ 0346 /** 0x01C1 */ 0347 /** 0x01C2 */ 0348 /** 0x01C3 */ 0349 /** 0x01C4 */ 0350 /** 0x01C5 */ 0351 /** 0x01C6 */ 0352 /** 0x01C7 */ 0353 /** 0x01C8 */ 0354 /** 0x01C9 */ 0355 /** 0x01CA */ 0356 /** 0x01CB */ 0357 /** 0x01CC */ 0358 {QLatin1Char('v'), QLatin1Char('A'), 0x01CD, DirectionBoth}, 0359 {QLatin1Char('v'), QLatin1Char('a'), 0x01CE, DirectionBoth}, 0360 {QLatin1Char('v'), QLatin1Char('G'), 0x01E6, DirectionBoth}, 0361 {QLatin1Char('v'), QLatin1Char('g'), 0x01E7, DirectionBoth}, 0362 {QLatin1Char('k'), QLatin1Char('O'), 0x01EA, DirectionBoth}, 0363 {QLatin1Char('k'), QLatin1Char('o'), 0x01EB, DirectionBoth}, 0364 {QLatin1Char('\''), QLatin1Char('F'), 0x01F4, DirectionBoth}, 0365 {QLatin1Char('\''), QLatin1Char('f'), 0x01F5, DirectionBoth}, 0366 {QLatin1Char('.'), QLatin1Char('A'), 0x0226, DirectionBoth}, 0367 {QLatin1Char('.'), QLatin1Char('a'), 0x0227, DirectionBoth}, 0368 {QLatin1Char('c'), QLatin1Char('E'), 0x0228, DirectionBoth}, 0369 {QLatin1Char('c'), QLatin1Char('e'), 0x0229, DirectionBoth}, 0370 {QLatin1Char('='), QLatin1Char('Y'), 0x0232, DirectionBoth}, 0371 {QLatin1Char('='), QLatin1Char('y'), 0x0233, DirectionBoth}, 0372 {QLatin1Char('.'), QLatin1Char('O'), 0x022E, DirectionBoth}, 0373 {QLatin1Char('.'), QLatin1Char('o'), 0x022F, DirectionBoth}, 0374 {QLatin1Char('M'), QLatin1Char('d'), 0x0256, DirectionBoth}, //< 'African d', command provided by package 'fc' (may be same as \B{d} ?) 0375 {QLatin1Char('.'), QLatin1Char('B'), 0x1E02, DirectionBoth}, 0376 {QLatin1Char('.'), QLatin1Char('b'), 0x1E03, DirectionBoth}, 0377 {QLatin1Char('d'), QLatin1Char('B'), 0x1E04, DirectionBoth}, 0378 {QLatin1Char('d'), QLatin1Char('b'), 0x1E05, DirectionBoth}, 0379 {QLatin1Char('.'), QLatin1Char('D'), 0x1E0A, DirectionBoth}, 0380 {QLatin1Char('.'), QLatin1Char('d'), 0x1E0B, DirectionBoth}, 0381 {QLatin1Char('d'), QLatin1Char('D'), 0x1E0C, DirectionBoth}, 0382 {QLatin1Char('d'), QLatin1Char('d'), 0x1E0D, DirectionBoth}, 0383 {QLatin1Char('c'), QLatin1Char('D'), 0x1E10, DirectionBoth}, 0384 {QLatin1Char('c'), QLatin1Char('d'), 0x1E11, DirectionBoth}, 0385 {QLatin1Char('.'), QLatin1Char('E'), 0x1E1E, DirectionBoth}, 0386 {QLatin1Char('.'), QLatin1Char('e'), 0x1E1F, DirectionBoth}, 0387 {QLatin1Char('.'), QLatin1Char('H'), 0x1E22, DirectionBoth}, 0388 {QLatin1Char('.'), QLatin1Char('h'), 0x1E23, DirectionBoth}, 0389 {QLatin1Char('d'), QLatin1Char('H'), 0x1E24, DirectionBoth}, 0390 {QLatin1Char('d'), QLatin1Char('h'), 0x1E25, DirectionBoth}, 0391 {QLatin1Char('"'), QLatin1Char('H'), 0x1E26, DirectionBoth}, 0392 {QLatin1Char('"'), QLatin1Char('h'), 0x1E27, DirectionBoth}, 0393 {QLatin1Char('c'), QLatin1Char('H'), 0x1E28, DirectionBoth}, 0394 {QLatin1Char('c'), QLatin1Char('h'), 0x1E29, DirectionBoth}, 0395 {QLatin1Char('d'), QLatin1Char('K'), 0x1E32, DirectionBoth}, 0396 {QLatin1Char('d'), QLatin1Char('k'), 0x1E33, DirectionBoth}, 0397 {QLatin1Char('d'), QLatin1Char('L'), 0x1E36, DirectionBoth}, 0398 {QLatin1Char('d'), QLatin1Char('l'), 0x1E37, DirectionBoth}, 0399 {QLatin1Char('.'), QLatin1Char('M'), 0x1E40, DirectionBoth}, 0400 {QLatin1Char('.'), QLatin1Char('m'), 0x1E41, DirectionBoth}, 0401 {QLatin1Char('d'), QLatin1Char('M'), 0x1E42, DirectionBoth}, 0402 {QLatin1Char('d'), QLatin1Char('m'), 0x1E43, DirectionBoth}, 0403 {QLatin1Char('.'), QLatin1Char('N'), 0x1E44, DirectionBoth}, 0404 {QLatin1Char('.'), QLatin1Char('n'), 0x1E45, DirectionBoth}, 0405 {QLatin1Char('.'), QLatin1Char('N'), 0x1E46, DirectionBoth}, 0406 {QLatin1Char('.'), QLatin1Char('n'), 0x1E47, DirectionBoth}, 0407 {QLatin1Char('.'), QLatin1Char('P'), 0x1E56, DirectionBoth}, 0408 {QLatin1Char('.'), QLatin1Char('p'), 0x1E57, DirectionBoth}, 0409 {QLatin1Char('.'), QLatin1Char('R'), 0x1E58, DirectionBoth}, 0410 {QLatin1Char('.'), QLatin1Char('r'), 0x1E59, DirectionBoth}, 0411 {QLatin1Char('d'), QLatin1Char('R'), 0x1E5A, DirectionBoth}, 0412 {QLatin1Char('d'), QLatin1Char('r'), 0x1E5B, DirectionBoth}, 0413 {QLatin1Char('.'), QLatin1Char('S'), 0x1E60, DirectionBoth}, 0414 {QLatin1Char('.'), QLatin1Char('s'), 0x1E61, DirectionBoth}, 0415 {QLatin1Char('d'), QLatin1Char('S'), 0x1E62, DirectionBoth}, 0416 {QLatin1Char('d'), QLatin1Char('s'), 0x1E63, DirectionBoth}, 0417 {QLatin1Char('.'), QLatin1Char('T'), 0x1E6A, DirectionBoth}, 0418 {QLatin1Char('.'), QLatin1Char('t'), 0x1E6B, DirectionBoth}, 0419 {QLatin1Char('d'), QLatin1Char('T'), 0x1E6C, DirectionBoth}, 0420 {QLatin1Char('d'), QLatin1Char('t'), 0x1E6D, DirectionBoth}, 0421 {QLatin1Char('d'), QLatin1Char('V'), 0x1E7E, DirectionBoth}, 0422 {QLatin1Char('d'), QLatin1Char('v'), 0x1E7F, DirectionBoth}, 0423 {QLatin1Char('`'), QLatin1Char('W'), 0x1E80, DirectionBoth}, 0424 {QLatin1Char('`'), QLatin1Char('w'), 0x1E81, DirectionBoth}, 0425 {QLatin1Char('\''), QLatin1Char('W'), 0x1E82, DirectionBoth}, 0426 {QLatin1Char('\''), QLatin1Char('w'), 0x1E83, DirectionBoth}, 0427 {QLatin1Char('"'), QLatin1Char('W'), 0x1E84, DirectionBoth}, 0428 {QLatin1Char('"'), QLatin1Char('w'), 0x1E85, DirectionBoth}, 0429 {QLatin1Char('.'), QLatin1Char('W'), 0x1E86, DirectionBoth}, 0430 {QLatin1Char('.'), QLatin1Char('w'), 0x1E87, DirectionBoth}, 0431 {QLatin1Char('d'), QLatin1Char('W'), 0x1E88, DirectionBoth}, 0432 {QLatin1Char('d'), QLatin1Char('w'), 0x1E88, DirectionBoth}, 0433 {QLatin1Char('.'), QLatin1Char('X'), 0x1E8A, DirectionBoth}, 0434 {QLatin1Char('.'), QLatin1Char('x'), 0x1E8B, DirectionBoth}, 0435 {QLatin1Char('"'), QLatin1Char('X'), 0x1E8C, DirectionBoth}, 0436 {QLatin1Char('"'), QLatin1Char('x'), 0x1E8D, DirectionBoth}, 0437 {QLatin1Char('.'), QLatin1Char('Y'), 0x1E8E, DirectionBoth}, 0438 {QLatin1Char('.'), QLatin1Char('y'), 0x1E8F, DirectionBoth}, 0439 {QLatin1Char('d'), QLatin1Char('Z'), 0x1E92, DirectionBoth}, 0440 {QLatin1Char('d'), QLatin1Char('z'), 0x1E93, DirectionBoth}, 0441 {QLatin1Char('"'), QLatin1Char('t'), 0x1E97, DirectionBoth}, 0442 {QLatin1Char('r'), QLatin1Char('w'), 0x1E98, DirectionBoth}, 0443 {QLatin1Char('r'), QLatin1Char('y'), 0x1E99, DirectionBoth}, 0444 {QLatin1Char('d'), QLatin1Char('A'), 0x1EA0, DirectionBoth}, 0445 {QLatin1Char('d'), QLatin1Char('a'), 0x1EA1, DirectionBoth}, 0446 {QLatin1Char('d'), QLatin1Char('E'), 0x1EB8, DirectionBoth}, 0447 {QLatin1Char('d'), QLatin1Char('e'), 0x1EB9, DirectionBoth}, 0448 {QLatin1Char('d'), QLatin1Char('I'), 0x1ECA, DirectionBoth}, 0449 {QLatin1Char('d'), QLatin1Char('i'), 0x1ECB, DirectionBoth}, 0450 {QLatin1Char('d'), QLatin1Char('O'), 0x1ECC, DirectionBoth}, 0451 {QLatin1Char('d'), QLatin1Char('o'), 0x1ECD, DirectionBoth}, 0452 {QLatin1Char('d'), QLatin1Char('U'), 0x1EE4, DirectionBoth}, 0453 {QLatin1Char('d'), QLatin1Char('u'), 0x1EE5, DirectionBoth}, 0454 {QLatin1Char('`'), QLatin1Char('Y'), 0x1EF2, DirectionBoth}, 0455 {QLatin1Char('`'), QLatin1Char('y'), 0x1EF3, DirectionBoth}, 0456 {QLatin1Char('d'), QLatin1Char('Y'), 0x1EF4, DirectionBoth}, 0457 {QLatin1Char('d'), QLatin1Char('y'), 0x1EF5, DirectionBoth}, 0458 {QLatin1Char('r'), QLatin1Char('q'), 0x2019, DirectionCommandToUnicode} ///< tricky: this is \rq 0459 }; 0460 0461 0462 /** 0463 * This structure contains information on the usage of dotless i 0464 * and dotless j in combination with accent-like modifiers. 0465 * Combinations such as \"{\i} are translated to an Unicode character 0466 * and back. The structure is a table with three columns: (1) the 0467 * modified (in the example before the quotation mark) (2) the ASCII 0468 * character (in the example before the 'i') (3) the Unicode 0469 * character described by a hexcode. 0470 */ 0471 // TODO other cases of \i and \j? 0472 static const struct DotlessIJCharacter { 0473 const QChar modifier; 0474 const QChar letter; 0475 const ushort unicode; 0476 const EncoderLaTeXCommandDirection direction; 0477 } 0478 dotlessIJCharacters[] = { 0479 {QLatin1Char('`'), QLatin1Char('i'), 0x00EC, DirectionBoth}, 0480 {QLatin1Char('\''), QLatin1Char('i'), 0x00ED, DirectionBoth}, 0481 {QLatin1Char('^'), QLatin1Char('i'), 0x00EE, DirectionBoth}, 0482 {QLatin1Char('"'), QLatin1Char('i'), 0x00EF, DirectionBoth}, 0483 {QLatin1Char('~'), QLatin1Char('i'), 0x0129, DirectionBoth}, 0484 {QLatin1Char('='), QLatin1Char('i'), 0x012B, DirectionBoth}, 0485 {QLatin1Char('u'), QLatin1Char('i'), 0x012D, DirectionBoth}, 0486 {QLatin1Char('k'), QLatin1Char('i'), 0x012F, DirectionBoth}, 0487 {QLatin1Char('^'), QLatin1Char('j'), 0x0135, DirectionBoth}, 0488 {QLatin1Char('v'), QLatin1Char('i'), 0x01D0, DirectionBoth}, 0489 {QLatin1Char('v'), QLatin1Char('j'), 0x01F0, DirectionBoth}, 0490 {QLatin1Char('G'), QLatin1Char('i'), 0x0209, DirectionCommandToUnicode} 0491 }; 0492 0493 0494 /** 0495 * This lookup allows to quickly find hits in the 0496 * EncoderLaTeXEscapedCharacter table. This data structure here 0497 * consists of a number of rows. Each row consists of a 0498 * modifier (like '"' or 'v') and an array of Unicode chars. 0499 * Letters 'A'..'Z','a'..'z','0'..'9' are used as index to this 0500 * array by invocing asciiLetterOrDigitToPos(). 0501 * This data structure is built in the constructor. 0502 */ 0503 static const int lookupTableNumModifiers = 32; 0504 static const int lookupTableNumCharacters = 26 * 2 + 10; 0505 static struct EncoderLaTeXEscapedCharacterLookupTableRow { 0506 QChar modifier; 0507 QChar unicode[lookupTableNumCharacters]; 0508 } *lookupTable[lookupTableNumModifiers]; 0509 0510 0511 /** 0512 * This data structure keeps track of math commands, which 0513 * have to be treated differently in text and math mode. 0514 * The math command like "subset of" could be used directly 0515 * in math mode, but must be enclosed in \ensuremath{...} 0516 * in text mode. 0517 */ 0518 static const struct MathCommand { 0519 const QString command; 0520 const ushort unicode; 0521 const EncoderLaTeXCommandDirection direction; 0522 } 0523 mathCommands[] = { 0524 {QStringLiteral("pm"), 0x00B1, DirectionBoth}, 0525 {QStringLiteral("mu"), 0x00B5, DirectionUnicodeToCommand}, //< Unicode's micro symbol becomes Greek letter 0526 {QStringLiteral("times"), 0x00D7, DirectionBoth}, 0527 {QStringLiteral("div"), 0x00F7, DirectionBoth}, 0528 {QStringLiteral("phi"), 0x0278, DirectionBoth}, ///< see also 0x03C6 (GREEK SMALL LETTER PHI) 0529 {QStringLiteral("Alpha"), 0x0391, DirectionBoth}, 0530 {QStringLiteral("Beta"), 0x0392, DirectionBoth}, 0531 {QStringLiteral("Gamma"), 0x0393, DirectionBoth}, 0532 {QStringLiteral("Delta"), 0x0394, DirectionBoth}, 0533 {QStringLiteral("Epsilon"), 0x0395, DirectionBoth}, 0534 {QStringLiteral("Zeta"), 0x0396, DirectionBoth}, 0535 {QStringLiteral("Eta"), 0x0397, DirectionBoth}, 0536 {QStringLiteral("Theta"), 0x0398, DirectionBoth}, 0537 {QStringLiteral("Iota"), 0x0399, DirectionBoth}, 0538 {QStringLiteral("Kappa"), 0x039A, DirectionBoth}, 0539 {QStringLiteral("Lamda"), 0x039B, DirectionCommandToUnicode}, ///< \Lamda does not exist, this is mostly for spelling errors 0540 {QStringLiteral("Lambda"), 0x039B, DirectionBoth}, 0541 {QStringLiteral("Mu"), 0x039C, DirectionBoth}, 0542 {QStringLiteral("Nu"), 0x039D, DirectionBoth}, 0543 {QStringLiteral("Xi"), 0x039E, DirectionBoth}, 0544 {QStringLiteral("Omicron"), 0x039F, DirectionBoth}, 0545 {QStringLiteral("Pi"), 0x03A0, DirectionBoth}, 0546 {QStringLiteral("Rho"), 0x03A1, DirectionBoth}, 0547 {QStringLiteral("Sigma"), 0x03A3, DirectionBoth}, 0548 {QStringLiteral("Tau"), 0x03A4, DirectionBoth}, 0549 {QStringLiteral("Upsilon"), 0x03A5, DirectionBoth}, 0550 {QStringLiteral("Phi"), 0x03A6, DirectionBoth}, 0551 {QStringLiteral("Chi"), 0x03A7, DirectionBoth}, 0552 {QStringLiteral("Psi"), 0x03A8, DirectionBoth}, 0553 {QStringLiteral("Omega"), 0x03A9, DirectionBoth}, 0554 {QStringLiteral("alpha"), 0x03B1, DirectionBoth}, 0555 {QStringLiteral("beta"), 0x03B2, DirectionBoth}, 0556 {QStringLiteral("gamma"), 0x03B3, DirectionBoth}, 0557 {QStringLiteral("delta"), 0x03B4, DirectionBoth}, 0558 {QStringLiteral("varepsilon"), 0x03B5, DirectionBoth}, 0559 {QStringLiteral("zeta"), 0x03B6, DirectionBoth}, 0560 {QStringLiteral("eta"), 0x03B7, DirectionBoth}, 0561 {QStringLiteral("theta"), 0x03B8, DirectionBoth}, 0562 {QStringLiteral("iota"), 0x03B9, DirectionBoth}, 0563 {QStringLiteral("kappa"), 0x03BA, DirectionBoth}, 0564 {QStringLiteral("lamda"), 0x03BB, DirectionCommandToUnicode}, ///< \lamda does not exist, this is mostly for spelling errors 0565 {QStringLiteral("lambda"), 0x03BB, DirectionBoth}, 0566 {QStringLiteral("mu"), 0x03BC, DirectionBoth}, 0567 {QStringLiteral("nu"), 0x03BD, DirectionBoth}, 0568 {QStringLiteral("xi"), 0x03BE, DirectionBoth}, 0569 {QStringLiteral("omicron"), 0x03BF, DirectionBoth}, 0570 {QStringLiteral("pi"), 0x03C0, DirectionBoth}, 0571 {QStringLiteral("rho"), 0x03C1, DirectionBoth}, 0572 {QStringLiteral("varsigma"), 0x03C2, DirectionBoth}, 0573 {QStringLiteral("sigma"), 0x03C3, DirectionBoth}, 0574 {QStringLiteral("tau"), 0x03C4, DirectionBoth}, 0575 {QStringLiteral("upsilon"), 0x03C5, DirectionBoth}, 0576 {QStringLiteral("varphi"), 0x03C6, DirectionBoth}, ///< see also 0x0278 (LATIN SMALL LETTER PHI) 0577 {QStringLiteral("chi"), 0x03C7, DirectionBoth}, 0578 {QStringLiteral("psi"), 0x03C8, DirectionBoth}, 0579 {QStringLiteral("omega"), 0x03C9, DirectionBoth}, 0580 {QStringLiteral("vartheta"), 0x03D1, DirectionBoth}, 0581 {QStringLiteral("varpi"), 0x03D6, DirectionBoth}, 0582 {QStringLiteral("digamma"), 0x03DC, DirectionBoth}, 0583 {QStringLiteral("varkappa"), 0x03F0, DirectionBoth}, 0584 {QStringLiteral("varrho"), 0x03F1, DirectionBoth}, 0585 {QStringLiteral("epsilon"), 0x03F5, DirectionBoth}, 0586 {QStringLiteral("backepsilon"), 0x03F6, DirectionBoth}, 0587 {QStringLiteral("aleph"), 0x05D0, DirectionBoth}, 0588 {QStringLiteral("dagger"), 0x2020, DirectionBoth}, 0589 {QStringLiteral("ddagger"), 0x2021, DirectionBoth}, 0590 {QStringLiteral("mathbb{C}"), 0x2102, DirectionBoth}, 0591 {QStringLiteral("ell"), 0x2113, DirectionBoth}, 0592 {QStringLiteral("mho"), 0x2127, DirectionBoth}, 0593 {QStringLiteral("beth"), 0x2136, DirectionBoth}, 0594 {QStringLiteral("gimel"), 0x2137, DirectionBoth}, 0595 {QStringLiteral("daleth"), 0x2138, DirectionBoth}, 0596 {QStringLiteral("rightarrow"), 0x2192, DirectionBoth}, 0597 {QStringLiteral("forall"), 0x2200, DirectionBoth}, 0598 {QStringLiteral("complement"), 0x2201, DirectionBoth}, 0599 {QStringLiteral("partial"), 0x2202, DirectionBoth}, 0600 {QStringLiteral("exists"), 0x2203, DirectionBoth}, 0601 {QStringLiteral("nexists"), 0x2204, DirectionBoth}, 0602 {QStringLiteral("varnothing"), 0x2205, DirectionBoth}, 0603 {QStringLiteral("nabla"), 0x2207, DirectionBoth}, 0604 {QStringLiteral("in"), 0x2208, DirectionBoth}, 0605 {QStringLiteral("notin"), 0x2209, DirectionBoth}, 0606 {QStringLiteral("ni"), 0x220B, DirectionBoth}, 0607 {QStringLiteral("not\\ni"), 0x220C, DirectionBoth}, 0608 {QStringLiteral("asterisk"), 0x2217, DirectionCommandToUnicode}, 0609 {QStringLiteral("infty"), 0x221E, DirectionBoth}, 0610 {QStringLiteral("leq"), 0x2264, DirectionBoth}, 0611 {QStringLiteral("geq"), 0x2265, DirectionBoth}, 0612 {QStringLiteral("lneq"), 0x2268, DirectionBoth}, 0613 {QStringLiteral("gneq"), 0x2269, DirectionBoth}, 0614 {QStringLiteral("ll"), 0x226A, DirectionBoth}, 0615 {QStringLiteral("gg"), 0x226B, DirectionBoth}, 0616 {QStringLiteral("nless"), 0x226E, DirectionBoth}, 0617 {QStringLiteral("ngtr"), 0x226F, DirectionBoth}, 0618 {QStringLiteral("nleq"), 0x2270, DirectionBoth}, 0619 {QStringLiteral("ngeq"), 0x2271, DirectionBoth}, 0620 {QStringLiteral("subset"), 0x2282, DirectionBoth}, 0621 {QStringLiteral("supset"), 0x2283, DirectionBoth}, 0622 {QStringLiteral("subseteq"), 0x2286, DirectionBoth}, 0623 {QStringLiteral("supseteq"), 0x2287, DirectionBoth}, 0624 {QStringLiteral("nsubseteq"), 0x2288, DirectionBoth}, 0625 {QStringLiteral("nsupseteq"), 0x2289, DirectionBoth}, 0626 {QStringLiteral("subsetneq"), 0x228A, DirectionBoth}, 0627 {QStringLiteral("supsetneq"), 0x228A, DirectionBoth}, 0628 {QStringLiteral("Subset"), 0x22D0, DirectionBoth}, 0629 {QStringLiteral("Supset"), 0x22D1, DirectionBoth}, 0630 {QStringLiteral("lll"), 0x22D8, DirectionBoth}, 0631 {QStringLiteral("ggg"), 0x22D9, DirectionBoth}, 0632 {QStringLiteral("top"), 0x22A4, DirectionBoth}, 0633 {QStringLiteral("bot"), 0x22A5, DirectionBoth}, 0634 }; 0635 0636 0637 /** 0638 * This data structure holds commands representing a single 0639 * character. For example, it maps \AA to A with a ring (Nordic 0640 * letter) and back. The structure is a table with two columns: 0641 * (1) the command's name without a backslash (in the example 0642 * before the 'AA') (2) the Unicode character described by a 0643 * hexcode. 0644 */ 0645 static const struct EncoderLaTeXCharacterCommand { 0646 const QString command; 0647 const ushort unicode; 0648 const EncoderLaTeXCommandDirection direction; 0649 } 0650 encoderLaTeXCharacterCommands[] = { 0651 {QStringLiteral("textexclamdown"), 0x00A1, DirectionCommandToUnicode}, 0652 {QStringLiteral("textcent"), 0x00A2, DirectionBoth}, 0653 {QStringLiteral("pounds"), 0x00A3, DirectionBoth}, 0654 {QStringLiteral("textsterling"), 0x00A3, DirectionBoth}, 0655 /** 0x00A4 */ 0656 {QStringLiteral("textyen"), 0x00A5, DirectionBoth}, 0657 {QStringLiteral("textbrokenbar"), 0x00A6, DirectionBoth}, 0658 {QStringLiteral("S"), 0x00A7, DirectionBoth}, 0659 {QStringLiteral("textsection"), 0x00A7, DirectionBoth}, 0660 /** 0x00A8 */ 0661 {QStringLiteral("copyright"), 0x00A9, DirectionBoth}, 0662 {QStringLiteral("textcopyright"), 0x00A9, DirectionBoth}, 0663 {QStringLiteral("textordfeminine"), 0x00AA, DirectionBoth}, 0664 {QStringLiteral("guillemotleft"), 0x00AB, DirectionCommandToUnicode}, 0665 {QStringLiteral("textflqq"), 0x00AB, DirectionCommandToUnicode}, 0666 {QStringLiteral("flqq"), 0x00AB, DirectionBoth}, 0667 /** 0x00AC */ 0668 /** 0x00AD */ 0669 {QStringLiteral("textregistered"), 0x00AE, DirectionBoth}, 0670 /** 0x00AF */ 0671 {QStringLiteral("textdegree"), 0x00B0, DirectionBoth}, 0672 {QStringLiteral("textpm"), 0x00B1, DirectionBoth}, 0673 {QStringLiteral("textplusminus"), 0x00B1, DirectionCommandToUnicode}, 0674 /** 0x00B2 */ 0675 /** 0x00B3 */ 0676 /** 0x00B4 */ 0677 // Notes about Unicode U+00B5 ('micro sign'): 0678 // - Derived from the Greek 'mu' but used as a SI prefix meaning 'one millionth' 0679 // - Unicode differs between this symbol and a 'real' Greek 'mu' which has position U+03BC 0680 // - There are more lower case 'mu' in Unicode for mathematics (bold, italics, sans-serif, ...) 0681 // at position U+1D6CD and later; those are not supported at all by KBibTeX 0682 {QStringLiteral("textmu"), 0x00B5, DirectionUnicodeToCommand}, 0683 {QStringLiteral("textparagraph"), 0x00B6, DirectionBoth}, 0684 {QStringLiteral("textpilcrow"), 0x00B6, DirectionBoth}, 0685 {QStringLiteral("textperiodcentered"), 0x00B7, DirectionCommandToUnicode}, 0686 {QStringLiteral("textcdot"), 0x00B7, DirectionBoth}, 0687 {QStringLiteral("textcentereddot"), 0x00B7, DirectionCommandToUnicode}, 0688 /** 0x00B8 */ 0689 /** 0x00B9 */ 0690 {QStringLiteral("textordmasculine"), 0x00BA, DirectionBoth}, 0691 {QStringLiteral("guillemotright"), 0x00BB, DirectionCommandToUnicode}, 0692 {QStringLiteral("textfrqq"), 0x00BB, DirectionCommandToUnicode}, 0693 {QStringLiteral("frqq"), 0x00BB, DirectionBoth}, 0694 {QStringLiteral("textonequarter"), 0x00BC, DirectionBoth}, 0695 {QStringLiteral("textonehalf"), 0x00BD, DirectionBoth}, 0696 {QStringLiteral("textthreequarters"), 0x00BE, DirectionBoth}, 0697 {QStringLiteral("textquestiondown"), 0x00BF, DirectionCommandToUnicode}, // TODO /// recommended to write ?` instead of \textquestiondown 0698 {QStringLiteral("AA"), 0x00C5, DirectionBoth}, 0699 {QStringLiteral("AE"), 0x00C6, DirectionBoth}, 0700 {QStringLiteral("DH"), 0x00D0, DirectionBoth}, 0701 {QStringLiteral("texttimes"), 0x00D7, DirectionBoth}, 0702 {QStringLiteral("textmultiply"), 0x00D7, DirectionCommandToUnicode}, 0703 {QStringLiteral("O"), 0x00D8, DirectionBoth}, 0704 {QStringLiteral("TH"), 0x00DE, DirectionBoth}, 0705 {QStringLiteral("Thorn"), 0x00DE, DirectionCommandToUnicode}, 0706 {QStringLiteral("textThorn"), 0x00DE, DirectionCommandToUnicode}, 0707 {QStringLiteral("ss"), 0x00DF, DirectionBoth}, 0708 {QStringLiteral("aa"), 0x00E5, DirectionBoth}, 0709 {QStringLiteral("ae"), 0x00E6, DirectionBoth}, 0710 {QStringLiteral("dh"), 0x00F0, DirectionBoth}, 0711 {QStringLiteral("textdiv"), 0x00F7, DirectionBoth}, 0712 {QStringLiteral("textdivide"), 0x00F7, DirectionCommandToUnicode}, 0713 {QStringLiteral("o"), 0x00F8, DirectionBoth}, 0714 {QStringLiteral("th"), 0x00FE, DirectionBoth}, 0715 {QStringLiteral("textthorn"), 0x00FE, DirectionCommandToUnicode}, 0716 {QStringLiteral("textthornvari"), 0x00FE, DirectionCommandToUnicode}, 0717 {QStringLiteral("textthornvarii"), 0x00FE, DirectionCommandToUnicode}, 0718 {QStringLiteral("textthornvariii"), 0x00FE, DirectionCommandToUnicode}, 0719 {QStringLiteral("textthornvariv"), 0x00FE, DirectionCommandToUnicode}, 0720 {QStringLiteral("Aogonek"), 0x0104, DirectionCommandToUnicode}, 0721 {QStringLiteral("aogonek"), 0x0105, DirectionCommandToUnicode}, 0722 {QStringLiteral("DJ"), 0x0110, DirectionBoth}, 0723 {QStringLiteral("dj"), 0x0111, DirectionBoth}, 0724 {QStringLiteral("textcrd"), 0x0111, DirectionCommandToUnicode}, 0725 {QStringLiteral("textHslash"), 0x0126, DirectionCommandToUnicode}, 0726 {QStringLiteral("textHbar"), 0x0126, DirectionCommandToUnicode}, 0727 {QStringLiteral("textcrh"), 0x0127, DirectionCommandToUnicode}, 0728 {QStringLiteral("texthbar"), 0x0127, DirectionCommandToUnicode}, 0729 {QStringLiteral("i"), 0x0131, DirectionBoth}, 0730 {QStringLiteral("IJ"), 0x0132, DirectionBoth}, 0731 {QStringLiteral("ij"), 0x0133, DirectionBoth}, 0732 {QStringLiteral("textkra"), 0x0138, DirectionCommandToUnicode}, 0733 {QStringLiteral("Lcaron"), 0x013D, DirectionCommandToUnicode}, 0734 {QStringLiteral("lcaron"), 0x013E, DirectionCommandToUnicode}, 0735 {QStringLiteral("L"), 0x0141, DirectionBoth}, 0736 {QStringLiteral("Lstroke"), 0x0141, DirectionCommandToUnicode}, 0737 {QStringLiteral("l"), 0x0142, DirectionBoth}, 0738 {QStringLiteral("lstroke"), 0x0142, DirectionCommandToUnicode}, 0739 {QStringLiteral("textbarl"), 0x0142, DirectionCommandToUnicode}, 0740 {QStringLiteral("NG"), 0x014A, DirectionBoth}, 0741 {QStringLiteral("ng"), 0x014B, DirectionBoth}, 0742 {QStringLiteral("OE"), 0x0152, DirectionBoth}, 0743 {QStringLiteral("oe"), 0x0153, DirectionBoth}, 0744 {QStringLiteral("Racute"), 0x0154, DirectionCommandToUnicode}, 0745 {QStringLiteral("racute"), 0x0155, DirectionCommandToUnicode}, 0746 {QStringLiteral("Sacute"), 0x015A, DirectionCommandToUnicode}, 0747 {QStringLiteral("sacute"), 0x015B, DirectionCommandToUnicode}, 0748 {QStringLiteral("Scedilla"), 0x015E, DirectionCommandToUnicode}, 0749 {QStringLiteral("scedilla"), 0x015F, DirectionCommandToUnicode}, 0750 {QStringLiteral("Scaron"), 0x0160, DirectionCommandToUnicode}, 0751 {QStringLiteral("scaron"), 0x0161, DirectionCommandToUnicode}, 0752 {QStringLiteral("Tcaron"), 0x0164, DirectionCommandToUnicode}, 0753 {QStringLiteral("tcaron"), 0x0165, DirectionCommandToUnicode}, 0754 {QStringLiteral("textTstroke"), 0x0166, DirectionCommandToUnicode}, 0755 {QStringLiteral("textTbar"), 0x0166, DirectionCommandToUnicode}, 0756 {QStringLiteral("textTslash"), 0x0166, DirectionCommandToUnicode}, 0757 {QStringLiteral("texttstroke"), 0x0167, DirectionCommandToUnicode}, 0758 {QStringLiteral("texttbar"), 0x0167, DirectionCommandToUnicode}, 0759 {QStringLiteral("texttslash"), 0x0167, DirectionCommandToUnicode}, 0760 {QStringLiteral("Zdotaccent"), 0x017B, DirectionCommandToUnicode}, 0761 {QStringLiteral("zdotaccent"), 0x017C, DirectionCommandToUnicode}, 0762 {QStringLiteral("Zcaron"), 0x017D, DirectionCommandToUnicode}, 0763 {QStringLiteral("zcaron"), 0x017E, DirectionCommandToUnicode}, 0764 {QStringLiteral("textlongs"), 0x017F, DirectionCommandToUnicode}, 0765 {QStringLiteral("textcrb"), 0x0180, DirectionCommandToUnicode}, 0766 {QStringLiteral("textBhook"), 0x0181, DirectionCommandToUnicode}, 0767 {QStringLiteral("texthausaB"), 0x0181, DirectionCommandToUnicode}, 0768 {QStringLiteral("textOopen"), 0x0186, DirectionCommandToUnicode}, 0769 {QStringLiteral("textChook"), 0x0187, DirectionCommandToUnicode}, 0770 {QStringLiteral("textchook"), 0x0188, DirectionCommandToUnicode}, 0771 {QStringLiteral("texthtc"), 0x0188, DirectionCommandToUnicode}, 0772 {QStringLiteral("textDafrican"), 0x0189, DirectionCommandToUnicode}, 0773 {QStringLiteral("textDhook"), 0x018A, DirectionCommandToUnicode}, 0774 {QStringLiteral("texthausaD"), 0x018A, DirectionCommandToUnicode}, 0775 {QStringLiteral("textEreversed"), 0x018E, DirectionCommandToUnicode}, 0776 {QStringLiteral("textrevE"), 0x018E, DirectionCommandToUnicode}, 0777 {QStringLiteral("textEopen"), 0x0190, DirectionCommandToUnicode}, 0778 {QStringLiteral("textFhook"), 0x0191, DirectionCommandToUnicode}, 0779 {QStringLiteral("textflorin"), 0x0192, DirectionBoth}, 0780 {QStringLiteral("textgamma"), 0x0194, DirectionCommandToUnicode}, 0781 {QStringLiteral("textGammaafrican"), 0x0194, DirectionCommandToUnicode}, 0782 {QStringLiteral("hv"), 0x0195, DirectionCommandToUnicode}, 0783 {QStringLiteral("texthvlig"), 0x0195, DirectionCommandToUnicode}, 0784 {QStringLiteral("textIotaafrican"), 0x0196, DirectionCommandToUnicode}, 0785 {QStringLiteral("textKhook"), 0x0198, DirectionCommandToUnicode}, 0786 {QStringLiteral("texthausaK"), 0x0198, DirectionCommandToUnicode}, 0787 {QStringLiteral("texthtk"), 0x0199, DirectionCommandToUnicode}, 0788 {QStringLiteral("textkhook"), 0x0199, DirectionCommandToUnicode}, 0789 {QStringLiteral("textbarl"), 0x019A, DirectionCommandToUnicode}, 0790 {QStringLiteral("textcrlambda"), 0x019B, DirectionCommandToUnicode}, 0791 {QStringLiteral("textNhookleft"), 0x019D, DirectionCommandToUnicode}, 0792 {QStringLiteral("textnrleg"), 0x019E, DirectionCommandToUnicode}, 0793 {QStringLiteral("textPUnrleg"), 0x019E, DirectionCommandToUnicode}, 0794 {QStringLiteral("Ohorn"), 0x01A0, DirectionCommandToUnicode}, 0795 {QStringLiteral("ohorn"), 0x01A1, DirectionCommandToUnicode}, 0796 {QStringLiteral("textPhook"), 0x01A4, DirectionCommandToUnicode}, 0797 {QStringLiteral("texthtp"), 0x01A5, DirectionCommandToUnicode}, 0798 {QStringLiteral("textphook"), 0x01A5, DirectionCommandToUnicode}, 0799 {QStringLiteral("ESH"), 0x01A9, DirectionCommandToUnicode}, 0800 {QStringLiteral("textEsh"), 0x01A9, DirectionCommandToUnicode}, 0801 {QStringLiteral("textlooptoprevsh"), 0x01AA, DirectionCommandToUnicode}, 0802 {QStringLiteral("textlhtlongi"), 0x01AA, DirectionCommandToUnicode}, 0803 {QStringLiteral("textlhookt"), 0x01AB, DirectionCommandToUnicode}, 0804 {QStringLiteral("textThook"), 0x01AC, DirectionCommandToUnicode}, 0805 {QStringLiteral("textthook"), 0x01AD, DirectionCommandToUnicode}, 0806 {QStringLiteral("texthtt"), 0x01AD, DirectionCommandToUnicode}, 0807 {QStringLiteral("textTretroflexhook"), 0x01AE, DirectionCommandToUnicode}, 0808 {QStringLiteral("Uhorn"), 0x01AF, DirectionCommandToUnicode}, 0809 {QStringLiteral("uhorn"), 0x01B0, DirectionCommandToUnicode}, 0810 {QStringLiteral("textupsilon"), 0x01B1, DirectionCommandToUnicode}, 0811 {QStringLiteral("textVhook"), 0x01B2, DirectionCommandToUnicode}, 0812 {QStringLiteral("textYhook"), 0x01B3, DirectionCommandToUnicode}, 0813 {QStringLiteral("textvhook"), 0x01B4, DirectionCommandToUnicode}, 0814 {QStringLiteral("Zbar"), 0x01B5, DirectionCommandToUnicode}, 0815 {QStringLiteral("zbar"), 0x01B6, DirectionCommandToUnicode}, 0816 {QStringLiteral("EZH"), 0x01B7, DirectionCommandToUnicode}, 0817 {QStringLiteral("textEzh"), 0x01B7, DirectionCommandToUnicode}, 0818 {QStringLiteral("LJ"), 0x01C7, DirectionCommandToUnicode}, 0819 {QStringLiteral("Lj"), 0x01C8, DirectionCommandToUnicode}, 0820 {QStringLiteral("lj"), 0x01C9, DirectionCommandToUnicode}, 0821 {QStringLiteral("NJ"), 0x01CA, DirectionCommandToUnicode}, 0822 {QStringLiteral("Nj"), 0x01CB, DirectionCommandToUnicode}, 0823 {QStringLiteral("nj"), 0x01CC, DirectionCommandToUnicode}, 0824 {QStringLiteral("DZ"), 0x01F1, DirectionCommandToUnicode}, 0825 {QStringLiteral("Dz"), 0x01F2, DirectionCommandToUnicode}, 0826 {QStringLiteral("dz"), 0x01F3, DirectionCommandToUnicode}, 0827 {QStringLiteral("HV"), 0x01F6, DirectionCommandToUnicode}, 0828 {QStringLiteral("j"), 0x0237, DirectionBoth}, 0829 // Notes about Unicode U+03BC ('Greek small letter mu'): 0830 // - Unicode differs between this symbol and a 'micro' (SI-prefix) which has position U+00B5 0831 // - There are more lower case 'mu' in Unicode for mathematics (bold, italics, sans-serif, ...) 0832 // at position U+1D6CD and later; those are not supported at all by KBibTeX 0833 // - LaTeX package 'textcomp' provides command '\textmu' but no other Greek letters 0834 // - LaTeX package 'textgreek' provides commands for all Greek letters (e.g. '\textpi') but 0835 // to avoid conflicts with 'textcomp', the command for 'mu' is '\textmugreek' 0836 {QStringLiteral("textmugreek"), 0x03BC, DirectionCommandToUnicode}, 0837 {QStringLiteral("textmu"), 0x03BC, DirectionBoth}, 0838 {QStringLiteral("ldots"), 0x2026, DirectionBoth}, 0839 {QStringLiteral("grqq"), 0x201C, DirectionCommandToUnicode}, 0840 {QStringLiteral("textquotedblleft"), 0x201C, DirectionCommandToUnicode}, 0841 {QStringLiteral("rqq"), 0x201D, DirectionCommandToUnicode}, 0842 {QStringLiteral("textquotedblright"), 0x201D, DirectionCommandToUnicode}, 0843 {QStringLiteral("glqq"), 0x201E, DirectionCommandToUnicode}, 0844 {QStringLiteral("SS"), 0x1E9E, DirectionBoth}, 0845 {QStringLiteral("textendash"), 0x2013, DirectionCommandToUnicode}, 0846 {QStringLiteral("textemdash"), 0x2014, DirectionCommandToUnicode}, 0847 {QStringLiteral("textquoteleft"), 0x2018, DirectionCommandToUnicode}, 0848 {QStringLiteral("lq"), 0x2018, DirectionBoth}, 0849 {QStringLiteral("textquoteright"), 0x2019, DirectionCommandToUnicode}, 0850 {QStringLiteral("rq"), 0x2019, DirectionBoth}, ///< tricky one: 'r' is a valid modifier 0851 {QStringLiteral("quotesinglbase"), 0x201A, DirectionBoth}, 0852 {QStringLiteral("quotedblbase"), 0x201E, DirectionBoth}, 0853 {QStringLiteral("textbullet "), 0x2022, DirectionBoth}, 0854 {QStringLiteral("guilsinglleft "), 0x2039, DirectionBoth}, 0855 {QStringLiteral("guilsinglright "), 0x203A, DirectionBoth}, 0856 {QStringLiteral("textcelsius"), 0x2103, DirectionBoth}, 0857 {QStringLiteral("textleftarrow"), 0x2190, DirectionBoth}, 0858 {QStringLiteral("textuparrow"), 0x2191, DirectionBoth}, 0859 {QStringLiteral("textrightarrow"), 0x2192, DirectionBoth}, 0860 {QStringLiteral("textdownarrow"), 0x2193, DirectionBoth} 0861 }; 0862 0863 const QChar EncoderLaTeX::encoderLaTeXProtectedSymbols[] = {QLatin1Char('#'), QLatin1Char('&'), QLatin1Char('%')}; 0864 0865 const QChar EncoderLaTeX::encoderLaTeXProtectedTextOnlySymbols[] = {QLatin1Char('_')}; 0866 0867 0868 /** 0869 * This data structure holds LaTeX symbol sequences (without 0870 * any backslash) that represent a single Unicode character. 0871 * For example, it maps --- to an 'em dash' and back. 0872 * The structure is a table with two columns: (1) the symbol 0873 * sequence (in the example before the '---') (2) the Unicode 0874 * character described by a hexcode. 0875 */ 0876 static const struct EncoderLaTeXSymbolSequence { 0877 const QString latex; 0878 const ushort unicode; 0879 const EncoderLaTeXCommandDirection direction; 0880 } 0881 encoderLaTeXSymbolSequences[] = { 0882 {QStringLiteral("!`"), 0x00A1, DirectionBoth}, 0883 {QStringLiteral("\"<"), 0x00AB, DirectionBoth}, 0884 {QStringLiteral("\">"), 0x00BB, DirectionBoth}, 0885 {QStringLiteral("?`"), 0x00BF, DirectionBoth}, 0886 {QStringLiteral("---"), 0x2014, DirectionBoth}, ///< --- must come before -- 0887 {QStringLiteral("--"), 0x2013, DirectionBoth}, 0888 {QStringLiteral("``"), 0x201C, DirectionBoth}, 0889 {QStringLiteral("''"), 0x201D, DirectionBoth}, 0890 {QStringLiteral("ff"), 0xFB00, DirectionUnicodeToCommand}, 0891 {QStringLiteral("fi"), 0xFB01, DirectionUnicodeToCommand}, 0892 {QStringLiteral("fl"), 0xFB02, DirectionUnicodeToCommand}, 0893 {QStringLiteral("ffi"), 0xFB03, DirectionUnicodeToCommand}, 0894 {QStringLiteral("ffl"), 0xFB04, DirectionUnicodeToCommand}, 0895 {QStringLiteral("ft"), 0xFB05, DirectionUnicodeToCommand}, 0896 {QStringLiteral("st"), 0xFB06, DirectionUnicodeToCommand} 0897 }; 0898 0899 0900 EncoderLaTeX::EncoderLaTeX() 0901 : Encoder() 0902 { 0903 /// Initialize lookup table with NULL pointers 0904 for (int i = 0; i < lookupTableNumModifiers; ++i) lookupTable[i] = nullptr; 0905 0906 int lookupTableCount = 0; 0907 /// Go through all table rows of encoderLaTeXEscapedCharacters 0908 for (const EncoderLaTeXEscapedCharacter &encoderLaTeXEscapedCharacter : encoderLaTeXEscapedCharacters) { 0909 /// Check if this row's modifier is already known 0910 bool knownModifier = false; 0911 int j; 0912 for (j = lookupTableCount - 1; j >= 0; --j) { 0913 knownModifier |= lookupTable[j]->modifier == encoderLaTeXEscapedCharacter.modifier; 0914 if (knownModifier) break; 0915 } 0916 0917 if (!knownModifier) { 0918 /// Ok, this row's modifier appeared for the first time, 0919 /// therefore initialize memory structure, i.e. row in lookupTable 0920 lookupTable[lookupTableCount] = new EncoderLaTeXEscapedCharacterLookupTableRow; 0921 lookupTable[lookupTableCount]->modifier = encoderLaTeXEscapedCharacter.modifier; 0922 /// If no special character is known for a letter+modifier 0923 /// combination, fall back using the ASCII character only 0924 for (ushort k = 0; k < 26; ++k) { 0925 lookupTable[lookupTableCount]->unicode[k] = QChar(QLatin1Char('A').unicode() + k); 0926 lookupTable[lookupTableCount]->unicode[k + 26] = QChar(QLatin1Char('a').unicode() + k); 0927 } 0928 for (ushort k = 0; k < 10; ++k) 0929 lookupTable[lookupTableCount]->unicode[k + 52] = QChar(QLatin1Char('0').unicode() + k); 0930 j = lookupTableCount; 0931 ++lookupTableCount; 0932 } 0933 0934 /// Add the letter as of the current row in encoderLaTeXEscapedCharacters 0935 /// into Unicode char array in the current modifier's row in the lookup table. 0936 int pos = -1; 0937 if ((pos = asciiLetterOrDigitToPos(encoderLaTeXEscapedCharacter.letter)) >= 0) 0938 lookupTable[j]->unicode[pos] = QChar(encoderLaTeXEscapedCharacter.unicode); 0939 else 0940 qCWarning(LOG_KBIBTEX_IO) << "Cannot handle letter " << encoderLaTeXEscapedCharacter.letter; 0941 } 0942 } 0943 0944 EncoderLaTeX::~EncoderLaTeX() 0945 { 0946 /// Clean-up memory 0947 for (int i = lookupTableNumModifiers - 1; i >= 0; --i) 0948 if (lookupTable[i] != nullptr) 0949 delete lookupTable[i]; 0950 } 0951 0952 QString EncoderLaTeX::decode(const QString &input) const 0953 { 0954 const int len = input.length(); 0955 QString output; 0956 output.reserve(((len >> 10) + 2) << 10); // reserving multiples of 1024 Bytes 0957 enum MathMode { 0958 MathModeNone = 0, MathModeDollar, MathModeEnsureMath 0959 }; 0960 QStack<MathMode> currentMathMode; 0961 #define currentMathModeTop() (currentMathMode.empty()?MathModeNone:currentMathMode.top()) 0962 int openCurlyBracketCounterEnsureMath = 0; 0963 QStack<int> popEnsureMathAtOpenCurlyBacketCounter; 0964 int cachedAsciiLetterOrDigitToPos = -1; 0965 0966 /// Go through input char by char 0967 for (int i = 0; i < len; ++i) { 0968 /** 0969 * Repeatedly check if input data contains a verbatim command 0970 * like \url{...}, copy it to output, and update i to point 0971 * to the next character after the verbatim command. 0972 */ 0973 while (testAndCopyVerbatimCommands(input, i, output)); 0974 if (i >= len) break; 0975 0976 /// Fetch current input char 0977 const QChar c = input[i]; 0978 0979 if (c == QLatin1Char('{')) { 0980 /// First case: An opening curly bracket, 0981 /// which is harmless (see else case), unless ... 0982 if (i < len - 3 && input[i + 1] == QLatin1Char('\\')) { 0983 /// ... it continues with a backslash 0984 0985 /// Next, check if there follows a modifier after the backslash 0986 /// For example an quotation mark as used in {\"a} 0987 const int lookupTablePos = modifierInLookupTable(input[i + 2]); 0988 0989 /// Check for spaces between modifier and character, for example 0990 /// like {\H o} 0991 int skipSpaces = 0; 0992 while (i + 3 + skipSpaces < len && input[i + 3 + skipSpaces] == QLatin1Char(' ') && skipSpaces < 16) ++skipSpaces; 0993 0994 bool found = false; 0995 if (lookupTablePos >= 0 && (skipSpaces > 0 || !input[i + 2].isLetter()) && i + skipSpaces < len - 4 && (cachedAsciiLetterOrDigitToPos = asciiLetterOrDigitToPos(input[i + 3 + skipSpaces])) >= 0 && input[i + 4 + skipSpaces] == QLatin1Char('}')) { 0996 /// If we found a modifier which is followed by 0997 /// a letter followed by a closing curly bracket, 0998 /// we are looking at something like {\"A} 0999 /// Use lookup table to see what Unicode char this 1000 /// represents 1001 const QChar unicodeLetter = lookupTable[lookupTablePos]->unicode[cachedAsciiLetterOrDigitToPos]; 1002 if (unicodeLetter.unicode() >= 127) { 1003 output.append(unicodeLetter); 1004 /// Step over those additional characters 1005 i += 4 + skipSpaces; 1006 found = true; 1007 } 1008 /// Don't print any warnings yet, as this if-case may got triggered by e.g. \mu 1009 /// ('m' is a potential modifier, yet \mu should be recognized as Greek letter later) 1010 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 5 && input[i + 3 + skipSpaces] == QLatin1Char('\\') && isIJ(input[i + 4 + skipSpaces]) && input[i + 5 + skipSpaces] == QLatin1Char('}')) { 1011 /// This is the case for {\'\i} or alike. 1012 for (const DotlessIJCharacter &dotlessIJCharacter : dotlessIJCharacters) 1013 if ((dotlessIJCharacter.direction & DirectionCommandToUnicode) && dotlessIJCharacter.letter == input[i + 4 + skipSpaces] && dotlessIJCharacter.modifier == input[i + 2]) { 1014 output.append(QChar(dotlessIJCharacter.unicode)); 1015 found = true; 1016 break; 1017 } 1018 if (!found) { 1019 /// This combination of modifier and letter is not known, 1020 /// so try to preserve it 1021 output.append(QStringView{input}.mid(i, 6 + skipSpaces)); 1022 qCWarning(LOG_KBIBTEX_IO) << "Cannot interpret BACKSLASH" << input[i + 2] << "BACKSLASH" << input[i + 4 + skipSpaces]; 1023 } 1024 /// Step over those additional characters 1025 i += 5 + skipSpaces; 1026 found = true; 1027 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 6 && input[i + 3 + skipSpaces] == QLatin1Char('{') && (cachedAsciiLetterOrDigitToPos = asciiLetterOrDigitToPos(input[i + 4 + skipSpaces])) >= 0 && input[i + 5 + skipSpaces] == QLatin1Char('}') && input[i + 6 + skipSpaces] == QLatin1Char('}')) { 1028 /// If we found a modifier which is followed by 1029 /// an opening curly bracket followed by a letter 1030 /// followed by two closing curly brackets, 1031 /// we are looking at something like {\"{A}} 1032 /// Use lookup table to see what Unicode char this 1033 /// represents 1034 const QChar unicodeLetter = lookupTable[lookupTablePos]->unicode[cachedAsciiLetterOrDigitToPos]; 1035 if (unicodeLetter.unicode() < 127) { 1036 /// This combination of modifier and letter is not known, 1037 /// so try to preserve it 1038 output.append(QStringView{input}.mid(i, 7 + skipSpaces)); 1039 qCDebug(LOG_KBIBTEX_IO) << input.mid(qMax(0, i - 5), 10); 1040 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to translate this into Unicode: " << input.mid(i, 7 + skipSpaces); 1041 } else 1042 output.append(unicodeLetter); 1043 /// Step over those additional characters 1044 i += 6 + skipSpaces; 1045 found = true; 1046 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 7 && input[i + 3 + skipSpaces] == QLatin1Char('{') && input[i + 4 + skipSpaces] == QLatin1Char('\\') && isIJ(input[i + 5 + skipSpaces]) && input[i + 6 + skipSpaces] == QLatin1Char('}') && input[i + 7 + skipSpaces] == QLatin1Char('}')) { 1047 /// This is the case for {\'{\i}} or alike. 1048 for (const DotlessIJCharacter &dotlessIJCharacter : dotlessIJCharacters) 1049 if ((dotlessIJCharacter.direction & DirectionCommandToUnicode) && dotlessIJCharacter.letter == input[i + 5 + skipSpaces] && dotlessIJCharacter.modifier == input[i + 2]) { 1050 output.append(QChar(dotlessIJCharacter.unicode)); 1051 found = true; 1052 break; 1053 } 1054 if (!found) { 1055 /// This combination of modifier and letter is not known, 1056 /// so try to preserve it 1057 output.append(QStringView{input}.mid(i, 8 + skipSpaces)); 1058 qCDebug(LOG_KBIBTEX_IO) << input.mid(qMax(0, i - 5), 10); 1059 qCWarning(LOG_KBIBTEX_IO) << "Cannot interpret BACKSLASH" << input[i + 2] << "BACKSLASH {" << input[i + 5 + skipSpaces] << "}"; 1060 } 1061 /// Step over those additional characters 1062 i += 7 + skipSpaces; 1063 found = true; 1064 } 1065 1066 if (!found) { 1067 /// Now, either some two-letter command like {\AA} or {\mu} is left 1068 /// to check for or there is completely unsuppored command sequence, 1069 /// but which then should be kept unmodified 1070 const QString alpha = readAlphaCharacters(input, i + 2); 1071 int nextPosAfterAlpha = i + 2 + alpha.size(); 1072 if (nextPosAfterAlpha < input.length() && input[nextPosAfterAlpha] == QLatin1Char('}')) { 1073 /// We may deal with a string like {\AA} or {\mu} 1074 /// Check which command it is, then insert corresponding Unicode character 1075 found = false; 1076 for (const EncoderLaTeXCharacterCommand &encoderLaTeXCharacterCommand : encoderLaTeXCharacterCommands) { 1077 if ((encoderLaTeXCharacterCommand.direction & DirectionCommandToUnicode) && encoderLaTeXCharacterCommand.command == alpha) { 1078 output.append(QChar(encoderLaTeXCharacterCommand.unicode)); 1079 found = true; 1080 break; 1081 } 1082 } 1083 1084 /// Check if a math command has been read, 1085 /// like \subset 1086 /// (automatically skipped if command was found above) 1087 if (!found) 1088 for (const MathCommand &mathCommand : mathCommands) { 1089 if ((mathCommand.direction & DirectionCommandToUnicode) && mathCommand.command == alpha) { 1090 output.append(QChar(mathCommand.unicode)); 1091 found = true; 1092 break; 1093 } 1094 } 1095 1096 if (!found) { 1097 /// Dealing with a string like {\noopsort} 1098 /// (see BibTeX documentation where this gets explained) 1099 output.append(QStringView{input}.mid(i, 3 + alpha.size())); 1100 } 1101 i = nextPosAfterAlpha; 1102 } else { 1103 /// Could be something like {\tt filename.txt} 1104 /// Keep it as it is 1105 output.append(c); 1106 } 1107 } 1108 } else { 1109 /// Nothing special, copy input char to output 1110 output.append(c); 1111 } 1112 } else if (c == QLatin1Char('\\') && i < len - 1) { 1113 /// Second case: A backslash as in \"o 1114 1115 /// Sometimes such command are closed with just {}, 1116 /// so remember if to check for that 1117 bool checkForExtraCurlyAtEnd = false; 1118 1119 /// Check if there follows a modifier after the backslash 1120 /// For example an quotation mark as used in \"a 1121 const int lookupTablePos = modifierInLookupTable(input[i + 1]); 1122 1123 /// Check for spaces between modifier and character, for example 1124 /// like \H o 1125 int skipSpaces = 0; 1126 while (i + 2 + skipSpaces < len && input[i + 2 + skipSpaces] == QLatin1Char(' ') && skipSpaces < 16) ++skipSpaces; 1127 1128 bool found = false; 1129 if (lookupTablePos >= 0 && (skipSpaces > 0 || !input[i + 1].isLetter()) && i + skipSpaces <= len - 3 && (cachedAsciiLetterOrDigitToPos = asciiLetterOrDigitToPos(input[i + 2 + skipSpaces])) >= 0) { 1130 /// We found a special modifier which is followed by 1131 /// a letter followed by normal text without any 1132 /// delimiter, so we are looking at something like 1133 /// \"u inside Kr\"uger 1134 /// Use lookup table to see what Unicode char this 1135 /// represents 1136 const QChar unicodeLetter = lookupTable[lookupTablePos]->unicode[cachedAsciiLetterOrDigitToPos]; 1137 if (unicodeLetter.unicode() > 127) { 1138 output.append(unicodeLetter); 1139 /// Step over those additional characters 1140 i += 2 + skipSpaces; 1141 found = true; 1142 } 1143 /// Don't print any warnings yet, as this if-case may got triggered by e.g. \mu 1144 /// ('m' is a potential modifier, yet \mu should be recognized as Greek letter later) 1145 } else if (lookupTablePos >= 0 && (skipSpaces > 0 || !input[i + 1].isLetter()) && i + skipSpaces <= len - 3 && (cachedAsciiLetterOrDigitToPos = asciiLetterOrDigitToPos(input[i + 2 + skipSpaces])) >= 0 && (i + skipSpaces == len - 3 || input[i + 3 + skipSpaces] == QLatin1Char('}') || input[i + 3 + skipSpaces] == QLatin1Char('{') || input[i + 3 + skipSpaces] == QLatin1Char(' ') || input[i + 3 + skipSpaces] == QLatin1Char('\t') || input[i + 3 + skipSpaces] == QLatin1Char('\\') || input[i + 3 + skipSpaces] == QLatin1Char('\r') || input[i + 3 + skipSpaces] == QLatin1Char('\n'))) { 1146 /// We found a modifier which is followed by 1147 /// a letter followed by a command delimiter such 1148 /// as a whitespace, so we are looking at something 1149 /// like \"u followed by a space or another delimiter 1150 /// Use lookup table to see what Unicode char this 1151 /// represents 1152 const QChar unicodeLetter = lookupTable[lookupTablePos]->unicode[cachedAsciiLetterOrDigitToPos]; 1153 if (unicodeLetter.unicode() >= 127) { 1154 output.append(unicodeLetter); 1155 /// Step over those additional characters 1156 i += 2 + skipSpaces; 1157 found = true; 1158 1159 if (input[i + 1] != QLatin1Char(' ') && input[i + 1] != QLatin1Char('\r') && input[i + 1] != QLatin1Char('\n')) { 1160 /// If no whitespace follows, still 1161 /// check for extra curly brackets 1162 checkForExtraCurlyAtEnd = true; 1163 } 1164 } 1165 /// Don't print any warnings yet, as this if-case may got triggered by e.g. \mu 1166 /// ('m' is a potential modifier, yet \mu should be recognized as Greek letter later) 1167 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 4 && input[i + 2 + skipSpaces] == QLatin1Char('{') && (cachedAsciiLetterOrDigitToPos = asciiLetterOrDigitToPos(input[i + 3 + skipSpaces])) >= 0 && input[i + 4 + skipSpaces] == QLatin1Char('}')) { 1168 /// We found a modifier which is followed by an opening 1169 /// curly bracket followed a letter followed by a closing 1170 /// curly bracket, so we are looking at something 1171 /// like \"{u} 1172 /// Use lookup table to see what Unicode char this 1173 /// represents 1174 const QChar unicodeLetter = lookupTable[lookupTablePos]->unicode[cachedAsciiLetterOrDigitToPos]; 1175 if (unicodeLetter.unicode() < 127) { 1176 /// This combination of modifier and letter is not known, 1177 /// so try to preserve it 1178 output.append(QStringView{input}.mid(i, 5 + skipSpaces)); 1179 qCDebug(LOG_KBIBTEX_IO) << input.mid(qMax(0, i - 5), 10); 1180 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to translate this into Unicode: " << input.mid(i, 5 + skipSpaces); 1181 } else 1182 output.append(unicodeLetter); 1183 /// Step over those additional characters 1184 i += 4 + skipSpaces; 1185 found = true; 1186 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 3 && input[i + 2 + skipSpaces] == QLatin1Char('\\') && isIJ(input[i + 3 + skipSpaces])) { 1187 /// This is the case for \'\i or alike. 1188 for (const DotlessIJCharacter &dotlessIJCharacter : dotlessIJCharacters) 1189 if ((dotlessIJCharacter.direction & DirectionCommandToUnicode) && dotlessIJCharacter.letter == input[i + 3 + skipSpaces] && dotlessIJCharacter.modifier == input[i + 1]) { 1190 output.append(QChar(dotlessIJCharacter.unicode)); 1191 found = true; 1192 break; 1193 } 1194 if (!found) { 1195 /// This combination of modifier and letter is not known, 1196 /// so try to preserve it 1197 output.append(QStringView{input}.mid(i, 4 + skipSpaces)); 1198 qCWarning(LOG_KBIBTEX_IO) << "Cannot interpret BACKSLASH" << input[i + 1] << "BACKSLASH" << input[i + 3 + skipSpaces]; 1199 } 1200 /// Step over those additional characters 1201 i += 3 + skipSpaces; 1202 found = true; 1203 } else if (lookupTablePos >= 0 && i + skipSpaces < len - 5 && input[i + 2 + skipSpaces] == QLatin1Char('{') && input[i + 3 + skipSpaces] == QLatin1Char('\\') && isIJ(input[i + 4 + skipSpaces]) && input[i + 5 + skipSpaces] == QLatin1Char('}')) { 1204 /// This is the case for \'{\i} or alike. 1205 for (const DotlessIJCharacter &dotlessIJCharacter : dotlessIJCharacters) 1206 if ((dotlessIJCharacter.direction & DirectionCommandToUnicode) && dotlessIJCharacter.letter == input[i + 4 + skipSpaces] && dotlessIJCharacter.modifier == input[i + 1]) { 1207 output.append(QChar(dotlessIJCharacter.unicode)); 1208 found = true; 1209 break; 1210 } 1211 if (!found) { 1212 /// This combination of modifier and letter is not known, 1213 /// so try to preserve it 1214 output.append(QStringView{input}.mid(i, 6 + skipSpaces)); 1215 qCWarning(LOG_KBIBTEX_IO) << "Cannot interpret BACKSLASH" << input[i + 1] << "BACKSLASH {" << input[i + 4 + skipSpaces] << "}"; 1216 } 1217 /// Step over those additional characters 1218 i += 5 + skipSpaces; 1219 found = true; 1220 } 1221 1222 if (!found && i < len - 1) { 1223 /// Now, the case of something like \AA is left 1224 /// to check for 1225 const QString alpha = readAlphaCharacters(input, i + 1); 1226 int nextPosAfterAlpha = i + alpha.size(); 1227 if (alpha.size() >= 1 && alpha.at(0).isLetter()) { 1228 /// We are dealing actually with a string like \AA or \o 1229 /// Check which command it is, 1230 /// insert corresponding Unicode character 1231 for (const EncoderLaTeXCharacterCommand &encoderLaTeXCharacterCommand : encoderLaTeXCharacterCommands) { 1232 if ((encoderLaTeXCharacterCommand.direction & DirectionCommandToUnicode) && encoderLaTeXCharacterCommand.command == alpha) { 1233 output.append(QChar(encoderLaTeXCharacterCommand.unicode)); 1234 found = true; 1235 break; 1236 } 1237 } 1238 1239 /// Check if a math command has been read, 1240 /// like \subset 1241 /// (automatically skipped if command was found above) 1242 if (!found) 1243 for (const MathCommand &mathCommand : mathCommands) { 1244 if ((mathCommand.direction & DirectionCommandToUnicode) && mathCommand.command == alpha) { 1245 if (currentMathModeTop() == MathModeNone) 1246 qCDebug(LOG_KBIBTEX_IO) << "Found math mode command" << QString(QStringLiteral("\\%1")).arg(alpha) << "outside of a math expression"; 1247 output.append(QChar(mathCommand.unicode)); 1248 found = true; 1249 break; 1250 } 1251 } 1252 1253 if (found) { 1254 /// Now, after a command, a whitespace may follow 1255 /// which has to get "eaten" as it acts as a command 1256 /// delimiter 1257 if (nextPosAfterAlpha + 1 < input.length() && (input[nextPosAfterAlpha + 1] == QLatin1Char(' ') || input[nextPosAfterAlpha + 1] == QLatin1Char('\r') || input[nextPosAfterAlpha + 1] == QLatin1Char('\n'))) 1258 ++nextPosAfterAlpha; 1259 else { 1260 /// If no whitespace follows, still 1261 /// check for extra curly brackets 1262 checkForExtraCurlyAtEnd = true; 1263 } 1264 } else { 1265 /// No command found? Just copy input char to output 1266 output.append(QStringView{input}.mid(i, 1 + alpha.size())); 1267 1268 if (alpha == QStringLiteral("ensuremath") && input[nextPosAfterAlpha + 1] == QLatin1Char('{')) { 1269 currentMathMode.push(MathModeEnsureMath); 1270 popEnsureMathAtOpenCurlyBacketCounter.push(openCurlyBracketCounterEnsureMath); 1271 ++openCurlyBracketCounterEnsureMath; 1272 output.append(QLatin1Char('{')); 1273 ++nextPosAfterAlpha; 1274 } 1275 } 1276 i = nextPosAfterAlpha; 1277 } else { 1278 /// Maybe we are dealing with a string like \& or \_ 1279 /// Check which command it is 1280 found = false; 1281 for (const QChar &encoderLaTeXProtectedSymbol : encoderLaTeXProtectedSymbols) 1282 if (encoderLaTeXProtectedSymbol == input[i + 1]) { 1283 output.append(encoderLaTeXProtectedSymbol); 1284 found = true; 1285 break; 1286 } 1287 1288 if (!found && currentMathModeTop() == MathModeNone) 1289 for (const QChar &encoderLaTeXProtectedTextOnlySymbol : encoderLaTeXProtectedTextOnlySymbols) 1290 if (encoderLaTeXProtectedTextOnlySymbol == input[i + 1]) { 1291 output.append(encoderLaTeXProtectedTextOnlySymbol); 1292 found = true; 1293 break; 1294 } 1295 1296 /// If command has been found, nothing has to be done 1297 /// except for hopping over this backslash 1298 if (found) 1299 ++i; 1300 else if (i < len - 1 && input[i + 1] == QChar(0x002c /* comma */)) { 1301 /// Found a thin space: \, 1302 /// Replacing Latex-like thin space with Unicode thin space 1303 output.append(QChar(0x2009)); 1304 // found = true; ///< only necessary if more tests will follow in the future 1305 ++i; 1306 found = true; 1307 } else { 1308 /// Nothing special, copy input char to output 1309 output.append(c); 1310 found = true; 1311 } 1312 } 1313 } else if (!found) { 1314 /// Nothing special, copy input char to output 1315 output.append(c); 1316 } 1317 1318 /// Finally, check if there may be extra curly brackets 1319 /// like {} and hop over them 1320 if (checkForExtraCurlyAtEnd && i < len - 2 && input[i + 1] == QLatin1Char('{') && input[i + 2] == QLatin1Char('}')) 1321 i += 2; 1322 } else { 1323 /// So far, no opening curly bracket and no backslash 1324 /// May still be a symbol sequence like --- 1325 bool isSymbolSequence = false; 1326 /// Go through all known symbol sequnces 1327 for (const EncoderLaTeXSymbolSequence &encoderLaTeXSymbolSequence : encoderLaTeXSymbolSequences) { 1328 /// First, check if read input character matches beginning of symbol sequence 1329 /// and input buffer as enough characters left to potentially contain 1330 /// symbol sequence 1331 const int latexLen = encoderLaTeXSymbolSequence.latex.length(); 1332 if ((encoderLaTeXSymbolSequence.direction & DirectionCommandToUnicode) && encoderLaTeXSymbolSequence.latex[0] == c && i <= len - latexLen) { 1333 /// Now actually check if symbol sequence is in input buffer 1334 isSymbolSequence = true; 1335 for (int p = 1; isSymbolSequence && p < latexLen; ++p) 1336 isSymbolSequence &= encoderLaTeXSymbolSequence.latex[p] == input[i + p]; 1337 if (isSymbolSequence) { 1338 /// Ok, found sequence: insert Unicode character in output 1339 /// and hop over sequence in input buffer 1340 output.append(QChar(encoderLaTeXSymbolSequence.unicode)); 1341 i += encoderLaTeXSymbolSequence.latex.length() - 1; 1342 break; 1343 } 1344 } 1345 } 1346 1347 if (!isSymbolSequence) { 1348 /// No symbol sequence found, so just copy input to output 1349 output.append(c); 1350 1351 /// Still, check if input character is a dollar sign 1352 /// without a preceding backslash, means toggling between 1353 /// text mode and math mode 1354 if (c == QLatin1Char('$') && (i == 0 || input[i - 1] != QLatin1Char('\\'))) { 1355 if (currentMathModeTop() == MathModeDollar) 1356 currentMathMode.pop(); //< the Dollar sign that got just read closes the math mode 1357 else 1358 currentMathMode.push(MathModeDollar); //< the Dollar sign that got just read starts a new math mode 1359 } 1360 if (currentMathModeTop() == MathModeEnsureMath) { 1361 if (c == QLatin1Char('{')) 1362 ++openCurlyBracketCounterEnsureMath; 1363 else if (c == QLatin1Char('}')) 1364 --openCurlyBracketCounterEnsureMath; 1365 if (!popEnsureMathAtOpenCurlyBacketCounter.empty() && openCurlyBracketCounterEnsureMath == popEnsureMathAtOpenCurlyBacketCounter.top()) { 1366 popEnsureMathAtOpenCurlyBacketCounter.pop(); 1367 currentMathMode.pop(); 1368 } 1369 } 1370 } 1371 } 1372 } 1373 1374 output.squeeze(); 1375 return output; 1376 } 1377 1378 bool EncoderLaTeX::testAndCopyVerbatimCommands(const QString &input, int &pos, QString &output) const 1379 { 1380 int copyBytesCount = 0; 1381 int openedClosedCurlyBrackets = 0; 1382 1383 /// check for \url 1384 if (pos < input.length() - 6 && QStringView{input}.mid(pos, 5) == QStringLiteral("\\url{")) { 1385 copyBytesCount = 5; 1386 openedClosedCurlyBrackets = 1; 1387 } 1388 1389 if (copyBytesCount > 0) { 1390 while (openedClosedCurlyBrackets > 0 && pos + copyBytesCount < input.length()) { 1391 ++copyBytesCount; 1392 if (input[pos + copyBytesCount] == QLatin1Char('{') && input[pos + copyBytesCount - 1] != QLatin1Char('\\')) ++openedClosedCurlyBrackets; 1393 else if (input[pos + copyBytesCount] == QLatin1Char('}') && input[pos + copyBytesCount - 1] != QLatin1Char('\\')) --openedClosedCurlyBrackets; 1394 } 1395 1396 output.append(QStringView{input}.mid(pos, copyBytesCount)); 1397 pos += copyBytesCount; 1398 } 1399 1400 return copyBytesCount > 0; 1401 } 1402 1403 QString EncoderLaTeX::encode(const QString &ninput, const TargetEncoding targetEncoding) const 1404 { 1405 /// Perform Canonical Decomposition followed by Canonical Composition 1406 const QString input = ninput.normalized(QString::NormalizationForm_C); 1407 1408 int len = input.length(); 1409 QString output; 1410 output.reserve(((len >> 10) + 2) << 10); // reserving multiples of 1024 Bytes 1411 enum MathMode { 1412 MathModeNone = 0, MathModeDollar, MathModeEnsureMath 1413 }; 1414 QStack<MathMode> currentMathMode; 1415 #define currentMathModeTop() (currentMathMode.empty()?MathModeNone:currentMathMode.top()) 1416 int openCurlyBracketCounterEnsureMath = 0; 1417 QStack<int> popEnsureMathAtOpenCurlyBacketCounter; 1418 1419 /// Go through input char by char 1420 for (int i = 0; i < len; ++i) { 1421 /** 1422 * Repeatedly check if input data contains a verbatim command 1423 * like \url{...}, append it to output, and update i to point 1424 * to the next character after the verbatim command. 1425 */ 1426 while (testAndCopyVerbatimCommands(input, i, output)); 1427 if (i >= len) break; 1428 1429 const QChar c = input[i]; 1430 1431 if (targetEncoding == TargetEncoding::ASCII && c.unicode() > 127) { 1432 /// If current char is outside ASCII boundaries ... 1433 bool found = false; 1434 1435 if (!found && !currentMathMode.empty()) { 1436 /// Ok, test for math commands if already in math mode 1437 for (const MathCommand &mathCommand : mathCommands) 1438 if ((mathCommand.direction & DirectionUnicodeToCommand) && mathCommand.unicode == c.unicode()) { 1439 output.append(QString(QStringLiteral("\\%1")).arg(mathCommand.command)); 1440 const QChar peekAhead = i < len - 1 ? input[i + 1] : QChar(); 1441 if (peekAhead != QLatin1Char('\\') && peekAhead != QLatin1Char('}') && peekAhead != QLatin1Char('$')) { 1442 // Between current command and following character a separator is necessary 1443 // FIXME This peek-ahead won't do its job properly, as it is not yet known 1444 // whether the next character will be kept as-is or rewritten to, for example, a LaTeX command 1445 // Example: if the complete input string is '$µµ$' and the current variable 'c' comes from 1446 // the first 'µ', it will assume that curly brackets are necessary, thus the final output 1447 // becomes '$\mu{}\mu$ despite that '$\mu\mu$' would have been a better output. 1448 output.append(QStringLiteral("{}")); 1449 } 1450 found = true; 1451 break; 1452 } 1453 } 1454 1455 /// Handle special cases of i without a dot (\i) 1456 for (const DotlessIJCharacter &dotlessIJCharacter : dotlessIJCharacters) 1457 if ((dotlessIJCharacter.direction & DirectionUnicodeToCommand) && c.unicode() == dotlessIJCharacter.unicode) { 1458 // FIXME Find a better solution, as the curly brackets are unnecessary in some situations 1459 // e.g. '{\'\i}{\'\i}' should better be '{\'\i\'\i}' 1460 output.append(QString(QStringLiteral("{\\%1\\%2}")).arg(dotlessIJCharacter.modifier, dotlessIJCharacter.letter)); 1461 found = true; 1462 break; 1463 } 1464 1465 if (!found) { 1466 /// ... test if there is a symbol sequence like --- 1467 /// to encode it 1468 for (const EncoderLaTeXSymbolSequence &encoderLaTeXSymbolSequence : encoderLaTeXSymbolSequences) 1469 if (encoderLaTeXSymbolSequence.unicode == c.unicode() && (encoderLaTeXSymbolSequence.direction & DirectionUnicodeToCommand)) { 1470 for (int l = 0; l < encoderLaTeXSymbolSequence.latex.length(); ++l) 1471 output.append(encoderLaTeXSymbolSequence.latex[l]); 1472 found = true; 1473 break; 1474 } 1475 } 1476 1477 if (!found) { 1478 /// Ok, no symbol sequence. Let's test character 1479 /// commands like \ss 1480 for (const EncoderLaTeXCharacterCommand &encoderLaTeXCharacterCommand : encoderLaTeXCharacterCommands) 1481 if (encoderLaTeXCharacterCommand.unicode == c.unicode() && (encoderLaTeXCharacterCommand.direction & DirectionUnicodeToCommand)) { 1482 // FIXME Find a better solution, as the curly brackets are unnecessary in some situations 1483 // e.g. '{\command}{\command}' should better be '{\command\command}' 1484 output.append(QString(QStringLiteral("{\\%1}")).arg(encoderLaTeXCharacterCommand.command)); 1485 found = true; 1486 break; 1487 } 1488 } 1489 1490 if (!found) { 1491 /// Ok, neither a character command. Let's test 1492 /// escaped characters with modifiers like \"a 1493 for (const EncoderLaTeXEscapedCharacter &encoderLaTeXEscapedCharacter : encoderLaTeXEscapedCharacters) 1494 if ((encoderLaTeXEscapedCharacter.direction & DirectionUnicodeToCommand) && encoderLaTeXEscapedCharacter.unicode == c.unicode()) { 1495 // FIXME Find a better solution, as the curly brackets are unnecessary in some situations 1496 // e.g. '{\"a}{\"a}' should better be '{\"a\"a}' 1497 const QString formatString = isAsciiLetter(encoderLaTeXEscapedCharacter.modifier) ? QStringLiteral("{\\%1 %2}") : QStringLiteral("{\\%1%2}"); 1498 output.append(formatString.arg(encoderLaTeXEscapedCharacter.modifier).arg(encoderLaTeXEscapedCharacter.letter)); 1499 found = true; 1500 break; 1501 } 1502 } 1503 1504 if (!found && currentMathMode.empty()) { 1505 /// Ok, test for math commands, even if outside of a math mode, then enter math mode for this character 1506 for (const MathCommand &mathCommand : mathCommands) 1507 if ((mathCommand.direction & DirectionUnicodeToCommand) && mathCommand.unicode == c.unicode()) { 1508 // FIXME Find a better solution, as the \ensuremath should span several characters 1509 // e.g. '\ensuremath{\alpha}\ensuremath{\alpha}' should better be '\ensuremath{\alpha\alpha}' 1510 output.append(QString(QStringLiteral("\\ensuremath{\\%1}")).arg(mathCommand.command)); 1511 found = true; 1512 break; 1513 } 1514 } 1515 1516 if (!found && c.unicode() == 0x2009) { 1517 /// Thin space 1518 output.append(QStringLiteral("\\,")); 1519 found = true; 1520 } 1521 1522 if (!found) { 1523 qCDebug(LOG_KBIBTEX_IO) << input.mid(qMax(0, i - 5), 10); 1524 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to encode Unicode char" << QString(QStringLiteral("0x%1")).arg(c.unicode(), 4, 16, QLatin1Char('0')); 1525 output.append(c); 1526 } 1527 } else if ((targetEncoding == TargetEncoding::ASCII && c.unicode() <= 127) 1528 || targetEncoding == TargetEncoding::UTF8 1529 /** but not targetEncoding == TargetEncoding::RAW */) { 1530 /// Current character is normal ASCII 1531 /// and targetEncoding was set to accept only ASCII characters 1532 /// -- or -- targetEncoding was set to accept UTF-8 characters 1533 1534 /// Still, some characters have special meaning 1535 /// in TeX and have to be preceded with a backslash 1536 bool found = false; 1537 for (const QChar &encoderLaTeXProtectedSymbol : encoderLaTeXProtectedSymbols) 1538 if (encoderLaTeXProtectedSymbol == c) { 1539 output.append(QLatin1Char('\\')).append(c); 1540 found = true; 1541 break; 1542 } 1543 1544 if (!found && !currentMathMode.empty()) { 1545 /// Ok, test for math commands if already in math mode 1546 for (const MathCommand &mathCommand : mathCommands) 1547 if ((mathCommand.direction & DirectionUnicodeToCommand) && mathCommand.unicode == c.unicode()) { 1548 output.append(QString(QStringLiteral("\\%1")).arg(mathCommand.command)); 1549 const QChar peekAhead = i < len - 1 ? input[i + 1] : QChar(); 1550 if (peekAhead != QLatin1Char('\\') && peekAhead != QLatin1Char('}') && peekAhead != QLatin1Char('$')) { 1551 // Between current command and following character a separator is necessary 1552 // FIXME This peek-ahead won't do its job properly, as it is not yet known 1553 // whether the next character will be kept as-is or rewritten to, for example, a LaTeX command 1554 // Example: if the complete input string is '$µµ$' and the current variable 'c' comes from 1555 // the first 'µ', it will assume that curly brackets are necessary, thus the final output 1556 // becomes '$\mu{}\mu$ despite that '$\mu\mu$' would have been a better output. 1557 output.append(QStringLiteral("{}")); 1558 } 1559 found = true; 1560 break; 1561 } 1562 } 1563 1564 if (!found && currentMathMode.empty()) 1565 for (const QChar &encoderLaTeXProtectedTextOnlySymbol : encoderLaTeXProtectedTextOnlySymbols) 1566 if (encoderLaTeXProtectedTextOnlySymbol == c) { 1567 output.append(QLatin1Char('\\')).append(c); 1568 found = true; 1569 break; 1570 } 1571 1572 if (!found) { 1573 /// Well, either this is not a special character or 1574 /// we do not know what to do with it, so just dump it into the output 1575 output.append(c); 1576 found = true; 1577 } 1578 1579 /// Finally, check if input character is a dollar sign 1580 /// without a preceding backslash, means toggling between 1581 /// text mode and math mode 1582 if (c == QLatin1Char('$') && (i == 0 || input[i - 1] != QLatin1Char('\\'))) { 1583 if (currentMathMode.empty()) 1584 currentMathMode.push(MathModeDollar); 1585 else if (currentMathModeTop() == MathModeDollar) 1586 currentMathMode.pop(); 1587 else if (currentMathModeTop() == MathModeEnsureMath) 1588 currentMathMode.push(MathModeDollar); 1589 } else if (output.right(12) == QStringLiteral("\\ensuremath{")) { 1590 currentMathMode.push(MathModeEnsureMath); 1591 popEnsureMathAtOpenCurlyBacketCounter.push(openCurlyBracketCounterEnsureMath); 1592 // ++openCurlyBracketCounterEnsureMath; //< not necessary as right below both 1593 /// 'currentMathModeTop() == MathModeEnsureMath' and 'c == QLatin1Char('{')' 1594 /// will be true 1595 } 1596 if (currentMathModeTop() == MathModeEnsureMath) { 1597 if (c == QLatin1Char('{')) 1598 ++openCurlyBracketCounterEnsureMath; 1599 else if (c == QLatin1Char('}')) 1600 --openCurlyBracketCounterEnsureMath; 1601 if (!popEnsureMathAtOpenCurlyBacketCounter.empty() && openCurlyBracketCounterEnsureMath == popEnsureMathAtOpenCurlyBacketCounter.top()) { 1602 popEnsureMathAtOpenCurlyBacketCounter.pop(); 1603 currentMathMode.pop(); 1604 } 1605 } 1606 } 1607 } 1608 1609 output.squeeze(); 1610 return output; 1611 } 1612 1613 int EncoderLaTeX::modifierInLookupTable(const QChar modifier) const 1614 { 1615 for (int m = 0; m < lookupTableNumModifiers && lookupTable[m] != nullptr; ++m) 1616 if (lookupTable[m]->modifier == modifier) return m; 1617 return -1; 1618 } 1619 1620 QString EncoderLaTeX::readAlphaCharacters(const QString &base, int startFrom) const 1621 { 1622 const int len = base.size(); 1623 for (int j = startFrom; j < len; ++j) { 1624 if (!isAsciiLetter(base[j])) 1625 return base.mid(startFrom, j - startFrom); 1626 } 1627 return base.mid(startFrom); 1628 } 1629 1630 const EncoderLaTeX &EncoderLaTeX::instance() 1631 { 1632 static const EncoderLaTeX self; 1633 return self; 1634 }