src/rendering/break_lines.cpp

0001 #include <break_lines.h>
0002 #include <QLibrary>
0003 #include <QTextCodec>
0004 #include <stdio.h>
0005 #include <stdlib.h>
0006
/* If HAVE_LIBTHAI is defined, libkhtml links against libthai
 * at compile time. Otherwise it tries to dlopen libthai
 * at run time.
 *
 * Ott Pattara Nov 14, 2004
 */
0013
#ifdef HAVE_LIBTHAI
// libthai is linked in at build time: th_brk() comes from its headers.
#include <thai/thailib.h>
#include <thai/thbrk.h>
#else
// No build-time libthai: th_brk is a function pointer that stays null
// until the symbol has been resolved from the shared library at run time.
using th_brk_def = int (*)(const unsigned char *, int[], int);
static th_brk_def th_brk = nullptr;
#endif
0021
0022 namespace khtml
0023 {
0024 struct ThaiCache {
0025     ThaiCache()
0026     {
0027         string = nullptr;
0028         allocated = 0x400;
0029         wbrpos = (int *) malloc(allocated * sizeof(int));
0030         numwbrpos = 0;
0031         numisbreakable = 0x400;
0032         isbreakable = (int *) malloc(numisbreakable * sizeof(int));
0033         library = nullptr;
0034     }
0035     ~ThaiCache()
0036     {
0037         free(wbrpos);
0038         free(isbreakable);
0039         if (library) {
0040             library->unload();
0041         }
0042         delete library;
0043     }
0044     const QChar *string;
0045     int *wbrpos;
0046     int *isbreakable;
0047     int allocated;
0048     int numwbrpos, numisbreakable;
0049     QLibrary *library;
0050 };
0051 static ThaiCache *cache = nullptr;
0052
0053 void cleanup_thaibreaks()
0054 {
0055     delete cache;
0056     cache = nullptr;
0057 #ifndef HAVE_LIBTHAI
0058     th_brk = nullptr;
0059 #endif
0060 }
0061
0062 bool isBreakableThai(const QChar *string, const int pos, const int len)
0063 {
0064     static QTextCodec *thaiCodec = QTextCodec::codecForMib(2259);
0065     //printf("Entering isBreakableThai with pos = %d\n", pos);
0066
0067 #ifndef HAVE_LIBTHAI
0068
0069     QLibrary *lib = new QLibrary(QLatin1String("libthai"));
0070
0071     /* load libthai dynamically */
0072     if ((!th_brk) && thaiCodec) {
0073         printf("Try to load libthai dynamically...\n");
0074         if (lib->load()) {
0075             th_brk = (th_brk_def) lib->resolve("th_brk");
0076         }
0077         if (!th_brk) {
0078             // indication that loading failed and we shouldn't try to load again
0079             printf("Error, can't load libthai...\n");
0080             thaiCodec = nullptr;
0081             if (lib->isLoaded()) {
0082                 lib->unload();
0083             }
0084         }
0085     }
0086
0087     if (!th_brk) {
0088         return true;
0089     }
0090 #endif
0091
0092     if (!cache) {
0093         cache = new ThaiCache;
0094 #ifndef HAVE_LIBTHAI
0095         cache->library = lib;
0096 #endif
0097     }
0098
0099     // build up string of thai chars
0100     if (string != cache->string) {
0101         //fprintf(stderr,"new string found (not in cache), calling libthai\n");
0102         QByteArray cstr = thaiCodec->fromUnicode(QString::fromRawData(string, len));
0103         //printf("About to call libthai::th_brk with str: %s",cstr.data());
0104
0105         cache->numwbrpos = th_brk((const unsigned char *) cstr.data(), cache->wbrpos, cache->allocated);
0106         //fprintf(stderr,"libthai returns with value %d\n",cache->numwbrpos);
0107         if (cache->numwbrpos > cache->allocated) {
0108             cache->allocated = cache->numwbrpos;
0109             cache->wbrpos = (int *)realloc(cache->wbrpos, cache->allocated * sizeof(int));
0110             cache->numwbrpos = th_brk((const unsigned char *) cstr.data(), cache->wbrpos, cache->allocated);
0111         }
0112         if (len > cache->numisbreakable) {
0113             cache->numisbreakable = len;
0114             cache->isbreakable = (int *)realloc(cache->isbreakable, cache->numisbreakable * sizeof(int));
0115         }
0116         for (int i = 0; i < len; ++i) {
0117             cache->isbreakable[i] = 0;
0118         }
0119         if (cache->numwbrpos > 0) {
0120             for (int i = cache->numwbrpos - 1; i >= 0; --i) {
0121                 cache->isbreakable[cache->wbrpos[i]] = 1;
0122             }
0123         }
0124         cache->string = string;
0125     }
0126     //printf("Returning %d\n", cache->isbreakable[pos]);
0127     return cache->isbreakable[pos];
0128 }
0129
0130 /*
0131   array of unicode codes where breaking shouldn't occur.
0132   (in sorted order because of using with binary search)
0133   these are currently for Japanese, though simply adding
0134   Korean, Chinese ones should work as well
0135 */
/*
  dontbreakbefore[] contains characters not covered by QChar::Punctuation_Close that shouldn't be broken before.
  Characters included in QChar::Punctuation_Close are listed below (see UAX #14):
     - 3001 ideographic comma
     - 3002 ideographic full stop
     - FE50 small comma
     - FE52 small full stop
     - FF0C fullwidth comma
     - FF0E fullwidth full stop
     - FF61 halfwidth ideographic full stop
     - FF64 halfwidth ideographic comma
  These characters are commented out in the table.
*/
// Code points that must not start a line. Keep the entries in ascending
// order: the table is looked up with a binary search. Entries that are
// commented out are intentionally disabled.
static const ushort dontbreakbefore[] = {
    // --- CJK symbols and punctuation ---
    // 0x3001  ideographic comma
    // 0x3002  ideographic full stop
    0x3005, // ideographic iteration mark
    0x3009, // right angle bracket
    0x300B, // right double angle bracket
    0x300D, // right corner bracket
    0x300F, // right white corner bracket
    0x3011, // right black lenticular bracket
    0x3015, // right tortoise shell bracket

    // --- hiragana ---
    0x3041, // small a
    0x3043, // small i
    0x3045, // small u
    0x3047, // small e
    0x3049, // small o
    0x3063, // small tsu
    0x3083, // small ya
    0x3085, // small yu
    0x3087, // small yo
    0x308E, // small wa
    0x309B, // voiced sound mark
    0x309C, // semi-voiced sound mark
    0x309D, // iteration mark
    0x309E, // voiced iteration mark

    // --- katakana ---
    0x30A1, // small a
    0x30A3, // small i
    0x30A5, // small u
    0x30A7, // small e
    0x30A9, // small o
    0x30C3, // small tsu
    0x30E3, // small ya
    0x30E5, // small yu
    0x30E7, // small yo
    0x30EE, // small wa
    0x30F5, // small ka
    0x30F6, // small ke
    0x30FC, // prolonged sound mark
    0x30FD, // iteration mark
    0x30FE, // voiced iteration mark

    // --- small form variants ---
    // 0xFE50  small comma
    // 0xFE52  small full stop

    // --- fullwidth forms ---
    0xFF01, // fullwidth exclamation mark
    0xFF09, // fullwidth right parenthesis
    // 0xFF0C  fullwidth comma
    0xFF0D, // fullwidth hyphen-minus
    // 0xFF0E  fullwidth full stop
    0xFF1F, // fullwidth question mark
    0xFF3D, // fullwidth right square bracket
    0xFF5D, // fullwidth right curly bracket

    // --- halfwidth forms ---
    // 0xFF61  halfwidth ideographic full stop
    0xFF63, // halfwidth right corner bracket
    // 0xFF64  halfwidth ideographic comma
    0xFF67, // halfwidth katakana letter small a
    0xFF68, // halfwidth katakana letter small i
    0xFF69, // halfwidth katakana letter small u
    0xFF6A, // halfwidth katakana letter small e
    0xFF6B, // halfwidth katakana letter small o
    0xFF6C, // halfwidth katakana letter small ya
    0xFF6D, // halfwidth katakana letter small yu
    0xFF6E, // halfwidth katakana letter small yo
    0xFF6F, // halfwidth katakana letter small tu
    0xFF70  // halfwidth katakana-hiragana prolonged sound mark
};
0212
// Code points after which no break is allowed and that aren't covered by
// QChar::Punctuation_Open. Kept in ascending order for the binary search.
static const ushort dontbreakafter[] = {
    0x3012, // postal mark
    0xFF03, // fullwidth pound (number) sign
    0xFF04, // fullwidth dollar sign
    0xFF20, // fullwidth @
    0xFFE1, // fullwidth british pound sign
    0xFFE5  // fullwidth yen sign
};
0222
/*
 * Binary search over the ascending table `arr` of `count` entries.
 *
 * Returns false when `val` is present (breaking is forbidden for it) and
 * true when it is absent. An empty table contains nothing, so it yields
 * true instead of reading out of bounds.
 */
static bool break_bsearch(const ushort *arr, const unsigned int count, const ushort val)
{
    // Half-open search window [lo, hi); it shrinks on every iteration.
    unsigned int lo = 0;
    unsigned int hi = count;

    while (lo < hi) {
        const unsigned int mid = lo + (hi - lo) / 2;
        if (arr[mid] == val) {
            return false;
        }
        if (arr[mid] < val) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }

    return true;
}
0242
0243 bool isBreakable(const QChar *str, const int pos, int len)
0244 {
0245     const QChar *c = str + pos;
0246     unsigned short ch = c->unicode();
0247     if (ch > 0xff) {
0248         // not latin1, need to do more sophisticated checks for asian fonts
0249         unsigned char row = c->row();
0250         if (row == 0x0e) {
0251             // 0e00 - 0e7f == Thai
0252             if (c->cell() < 0x80) {
0253                 // consult libthai
0254                 return isBreakableThai(str, pos, len);
0255             } else {
0256                 return false;
0257             }
0258         }
0259         if ((row > 0x2d && row < 0xfb) || row == 0x11) {
0260             /* asian line breaking. */
0261             if (pos == 0) {
0262                 return false;    // never break before first character
0263             }
0264
0265             // check for simple punctuation cases
0266             QChar::Category cat = c->category();
0267             if (cat == QChar::Punctuation_Close ||
0268                     cat == QChar::Punctuation_Other ||
0269                     (str + (pos - 1))->category() == QChar::Punctuation_Open) {
0270                 return false;
0271             }
0272
0273             // do binary search in dontbreak[]
0274             return break_bsearch(dontbreakbefore, (sizeof(dontbreakbefore) / sizeof(*dontbreakbefore)), c->unicode()) &&
0275                    break_bsearch(dontbreakafter, (sizeof(dontbreakafter) / sizeof(*dontbreakafter)), (str + (pos - 1))->unicode());
0276         } else { // no asian font
0277             return c->isSpace();
0278         }
0279     } else {
0280         if (ch == ' ' || ch == '\n') {
0281             return true;
0282         }
0283     }
0284     return false;
0285 }
0286
0287 }