File indexing completed on 2024-06-09 05:04:43

0001 /*  This is RTF to HTML converter, implemented as a text filter, generally.
0002     Copyright (C) 2003 Valentin Lavrinenko, vlavrinenko@users.sourceforge.net
0003 
0004     available at http://rtf2html.sf.net
0005 
0006     Original available under the terms of the GNU LGPL2, and according
0007     to those terms, relicensed under the GNU GPL2 for inclusion in Tellico */
0008 
0009 /***************************************************************************
0010  *                                                                         *
0011  *   This program is free software; you can redistribute it and/or         *
0012  *   modify it under the terms of the GNU General Public License as        *
0013  *   published by the Free Software Foundation; either version 2 of        *
0014  *   the License or (at your option) version 3 or any later version        *
0015  *   accepted by the membership of KDE e.V. (or its successor approved     *
0016  *   by the membership of KDE e.V.), which shall act as a proxy            *
0017  *   defined in Section 14 of version 3 of the license.                    *
0018  *                                                                         *
0019  *   This program is distributed in the hope that it will be useful,       *
0020  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0021  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0022  *   GNU General Public License for more details.                          *
0023  *                                                                         *
0024  *   You should have received a copy of the GNU General Public License     *
0025  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0026  *                                                                         *
0027  ***************************************************************************/
0028 
0029 #include "rtf2html.h"
0030 #include "rtf_table.h"
0031 #include "rtf_tools.h"
0032 #include "rtf_keyword.h"
0033 #include "fmt_opts.h"
0034 
0035 #include <cstdlib>
0036 #include <stdexcept>
0037 #include <fstream>
0038 #include <iostream>
0039 #include <string>
0040 
0041 using Tellico::RTF2HTML;
0042 using namespace rtf;
0043 
0044 RTF2HTML::RTF2HTML(const QString& text) : m_text(text) {
0045 }
0046 
0047 QString RTF2HTML::toHTML() const {
0048    std::string str_in = m_text.toStdString();
0049 
0050    std::string::iterator buf_in=str_in.begin(), buf_in_end=str_in.end();
0051    colorvect colortbl;
0052    fontmap fonttbl;
0053    std::string title;
0054 
0055    bool bAsterisk=false;
0056    fo_stack foStack;
0057    formatting_options cur_options;
0058    std::string html;
0059    html_text par_html(cur_options);
0060 
0061    /* CellDefs in rtf are really queer. We'll keep a list of them in main()
0062       and will give an iterator into this list to a row */
0063    table_cell_defs_list CellDefsList;
0064    table_cell_defs_list::iterator CurCellDefs;
0065    table_cell_def* tcdCurCellDef=new table_cell_def;
0066    table_cell* tcCurCell=new table_cell;
0067    table_row* trCurRow=new table_row;
0068    table* tblCurTable=new table;
0069    int iLastRowLeft=0, iLastRowHeight=0;
0070    std::string t_str;
0071 
0072    bool bInTable=false;
0073 //   int iDocWidth=12240;
0074 //   int iMarginLeft=1800;
0075    while(buf_in!=buf_in_end)
0076    {
0077       switch (*buf_in)
0078       {
0079       case '\\':
0080       {
0081          rtf_keyword kw(++buf_in);
0082          if (kw.is_control_char())
0083             switch (kw.control_char())
0084             {
0085             case '\\': case '{': case '}':
0086                par_html.write(kw.control_char());
0087                break;
0088             case '\'':
0089             {
0090                std::string stmp(1,*buf_in++);
0091                stmp+=*buf_in++;
0092                long int code=std::strtol(stmp.c_str(), NULL, 16);
0093                switch (code)
0094                {
0095                   case 167:
0096                      par_html.write("&bull;");
0097                      break;
0098                   case 188:
0099                      par_html.write("&hellip;");
0100                      break;
0101                   default:
0102                      par_html.write((char)code);
0103                }
0104                break;
0105             }
0106             case '*':
0107                bAsterisk=true;
0108                break;
0109             case '~':
0110                par_html.write("&nbsp;");
0111                break;
0112             case '\n':
0113                par_html.write("<br><br>");
0114                break;
0115             }
0116          else //kw.is_control_char
0117             if (bAsterisk)
0118             {
0119                bAsterisk=false;
0120                skip_group(buf_in);
0121             }
0122             else
0123             {
0124                switch (kw.keyword())
0125                {
0126                case rtf_keyword::rkw_filetbl:
0127                case rtf_keyword::rkw_stylesheet:
0128                case rtf_keyword::rkw_header:
0129                case rtf_keyword::rkw_footer: case rtf_keyword::rkw_headerf:
0130                case rtf_keyword::rkw_footerf: case rtf_keyword::rkw_pict:
0131                case rtf_keyword::rkw_object:
0132                   // we'll skip such groups
0133                   skip_group(buf_in);
0134                   break;
0135                // document title
0136                case rtf_keyword::rkw_info:
0137                {
0138                   int depth=1;
0139                   bool in_title=false;
0140                   while (depth>0)
0141                   {
0142 //                     std::cout<<std::string(buf_in).substr(0,20)<<"\t"<<depth<<std::endl;
0143                      switch (*buf_in)
0144                      {
0145                      case '\\':
0146                      {
0147                         rtf_keyword kw(++buf_in);
0148                         if (kw.keyword()==rtf_keyword::rkw_title)
0149                            in_title=true;
0150                         break;
0151                      }
0152                      case '{': ++depth; ++buf_in; break;
0153                      case '}': --depth; ++buf_in; in_title=false; break;
0154                      default: if (in_title) title+=*buf_in; ++buf_in; break;
0155                      }
0156                   }
0157                   break;
0158                }
0159                // color table
0160                case rtf_keyword::rkw_colortbl:
0161                {
0162                   color clr;
0163                   while (*buf_in!='}')
0164                   {
0165                      switch (*buf_in)
0166                      {
0167                      case '\\':
0168                      {
0169                         rtf_keyword kw(++buf_in);
0170                         switch (kw.keyword())
0171                         {
0172                         case rtf_keyword::rkw_red:
0173                            clr.r=kw.parameter();
0174                            break;
0175                         case rtf_keyword::rkw_green:
0176                            clr.g=kw.parameter();
0177                            break;
0178                         case rtf_keyword::rkw_blue:
0179                            clr.b=kw.parameter();
0180                            break;
0181                         default: break;
0182                         }
0183                         break;
0184                      }
0185                      case ';':
0186                         colortbl.push_back(clr);
0187                         ++buf_in;
0188                         break;
0189                      default:
0190                         ++buf_in;
0191                         break;
0192                      }
0193                   }
0194                   ++buf_in;
0195                   break;
0196                }
0197                // font table
0198                case rtf_keyword::rkw_fonttbl:
0199                {
0200                   font fnt;
0201                   int font_num=0;
0202                   bool full_name=false;
0203                   bool in_font=false;
0204                   while (! (*buf_in=='}' && !in_font))
0205                   {
0206                      switch (*buf_in)
0207                      {
0208                      case '\\':
0209                      {
0210                         rtf_keyword kw(++buf_in);
0211                         if (kw.is_control_char() && kw.control_char()=='*')
0212                            skip_group(buf_in);
0213                         else
0214                            switch (kw.keyword())
0215                            {
0216                            case rtf_keyword::rkw_f:
0217                               font_num=kw.parameter();
0218                               break;
0219                            case rtf_keyword::rkw_fprq:
0220                               fnt.pitch=kw.parameter();
0221                               break;
0222                            case rtf_keyword::rkw_fcharset:
0223                               fnt.charset=kw.parameter();
0224                               break;
0225                            case rtf_keyword::rkw_fnil:
0226                               fnt.family=font::ff_none;
0227                               break;
0228                            case rtf_keyword::rkw_froman:
0229                               fnt.family=font::ff_serif;
0230                               break;
0231                            case rtf_keyword::rkw_fswiss:
0232                               fnt.family=font::ff_sans_serif;
0233                               break;
0234                            case rtf_keyword::rkw_fmodern:
0235                               fnt.family=font::ff_monospace;
0236                               break;
0237                            case rtf_keyword::rkw_fscript:
0238                               fnt.family=font::ff_cursive;
0239                               break;
0240                            case rtf_keyword::rkw_fdecor:
0241                               fnt.family=font::ff_fantasy;
0242                               break;
0243                            default: break;
0244                            }
0245                         break;
0246                      }
0247                      case '{':
0248                         in_font=true;
0249                         ++buf_in;
0250                         break;
0251                      case '}':
0252                         in_font=false;
0253                         fonttbl.insert(std::make_pair(font_num, fnt));
0254                         fnt=font();
0255                         full_name=false;
0256                         ++buf_in;
0257                         break;
0258                      case ';':
0259                         full_name=true;
0260                         ++buf_in;
0261                         break;
0262                      default:
0263                         if (!full_name && in_font)
0264                            fnt.name+=*buf_in;
0265                         ++buf_in;
0266                         break;
0267                      }
0268                   }
0269                   ++buf_in;
0270                   break;
0271                }
0272                // special characters
0273                case rtf_keyword::rkw_line: case rtf_keyword::rkw_softline:
0274                   par_html.write("<br>");
0275                   break;
0276                case rtf_keyword::rkw_tab:
0277                   par_html.write("&nbsp;&nbsp;");  // maybe, this can be done better
0278                   break;
0279                case rtf_keyword::rkw_enspace: case rtf_keyword::rkw_emspace:
0280                   par_html.write("&nbsp;");
0281                   break;
0282                case rtf_keyword::rkw_qmspace:
0283                   par_html.write("&thinsp;");
0284                   break;
0285                case rtf_keyword::rkw_endash:
0286                   par_html.write("&ndash;");
0287                   break;
0288                case rtf_keyword::rkw_emdash:
0289                   par_html.write("&mdash;");
0290                   break;
0291                case rtf_keyword::rkw_bullet:
0292                   par_html.write("&bull;");
0293                   break;
0294                case rtf_keyword::rkw_lquote:
0295                   par_html.write("&lsquo;");
0296                   break;
0297                case rtf_keyword::rkw_rquote:
0298                   par_html.write("&rsquo;");
0299                   break;
0300                case rtf_keyword::rkw_ldblquote:
0301                   par_html.write("&ldquo;");
0302                   break;
0303                case rtf_keyword::rkw_rdblquote:
0304                   par_html.write("&rdquo;");
0305                   break;
0306                // paragraph formatting
0307                case rtf_keyword::rkw_ql:
0308                   cur_options.papAlign=formatting_options::align_left;
0309                   break;
0310                case rtf_keyword::rkw_qr:
0311                   cur_options.papAlign=formatting_options::align_right;
0312                   break;
0313                case rtf_keyword::rkw_qc:
0314                   cur_options.papAlign=formatting_options::align_center;
0315                   break;
0316                case rtf_keyword::rkw_qj:
0317                   cur_options.papAlign=formatting_options::align_justify;
0318                   break;
0319                case rtf_keyword::rkw_fi:
0320                   cur_options.papFirst=(int)rint(kw.parameter()/20);
0321                   break;
0322                case rtf_keyword::rkw_li:
0323                   cur_options.papLeft=(int)rint(kw.parameter()/20);
0324                   break;
0325                case rtf_keyword::rkw_ri:
0326                   cur_options.papRight=(int)rint(kw.parameter()/20);
0327                   break;
0328                case rtf_keyword::rkw_sb:
0329                   cur_options.papBefore=(int)rint(kw.parameter()/20);
0330                   break;
0331                case rtf_keyword::rkw_sa:
0332                   cur_options.papAfter=(int)rint(kw.parameter()/20);
0333                   break;
0334                case rtf_keyword::rkw_pard:
0335                   cur_options.papBefore=cur_options.papAfter=0;
0336                   cur_options.papLeft=cur_options.papRight=0;
0337                   cur_options.papFirst=0;
0338                   cur_options.papAlign=formatting_options::align_left;
0339                   cur_options.papInTbl=false;
0340                   break;
0341                case rtf_keyword::rkw_par:
0342                case rtf_keyword::rkw_sect:
0343                   t_str=cur_options.get_par_str()+par_html.str()
0344                         +"&nbsp;"+par_html.close()+"</p>\n";
0345                   if (!bInTable)
0346                   {
0347                      html+=t_str;
0348                   }
0349                   else
0350                   {
0351                      if (cur_options.papInTbl)
0352                      {
0353                         tcCurCell->Text+=t_str;
0354                      }
0355                      else
0356                      {
0357                         html+=tblCurTable->make()+t_str;
0358                         bInTable=false;
0359                         tblCurTable=new table;
0360                      }
0361                   }
0362                   par_html.clear();
0363                   break;
0364                // character formatting
0365                case rtf_keyword::rkw_super:
0366                   cur_options.chpVAlign=
0367                      kw.parameter()==0?formatting_options::va_normal
0368                                       :formatting_options::va_sup;
0369                   break;
0370                case rtf_keyword::rkw_sub:
0371                   cur_options.chpVAlign=
0372                      kw.parameter()==0?formatting_options::va_normal
0373                                       :formatting_options::va_sub;
0374                   break;
0375                case rtf_keyword::rkw_b:
0376                   cur_options.chpBold=!(kw.parameter()==0);
0377                   break;
0378                case rtf_keyword::rkw_i:
0379                   cur_options.chpItalic=!(kw.parameter()==0);
0380                   break;
0381                case rtf_keyword::rkw_ul:
0382                   cur_options.chpUnderline=!(kw.parameter()==0);
0383                   break;
0384                case rtf_keyword::rkw_ulnone:
0385                   cur_options.chpUnderline=false;
0386                   break;
0387                case rtf_keyword::rkw_fs:
0388                   cur_options.chpFontSize=kw.parameter();
0389                   break;
0390                case rtf_keyword::rkw_cf:
0391                   cur_options.chpFColor=colortbl[kw.parameter()];
0392                   break;
0393                case rtf_keyword::rkw_cb:
0394                   cur_options.chpBColor=colortbl[kw.parameter()];
0395                   break;
0396                case rtf_keyword::rkw_highlight:
0397                   cur_options.chpHighlight=kw.parameter();
0398                   break;
0399                case rtf_keyword::rkw_f:
0400                   cur_options.chpFont=fonttbl[kw.parameter()];
0401                   break;
0402                case rtf_keyword::rkw_plain:
0403                   cur_options.chpBold=cur_options.chpItalic
0404                     =cur_options.chpUnderline=false;
0405                   cur_options.chpVAlign=formatting_options::va_normal;
0406                   cur_options.chpFontSize=cur_options.chpHighlight=0;
0407                   cur_options.chpFColor=cur_options.chpBColor=color();
0408                   cur_options.chpFont=font();
0409                   break;
0410                // table formatting
0411                case rtf_keyword::rkw_intbl:
0412                   cur_options.papInTbl=true;
0413                   break;
0414                case rtf_keyword::rkw_trowd:
0415                   CurCellDefs=CellDefsList.insert(CellDefsList.end(),
0416                                                   table_cell_defs());
0417                case rtf_keyword::rkw_row:
0418                   if (!trCurRow->Cells.empty())
0419                   {
0420                      trCurRow->CellDefs=CurCellDefs;
0421                      if (trCurRow->Left==-1000)
0422                         trCurRow->Left=iLastRowLeft;
0423                      if (trCurRow->Height==-1000)
0424                         trCurRow->Height=iLastRowHeight;
0425                      tblCurTable->push_back(trCurRow);
0426                      trCurRow=new table_row;
0427                   }
0428                   bInTable=true;
0429                   break;
0430                case rtf_keyword::rkw_cell:
0431                   t_str=cur_options.get_par_str()+par_html.str()
0432                         +"&nbsp;"+par_html.close()+"</p>\n";
0433                   tcCurCell->Text+=t_str;
0434                   par_html.clear();
0435                   trCurRow->Cells.push_back(tcCurCell);
0436                   tcCurCell=new table_cell;
0437                   break;
0438                case rtf_keyword::rkw_cellx:
0439                   tcdCurCellDef->Right=kw.parameter();
0440                   CurCellDefs->push_back(tcdCurCellDef);
0441                   tcdCurCellDef=new table_cell_def;
0442                   break;
0443                case rtf_keyword::rkw_trleft:
0444                   trCurRow->Left=kw.parameter();
0445                   iLastRowLeft=kw.parameter();
0446                   break;
0447                case rtf_keyword::rkw_trrh:
0448                   trCurRow->Height=kw.parameter();
0449                   iLastRowHeight=kw.parameter();
0450                   break;
0451                case rtf_keyword::rkw_clvmgf:
0452                   tcdCurCellDef->FirstMerged=true;
0453                   break;
0454                case rtf_keyword::rkw_clvmrg:
0455                   tcdCurCellDef->Merged=true;
0456                   break;
0457                case rtf_keyword::rkw_clbrdrb:
0458                   tcdCurCellDef->BorderBottom=true;
0459                   tcdCurCellDef->ActiveBorder=&(tcdCurCellDef->BorderBottom);
0460                   break;
0461                case rtf_keyword::rkw_clbrdrt:
0462                   tcdCurCellDef->BorderTop=true;
0463                   tcdCurCellDef->ActiveBorder=&(tcdCurCellDef->BorderTop);
0464                   break;
0465                case rtf_keyword::rkw_clbrdrl:
0466                   tcdCurCellDef->BorderLeft=true;
0467                   tcdCurCellDef->ActiveBorder=&(tcdCurCellDef->BorderLeft);
0468                   break;
0469                case rtf_keyword::rkw_clbrdrr:
0470                   tcdCurCellDef->BorderRight=true;
0471                   tcdCurCellDef->ActiveBorder=&(tcdCurCellDef->BorderRight);
0472                   break;
0473                case rtf_keyword::rkw_brdrnone:
0474                   if (tcdCurCellDef->ActiveBorder!=NULL)
0475                   {
0476                      *(tcdCurCellDef->ActiveBorder)=false;
0477                   }
0478                   break;
0479                case rtf_keyword::rkw_clvertalt:
0480                   tcdCurCellDef->VAlign=table_cell_def::valign_top;
0481                   break;
0482                case rtf_keyword::rkw_clvertalc:
0483                   tcdCurCellDef->VAlign=table_cell_def::valign_center;
0484                   break;
0485                case rtf_keyword::rkw_clvertalb:
0486                   tcdCurCellDef->VAlign=table_cell_def::valign_bottom;
0487                   break;
0488                // page formatting
0489                case rtf_keyword::rkw_paperw:
0490 //                  iDocWidth=kw.parameter();
0491                   break;
0492                case rtf_keyword::rkw_margl:
0493 //                  iMarginLeft=kw.parameter();
0494                   break;
0495                default: break;
0496                }
0497             }
0498          break;
0499       }
0500       case '{':
0501          // perform group opening actions here
0502          foStack.push(cur_options);
0503          ++buf_in;
0504          break;
0505       case '}':
0506          // perform group closing actions here
0507          cur_options=foStack.top();
0508          foStack.pop();
0509          ++buf_in;
0510          break;
0511       case 13:
0512       case 10:
0513          ++buf_in;
0514          break;
0515       case '<':
0516          par_html.write("&lt;");
0517          ++buf_in;
0518          break;
0519       case '>':
0520          par_html.write("&gt;");
0521          ++buf_in;
0522          break;
0523 /*      case ' ':
0524          par_html.write("&ensp;");
0525          ++buf_in;
0526          break;*/
0527       default:
0528          par_html.write(*buf_in++);
0529       }
0530    }
0531 
0532    t_str=cur_options.get_par_str()+par_html.str()
0533         +"&nbsp;"+par_html.close()+"</p>\n";
0534    html+=t_str;
0535 
0536    delete tcCurCell;
0537    delete trCurRow;
0538    delete tblCurTable;
0539    delete tcdCurCellDef;
0540 
0541    return QString::fromStdString(html);
0542 }