HTMLPurifier/Injector/AutoParagraph.php

0001 <?php
0002
/**
 * Injector that automatically wraps text in <p> tags, using a double
 * newline ("\n\n") as the paragraph separator.
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector needs; only 'p' is listed.
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token with the
     * 'MakeWellFormed_TagClosedError' armor flag set to true.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $start = new HTMLPurifier_Token_Start('p');
        $start->armor['MakeWellFormed_TagClosedError'] = true;
        return $start;
    }

    /**
     * Processes a text token. When paragraphs are called for, $token is
     * replaced (by reference) with an array of tokens; otherwise it is
     * left untouched.
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;

        // Case A: the current parent does not allow <p> tags.
        if (!$this->allowsElement('p')) {
            if (empty($this->currentNesting)) {
                return;
            }
            $parent = $this->currentNesting[count($this->currentNesting) - 1];
            if ($parent->name == 'p') {
                // State 3.1: ...<p>PAR1
                //                  ----
                // State 3.2: ...<p>PAR1\n\nPAR2
                //                  ------------
                // We are already inside a paragraph; split the text without
                // emitting a leading <p>.
                $token = array();
                $this->_splitText($text, $token);
            }
            // Otherwise:
            // State 4.1: ...<b>PAR1
            //                  ----
            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
            // Nothing to do.
            return;
        }

        // Case B: we are nested inside an element and the text has no
        // double newline, so we cannot yet tell whether <p> is required.
        if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
            // State 2:   <div>PAR1... (similar to 1.4)
            //                 ----
            if ($this->_pLookAhead()) {
                // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                //                 ----
                // Note: This will always be the first child, since any
                // previous inline element would have triggered this very
                // same routine, and found the double newline. One possible
                // exception would be a comment.
                $token = array($this->_pStart(), $token);
            }
            // Otherwise:
            // State 2.2.1: <div>PAR1<div>
            //                   ----
            // State 2.2.2: <div>PAR1<b>PAR1</b></div>
            //                   ----
            return;
        }

        // Case C: text in the anonymous root node, or text anywhere that
        // contains a double newline. Both are treated the same way.
        $i = $nesting = null;
        $hasFollowing = $this->forwardUntilEndToken($i, $current, $nesting);

        if ($token->is_whitespace) {
            if (!$hasFollowing) {
                // State 1.1: ...    ^ (whitespace, then document end)
                //               ----
                // Degenerate case, leave the token alone.
                return;
            }
            if (!$this->_isInline($current)) {
                // State 1.5: \n<hr />
                //            --
                return;
            }
        }

        // State 1.2: PAR1
        //            ----
        // State 1.3: PAR1\n\nPAR2
        //            ------------
        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
        //                 ------------
        $token = array($this->_pStart());
        $this->_splitText($text, $token);
    }

    /**
     * Processes an element token. When the element needs to live inside a
     * paragraph, or needs a separating double newline in front of it,
     * $token is replaced (by reference) with an array of tokens.
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if (!$this->allowsElement('p')) {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
            return;
        }

        if (empty($this->currentNesting)) {
            // We are in the root node.
            if ($this->_isInline($token)) {
                // State 3.1: <b>
                //            ---
                // This is where the {p} tag is inserted, not reflected in
                // inputTokens yet, however.
                $token = array($this->_pStart(), $token);
            }
            // Otherwise:
            // State 3.2: <div>
            //            -----

            $i = null;
            if (!$this->backward($i, $prev)) {
                return;
            }
            if ($prev instanceof HTMLPurifier_Token_Text) {
                // State 3.1.2: ...</p>\n\n{p}<b>
                //                            ---
                // State 3.2.2: ...</p>\n\n<div>
                //                         -----
                // Note: PAR<ELEM> cannot occur because PAR would have been
                // wrapped in <p> tags.
                return;
            }
            // State 3.1.1: ...</p>{p}<b>
            //                        ---
            // State 3.2.1: ...</p><div>
            //                     -----
            // Put a double newline in front of whatever we are emitting.
            $token = array_merge(
                array(new HTMLPurifier_Token_Text("\n\n")),
                is_array($token) ? $token : array($token)
            );
            return;
        }

        // From here on we are nested inside some element.
        if (!$this->_isInline($token)) {
            // State 2.3: ...<div>
            //               -----
            return;
        }

        // State 1: <div>...<b>
        //                  ---
        // Check if this token is adjacent to the parent token
        // (seek backwards until token isn't whitespace)
        $i = null;
        $this->backward($i, $prev);

        if ($prev instanceof HTMLPurifier_Token_Start) {
            // State 1.2.1: <div><b>
            //                   ---
            // Look ahead to see if <p> is needed:
            //   State 1.3.1: <div><b>PAR1\n\nPAR2          => yes
            //   State 1.3.2: <div><b>PAR1</b></div>        => no
            //   State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div> => no
            $wrap = $this->_pLookAhead();
        } else {
            // Token wasn't adjacent to the parent's start tag.
            //   State 1.1.4: <div><p>PAR1</p>\n\n<b>  => yes
            //     (Quite frankly, this should be handled by splitText)
            //   State 1.1.1: <div><p>PAR1</p><b>      => no
            //   State 1.1.2: <div><br /><b>           => no
            //   State 1.1.3: <div>PAR<b>              => no
            $wrap = $prev instanceof HTMLPurifier_Token_Text
                && substr($prev->data, -2) === "\n\n";
        }

        if ($wrap) {
            $token = array($this->_pStart(), $token);
        }
    }

    /**
     * Splits up a text into paragraph tokens and appends them to the
     * result stream that will replace the original token.
     * @param string $data String text data that will be processed
     *    into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $pieces = explode("\n\n", $data);
        $total = count($pieces);

        if ($total === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }

        $paragraphs = array(); // pieces that are not blank
        $openAfterBreak = false; // a <p> must follow the leading </p>\n\n
        $closeAtEnd = false; // the final paragraph gets an explicit </p>

        foreach ($pieces as $index => $piece) {
            if (trim($piece) !== '') {
                $paragraphs[] = $piece;
                continue;
            }
            if ($index === 0) {
                // Double newline at the front
                if (empty($result)) {
                    // The empty result indicates that the AutoParagraph
                    // injector did not add any start paragraph tokens.
                    // This means that we have been in a paragraph for
                    // a while, and the newline means we should start a
                    // new one.
                    $result[] = new HTMLPurifier_Token_End('p');
                    $result[] = new HTMLPurifier_Token_Text("\n\n");
                    // However, the start token should only be added if
                    // there are real paragraphs in here. If there are
                    // none, the next start paragraph tag will be handled
                    // by the next call to the injector.
                    $openAfterBreak = true;
                } else {
                    // We just started a new paragraph!
                    // Reinstate a double-newline for presentation's sake,
                    // since it was in the source code.
                    array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                }
            } elseif ($index + 1 === $total) {
                // Double newline at the end: there should be a trailing
                // </p> when we're finally done.
                $closeAtEnd = true;
            }
        }

        // The text was nothing but whitespace.
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($openAfterBreak) {
            $result[] = $this->_pStart();
        }

        // Emit "text </p> \n\n <p>" for every paragraph.
        foreach ($paragraphs as $paragraph) {
            $result[] = new HTMLPurifier_Token_Text($paragraph);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Always drop the trailing start token; the Injector will handle
        // this later if it was indeed needed. This saves a lookahead here
        // at the cost of a lookbehind later.
        // When no end tag is needed, also drop the trailing "\n\n" and
        // </p> and let MakeWellFormed close the paragraph later.
        $surplus = $closeAtEnd ? 1 : 3;
        array_splice($result, -$surplus);
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags), judged by the child elements of the 'p' definition.
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return bool
     */
    private function _pLookAhead()
    {
        // Start the nesting count at 1 when the current token is itself a
        // start tag, 0 otherwise.
        $nesting = $this->currentToken instanceof HTMLPurifier_Token_Start ? 1 : 0;
        $i = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $verdict = $this->_checkNeedsP($current);
            if ($verdict !== null) {
                // A definite answer was found; stop scanning.
                return $verdict;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken.
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if it is
     *    definitely not, null if this token does not settle the question.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            // <div>PAR1<div>
            //      ----
            // A block element terminates the search early; an inline one
            // leaves the question open.
            return $this->_isInline($current) ? null : false;
        }

        if ($current instanceof HTMLPurifier_Token_Text
            && strpos($current->data, "\n\n") !== false
        ) {
            // <div>PAR1<b>PAR1\n\nPAR2
            //      ----
            return true;
        }

        // <div>PAR1<b>PAR1...
        //      ----
        return null;
    }
}
0355
0356 // vim: et sw=4 sts=4