File indexing completed on 2025-02-02 05:43:43
<?php

/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * Both handlers receive the current token by reference and may replace it
 * with an array of tokens (for example a synthetic <p> start token followed
 * by the original token). The "State x.y" comments inside the handlers show
 * an example token stream; the dashes underline the token being handled.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector needs in order to operate.
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token flagged with the
     * 'MakeWellFormed_TagClosedError' armor entry.
     * NOTE(review): presumably the armor stops MakeWellFormed from reporting
     * an error when it auto-closes this synthetic tag -- confirm.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, wrapping it in and/or splitting it into
     * paragraphs where the current parent allows it.
     * @param HTMLPurifier_Token_Text $token Passed by reference; replaced by
     *      an array of tokens whenever paragraph tags have to be inserted.
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                $current = null; // filled in by reference by the look-ahead
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    // $current is only inspected when the text is pure
                    // whitespace, in which case the look-ahead above
                    // succeeded and populated it.
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------
            // Start from an empty result so that _splitText knows no <p>
            // start token was injected by us.
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token, inserting a <p> start token (and, in the
     * root node, a separating "\n\n" text token) in front of it when needed.
     * @param HTMLPurifier_Token $token Passed by reference; replaced by an
     *      array of tokens whenever something has to be inserted before it.
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    // NOTE(review): only a single step backwards is taken
                    // here; no whitespace tokens are skipped.
                    $i = null;
                    $prev = null; // filled in by reference by backward()
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                $prev = null; // filled in by reference by backward()
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        // The previous token is not text, so put a "\n\n"
                        // separator in front of whatever we are emitting.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     *
     * Text without any double-newline is appended as one text token.
     * Otherwise every non-blank chunk becomes a text token, consecutive
     * chunks are separated by </p>, "\n\n", <p>, and a closing </p> plus
     * "\n\n" is only emitted when the data ends in a double-newline. A
     * trailing <p> is never emitted.
     *
     * @param string $data String text data that will be processed
     *    into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result, closing and reopening the
        // paragraph only *between* chunks. No start token follows the last
        // chunk: the Injector will handle this later if it was indeed
        // needed. This prevents from needing to do a lookahead, at the cost
        // of a lookbehind later.
        $first = true;
        foreach ($paragraphs as $par) {
            if (!$first) {
                $result[] = new HTMLPurifier_Token_End('p');
                $result[] = new HTMLPurifier_Token_Text("\n\n");
                $result[] = $this->_pStart();
            }
            $result[] = new HTMLPurifier_Token_Text($par);
            $first = false;
        }

        // Only close the final paragraph when the data ended in a
        // double-newline; otherwise leave it open and let MakeWellFormed
        // close it later.
        if ($needs_end) {
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags)
     *
     * A token without a name property is reported as not inline instead of
     * being used as an array offset.
     *
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        if (!isset($token->name)) {
            return false;
        }
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return bool True when a text token containing a double-newline is
     *      found before a non-inline start token or the end of the scan.
     */
    private function _pLookAhead()
    {
        // A start token as the current token counts as one open level.
        $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $i = null;
        $current = null; // filled in by reference by the look-ahead
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                // A definite answer was found; stop scanning.
                return $result;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if it
     *      definitely is not, null if this token does not decide it.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // Example: <div>PAR1<div>, looking at the nested <div>.
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // Example: <div>PAR1<b>PAR1\n\nPAR2, looking at text that
                // contains the double-newline.
                return true;
            }
            // Example: <div>PAR1<b>PAR1..., text without a double-newline;
            // keep looking.
        }
        return null;
    }
}

// vim: et sw=4 sts=4