File indexing completed on 2024-12-22 05:36:20

0001 <?php
0002 
0003 /**
0004  * Takes a well formed list of tokens and fixes their nesting.
0005  *
0006  * HTML elements dictate which elements are allowed to be their children,
0007  * for example, you can't have a p tag in a span tag.  Other elements have
0008  * much more rigorous definitions: tables, for instance, require a specific
0009  * order for their elements.  There are also constraints not expressible by
0010  * document type definitions, such as the chameleon nature of ins/del
0011  * tags and global child exclusions.
0012  *
0013  * The first major objective of this strategy is to iterate through all
0014  * the nodes and determine whether or not their children conform to the
0015  * element's definition.  If they do not, the child definition may
0016  * optionally supply an amended list of elements that is valid or
0017  * require that the entire node be deleted (and the previous node
0018  * rescanned).
0019  *
0020  * The second objective is to ensure that explicitly excluded elements of
0021  * an element do not appear in its children.  Code that accomplishes this
0022  * task is pervasive through the strategy, though the two are distinct tasks
0023  * and could, theoretically, be separated (although it's not recommended).
0024  *
0025  * @note Whether or not unrecognized children are silently dropped or
0026  *       translated into text depends on the child definitions.
0027  *
0028  * @todo Enable nodes to be bubbled out of the structure.  This is
0029  *       easier with our new algorithm.
0030  */
0031 
0032 class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
0033 {
0034 
0035     /**
0036      * @param HTMLPurifier_Token[] $tokens Well formed token list; it is arborized into a tree
0037      * @param HTMLPurifier_Config $config Supplies the HTML definition and Core.DisableExcludes
0038      * @param HTMLPurifier_Context $context Holds IsInline, CurrentNode and CurrentToken during the run
0039      * @return array|HTMLPurifier_Token[] Whatever HTMLPurifier_Arborize::flatten() yields for the validated tree
0040      */
0041     public function execute($tokens, $config, $context)
0042     {
0043 
0044         //####################################################################//
0045         // Pre-processing
0046 
0047         // O(n) pass to convert to a tree, so that we can efficiently
0048         // refer to substrings
0049         $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
0050 
0051         // fetch the HTML definition that supplies the per-element rules
0052         $definition = $config->getHTMLDefinition();
0053 
0054         $excludes_enabled = !$config->get('Core.DisableExcludes'); // exclusion checks run unless disabled
0055 
0056         // setup the context variable 'IsInline', for chameleon processing
0057         // is 'false' when we are not inline, 'true' when it must always
0058         // be inline, and an integer when it is inline for a certain
0059         // branch of the document tree
0060         $is_inline = $definition->info_parent_def->descendants_are_inline;
0061         $context->register('IsInline', $is_inline); // NOTE(review): presumably bound by reference, so loop reassignments are visible - confirm
0062 
0063         // setup error collector ($e is checked for truthiness before each send())
0064         $e =& $context->get('ErrorCollector', true);
0065 
0066         //####################################################################//
0067         // Loop initialization
0068 
0069         // stack that contains all elements that are excluded,
0070         // organized by parent elements.
0071         // NOTE(review): $exclude_stack is never read again in this method;
0072         // the exclusions actually used travel inside each $stack frame.
0073         $exclude_stack = array($definition->info_parent_def->excludes);
0074 
0075         // variable that contains the start token while we are processing
0076         // nodes. This enables error reporting to do its job
0077         $node = $top_node;
0078         // token pair of the root node; only $token is used, $d is never read
0079         list($token, $d) = $node->toTokenPair();
0080         $context->register('CurrentNode', $node);
0081         $context->register('CurrentToken', $token);
0082 
0083         //####################################################################//
0084         // Loop
0085 
0086         // We need to implement a post-order traversal iteratively, to
0087         // avoid running into stack space limits.  This is pretty tricky
0088         // to reason about, so we just manually stack-ify the recursive
0089         // variant:
0090         //
0091         //  function f($node) {
0092         //      foreach ($node->children as $child) {
0093         //          f($child);
0094         //      }
0095         //      validate($node);
0096         //  }
0097         //
0098         // Thus, we will represent a stack frame as array($node,
0099         // $is_inline, $excludes, $ix), where $ix is the index of the
0100         // next child of $node still to be visited; children before $ix
0101         // have already been processed.
0102 
0103         $parent_def = $definition->info_parent_def;
0104         $stack = array(
0105             array($top_node,
0106                   $parent_def->descendants_are_inline, // $is_inline
0107                   $parent_def->excludes, // exclusions
0108                   0) // $ix: start with the first child
0109             );
0110 
0111         while (!empty($stack)) {
0112             list($node, $is_inline, $excludes, $ix) = array_pop($stack);
0113             // recursive call: descend into the next unvisited element child, if any
0114             $go = false;
0115             $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name]; // empty stack after the pop means $node is the root
0116             while (isset($node->children[$ix])) {
0117                 $child = $node->children[$ix++];
0118                 if ($child instanceof HTMLPurifier_Node_Element) {
0119                     $go = true;
0120                     $stack[] = array($node, $is_inline, $excludes, $ix); // resume this node after the child
0121                     $stack[] = array($child,
0122                         // ToDo: I don't think it matters if it's def or
0123                         // child_def, but double check this...
0124                         $is_inline || $def->descendants_are_inline,
0125                         empty($def->excludes) ? $excludes
0126                                               : array_merge($excludes, $def->excludes),
0127                         0);
0128                     break;
0129                 }
0130             };
0131             if ($go) continue; // a child frame was pushed; process it before validating $node
0132             list($token, $d) = $node->toTokenPair(); // refresh $token for the node being validated
0133             // base case: every element child has been visited, so validate $node itself
0134             if ($excludes_enabled && isset($excludes[$node->name])) {
0135                 $node->dead = true;
0136                 if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
0137             } else {
0138                 // XXX I suppose it would be slightly more efficient to
0139                 // avoid the allocation here and have children
0140                 // strategies handle it
0141                 $children = array();
0142                 foreach ($node->children as $child) {
0143                     if (!$child->dead) $children[] = $child;
0144                 }
0145                 $result = $def->child->validateChildren($children, $config, $context);
0146                 if ($result === true) {
0147                     // children are valid as given; keep the list with dead nodes filtered out
0148                     $node->children = $children;
0149                 } elseif ($result === false) {
0150                     $node->dead = true;
0151                     if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
0152                 } else {
0153                     $node->children = $result;
0154                     if ($e) {
0155                         // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
0156                         if (empty($result) && !empty($children)) {
0157                             $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
0158                         } else if ($result != $children) {
0159                             $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
0160                         }
0161                     }
0162                 }
0163             }
0164         }
0165 
0166         //####################################################################//
0167         // Post-processing
0168 
0169         // remove context variables
0170         $context->destroy('IsInline');
0171         $context->destroy('CurrentNode');
0172         $context->destroy('CurrentToken');
0173 
0174         //####################################################################//
0175         // Return
0176 
0177         return HTMLPurifier_Arborize::flatten($node, $config, $context); // $node is the root here: its frame is the last one popped
0178     }
0179 }
0180 
0181 // vim: et sw=4 sts=4