]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
3 | /**\r | |
4 | * Takes a well formed list of tokens and fixes their nesting.\r | |
5 | *\r | |
6 | * HTML elements dictate which elements are allowed to be their children,\r | |
7 | * for example, you can't have a p tag in a span tag. Other elements have\r | |
8 | * much more rigorous definitions: tables, for instance, require a specific\r | |
9 | * order for their elements. There are also constraints not expressible by\r | |
10 | * document type definitions, such as the chameleon nature of ins/del\r | |
11 | * tags and global child exclusions.\r | |
12 | *\r | |
13 | * The first major objective of this strategy is to iterate through all\r | |
14 | * the nodes and determine whether or not their children conform to the\r | |
15 | * element's definition. If they do not, the child definition may\r | |
16 | * optionally supply an amended list of elements that is valid or\r | |
17 | * require that the entire node be deleted (and the previous node\r | |
18 | * rescanned).\r | |
19 | *\r | |
20 | * The second objective is to ensure that explicitly excluded elements of\r | |
21 | * an element do not appear in its children. Code that accomplishes this\r | |
22 | * task is pervasive through the strategy, though the two are distinct tasks\r | |
23 | * and could, theoretically, be seperated (although it's not recommended).\r | |
24 | *\r | |
25 | * @note Whether or not unrecognized children are silently dropped or\r | |
26 | * translated into text depends on the child definitions.\r | |
27 | *\r | |
28 | * @todo Enable nodes to be bubbled out of the structure. This is\r | |
29 | * easier with our new algorithm.\r | |
30 | */\r | |
31 | \r | |
32 | class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy\r | |
33 | {\r | |
34 | \r | |
35 | /**\r | |
36 | * @param HTMLPurifier_Token[] $tokens\r | |
37 | * @param HTMLPurifier_Config $config\r | |
38 | * @param HTMLPurifier_Context $context\r | |
39 | * @return array|HTMLPurifier_Token[]\r | |
40 | */\r | |
41 | public function execute($tokens, $config, $context)\r | |
42 | {\r | |
43 | \r | |
44 | //####################################################################//\r | |
45 | // Pre-processing\r | |
46 | \r | |
47 | // O(n) pass to convert to a tree, so that we can efficiently\r | |
48 | // refer to substrings\r | |
49 | $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);\r | |
50 | \r | |
51 | // get a copy of the HTML definition\r | |
52 | $definition = $config->getHTMLDefinition();\r | |
53 | \r | |
54 | $excludes_enabled = !$config->get('Core.DisableExcludes');\r | |
55 | \r | |
56 | // setup the context variable 'IsInline', for chameleon processing\r | |
57 | // is 'false' when we are not inline, 'true' when it must always\r | |
58 | // be inline, and an integer when it is inline for a certain\r | |
59 | // branch of the document tree\r | |
60 | $is_inline = $definition->info_parent_def->descendants_are_inline;\r | |
61 | $context->register('IsInline', $is_inline);\r | |
62 | \r | |
63 | // setup error collector\r | |
64 | $e =& $context->get('ErrorCollector', true);\r | |
65 | \r | |
66 | //####################################################################//\r | |
67 | // Loop initialization\r | |
68 | \r | |
69 | // stack that contains all elements that are excluded\r | |
70 | // it is organized by parent elements, similar to $stack,\r | |
71 | // but it is only populated when an element with exclusions is\r | |
72 | // processed, i.e. there won't be empty exclusions.\r | |
73 | $exclude_stack = array($definition->info_parent_def->excludes);\r | |
74 | \r | |
75 | // variable that contains the start token while we are processing\r | |
76 | // nodes. This enables error reporting to do its job\r | |
77 | $node = $top_node;\r | |
78 | // dummy token\r | |
79 | list($token, $d) = $node->toTokenPair();\r | |
80 | $context->register('CurrentNode', $node);\r | |
81 | $context->register('CurrentToken', $token);\r | |
82 | \r | |
83 | //####################################################################//\r | |
84 | // Loop\r | |
85 | \r | |
86 | // We need to implement a post-order traversal iteratively, to\r | |
87 | // avoid running into stack space limits. This is pretty tricky\r | |
88 | // to reason about, so we just manually stack-ify the recursive\r | |
89 | // variant:\r | |
90 | //\r | |
91 | // function f($node) {\r | |
92 | // foreach ($node->children as $child) {\r | |
93 | // f($child);\r | |
94 | // }\r | |
95 | // validate($node);\r | |
96 | // }\r | |
97 | //\r | |
98 | // Thus, we will represent a stack frame as array($node,\r | |
99 | // $is_inline, stack of children)\r | |
100 | // e.g. array_reverse($node->children) - already processed\r | |
101 | // children.\r | |
102 | \r | |
103 | $parent_def = $definition->info_parent_def;\r | |
104 | $stack = array(\r | |
105 | array($top_node,\r | |
106 | $parent_def->descendants_are_inline,\r | |
107 | $parent_def->excludes, // exclusions\r | |
108 | 0)\r | |
109 | );\r | |
110 | \r | |
111 | while (!empty($stack)) {\r | |
112 | list($node, $is_inline, $excludes, $ix) = array_pop($stack);\r | |
113 | // recursive call\r | |
114 | $go = false;\r | |
115 | $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name];\r | |
116 | while (isset($node->children[$ix])) {\r | |
117 | $child = $node->children[$ix++];\r | |
118 | if ($child instanceof HTMLPurifier_Node_Element) {\r | |
119 | $go = true;\r | |
120 | $stack[] = array($node, $is_inline, $excludes, $ix);\r | |
121 | $stack[] = array($child,\r | |
122 | // ToDo: I don't think it matters if it's def or\r | |
123 | // child_def, but double check this...\r | |
124 | $is_inline || $def->descendants_are_inline,\r | |
125 | empty($def->excludes) ? $excludes\r | |
126 | : array_merge($excludes, $def->excludes),\r | |
127 | 0);\r | |
128 | break;\r | |
129 | }\r | |
130 | };\r | |
131 | if ($go) continue;\r | |
132 | list($token, $d) = $node->toTokenPair();\r | |
133 | // base case\r | |
134 | if ($excludes_enabled && isset($excludes[$node->name])) {\r | |
135 | $node->dead = true;\r | |
136 | if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');\r | |
137 | } else {\r | |
138 | // XXX I suppose it would be slightly more efficient to\r | |
139 | // avoid the allocation here and have children\r | |
140 | // strategies handle it\r | |
141 | $children = array();\r | |
142 | foreach ($node->children as $child) {\r | |
143 | if (!$child->dead) $children[] = $child;\r | |
144 | }\r | |
145 | $result = $def->child->validateChildren($children, $config, $context);\r | |
146 | if ($result === true) {\r | |
147 | // nop\r | |
148 | $node->children = $children;\r | |
149 | } elseif ($result === false) {\r | |
150 | $node->dead = true;\r | |
151 | if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');\r | |
152 | } else {\r | |
153 | $node->children = $result;\r | |
154 | if ($e) {\r | |
155 | // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators\r | |
156 | if (empty($result) && !empty($children)) {\r | |
157 | $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');\r | |
158 | } else if ($result != $children) {\r | |
159 | $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');\r | |
160 | }\r | |
161 | }\r | |
162 | }\r | |
163 | }\r | |
164 | }\r | |
165 | \r | |
166 | //####################################################################//\r | |
167 | // Post-processing\r | |
168 | \r | |
169 | // remove context variables\r | |
170 | $context->destroy('IsInline');\r | |
171 | $context->destroy('CurrentNode');\r | |
172 | $context->destroy('CurrentToken');\r | |
173 | \r | |
174 | //####################################################################//\r | |
175 | // Return\r | |
176 | \r | |
177 | return HTMLPurifier_Arborize::flatten($node, $config, $context);\r | |
178 | }\r | |
179 | }\r | |
180 | \r | |
181 | // vim: et sw=4 sts=4\r |