]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * Text containing a double newline ("\n\n") is split into separate <p>
 * elements, and inline content that sits directly in a context allowing
 * <p> is wrapped in a paragraph when needed. The numbered "State" comments
 * inside the handlers name each token configuration that is handled; the
 * dashes underline the token currently being processed.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * Name of this injector.
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector needs; it emits <p> tokens.
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token with the
     * 'MakeWellFormed_TagClosedError' armor flag set on it.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, replacing it (by reference) with an array of
     * tokens when paragraph tags have to be inserted around or inside it.
     * The token is left untouched when no paragraph handling applies.
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                // All three are filled in by reference by the lookahead call.
                $i = $nesting = $current = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // Start from an empty result: _splitText() treats that as
            // "already inside a paragraph" and emits the closing tag itself.
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token, replacing it (by reference) with an array
     * of tokens when a <p> start tag and/or a separating double newline has
     * to be inserted in front of it.
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    // NOTE(review): only a single step backwards is taken
                    // here; whitespace-only text tokens are not skipped.
                    $i = null;
                    $prev = null; // stays null when there is no previous token
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                $prev = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        // $token may already be an array if a {p} was
                        // prepended above; normalise before unshifting.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param string $data String text data that will be processed
     *        into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *        tags will be appended onto. An empty array on entry means the
     *        caller did not open a paragraph itself.
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags)
     * @param HTMLPurifier_Token|null $token Token to test. Callers pass
     *        lookahead results without checking their type, so null and
     *        tokens without a name are tolerated and reported as not inline.
     * @return bool
     */
    private function _isInline($token)
    {
        // Guard first: without an element name there is nothing to look up,
        // and reading a missing property would be an error in its own right.
        if (!isset($token->name)) {
            return false;
        }
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return bool True as soon as upcoming text contains a double newline;
     *         false if a non-inline start tag is met first or the lookahead
     *         runs out of tokens.
     */
    private function _pLookAhead()
    {
        // When the current token is itself a start tag, begin one level deep
        // so the lookahead continues past that tag's own end token.
        if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
            $nesting = 1;
        } else {
            $nesting = 0;
        }
        $ok = false;
        $i = null;
        $current = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                // A definite answer was reached; null means "keep looking".
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if it is
     *         definitely not needed, null if this token does not decide.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }
}
355 | \r | |
356 | // vim: et sw=4 sts=4\r |