diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-02-21 15:57:10 +0100 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-02-21 15:57:10 +0100 |
commit | 99679d06884120c57f43b44e55e03595f1f87bed (patch) | |
tree | a3f2a1aa1afdaeca1386d0c6e8a75344fd2241fb /inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php | |
parent | 655214ab30ee84884dc408488b85586f36263fcb (diff) | |
parent | d3b47e94705e17b3ba3529cbb1dc6efe69c5d2b7 (diff) | |
download | wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.gz wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.zst wallabag-99679d06884120c57f43b44e55e03595f1f87bed.zip |
Merge pull request #481 from wallabag/dev1.5.2
1.5.2
Diffstat (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php')
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php | 356 |
1 files changed, 356 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php new file mode 100644 index 00000000..d3ec44f1 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Injector/AutoParagraph.php | |||
@@ -0,0 +1,356 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * Text containing a double newline ("\n\n") is split into separate <p>
 * elements, and inline content appearing where <p> is allowed is wrapped in
 * a <p> when needed. The "State x.y" comments inside the handlers show a
 * sample token stream for each branch; the dashes underneath mark the part
 * of the stream that the branch is looking at. Branches that contain only
 * such comments intentionally leave the token untouched.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * Name of this injector.
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector depends on.
     * NOTE(review): presumably checked by the base injector before it is
     * enabled -- confirm in HTMLPurifier_Injector.
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token for insertion into the stream.
     *
     * The token is flagged with the 'MakeWellFormed_TagClosedError' armor.
     * NOTE(review): presumably this keeps MakeWellFormed from reporting an
     * error when it auto-closes a paragraph we inserted -- confirm against
     * the MakeWellFormed strategy.
     *
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, wrapping and/or splitting it into paragraphs.
     *
     * On return $token is either left untouched or replaced by an array of
     * tokens that should take its place in the stream.
     *
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                // $current is an out-parameter of the lookahead; initialise it
                // so it is always defined when inspected below.
                $i = $nesting = $current = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2: <div>PAR1... (similar to 1.4)
                //               ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // Start from an empty replacement: _splitText() reads an empty
            // result as "already inside a paragraph".
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles a start/empty element token, inserting a <p> start token (and
     * separating "\n\n" text) in front of it where a paragraph must begin.
     *
     * On return $token is either left untouched or replaced by an array of
     * tokens that should take its place in the stream.
     *
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = $prev = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = $prev = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        // Separate this element from the previous one with a
                        // blank line in the output.
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original.
     *
     * An empty $result on entry is taken to mean that the caller did not
     * open a paragraph itself, i.e. the text already sits inside a <p>.
     * If $data contains no double newline it is appended as a single text
     * token; if it contains only whitespace no paragraphs are appended.
     *
     * @param string $data String text data that will be processed
     *          into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *          tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        foreach ($raw_paragraphs as $i => $par) {
            if (trim($par) !== '') {
                $paragraphs[] = $par;
                continue;
            }
            // Whitespace-only chunk: only its position (first/last) matters.
            if ($i === 0) {
                // Double newline at the front
                if (empty($result)) {
                    // The empty result indicates that the AutoParagraph
                    // injector did not add any start paragraph tokens.
                    // This means that we have been in a paragraph for
                    // a while, and the newline means we should start a new one.
                    $result[] = new HTMLPurifier_Token_End('p');
                    $result[] = new HTMLPurifier_Token_Text("\n\n");
                    // However, the start token should only be added if
                    // there is more processing to be done (i.e. there are
                    // real paragraphs in here). If there are none, the
                    // next start paragraph tag will be handled by the
                    // next call to the injector
                    $needs_start = true;
                } else {
                    // We just started a new paragraph!
                    // Reinstate a double-newline for presentation's sake, since
                    // it was in the source code.
                    array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                }
            } elseif ($i + 1 === $c) {
                // Double newline at the end
                // There should be a trailing </p> when we're finally done.
                $needs_end = true;
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags), i.e. its name is among the child elements that the
     * HTML definition permits inside <p>.
     *
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        // Callers may pass whatever token a lookahead produced, which is not
        // necessarily a tag; a token without a name can never be inline.
        return isset($token->name)
            && isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     *
     * Scans forward with forwardUntilEndToken() and returns the first
     * definite verdict from _checkNeedsP(); if the scan ends without one,
     * no paragraph is needed.
     *
     * @return bool
     */
    private function _pLookAhead()
    {
        // When the current token is itself a start tag, count it as one
        // level of nesting for the forward scan.
        $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $i = $current = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                return $result;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken.
     *
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if it is
     *          definitely not, null if this token does not settle the
     *          question and the scan should continue.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // e.g. <div>PAR1<div>, where $current is the second <div>
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // e.g. <div>PAR1<b>PAR1\n\nPAR2, where $current holds the
                // text containing the double newline
                return true;
            }
            // e.g. <div>PAR1<b>PAR1... -- no double newline yet, keep looking
        }
        return null;
    }
}
355 | |||
356 | // vim: et sw=4 sts=4 | ||