]> git.immae.eu Git - github/shaarli/Shaarli.git/blob - application/formatter/BookmarkMarkdownFormatter.php
Manually fix remaining PHPCS errors
[github/shaarli/Shaarli.git] / application / formatter / BookmarkMarkdownFormatter.php
1 <?php
2
3 namespace Shaarli\Formatter;
4
5 use Shaarli\Config\ConfigManager;
6
7 /**
8 * Class BookmarkMarkdownFormatter
9 *
10 * Format bookmark description into Markdown format.
11 *
12 * @package Shaarli\Formatter
13 */
class BookmarkMarkdownFormatter extends BookmarkDefaultFormatter
{
    /**
     * When this tag is present in a bookmark, its description should not be processed with Markdown
     */
    public const NO_MD_TAG = 'nomarkdown';

    /** @var \Parsedown instance */
    protected $parsedown;

    /**
     * @var bool used to escape HTML in Markdown or not.
     *           It MUST be set to true for shared instance as HTML content can
     *           introduce XSS vulnerabilities.
     */
    protected $escape;

    /**
     * @var array List of allowed protocols for links inside bookmark's description.
     */
    protected $allowedProtocols;

    /**
     * BookmarkMarkdownFormatter constructor.
     *
     * Reads `security.markdown_escape` (default: true) and
     * `security.allowed_protocols` (default: empty list) from the configuration.
     *
     * @param ConfigManager $conf       instance
     * @param bool          $isLoggedIn
     */
    public function __construct(ConfigManager $conf, bool $isLoggedIn)
    {
        parent::__construct($conf, $isLoggedIn);

        $this->parsedown = new \Parsedown();
        $this->escape = $conf->get('security.markdown_escape', true);
        $this->allowedProtocols = $conf->get('security.allowed_protocols', []);
    }

    /**
     * Render the bookmark description as Markdown, wrapped in `<div class="markdown">`.
     *
     * Bookmarks carrying the NO_MD_TAG tag are delegated to the parent formatter untouched.
     *
     * @inheritdoc
     */
    public function formatDescription($bookmark)
    {
        // Strict comparison: only the exact tag string disables Markdown.
        if (in_array(self::NO_MD_TAG, $bookmark->getTags(), true)) {
            return parent::formatDescription($bookmark);
        }

        // tokenizeSearchHighlightField()/replaceTokens() are inherited helpers (not visible here);
        // NOTE(review): presumably they protect search-highlight markers while Markdown runs - confirm.
        $processedDescription = $this->tokenizeSearchHighlightField(
            $bookmark->getDescription() ?? '',
            $bookmark->getAdditionalContentEntry('search_highlight')['description'] ?? []
        );

        // Pre-processing on the raw Markdown text, before it is turned into HTML.
        $processedDescription = $this->filterProtocols($processedDescription);
        $processedDescription = $this->formatHashTags($processedDescription);
        $processedDescription = $this->reverseEscapedHtml($processedDescription);

        $processedDescription = $this->parsedown
            ->setMarkupEscaped($this->escape)
            ->setBreaksEnabled(true)
            ->text($processedDescription);

        // Post-processing on the generated HTML.
        $processedDescription = $this->sanitizeHtml($processedDescription);
        $processedDescription = $this->replaceTokens($processedDescription);

        if (!empty($processedDescription)) {
            $processedDescription = '<div class="markdown">' . $processedDescription . '</div>';
        }

        return $processedDescription;
    }

    /**
     * Remove the NO markdown tag if it is present.
     *
     * The tag is only hidden from visitors; logged in users still get it in the list.
     * The resulting list is re-indexed so it stays a sequential array.
     *
     * @inheritdoc
     */
    protected function formatTagList($bookmark)
    {
        $out = parent::formatTagList($bookmark);
        if ($this->isLoggedIn !== false) {
            return $out;
        }

        $pos = array_search(self::NO_MD_TAG, $out, true);
        if ($pos === false) {
            return $out;
        }

        unset($out[$pos]);

        return array_values($out);
    }

    /**
     * Replace not whitelisted protocols with http:// in given description.
     * Also adds `index_url` to relative links if it's specified
     *
     * Only Markdown link targets, i.e. the `](...)` part, are rewritten.
     *
     * @param string $description input description text.
     *
     * @return string $description without malicious link.
     */
    protected function filterProtocols($description)
    {
        $allowedProtocols = $this->allowedProtocols;
        $indexUrl = ! empty($this->contextData['index_url']) ? $this->contextData['index_url'] : '';

        return preg_replace_callback(
            '#]\((.*?)\)#is',
            static function ($match) use ($allowedProtocols, $indexUrl) {
                $target = $match[1];
                // Relative targets (`?...` or `/...`) are prefixed with the index URL.
                $isRelative = startsWith($target, '?') || startsWith($target, '/');
                $link = ($isRelative ? $indexUrl : '') . whitelist_protocols($target, $allowedProtocols);

                return '](' . $link . ')';
            },
            $description
        );
    }

    /**
     * Replace hashtag in Markdown links format
     * E.g. `#hashtag` becomes `[#hashtag](./add-tag/hashtag)`
     * It includes the index URL if specified.
     *
     * Lines inside a fenced code block (```), and lines indented as code, are left untouched.
     *
     * @param string $description
     *
     * @return string
     */
    protected function formatHashTags($description)
    {
        $indexUrl = ! empty($this->contextData['index_url']) ? $this->contextData['index_url'] : '';

        /*
         * To support unicode: http://stackoverflow.com/a/35498078/1484919
         * \p{Pc} - to match underscore
         * \p{N} - numeric character in any script
         * \p{L} - letter from any language
         * \p{Mn} - any non-spacing mark (accents, umlauts, etc)
         */
        $regex = '/(^|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}]+)/mui';

        // A callback is used instead of a replacement string so that `$n` or `\n`
        // sequences in the index URL can never be interpreted as backreferences.
        $toLink = static function ($match) use ($indexUrl) {
            return $match[1] . '[#' . $match[2] . '](' . $indexUrl . './add-tag/' . $match[2] . ')';
        };

        $codeBlockOn = false;
        $linesOut = [];

        foreach (explode(PHP_EOL, $description) as $descriptionLine) {
            // Detect line of code: starting with at least 4 spaces,
            // except lists which can start with +/*/- or `2.` after spaces.
            $codeLineOn = preg_match('/^ {4,}(?=[^\+\*\-])(?=(?!\d\.).)/', $descriptionLine) > 0;

            // A fence line toggles the code block state (opens it when closed, closes it when open).
            if (preg_match('/^```/', $descriptionLine) > 0) {
                $codeBlockOn = !$codeBlockOn;
            }

            if (!$codeBlockOn && !$codeLineOn) {
                $descriptionLine = preg_replace_callback($regex, $toLink, $descriptionLine);
            }

            $linesOut[] = $descriptionLine;
        }

        return implode(PHP_EOL, $linesOut);
    }

    /**
     * Remove dangerous HTML tags (script, iframe, etc.) and inline event handlers.
     * Doesn't affect <code> content (already escaped by Parsedown).
     *
     * This is a best-effort, regex based filter: `security.markdown_escape` remains
     * the reliable protection for shared instances.
     *
     * @param string $description input description text.
     *
     * @return string given string escaped.
     */
    protected function sanitizeHtml($description)
    {
        $escapeTags = [
            'script',
            'style',
            'link',
            'iframe',
            'frameset',
            'frame',
        ];
        foreach ($escapeTags as $tag) {
            $description = preg_replace_callback(
                '#<\s*' . $tag . '[^>]*>(.*</\s*' . $tag . '[^>]*>)?#is',
                static function ($match) {
                    return escape($match[0]);
                },
                $description
            );
        }

        // The pattern is greedy and consumes the tag from its opening `<`, so a single
        // pass only strips the LAST `on*` attribute of each tag. Repeat until nothing is
        // left to strip; every replacement shortens the string, so the loop terminates.
        do {
            $description = preg_replace(
                '#(<[^>]+\s)on[a-z]*="?[^ "]*"?#is',
                '$1',
                $description,
                -1,
                $removed
            );
        } while ($removed > 0);

        return $description;
    }

    /**
     * Undo the escaping applied to the description before it is handed to Parsedown.
     *
     * Delegates to the global `unescape()` helper.
     * NOTE(review): presumably this decodes the HTML entities added when the bookmark
     * was escaped for display - confirm against the helper's implementation.
     *
     * @param string $description input description text.
     *
     * @return string result of `unescape()` on the given description.
     */
    protected function reverseEscapedHtml($description)
    {
        return unescape($description);
    }
}