]> git.immae.eu Git - github/shaarli/Shaarli.git/blame - application/formatter/BookmarkMarkdownFormatter.php
Support search highlights when matching URL content
[github/shaarli/Shaarli.git] / application / formatter / BookmarkMarkdownFormatter.php
CommitLineData
336a28fa
A
1<?php
2
3namespace Shaarli\Formatter;
4
5use Shaarli\Config\ConfigManager;
9ef8555a 6use Shaarli\Formatter\Parsedown\ShaarliParsedown;
336a28fa
A
7
/**
 * Class BookmarkMarkdownFormatter
 *
 * Format bookmark description into Markdown format.
 *
 * @package Shaarli\Formatter
 */
class BookmarkMarkdownFormatter extends BookmarkDefaultFormatter
{
    /**
     * When this tag is present in a bookmark, its description should not be processed with Markdown
     */
    public const NO_MD_TAG = 'nomarkdown';

    /** @var \Parsedown instance */
    protected $parsedown;

    /**
     * @var bool used to escape HTML in Markdown or not.
     *           It MUST be set to true for shared instance as HTML content can
     *           introduce XSS vulnerabilities.
     */
    protected $escape;

    /**
     * @var array List of allowed protocols for links inside bookmark's description.
     */
    protected $allowedProtocols;

    /**
     * BookmarkMarkdownFormatter constructor.
     *
     * Reads `security.markdown_escape` (default: true) and
     * `security.allowed_protocols` (default: empty list) from the configuration.
     *
     * @param ConfigManager $conf       instance
     * @param bool          $isLoggedIn
     */
    public function __construct(ConfigManager $conf, bool $isLoggedIn)
    {
        parent::__construct($conf, $isLoggedIn);

        $this->parsedown = new ShaarliParsedown();
        $this->escape = $conf->get('security.markdown_escape', true);
        $this->allowedProtocols = $conf->get('security.allowed_protocols', []);
    }

    /**
     * Render the bookmark description as Markdown wrapped in a `<div class="markdown">`.
     *
     * Bookmarks carrying the NO_MD_TAG tag are delegated to the parent formatter.
     * Search highlight markers are tokenized before the Markdown pass and only
     * turned back into markup once the generated HTML has been sanitized.
     *
     * @inheritdoc
     */
    public function formatDescription($bookmark)
    {
        // Strict comparison: a tag must be exactly the NO_MD_TAG string.
        if (in_array(self::NO_MD_TAG, $bookmark->getTags(), true)) {
            return parent::formatDescription($bookmark);
        }

        $processedDescription = $this->tokenizeSearchHighlightField(
            $bookmark->getDescription() ?? '',
            $bookmark->getAdditionalContentEntry('search_highlight')['description'] ?? []
        );
        $processedDescription = $this->filterProtocols($processedDescription);
        $processedDescription = $this->formatHashTags($processedDescription);
        $processedDescription = $this->reverseEscapedHtml($processedDescription);
        $processedDescription = $this->parsedown
            ->setMarkupEscaped($this->escape)
            ->setBreaksEnabled(true)
            ->text($processedDescription);
        $processedDescription = $this->sanitizeHtml($processedDescription);
        $processedDescription = $this->replaceTokens($processedDescription);

        if (!empty($processedDescription)) {
            $processedDescription = '<div class="markdown">' . $processedDescription . '</div>';
        }

        return $processedDescription;
    }

    /**
     * Remove the NO markdown tag if it is present
     *
     * The tag is only hidden when the user is not logged in; the list is
     * re-indexed after the removal.
     *
     * @inheritdoc
     */
    protected function formatTagList($bookmark)
    {
        $out = parent::formatTagList($bookmark);
        if ($this->isLoggedIn !== false) {
            return $out;
        }

        $pos = array_search(self::NO_MD_TAG, $out, true);
        if ($pos === false) {
            return $out;
        }

        unset($out[$pos]);

        return array_values($out);
    }

    /**
     * Replace not whitelisted protocols with http:// in given description.
     * Also adds `index_url` to relative links if it's specified
     *
     * Only Markdown inline link targets (`](target)`) are rewritten.
     *
     * @param string $description input description text.
     *
     * @return string $description without malicious link.
     */
    protected function filterProtocols($description)
    {
        $allowedProtocols = $this->allowedProtocols;
        $indexUrl = ! empty($this->contextData['index_url']) ? $this->contextData['index_url'] : '';

        // NOTE(review): preg_replace_callback() returns null on a PCRE failure. The result is
        // deliberately not replaced by the unfiltered input, which would skip protocol filtering.
        return preg_replace_callback(
            '#]\((.*?)\)#is',
            static function ($match) use ($allowedProtocols, $indexUrl) {
                $target = $match[1];
                // Targets starting with `?` or `/` are relative: prefix them with the index URL.
                $isRelative = startsWith($target, '?') || startsWith($target, '/');
                $prefix = $isRelative ? $indexUrl : '';

                return '](' . $prefix . whitelist_protocols($target, $allowedProtocols) . ')';
            },
            $description
        );
    }

    /**
     * Replace hashtag in Markdown links format
     * E.g. `#hashtag` becomes `[#hashtag](./add-tag/hashtag)`
     * It includes the index URL if specified.
     *
     * Lines detected as code (indented lines or fenced blocks) are left untouched.
     * Search highlight tokens located inside a hashtag are kept in the link label
     * but stripped from the link target.
     *
     * @param string $description
     *
     * @return string
     */
    protected function formatHashTags($description)
    {
        $indexUrl = ! empty($this->contextData['index_url']) ? $this->contextData['index_url'] : '';
        $highlightOpen = BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_OPEN;
        $highlightClose = BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_CLOSE;

        // The tokens must be alternatives of a group, not members of the character class:
        // inside `[...]` a `(?:token)` wrapper is read as literal characters, which made
        // `(`, `?`, `:`, `|` and `)` valid hashtag characters (`#todo:` gave the tag `todo:`).
        $tokens = preg_quote($highlightOpen, '/') . '|' . preg_quote($highlightClose, '/');

        /*
         * To support unicode: http://stackoverflow.com/a/35498078/1484919
         * \p{Pc} - to match underscore
         * \p{N} - numeric character in any script
         * \p{L} - letter from any language
         * \p{Mn} - any non-spacing mark (accents, umlauts, etc)
         */
        $regex = '/(^|\s)#((?:[\p{Pc}\p{N}\p{L}\p{Mn}]|' . $tokens . ')+)/mui';
        $replacement = static function (array $match) use ($indexUrl, $highlightOpen, $highlightClose): string {
            // The link target must not contain highlight tokens, the label keeps them.
            $cleanMatch = str_replace([$highlightClose, $highlightOpen], '', $match[2]);

            return $match[1] . '[#' . $match[2] . '](' . $indexUrl . './add-tag/' . $cleanMatch . ')';
        };

        $codeBlockOn = false;
        $linesOut = [];

        foreach (explode(PHP_EOL, $description) as $descriptionLine) {
            // Detect line of code: starting with 4 spaces,
            // except lists which can start with +/*/- or `2.` after spaces.
            $codeLineOn = preg_match('/^ +(?=[^\+\*\-])(?=(?!\d\.).)/', $descriptionLine) > 0;

            // A fence line toggles the code block state (the fence line itself is evaluated
            // with the new state, so an opening fence is skipped and a closing one is not).
            if (preg_match('/^```/', $descriptionLine) > 0) {
                $codeBlockOn = !$codeBlockOn;
            }

            if (!$codeBlockOn && !$codeLineOn) {
                // preg_replace_callback() returns null on failure (e.g. invalid UTF-8 with
                // the `u` modifier): keep the line as is rather than silently dropping it.
                $descriptionLine = preg_replace_callback($regex, $replacement, $descriptionLine)
                    ?? $descriptionLine;
            }

            $linesOut[] = $descriptionLine;
        }

        return implode(PHP_EOL, $linesOut);
    }

    /**
     * Escape dangerous HTML tags (script, iframe, etc.) and strip inline event handlers.
     * Doesn't affect <code> content (already escaped by Parsedown).
     *
     * @param string $description input description text.
     *
     * @return string given string escaped.
     */
    protected function sanitizeHtml($description)
    {
        $escapeTags = [
            'script',
            'style',
            'link',
            'iframe',
            'frameset',
            'frame',
        ];
        foreach ($escapeTags as $tag) {
            $description = preg_replace_callback(
                '#<\s*' . $tag . '[^>]*>(.*</\s*' . $tag . '[^>]*>)?#is',
                static function ($match) {
                    return escape($match[0]);
                },
                $description
            );
        }

        // The leading `<[^>]+\s` is greedy, so a single pass only strips the LAST `on*=`
        // attribute of each tag. Repeat until nothing matches so that every handler is removed.
        // Each replacement shortens the string, hence the loop always terminates.
        do {
            $description = preg_replace(
                '#(<[^>]+\s)on[a-z]*="?[^ "]*"?#is',
                '$1',
                $description,
                -1,
                $count
            );
        } while ($count > 0);

        return $description;
    }

    /**
     * Revert the HTML escaping previously applied to the description, so that the
     * Markdown parser works on the raw text.
     *
     * @param string $description escaped description.
     *
     * @return string result of `unescape()` on the given description.
     */
    protected function reverseEscapedHtml($description)
    {
        return unescape($description);
    }
}