diff options
Diffstat (limited to 'application/LinkUtils.php')
-rw-r--r-- | application/LinkUtils.php | 110 |
1 files changed, 68 insertions, 42 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php index cf58f808..3705f7e9 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php | |||
@@ -1,60 +1,81 @@ | |||
1 | <?php | 1 | <?php |
2 | 2 | ||
3 | /** | 3 | /** |
4 | * Extract title from an HTML document. | 4 | * Get cURL callback function for CURLOPT_WRITEFUNCTION |
5 | * | 5 | * |
6 | * @param string $html HTML content where to look for a title. | 6 | * @param string $charset to extract from the downloaded page (reference) |
7 | * @param string $title to extract from the downloaded page (reference) | ||
8 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
7 | * | 9 | * |
8 | * @return bool|string Extracted title if found, false otherwise. | 10 | * @return Closure |
9 | */ | 11 | */ |
10 | function html_extract_title($html) | 12 | function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') |
11 | { | 13 | { |
12 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { | 14 | /** |
13 | return trim(str_replace("\n", '', $matches[1])); | 15 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). |
14 | } | 16 | * |
15 | return false; | 17 | * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
18 | * Then we extract the title and the charset and stop the download when it's done. | ||
19 | * | ||
20 | * @param resource $ch cURL resource | ||
21 | * @param string $data chunk of data being downloaded | ||
22 | * | ||
23 | * @return int|bool length of $data or false if we need to stop the download | ||
24 | */ | ||
25 | return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) { | ||
26 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
27 | if (!empty($responseCode) && $responseCode != 200) { | ||
28 | return false; | ||
29 | } | ||
30 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
31 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
32 | return false; | ||
33 | } | ||
34 | if (empty($charset)) { | ||
35 | $charset = header_extract_charset($contentType); | ||
36 | } | ||
37 | if (empty($charset)) { | ||
38 | $charset = html_extract_charset($data); | ||
39 | } | ||
40 | if (empty($title)) { | ||
41 | $title = html_extract_title($data); | ||
42 | } | ||
43 | // We got everything we want, stop the download. | ||
44 | if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { | ||
45 | return false; | ||
46 | } | ||
47 | |||
48 | return strlen($data); | ||
49 | }; | ||
16 | } | 50 | } |
17 | 51 | ||
18 | /** | 52 | /** |
19 | * Determine charset from downloaded page. | 53 | * Extract title from an HTML document. |
20 | * Priority: | ||
21 | * 1. HTTP headers (Content type). | ||
22 | * 2. HTML content page (tag <meta charset>). | ||
23 | * 3. Use a default charset (default: UTF-8). | ||
24 | * | 54 | * |
25 | * @param array $headers HTTP headers array. | 55 | * @param string $html HTML content where to look for a title. |
26 | * @param string $htmlContent HTML content where to look for charset. | ||
27 | * @param string $defaultCharset Default charset to apply if other methods failed. | ||
28 | * | 56 | * |
29 | * @return string Determined charset. | 57 | * @return bool|string Extracted title if found, false otherwise. |
30 | */ | 58 | */ |
31 | function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') | 59 | function html_extract_title($html) |
32 | { | 60 | { |
33 | if ($charset = headers_extract_charset($headers)) { | 61 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { |
34 | return $charset; | 62 | return trim(str_replace("\n", '', $matches[1])); |
35 | } | ||
36 | |||
37 | if ($charset = html_extract_charset($htmlContent)) { | ||
38 | return $charset; | ||
39 | } | 63 | } |
40 | 64 | return false; | |
41 | return $defaultCharset; | ||
42 | } | 65 | } |
43 | 66 | ||
44 | /** | 67 | /** |
45 | * Extract charset from HTTP headers if it's defined. | 68 | * Extract charset from HTTP header if it's defined. |
46 | * | 69 | * |
47 | * @param array $headers HTTP headers array. | 70 | * @param string $header HTTP header Content-Type line. |
48 | * | 71 | * |
49 | * @return bool|string Charset string if found (lowercase), false otherwise. | 72 | * @return bool|string Charset string if found (lowercase), false otherwise. |
50 | */ | 73 | */ |
51 | function headers_extract_charset($headers) | 74 | function header_extract_charset($header) |
52 | { | 75 | { |
53 | if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { | 76 | preg_match('/charset="?([^; ]+)/i', $header, $match); |
54 | preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); | 77 | if (! empty($match[1])) { |
55 | if (! empty($match[1])) { | 78 | return strtolower(trim($match[1])); |
56 | return strtolower(trim($match[1])); | ||
57 | } | ||
58 | } | 79 | } |
59 | 80 | ||
60 | return false; | 81 | return false; |
@@ -89,7 +110,9 @@ function count_private($links) | |||
89 | { | 110 | { |
90 | $cpt = 0; | 111 | $cpt = 0; |
91 | foreach ($links as $link) { | 112 | foreach ($links as $link) { |
92 | $cpt = $link['private'] == true ? $cpt + 1 : $cpt; | 113 | if ($link['private']) { |
114 | $cpt += 1; | ||
115 | } | ||
93 | } | 116 | } |
94 | 117 | ||
95 | return $cpt; | 118 | return $cpt; |
@@ -100,14 +123,15 @@ function count_private($links) | |||
100 | * | 123 | * |
101 | * @param string $text input string. | 124 | * @param string $text input string. |
102 | * @param string $redirector if a redirector is set, use it to generate links. | 125 | * @param string $redirector if a redirector is set, use it to generate links. |
126 | * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. | ||
103 | * | 127 | * |
104 | * @return string returns $text with all links converted to HTML links. | 128 | * @return string returns $text with all links converted to HTML links. |
105 | * | 129 | * |
106 | * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722 | 130 | * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722 |
107 | */ | 131 | */ |
108 | function text2clickable($text, $redirector = '') | 132 | function text2clickable($text, $redirector = '', $urlEncode = true) |
109 | { | 133 | { |
110 | $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[[:alnum:]]/?)!si'; | 134 | $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9\(\)]/?)!si'; |
111 | 135 | ||
112 | if (empty($redirector)) { | 136 | if (empty($redirector)) { |
113 | return preg_replace($regex, '<a href="$1">$1</a>', $text); | 137 | return preg_replace($regex, '<a href="$1">$1</a>', $text); |
@@ -115,8 +139,9 @@ function text2clickable($text, $redirector = '') | |||
115 | // Redirector is set, urlencode the final URL. | 139 | // Redirector is set, urlencode the final URL. |
116 | return preg_replace_callback( | 140 | return preg_replace_callback( |
117 | $regex, | 141 | $regex, |
118 | function ($matches) use ($redirector) { | 142 | function ($matches) use ($redirector, $urlEncode) { |
119 | return '<a href="' . $redirector . urlencode($matches[1]) .'">'. $matches[1] .'</a>'; | 143 | $url = $urlEncode ? urlencode($matches[1]) : $matches[1]; |
144 | return '<a href="' . $redirector . $url .'">'. $matches[1] .'</a>'; | ||
120 | }, | 145 | }, |
121 | $text | 146 | $text |
122 | ); | 147 | ); |
@@ -162,12 +187,13 @@ function space2nbsp($text) | |||
162 | * | 187 | * |
163 | * @param string $description shaare's description. | 188 | * @param string $description shaare's description. |
164 | * @param string $redirector if a redirector is set, use it to generate links. | 189 | * @param string $redirector if a redirector is set, use it to generate links. |
190 | * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. | ||
165 | * @param string $indexUrl URL to Shaarli's index. | 191 | * @param string $indexUrl URL to Shaarli's index. |
166 | * | 192 | |
167 | * @return string formatted description. | 193 | * @return string formatted description. |
168 | */ | 194 | */ |
169 | function format_description($description, $redirector = '', $indexUrl = '') { | 195 | function format_description($description, $redirector = '', $urlEncode = true, $indexUrl = '') { |
170 | return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl))); | 196 | return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector, $urlEncode), $indexUrl))); |
171 | } | 197 | } |
172 | 198 | ||
173 | /** | 199 | /** |