aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/LinkUtils.php
diff options
context:
space:
mode:
Diffstat (limited to 'application/LinkUtils.php')
-rw-r--r--application/LinkUtils.php110
1 files changed, 68 insertions, 42 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index cf58f808..3705f7e9 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1<?php 1<?php
2 2
3/** 3/**
4 * Extract title from an HTML document. 4 * Get cURL callback function for CURLOPT_WRITEFUNCTION
5 * 5 *
6 * @param string $html HTML content where to look for a title. 6 * @param string $charset to extract from the downloaded page (reference)
7 * @param string $title to extract from the downloaded page (reference)
8 * @param string $curlGetInfo Optionally overrides curl_getinfo function
7 * 9 *
8 * @return bool|string Extracted title if found, false otherwise. 10 * @return Closure
9 */ 11 */
10function html_extract_title($html) 12function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11{ 13{
12 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { 14 /**
13 return trim(str_replace("\n", '', $matches[1])); 15 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14 } 16 *
15 return false; 17 * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
18 * Then we extract the title and the charset and stop the download when it's done.
19 *
20 * @param resource $ch cURL resource
21 * @param string $data chunk of data being downloaded
22 *
23 * @return int|bool length of $data or false if we need to stop the download
24 */
25 return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
26 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
27 if (!empty($responseCode) && $responseCode != 200) {
28 return false;
29 }
30 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
31 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
32 return false;
33 }
34 if (empty($charset)) {
35 $charset = header_extract_charset($contentType);
36 }
37 if (empty($charset)) {
38 $charset = html_extract_charset($data);
39 }
40 if (empty($title)) {
41 $title = html_extract_title($data);
42 }
43 // We got everything we want, stop the download.
44 if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
45 return false;
46 }
47
48 return strlen($data);
49 };
16} 50}
17 51
18/** 52/**
19 * Determine charset from downloaded page. 53 * Extract title from an HTML document.
20 * Priority:
21 * 1. HTTP headers (Content type).
22 * 2. HTML content page (tag <meta charset>).
23 * 3. Use a default charset (default: UTF-8).
24 * 54 *
25 * @param array $headers HTTP headers array. 55 * @param string $html HTML content where to look for a title.
26 * @param string $htmlContent HTML content where to look for charset.
27 * @param string $defaultCharset Default charset to apply if other methods failed.
28 * 56 *
29 * @return string Determined charset. 57 * @return bool|string Extracted title if found, false otherwise.
30 */ 58 */
31function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') 59function html_extract_title($html)
32{ 60{
33 if ($charset = headers_extract_charset($headers)) { 61 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34 return $charset; 62 return trim(str_replace("\n", '', $matches[1]));
35 }
36
37 if ($charset = html_extract_charset($htmlContent)) {
38 return $charset;
39 } 63 }
40 64 return false;
41 return $defaultCharset;
42} 65}
43 66
44/** 67/**
45 * Extract charset from HTTP headers if it's defined. 68 * Extract charset from HTTP header if it's defined.
46 * 69 *
47 * @param array $headers HTTP headers array. 70 * @param string $header HTTP header Content-Type line.
48 * 71 *
49 * @return bool|string Charset string if found (lowercase), false otherwise. 72 * @return bool|string Charset string if found (lowercase), false otherwise.
50 */ 73 */
51function headers_extract_charset($headers) 74function header_extract_charset($header)
52{ 75{
53 if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { 76 preg_match('/charset="?([^; ]+)/i', $header, $match);
54 preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); 77 if (! empty($match[1])) {
55 if (! empty($match[1])) { 78 return strtolower(trim($match[1]));
56 return strtolower(trim($match[1]));
57 }
58 } 79 }
59 80
60 return false; 81 return false;
@@ -89,7 +110,9 @@ function count_private($links)
89{ 110{
90 $cpt = 0; 111 $cpt = 0;
91 foreach ($links as $link) { 112 foreach ($links as $link) {
92 $cpt = $link['private'] == true ? $cpt + 1 : $cpt; 113 if ($link['private']) {
114 $cpt += 1;
115 }
93 } 116 }
94 117
95 return $cpt; 118 return $cpt;
@@ -100,14 +123,15 @@ function count_private($links)
100 * 123 *
101 * @param string $text input string. 124 * @param string $text input string.
102 * @param string $redirector if a redirector is set, use it to generate links. 125 * @param string $redirector if a redirector is set, use it to generate links.
126 * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not.
103 * 127 *
104 * @return string returns $text with all links converted to HTML links. 128 * @return string returns $text with all links converted to HTML links.
105 * 129 *
106 * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722 130 * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722
107 */ 131 */
108function text2clickable($text, $redirector = '') 132function text2clickable($text, $redirector = '', $urlEncode = true)
109{ 133{
110 $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[[:alnum:]]/?)!si'; 134 $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9\(\)]/?)!si';
111 135
112 if (empty($redirector)) { 136 if (empty($redirector)) {
113 return preg_replace($regex, '<a href="$1">$1</a>', $text); 137 return preg_replace($regex, '<a href="$1">$1</a>', $text);
@@ -115,8 +139,9 @@ function text2clickable($text, $redirector = '')
115 // Redirector is set, urlencode the final URL. 139 // Redirector is set, urlencode the final URL.
116 return preg_replace_callback( 140 return preg_replace_callback(
117 $regex, 141 $regex,
118 function ($matches) use ($redirector) { 142 function ($matches) use ($redirector, $urlEncode) {
119 return '<a href="' . $redirector . urlencode($matches[1]) .'">'. $matches[1] .'</a>'; 143 $url = $urlEncode ? urlencode($matches[1]) : $matches[1];
144 return '<a href="' . $redirector . $url .'">'. $matches[1] .'</a>';
120 }, 145 },
121 $text 146 $text
122 ); 147 );
@@ -162,12 +187,13 @@ function space2nbsp($text)
162 * 187 *
163 * @param string $description shaare's description. 188 * @param string $description shaare's description.
164 * @param string $redirector if a redirector is set, use it to generate links. 189 * @param string $redirector if a redirector is set, use it to generate links.
190 * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not.
165 * @param string $indexUrl URL to Shaarli's index. 191 * @param string $indexUrl URL to Shaarli's index.
166 * 192
167 * @return string formatted description. 193 * @return string formatted description.
168 */ 194 */
169function format_description($description, $redirector = '', $indexUrl = '') { 195function format_description($description, $redirector = '', $urlEncode = true, $indexUrl = '') {
170 return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl))); 196 return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector, $urlEncode), $indexUrl)));
171} 197}
172 198
173/** 199/**