X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=application%2FLinkUtils.php;h=d56e019f4246f40d94f7a2eb5b8f715d10c7f1a1;hb=1004742f09b55ff781c13745781b9a7e90986faa;hp=9d9ae3cb29f603f6a82a8125bf3cc4b8860c1183;hpb=fb6c8f770a07e38d5250ca4d6717445002e7bfb3;p=github%2Fshaarli%2FShaarli.git diff --git a/application/LinkUtils.php b/application/LinkUtils.php index 9d9ae3cb..d56e019f 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php @@ -1,60 +1,90 @@ (.*?)!is', $html, $matches)) { - return trim(str_replace("\n", '', $matches[1])); - } - return false; + $isRedirected = false; + /** + * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). + * + * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text' + * Then we extract the title and the charset and stop the download when it's done. + * + * @param resource $ch cURL resource + * @param string $data chunk of data being downloaded + * + * @return int|bool length of $data or false if we need to stop the download + */ + return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { + $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + if (!empty($responseCode) && in_array($responseCode, [301, 302])) { + $isRedirected = true; + return strlen($data); + } + if (!empty($responseCode) && $responseCode !== 200) { + return false; + } + // After a redirection, the content type will keep the previous request value + // until it finds the next content-type header. + if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { + $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); + } + if (!empty($contentType) && strpos($contentType, 'text/html') === false) { + return false; + } + if (!empty($contentType) && empty($charset)) { + $charset = header_extract_charset($contentType); + } + if (empty($charset)) { + $charset = html_extract_charset($data); + } + if (empty($title)) { + $title = html_extract_title($data); + } + // We got everything we want, stop the download. + if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { + return false; + } + + return strlen($data); + }; } /** - * Determine charset from downloaded page. - * Priority: - * 1. HTTP headers (Content type). - * 2. HTML content page (tag ). - * 3. Use a default charset (default: UTF-8). + * Extract title from an HTML document. * - * @param array $headers HTTP headers array. - * @param string $htmlContent HTML content where to look for charset. - * @param string $defaultCharset Default charset to apply if other methods failed. + * @param string $html HTML content where to look for a title. * - * @return string Determined charset. + * @return bool|string Extracted title if found, false otherwise. */ -function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') +function html_extract_title($html) { - if ($charset = headers_extract_charset($headers)) { - return $charset; - } - - if ($charset = html_extract_charset($htmlContent)) { - return $charset; + if (preg_match('!(.*?)!is', $html, $matches)) { + return trim(str_replace("\n", '', $matches[1])); } - - return $defaultCharset; + return false; } /** - * Extract charset from HTTP headers if it's defined. + * Extract charset from HTTP header if it's defined. * - * @param array $headers HTTP headers array. + * @param string $header HTTP header Content-Type line. * * @return bool|string Charset string if found (lowercase), false otherwise. */ -function headers_extract_charset($headers) +function header_extract_charset($header) { - if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { - preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); - if (! empty($match[1])) { - return strtolower(trim($match[1])); - } + preg_match('/charset="?([^; ]+)/i', $header, $match); + if (! empty($match[1])) { + return strtolower(trim($match[1])); } return false; @@ -89,7 +119,9 @@ function count_private($links) { $cpt = 0; foreach ($links as $link) { - $cpt = $link['private'] == true ? $cpt + 1 : $cpt; + if ($link['private']) { + $cpt += 1; + } } return $cpt; @@ -100,14 +132,15 @@ function count_private($links) * * @param string $text input string. * @param string $redirector if a redirector is set, use it to gerenate links. + * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. * * @return string returns $text with all links converted to HTML links. * * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722 */ -function text2clickable($text, $redirector = '') +function text2clickable($text, $redirector = '', $urlEncode = true) { - $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[[:alnum:]]/?)!si'; + $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9\(\)]/?)!si'; if (empty($redirector)) { return preg_replace($regex, '$1', $text); @@ -115,8 +148,9 @@ function text2clickable($text, $redirector = '') // Redirector is set, urlencode the final URL. return preg_replace_callback( $regex, - function ($matches) use ($redirector) { - return ''. $matches[1] .''; + function ($matches) use ($redirector, $urlEncode) { + $url = $urlEncode ? urlencode($matches[1]) : $matches[1]; + return ''. $matches[1] .''; }, $text ); @@ -162,10 +196,25 @@ function space2nbsp($text) * * @param string $description shaare's description. * @param string $redirector if a redirector is set, use it to gerenate links. + * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. * @param string $indexUrl URL to Shaarli's index. - * + * @return string formatted description. */ -function format_description($description, $redirector = '', $indexUrl = '') { - return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl))); +function format_description($description, $redirector = '', $urlEncode = true, $indexUrl = '') +{ + return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector, $urlEncode), $indexUrl))); +} + +/** + * Generate a small hash for a link. + * + * @param DateTime $date Link creation date. + * @param int $id Link ID. + * + * @return string the small hash generated from link data. + */ +function link_small_hash($date, $id) +{ + return smallHash($date->format(LinkDB::LINK_DATE_FORMAT) . $id); }