From 1557cefbd76257ceb830f65806831b490faf0acc Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Mon, 4 Jan 2016 10:45:54 +0100 Subject: Fixes #410 - Retrieve title fails in multiple cases * `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary. --- index.php | 87 +++++++++++++++++++++++++++------------------------------------ 1 file changed, 37 insertions(+), 50 deletions(-) (limited to 'index.php') diff --git a/index.php b/index.php index cd83600b..600b2f55 100644 --- a/index.php +++ b/index.php @@ -152,6 +152,7 @@ require_once 'application/FileUtils.php'; require_once 'application/HttpUtils.php'; require_once 'application/LinkDB.php'; require_once 'application/LinkFilter.php'; +require_once 'application/LinkUtils.php'; require_once 'application/TimeZone.php'; require_once 'application/Url.php'; require_once 'application/Utils.php'; @@ -578,13 +579,6 @@ function linkdate2iso8601($linkdate) return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format. } -// Extract title from an HTML document. -// (Returns an empty string if not found.) -function html_extract_title($html) -{ - return preg_match('!(.*?)!is', $html, $matches) ? trim(str_replace("\n",' ', $matches[1])) : '' ; -} - // ------------------------------------------------------------------------------------------ // Token management for XSRF protection // Token should be used in any form which acts on data (create,update,delete,import...). @@ -1642,7 +1636,7 @@ function renderPage() // -------- User want to post a new link: Display link edit form. 
if (isset($_GET['post'])) { - $url = cleanup_url($_GET['post']); + $url = cleanup_url(escape($_GET['post'])); $link_is_new = false; // Check if URL is not already in database (in this case, we will edit the existing link) @@ -1660,35 +1654,24 @@ function renderPage() // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { // Short timeout to keep the application responsive - list($headers, $data) = get_http_url($url, 4); - // FIXME: Decode charset according to specified in either 1) HTTP response headers or 2) in html + list($headers, $content) = get_http_response($url, 4); if (strpos($headers[0], '200 OK') !== false) { - // Look for charset in html header. - preg_match('##Usi', $data, $meta); - - // If found, extract encoding. - if (!empty($meta[0])) { - // Get encoding specified in header. - preg_match('#charset="?(.*)"#si', $meta[0], $enc); - // If charset not found, use utf-8. - $html_charset = (!empty($enc[1])) ? strtolower($enc[1]) : 'utf-8'; - } - else { - $html_charset = 'utf-8'; - } - - // Extract title - $title = html_extract_title($data); - if (!empty($title)) { - // Re-encode title in utf-8 if necessary. - $title = ($html_charset == 'iso-8859-1') ? utf8_encode($title) : $title; + // Retrieve charset. + $charset = get_charset($headers, $content); + // Extract title. + $title = html_extract_title($content); + // Re-encode title in utf-8 if necessary (mb_convert_encoding() takes the target encoding before the source encoding). + if (! empty($title) && $charset != 'utf-8') { + $title = mb_convert_encoding($title, 'utf-8', $charset); } } } + if ($url == '') { $url = '?' . smallHash($linkdate); $title = 'Note: '; } + $link = array( 'linkdate' => $linkdate, 'title' => $title, @@ -2314,11 +2297,11 @@ function genThumbnail() else // This is a flickr page (html) { // Get the flickr html page. 
- list($headers, $data) = get_http_url($url, 20); + list($headers, $content) = get_http_response($url, 20); if (strpos($headers[0], '200 OK') !== false) { // flickr now nicely provides the URL of the thumbnail in each flickr page. - preg_match('! if ($imageurl=='') { - preg_match('! tag on that page // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html // - list($headers, $data) = get_http_url($url, 5); + list($headers, $content) = get_http_response($url, 5); if (strpos($headers[0], '200 OK') !== false) { // Extract the link to the thumbnail - preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!',$data,$matches); + preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!', $content, $matches); if (!empty($matches[1])) { // Let's download the image. $imageurl=$matches[1]; // No control on image size, so wait long enough - list($headers, $data) = get_http_url($imageurl, 20); + list($headers, $content) = get_http_response($imageurl, 20); if (strpos($headers[0], '200 OK') !== false) { $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; - file_put_contents($filepath,$data); // Save image to cache. + file_put_contents($filepath, $content); // Save image to cache. if (resizeImage($filepath)) { header('Content-Type: image/jpeg'); @@ -2398,18 +2383,19 @@ function genThumbnail() // There is no thumbnail available for xkcd comics, so download the whole image and resize it. // http://xkcd.com/327/ // <BLABLA> - list($headers, $data) = get_http_url($url, 5); + list($headers, $content) = get_http_response($url, 5); if (strpos($headers[0], '200 OK') !== false) { // Extract the link to the thumbnail - preg_match('!