diff options
author | ArthurHoaro <arthur@hoa.ro> | 2016-01-04 10:45:54 +0100 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2016-01-11 21:19:31 +0100 |
commit | 1557cefbd76257ceb830f65806831b490faf0acc (patch) | |
tree | 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /index.php | |
parent | c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff) | |
download | Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip |
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use the `mbstring` extension to re-encode the title if necessary.
Diffstat (limited to 'index.php')
-rw-r--r-- | index.php | 87 |
1 files changed, 37 insertions, 50 deletions
@@ -152,6 +152,7 @@ require_once 'application/FileUtils.php'; | |||
152 | require_once 'application/HttpUtils.php'; | 152 | require_once 'application/HttpUtils.php'; |
153 | require_once 'application/LinkDB.php'; | 153 | require_once 'application/LinkDB.php'; |
154 | require_once 'application/LinkFilter.php'; | 154 | require_once 'application/LinkFilter.php'; |
155 | require_once 'application/LinkUtils.php'; | ||
155 | require_once 'application/TimeZone.php'; | 156 | require_once 'application/TimeZone.php'; |
156 | require_once 'application/Url.php'; | 157 | require_once 'application/Url.php'; |
157 | require_once 'application/Utils.php'; | 158 | require_once 'application/Utils.php'; |
@@ -578,13 +579,6 @@ function linkdate2iso8601($linkdate) | |||
578 | return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format. | 579 | return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format. |
579 | } | 580 | } |
580 | 581 | ||
581 | // Extract title from an HTML document. | ||
582 | // (Returns an empty string if not found.) | ||
583 | function html_extract_title($html) | ||
584 | { | ||
585 | return preg_match('!<title>(.*?)</title>!is', $html, $matches) ? trim(str_replace("\n",' ', $matches[1])) : '' ; | ||
586 | } | ||
587 | |||
588 | // ------------------------------------------------------------------------------------------ | 582 | // ------------------------------------------------------------------------------------------ |
589 | // Token management for XSRF protection | 583 | // Token management for XSRF protection |
590 | // Token should be used in any form which acts on data (create,update,delete,import...). | 584 | // Token should be used in any form which acts on data (create,update,delete,import...). |
@@ -1642,7 +1636,7 @@ function renderPage() | |||
1642 | 1636 | ||
1643 | // -------- User want to post a new link: Display link edit form. | 1637 | // -------- User want to post a new link: Display link edit form. |
1644 | if (isset($_GET['post'])) { | 1638 | if (isset($_GET['post'])) { |
1645 | $url = cleanup_url($_GET['post']); | 1639 | $url = cleanup_url(escape($_GET['post'])); |
1646 | 1640 | ||
1647 | $link_is_new = false; | 1641 | $link_is_new = false; |
1648 | // Check if URL is not already in database (in this case, we will edit the existing link) | 1642 | // Check if URL is not already in database (in this case, we will edit the existing link) |
@@ -1660,35 +1654,24 @@ function renderPage() | |||
1660 | // If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.) | 1654 | // If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.) |
1661 | if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { | 1655 | if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { |
1662 | // Short timeout to keep the application responsive | 1656 | // Short timeout to keep the application responsive |
1663 | list($headers, $data) = get_http_url($url, 4); | 1657 | list($headers, $content) = get_http_response($url, 4); |
1664 | // FIXME: Decode charset according to specified in either 1) HTTP response headers or 2) <head> in html | ||
1665 | if (strpos($headers[0], '200 OK') !== false) { | 1658 | if (strpos($headers[0], '200 OK') !== false) { |
1666 | // Look for charset in html header. | 1659 | // Retrieve charset. |
1667 | preg_match('#<meta .*charset=.*>#Usi', $data, $meta); | 1660 | $charset = get_charset($headers, $content); |
1668 | 1661 | // Extract title. | |
1669 | // If found, extract encoding. | 1662 | $title = html_extract_title($content); |
1670 | if (!empty($meta[0])) { | 1663 | // Re-encode title in utf-8 if necessary. |
1671 | // Get encoding specified in header. | 1664 | if (! empty($title) && $charset != 'utf-8') { |
1672 | preg_match('#charset="?(.*)"#si', $meta[0], $enc); | 1665 | $title = mb_convert_encoding($title, $charset, 'utf-8'); |
1673 | // If charset not found, use utf-8. | ||
1674 | $html_charset = (!empty($enc[1])) ? strtolower($enc[1]) : 'utf-8'; | ||
1675 | } | ||
1676 | else { | ||
1677 | $html_charset = 'utf-8'; | ||
1678 | } | ||
1679 | |||
1680 | // Extract title | ||
1681 | $title = html_extract_title($data); | ||
1682 | if (!empty($title)) { | ||
1683 | // Re-encode title in utf-8 if necessary. | ||
1684 | $title = ($html_charset == 'iso-8859-1') ? utf8_encode($title) : $title; | ||
1685 | } | 1666 | } |
1686 | } | 1667 | } |
1687 | } | 1668 | } |
1669 | |||
1688 | if ($url == '') { | 1670 | if ($url == '') { |
1689 | $url = '?' . smallHash($linkdate); | 1671 | $url = '?' . smallHash($linkdate); |
1690 | $title = 'Note: '; | 1672 | $title = 'Note: '; |
1691 | } | 1673 | } |
1674 | |||
1692 | $link = array( | 1675 | $link = array( |
1693 | 'linkdate' => $linkdate, | 1676 | 'linkdate' => $linkdate, |
1694 | 'title' => $title, | 1677 | 'title' => $title, |
@@ -2314,11 +2297,11 @@ function genThumbnail() | |||
2314 | else // This is a flickr page (html) | 2297 | else // This is a flickr page (html) |
2315 | { | 2298 | { |
2316 | // Get the flickr html page. | 2299 | // Get the flickr html page. |
2317 | list($headers, $data) = get_http_url($url, 20); | 2300 | list($headers, $content) = get_http_response($url, 20); |
2318 | if (strpos($headers[0], '200 OK') !== false) | 2301 | if (strpos($headers[0], '200 OK') !== false) |
2319 | { | 2302 | { |
2320 | // flickr now nicely provides the URL of the thumbnail in each flickr page. | 2303 | // flickr now nicely provides the URL of the thumbnail in each flickr page. |
2321 | preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!',$data,$matches); | 2304 | preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!', $content, $matches); |
2322 | if (!empty($matches[1])) $imageurl=$matches[1]; | 2305 | if (!empty($matches[1])) $imageurl=$matches[1]; |
2323 | 2306 | ||
2324 | // In albums (and some other pages), the link rel="image_src" is not provided, | 2307 | // In albums (and some other pages), the link rel="image_src" is not provided, |
@@ -2326,7 +2309,7 @@ function genThumbnail() | |||
2326 | // <meta property="og:image" content="http://farm4.staticflickr.com/3398/3239339068_25d13535ff_z.jpg" /> | 2309 | // <meta property="og:image" content="http://farm4.staticflickr.com/3398/3239339068_25d13535ff_z.jpg" /> |
2327 | if ($imageurl=='') | 2310 | if ($imageurl=='') |
2328 | { | 2311 | { |
2329 | preg_match('!<meta property=\"og:image\" content=\"(.+?)\"!',$data,$matches); | 2312 | preg_match('!<meta property=\"og:image\" content=\"(.+?)\"!', $content, $matches); |
2330 | if (!empty($matches[1])) $imageurl=$matches[1]; | 2313 | if (!empty($matches[1])) $imageurl=$matches[1]; |
2331 | } | 2314 | } |
2332 | } | 2315 | } |
@@ -2335,11 +2318,12 @@ function genThumbnail() | |||
2335 | if ($imageurl!='') | 2318 | if ($imageurl!='') |
2336 | { // Let's download the image. | 2319 | { // Let's download the image. |
2337 | // Image is 240x120, so 10 seconds to download should be enough. | 2320 | // Image is 240x120, so 10 seconds to download should be enough. |
2338 | list($headers, $data) = get_http_url($imageurl, 10); | 2321 | list($headers, $content) = get_http_response($imageurl, 10); |
2339 | if (strpos($headers[0], '200 OK') !== false) { | 2322 | if (strpos($headers[0], '200 OK') !== false) { |
2340 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. | 2323 | // Save image to cache. |
2324 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/' . $thumbname, $content); | ||
2341 | header('Content-Type: image/jpeg'); | 2325 | header('Content-Type: image/jpeg'); |
2342 | echo $data; | 2326 | echo $content; |
2343 | return; | 2327 | return; |
2344 | } | 2328 | } |
2345 | } | 2329 | } |
@@ -2350,16 +2334,17 @@ function genThumbnail() | |||
2350 | // This is more complex: we have to perform a HTTP request, then parse the result. | 2334 | // This is more complex: we have to perform a HTTP request, then parse the result. |
2351 | // Maybe we should deport this to JavaScript ? Example: http://stackoverflow.com/questions/1361149/get-img-thumbnails-from-vimeo/4285098#4285098 | 2335 | // Maybe we should deport this to JavaScript ? Example: http://stackoverflow.com/questions/1361149/get-img-thumbnails-from-vimeo/4285098#4285098 |
2352 | $vid = substr(parse_url($url,PHP_URL_PATH),1); | 2336 | $vid = substr(parse_url($url,PHP_URL_PATH),1); |
2353 | list($headers, $data) = get_http_url('https://vimeo.com/api/v2/video/'.escape($vid).'.php', 5); | 2337 | list($headers, $content) = get_http_response('https://vimeo.com/api/v2/video/'.escape($vid).'.php', 5); |
2354 | if (strpos($headers[0], '200 OK') !== false) { | 2338 | if (strpos($headers[0], '200 OK') !== false) { |
2355 | $t = unserialize($data); | 2339 | $t = unserialize($content); |
2356 | $imageurl = $t[0]['thumbnail_medium']; | 2340 | $imageurl = $t[0]['thumbnail_medium']; |
2357 | // Then we download the image and serve it to our client. | 2341 | // Then we download the image and serve it to our client. |
2358 | list($headers, $data) = get_http_url($imageurl, 10); | 2342 | list($headers, $content) = get_http_response($imageurl, 10); |
2359 | if (strpos($headers[0], '200 OK') !== false) { | 2343 | if (strpos($headers[0], '200 OK') !== false) { |
2360 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. | 2344 | // Save image to cache. |
2345 | file_put_contents($GLOBALS['config']['CACHEDIR'] . '/' . $thumbname, $content); | ||
2361 | header('Content-Type: image/jpeg'); | 2346 | header('Content-Type: image/jpeg'); |
2362 | echo $data; | 2347 | echo $content; |
2363 | return; | 2348 | return; |
2364 | } | 2349 | } |
2365 | } | 2350 | } |
@@ -2370,18 +2355,18 @@ function genThumbnail() | |||
2370 | // The thumbnail for TED talks is located in the <link rel="image_src" [...]> tag on that page | 2355 | // The thumbnail for TED talks is located in the <link rel="image_src" [...]> tag on that page |
2371 | // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html | 2356 | // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html |
2372 | // <link rel="image_src" href="http://images.ted.com/images/ted/28bced335898ba54d4441809c5b1112ffaf36781_389x292.jpg" /> | 2357 | // <link rel="image_src" href="http://images.ted.com/images/ted/28bced335898ba54d4441809c5b1112ffaf36781_389x292.jpg" /> |
2373 | list($headers, $data) = get_http_url($url, 5); | 2358 | list($headers, $content) = get_http_response($url, 5); |
2374 | if (strpos($headers[0], '200 OK') !== false) { | 2359 | if (strpos($headers[0], '200 OK') !== false) { |
2375 | // Extract the link to the thumbnail | 2360 | // Extract the link to the thumbnail |
2376 | preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!',$data,$matches); | 2361 | preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!', $content, $matches); |
2377 | if (!empty($matches[1])) | 2362 | if (!empty($matches[1])) |
2378 | { // Let's download the image. | 2363 | { // Let's download the image. |
2379 | $imageurl=$matches[1]; | 2364 | $imageurl=$matches[1]; |
2380 | // No control on image size, so wait long enough | 2365 | // No control on image size, so wait long enough |
2381 | list($headers, $data) = get_http_url($imageurl, 20); | 2366 | list($headers, $content) = get_http_response($imageurl, 20); |
2382 | if (strpos($headers[0], '200 OK') !== false) { | 2367 | if (strpos($headers[0], '200 OK') !== false) { |
2383 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2368 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2384 | file_put_contents($filepath,$data); // Save image to cache. | 2369 | file_put_contents($filepath, $content); // Save image to cache. |
2385 | if (resizeImage($filepath)) | 2370 | if (resizeImage($filepath)) |
2386 | { | 2371 | { |
2387 | header('Content-Type: image/jpeg'); | 2372 | header('Content-Type: image/jpeg'); |
@@ -2398,18 +2383,19 @@ function genThumbnail() | |||
2398 | // There is no thumbnail available for xkcd comics, so download the whole image and resize it. | 2383 | // There is no thumbnail available for xkcd comics, so download the whole image and resize it. |
2399 | // http://xkcd.com/327/ | 2384 | // http://xkcd.com/327/ |
2400 | // <img src="http://imgs.xkcd.com/comics/exploits_of_a_mom.png" title="<BLABLA>" alt="<BLABLA>" /> | 2385 | // <img src="http://imgs.xkcd.com/comics/exploits_of_a_mom.png" title="<BLABLA>" alt="<BLABLA>" /> |
2401 | list($headers, $data) = get_http_url($url, 5); | 2386 | list($headers, $content) = get_http_response($url, 5); |
2402 | if (strpos($headers[0], '200 OK') !== false) { | 2387 | if (strpos($headers[0], '200 OK') !== false) { |
2403 | // Extract the link to the thumbnail | 2388 | // Extract the link to the thumbnail |
2404 | preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!',$data,$matches); | 2389 | preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!', $content, $matches); |
2405 | if (!empty($matches[1])) | 2390 | if (!empty($matches[1])) |
2406 | { // Let's download the image. | 2391 | { // Let's download the image. |
2407 | $imageurl=$matches[1]; | 2392 | $imageurl=$matches[1]; |
2408 | // No control on image size, so wait long enough | 2393 | // No control on image size, so wait long enough |
2409 | list($headers, $data) = get_http_url($imageurl, 20); | 2394 | list($headers, $content) = get_http_response($imageurl, 20); |
2410 | if (strpos($headers[0], '200 OK') !== false) { | 2395 | if (strpos($headers[0], '200 OK') !== false) { |
2411 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2396 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2412 | file_put_contents($filepath,$data); // Save image to cache. | 2397 | // Save image to cache. |
2398 | file_put_contents($filepath, $content); | ||
2413 | if (resizeImage($filepath)) | 2399 | if (resizeImage($filepath)) |
2414 | { | 2400 | { |
2415 | header('Content-Type: image/jpeg'); | 2401 | header('Content-Type: image/jpeg'); |
@@ -2425,10 +2411,11 @@ function genThumbnail() | |||
2425 | { | 2411 | { |
2426 | // For all other domains, we try to download the image and make a thumbnail. | 2412 | // For all other domains, we try to download the image and make a thumbnail. |
2427 | // We allow 30 seconds max to download (and downloads are limited to 4 Mb) | 2413 | // We allow 30 seconds max to download (and downloads are limited to 4 Mb) |
2428 | list($headers, $data) = get_http_url($url, 30); | 2414 | list($headers, $content) = get_http_response($url, 30); |
2429 | if (strpos($headers[0], '200 OK') !== false) { | 2415 | if (strpos($headers[0], '200 OK') !== false) { |
2430 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2416 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2431 | file_put_contents($filepath,$data); // Save image to cache. | 2417 | // Save image to cache. |
2418 | file_put_contents($filepath, $content); | ||
2432 | if (resizeImage($filepath)) | 2419 | if (resizeImage($filepath)) |
2433 | { | 2420 | { |
2434 | header('Content-Type: image/jpeg'); | 2421 | header('Content-Type: image/jpeg'); |