diff options
author | Sebastien SAUVAGE <sebsauvage@sebsauvage.net> | 2013-08-03 22:10:04 +0200 |
---|---|---|
committer | Sebastien SAUVAGE <sebsauvage@sebsauvage.net> | 2013-08-03 22:10:04 +0200 |
commit | 002ef0e5c8ed2bab38e205a5d54617780f25c3a9 (patch) | |
tree | 3d138b5f5a30db4fbe16d3bd6ae09182eb591994 | |
parent | f6a6ca0aec6cc09ee76d827cc07d3c0ed66c8eb0 (diff) | |
download | Shaarli-002ef0e5c8ed2bab38e205a5d54617780f25c3a9.tar.gz Shaarli-002ef0e5c8ed2bab38e205a5d54617780f25c3a9.tar.zst Shaarli-002ef0e5c8ed2bab38e205a5d54617780f25c3a9.zip |
Better encoding handling in title parsing
Thanks to a patch from Le Hollandais Volant.
-rw-r--r-- | index.php | 25 |
1 file changed, 23 insertions, 2 deletions
@@ -1545,8 +1545,29 @@ function renderPage() | |||
1545 | { | 1545 | { |
1546 | list($status,$headers,$data) = getHTTP($url,4); // Short timeout to keep the application responsive. | 1546 | list($status,$headers,$data) = getHTTP($url,4); // Short timeout to keep the application responsive. |
1547 | // FIXME: Decode the charset as specified in either 1) the HTTP response headers or 2) the <head> of the HTML | 1547 | // FIXME: Decode the charset as specified in either 1) the HTTP response headers or 2) the <head> of the HTML |
1548 | if (strpos($status,'200 OK')!==false) $title=html_entity_decode(html_extract_title($data),ENT_QUOTES,'UTF-8'); | 1548 | if (strpos($status,'200 OK')!==false) |
1549 | 1549 | { | |
1550 | // Look for charset in html header. | ||
1551 | preg_match('#<meta .*charset=.*>#Usi', $data, $meta); | ||
1552 | |||
1553 | // If found, extract encoding. | ||
1554 | if (!empty($meta[0])) | ||
1555 | { | ||
1556 | // Get encoding specified in header. | ||
1557 | preg_match('#charset="?(.*)"#si', $meta[0], $enc); | ||
1558 | // If charset not found, use utf-8. | ||
1559 | $html_charset = (!empty($enc[1])) ? strtolower($enc[1]) : 'utf-8'; | ||
1560 | } | ||
1561 | else { $html_charset = 'utf-8'; } | ||
1562 | |||
1563 | // Extract title | ||
1564 | $title = html_extract_title($data); | ||
1565 | if (!empty($title)) | ||
1566 | { | ||
1567 | // Re-encode title in utf-8 if necessary. | ||
1568 | $title = ($html_charset == 'iso-8859-1') ? utf8_encode($title) : $title; | ||
1569 | } | ||
1570 | } | ||
1550 | } | 1571 | } |
1551 | if ($url=='') $url='?'.smallHash($linkdate); // In case of empty URL, this is just a text (with a link that points to itself) | 1572 | if ($url=='') $url='?'.smallHash($linkdate); // In case of empty URL, this is just a text (with a link that points to itself) |
1552 | $link = array('linkdate'=>$linkdate,'title'=>$title,'url'=>$url,'description'=>$description,'tags'=>$tags,'private'=>0); | 1573 | $link = array('linkdate'=>$linkdate,'title'=>$title,'url'=>$url,'description'=>$description,'tags'=>$tags,'private'=>0); |