Refactoring · GitHub

From: ArthurHoaro Date: Tue, 23 Jan 2018 17:41:38 +0000 (+0100) Subject: Merge pull request #977 from ArthurHoaro/feature/dl-filter X-Git-Tag: v0.9.4~7 X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=d449f79a0d7ca808b891baf73b9e25ce7f7e48fe;hp=-c;p=github%2Fshaarli%2FShaarli.git Merge pull request #977 from ArthurHoaro/feature/dl-filter Extract the title/charset during page download, and check content type --- d449f79a0d7ca808b891baf73b9e25ce7f7e48fe diff --combined application/HttpUtils.php index c9371b55,2edf5ce2..83a4c5e2 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php @@@ -3,9 -3,11 +3,11 @@@ * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the headers (response code, content type, etc.). 
* * @return array HTTP response headers, downloaded content * @@@ -29,7 -31,7 +31,7 @@@ * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ - function get_http_response($url, $timeout = 30, $maxBytes = 4194304) + function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@@ -75,8 -77,12 +77,12 @@@ curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } + // Max download size management - curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024); + curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) @@@ -302,13 -308,6 +308,13 @@@ function server_url($server $port = $server['HTTP_X_FORWARDED_PORT']; } + // This is a workaround for proxies that don't forward the scheme properly. + // Connecting over port 443 has to be in HTTPS. + // See https://github.com/shaarli/Shaarli/issues/1022 + if ($port == '443') { + $scheme = 'https'; + } + if (($scheme == 'http' && $port != '80') || ($scheme == 'https' && $port != '443') ) { diff --combined application/LinkUtils.php index e3d95d08,c0dd32a6..3705f7e9 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php @@@ -1,60 -1,81 +1,81 @@@ (.*?)!is', $html, $matches)) { - return trim(str_replace("\n", '', $matches[1])); - } - return false; + /** + * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). + * + * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text' + * Then we extract the title and the charset and stop the download when it's done. 
+ * + * @param resource $ch cURL resource + * @param string $data chunk of data being downloaded + * + * @return int|bool length of $data or false if we need to stop the download + */ + return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) { + $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + if (!empty($responseCode) && $responseCode != 200) { + return false; + } + $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); + if (!empty($contentType) && strpos($contentType, 'text/html') === false) { + return false; + } + if (empty($charset)) { + $charset = header_extract_charset($contentType); + } + if (empty($charset)) { + $charset = html_extract_charset($data); + } + if (empty($title)) { + $title = html_extract_title($data); + } + // We got everything we want, stop the download. + if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { + return false; + } + + return strlen($data); + }; } /** - * Determine charset from downloaded page. - * Priority: - * 1. HTTP headers (Content type). - * 2. HTML content page (tag ). - * 3. Use a default charset (default: UTF-8). + * Extract title from an HTML document. * - * @param array $headers HTTP headers array. - * @param string $htmlContent HTML content where to look for charset. - * @param string $defaultCharset Default charset to apply if other methods failed. + * @param string $html HTML content where to look for a title. * - * @return string Determined charset. + * @return bool|string Extracted title if found, false otherwise. 
*/ - function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') + function html_extract_title($html) { - if ($charset = headers_extract_charset($headers)) { - return $charset; - } - - if ($charset = html_extract_charset($htmlContent)) { - return $charset; + if (preg_match('!(.*?)!is', $html, $matches)) { + return trim(str_replace("\n", '', $matches[1])); } - - return $defaultCharset; + return false; } /** - * Extract charset from HTTP headers if it's defined. + * Extract charset from HTTP header if it's defined. * - * @param array $headers HTTP headers array. + * @param string $header HTTP header Content-Type line. * * @return bool|string Charset string if found (lowercase), false otherwise. */ - function headers_extract_charset($headers) + function header_extract_charset($header) { - if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { - preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); - if (! empty($match[1])) { - return strtolower(trim($match[1])); - } + preg_match('/charset="?([^; ]+)/i', $header, $match); + if (! empty($match[1])) { + return strtolower(trim($match[1])); } return false; @@@ -102,15 -123,14 +123,15 @@@ function count_private($links * * @param string $text input string. * @param string $redirector if a redirector is set, use it to gerenate links. + * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. * * @return string returns $text with all links converted to HTML links. 
* * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722 */ -function text2clickable($text, $redirector = '') +function text2clickable($text, $redirector = '', $urlEncode = true) { - $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[[:alnum:]]/?)!si'; + $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9]/?)!si'; if (empty($redirector)) { return preg_replace($regex, '$1', $text); @@@ -118,9 -138,8 +139,9 @@@ // Redirector is set, urlencode the final URL. return preg_replace_callback( $regex, - function ($matches) use ($redirector) { - return ''. $matches[1] .''; + function ($matches) use ($redirector, $urlEncode) { + $url = $urlEncode ? urlencode($matches[1]) : $matches[1]; + return ''. $matches[1] .''; }, $text ); @@@ -166,13 -185,12 +187,13 @@@ function space2nbsp($text * * @param string $description shaare's description. * @param string $redirector if a redirector is set, use it to gerenate links. + * @param bool $urlEncode Use `urlencode()` on the URL after the redirector or not. * @param string $indexUrl URL to Shaarli's index. - * + * @return string formatted description. 
*/ -function format_description($description, $redirector = '', $indexUrl = '') { - return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector), $indexUrl))); +function format_description($description, $redirector = '', $urlEncode = true, $indexUrl = '') { + return nl2br(space2nbsp(hashtag_autolink(text2clickable($description, $redirector, $urlEncode), $indexUrl))); } /** diff --combined index.php index 27335a36,ac51038d..d57789e6 --- a/index.php +++ b/index.php @@@ -64,6 -64,7 +64,6 @@@ require_once 'application/FeedBuilder.p require_once 'application/FileUtils.php'; require_once 'application/History.php'; require_once 'application/HttpUtils.php'; -require_once 'application/Languages.php'; require_once 'application/LinkDB.php'; require_once 'application/LinkFilter.php'; require_once 'application/LinkUtils.php'; @@@ -75,10 -76,8 +75,10 @@@ require_once 'application/Utils.php' require_once 'application/PluginManager.php'; require_once 'application/Router.php'; require_once 'application/Updater.php'; +use \Shaarli\Languages; use \Shaarli\ThemeUtils; use \Shaarli\Config\ConfigManager; +use \Shaarli\SessionManager; // Ensure the PHP version is supported try { @@@ -89,7 -88,7 +89,7 @@@ exit; } -define('shaarli_version', ApplicationUtils::getVersion(__DIR__ .'/'. ApplicationUtils::$VERSION_FILE)); +define('SHAARLI_VERSION', ApplicationUtils::getVersion(__DIR__ .'/'. ApplicationUtils::$VERSION_FILE)); // Force cookie path (but do not change lifetime) $cookie = session_get_cookie_params(); @@@ -116,23 -115,14 +116,23 @@@ if (session_id() == '') } // Regenerate session ID if invalid or not defined in cookie. 
-if (isset($_COOKIE['shaarli']) && !is_session_id_valid($_COOKIE['shaarli'])) { +if (isset($_COOKIE['shaarli']) && !SessionManager::checkId($_COOKIE['shaarli'])) { session_regenerate_id(true); $_COOKIE['shaarli'] = session_id(); } $conf = new ConfigManager(); +$sessionManager = new SessionManager($_SESSION, $conf); + +// Sniff browser language and set date format accordingly. +if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) { + autoLocale($_SERVER['HTTP_ACCEPT_LANGUAGE']); +} + +new Languages(setlocale(LC_MESSAGES, 0), $conf); + $conf->setEmpty('general.timezone', date_default_timezone_get()); -$conf->setEmpty('general.title', 'Shared links on '. escape(index_url($_SERVER))); +$conf->setEmpty('general.title', t('Shared links on '). escape(index_url($_SERVER))); RainTPL::$tpl_dir = $conf->get('resource.raintpl_tpl').'/'.$conf->get('resource.theme').'/'; // template directory RainTPL::$cache_dir = $conf->get('resource.raintpl_tmp'); // cache directory @@@ -154,7 -144,7 +154,7 @@@ if (! is_file($conf->getConfigFileExt() $errors = ApplicationUtils::checkResourcePermissions($conf); if ($errors != array()) { - $message = '

Insufficient permissions:

'. t('Insufficient permissions:') .'

'.$error.'

get('redirector.url')); + $link['description'] = format_description( + $link['description'], + $conf->get('redirector.url'), + $conf->get('redirector.encode_url') + ); $classLi = ($i % 2) != 0 ? '' : 'publicLinkHightLight'; $link['class'] = $link['private'] == 0 ? $classLi : 'private'; $link['timestamp'] = $link['created']->getTimestamp(); @@@ -1954,10 -1944,10 +1948,10 @@@ function lazyThumbnail($conf, $url,$hre * Installation * This function should NEVER be called if the file data/config.php exists. * - * @param ConfigManager $conf Configuration Manager instance. + * @param ConfigManager $conf Configuration Manager instance. + * @param SessionManager $sessionManager SessionManager instance */ -function install($conf) -{ +function install($conf, $sessionManager) { // On free.fr host, make sure the /sessions directory exists, otherwise login will not work. if (endsWith($_SERVER['HTTP_HOST'],'.free.fr') && !is_dir($_SERVER['DOCUMENT_ROOT'].'/sessions')) mkdir($_SERVER['DOCUMENT_ROOT'].'/sessions',0705); @@@ -1966,20 -1956,12 +1960,20 @@@ // (Because on some hosts, session.save_path may not be set correctly, // or we may not have write access to it.) if (isset($_GET['test_session']) && ( !isset($_SESSION) || !isset($_SESSION['session_tested']) || $_SESSION['session_tested']!='Working')) - { // Step 2: Check if data in session is correct. - echo '

Sessions do not seem to work correctly on your server.
';
 -        echo 'Make sure the variable session.save_path is set correctly in your php config, and that you have write access to it.
';
 -        echo 'It currently points to '.session_save_path().'
';
 -        echo 'Check that the hostname used to access Shaarli contains a dot. On some browsers, accessing your server via a hostname like \'localhost\' or any custom hostname without a dot causes cookie storage to fail. We recommend accessing your server via it\'s IP address or Fully Qualified Domain Name.
';
 -        echo '
Click to try again.

'; + { + // Step 2: Check if data in session is correct. + $msg = t( + '

Sessions do not seem to work correctly on your server.
'.
 +            'Make sure the variable "session.save_path" is set correctly in your PHP config, '.
 +            'and that you have write access to it.
'.
 +            'It currently points to %s.
'.
 +            'On some browsers, accessing your server via a hostname like \'localhost\' '.
 +            'or any custom hostname without a dot causes cookie storage to fail. '.
 +            'We recommend accessing your server via it\'s IP address or Fully Qualified Domain Name.
'
 +        );
 +        $msg = sprintf($msg, session_save_path());
 +        echo $msg;
 +        echo '
'. t('Click to try again.') .'

'; die; } if (!isset($_SESSION['session_tested'])) @@@ -2012,7 -1994,6 +2006,7 @@@ } else { $conf->set('general.title', 'Shared links on '.escape(index_url($_SERVER))); } + $conf->set('translation.language', escape($_POST['language'])); $conf->set('updates.check_updates', !empty($_POST['updateCheck'])); $conf->set('api.enabled', !empty($_POST['enableApi'])); $conf->set( @@@ -2040,11 -2021,10 +2034,11 @@@ exit; } - $PAGE = new PageBuilder($conf); + $PAGE = new PageBuilder($conf, null, $sessionManager->generateToken()); list($continents, $cities) = generateTimeZoneData(timezone_identifiers_list(), date_default_timezone_get()); $PAGE->assign('continents', $continents); $PAGE->assign('cities', $cities); + $PAGE->assign('languages', Languages::getAvailableLanguages()); $PAGE->renderPage('install'); exit; } @@@ -2317,7 -2297,7 +2311,7 @@@ $response = $app->run(true) if ($response->getStatusCode() == 404 && strpos($_SERVER['REQUEST_URI'], '/api/v1') === false) { // We use UTF-8 for proper international characters handling. header('Content-Type: text/html; charset=utf-8'); - renderPage($conf, $pluginManager, $linkDb, $history); + renderPage($conf, $pluginManager, $linkDb, $history, $sessionManager); } else { $app->respond($response); } diff --combined tests/LinkUtilsTest.php index 99679320,ef650f44..7fbd59b0 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php @@@ -28,28 -28,14 +28,14 @@@ class LinkUtilsTest extends PHPUnit_Fra $this->assertFalse(html_extract_title($html)); } - /** - * Test get_charset() with all priorities. 
- */ - public function testGetCharset() - { - $headers = array('Content-Type' => 'text/html; charset=Headers'); - $html = 'stuff'; - $default = 'default'; - $this->assertEquals('headers', get_charset($headers, $html, $default)); - $this->assertEquals('html', get_charset(array(), $html, $default)); - $this->assertEquals($default, get_charset(array(), '', $default)); - $this->assertEquals('utf-8', get_charset(array(), '')); - } - /** * Test headers_extract_charset() when the charset is found. */ public function testHeadersExtractExistentCharset() { $charset = 'x-MacCroatian'; - $headers = array('Content-Type' => 'text/html; charset='. $charset); - $this->assertEquals(strtolower($charset), headers_extract_charset($headers)); + $headers = 'text/html; charset='. $charset; + $this->assertEquals(strtolower($charset), header_extract_charset($headers)); } /** @@@ -57,11 -43,11 +43,11 @@@ */ public function testHeadersExtractNonExistentCharset() { - $headers = array(); - $this->assertFalse(headers_extract_charset($headers)); + $headers = ''; + $this->assertFalse(header_extract_charset($headers)); - $headers = array('Content-Type' => 'text/html'); - $this->assertFalse(headers_extract_charset($headers)); + $headers = 'text/html'; + $this->assertFalse(header_extract_charset($headers)); } /** @@@ -85,6 -71,131 +71,131 @@@ $this->assertFalse(html_extract_charset($html)); } + /** + * Test the download callback with valid value + */ + public function testCurlDownloadCallbackOk() + { + $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok'); + $data = [ + 'HTTP/1.1 200 OK', + 'Server: GitHub.com', + 'Date: Sat, 28 Oct 2017 12:01:33 GMT', + 'Content-Type: text/html; charset=utf-8', + 'Status: 200 OK', + 'end' => 'th=device-width">Refactoring Â· GitHubRefactoring Â· GitHub', + 'end' => 'th=device-width">Refactoring Â· GitHubRefactoring Â· GitHubhttp://hello.there/is=someone#here otherstuff'; $processedText = text2clickable($text, ''); 
$this->assertEquals($expectedText, $processedText); + + $text = 'stuff http://hello.there/is=someone#here(please) otherstuff'; + $expectedText = 'stuff http://hello.there/is=someone#here(please) otherstuff'; + $processedText = text2clickable($text, ''); + $this->assertEquals($expectedText, $processedText); + + $text = 'stuff http://hello.there/is=someone#here(please)&no otherstuff'; + $expectedText = 'stuff http://hello.there/is=someone#here(please)&no otherstuff'; + $processedText = text2clickable($text, ''); + $this->assertEquals($expectedText, $processedText); } /** @@@ -130,21 -231,6 +241,21 @@@ $this->assertEquals($expectedText, $processedText); } + /** + * Test text2clickable a redirector set and without URL encode. + */ + public function testText2clickableWithRedirectorDontEncode() + { + $text = 'stuff http://hello.there/?is=someone&or=something#here otherstuff'; + $redirector = 'http://redirector.to'; + $expectedText = 'stuff http://hello.there/?is=someone&or=something#here otherstuff'; + $processedText = text2clickable($text, $redirector, false); + $this->assertEquals($expectedText, $processedText); + } + /** * Test testSpace2nbsp. */ @@@ -207,3 -293,96 +318,96 @@@ return str_replace('$1', $hashtag, $hashtagLink); } } + + // old style mock: PHPUnit doesn't allow function mock + + /** + * Returns code 200 or html content type. + * + * @param resource $ch cURL resource + * @param int $type cURL info type + * + * @return int|string 200 or 'text/html' + */ + function ut_curl_getinfo_ok($ch, $type) + { + switch ($type) { + case CURLINFO_RESPONSE_CODE: + return 200; + case CURLINFO_CONTENT_TYPE: + return 'text/html; charset=utf-8'; + } + } + + /** + * Returns code 200 or html content type without charset. 
+ * + * @param resource $ch cURL resource + * @param int $type cURL info type + * + * @return int|string 200 or 'text/html' + */ + function ut_curl_getinfo_no_charset($ch, $type) + { + switch ($type) { + case CURLINFO_RESPONSE_CODE: + return 200; + case CURLINFO_CONTENT_TYPE: + return 'text/html'; + } + } + + /** + * Invalid response code. + * + * @param resource $ch cURL resource + * @param int $type cURL info type + * + * @return int|string 404 or 'text/html' + */ + function ut_curl_getinfo_rc_ko($ch, $type) + { + switch ($type) { + case CURLINFO_RESPONSE_CODE: + return 404; + case CURLINFO_CONTENT_TYPE: + return 'text/html; charset=utf-8'; + } + } + + /** + * Invalid content type. + * + * @param resource $ch cURL resource + * @param int $type cURL info type + * + * @return int|string 200 or 'text/plain' + */ + function ut_curl_getinfo_ct_ko($ch, $type) + { + switch ($type) { + case CURLINFO_RESPONSE_CODE: + return 200; + case CURLINFO_CONTENT_TYPE: + return 'text/plain'; + } + } + + /** + * Invalid response code and content type. + * + * @param resource $ch cURL resource + * @param int $type cURL info type + * + * @return int|string 404 or 'text/plain' + */ + function ut_curl_getinfo_rs_ct_ko($ch, $type) + { + switch ($type) { + case CURLINFO_RESPONSE_CODE: + return 404; + case CURLINFO_CONTENT_TYPE: + return 'text/plain'; + } + } +