X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=application%2Fhttp%2FHttpUtils.php;h=4bde1d5b8c4b33c97dc91a40f847b96422174f6c;hb=53054b2bf6a919fd4ff9b44b6ad1986f21f488b6;hp=4fc4e3dcff08f3457c7a2d541962c699979732c7;hpb=af41d5ab5d2bd3ba64d052c997bc6afa6966a63c;p=github%2Fshaarli%2FShaarli.git diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 4fc4e3dc..4bde1d5b 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -6,12 +6,14 @@ use Shaarli\Http\Url; * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) - * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). - * Can be used to add download conditions on the - * headers (response code, content type, etc.). + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlHeaderFunction Optional callback called during the download of headers + * (CURLOPT_HEADERFUNCTION) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the + * headers (response code, content type, etc.). * * @return array HTTP response headers, downloaded content * @@ -35,13 +37,18 @@ use Shaarli\Http\Url; * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) -{ +function get_http_response( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null +) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { - return array(array(0 => 'Invalid HTTP UrlUtils'), false); + return [[0 => 'Invalid HTTP UrlUtils'], false]; } $userAgent = @@ -64,42 +71,39 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF $ch = curl_init($cleanUrl); if ($ch === false) { - return array(array(0 => 'curl_init() error'), false); + return [[0 => 'curl_init() error'], false]; } // General cURL settings curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_HEADER, true); + // Default header download if the $curlHeaderFunction is not defined + curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); curl_setopt( $ch, CURLOPT_HTTPHEADER, - array('Accept-Language: ' . $acceptLanguage) + ['Accept-Language: ' . $acceptLanguage] ); curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); + // Max download size management + curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16); + curl_setopt($ch, CURLOPT_NOPROGRESS, false); + if (is_callable($curlHeaderFunction)) { + curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); + } if (is_callable($curlWriteFunction)) { curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); } - - // Max download size management - curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); - curl_setopt($ch, CURLOPT_NOPROGRESS, false); curl_setopt( $ch, CURLOPT_PROGRESSFUNCTION, - function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { - if (version_compare(phpversion(), '5.5', '<')) { - // PHP version lower than 5.5 - // Callback has 4 arguments - $downloaded = $arg1; - } else { - // Callback has 5 arguments - $downloaded = $arg2; - } + function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { + $downloaded = $arg2; + // Non-zero return stops downloading return ($downloaded > $maxBytes) ? 1 : 0; } @@ -118,9 +122,9 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF * Removing this would require updating * GetHttpUrlTest::testGetInvalidRemoteUrl() */ - return array(false, false); + return [false, false]; } - return array(array(0 => 'curl_exec() error: ' . $errorStr), false); + return [[0 => 'curl_exec() error: ' . $errorStr], false]; } // Formatting output like the fallback method @@ -131,7 +135,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF $rawHeadersLastRedir = end($rawHeadersArrayRedirs); $content = substr($response, $headSize); - $headers = array(); + $headers = []; foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { if (empty($line) || ctype_space($line)) { continue; @@ -142,7 +146,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF $value = $splitLine[1]; if (array_key_exists($key, $headers)) { if (!is_array($headers[$key])) { - $headers[$key] = array(0 => $headers[$key]); + $headers[$key] = [0 => $headers[$key]]; } $headers[$key][] = $value; } else { @@ -153,7 +157,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF } } - return array($headers, $content); + return [$headers, $content]; } /** @@ -184,15 +188,15 @@ function get_http_response_fallback( $acceptLanguage, $maxRedr ) { - $options = array( - 'http' => array( + $options = [ + 'http' => [ 'method' => 'GET', 'timeout' => $timeout, 'user_agent' => $userAgent, 'header' => "Accept: */*\r\n" . 'Accept-Language: ' . $acceptLanguage - ) - ); + ] + ]; stream_context_set_default($options); list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); @@ -203,7 +207,7 @@ function get_http_response_fallback( } if (! $headers) { - return array($headers, false); + return [$headers, false]; } try { @@ -211,10 +215,10 @@ function get_http_response_fallback( $context = stream_context_create($options); $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); } catch (Exception $exc) { - return array(array(0 => 'HTTP Error'), $exc->getMessage()); + return [[0 => 'HTTP Error'], $exc->getMessage()]; } - return array($headers, $content); + return [$headers, $content]; } /** @@ -233,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3) } // Headers found, redirection found, and limit not reached. - if ($redirectionLimit-- > 0 + if ( + $redirectionLimit-- > 0 && !empty($headers) && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) - && !empty($headers['Location'])) { + && !empty($headers['Location']) + ) { $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; if ($redirection != $url) { $redirection = getAbsoluteUrl($url, $redirection); @@ -244,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3) } } - return array($headers, $url); + return [$headers, $url]; } /** @@ -266,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl) } $parts = parse_url($originalUrl); - $final = $parts['scheme'] .'://'. $parts['host']; + $final = $parts['scheme'] . '://' . $parts['host']; $final .= (!empty($parts['port'])) ? $parts['port'] : ''; $final .= '/'; if ($newUrl[0] != '/') { @@ -319,7 +325,8 @@ function server_url($server) $scheme = 'https'; } - if (($scheme == 'http' && $port != '80') + if ( + ($scheme == 'http' && $port != '80') || ($scheme == 'https' && $port != '443') ) { $port = ':' . $port; @@ -340,22 +347,26 @@ function server_url($server) $host = $server['SERVER_NAME']; } - return $scheme.'://'.$host.$port; + return $scheme . '://' . $host . $port; } // SSL detection - if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') - || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) { + if ( + (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') + || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443') + ) { $scheme = 'https'; } // Do not append standard port values - if (($scheme == 'http' && $server['SERVER_PORT'] != '80') - || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) { - $port = ':'.$server['SERVER_PORT']; + if ( + ($scheme == 'http' && $server['SERVER_PORT'] != '80') + || ($scheme == 'https' && $server['SERVER_PORT'] != '443') + ) { + $port = ':' . $server['SERVER_PORT']; } - return $scheme.'://'.$server['SERVER_NAME'].$port; + return $scheme . '://' . $server['SERVER_NAME'] . $port; } /** @@ -369,7 +380,11 @@ function server_url($server) */ function index_url($server) { - $scriptname = $server['SCRIPT_NAME'] ?? ''; + if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) { + return rtrim(SHAARLI_ROOT_URL, '/') . '/'; + } + + $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/'; if (endsWith($scriptname, 'index.php')) { $scriptname = substr($scriptname, 0, -9); } @@ -392,7 +407,7 @@ function page_url($server) $scriptname = substr($scriptname, 0, -9); } - $route = ltrim($server['REQUEST_URI'] ?? '', $scriptname); + $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? ''); if (! empty($server['QUERY_STRING'])) { return index_url($server) . $route . '?' . $server['QUERY_STRING']; } @@ -485,6 +500,46 @@ function is_https($server) return ! empty($server['HTTPS']); } +/** + * Get cURL callback function for CURLOPT_WRITEFUNCTION + * + * @param string $charset to extract from the downloaded page (reference) + * @param string $curlGetInfo Optionally overrides curl_getinfo function + * + * @return Closure + */ +function get_curl_header_callback( + &$charset, + $curlGetInfo = 'curl_getinfo' +) { + $isRedirected = false; + + return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { + $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + $chunkLength = strlen($data); + if (!empty($responseCode) && in_array($responseCode, [301, 302])) { + $isRedirected = true; + return $chunkLength; + } + if (!empty($responseCode) && $responseCode !== 200) { + return false; + } + // After a redirection, the content type will keep the previous request value + // until it finds the next content-type header. + if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { + $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); + } + if (!empty($contentType) && strpos($contentType, 'text/html') === false) { + return false; + } + if (!empty($contentType) && empty($charset)) { + $charset = header_extract_charset($contentType); + } + + return $chunkLength; + }; +} + /** * Get cURL callback function for CURLOPT_WRITEFUNCTION * @@ -503,9 +558,8 @@ function get_curl_download_callback( &$description, &$keywords, $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $tagsSeparator ) { - $isRedirected = false; $currentChunk = 0; $foundChunk = null; @@ -520,37 +574,22 @@ function get_curl_download_callback( * * @return int|bool length of $data or false if we need to stop the download */ - return function (&$ch, $data) use ( + return function ( + $ch, + $data + ) use ( $retrieveDescription, - $curlGetInfo, + $tagsSeparator, &$charset, &$title, &$description, &$keywords, - &$isRedirected, &$currentChunk, &$foundChunk ) { + $chunkLength = strlen($data); $currentChunk++; - $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); - if (!empty($responseCode) && in_array($responseCode, [301, 302])) { - $isRedirected = true; - return strlen($data); - } - if (!empty($responseCode) && $responseCode !== 200) { - return false; - } - // After a redirection, the content type will keep the previous request value - // until it finds the next content-type header. - if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { - $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); - } - if (!empty($contentType) && strpos($contentType, 'text/html') === false) { - return false; - } - if (!empty($contentType) && empty($charset)) { - $charset = header_extract_charset($contentType); - } + if (empty($charset)) { $charset = html_extract_charset($data); } @@ -558,6 +597,10 @@ function get_curl_download_callback( $title = html_extract_title($data); $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; } + if (empty($title)) { + $title = html_extract_tag('title', $data); + $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; + } if ($retrieveDescription && empty($description)) { $description = html_extract_tag('description', $data); $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; @@ -567,10 +610,10 @@ function get_curl_download_callback( if (! empty($keywords)) { $foundChunk = $currentChunk; // Keywords use the format tag1, tag2 multiple words, tag - // So we format them to match Shaarli's separator and glue multiple words with '-' - $keywords = implode(' ', array_map(function($keyword) { - return implode('-', preg_split('/\s+/', trim($keyword))); - }, explode(',', $keywords))); + // So we split the result with `,`, then if a tag contains the separator we replace it by `-`. + $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string { + return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-'); + }, tags_str2array($keywords, ',')), $tagsSeparator); } } @@ -578,7 +621,8 @@ function get_curl_download_callback( // If we already found either the title, description or keywords, // it's highly unlikely that we'll found the other metas further than // in the same chunk of data or the next one. So we also stop the download after that. - if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null + if ( + (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null && (! $retrieveDescription || $foundChunk < $currentChunk || (!empty($title) && !empty($description) && !empty($keywords)) @@ -587,6 +631,6 @@ function get_curl_download_callback( return false; } - return strlen($data); + return $chunkLength; }; }