* GET an HTTP URL to retrieve its content
* Uses the cURL library or a fallback method
*
- * @param string $url URL to get (http://...)
- * @param int $timeout network timeout (in seconds)
- * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
- * Can be used to add download conditions on the
- * headers (response code, content type, etc.).
+ * @param string $url URL to get (http://...)
+ * @param int $timeout network timeout (in seconds)
+ * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
+ * (CURLOPT_HEADERFUNCTION)
+ * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ * Can be used to add download conditions on the
+ * headers (response code, content type, etc.).
*
* @return array HTTP response headers, downloaded content
*
* @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720
*/
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
-{
+function get_http_response(
+ $url,
+ $timeout = 30,
+ $maxBytes = 4194304,
+ $curlHeaderFunction = null,
+ $curlWriteFunction = null
+) {
$urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii();
if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
- return array(array(0 => 'Invalid HTTP UrlUtils'), false);
+ return [[0 => 'Invalid HTTP UrlUtils'], false];
}
$userAgent =
$ch = curl_init($cleanUrl);
if ($ch === false) {
- return array(array(0 => 'curl_init() error'), false);
+ return [[0 => 'curl_init() error'], false];
}
// General cURL settings
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
- curl_setopt($ch, CURLOPT_HEADER, true);
+ // Default header download if the $curlHeaderFunction is not defined
+ curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
curl_setopt(
$ch,
CURLOPT_HTTPHEADER,
- array('Accept-Language: ' . $acceptLanguage)
+ ['Accept-Language: ' . $acceptLanguage]
);
curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
+ // Max download size management
+ curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
+ curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+ if (is_callable($curlHeaderFunction)) {
+ curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+ }
if (is_callable($curlWriteFunction)) {
curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
}
-
- // Max download size management
- curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
- curl_setopt($ch, CURLOPT_NOPROGRESS, false);
curl_setopt(
$ch,
CURLOPT_PROGRESSFUNCTION,
- function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
- if (version_compare(phpversion(), '5.5', '<')) {
- // PHP version lower than 5.5
- // Callback has 4 arguments
- $downloaded = $arg1;
- } else {
- // Callback has 5 arguments
- $downloaded = $arg2;
- }
+ function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
+ $downloaded = $arg2;
+
// Non-zero return stops downloading
return ($downloaded > $maxBytes) ? 1 : 0;
}
* Removing this would require updating
* GetHttpUrlTest::testGetInvalidRemoteUrl()
*/
- return array(false, false);
+ return [false, false];
}
- return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
+ return [[0 => 'curl_exec() error: ' . $errorStr], false];
}
// Formatting output like the fallback method
$rawHeadersLastRedir = end($rawHeadersArrayRedirs);
$content = substr($response, $headSize);
- $headers = array();
+ $headers = [];
foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
if (empty($line) || ctype_space($line)) {
continue;
$value = $splitLine[1];
if (array_key_exists($key, $headers)) {
if (!is_array($headers[$key])) {
- $headers[$key] = array(0 => $headers[$key]);
+ $headers[$key] = [0 => $headers[$key]];
}
$headers[$key][] = $value;
} else {
}
}
- return array($headers, $content);
+ return [$headers, $content];
}
/**
$acceptLanguage,
$maxRedr
) {
- $options = array(
- 'http' => array(
+ $options = [
+ 'http' => [
'method' => 'GET',
'timeout' => $timeout,
'user_agent' => $userAgent,
'header' => "Accept: */*\r\n"
. 'Accept-Language: ' . $acceptLanguage
- )
- );
+ ]
+ ];
stream_context_set_default($options);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
}
if (! $headers) {
- return array($headers, false);
+ return [$headers, false];
}
try {
$context = stream_context_create($options);
$content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
} catch (Exception $exc) {
- return array(array(0 => 'HTTP Error'), $exc->getMessage());
+ return [[0 => 'HTTP Error'], $exc->getMessage()];
}
- return array($headers, $content);
+ return [$headers, $content];
}
/**
}
// Headers found, redirection found, and limit not reached.
- if ($redirectionLimit-- > 0
+ if (
+ $redirectionLimit-- > 0
&& !empty($headers)
&& (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
- && !empty($headers['Location'])) {
+ && !empty($headers['Location'])
+ ) {
$redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
if ($redirection != $url) {
$redirection = getAbsoluteUrl($url, $redirection);
}
}
- return array($headers, $url);
+ return [$headers, $url];
}
/**
}
$parts = parse_url($originalUrl);
- $final = $parts['scheme'] .'://'. $parts['host'];
+ $final = $parts['scheme'] . '://' . $parts['host'];
$final .= (!empty($parts['port'])) ? $parts['port'] : '';
$final .= '/';
if ($newUrl[0] != '/') {
$scheme = 'https';
}
- if (($scheme == 'http' && $port != '80')
+ if (
+ ($scheme == 'http' && $port != '80')
|| ($scheme == 'https' && $port != '443')
) {
$port = ':' . $port;
$host = $server['SERVER_NAME'];
}
- return $scheme.'://'.$host.$port;
+ return $scheme . '://' . $host . $port;
}
// SSL detection
- if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
- || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
+ if (
+ (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
+ || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')
+ ) {
$scheme = 'https';
}
// Do not append standard port values
- if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
- || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
- $port = ':'.$server['SERVER_PORT'];
+ if (
+ ($scheme == 'http' && $server['SERVER_PORT'] != '80')
+ || ($scheme == 'https' && $server['SERVER_PORT'] != '443')
+ ) {
+ $port = ':' . $server['SERVER_PORT'];
}
- return $scheme.'://'.$server['SERVER_NAME'].$port;
+ return $scheme . '://' . $server['SERVER_NAME'] . $port;
}
/**
*/
function index_url($server)
{
- $scriptname = $server['SCRIPT_NAME'];
+ if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
+ return rtrim(SHAARLI_ROOT_URL, '/') . '/';
+ }
+
+ $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
if (endsWith($scriptname, 'index.php')) {
$scriptname = substr($scriptname, 0, -9);
}
}
/**
- * Returns the absolute URL of the current script, with the query
+ * Returns the absolute URL of the current script, with current route and query
*
* If the resource is "index.php", then it is removed (for better-looking URLs)
*
*/
function page_url($server)
{
+    $scriptname = $server['SCRIPT_NAME'] ?? '';
+    if (endsWith($scriptname, 'index.php')) {
+        // Drop the trailing "index.php" (9 characters) to keep only the directory part
+        $scriptname = substr($scriptname, 0, -9);
+    }
+
+    // The script path is used inside a regular expression: quote it so that any
+    // metacharacter it contains (".", "+", "(", the "@" delimiter...) is matched literally.
+    $route = preg_replace('@^' . preg_quote($scriptname, '@') . '@', '', $server['REQUEST_URI'] ?? '');
    if (! empty($server['QUERY_STRING'])) {
-        return index_url($server).'?'.$server['QUERY_STRING'];
+        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
    }
-    return index_url($server);
+
+    return index_url($server) . $route;
}
/**
return ! empty($server['HTTPS']);
}
+
+/**
+ * Get cURL callback function for CURLOPT_HEADERFUNCTION
+ *
+ * The returned closure is meant to be called during the download of headers.
+ * It lets the transfer continue across 301/302 redirections, stops it (returns false)
+ * when the response code is anything other than 200 or when the content type is not
+ * 'text/html', and fills $charset from the content type when it is still empty.
+ *
+ * @param string          $charset     to extract from the response content type (reference)
+ * @param callable|string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_header_callback(
+    &$charset,
+    $curlGetInfo = 'curl_getinfo'
+) {
+    $isRedirected = false;
+
+    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
+        $chunkLength = strlen($data);
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+
+        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
+            // Redirection: remember it and keep downloading
+            $isRedirected = true;
+            return $chunkLength;
+        }
+        if (!empty($responseCode) && $responseCode !== 200) {
+            return false;
+        }
+
+        // After a redirection, the content type will keep the previous request value
+        // until it finds the next content-type header.
+        $contentType = null;
+        if (! $isRedirected || stripos($data, 'content-type') !== false) {
+            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        }
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (!empty($contentType) && empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+
+        return $chunkLength;
+    };
+}
+
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * @param string $charset             to extract from the downloaded page (reference)
+ * @param string $title               to extract from the downloaded page (reference)
+ * @param string $description         to extract from the downloaded page (reference)
+ * @param string $keywords            to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+ * @param string $tagsSeparator       Separator passed to the tag helpers when formatting the extracted keywords
+ *
+ * @return Closure
+ */
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $tagsSeparator
+) {
+    $currentChunk = 0;
+    $foundChunk = null;
+
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * While downloading the remote page, we extract the charset, the title and, when requested,
+     * the description and keywords, and we stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function (
+        $ch,
+        $data
+    ) use (
+        $retrieveDescription,
+        $tagsSeparator,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $chunkLength = strlen($data);
+        $currentChunk++;
+
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if (empty($title)) {
+            // No title found by html_extract_title(): try the 'title' tag extraction instead
+            $title = html_extract_tag('title', $data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
+                $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
+                    return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
+                }, tags_str2array($keywords, ',')), $tagsSeparator);
+            }
+        }
+
+        // We got everything we want, stop the download.
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        //
+        // This condition only relies on values available in this closure: the response code
+        // and content type are not known here, and testing them made it always false.
+        $gotAllMetas = !empty($title) && !empty($description) && !empty($keywords);
+        if (
+            !empty($charset) && $foundChunk !== null
+            && (! $retrieveDescription || $foundChunk < $currentChunk || $gotAllMetas)
+        ) {
+            return false;
+        }
+
+        return $chunkLength;
+    };
+}