* GET an HTTP URL to retrieve its content
* Uses the cURL library or a fallback method
*
- * @param string $url URL to get (http://...)
- * @param int $timeout network timeout (in seconds)
- * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
- * Can be used to add download conditions on the
- * headers (response code, content type, etc.).
+ * @param string $url URL to get (http://...)
+ * @param int $timeout network timeout (in seconds)
+ * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
+ * (cURL CURLOPT_HEADERFUNCTION). Can be used to add download
+ * conditions on the headers (response code, content type, etc.).
+ * @param callable|string $curlWriteFunction Optional callback called during the download of the content
+ * (cURL CURLOPT_WRITEFUNCTION). Response headers are only part of
+ * the data it receives when no $curlHeaderFunction is provided.
*
* @return array HTTP response headers, downloaded content
*
* @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720
*/
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
-{
+function get_http_response(
+ $url,
+ $timeout = 30,
+ $maxBytes = 4194304,
+ $curlHeaderFunction = null,
+ $curlWriteFunction = null
+) {
$urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii();
// General cURL settings
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
- curl_setopt($ch, CURLOPT_HEADER, true);
+ // Default header download if the $curlHeaderFunction is not defined
+ curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
curl_setopt(
$ch,
CURLOPT_HTTPHEADER,
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
- if (is_callable($curlWriteFunction)) {
- curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
- }
-
// Max download size management
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+ if (is_callable($curlHeaderFunction)) {
+ curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+ }
+ if (is_callable($curlWriteFunction)) {
+ curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+ }
curl_setopt(
$ch,
CURLOPT_PROGRESSFUNCTION,
- function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
- if (version_compare(phpversion(), '5.5', '<')) {
- // PHP version lower than 5.5
- // Callback has 4 arguments
- $downloaded = $arg1;
- } else {
- // Callback has 5 arguments
- $downloaded = $arg2;
- }
+ function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
+ $downloaded = $arg2;
+
// Non-zero return stops downloading
return ($downloaded > $maxBytes) ? 1 : 0;
}
return ! empty($server['HTTPS']);
}
+/**
+ * Get cURL callback function for CURLOPT_HEADERFUNCTION
+ *
+ * The returned closure is meant to be called with each chunk of header data. It returns
+ * the chunk length to keep going, or false to stop the download when:
+ * - the response code is set and is not 200, 301 or 302;
+ * - the content type is set and does not contain 'text/html'.
+ *
+ * @param string $charset charset extracted from the response content type (reference);
+ * only written when it is currently empty
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_header_callback(
+ &$charset,
+ $curlGetInfo = 'curl_getinfo'
+) {
+ // Captured by reference below, so the flag is shared by every call of the closure.
+ $isRedirected = false;
+
+ return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
+ $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+ $chunkLength = strlen($data);
+ // NOTE(review): only 301 and 302 are treated as redirections; other 3xx codes
+ // (303, 307, 308) reach the "not 200" check below and stop the download - confirm intended.
+ if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
+ $isRedirected = true;
+ return $chunkLength;
+ }
+ if (!empty($responseCode) && $responseCode !== 200) {
+ return false;
+ }
+ // After a redirection, the content type will keep the previous request value
+ // until it finds the next content-type header.
+ if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
+ $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+ }
+ // $contentType may still be unset here; empty() treats that as "no content type yet".
+ if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+ return false;
+ }
+ // Keep a charset that is already set; only fill it in when it is empty.
+ if (!empty($contentType) && empty($charset)) {
+ $charset = header_extract_charset($contentType);
+ }
+
+ return $chunkLength;
+ };
+}
+
/**
* Get cURL callback function for CURLOPT_WRITEFUNCTION
*
&$title,
&$description,
&$keywords,
- $retrieveDescription,
- $curlGetInfo = 'curl_getinfo'
+ $retrieveDescription
) {
- $isRedirected = false;
$currentChunk = 0;
$foundChunk = null;
*
* @return int|bool length of $data or false if we need to stop the download
*/
- return function (&$ch, $data) use (
+ return function ($ch, $data) use (
$retrieveDescription,
- $curlGetInfo,
&$charset,
&$title,
&$description,
&$keywords,
- &$isRedirected,
&$currentChunk,
&$foundChunk
) {
+ $chunkLength = strlen($data);
$currentChunk++;
- $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
- if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
- $isRedirected = true;
- return strlen($data);
- }
- if (!empty($responseCode) && $responseCode !== 200) {
- return false;
- }
- // After a redirection, the content type will keep the previous request value
- // until it finds the next content-type header.
- if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
- $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
- }
- if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
- return false;
- }
- if (!empty($contentType) && empty($charset)) {
- $charset = header_extract_charset($contentType);
- }
+
if (empty($charset)) {
$charset = html_extract_charset($data);
}
$title = html_extract_title($data);
$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
}
+ if (empty($title)) {
+ $title = html_extract_tag('title', $data);
+ $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+ }
if ($retrieveDescription && empty($description)) {
$description = html_extract_tag('description', $data);
$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
return false;
}
- return strlen($data);
+ return $chunkLength;
};
}
$this->assertFalse(html_extract_tag('description', $html));
}
+ /**
+ * Test the header callback with valid values: every header line must be accepted
+ * (integer return value) and the charset must end up as 'utf-8'.
+ */
+ public function testCurlHeaderCallbackOk(): void
+ {
+ // 'ut_curl_getinfo_ok' replaces curl_getinfo; presumably a test stub defined elsewhere - verify.
+ $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ 'Server: GitHub.com',
+ 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
+ 'Content-Type: text/html; charset=utf-8',
+ 'Status: 200 OK',
+ ];
+
+ // The callback is fed one header line at a time; null is passed as the cURL handle.
+ foreach ($data as $chunk) {
+ static::assertIsInt($callback(null, $chunk));
+ }
+
+ static::assertSame('utf-8', $charset);
+ }
+
/**
* Test the download callback with valid value
*/
- public function testCurlDownloadCallbackOk()
+ public function testCurlDownloadCallbackOk(): void
{
+ $charset = 'utf-8';
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
- false,
- 'ut_curl_getinfo_ok'
+ false
);
+
$data = [
- 'HTTP/1.1 200 OK',
- 'Server: GitHub.com',
- 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
- 'Content-Type: text/html; charset=utf-8',
- 'Status: 200 OK',
- 'end' => 'th=device-width">'
+ 'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
'<title>ignored</title>'
. '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $expected = $key !== 'end' ? strlen($line) : false;
- $this->assertEquals($expected, $callback($ignore, $line));
- if ($expected === false) {
- break;
- }
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
- $this->assertEquals('utf-8', $charset);
- $this->assertEquals('Refactoring · GitHub', $title);
- $this->assertEmpty($desc);
- $this->assertEmpty($keywords);
+
+ static::assertSame('utf-8', $charset);
+ static::assertSame('Refactoring · GitHub', $title);
+ static::assertEmpty($desc);
+ static::assertEmpty($keywords);
+ }
+
+ /**
+ * Test the header callback with a valid response and no charset: the header line
+ * must be accepted (its length is returned) and the charset must be false.
+ */
+ public function testCurlHeaderCallbackNoCharset(): void
+ {
+ // 'ut_curl_getinfo_no_charset' replaces curl_getinfo; presumably a test stub defined elsewhere - verify.
+ $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ ];
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
+ }
+
+ static::assertFalse($charset);
}
/**
* Test the download callback with valid values and no charset
*/
- public function testCurlDownloadCallbackOkNoCharset()
+ public function testCurlDownloadCallbackOkNoCharset(): void
{
+ $charset = null;
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
- false,
- 'ut_curl_getinfo_no_charset'
+ false
);
+
$data = [
- 'HTTP/1.1 200 OK',
'end' => 'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
. '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $this->assertEquals(strlen($line), $callback($ignore, $line));
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
+
$this->assertEmpty($charset);
$this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc);
/**
* Test the download callback with valid values and a charset declared in the HTML content
*/
- public function testCurlDownloadCallbackOkHtmlCharset()
+ public function testCurlDownloadCallbackOkHtmlCharset(): void
{
+ $charset = null;
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
- false,
- 'ut_curl_getinfo_no_charset'
+ false
);
+
$data = [
- 'HTTP/1.1 200 OK',
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
'end' => 'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<meta name="description" content="desc" />'
. '<meta name="keywords" content="key1,key2" />',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $expected = $key !== 'end' ? strlen($line) : false;
- $this->assertEquals($expected, $callback($ignore, $line));
- if ($expected === false) {
- break;
- }
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
+
$this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc);
/**
* Test the download callback with valid values and no title
*/
- public function testCurlDownloadCallbackOkNoTitle()
+ public function testCurlDownloadCallbackOkNoTitle(): void
{
+ $charset = 'utf-8';
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
- false,
- 'ut_curl_getinfo_ok'
+ false
);
+
$data = [
- 'HTTP/1.1 200 OK',
'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
'ignored',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $this->assertEquals(strlen($line), $callback($ignore, $line));
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
+
$this->assertEquals('utf-8', $charset);
$this->assertEmpty($title);
$this->assertEmpty($desc);
}
/**
- * Test the download callback with an invalid content type.
+ * Test the header callback with an invalid content type.
*/
- public function testCurlDownloadCallbackInvalidContentType()
+ public function testCurlHeaderCallbackInvalidContentType(): void
{
- $callback = get_curl_download_callback(
- $charset,
- $title,
- $desc,
- $keywords,
- false,
- 'ut_curl_getinfo_ct_ko'
- );
- $ignore = null;
- $this->assertFalse($callback($ignore, ''));
- $this->assertEmpty($charset);
- $this->assertEmpty($title);
+ $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ct_ko');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ ];
+
+ static::assertFalse($callback(null, $data[0]));
+ static::assertNull($charset);
}
/**
- * Test the download callback with an invalid response code.
+ * Test the header callback with an invalid response code.
*/
- public function testCurlDownloadCallbackInvalidResponseCode()
+ public function testCurlHeaderCallbackInvalidResponseCode(): void
{
- $callback = $callback = get_curl_download_callback(
- $charset,
- $title,
- $desc,
- $keywords,
- false,
- 'ut_curl_getinfo_rc_ko'
- );
- $ignore = null;
- $this->assertFalse($callback($ignore, ''));
- $this->assertEmpty($charset);
- $this->assertEmpty($title);
+ $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rc_ko');
+
+ static::assertFalse($callback(null, ''));
+ static::assertNull($charset);
}
/**
- * Test the download callback with an invalid content type and response code.
+ * Test the header callback with an invalid content type and response code.
*/
- public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
+ public function testCurlHeaderCallbackInvalidContentTypeAndResponseCode(): void
{
- $callback = $callback = get_curl_download_callback(
- $charset,
- $title,
- $desc,
- $keywords,
- false,
- 'ut_curl_getinfo_rs_ct_ko'
- );
- $ignore = null;
- $this->assertFalse($callback($ignore, ''));
- $this->assertEmpty($charset);
- $this->assertEmpty($title);
+ $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_rs_ct_ko');
+
+ static::assertFalse($callback(null, ''));
+ static::assertNull($charset);
}
/**
* Test the download callback with valid value, and retrieve_description option enabled.
*/
- public function testCurlDownloadCallbackOkWithDesc()
+ public function testCurlDownloadCallbackOkWithDesc(): void
{
+ $charset = 'utf-8';
$callback = get_curl_download_callback(
$charset,
$title,
$desc,
$keywords,
- true,
- 'ut_curl_getinfo_ok'
+ true
);
$data = [
- 'HTTP/1.1 200 OK',
- 'Server: GitHub.com',
- 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
- 'Content-Type: text/html; charset=utf-8',
- 'Status: 200 OK',
'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
. '<meta name="description" content="link desc" />'
. '<meta name="keywords" content="key1,key2" />',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $expected = $key !== 'end' ? strlen($line) : false;
- $this->assertEquals($expected, $callback($ignore, $line));
- if ($expected === false) {
- break;
- }
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
+
$this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title);
$this->assertEquals('link desc', $desc);
* Test the download callback with valid value, and retrieve_description option enabled,
* but no desc or keyword defined in the page.
*/
- public function testCurlDownloadCallbackOkWithDescNotFound()
+ public function testCurlDownloadCallbackOkWithDescNotFound(): void
{
+ $charset = 'utf-8';
$callback = get_curl_download_callback(
$charset,
$title,
'ut_curl_getinfo_ok'
);
$data = [
- 'HTTP/1.1 200 OK',
- 'Server: GitHub.com',
- 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
- 'Content-Type: text/html; charset=utf-8',
- 'Status: 200 OK',
'th=device-width">'
. '<title>Refactoring · GitHub</title>'
. '<link rel="search" type="application/opensea',
'end' => '<title>ignored</title>',
];
- foreach ($data as $key => $line) {
- $ignore = null;
- $expected = $key !== 'end' ? strlen($line) : false;
- $this->assertEquals($expected, $callback($ignore, $line));
- if ($expected === false) {
- break;
- }
+
+ foreach ($data as $chunk) {
+ static::assertSame(strlen($chunk), $callback(null, $chunk));
}
+
$this->assertEquals('utf-8', $charset);
$this->assertEquals('Refactoring · GitHub', $title);
$this->assertEmpty($desc);
$remoteTitle = 'Remote Title ';
$remoteDesc = 'Sometimes the meta description is relevant.';
$remoteTags = 'abc def';
+ $remoteCharset = 'utf-8';
$expectedResult = [
'title' => $remoteTitle,
'tags' => $remoteTags,
];
+ $this->httpAccess
+ ->expects(static::once())
+ ->method('getCurlHeaderCallback')
+ ->willReturnCallback(
+ function (&$charset) use (
+ $remoteCharset
+ ): callable {
+ return function () use (
+ &$charset,
+ $remoteCharset
+ ): void {
+ $charset = $remoteCharset;
+ };
+ }
+ )
+ ;
$this->httpAccess
->expects(static::once())
->method('getCurlDownloadCallback')
->willReturnCallback(
function (&$charset, &$title, &$description, &$tags) use (
+ $remoteCharset,
$remoteTitle,
$remoteDesc,
$remoteTags
&$title,
&$description,
&$tags,
+ $remoteCharset,
$remoteTitle,
$remoteDesc,
$remoteTags
): void {
- $charset = 'ISO-8859-1';
+ static::assertSame($remoteCharset, $charset);
+
$title = $remoteTitle;
$description = $remoteDesc;
$tags = $remoteTags;
->expects(static::once())
->method('getHttpResponse')
->with($url, 30, 4194304)
- ->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void {
- $callback();
+ ->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
+ $headerCallback();
+ $dlCallback();
})
;
->expects(static::once())
->method('getCurlDownloadCallback')
->willReturnCallback(
- function (&$charset, &$title, &$description, &$tags): callable {
- return function () use (&$charset, &$title, &$description, &$tags): void {};
+ function (): callable {
+ return function (): void {};
+ }
+ )
+ ;
+ $this->httpAccess
+ ->expects(static::once())
+ ->method('getCurlHeaderCallback')
+ ->willReturnCallback(
+ function (): callable {
+ return function (): void {};
}
)
;
->expects(static::once())
->method('getHttpResponse')
->with($url, 30, 4194304)
- ->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void {
- $callback();
+ ->willReturnCallback(function($url, $timeout, $maxBytes, $headerCallback, $dlCallback): void {
+ $headerCallback();
+ $dlCallback();
})
;