* GET an HTTP URL to retrieve its content
* Uses the cURL library or a fallback method
*
- * @param string $url URL to get (http://...)
- * @param int $timeout network timeout (in seconds)
- * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param string $url URL to get (http://...)
+ * @param int $timeout network timeout (in seconds)
+ * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ * Can be used to add download conditions on the headers (response code, content type, etc.).
*
* @return array HTTP response headers, downloaded content
*
* @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720
*/
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
{
$urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii();
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
+ if (is_callable($curlWriteFunction)) {
+ curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+ }
+
// Max download size management
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
<?php
/**
- * Extract title from an HTML document.
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
*
- * @param string $html HTML content where to look for a title.
+ * @param string $charset to extract from the downloaded page (reference)
+ * @param string $title to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionally overrides the curl_getinfo function
*
- * @return bool|string Extracted title if found, false otherwise.
+ * @return Closure
*/
-function html_extract_title($html)
+function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
{
- if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
- return trim(str_replace("\n", '', $matches[1]));
- }
- return false;
+ /**
+ * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+ *
+ * While downloading the remote page, we check that the HTTP code is 200 and the content type is 'text/html'.
+ * Then we extract the title and the charset and stop the download when it's done.
+ *
+ * @param resource $ch cURL resource
+ * @param string $data chunk of data being downloaded
+ *
+ * @return int|bool length of $data or false if we need to stop the download
+ */
+ return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
+ $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+ if (!empty($responseCode) && $responseCode != 200) {
+ return false;
+ }
+ $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+ if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+ return false;
+ }
+ if (empty($charset)) {
+ $charset = header_extract_charset($contentType);
+ }
+ if (empty($charset)) {
+ $charset = html_extract_charset($data);
+ }
+ if (empty($title)) {
+ $title = html_extract_title($data);
+ }
+ // We got everything we want, stop the download.
+ if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+ return false;
+ }
+
+ return strlen($data);
+ };
}
/**
- * Determine charset from downloaded page.
- * Priority:
- * 1. HTTP headers (Content type).
- * 2. HTML content page (tag <meta charset>).
- * 3. Use a default charset (default: UTF-8).
+ * Extract title from an HTML document.
*
- * @param array $headers HTTP headers array.
- * @param string $htmlContent HTML content where to look for charset.
- * @param string $defaultCharset Default charset to apply if other methods failed.
+ * @param string $html HTML content where to look for a title.
*
- * @return string Determined charset.
+ * @return bool|string Extracted title if found, false otherwise.
*/
-function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+function html_extract_title($html)
{
- if ($charset = headers_extract_charset($headers)) {
- return $charset;
- }
-
- if ($charset = html_extract_charset($htmlContent)) {
- return $charset;
+ if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
+ return trim(str_replace("\n", '', $matches[1]));
}
-
- return $defaultCharset;
+ return false;
}
/**
- * Extract charset from HTTP headers if it's defined.
+ * Extract charset from HTTP header if it's defined.
*
- * @param array $headers HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
*
* @return bool|string Charset string if found (lowercase), false otherwise.
*/
-function headers_extract_charset($headers)
+function header_extract_charset($header)
{
- if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
- preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
- if (! empty($match[1])) {
- return strtolower(trim($match[1]));
- }
+ preg_match('/charset="?([^; ]+)/i', $header, $match);
+ if (! empty($match[1])) {
+ return strtolower(trim($match[1]));
}
return false;
$this->assertFalse(html_extract_title($html));
}
- /**
- * Test get_charset() with all priorities.
- */
- public function testGetCharset()
- {
- $headers = array('Content-Type' => 'text/html; charset=Headers');
- $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
- $default = 'default';
- $this->assertEquals('headers', get_charset($headers, $html, $default));
- $this->assertEquals('html', get_charset(array(), $html, $default));
- $this->assertEquals($default, get_charset(array(), '', $default));
- $this->assertEquals('utf-8', get_charset(array(), ''));
- }
-
/**
* Test header_extract_charset() when the charset is found.
*/
public function testHeadersExtractExistentCharset()
{
$charset = 'x-MacCroatian';
- $headers = array('Content-Type' => 'text/html; charset='. $charset);
- $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+ $headers = 'text/html; charset='. $charset;
+ $this->assertEquals(strtolower($charset), header_extract_charset($headers));
}
/**
*/
public function testHeadersExtractNonExistentCharset()
{
- $headers = array();
- $this->assertFalse(headers_extract_charset($headers));
+ $headers = '';
+ $this->assertFalse(header_extract_charset($headers));
- $headers = array('Content-Type' => 'text/html');
- $this->assertFalse(headers_extract_charset($headers));
+ $headers = 'text/html';
+ $this->assertFalse(header_extract_charset($headers));
}
/**
$this->assertFalse(html_extract_charset($html));
}
+ /**
+ * Test the download callback with valid values
+ */
+ public function testCurlDownloadCallbackOk()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ 'Server: GitHub.com',
+ 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
+ 'Content-Type: text/html; charset=utf-8',
+ 'Status: 200 OK',
+ 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+ '<title>ignored</title>',
+ ];
+ foreach ($data as $key => $line) {
+ $ignore = null;
+ $expected = $key !== 'end' ? strlen($line) : false;
+ $this->assertEquals($expected, $callback($ignore, $line));
+ if ($expected === false) {
+ break;
+ }
+ }
+ $this->assertEquals('utf-8', $charset);
+ $this->assertEquals('Refactoring · GitHub', $title);
+ }
+
+ /**
+ * Test the download callback with valid values and no charset
+ */
+ public function testCurlDownloadCallbackOkNoCharset()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+ '<title>ignored</title>',
+ ];
+ foreach ($data as $key => $line) {
+ $ignore = null;
+ $this->assertEquals(strlen($line), $callback($ignore, $line));
+ }
+ $this->assertEmpty($charset);
+ $this->assertEquals('Refactoring · GitHub', $title);
+ }
+
+ /**
+ * Test the download callback with valid values and a charset found in the HTML content
+ */
+ public function testCurlDownloadCallbackOkHtmlCharset()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
+ 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+ '<title>ignored</title>',
+ ];
+ foreach ($data as $key => $line) {
+ $ignore = null;
+ $expected = $key !== 'end' ? strlen($line) : false;
+ $this->assertEquals($expected, $callback($ignore, $line));
+ if ($expected === false) {
+ break;
+ }
+ }
+ $this->assertEquals('utf-8', $charset);
+ $this->assertEquals('Refactoring · GitHub', $title);
+ }
+
+ /**
+ * Test the download callback with valid values and no title
+ */
+ public function testCurlDownloadCallbackOkNoTitle()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+ $data = [
+ 'HTTP/1.1 200 OK',
+ 'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
+ 'ignored',
+ ];
+ foreach ($data as $key => $line) {
+ $ignore = null;
+ $this->assertEquals(strlen($line), $callback($ignore, $line));
+ }
+ $this->assertEquals('utf-8', $charset);
+ $this->assertEmpty($title);
+ }
+
+ /**
+ * Test the download callback with an invalid content type.
+ */
+ public function testCurlDownloadCallbackInvalidContentType()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
+ $ignore = null;
+ $this->assertFalse($callback($ignore, ''));
+ $this->assertEmpty($charset);
+ $this->assertEmpty($title);
+ }
+
+ /**
+ * Test the download callback with an invalid response code.
+ */
+ public function testCurlDownloadCallbackInvalidResponseCode()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
+ $ignore = null;
+ $this->assertFalse($callback($ignore, ''));
+ $this->assertEmpty($charset);
+ $this->assertEmpty($title);
+ }
+
+ /**
+ * Test the download callback with an invalid content type and response code.
+ */
+ public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
+ {
+ $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
+ $ignore = null;
+ $this->assertFalse($callback($ignore, ''));
+ $this->assertEmpty($charset);
+ $this->assertEmpty($title);
+ }
+
/**
* Test count_private.
*/
return str_replace('$1', $hashtag, $hashtagLink);
}
}
+
+// Old-style mocks: PHPUnit does not allow mocking functions, so stand-ins for curl_getinfo are defined here.
+
+/**
+ * Returns code 200 or html content type.
+ *
+ * @param resource $ch cURL resource
+ * @param int $type cURL info type
+ *
+ * @return int|string 200 or 'text/html; charset=utf-8'
+ */
+function ut_curl_getinfo_ok($ch, $type)
+{
+ switch ($type) {
+ case CURLINFO_RESPONSE_CODE:
+ return 200;
+ case CURLINFO_CONTENT_TYPE:
+ return 'text/html; charset=utf-8';
+ }
+}
+
+/**
+ * Returns code 200 or html content type without charset.
+ *
+ * @param resource $ch cURL resource
+ * @param int $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_no_charset($ch, $type)
+{
+ switch ($type) {
+ case CURLINFO_RESPONSE_CODE:
+ return 200;
+ case CURLINFO_CONTENT_TYPE:
+ return 'text/html';
+ }
+}
+
+/**
+ * Invalid response code.
+ *
+ * @param resource $ch cURL resource
+ * @param int $type cURL info type
+ *
+ * @return int|string 404 or 'text/html; charset=utf-8'
+ */
+function ut_curl_getinfo_rc_ko($ch, $type)
+{
+ switch ($type) {
+ case CURLINFO_RESPONSE_CODE:
+ return 404;
+ case CURLINFO_CONTENT_TYPE:
+ return 'text/html; charset=utf-8';
+ }
+}
+
+/**
+ * Invalid content type.
+ *
+ * @param resource $ch cURL resource
+ * @param int $type cURL info type
+ *
+ * @return int|string 200 or 'text/plain'
+ */
+function ut_curl_getinfo_ct_ko($ch, $type)
+{
+ switch ($type) {
+ case CURLINFO_RESPONSE_CODE:
+ return 200;
+ case CURLINFO_CONTENT_TYPE:
+ return 'text/plain';
+ }
+}
+
+/**
+ * Invalid response code and content type.
+ *
+ * @param resource $ch cURL resource
+ * @param int $type cURL info type
+ *
+ * @return int|string 404 or 'text/plain'
+ */
+function ut_curl_getinfo_rs_ct_ko($ch, $type)
+{
+ switch ($type) {
+ case CURLINFO_RESPONSE_CODE:
+ return 404;
+ case CURLINFO_CONTENT_TYPE:
+ return 'text/plain';
+ }
+}
+