Extract the title/charset during page download, and check content type

author ArthurHoaro <arthur@hoa.ro>

Sat, 30 Sep 2017 09:04:13 +0000 (11:04 +0200)

committer ArthurHoaro <arthur@hoa.ro>

Sat, 28 Oct 2017 12:35:49 +0000 (14:35 +0200)
author ArthurHoaro <arthur@hoa.ro>
Sat, 30 Sep 2017 09:04:13 +0000 (11:04 +0200)
committer ArthurHoaro <arthur@hoa.ro>
Sat, 28 Oct 2017 12:35:49 +0000 (14:35 +0200)
diff --git a/application/HttpUtils.php b/application/HttpUtils.php

index 0083596643f510d4ea131fad9df25de215ff77ac..2edf5ce2df74bcb39fd5a1ce15c401302ff666c1 100644 (file)
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
   * GET an HTTP URL to retrieve its content
   * Uses the cURL library or a fallback method 
   *
- * @param string $url      URL to get (http://...)
- * @param int    $timeout  network timeout (in seconds)
- * @param int    $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param string          $url               URL to get (http://...)
+ * @param int             $timeout           network timeout (in seconds)
+ * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                           Can be used to add download conditions on the headers (response code, content type, etc.).
   *
   * @return array HTTP response headers, downloaded content
   *
@@ -29,7 +31,7 @@
   * @see http://stackoverflow.com/q/9183178
   * @see http://stackoverflow.com/q/1462720
   */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
  {
      $urlObj = new Url($url);
      $cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
      curl_setopt($ch, CURLOPT_TIMEOUT,           $timeout);
      curl_setopt($ch, CURLOPT_USERAGENT,         $userAgent);
  
+    if (is_callable($curlWriteFunction)) {
+        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+    }
+
      // Max download size management
      curl_setopt($ch, CURLOPT_BUFFERSIZE,        1024);
      curl_setopt($ch, CURLOPT_NOPROGRESS,        false);
diff --git a/application/LinkUtils.php b/application/LinkUtils.php

index 976474de721ad14636b9b431f0cec06a8920e120..c0dd32a66cfa0160e4b37160fdf226d4abb14499 100644 (file)
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
  <?php
  
  /**
- * Extract title from an HTML document.
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
   *
- * @param string $html HTML content where to look for a title.
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionally overrides the curl_getinfo function
   *
- * @return bool|string Extracted title if found, false otherwise.
+ * @return Closure
   */
-function html_extract_title($html)
+function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
  {
-    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
-        return trim(str_replace("\n", '', $matches[1]));
-    }
-    return false;
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        if (!empty($responseCode) && $responseCode != 200) {
+            return false;
+        }
+        $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+        }
+        // We got everything we want, stop the download.
+        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+            return false;
+        }
+
+        return strlen($data);
+    };
  }
  
  /**
- * Determine charset from downloaded page.
- * Priority:
- *   1. HTTP headers (Content type).
- *   2. HTML content page (tag <meta charset>).
- *   3. Use a default charset (default: UTF-8).
+ * Extract title from an HTML document.
   *
- * @param array  $headers           HTTP headers array.
- * @param string $htmlContent       HTML content where to look for charset.
- * @param string $defaultCharset    Default charset to apply if other methods failed.
+ * @param string $html HTML content where to look for a title.
   *
- * @return string Determined charset.
+ * @return bool|string Extracted title if found, false otherwise.
   */
-function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+function html_extract_title($html)
  {
-    if ($charset = headers_extract_charset($headers)) {
-        return $charset;
-    }
-
-    if ($charset = html_extract_charset($htmlContent)) {
-        return $charset;
+    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
+        return trim(str_replace("\n", '', $matches[1]));
      }
-
-    return $defaultCharset;
+    return false;
  }
  
  /**
- * Extract charset from HTTP headers if it's defined.
+ * Extract charset from HTTP header if it's defined.
   *
- * @param array $headers HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
   *
   * @return bool|string Charset string if found (lowercase), false otherwise.
   */
-function headers_extract_charset($headers)
+function header_extract_charset($header)
  {
-    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
-        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
-        if (! empty($match[1])) {
-            return strtolower(trim($match[1]));
-        }
+    preg_match('/charset="?([^; ]+)/i', $header, $match);
+    if (! empty($match[1])) {
+        return strtolower(trim($match[1]));
      }
  
      return false;
diff --git a/index.php b/index.php

index fb00a9fa3adb8d302f712fcbe7d7fc2a694409f4..ac51038d7f1f50e940a2a6ad66aa3806dff742b7 100644 (file)
--- a/index.php
+++ b/index.php
@@ -1428,16 +1428,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history)
              // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
              if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
                  // Short timeout to keep the application responsive
-                list($headers, $content) = get_http_response($url, 4);
-                if (strpos($headers[0], '200 OK') !== false) {
-                    // Retrieve charset.
-                    $charset = get_charset($headers, $content);
-                    // Extract title.
-                    $title = html_extract_title($content);
-                    // Re-encode title in utf-8 if necessary.
-                    if (! empty($title) && strtolower($charset) != 'utf-8') {
-                        $title = mb_convert_encoding($title, 'utf-8', $charset);
-                    }
+                // The callback will fill $charset and $title with data from the downloaded page.
+                get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
+                if (! empty($title) && strtolower($charset) != 'utf-8') {
+                    $title = mb_convert_encoding($title, 'utf-8', $charset);
                  }
              }
  
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php

index 7c0d4b0bdc9bf7e0c029231bab2d07e16db26082..ef650f448d382fd2e4ab7ab7959186f831c255c9 100644 (file)
--- a/tests/LinkUtilsTest.php
+++ b/tests/LinkUtilsTest.php
@@ -28,28 +28,14 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
          $this->assertFalse(html_extract_title($html));
      }
  
-    /**
-     * Test get_charset() with all priorities.
-     */
-    public function testGetCharset()
-    {
-        $headers = array('Content-Type' => 'text/html; charset=Headers');
-        $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
-        $default = 'default';
-        $this->assertEquals('headers', get_charset($headers, $html, $default));
-        $this->assertEquals('html', get_charset(array(), $html, $default));
-        $this->assertEquals($default, get_charset(array(), '', $default));
-        $this->assertEquals('utf-8', get_charset(array(), ''));
-    }
-
      /**
       * Test headers_extract_charset() when the charset is found.
       */
      public function testHeadersExtractExistentCharset()
      {
          $charset = 'x-MacCroatian';
-        $headers = array('Content-Type' => 'text/html; charset='. $charset);
-        $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+        $headers = 'text/html; charset='. $charset;
+        $this->assertEquals(strtolower($charset), header_extract_charset($headers));
      }
  
      /**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
       */
      public function testHeadersExtractNonExistentCharset()
      {
-        $headers = array();
-        $this->assertFalse(headers_extract_charset($headers));
+        $headers = '';
+        $this->assertFalse(header_extract_charset($headers));
  
-        $headers = array('Content-Type' => 'text/html');
-        $this->assertFalse(headers_extract_charset($headers));
+        $headers = 'text/html';
+        $this->assertFalse(header_extract_charset($headers));
      }
  
      /**
@@ -85,6 +71,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
          $this->assertFalse(html_extract_charset($html));
      }
  
+    /**
+     * Test the download callback with valid value
+     */
+    public function testCurlDownloadCallbackOk()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'Server: GitHub.com',
+            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
+            'Content-Type: text/html; charset=utf-8',
+            'Status: 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+
+    /**
+     * Test the download callback with valid values and no charset
+     */
+    public function testCurlDownloadCallbackOkNoCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEmpty($charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+
+    /**
+     * Test the download callback with valid values and a charset found in the HTML content (meta tag) instead of the headers
+     */
+    public function testCurlDownloadCallbackOkHtmlCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+
+    /**
+     * Test the download callback with valid values and no title
+     */
+    public function testCurlDownloadCallbackOkNoTitle()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
+            'ignored',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEmpty($title);
+    }
+
+    /**
+     * Test the download callback with an invalid content type.
+     */
+    public function testCurlDownloadCallbackInvalidContentType()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+
+    /**
+     * Test the download callback with an invalid response code.
+     */
+    public function testCurlDownloadCallbackInvalidResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+
+    /**
+     * Test the download callback with an invalid content type and response code.
+     */
+    public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+
      /**
       * Test count_private.
       */
@@ -182,3 +293,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
          return str_replace('$1', $hashtag, $hashtagLink);
      }
  }
+
+// old style mock: PHPUnit doesn't allow function mock
+
+/**
+ * Returns code 200 or html content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html; charset=utf-8'
+ */
+function ut_curl_getinfo_ok($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+
+/**
+ * Returns code 200 or html content type without charset.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_no_charset($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html';
+    }
+}
+
+/**
+ * Invalid response code.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/html; charset=utf-8'
+ */
+function ut_curl_getinfo_rc_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+
+/**
+ * Invalid content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/plain'
+ */
+function ut_curl_getinfo_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
+
+/**
+ * Invalid response code and content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/plain'
+ */
+function ut_curl_getinfo_rs_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
+
author	ArthurHoaro <arthur@hoa.ro>
	Sat, 30 Sep 2017 09:04:13 +0000 (11:04 +0200)
committer	ArthurHoaro <arthur@hoa.ro>
	Sat, 28 Oct 2017 12:35:49 +0000 (14:35 +0200)
application/HttpUtils.php		patch \| blob \| blame \| history
application/LinkUtils.php		patch \| blob \| blame \| history
index.php		patch \| blob \| blame \| history
tests/LinkUtilsTest.php		patch \| blob \| blame \| history