From 4cf3564d28dc8e4d08a3e64f09ad045ffbde97ae Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Fri, 25 Sep 2020 13:29:36 +0200 Subject: Add a setting to retrieve bookmark metadata asynchrounously - There is a new standalone script (metadata.js) which requests a new controller to get bookmark metadata and fill the form async - This feature is enabled with the new setting: general.enable_async_metadata (enabled by default) - general.retrieve_description is now enabled by default - A small rotating loader animation has a been added to bookmark inputs when metadata is being retrieved (default template) - Custom JS htmlentities has been removed and mathiasbynens/he library is used instead Fixes #1563 --- Makefile | 1 + application/config/ConfigManager.php | 3 +- application/container/ContainerBuilder.php | 5 + application/container/ShaarliContainer.php | 2 + .../controller/admin/ManageShaareController.php | 36 ++---- .../front/controller/admin/MetadataController.php | 29 +++++ application/http/MetadataRetriever.php | 68 ++++++++++++ assets/common/js/metadata.js | 39 +++++++ assets/default/js/base.js | 12 +- assets/default/scss/shaarli.scss | 51 +++++++++ doc/md/Shaarli-configuration.md | 1 + index.php | 2 +- package.json | 1 + tests/container/ContainerBuilderTest.php | 2 + .../DisplayCreateFormTest.php | 118 ++++++++++++++------ tests/http/MetadataRetrieverTest.php | 123 +++++++++++++++++++++ tpl/default/editlink.html | 22 +++- webpack.config.js | 2 + yarn.lock | 5 + 19 files changed, 447 insertions(+), 75 deletions(-) create mode 100644 application/front/controller/admin/MetadataController.php create mode 100644 application/http/MetadataRetriever.php create mode 100644 assets/common/js/metadata.js create mode 100644 tests/http/MetadataRetrieverTest.php diff --git a/Makefile b/Makefile index 0ff6bd3f..7415887a 100644 --- a/Makefile +++ b/Makefile @@ -175,6 +175,7 @@ translate: eslint: @yarn run eslint -c .dev/.eslintrc.js assets/vintage/js/ @yarn run eslint -c .dev/.eslintrc.js assets/default/js/ + @yarn run eslint -c .dev/.eslintrc.js assets/common/js/ ### Run CSSLint check against Shaarli's SCSS files sasslint: diff --git a/application/config/ConfigManager.php b/application/config/ConfigManager.php index 4c98be30..fb085023 100644 --- a/application/config/ConfigManager.php +++ b/application/config/ConfigManager.php @@ -366,7 +366,8 @@ class ConfigManager $this->setEmpty('general.links_per_page', 20); $this->setEmpty('general.enabled_plugins', self::$DEFAULT_PLUGINS); $this->setEmpty('general.default_note_title', 'Note: '); - $this->setEmpty('general.retrieve_description', false); + $this->setEmpty('general.retrieve_description', true); + $this->setEmpty('general.enable_async_metadata', true); $this->setEmpty('updates.check_updates', false); $this->setEmpty('updates.check_updates_branch', 'stable'); diff --git a/application/container/ContainerBuilder.php b/application/container/ContainerBuilder.php index c21d58dd..fd94a1c3 100644 --- a/application/container/ContainerBuilder.php +++ b/application/container/ContainerBuilder.php @@ -14,6 +14,7 @@ use Shaarli\Front\Controller\Visitor\ErrorController; use Shaarli\Front\Controller\Visitor\ErrorNotFoundController; use Shaarli\History; use Shaarli\Http\HttpAccess; +use Shaarli\Http\MetadataRetriever; use Shaarli\Netscape\NetscapeBookmarkUtils; use Shaarli\Plugin\PluginManager; use Shaarli\Render\PageBuilder; @@ -90,6 +91,10 @@ class ContainerBuilder ); }; + $container['metadataRetriever'] = function (ShaarliContainer $container): MetadataRetriever { + return new MetadataRetriever($container->conf, $container->httpAccess); + }; + $container['pageBuilder'] = function (ShaarliContainer $container): PageBuilder { return new PageBuilder( $container->conf, diff --git a/application/container/ShaarliContainer.php b/application/container/ShaarliContainer.php index 66e669aa..3a7c238f 100644 --- a/application/container/ShaarliContainer.php +++ b/application/container/ShaarliContainer.php @@ -10,6 +10,7 @@ use Shaarli\Feed\FeedBuilder; use Shaarli\Formatter\FormatterFactory; use Shaarli\History; use Shaarli\Http\HttpAccess; +use Shaarli\Http\MetadataRetriever; use Shaarli\Netscape\NetscapeBookmarkUtils; use Shaarli\Plugin\PluginManager; use Shaarli\Render\PageBuilder; @@ -35,6 +36,7 @@ use Slim\Container; * @property History $history * @property HttpAccess $httpAccess * @property LoginManager $loginManager + * @property MetadataRetriever $metadataRetriever * @property NetscapeBookmarkUtils $netscapeBookmarkUtils * @property callable $notFoundHandler Overrides default Slim exception display * @property PageBuilder $pageBuilder diff --git a/application/front/controller/admin/ManageShaareController.php b/application/front/controller/admin/ManageShaareController.php index bb083486..df2f1631 100644 --- a/application/front/controller/admin/ManageShaareController.php +++ b/application/front/controller/admin/ManageShaareController.php @@ -53,36 +53,22 @@ class ManageShaareController extends ShaarliAdminController // If this is an HTTP(S) link, we try go get the page to extract // the title (otherwise we will to straight to the edit form.) - if (empty($title) && strpos(get_url_scheme($url) ?: '', 'http') !== false) { - $retrieveDescription = $this->container->conf->get('general.retrieve_description'); - // Short timeout to keep the application responsive - // The callback will fill $charset and $title with data from the downloaded page. - $this->container->httpAccess->getHttpResponse( - $url, - $this->container->conf->get('general.download_timeout', 30), - $this->container->conf->get('general.download_max_size', 4194304), - $this->container->httpAccess->getCurlDownloadCallback( - $charset, - $title, - $description, - $tags, - $retrieveDescription - ) - ); - if (! empty($title) && strtolower($charset) !== 'utf-8' && mb_check_encoding($charset)) { - $title = mb_convert_encoding($title, 'utf-8', $charset); - } + if (true !== $this->container->conf->get('general.enable_async_metadata', true) + && empty($title) + && strpos(get_url_scheme($url) ?: '', 'http') !== false + ) { + $metadata = $this->container->metadataRetriever->retrieve($url); } - if (empty($url) && empty($title)) { - $title = $this->container->conf->get('general.default_note_title', t('Note: ')); + if (empty($url)) { + $metadata['title'] = $this->container->conf->get('general.default_note_title', t('Note: ')); } $link = [ - 'title' => $title, + 'title' => $title ?? $metadata['title'] ?? '', 'url' => $url ?? '', - 'description' => $description ?? '', - 'tags' => $tags ?? '', + 'description' => $description ?? $metadata['description'] ?? '', + 'tags' => $tags ?? $metadata['tags'] ?? '', 'private' => $private, ]; } else { @@ -352,6 +338,8 @@ class ManageShaareController extends ShaarliAdminController 'source' => $request->getParam('source') ?? '', 'tags' => $tags, 'default_private_links' => $this->container->conf->get('privacy.default_private_links', false), + 'async_metadata' => $this->container->conf->get('general.enable_async_metadata', true), + 'retrieve_description' => $this->container->conf->get('general.retrieve_description', false), ]); $this->executePageHooks('render_editlink', $data, TemplatePage::EDIT_LINK); diff --git a/application/front/controller/admin/MetadataController.php b/application/front/controller/admin/MetadataController.php new file mode 100644 index 00000000..ff845944 --- /dev/null +++ b/application/front/controller/admin/MetadataController.php @@ -0,0 +1,29 @@ +getParam('url'); + + // Only try to extract metadata from URL with HTTP(s) scheme + if (!empty($url) && strpos(get_url_scheme($url) ?: '', 'http') !== false) { + return $response->withJson($this->container->metadataRetriever->retrieve($url)); + } + + return $response->withJson([]); + } +} diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..2ca982e2 --- /dev/null +++ b/application/http/MetadataRetriever.php @@ -0,0 +1,68 @@ +conf = $conf; + $this->httpAccess = $httpAccess; + } + + /** + * Retrieve metadata for given URL. + * + * @return array [ + * 'title' => , + * 'description' => , + * 'tags' => , + * ] + */ + public function retrieve(string $url): array + { + $charset = null; + $title = null; + $description = null; + $tags = null; + $retrieveDescription = $this->conf->get('general.retrieve_description'); + + // Short timeout to keep the application responsive + // The callback will fill $charset and $title with data from the downloaded page. + $this->httpAccess->getHttpResponse( + $url, + $this->conf->get('general.download_timeout', 30), + $this->conf->get('general.download_max_size', 4194304), + $this->httpAccess->getCurlDownloadCallback( + $charset, + $title, + $description, + $tags, + $retrieveDescription + ) + ); + + if (!empty($title) && strtolower($charset) !== 'utf-8') { + $title = mb_convert_encoding($title, 'utf-8', $charset); + } + + return [ + 'title' => $title, + 'description' => $description, + 'tags' => $tags, + ]; + } +} diff --git a/assets/common/js/metadata.js b/assets/common/js/metadata.js new file mode 100644 index 00000000..5200b481 --- /dev/null +++ b/assets/common/js/metadata.js @@ -0,0 +1,39 @@ +import he from 'he'; + +function clearLoaders(loaders) { + if (loaders != null && loaders.length > 0) { + [...loaders].forEach((loader) => { + loader.classList.remove('loading-input'); + }); + } +} + +(() => { + const loaders = document.querySelectorAll('.loading-input'); + const inputTitle = document.querySelector('input[name="lf_title"]'); + if (inputTitle != null && inputTitle.value.length > 0) { + clearLoaders(loaders); + return; + } + + const url = document.querySelector('input[name="lf_url"]').value; + const basePath = document.querySelector('input[name="js_base_path"]').value; + + const xhr = new XMLHttpRequest(); + xhr.open('GET', `${basePath}/admin/metadata?url=${encodeURI(url)}`, true); + xhr.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded'); + xhr.onload = () => { + const result = JSON.parse(xhr.response); + Object.keys(result).forEach((key) => { + if (result[key] !== null && result[key].length) { + const element = document.querySelector(`input[name="lf_${key}"], textarea[name="lf_${key}"]`); + if (element != null && element.value.length === 0) { + element.value = he.decode(result[key]); + } + } + }); + clearLoaders(loaders); + }; + + xhr.send(); +})(); diff --git a/assets/default/js/base.js b/assets/default/js/base.js index be986ae0..31688815 100644 --- a/assets/default/js/base.js +++ b/assets/default/js/base.js @@ -1,4 +1,5 @@ import Awesomplete from 'awesomplete'; +import he from 'he'; /** * Find a parent element according to its tag and its attributes @@ -95,15 +96,6 @@ function updateAwesompleteList(selector, tags, instances) { return instances; } -/** - * html_entities in JS - * - * @see http://stackoverflow.com/questions/18749591/encode-html-entities-in-javascript - */ -function htmlEntities(str) { - return str.replace(/[\u00A0-\u9999<>&]/gim, (i) => `&#${i.charCodeAt(0)};`); -} - /** * Add the class 'hidden' to city options not attached to the current selected continent. * @@ -569,7 +561,7 @@ function init(description) { input.setAttribute('name', totag); input.setAttribute('value', totag); findParent(input, 'div', { class: 'rename-tag-form' }).style.display = 'none'; - block.querySelector('a.tag-link').innerHTML = htmlEntities(totag); + block.querySelector('a.tag-link').innerHTML = he.encode(totag); block .querySelector('a.tag-link') .setAttribute('href', `${basePath}/?searchtags=${encodeURIComponent(totag)}`); diff --git a/assets/default/scss/shaarli.scss b/assets/default/scss/shaarli.scss index a528adb0..df9c867b 100644 --- a/assets/default/scss/shaarli.scss +++ b/assets/default/scss/shaarli.scss @@ -1269,6 +1269,57 @@ form { } } +.loading-input { + position: relative; + + @keyframes around { + 0% { + transform: rotate(0deg); + } + + 100% { + transform: rotate(360deg); + } + } + + .icon-container { + position: absolute; + right: 60px; + top: calc(50% - 10px); + } + + .loader { + position: relative; + height: 20px; + width: 20px; + display: inline-block; + animation: around 5.4s infinite; + + &::after, + &::before { + content: ""; + background: $form-input-background; + position: absolute; + display: inline-block; + width: 100%; + height: 100%; + border-width: 2px; + border-color: #333 #333 transparent transparent; + border-style: solid; + border-radius: 20px; + box-sizing: border-box; + top: 0; + left: 0; + animation: around 0.7s ease-in-out infinite; + } + + &::after { + animation: around 0.7s ease-in-out 0.1s infinite; + background: transparent; + } + } +} + // LOGIN .login-form-container { .remember-me { diff --git a/doc/md/Shaarli-configuration.md b/doc/md/Shaarli-configuration.md index 263fb761..dbfc3da9 100644 --- a/doc/md/Shaarli-configuration.md +++ b/doc/md/Shaarli-configuration.md @@ -150,6 +150,7 @@ _These settings should not be edited_ - **timezone**: See [the list of supported timezones](http://php.net/manual/en/timezones.php). - **enabled_plugins**: List of enabled plugins. - **default_note_title**: Default title of a new note. +- **enable_async_metadata** (boolean): Retrieve external bookmark metadata asynchronously to prevent bookmark creation slowdown. - **retrieve_description** (boolean): If set to true, for every new Shaare Shaarli will try to retrieve the description and keywords from the HTML meta tags. - **root_url**: Overrides automatic discovery of Shaarli instance's URL (e.g.) `https://sub.domain.tld/shaarli-folder/`. diff --git a/index.php b/index.php index b10397dd..220847f5 100644 --- a/index.php +++ b/index.php @@ -129,7 +129,7 @@ $app->group('/admin', function () { $this->post('/plugins', '\Shaarli\Front\Controller\Admin\PluginsController:save'); $this->get('/token', '\Shaarli\Front\Controller\Admin\TokenController:getToken'); $this->get('/thumbnails', '\Shaarli\Front\Controller\Admin\ThumbnailsController:index'); - + $this->get('/metadata', '\Shaarli\Front\Controller\Admin\MetadataController:ajaxRetrieveTitle'); $this->get('/visibility/{visibility}', '\Shaarli\Front\Controller\Admin\SessionFilterController:visibility'); })->add('\Shaarli\Front\ShaarliAdminMiddleware'); diff --git a/package.json b/package.json index 8a24512a..b879b223 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "awesomplete": "^1.1.2", "blazy": "^1.8.2", "fork-awesome": "^1.1.7", + "he": "^1.2.0", "pure-extras": "^1.0.0", "purecss": "^1.0.0" }, diff --git a/tests/container/ContainerBuilderTest.php b/tests/container/ContainerBuilderTest.php index 5d52daef..3dadc0b9 100644 --- a/tests/container/ContainerBuilderTest.php +++ b/tests/container/ContainerBuilderTest.php @@ -12,6 +12,7 @@ use Shaarli\Front\Controller\Visitor\ErrorController; use Shaarli\Front\Controller\Visitor\ErrorNotFoundController; use Shaarli\History; use Shaarli\Http\HttpAccess; +use Shaarli\Http\MetadataRetriever; use Shaarli\Netscape\NetscapeBookmarkUtils; use Shaarli\Plugin\PluginManager; use Shaarli\Render\PageBuilder; @@ -72,6 +73,7 @@ class ContainerBuilderTest extends TestCase static::assertInstanceOf(History::class, $container->history); static::assertInstanceOf(HttpAccess::class, $container->httpAccess); static::assertInstanceOf(LoginManager::class, $container->loginManager); + static::assertInstanceOf(MetadataRetriever::class, $container->metadataRetriever); static::assertInstanceOf(NetscapeBookmarkUtils::class, $container->netscapeBookmarkUtils); static::assertInstanceOf(PageBuilder::class, $container->pageBuilder); static::assertInstanceOf(PageCacheManager::class, $container->pageCacheManager); diff --git a/tests/front/controller/admin/ManageShaareControllerTest/DisplayCreateFormTest.php b/tests/front/controller/admin/ManageShaareControllerTest/DisplayCreateFormTest.php index 2eb95251..4fd88480 100644 --- a/tests/front/controller/admin/ManageShaareControllerTest/DisplayCreateFormTest.php +++ b/tests/front/controller/admin/ManageShaareControllerTest/DisplayCreateFormTest.php @@ -9,6 +9,7 @@ use Shaarli\Config\ConfigManager; use Shaarli\Front\Controller\Admin\FrontAdminControllerMockHelper; use Shaarli\Front\Controller\Admin\ManageShaareController; use Shaarli\Http\HttpAccess; +use Shaarli\Http\MetadataRetriever; use Shaarli\TestCase; use Slim\Http\Request; use Slim\Http\Response; @@ -25,6 +26,7 @@ class DisplayCreateFormTest extends TestCase $this->createContainer(); $this->container->httpAccess = $this->createMock(HttpAccess::class); + $this->container->metadataRetriever = $this->createMock(MetadataRetriever::class); $this->controller = new ManageShaareController($this->container); } @@ -32,7 +34,7 @@ class DisplayCreateFormTest extends TestCase * Test displaying bookmark create form * Ensure that every step of the standard workflow works properly. */ - public function testDisplayCreateFormWithUrl(): void + public function testDisplayCreateFormWithUrlAndWithMetadataRetrieval(): void { $this->container->environment = [ 'HTTP_REFERER' => $referer = 'http://shaarli/subfolder/controller/?searchtag=abc' @@ -53,40 +55,20 @@ class DisplayCreateFormTest extends TestCase }); $response = new Response(); - $this->container->httpAccess - ->expects(static::once()) - ->method('getCurlDownloadCallback') - ->willReturnCallback( - function (&$charset, &$title, &$description, &$tags) use ( - $remoteTitle, - $remoteDesc, - $remoteTags - ): callable { - return function () use ( - &$charset, - &$title, - &$description, - &$tags, - $remoteTitle, - $remoteDesc, - $remoteTags - ): void { - $charset = 'ISO-8859-1'; - $title = $remoteTitle; - $description = $remoteDesc; - $tags = $remoteTags; - }; - } - ) - ; - $this->container->httpAccess - ->expects(static::once()) - ->method('getHttpResponse') - ->with($expectedUrl, 30, 4194304) - ->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void { - $callback(); - }) - ; + $this->container->conf = $this->createMock(ConfigManager::class); + $this->container->conf->method('get')->willReturnCallback(function (string $param, $default) { + if ($param === 'general.enable_async_metadata') { + return false; + } + + return $default; + }); + + $this->container->metadataRetriever->expects(static::once())->method('retrieve')->willReturn([ + 'title' => $remoteTitle, + 'description' => $remoteDesc, + 'tags' => $remoteTags, + ]); $this->container->bookmarkService ->expects(static::once()) @@ -127,6 +109,72 @@ class DisplayCreateFormTest extends TestCase static::assertSame($tags, $assignedVariables['tags']); static::assertArrayHasKey('source', $assignedVariables); static::assertArrayHasKey('default_private_links', $assignedVariables); + static::assertArrayHasKey('async_metadata', $assignedVariables); + static::assertArrayHasKey('retrieve_description', $assignedVariables); + } + + /** + * Test displaying bookmark create form without any external metadata retrieval attempt + */ + public function testDisplayCreateFormWithUrlAndWithoutMetadata(): void + { + $this->container->environment = [ + 'HTTP_REFERER' => $referer = 'http://shaarli/subfolder/controller/?searchtag=abc' + ]; + + $assignedVariables = []; + $this->assignTemplateVars($assignedVariables); + + $url = 'http://url.tld/other?part=3&utm_ad=pay#hash'; + $expectedUrl = str_replace('&utm_ad=pay', '', $url); + + $request = $this->createMock(Request::class); + $request->method('getParam')->willReturnCallback(function (string $key) use ($url): ?string { + return $key === 'post' ? $url : null; + }); + $response = new Response(); + + $this->container->metadataRetriever->expects(static::never())->method('retrieve'); + + $this->container->bookmarkService + ->expects(static::once()) + ->method('bookmarksCountPerTag') + ->willReturn($tags = ['tag1' => 2, 'tag2' => 1]) + ; + + // Make sure that PluginManager hook is triggered + $this->container->pluginManager + ->expects(static::at(0)) + ->method('executeHooks') + ->willReturnCallback(function (string $hook, array $data): array { + static::assertSame('render_editlink', $hook); + static::assertSame('', $data['link']['title']); + static::assertSame('', $data['link']['description']); + + return $data; + }) + ; + + $result = $this->controller->displayCreateForm($request, $response); + + static::assertSame(200, $result->getStatusCode()); + static::assertSame('editlink', (string) $result->getBody()); + + static::assertSame('Shaare - Shaarli', $assignedVariables['pagetitle']); + + static::assertSame($expectedUrl, $assignedVariables['link']['url']); + static::assertSame('', $assignedVariables['link']['title']); + static::assertSame('', $assignedVariables['link']['description']); + static::assertSame('', $assignedVariables['link']['tags']); + static::assertFalse($assignedVariables['link']['private']); + + static::assertTrue($assignedVariables['link_is_new']); + static::assertSame($referer, $assignedVariables['http_referer']); + static::assertSame($tags, $assignedVariables['tags']); + static::assertArrayHasKey('source', $assignedVariables); + static::assertArrayHasKey('default_private_links', $assignedVariables); + static::assertArrayHasKey('async_metadata', $assignedVariables); + static::assertArrayHasKey('retrieve_description', $assignedVariables); } /** diff --git a/tests/http/MetadataRetrieverTest.php b/tests/http/MetadataRetrieverTest.php new file mode 100644 index 00000000..2a1838e8 --- /dev/null +++ b/tests/http/MetadataRetrieverTest.php @@ -0,0 +1,123 @@ +conf = $this->createMock(ConfigManager::class); + $this->httpAccess = $this->createMock(HttpAccess::class); + $this->retriever = new MetadataRetriever($this->conf, $this->httpAccess); + + $this->conf->method('get')->willReturnCallback(function (string $param, $default) { + return $default === null ? $param : $default; + }); + } + + /** + * Test metadata retrieve() with values returned + */ + public function testFullRetrieval(): void + { + $url = 'https://domain.tld/link'; + $remoteTitle = 'Remote Title '; + $remoteDesc = 'Sometimes the meta description is relevant.'; + $remoteTags = 'abc def'; + + $expectedResult = [ + 'title' => $remoteTitle, + 'description' => $remoteDesc, + 'tags' => $remoteTags, + ]; + + $this->httpAccess + ->expects(static::once()) + ->method('getCurlDownloadCallback') + ->willReturnCallback( + function (&$charset, &$title, &$description, &$tags) use ( + $remoteTitle, + $remoteDesc, + $remoteTags + ): callable { + return function () use ( + &$charset, + &$title, + &$description, + &$tags, + $remoteTitle, + $remoteDesc, + $remoteTags + ): void { + $charset = 'ISO-8859-1'; + $title = $remoteTitle; + $description = $remoteDesc; + $tags = $remoteTags; + }; + } + ) + ; + $this->httpAccess + ->expects(static::once()) + ->method('getHttpResponse') + ->with($url, 30, 4194304) + ->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void { + $callback(); + }) + ; + + $result = $this->retriever->retrieve($url); + + static::assertSame($expectedResult, $result); + } + + /** + * Test metadata retrieve() without any value + */ + public function testEmptyRetrieval(): void + { + $url = 'https://domain.tld/link'; + + $expectedResult = [ + 'title' => null, + 'description' => null, + 'tags' => null, + ]; + + $this->httpAccess + ->expects(static::once()) + ->method('getCurlDownloadCallback') + ->willReturnCallback( + function (&$charset, &$title, &$description, &$tags): callable { + return function () use (&$charset, &$title, &$description, &$tags): void {}; + } + ) + ; + $this->httpAccess + ->expects(static::once()) + ->method('getHttpResponse') + ->with($url, 30, 4194304) + ->willReturnCallback(function($url, $timeout, $maxBytes, $callback): void { + $callback(); + }) + ; + + $result = $this->retriever->retrieve($url); + + static::assertSame($expectedResult, $result); + } +} diff --git a/tpl/default/editlink.html b/tpl/default/editlink.html index 568545bd..7ab7e1fe 100644 --- a/tpl/default/editlink.html +++ b/tpl/default/editlink.html @@ -12,6 +12,8 @@ action="{$base_path}/admin/shaare" class="page-form pure-u-lg-3-5 pure-u-22-24 page-form page-form-light" > + {$asyncLoadClass=$link_is_new && $async_metadata && empty($link.title) ? 'loading-input' : ''} +

{if="!$link_is_new"}{'Edit Shaare'|t}{else}{'New Shaare'|t}{/if}

@@ -28,21 +30,32 @@
-
- +
+ +
+ +
-
+
+
+ +
-
+
+
+ +
@@ -88,5 +101,6 @@
{include="page.footer"} + {if="$link_is_new && $async_metadata"}{/if} diff --git a/webpack.config.js b/webpack.config.js index a73758cc..8e3d1470 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -20,6 +20,7 @@ module.exports = [ entry: { thumbnails: './assets/common/js/thumbnails.js', thumbnails_update: './assets/common/js/thumbnails-update.js', + metadata: './assets/common/js/metadata.js', pluginsadmin: './assets/default/js/plugins-admin.js', shaarli: [ './assets/default/js/base.js', @@ -99,6 +100,7 @@ module.exports = [ ].concat(glob.sync('./assets/vintage/img/*')), markdown: './assets/common/css/markdown.css', thumbnails: './assets/common/js/thumbnails.js', + metadata: './assets/common/js/metadata.js', thumbnails_update: './assets/common/js/thumbnails-update.js', }, output: { diff --git a/yarn.lock b/yarn.lock index 0a12820c..55bd9827 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2912,6 +2912,11 @@ hash.js@^1.0.0, hash.js@^1.0.3: inherits "^2.0.3" minimalistic-assert "^1.0.1" +he@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f" + integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw== + hmac-drbg@^1.0.0: version "1.0.1" resolved "https://registry.yarnpkg.com/hmac-drbg/-/hmac-drbg-1.0.1.tgz#d2745701025a6c775a6c545793ed502fc0c649a1" -- cgit v1.2.3 From 5334090be04e66da5cb5c3ad487604b3733c5cac Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 15 Oct 2020 11:20:33 +0200 Subject: Improve metadata retrieval (performances and accuracy) - Use dedicated function to download headers to avoid apply multiple regexps on headers - Also try to extract title from meta tags --- application/http/HttpAccess.php | 22 ++-- application/http/HttpUtils.php | 123 +++++++++++------- application/http/MetadataRetriever.php | 1 + tests/bookmark/LinkUtilsTest.php | 223 ++++++++++++++++----------------- tests/http/MetadataRetrieverTest.php | 45 +++++-- 5 files changed, 237 insertions(+), 177 deletions(-) diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..646a5264 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php @@ -14,9 +14,14 @@ namespace Shaarli\Http; */ class HttpAccess { - public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) - { - return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); + public function getHttpResponse( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null + ) { + return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); } public function getCurlDownloadCallback( @@ -24,16 +29,19 @@ class HttpAccess &$title, &$description, &$keywords, - $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $retrieveDescription ) { return get_curl_download_callback( $charset, $title, $description, $keywords, - $retrieveDescription, - $curlGetInfo + $retrieveDescription ); } + + public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') + { + return get_curl_header_callback($charset, $curlGetInfo); + } } diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -6,12 +6,14 @@ use Shaarli\Http\Url; * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) - * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). - * Can be used to add download conditions on the - * headers (response code, content type, etc.). + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlHeaderFunction Optional callback called during the download of headers + * (CURLOPT_HEADERFUNCTION) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the + * headers (response code, content type, etc.). * * @return array HTTP response headers, downloaded content * @@ -35,8 +37,13 @@ use Shaarli\Http\Url; * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) -{ +function get_http_response( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null +) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF // General cURL settings curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_HEADER, true); + // Default header download if the $curlHeaderFunction is not defined + curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); curl_setopt( $ch, CURLOPT_HTTPHEADER, @@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); - if (is_callable($curlWriteFunction)) { - curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); - } - // Max download size management curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); + if (is_callable($curlHeaderFunction)) { + curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); + } + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } curl_setopt( $ch, CURLOPT_PROGRESSFUNCTION, - function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { - if (version_compare(phpversion(), '5.5', '<')) { - // PHP version lower than 5.5 - // Callback has 4 arguments - $downloaded = $arg1; - } else { - // Callback has 5 arguments - $downloaded = $arg2; - } + function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { + $downloaded = $arg2; + // Non-zero return stops downloading return ($downloaded > $maxBytes) ? 1 : 0; } @@ -489,6 +493,46 @@ function is_https($server) return ! empty($server['HTTPS']); } +/** + * Get cURL callback function for CURLOPT_WRITEFUNCTION + * + * @param string $charset to extract from the downloaded page (reference) + * @param string $curlGetInfo Optionally overrides curl_getinfo function + * + * @return Closure + */ +function get_curl_header_callback( + &$charset, + $curlGetInfo = 'curl_getinfo' +) { + $isRedirected = false; + + return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { + $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + $chunkLength = strlen($data); + if (!empty($responseCode) && in_array($responseCode, [301, 302])) { + $isRedirected = true; + return $chunkLength; + } + if (!empty($responseCode) && $responseCode !== 200) { + return false; + } + // After a redirection, the content type will keep the previous request value + // until it finds the next content-type header. + if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { + $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); + } + if (!empty($contentType) && strpos($contentType, 'text/html') === false) { + return false; + } + if (!empty($contentType) && empty($charset)) { + $charset = header_extract_charset($contentType); + } + + return $chunkLength; + }; +} + /** * Get cURL callback function for CURLOPT_WRITEFUNCTION * @@ -506,10 +550,8 @@ function get_curl_download_callback( &$title, &$description, &$keywords, - $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $retrieveDescription ) { - $isRedirected = false; $currentChunk = 0; $foundChunk = null; @@ -524,37 +566,18 @@ function get_curl_download_callback( * * @return int|bool length of $data or false if we need to stop the download */ - return function (&$ch, $data) use ( + return function ($ch, $data) use ( $retrieveDescription, - $curlGetInfo, &$charset, &$title, &$description, &$keywords, - &$isRedirected, &$currentChunk, &$foundChunk ) { + $chunkLength = strlen($data); $currentChunk++; - $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); - if (!empty($responseCode) && in_array($responseCode, [301, 302])) { - $isRedirected = true; - return strlen($data); - } - if (!empty($responseCode) && $responseCode !== 200) { - return false; - } - // After a redirection, the content type will keep the previous request value - // until it finds the next content-type header. - if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { - $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); - } - if (!empty($contentType) && strpos($contentType, 'text/html') === false) { - return false; - } - if (!empty($contentType) && empty($charset)) { - $charset = header_extract_charset($contentType); - } + if (empty($charset)) { $charset = html_extract_charset($data); } @@ -562,6 +585,10 @@ function get_curl_download_callback( $title = html_extract_title($data); $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; } + if (empty($title)) { + $title = html_extract_tag('title', $data); + $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; + } if ($retrieveDescription && empty($description)) { $description = html_extract_tag('description', $data); $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; @@ -591,6 +618,6 @@ function get_curl_download_callback( return false; } - return strlen($data); + return $chunkLength; }; } diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php index 2ca982e2..ba9bd40c 100644 --- a/application/http/MetadataRetriever.php +++ b/application/http/MetadataRetriever.php @@ -46,6 +46,7 @@ class MetadataRetriever $url, $this->conf->get('general.download_timeout', 30), $this->conf->get('general.download_max_size', 4194304), + $this->httpAccess->getCurlHeaderCallback($charset), $this->httpAccess->getCurlDownloadCallback( $charset, $title, diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 29941c8c..3321242f 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -215,61 +215,92 @@ class LinkUtilsTest extends TestCase $this->assertFalse(html_extract_tag('description', $html)); } + /** + * Test the header callback with valid value + */ + public function testCurlHeaderCallbackOk(): void + { + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok'); + $data = [ + 'HTTP/1.1 200 OK', + 'Server: GitHub.com', + 'Date: Sat, 28 Oct 2017 12:01:33 GMT', + 'Content-Type: text/html; charset=utf-8', + 'Status: 200 OK', + ]; + + foreach ($data as $chunk) { + static::assertIsInt($callback(null, $chunk)); + } + + static::assertSame('utf-8', $charset); + } + /** * Test the download callback with valid value */ - public function testCurlDownloadCallbackOk() + public function testCurlDownloadCallbackOk(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_ok' + false ); + $data = [ - 'HTTP/1.1 200 OK', - 'Server: GitHub.com', - 'Date: Sat, 28 Oct 2017 12:01:33 GMT', - 'Content-Type: text/html; charset=utf-8', - 'Status: 200 OK', - 'end' => 'th=device-width">' + 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } - $this->assertEquals('utf-8', $charset); - $this->assertEquals('Refactoring · GitHub', $title); - $this->assertEmpty($desc); - $this->assertEmpty($keywords); + + static::assertSame('utf-8', $charset); + static::assertSame('Refactoring · GitHub', $title); + static::assertEmpty($desc); + static::assertEmpty($keywords); + } + + /** + * Test the header callback with valid value + */ + public function testCurlHeaderCallbackNoCharset(): void + { + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset'); + $data = [ + 'HTTP/1.1 200 OK', + ]; + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); + } + + static::assertFalse($charset); } /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkNoCharset() + public function testCurlDownloadCallbackOkNoCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $this->assertEquals(strlen($line), $callback($ignore, $line)); + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEmpty($charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -290,18 +322,18 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkHtmlCharset() + public function testCurlDownloadCallbackOkHtmlCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', '', 'end' => 'th=device-width">' . 'Refactoring · GitHub' @@ -310,14 +342,10 @@ class LinkUtilsTest extends TestCase . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -327,25 +355,26 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no title */ - public function testCurlDownloadCallbackOkNoTitle() + public function testCurlDownloadCallbackOkNoTitle(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_ok' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">Refactoring · GitHub' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('link desc', $desc); @@ -453,8 +453,9 @@ class LinkUtilsTest extends TestCase * Test the download callback with valid value, and retrieve_description option enabled, * but no desc or keyword defined in the page. */ - public function testCurlDownloadCallbackOkWithDescNotFound() + public function testCurlDownloadCallbackOkWithDescNotFound(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, @@ -464,24 +465,16 @@ class LinkUtilsTest extends TestCase 'ut_curl_getinfo_ok' ); $data = [ - 'HTTP/1.1 200 OK', - 'Server: GitHub.com', - 'Date: Sat, 28 Oct 2017 12:01:33 GMT', - 'Content-Type: text/html; charset=utf-8', - 'Status: 200 OK', 'th=device-width">' . 'Refactoring · GitHub' . '