if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
$fetchedContent = $this->graby->fetchContent($url);
+ $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
// when content is imported, we have information in $content
// in case fetching content goes bad, we'll keep the imported information instead of overriding them
// so we'll be able to refetch it in the future
$content['url'] = !empty($content['url']) ? $content['url'] : $url;
+ // In at least one case (seen in tests), the entry url is empty here,
+ // so we set it from the $url passed to the updateEntry call.
+ // It is unclear in which other cases this property can be empty.
+ if (empty($entry->getUrl()) && !empty($url))
+ {
+ $entry->setUrl($url);
+ }
+
$this->stockEntry($entry, $content);
}
(new LocaleConstraint())
);
- if (0 === count($errors)) {
+ if (0 === \count($errors)) {
$entry->setLanguage($value);
return;
(new UrlConstraint())
);
- if (0 === count($errors)) {
+ if (0 === \count($errors)) {
$entry->setPreviewPicture($value);
return;
}
}
+ /**
+  * Derive the host part of the entry url and store it as the entry's domain name.
+  *
+  * Nothing is stored when parse_url() reports a failure (false); any other
+  * result is handed to the entry unchanged.
+  *
+  * @param Entry $entry Entry whose url is read and whose domain name is written
+  */
+ public function setEntryDomainName(Entry $entry)
+ {
+     $host = parse_url($entry->getUrl(), PHP_URL_HOST);
+
+     if (false === $host) {
+         return;
+     }
+
+     $entry->setDomainName($host);
+ }
+
+ /**
+  * Helper to set a default title using:
+  * - url basename, if applicable
+  * - hostname.
+  *
+  * The url may have no path (e.g. "http://example.com"), no host, or may not
+  * be parseable at all; missing components are treated as absent instead of
+  * being read blindly. When neither a basename nor a host is available, the
+  * title is set to null.
+  *
+  * @param Entry $entry Entry whose url is read and whose title is written
+  */
+ public function setDefaultEntryTitle(Entry $entry)
+ {
+     $url = parse_url((string) $entry->getUrl());
+     if (!\is_array($url)) {
+         // parse_url() returns false for seriously malformed urls
+         $url = [];
+     }
+
+     // parse_url() omits components that are not present in the url
+     $path = isset($url['path']) ? pathinfo($url['path'], PATHINFO_BASENAME) : '';
+
+     if (empty($path)) {
+         $path = isset($url['host']) ? $url['host'] : null;
+     }
+
+     $entry->setTitle($path);
+ }
+
+ /**
+  * Clean up the title of fetched content: titles of PDF documents first go
+  * through an encoding conversion, then every title is stripped of invalid
+  * UTF-8 characters.
+  *
+  * @param string $title       Title as returned by the fetcher
+  * @param string $contentType Content type of the fetched content
+  *
+  * @return string
+  */
+ private function sanitizeContentTitle($title, $contentType)
+ {
+     $isPdf = 'application/pdf' === $contentType;
+     $decodedTitle = $isPdf ? $this->convertPdfEncodingToUTF8($title) : $title;
+
+     return $this->sanitizeUTF8Text($decodedTitle);
+ }
+
+ /**
+  * Titles extracted from a PDF are quite often not encoded in UTF-8. This
+  * method probes a fixed list of candidate encodings and converts the title
+  * to UTF-8 from the first one it validates against. If none matches, the
+  * title is returned untouched.
+  *
+  * @param string $title
+  *
+  * @return string (may still contain invalid UTF-8 characters)
+  */
+ private function convertPdfEncodingToUTF8($title)
+ {
+     // UTF-8 is probed first because its presence or absence is the easiest to detect
+     $candidates = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];
+
+     foreach ($candidates as $candidate) {
+         if (!mb_check_encoding($title, $candidate)) {
+             continue;
+         }
+
+         return mb_convert_encoding($title, 'UTF-8', $candidate);
+     }
+
+     return $title;
+ }
+
+ /**
+  * Remove invalid UTF-8 characters from the given string.
+  *
+  * Text that is already valid UTF-8 is returned as is. Otherwise iconv() is
+  * asked to drop the offending bytes; because iconv() returns false when it
+  * fails (which some platforms do on illegal input even with //IGNORE), a
+  * mbstring-based cleanup is used as a fallback so a string is always returned.
+  *
+  * @param string $rawText
+  *
+  * @return string
+  */
+ private function sanitizeUTF8Text($rawText)
+ {
+     if (mb_check_encoding($rawText, 'UTF-8')) {
+         return $rawText;
+     }
+
+     $cleanedText = iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
+     if (false !== $cleanedText) {
+         return $cleanedText;
+     }
+
+     // iconv() failed: let mbstring drop the invalid sequences instead,
+     // restoring the substitute character setting afterwards
+     $previousSubstitute = mb_substitute_character();
+     mb_substitute_character('none');
+     $cleanedText = mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
+     mb_substitute_character($previousSubstitute);
+
+     return $cleanedText;
+ }
+
/**
* Stock entry with fetched or imported content.
* Will fall back to OpenGraph data if available.
{
$entry->setUrl($content['url']);
- $domainName = parse_url($entry->getUrl(), PHP_URL_HOST);
- if (false !== $domainName) {
- $entry->setDomainName($domainName);
- }
+ $this->setEntryDomainName($entry);
if (!empty($content['title'])) {
$entry->setTitle($content['title']);
$entry->setHttpStatus($content['status']);
}
- if (!empty($content['authors']) && is_array($content['authors'])) {
+ if (!empty($content['authors']) && \is_array($content['authors'])) {
$entry->setPublishedBy($content['authors']);
}
}
// if content is an image, define it as a preview too
- if (!empty($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) {
+ if (!empty($content['content_type']) && \in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) {
$this->updatePreviewPicture($entry, $content['url']);
}