}
/**
- * Fetch content using graby and hydrate given entry with results information.
+ * Fetch content using graby and hydrate given $entry with results information.
* In case we couldn't find content, we'll try to use Open Graph data.
*
* We can also force the content, in case of an import from the v1 for example, so the function won't fetch it from the url.
*
* @param Entry $entry Entry to update
* @param string $url Url to grab content for
- * @param array $content An array with AT LEAST keys title, html, url, language & content_type to skip the fetchContent from the url
+ * @param array $content An array with AT LEAST keys title, html, url to skip the fetchContent from the url
*
* @return Entry
*/
{
// ensure content is a bit cleaned up
if (!empty($content['html'])) {
- $extractor = $this->graby->getExtractor();
- $contentExtracted = $extractor->process($content['html'], $url);
-
- if ($contentExtracted) {
- $contentBlock = $extractor->getContent();
- $contentBlock->normalize();
-
- $content['html'] = trim($contentBlock->innerHTML);
- }
-
- $content['html'] = htmLawed($content['html'], [
- 'safe' => 1,
- // which means: do not remove iframe elements
- 'elements' => '*+iframe',
- 'deny_attribute' => 'style',
- 'comment' => 1,
- 'cdata' => 1,
- ]);
+ $content['html'] = $this->graby->cleanupHtml($content['html'], $url);
}
// do we have to fetch the content or the provided one is ok?
$entry->setHttpStatus(isset($content['status']) ? $content['status'] : '');
if (!empty($content['date'])) {
+ $date = $content['date'];
+
+ // is it a timestamp?
+ if (filter_var($date, FILTER_VALIDATE_INT) !== false) {
+ $date = '@'.$content['date'];
+ }
+
try {
- $entry->setPublishedAt(new \DateTime($content['date']));
+ $entry->setPublishedAt(new \DateTime($date));
} catch (\Exception $e) {
- $this->logger->warn('Error while defining date', ['e' => $e, 'url' => $url, 'date' => $content['date']]);
+ $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $url, 'date' => $content['date']]);
}
}