aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Wallabag/CoreBundle/Helper/ContentProxy.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php')
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php238
1 files changed, 150 insertions, 88 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index f222dd88..854acb6a 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -4,11 +4,12 @@ namespace Wallabag\CoreBundle\Helper;
4 4
5use Graby\Graby; 5use Graby\Graby;
6use Psr\Log\LoggerInterface; 6use Psr\Log\LoggerInterface;
7use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
8use Symfony\Component\Validator\Constraints\Locale as LocaleConstraint;
9use Symfony\Component\Validator\Constraints\Url as UrlConstraint;
10use Symfony\Component\Validator\Validator\ValidatorInterface;
7use Wallabag\CoreBundle\Entity\Entry; 11use Wallabag\CoreBundle\Entity\Entry;
8use Wallabag\CoreBundle\Entity\Tag;
9use Wallabag\CoreBundle\Tools\Utils; 12use Wallabag\CoreBundle\Tools\Utils;
10use Wallabag\CoreBundle\Repository\TagRepository;
11use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
12 13
13/** 14/**
14 * This kind of proxy class take care of getting the content from an url 15 * This kind of proxy class take care of getting the content from an url
@@ -18,38 +19,37 @@ class ContentProxy
18{ 19{
19 protected $graby; 20 protected $graby;
20 protected $tagger; 21 protected $tagger;
22 protected $validator;
21 protected $logger; 23 protected $logger;
22 protected $tagRepository;
23 protected $mimeGuesser; 24 protected $mimeGuesser;
24 protected $fetchingErrorMessage; 25 protected $fetchingErrorMessage;
26 protected $eventDispatcher;
25 27
26 public function __construct(Graby $graby, RuleBasedTagger $tagger, TagRepository $tagRepository, LoggerInterface $logger, $fetchingErrorMessage) 28 public function __construct(Graby $graby, RuleBasedTagger $tagger, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage)
27 { 29 {
28 $this->graby = $graby; 30 $this->graby = $graby;
29 $this->tagger = $tagger; 31 $this->tagger = $tagger;
32 $this->validator = $validator;
30 $this->logger = $logger; 33 $this->logger = $logger;
31 $this->tagRepository = $tagRepository;
32 $this->mimeGuesser = new MimeTypeExtensionGuesser(); 34 $this->mimeGuesser = new MimeTypeExtensionGuesser();
33 $this->fetchingErrorMessage = $fetchingErrorMessage; 35 $this->fetchingErrorMessage = $fetchingErrorMessage;
34 } 36 }
35 37
36 /** 38 /**
37 * Fetch content using graby and hydrate given entry with results information. 39 * Update entry using either fetched or provided content.
38 * In case we couldn't find content, we'll try to use Open Graph data.
39 *
40 * We can also force the content, in case of an import from the v1 for example, so the function won't
41 * fetch the content from the website but rather use information given with the $content parameter.
42 * 40 *
43 * @param Entry $entry Entry to update 41 * @param Entry $entry Entry to update
44 * @param string $url Url to grab content for 42 * @param string $url Url of the content
45 * @param array $content An array with AT LEAST keys title, html, url, language & content_type to skip the fetchContent from the url 43 * @param array $content Array with content provided for import with AT LEAST keys title, html, url to skip the fetchContent from the url
46 * 44 * @param bool $disableContentUpdate Whether to skip trying to fetch content using Graby
47 * @return Entry
48 */ 45 */
49 public function updateEntry(Entry $entry, $url, array $content = []) 46 public function updateEntry(Entry $entry, $url, array $content = [], $disableContentUpdate = false)
50 { 47 {
51 // do we have to fetch the content or the provided one is ok? 48 if (!empty($content['html'])) {
52 if (empty($content) || false === $this->validateContent($content)) { 49 $content['html'] = $this->graby->cleanupHtml($content['html'], $url);
50 }
51
52 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
53 $fetchedContent = $this->graby->fetchContent($url); 53 $fetchedContent = $this->graby->fetchContent($url);
54 54
55 // when content is imported, we have information in $content 55 // when content is imported, we have information in $content
@@ -59,107 +59,169 @@ class ContentProxy
59 } 59 }
60 } 60 }
61 61
62 $title = $content['title']; 62 // be sure to keep the url in case of error
63 if (!$title && isset($content['open_graph']['og_title'])) { 63 // so we'll be able to refetch it in the future
64 $title = $content['open_graph']['og_title']; 64 $content['url'] = !empty($content['url']) ? $content['url'] : $url;
65 }
66 65
67 $html = $content['html']; 66 $this->stockEntry($entry, $content);
68 if (false === $html) { 67 }
69 $html = $this->fetchingErrorMessage;
70 68
71 if (isset($content['open_graph']['og_description'])) { 69 /**
72 $html .= '<p><i>But we found a short description: </i></p>'; 70 * Use a Symfony validator to ensure the language is well formatted.
73 $html .= $content['open_graph']['og_description']; 71 *
74 } 72 * @param Entry $entry
75 } 73 * @param string $value Language to validate and save
74 */
75 public function updateLanguage(Entry $entry, $value)
76 {
77 // some lang are defined as fr-FR, es-ES.
78 // replacing - by _ might increase language support
79 $value = str_replace('-', '_', $value);
76 80
77 $entry->setUrl($content['url'] ?: $url); 81 $errors = $this->validator->validate(
78 $entry->setTitle($title); 82 $value,
79 $entry->setContent($html); 83 (new LocaleConstraint())
80 $entry->setHttpStatus(isset($content['status']) ? $content['status'] : ''); 84 );
81 85
82 $entry->setLanguage(isset($content['language']) ? $content['language'] : ''); 86 if (0 === count($errors)) {
83 $entry->setMimetype(isset($content['content_type']) ? $content['content_type'] : ''); 87 $entry->setLanguage($value);
84 $entry->setReadingTime(Utils::getReadingTime($html));
85 88
86 $domainName = parse_url($entry->getUrl(), PHP_URL_HOST); 89 return;
87 if (false !== $domainName) {
88 $entry->setDomainName($domainName);
89 } 90 }
90 91
91 if (isset($content['open_graph']['og_image']) && $content['open_graph']['og_image']) { 92 $this->logger->warning('Language validation failed. ' . (string) $errors);
92 $entry->setPreviewPicture($content['open_graph']['og_image']); 93 }
94
95 /**
96 * Use a Symfony validator to ensure the preview picture is a real url.
97 *
98 * @param Entry $entry
99 * @param string $value URL to validate and save
100 */
101 public function updatePreviewPicture(Entry $entry, $value)
102 {
103 $errors = $this->validator->validate(
104 $value,
105 (new UrlConstraint())
106 );
107
108 if (0 === count($errors)) {
109 $entry->setPreviewPicture($value);
110
111 return;
93 } 112 }
94 113
95 // if content is an image define as a preview too 114 $this->logger->warning('PreviewPicture validation failed. ' . (string) $errors);
96 if (isset($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { 115 }
97 $entry->setPreviewPicture($content['url']); 116
117 /**
118 * Update date.
119 *
120 * @param Entry $entry
121 * @param string $value Date to validate and save
122 */
123 public function updatePublishedAt(Entry $entry, $value)
124 {
125 $date = $value;
126
127 // is it a timestamp?
128 if (false !== filter_var($date, FILTER_VALIDATE_INT)) {
129 $date = '@' . $date;
98 } 130 }
99 131
100 try { 132 try {
101 $this->tagger->tag($entry); 133 // is it already a DateTime?
134 // (it's inside the try/catch in case of fail to be parse time string)
135 if (!$date instanceof \DateTime) {
136 $date = new \DateTime($date);
137 }
138
139 $entry->setPublishedAt($date);
102 } catch (\Exception $e) { 140 } catch (\Exception $e) {
103 $this->logger->error('Error while trying to automatically tag an entry.', [ 141 $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $entry->getUrl(), 'date' => $value]);
104 'entry_url' => $url,
105 'error_msg' => $e->getMessage(),
106 ]);
107 } 142 }
108
109 return $entry;
110 } 143 }
111 144
112 /** 145 /**
113 * Assign some tags to an entry. 146 * Stock entry with fetched or imported content.
147 * Will fall back to OpenGraph data if available.
114 * 148 *
115 * @param Entry $entry 149 * @param Entry $entry Entry to stock
116 * @param array|string $tags An array of tag or a string coma separated of tag 150 * @param array $content Array with at least title, url & html
117 * @param array $entitiesReady Entities from the EntityManager which are persisted but not yet flushed
118 * It is mostly to fix duplicate tag on import @see http://stackoverflow.com/a/7879164/569101
119 */ 151 */
120 public function assignTagsToEntry(Entry $entry, $tags, array $entitiesReady = []) 152 private function stockEntry(Entry $entry, array $content)
121 { 153 {
122 if (!is_array($tags)) { 154 $entry->setUrl($content['url']);
123 $tags = explode(',', $tags); 155
156 $domainName = parse_url($entry->getUrl(), PHP_URL_HOST);
157 if (false !== $domainName) {
158 $entry->setDomainName($domainName);
124 } 159 }
125 160
126 // keeps only Tag entity from the "not yet flushed entities" 161 if (!empty($content['title'])) {
127 $tagsNotYetFlushed = []; 162 $entry->setTitle($content['title']);
128 foreach ($entitiesReady as $entity) { 163 } elseif (!empty($content['open_graph']['og_title'])) {
129 if ($entity instanceof Tag) { 164 $entry->setTitle($content['open_graph']['og_title']);
130 $tagsNotYetFlushed[$entity->getLabel()] = $entity;
131 }
132 } 165 }
133 166
134 foreach ($tags as $label) { 167 $html = $content['html'];
135 $label = trim($label); 168 if (false === $html) {
169 $html = $this->fetchingErrorMessage;
136 170
137 // avoid empty tag 171 if (!empty($content['open_graph']['og_description'])) {
138 if (0 === strlen($label)) { 172 $html .= '<p><i>But we found a short description: </i></p>';
139 continue; 173 $html .= $content['open_graph']['og_description'];
140 } 174 }
175 }
141 176
142 if (isset($tagsNotYetFlushed[$label])) { 177 $entry->setContent($html);
143 $tagEntity = $tagsNotYetFlushed[$label]; 178 $entry->setReadingTime(Utils::getReadingTime($html));
144 } else {
145 $tagEntity = $this->tagRepository->findOneByLabel($label);
146 179
147 if (is_null($tagEntity)) { 180 if (!empty($content['status'])) {
148 $tagEntity = new Tag(); 181 $entry->setHttpStatus($content['status']);
149 $tagEntity->setLabel($label); 182 }
150 }
151 }
152 183
153 // only add the tag on the entry if the relation doesn't exist 184 if (!empty($content['authors']) && is_array($content['authors'])) {
154 if (false === $entry->getTags()->contains($tagEntity)) { 185 $entry->setPublishedBy($content['authors']);
155 $entry->addTag($tagEntity); 186 }
156 } 187
188 if (!empty($content['all_headers'])) {
189 $entry->setHeaders($content['all_headers']);
190 }
191
192 if (!empty($content['date'])) {
193 $this->updatePublishedAt($entry, $content['date']);
194 }
195
196 if (!empty($content['language'])) {
197 $this->updateLanguage($entry, $content['language']);
198 }
199
200 if (!empty($content['open_graph']['og_image'])) {
201 $this->updatePreviewPicture($entry, $content['open_graph']['og_image']);
202 }
203
204 // if content is an image, define it as a preview too
205 if (!empty($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) {
206 $this->updatePreviewPicture($entry, $content['url']);
207 }
208
209 if (!empty($content['content_type'])) {
210 $entry->setMimetype($content['content_type']);
211 }
212
213 try {
214 $this->tagger->tag($entry);
215 } catch (\Exception $e) {
216 $this->logger->error('Error while trying to automatically tag an entry.', [
217 'entry_url' => $content['url'],
218 'error_msg' => $e->getMessage(),
219 ]);
157 } 220 }
158 } 221 }
159 222
160 /** 223 /**
161 * Validate that the given content as enough value to be used 224 * Validate that the given content has at least a title, an html and a url.
162 * instead of fetch the content from the url.
163 * 225 *
164 * @param array $content 226 * @param array $content
165 * 227 *
@@ -167,6 +229,6 @@ class ContentProxy
167 */ 229 */
168 private function validateContent(array $content) 230 private function validateContent(array $content)
169 { 231 {
170 return isset($content['title']) && isset($content['html']) && isset($content['url']) && isset($content['language']) && isset($content['content_type']); 232 return !empty($content['title']) && !empty($content['html']) && !empty($content['url']);
171 } 233 }
172} 234}