diff options
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 238 |
1 files changed, 150 insertions, 88 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index f222dd88..854acb6a 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -4,11 +4,12 @@ namespace Wallabag\CoreBundle\Helper; | |||
4 | 4 | ||
5 | use Graby\Graby; | 5 | use Graby\Graby; |
6 | use Psr\Log\LoggerInterface; | 6 | use Psr\Log\LoggerInterface; |
7 | use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; | ||
8 | use Symfony\Component\Validator\Constraints\Locale as LocaleConstraint; | ||
9 | use Symfony\Component\Validator\Constraints\Url as UrlConstraint; | ||
10 | use Symfony\Component\Validator\Validator\ValidatorInterface; | ||
7 | use Wallabag\CoreBundle\Entity\Entry; | 11 | use Wallabag\CoreBundle\Entity\Entry; |
8 | use Wallabag\CoreBundle\Entity\Tag; | ||
9 | use Wallabag\CoreBundle\Tools\Utils; | 12 | use Wallabag\CoreBundle\Tools\Utils; |
10 | use Wallabag\CoreBundle\Repository\TagRepository; | ||
11 | use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; | ||
12 | 13 | ||
13 | /** | 14 | /** |
14 | * This kind of proxy class take care of getting the content from an url | 15 | * This kind of proxy class take care of getting the content from an url |
@@ -18,38 +19,37 @@ class ContentProxy | |||
18 | { | 19 | { |
19 | protected $graby; | 20 | protected $graby; |
20 | protected $tagger; | 21 | protected $tagger; |
22 | protected $validator; | ||
21 | protected $logger; | 23 | protected $logger; |
22 | protected $tagRepository; | ||
23 | protected $mimeGuesser; | 24 | protected $mimeGuesser; |
24 | protected $fetchingErrorMessage; | 25 | protected $fetchingErrorMessage; |
26 | protected $eventDispatcher; | ||
25 | 27 | ||
26 | public function __construct(Graby $graby, RuleBasedTagger $tagger, TagRepository $tagRepository, LoggerInterface $logger, $fetchingErrorMessage) | 28 | public function __construct(Graby $graby, RuleBasedTagger $tagger, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage) |
27 | { | 29 | { |
28 | $this->graby = $graby; | 30 | $this->graby = $graby; |
29 | $this->tagger = $tagger; | 31 | $this->tagger = $tagger; |
32 | $this->validator = $validator; | ||
30 | $this->logger = $logger; | 33 | $this->logger = $logger; |
31 | $this->tagRepository = $tagRepository; | ||
32 | $this->mimeGuesser = new MimeTypeExtensionGuesser(); | 34 | $this->mimeGuesser = new MimeTypeExtensionGuesser(); |
33 | $this->fetchingErrorMessage = $fetchingErrorMessage; | 35 | $this->fetchingErrorMessage = $fetchingErrorMessage; |
34 | } | 36 | } |
35 | 37 | ||
36 | /** | 38 | /** |
37 | * Fetch content using graby and hydrate given entry with results information. | 39 | * Update entry using either fetched or provided content. |
38 | * In case we couldn't find content, we'll try to use Open Graph data. | ||
39 | * | ||
40 | * We can also force the content, in case of an import from the v1 for example, so the function won't | ||
41 | * fetch the content from the website but rather use information given with the $content parameter. | ||
42 | * | 40 | * |
43 | * @param Entry $entry Entry to update | 41 | * @param Entry $entry Entry to update |
44 | * @param string $url Url to grab content for | 42 | * @param string $url Url of the content |
45 | * @param array $content An array with AT LEAST keys title, html, url, language & content_type to skip the fetchContent from the url | 43 | * @param array $content Array with content provided for import with AT LEAST keys title, html, url to skip the fetchContent from the url |
46 | * | 44 | * @param bool $disableContentUpdate Whether to skip trying to fetch content using Graby |
47 | * @return Entry | ||
48 | */ | 45 | */ |
49 | public function updateEntry(Entry $entry, $url, array $content = []) | 46 | public function updateEntry(Entry $entry, $url, array $content = [], $disableContentUpdate = false) |
50 | { | 47 | { |
51 | // do we have to fetch the content or the provided one is ok? | 48 | if (!empty($content['html'])) { |
52 | if (empty($content) || false === $this->validateContent($content)) { | 49 | $content['html'] = $this->graby->cleanupHtml($content['html'], $url); |
50 | } | ||
51 | |||
52 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | ||
53 | $fetchedContent = $this->graby->fetchContent($url); | 53 | $fetchedContent = $this->graby->fetchContent($url); |
54 | 54 | ||
55 | // when content is imported, we have information in $content | 55 | // when content is imported, we have information in $content |
@@ -59,107 +59,169 @@ class ContentProxy | |||
59 | } | 59 | } |
60 | } | 60 | } |
61 | 61 | ||
62 | $title = $content['title']; | 62 | // be sure to keep the url in case of error |
63 | if (!$title && isset($content['open_graph']['og_title'])) { | 63 | // so we'll be able to refetch it in the future |
64 | $title = $content['open_graph']['og_title']; | 64 | $content['url'] = !empty($content['url']) ? $content['url'] : $url; |
65 | } | ||
66 | 65 | ||
67 | $html = $content['html']; | 66 | $this->stockEntry($entry, $content); |
68 | if (false === $html) { | 67 | } |
69 | $html = $this->fetchingErrorMessage; | ||
70 | 68 | ||
71 | if (isset($content['open_graph']['og_description'])) { | 69 | /** |
72 | $html .= '<p><i>But we found a short description: </i></p>'; | 70 | * Use a Symfony validator to ensure the language is well formatted. |
73 | $html .= $content['open_graph']['og_description']; | 71 | * |
74 | } | 72 | * @param Entry $entry |
75 | } | 73 | * @param string $value Language to validate and save |
74 | */ | ||
75 | public function updateLanguage(Entry $entry, $value) | ||
76 | { | ||
77 | // some lang are defined as fr-FR, es-ES. | ||
78 | // replacing - by _ might increase language support | ||
79 | $value = str_replace('-', '_', $value); | ||
76 | 80 | ||
77 | $entry->setUrl($content['url'] ?: $url); | 81 | $errors = $this->validator->validate( |
78 | $entry->setTitle($title); | 82 | $value, |
79 | $entry->setContent($html); | 83 | (new LocaleConstraint()) |
80 | $entry->setHttpStatus(isset($content['status']) ? $content['status'] : ''); | 84 | ); |
81 | 85 | ||
82 | $entry->setLanguage(isset($content['language']) ? $content['language'] : ''); | 86 | if (0 === count($errors)) { |
83 | $entry->setMimetype(isset($content['content_type']) ? $content['content_type'] : ''); | 87 | $entry->setLanguage($value); |
84 | $entry->setReadingTime(Utils::getReadingTime($html)); | ||
85 | 88 | ||
86 | $domainName = parse_url($entry->getUrl(), PHP_URL_HOST); | 89 | return; |
87 | if (false !== $domainName) { | ||
88 | $entry->setDomainName($domainName); | ||
89 | } | 90 | } |
90 | 91 | ||
91 | if (isset($content['open_graph']['og_image']) && $content['open_graph']['og_image']) { | 92 | $this->logger->warning('Language validation failed. ' . (string) $errors); |
92 | $entry->setPreviewPicture($content['open_graph']['og_image']); | 93 | } |
94 | |||
95 | /** | ||
96 | * Use a Symfony validator to ensure the preview picture is a real url. | ||
97 | * | ||
98 | * @param Entry $entry | ||
99 | * @param string $value URL to validate and save | ||
100 | */ | ||
101 | public function updatePreviewPicture(Entry $entry, $value) | ||
102 | { | ||
103 | $errors = $this->validator->validate( | ||
104 | $value, | ||
105 | (new UrlConstraint()) | ||
106 | ); | ||
107 | |||
108 | if (0 === count($errors)) { | ||
109 | $entry->setPreviewPicture($value); | ||
110 | |||
111 | return; | ||
93 | } | 112 | } |
94 | 113 | ||
95 | // if content is an image define as a preview too | 114 | $this->logger->warning('PreviewPicture validation failed. ' . (string) $errors); |
96 | if (isset($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { | 115 | } |
97 | $entry->setPreviewPicture($content['url']); | 116 | |
117 | /** | ||
118 | * Update date. | ||
119 | * | ||
120 | * @param Entry $entry | ||
121 | * @param string $value Date to validate and save | ||
122 | */ | ||
123 | public function updatePublishedAt(Entry $entry, $value) | ||
124 | { | ||
125 | $date = $value; | ||
126 | |||
127 | // is it a timestamp? | ||
128 | if (false !== filter_var($date, FILTER_VALIDATE_INT)) { | ||
129 | $date = '@' . $date; | ||
98 | } | 130 | } |
99 | 131 | ||
100 | try { | 132 | try { |
101 | $this->tagger->tag($entry); | 133 | // is it already a DateTime? |
134 | // (it's inside the try/catch in case of fail to be parse time string) | ||
135 | if (!$date instanceof \DateTime) { | ||
136 | $date = new \DateTime($date); | ||
137 | } | ||
138 | |||
139 | $entry->setPublishedAt($date); | ||
102 | } catch (\Exception $e) { | 140 | } catch (\Exception $e) { |
103 | $this->logger->error('Error while trying to automatically tag an entry.', [ | 141 | $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $entry->getUrl(), 'date' => $value]); |
104 | 'entry_url' => $url, | ||
105 | 'error_msg' => $e->getMessage(), | ||
106 | ]); | ||
107 | } | 142 | } |
108 | |||
109 | return $entry; | ||
110 | } | 143 | } |
111 | 144 | ||
112 | /** | 145 | /** |
113 | * Assign some tags to an entry. | 146 | * Stock entry with fetched or imported content. |
147 | * Will fall back to OpenGraph data if available. | ||
114 | * | 148 | * |
115 | * @param Entry $entry | 149 | * @param Entry $entry Entry to stock |
116 | * @param array|string $tags An array of tag or a string coma separated of tag | 150 | * @param array $content Array with at least title, url & html |
117 | * @param array $entitiesReady Entities from the EntityManager which are persisted but not yet flushed | ||
118 | * It is mostly to fix duplicate tag on import @see http://stackoverflow.com/a/7879164/569101 | ||
119 | */ | 151 | */ |
120 | public function assignTagsToEntry(Entry $entry, $tags, array $entitiesReady = []) | 152 | private function stockEntry(Entry $entry, array $content) |
121 | { | 153 | { |
122 | if (!is_array($tags)) { | 154 | $entry->setUrl($content['url']); |
123 | $tags = explode(',', $tags); | 155 | |
156 | $domainName = parse_url($entry->getUrl(), PHP_URL_HOST); | ||
157 | if (false !== $domainName) { | ||
158 | $entry->setDomainName($domainName); | ||
124 | } | 159 | } |
125 | 160 | ||
126 | // keeps only Tag entity from the "not yet flushed entities" | 161 | if (!empty($content['title'])) { |
127 | $tagsNotYetFlushed = []; | 162 | $entry->setTitle($content['title']); |
128 | foreach ($entitiesReady as $entity) { | 163 | } elseif (!empty($content['open_graph']['og_title'])) { |
129 | if ($entity instanceof Tag) { | 164 | $entry->setTitle($content['open_graph']['og_title']); |
130 | $tagsNotYetFlushed[$entity->getLabel()] = $entity; | ||
131 | } | ||
132 | } | 165 | } |
133 | 166 | ||
134 | foreach ($tags as $label) { | 167 | $html = $content['html']; |
135 | $label = trim($label); | 168 | if (false === $html) { |
169 | $html = $this->fetchingErrorMessage; | ||
136 | 170 | ||
137 | // avoid empty tag | 171 | if (!empty($content['open_graph']['og_description'])) { |
138 | if (0 === strlen($label)) { | 172 | $html .= '<p><i>But we found a short description: </i></p>'; |
139 | continue; | 173 | $html .= $content['open_graph']['og_description']; |
140 | } | 174 | } |
175 | } | ||
141 | 176 | ||
142 | if (isset($tagsNotYetFlushed[$label])) { | 177 | $entry->setContent($html); |
143 | $tagEntity = $tagsNotYetFlushed[$label]; | 178 | $entry->setReadingTime(Utils::getReadingTime($html)); |
144 | } else { | ||
145 | $tagEntity = $this->tagRepository->findOneByLabel($label); | ||
146 | 179 | ||
147 | if (is_null($tagEntity)) { | 180 | if (!empty($content['status'])) { |
148 | $tagEntity = new Tag(); | 181 | $entry->setHttpStatus($content['status']); |
149 | $tagEntity->setLabel($label); | 182 | } |
150 | } | ||
151 | } | ||
152 | 183 | ||
153 | // only add the tag on the entry if the relation doesn't exist | 184 | if (!empty($content['authors']) && is_array($content['authors'])) { |
154 | if (false === $entry->getTags()->contains($tagEntity)) { | 185 | $entry->setPublishedBy($content['authors']); |
155 | $entry->addTag($tagEntity); | 186 | } |
156 | } | 187 | |
188 | if (!empty($content['all_headers'])) { | ||
189 | $entry->setHeaders($content['all_headers']); | ||
190 | } | ||
191 | |||
192 | if (!empty($content['date'])) { | ||
193 | $this->updatePublishedAt($entry, $content['date']); | ||
194 | } | ||
195 | |||
196 | if (!empty($content['language'])) { | ||
197 | $this->updateLanguage($entry, $content['language']); | ||
198 | } | ||
199 | |||
200 | if (!empty($content['open_graph']['og_image'])) { | ||
201 | $this->updatePreviewPicture($entry, $content['open_graph']['og_image']); | ||
202 | } | ||
203 | |||
204 | // if content is an image, define it as a preview too | ||
205 | if (!empty($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { | ||
206 | $this->updatePreviewPicture($entry, $content['url']); | ||
207 | } | ||
208 | |||
209 | if (!empty($content['content_type'])) { | ||
210 | $entry->setMimetype($content['content_type']); | ||
211 | } | ||
212 | |||
213 | try { | ||
214 | $this->tagger->tag($entry); | ||
215 | } catch (\Exception $e) { | ||
216 | $this->logger->error('Error while trying to automatically tag an entry.', [ | ||
217 | 'entry_url' => $content['url'], | ||
218 | 'error_msg' => $e->getMessage(), | ||
219 | ]); | ||
157 | } | 220 | } |
158 | } | 221 | } |
159 | 222 | ||
160 | /** | 223 | /** |
161 | * Validate that the given content as enough value to be used | 224 | * Validate that the given content has at least a title, an html and a url. |
162 | * instead of fetch the content from the url. | ||
163 | * | 225 | * |
164 | * @param array $content | 226 | * @param array $content |
165 | * | 227 | * |
@@ -167,6 +229,6 @@ class ContentProxy | |||
167 | */ | 229 | */ |
168 | private function validateContent(array $content) | 230 | private function validateContent(array $content) |
169 | { | 231 | { |
170 | return isset($content['title']) && isset($content['html']) && isset($content['url']) && isset($content['language']) && isset($content['content_type']); | 232 | return !empty($content['title']) && !empty($content['html']) && !empty($content['url']); |
171 | } | 233 | } |
172 | } | 234 | } |