diff options
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 200 |
1 files changed, 122 insertions, 78 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index f222dd88..51bb2ca2 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -5,10 +5,11 @@ namespace Wallabag\CoreBundle\Helper; | |||
5 | use Graby\Graby; | 5 | use Graby\Graby; |
6 | use Psr\Log\LoggerInterface; | 6 | use Psr\Log\LoggerInterface; |
7 | use Wallabag\CoreBundle\Entity\Entry; | 7 | use Wallabag\CoreBundle\Entity\Entry; |
8 | use Wallabag\CoreBundle\Entity\Tag; | ||
9 | use Wallabag\CoreBundle\Tools\Utils; | 8 | use Wallabag\CoreBundle\Tools\Utils; |
10 | use Wallabag\CoreBundle\Repository\TagRepository; | ||
11 | use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; | 9 | use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; |
10 | use Symfony\Component\Validator\Constraints\Locale as LocaleConstraint; | ||
11 | use Symfony\Component\Validator\Constraints\Url as UrlConstraint; | ||
12 | use Symfony\Component\Validator\Validator\ValidatorInterface; | ||
12 | 13 | ||
13 | /** | 14 | /** |
14 | * This kind of proxy class take care of getting the content from an url | 15 | * This kind of proxy class take care of getting the content from an url |
@@ -18,38 +19,37 @@ class ContentProxy | |||
18 | { | 19 | { |
19 | protected $graby; | 20 | protected $graby; |
20 | protected $tagger; | 21 | protected $tagger; |
22 | protected $validator; | ||
21 | protected $logger; | 23 | protected $logger; |
22 | protected $tagRepository; | ||
23 | protected $mimeGuesser; | 24 | protected $mimeGuesser; |
24 | protected $fetchingErrorMessage; | 25 | protected $fetchingErrorMessage; |
26 | protected $eventDispatcher; | ||
25 | 27 | ||
26 | public function __construct(Graby $graby, RuleBasedTagger $tagger, TagRepository $tagRepository, LoggerInterface $logger, $fetchingErrorMessage) | 28 | public function __construct(Graby $graby, RuleBasedTagger $tagger, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage) |
27 | { | 29 | { |
28 | $this->graby = $graby; | 30 | $this->graby = $graby; |
29 | $this->tagger = $tagger; | 31 | $this->tagger = $tagger; |
32 | $this->validator = $validator; | ||
30 | $this->logger = $logger; | 33 | $this->logger = $logger; |
31 | $this->tagRepository = $tagRepository; | ||
32 | $this->mimeGuesser = new MimeTypeExtensionGuesser(); | 34 | $this->mimeGuesser = new MimeTypeExtensionGuesser(); |
33 | $this->fetchingErrorMessage = $fetchingErrorMessage; | 35 | $this->fetchingErrorMessage = $fetchingErrorMessage; |
34 | } | 36 | } |
35 | 37 | ||
36 | /** | 38 | /** |
37 | * Fetch content using graby and hydrate given entry with results information. | 39 | * Update entry using either fetched or provided content. |
38 | * In case we couldn't find content, we'll try to use Open Graph data. | ||
39 | * | 40 | * |
40 | * We can also force the content, in case of an import from the v1 for example, so the function won't | 41 | * @param Entry $entry Entry to update |
41 | * fetch the content from the website but rather use information given with the $content parameter. | 42 | * @param string $url Url of the content |
42 | * | 43 | * @param array $content Array with content provided for import with AT LEAST keys title, html, url to skip the fetchContent from the url |
43 | * @param Entry $entry Entry to update | 44 | * @param bool $disableContentUpdate Whether to skip trying to fetch content using Graby |
44 | * @param string $url Url to grab content for | ||
45 | * @param array $content An array with AT LEAST keys title, html, url, language & content_type to skip the fetchContent from the url | ||
46 | * | ||
47 | * @return Entry | ||
48 | */ | 45 | */ |
49 | public function updateEntry(Entry $entry, $url, array $content = []) | 46 | public function updateEntry(Entry $entry, $url, array $content = [], $disableContentUpdate = false) |
50 | { | 47 | { |
51 | // do we have to fetch the content or the provided one is ok? | 48 | if (!empty($content['html'])) { |
52 | if (empty($content) || false === $this->validateContent($content)) { | 49 | $content['html'] = $this->graby->cleanupHtml($content['html'], $url); |
50 | } | ||
51 | |||
52 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | ||
53 | $fetchedContent = $this->graby->fetchContent($url); | 53 | $fetchedContent = $this->graby->fetchContent($url); |
54 | 54 | ||
55 | // when content is imported, we have information in $content | 55 | // when content is imported, we have information in $content |
@@ -59,8 +59,24 @@ class ContentProxy | |||
59 | } | 59 | } |
60 | } | 60 | } |
61 | 61 | ||
62 | // be sure to keep the url in case of error | ||
63 | // so we'll be able to refetch it in the future | ||
64 | $content['url'] = !empty($content['url']) ? $content['url'] : $url; | ||
65 | |||
66 | $this->stockEntry($entry, $content); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * Stock entry with fetched or imported content. | ||
71 | * Will fall back to OpenGraph data if available. | ||
72 | * | ||
73 | * @param Entry $entry Entry to stock | ||
74 | * @param array $content Array with at least title, url & html | ||
75 | */ | ||
76 | private function stockEntry(Entry $entry, array $content) | ||
77 | { | ||
62 | $title = $content['title']; | 78 | $title = $content['title']; |
63 | if (!$title && isset($content['open_graph']['og_title'])) { | 79 | if (!$title && !empty($content['open_graph']['og_title'])) { |
64 | $title = $content['open_graph']['og_title']; | 80 | $title = $content['open_graph']['og_title']; |
65 | } | 81 | } |
66 | 82 | ||
@@ -68,18 +84,58 @@ class ContentProxy | |||
68 | if (false === $html) { | 84 | if (false === $html) { |
69 | $html = $this->fetchingErrorMessage; | 85 | $html = $this->fetchingErrorMessage; |
70 | 86 | ||
71 | if (isset($content['open_graph']['og_description'])) { | 87 | if (!empty($content['open_graph']['og_description'])) { |
72 | $html .= '<p><i>But we found a short description: </i></p>'; | 88 | $html .= '<p><i>But we found a short description: </i></p>'; |
73 | $html .= $content['open_graph']['og_description']; | 89 | $html .= $content['open_graph']['og_description']; |
74 | } | 90 | } |
75 | } | 91 | } |
76 | 92 | ||
77 | $entry->setUrl($content['url'] ?: $url); | 93 | $entry->setUrl($content['url']); |
78 | $entry->setTitle($title); | 94 | $entry->setTitle($title); |
79 | $entry->setContent($html); | 95 | $entry->setContent($html); |
80 | $entry->setHttpStatus(isset($content['status']) ? $content['status'] : ''); | 96 | $entry->setHttpStatus(isset($content['status']) ? $content['status'] : ''); |
81 | 97 | ||
82 | $entry->setLanguage(isset($content['language']) ? $content['language'] : ''); | 98 | if (!empty($content['date'])) { |
99 | $date = $content['date']; | ||
100 | |||
101 | // is it a timestamp? | ||
102 | if (filter_var($date, FILTER_VALIDATE_INT) !== false) { | ||
103 | $date = '@'.$content['date']; | ||
104 | } | ||
105 | |||
106 | try { | ||
107 | $entry->setPublishedAt(new \DateTime($date)); | ||
108 | } catch (\Exception $e) { | ||
109 | $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $content['url'], 'date' => $content['date']]); | ||
110 | } | ||
111 | } | ||
112 | |||
113 | if (!empty($content['authors']) && is_array($content['authors'])) { | ||
114 | $entry->setPublishedBy($content['authors']); | ||
115 | } | ||
116 | |||
117 | if (!empty($content['all_headers'])) { | ||
118 | $entry->setHeaders($content['all_headers']); | ||
119 | } | ||
120 | |||
121 | $this->validateAndSetLanguage( | ||
122 | $entry, | ||
123 | isset($content['language']) ? $content['language'] : null | ||
124 | ); | ||
125 | |||
126 | $this->validateAndSetPreviewPicture( | ||
127 | $entry, | ||
128 | isset($content['open_graph']['og_image']) ? $content['open_graph']['og_image'] : null | ||
129 | ); | ||
130 | |||
131 | // if content is an image, define it as a preview too | ||
132 | if (!empty($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { | ||
133 | $this->validateAndSetPreviewPicture( | ||
134 | $entry, | ||
135 | $content['url'] | ||
136 | ); | ||
137 | } | ||
138 | |||
83 | $entry->setMimetype(isset($content['content_type']) ? $content['content_type'] : ''); | 139 | $entry->setMimetype(isset($content['content_type']) ? $content['content_type'] : ''); |
84 | $entry->setReadingTime(Utils::getReadingTime($html)); | 140 | $entry->setReadingTime(Utils::getReadingTime($html)); |
85 | 141 | ||
@@ -88,85 +144,73 @@ class ContentProxy | |||
88 | $entry->setDomainName($domainName); | 144 | $entry->setDomainName($domainName); |
89 | } | 145 | } |
90 | 146 | ||
91 | if (isset($content['open_graph']['og_image']) && $content['open_graph']['og_image']) { | ||
92 | $entry->setPreviewPicture($content['open_graph']['og_image']); | ||
93 | } | ||
94 | |||
95 | // if content is an image define as a preview too | ||
96 | if (isset($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { | ||
97 | $entry->setPreviewPicture($content['url']); | ||
98 | } | ||
99 | |||
100 | try { | 147 | try { |
101 | $this->tagger->tag($entry); | 148 | $this->tagger->tag($entry); |
102 | } catch (\Exception $e) { | 149 | } catch (\Exception $e) { |
103 | $this->logger->error('Error while trying to automatically tag an entry.', [ | 150 | $this->logger->error('Error while trying to automatically tag an entry.', [ |
104 | 'entry_url' => $url, | 151 | 'entry_url' => $content['url'], |
105 | 'error_msg' => $e->getMessage(), | 152 | 'error_msg' => $e->getMessage(), |
106 | ]); | 153 | ]); |
107 | } | 154 | } |
108 | |||
109 | return $entry; | ||
110 | } | 155 | } |
111 | 156 | ||
112 | /** | 157 | /** |
113 | * Assign some tags to an entry. | 158 | * Validate that the given content has at least a title, an html and a url. |
159 | * | ||
160 | * @param array $content | ||
114 | * | 161 | * |
115 | * @param Entry $entry | 162 | * @return bool true if valid otherwise false |
116 | * @param array|string $tags An array of tag or a string coma separated of tag | ||
117 | * @param array $entitiesReady Entities from the EntityManager which are persisted but not yet flushed | ||
118 | * It is mostly to fix duplicate tag on import @see http://stackoverflow.com/a/7879164/569101 | ||
119 | */ | 163 | */ |
120 | public function assignTagsToEntry(Entry $entry, $tags, array $entitiesReady = []) | 164 | private function validateContent(array $content) |
121 | { | 165 | { |
122 | if (!is_array($tags)) { | 166 | return !empty($content['title']) && !empty($content['html']) && !empty($content['url']); |
123 | $tags = explode(',', $tags); | 167 | } |
124 | } | ||
125 | |||
126 | // keeps only Tag entity from the "not yet flushed entities" | ||
127 | $tagsNotYetFlushed = []; | ||
128 | foreach ($entitiesReady as $entity) { | ||
129 | if ($entity instanceof Tag) { | ||
130 | $tagsNotYetFlushed[$entity->getLabel()] = $entity; | ||
131 | } | ||
132 | } | ||
133 | |||
134 | foreach ($tags as $label) { | ||
135 | $label = trim($label); | ||
136 | 168 | ||
137 | // avoid empty tag | 169 | /** |
138 | if (0 === strlen($label)) { | 170 | * Use a Symfony validator to ensure the language is well formatted. |
139 | continue; | 171 | * |
140 | } | 172 | * @param Entry $entry |
173 | * @param string $value Language to validate | ||
174 | */ | ||
175 | private function validateAndSetLanguage($entry, $value) | ||
176 | { | ||
177 | // some lang are defined as fr-FR, es-ES. | ||
178 | // replacing - by _ might increase language support | ||
179 | $value = str_replace('-', '_', $value); | ||
141 | 180 | ||
142 | if (isset($tagsNotYetFlushed[$label])) { | 181 | $errors = $this->validator->validate( |
143 | $tagEntity = $tagsNotYetFlushed[$label]; | 182 | $value, |
144 | } else { | 183 | (new LocaleConstraint()) |
145 | $tagEntity = $this->tagRepository->findOneByLabel($label); | 184 | ); |
146 | 185 | ||
147 | if (is_null($tagEntity)) { | 186 | if (0 === count($errors)) { |
148 | $tagEntity = new Tag(); | 187 | $entry->setLanguage($value); |
149 | $tagEntity->setLabel($label); | ||
150 | } | ||
151 | } | ||
152 | 188 | ||
153 | // only add the tag on the entry if the relation doesn't exist | 189 | return; |
154 | if (false === $entry->getTags()->contains($tagEntity)) { | ||
155 | $entry->addTag($tagEntity); | ||
156 | } | ||
157 | } | 190 | } |
191 | |||
192 | $this->logger->warning('Language validation failed. '.(string) $errors); | ||
158 | } | 193 | } |
159 | 194 | ||
160 | /** | 195 | /** |
161 | * Validate that the given content as enough value to be used | 196 | * Use a Symfony validator to ensure the preview picture is a real url. |
162 | * instead of fetch the content from the url. | ||
163 | * | ||
164 | * @param array $content | ||
165 | * | 197 | * |
166 | * @return bool true if valid otherwise false | 198 | * @param Entry $entry |
199 | * @param string $value URL to validate | ||
167 | */ | 200 | */ |
168 | private function validateContent(array $content) | 201 | private function validateAndSetPreviewPicture($entry, $value) |
169 | { | 202 | { |
170 | return isset($content['title']) && isset($content['html']) && isset($content['url']) && isset($content['language']) && isset($content['content_type']); | 203 | $errors = $this->validator->validate( |
204 | $value, | ||
205 | (new UrlConstraint()) | ||
206 | ); | ||
207 | |||
208 | if (0 === count($errors)) { | ||
209 | $entry->setPreviewPicture($value); | ||
210 | |||
211 | return; | ||
212 | } | ||
213 | |||
214 | $this->logger->warning('PreviewPicture validation failed. '.(string) $errors); | ||
171 | } | 215 | } |
172 | } | 216 | } |