public function processHtml($entryId, $html, $url)
{
$crawler = new Crawler($html);
- $result = $crawler
- ->filterXpath('//img')
+ $imagesCrawler = $crawler
+ ->filterXpath('//img');
+ $imagesUrls = $imagesCrawler
->extract(['src']);
+ $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler);
+ $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));
$relativePath = $this->getRelativePath($entryId);
// download and save the image to the folder
- foreach ($result as $image) {
+ foreach ($imagesUrls as $image) {
$imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath);
if (false === $imagePath) {
*/
public function processSingleImage($entryId, $imagePath, $url, $relativePath = null)
{
+ if (null === $imagePath) {
+ return false;
+ }
+
if (null === $relativePath) {
$relativePath = $this->getRelativePath($entryId);
}
@rmdir($folderPath);
}
+ /**
+  * Get images urls from the srcset image attribute.
+  *
+  * Only image candidates that carry an explicit width descriptor (`480w`)
+  * or pixel density descriptor (`2x`, `1.5x`) are returned; candidates
+  * without a descriptor are skipped.
+  *
+  * @param Crawler $imagesCrawler Crawler over the nodes whose srcset attribute is read
+  *
+  * @return array An array of urls (urls of later nodes come first, duplicates are kept)
+  */
+ private function getSrcsetUrls(Crawler $imagesCrawler)
+ {
+     // Group 1 is the candidate url: a run of non-whitespace characters that
+     // neither starts nor ends with a comma (commas separate candidates).
+     // It must be followed by at least one whitespace character and then by
+     // a width descriptor (digits + "w") or a density descriptor (integer or
+     // decimal number + "x"), itself terminated by whitespace, a comma or
+     // the end of the attribute. Requiring the whitespace prevents digits
+     // inside the url (e.g. "photo2x.jpg") from being taken for a descriptor.
+     $pattern = '/([^\s,]\S*(?<!,))\s+(?:\d+w|(?:\d+(?:\.\d+)?|\.\d+)x)(?=[\s,]|$)/';
+     $urls = [];
+
+     foreach ($imagesCrawler as $image) {
+         $srcsetAttribute = $image->getAttribute('srcset');
+         if ('' === $srcsetAttribute) {
+             continue;
+         }
+
+         // preg_match_all() returns false on error and 0 when nothing matched;
+         // in both cases there is nothing to add for this node.
+         if (preg_match_all($pattern, $srcsetAttribute, $matches) > 0) {
+             // Prepend, so the urls of the most recently visited node come first.
+             $urls = array_merge($matches[1], $urls);
+         }
+     }
+
+     return $urls;
+ }
+
/**
* Setup base folder where all images are going to be saved.
*/