<?php
/**
 * poche, a read it later open source system
 *
 * @category   poche
 * @author     Nicolas Lœuillet <support@inthepoche.com>
 * @copyright  2013
 * @license    http://www.wtfpl.net/ see COPYING file
 */

/**
 * Wraps a single URL: decodes it, validates its shape, strips tracking
 * parameters and fetches the readable content of the page it points to.
 */
class Url
{
    /**
     * The URL in decoded (plain text) form.
     *
     * @var string
     */
    public $url;

    /**
     * @param string $url Base64-encoded URL; it is decoded before being stored.
     */
    public function __construct($url)
    {
        $this->url = base64_decode($url);
    }

    /**
     * @return string The URL currently held by this object.
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Replaces the stored URL. The value is stored as given (no decoding).
     *
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Checks that the URL looks like "<scheme>://<host>[:<port>]<rest>".
     *
     * The host must start with a letter, digit, dash or dot, so numeric hosts
     * (IP addresses, "9gag.com") are accepted.
     *
     * @return bool TRUE when the URL matches the expected shape, FALSE otherwise
     *              (including when the regular expression engine reports an error).
     */
    public function isCorrect()
    {
        $pattern = '|^(.*:)//([a-z0-9\-.]+)(:[0-9]+)?(.*)$|i';

        return preg_match($pattern, $this->url) === 1;
    }

    /**
     * Normalises the stored URL: trims it, decodes HTML entities and cuts it at
     * the first tracking marker found (utm_source parameter or xtor RSS anchor).
     *
     * Everything from the marker to the end of the URL is dropped, including
     * any parameter that follows it.
     */
    public function clean()
    {
        $url = html_entity_decode(trim($this->url));

        // Markers are applied in this order; each one truncates the result of the previous one.
        foreach (array('&utm_source=', '?utm_source=', '#xtor=RSS-') as $marker) {
            $position = strpos($url, $marker);
            if ($position !== FALSE) {
                $url = substr($url, 0, $position);
            }
        }

        $this->url = $url;
    }

    /**
     * Downloads the page behind the URL and extracts its title and main content.
     *
     * The URL is cleaned (see clean()) as a side effect. Invalid URLs are logged
     * through Tools::logm() and not fetched.
     *
     * @return array|bool array('title' => ..., 'content' => ...) on success,
     *                    FALSE when the URL is invalid, nothing could be
     *                    downloaded or Readability could not process the page.
     */
    public function fetchContent()
    {
        if (!$this->isCorrect()) {
            Tools::logm($this->getUrl() . ' is not a valid url');

            return FALSE;
        }

        $this->clean();
        $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));

        // Nothing retrieved and the URL does not start with http(s)://:
        // retry once with "http://" prepended to the URL.
        if (!preg_match('!^https?://!i', $this->getUrl()) && !$this->hasContent($html)) {
            $this->setUrl('http://' . $this->getUrl());
            $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));
        }

        if (!$this->hasContent($html)) {
            return FALSE;
        }

        $html = $this->repairHtml($html);

        $readability = new Readability($html, $this->getUrl());
        $readability->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
        $readability->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;

        if (!$readability->init()) {
            return FALSE;
        }

        $title = $readability->articleTitle->innerHTML;

        return array(
            'title'   => ($title != '' ? $title : _('Untitled')),
            'content' => $readability->articleContent->innerHTML,
        );
    }

    /**
     * Tells whether a download result is a non-empty string.
     *
     * @param mixed $html Value produced by the download step.
     *
     * @return bool
     */
    private function hasContent($html)
    {
        return is_string($html) && strlen($html) > 0;
    }

    /**
     * Runs the markup through the tidy extension when it is available.
     *
     * The original markup is returned untouched when tidy is not installed,
     * fails to parse the document, or produces an empty body.
     *
     * @param string $html Non-empty UTF-8 markup.
     *
     * @return string Repaired markup, or $html unchanged.
     */
    private function repairHtml($html)
    {
        if (!function_exists('tidy_parse_string')) {
            return $html;
        }

        $tidy = tidy_parse_string($html, array(), 'UTF8');
        if (!is_object($tidy)) {
            // tidy_parse_string() returns FALSE on failure: keep the raw markup.
            return $html;
        }

        $tidy->cleanRepair();

        // body() may return NULL when tidy could not build a body node.
        $body = $tidy->body();
        if ($body === NULL) {
            return $html;
        }

        // hasChildren does not seem to work, so compare the serialized body
        // (with all white space removed) against an empty body element.
        if (preg_replace('/\s+/', '', $body->value) === "<body></body>") {
            return $html;
        }

        return $tidy->value;
    }
}