From 6212acfc813781501177674baa3139ceef62f78f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Tue, 15 Apr 2014 21:49:00 +0200 Subject: [fix] rss feed content type set to text/xml #636 --- inc/3rdparty/libraries/feedwriter/FeedWriter.php | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index df4c8b4b..79639c0c 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php @@ -89,18 +89,11 @@ define('JSONP', 3, true); */ public function genarateFeed() { - if ($this->version == RSS2) { -// header('Content-type: text/xml; charset=UTF-8'); - // this line prevents Chrome 20 from prompting download - // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss -// header('X-content-type-options: nosniff'); - } elseif ($this->version == JSON) { -// header('Content-type: application/json; charset=UTF-8'); - $this->json = new stdClass(); - } elseif ($this->version == JSONP) { -// header('Content-type: application/javascript; charset=UTF-8'); - $this->json = new stdClass(); - } + header('Content-type: text/xml; charset=UTF-8'); + // this line prevents Chrome 20 from prompting download + // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss + header('X-content-type-options: nosniff'); + $this->printHead(); $this->printChannels(); $this->printItems(); -- cgit v1.2.3 From 87090d8ae7582708d20f3c09fb37d780af860bcd Mon Sep 17 00:00:00 2001 From: tcit Date: Thu, 24 Apr 2014 03:04:02 +0200 Subject: Added epub export function --- .../libraries/PHPePub/EPub.HtmlEntities.php | 266 +++ inc/3rdparty/libraries/PHPePub/EPub.NCX.php | 782 +++++++ inc/3rdparty/libraries/PHPePub/EPub.OPF.php | 1226 ++++++++++ inc/3rdparty/libraries/PHPePub/EPub.php | 2429 ++++++++++++++++++++ 
.../libraries/PHPePub/EPubChapterSplitter.php | 201 ++ inc/3rdparty/libraries/PHPePub/Logger.php | 92 + inc/3rdparty/libraries/PHPePub/Zip.php | 818 +++++++ .../libraries/PHPePub/lib.uuid.LICENCE.txt | 31 + inc/3rdparty/libraries/PHPePub/lib.uuid.php | 314 +++ 9 files changed, 6159 insertions(+) create mode 100644 inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php create mode 100644 inc/3rdparty/libraries/PHPePub/EPub.NCX.php create mode 100644 inc/3rdparty/libraries/PHPePub/EPub.OPF.php create mode 100644 inc/3rdparty/libraries/PHPePub/EPub.php create mode 100644 inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php create mode 100644 inc/3rdparty/libraries/PHPePub/Logger.php create mode 100644 inc/3rdparty/libraries/PHPePub/Zip.php create mode 100644 inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt create mode 100644 inc/3rdparty/libraries/PHPePub/lib.uuid.php (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php new file mode 100644 index 00000000..376b6133 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php @@ -0,0 +1,266 @@ + \ No newline at end of file diff --git a/inc/3rdparty/libraries/PHPePub/EPub.NCX.php b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php new file mode 100644 index 00000000..e5da05cd --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php @@ -0,0 +1,782 @@ + + * @copyright 2009-2014 A. Grandt + * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. 
+ * @version 3.20 + */ +class Ncx { + const _VERSION = 3.20; + + const MIMETYPE = "application/x-dtbncx+xml"; + + private $bookVersion = EPub::BOOK_VERSION_EPUB2; + + private $navMap = NULL; + private $uid = NULL; + private $meta = array(); + private $docTitle = NULL; + private $docAuthor = NULL; + + private $currentLevel = NULL; + private $lastLevel = NULL; + + private $languageCode = "en"; + private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT; + + public $chapterList = array(); + public $referencesTitle = "Guide"; + public $referencesClass = "references"; + public $referencesId = "references"; + public $referencesList = array(); + public $referencesName = array(); + public $referencesOrder = NULL; + + /** + * Class constructor. + * + * @param string $uid + * @param string $docTitle + * @param string $docAuthor + * @param string $languageCode + * @param string $writingDirection + */ + function __construct($uid = NULL, $docTitle = NULL, $docAuthor = NULL, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) { + $this->navMap = new NavMap($writingDirection); + $this->currentLevel = $this->navMap; + $this->setUid($uid); + $this->setDocTitle($docTitle); + $this->setDocAuthor($docAuthor); + $this->setLanguageCode($languageCode); + $this->setWritingDirection($writingDirection); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset($this->bookVersion, $this->navMap, $this->uid, $this->meta); + unset($this->docTitle, $this->docAuthor, $this->currentLevel, $this->lastLevel); + unset($this->languageCode, $this->writingDirection, $this->chapterList, $this->referencesTitle); + unset($this->referencesClass, $this->referencesId, $this->referencesList, $this->referencesName); + unset($this->referencesOrder); + } + + /** + * + * Enter description here ... + * + * @param string $bookVersion + */ + function setVersion($bookVersion) { + $this->bookVersion = is_string($bookVersion) ? 
trim($bookVersion) : EPub::BOOK_VERSION_EPUB2; + } + + /** + * + * @return bool TRUE if the book is set to type ePub 2 + */ + function isEPubVersion2() { + return $this->bookVersion === EPub::BOOK_VERSION_EPUB2; + } + + /** + * + * Enter description here ... + * + * @param string $uid + */ + function setUid($uid) { + $this->uid = is_string($uid) ? trim($uid) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $docTitle + */ + function setDocTitle($docTitle) { + $this->docTitle = is_string($docTitle) ? trim($docTitle) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $docAuthor + */ + function setDocAuthor($docAuthor) { + $this->docAuthor = is_string($docAuthor) ? trim($docAuthor) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $languageCode + */ + function setLanguageCode($languageCode) { + $this->languageCode = is_string($languageCode) ? trim($languageCode) : "en"; + } + + /** + * + * Enter description here ... + * + * @param string $writingDirection + */ + function setWritingDirection($writingDirection) { + $this->writingDirection = is_string($writingDirection) ? trim($writingDirection) : EPub::DIRECTION_LEFT_TO_RIGHT; + } + + /** + * + * Enter description here ... + * + * @param NavMap $navMap + */ + function setNavMap($navMap) { + if ($navMap != NULL && is_object($navMap) && get_class($navMap) === "NavMap") { + $this->navMap = $navMap; + } + } + + /** + * Add one chapter level. + * + * Subsequent chapters will be added to this level. 
+ * + * @param string $navTitle + * @param string $navId + * @param string $navClass + * @param string $isNavHidden + * @param string $writingDirection + * @return NavPoint + */ + function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) { + $navPoint = FALSE; + if (isset($navTitle) && isset($navClass)) { + $navPoint = new NavPoint($navTitle, NULL, $navId, $navClass, $isNavHidden, $writingDirection); + $this->addNavPoint($navPoint); + } + if ($this->lastLevel !== NULL) { + $this->currentLevel = $this->lastLevel; + } + return $navPoint; + } + + /** + * Step back one chapter level. + * + * Subsequent chapters will be added to this chapters parent level. + */ + function backLevel() { + $this->lastLevel = $this->currentLevel; + $this->currentLevel = $this->currentLevel->getParent(); + } + + /** + * Step back to the root level. + * + * Subsequent chapters will be added to the rooot NavMap. + */ + function rootLevel() { + $this->lastLevel = $this->currentLevel; + $this->currentLevel = $this->navMap; + } + + /** + * Step back to the given level. + * Useful for returning to a previous level from deep within the structure. + * Values below 2 will have the same effect as rootLevel() + * + * @param int $newLevel + */ + function setCurrentLevel($newLevel) { + if ($newLevel <= 1) { + $this->rootLevel(); + } else { + while ($this->currentLevel->getLevel() > $newLevel) { + $this->backLevel(); + } + } + } + + /** + * Get current level count. + * The indentation of the current structure point. + * + * @return current level count; + */ + function getCurrentLevel() { + return $this->currentLevel->getLevel(); + } + + /** + * Add child NavPoints to current level. + * + * @param NavPoint $navPoint + */ + function addNavPoint($navPoint) { + $this->lastLevel = $this->currentLevel->addNavPoint($navPoint); + } + + /** + * + * Enter description here ... 
+ * + * @return NavMap + */ + function getNavMap() { + return $this->navMap; + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $content + */ + function addMetaEntry($name, $content) { + $name = is_string($name) ? trim($name) : NULL; + $content = is_string($content) ? trim($content) : NULL; + + if ($name != NULL && $content != NULL) { + $this->meta[] = array($name => $content); + } + } + + /** + * + * Enter description here ... + * + * @return string + */ + function finalize() { + $nav = $this->navMap->finalize(); + + $ncx = "\n"; + if ($this->isEPubVersion2()) { + $ncx .= "\n"; + } + $ncx .= "languageCode . "\" dir=\"" . $this->writingDirection . "\">\n" + . "\t\n" + . "\t\tuid . "\" />\n" + . "\t\tnavMap->getNavLevels() . "\" />\n" + . "\t\t\n" + . "\t\t\n"; + + if (sizeof($this->meta)) { + foreach ($this->meta as $metaEntry) { + list($name, $content) = each($metaEntry); + $ncx .= "\t\t\n"; + } + } + + $ncx .= "\t\n\n\t\n\t\t" + . $this->docTitle + . "\n\t\n\n\t\n\t\t" + . $this->docAuthor + . "\n\t\n\n" + . $nav; + + return $ncx . "\n"; + } + + /** + * + * @param string $title + * @param string $cssFileName + * @return string + */ + function finalizeEPub3($title = "Table of Contents", $cssFileName = NULL) { + $end = "\n" + . "languageCode . "\" lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n" + . "\t\n" + . "\t\t" . $this->docTitle . "\n" + . "\t\t\n"; + if ($cssFileName !== NULL) { + $end .= "\t\t\n"; + } + $end .= "\t\n" + . "\t\n" + . "\t\t
\n" + . "\t\t\t

" . $title . "

\n" + . "\t\t
\n" + . $this->navMap->finalizeEPub3() + . $this->finalizeEPub3Landmarks() + . "\t\n" + . "\n"; + + return $end; + } + + /** + * Build the references for the ePub 2 toc. + * These are merely reference pages added to the end of the navMap though. + * + * @return string + */ + function finalizeReferences() { + if (isset($this->referencesList) && sizeof($this->referencesList) > 0) { + $this->rootLevel(); + $this->subLevel($this->referencesTitle, $this->referencesId, $this->referencesClass); + $refId = 1; + while (list($item, $descriptive) = each($this->referencesOrder)) { + if (array_key_exists($item, $this->referencesList)) { + $name = (empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item]); + $navPoint = new NavPoint($name, $this->referencesList[$item], "ref-" . $refId++); + $this->addNavPoint($navPoint); + } + } + } + } + + /** + * Build the landmarks for the ePub 3 toc. + * @return string + */ + function finalizeEPub3Landmarks() { + $lm = ""; + if (isset($this->referencesList) && sizeof($this->referencesList) > 0) { + $lm = "\t\t\t\n"; + } + return $lm; + } +} + +/** + * ePub NavMap class + */ +class NavMap { + const _VERSION = 3.00; + + private $navPoints = array(); + private $navLevels = 0; + private $writingDirection = NULL; + + /** + * Class constructor. + * + * @return void + */ + function __construct($writingDirection = NULL) { + $this->setWritingDirection($writingDirection); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset($this->navPoints, $this->navLevels, $this->writingDirection); + } + + /** + * Set the writing direction to be used for this NavPoint. + * + * @param string $writingDirection + */ + function setWritingDirection($writingDirection) { + $this->writingDirection = isset($writingDirection) && is_string($writingDirection) ? 
trim($writingDirection) : NULL; + } + + function getWritingDirection() { + return $this->writingDirection; + } + + /** + * Add a navPoint to the root of the NavMap. + * + * @param NavPoint $navPoint + * @return NavMap + */ + function addNavPoint($navPoint) { + if ($navPoint != NULL && is_object($navPoint) && get_class($navPoint) === "NavPoint") { + $navPoint->setParent($this); + if ($navPoint->getWritingDirection() == NULL) { + $navPoint->setWritingDirection($this->writingDirection); + } + $this->navPoints[] = $navPoint; + return $navPoint; + } + return $this; + } + + /** + * The final max depth for the "dtb:depth" meta attribute + * Only available after finalize have been called. + * + * @return number + */ + function getNavLevels() { + return $this->navLevels+1; + } + + function getLevel() { + return 1; + } + + function getParent() { + return $this; + } + + /** + * Finalize the navMap, the final max depth for the "dtb:depth" meta attribute can be retrieved with getNavLevels after finalization + * + */ + function finalize() { + $playOrder = 0; + $this->navLevels = 0; + + $nav = "\t\n"; + if (sizeof($this->navPoints) > 0) { + $this->navLevels++; + foreach ($this->navPoints as $navPoint) { + $retLevel = $navPoint->finalize($nav, $playOrder, 0); + if ($retLevel > $this->navLevels) { + $this->navLevels = $retLevel; + } + } + } + return $nav . "\t\n"; + } + + /** + * Finalize the navMap, the final max depth for the "dtb:depth" meta attribute can be retrieved with getNavLevels after finalization + * + */ + function finalizeEPub3() { + $playOrder = 0; + $level = 0; + $this->navLevels = 0; + + $nav = "\t\t\n"; + } +} + +/** + * ePub NavPoint class + */ +class NavPoint { + const _VERSION = 3.00; + + private $label = NULL; + private $contentSrc = NULL; + private $id = NULL; + private $navClass = NULL; + private $isNavHidden = FALSE; + private $navPoints = array(); + private $parent = NULL; + + /** + * Class constructor. 
+ * + * All three attributes are mandatory, though if ID is set to null (default) the value will be generated. + * + * @param string $label + * @param string $contentSrc + * @param string $id + * @param string $navClass + * @param bool $isNavHidden + * @param string $writingDirection + */ + function __construct($label, $contentSrc = NULL, $id = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) { + $this->setLabel($label); + $this->setContentSrc($contentSrc); + $this->setId($id); + $this->setNavClass($navClass); + $this->setNavHidden($isNavHidden); + $this->setWritingDirection($writingDirection); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset($this->label, $this->contentSrc, $this->id, $this->navClass); + unset($this->isNavHidden, $this->navPoints, $this->parent); + } + + /** + * Set the Text label for the NavPoint. + * + * The label is mandatory. + * + * @param string $label + */ + function setLabel($label) { + $this->label = is_string($label) ? trim($label) : NULL; + } + + /** + * Get the Text label for the NavPoint. + * + * @return string Label + */ + function getLabel() { + return $this->label; + } + + /** + * Set the src reference for the NavPoint. + * + * The src is mandatory for ePub 2. + * + * @param string $contentSrc + */ + function setContentSrc($contentSrc) { + $this->contentSrc = isset($contentSrc) && is_string($contentSrc) ? trim($contentSrc) : NULL; + } + + /** + * Get the src reference for the NavPoint. + * + * @return string content src url. + */ + function getContentSrc() { + return $this->contentSrc; + } + /** + * Set the parent for this NavPoint. + * + * @param NavPoint or NavMap $parent + */ + function setParent($parent) { + if ($parent != NULL && is_object($parent) && + (get_class($parent) === "NavPoint" || get_class($parent) === "NavMap") ) { + $this->parent = $parent; + } + } + + /** + * Get the parent to this NavPoint. 
+ * + * @return NavPoint, or NavMap if the parent is the root. + */ + function getParent() { + return $this->parent; + } + + /** + * Get the current level. 1 = document root. + * + * @return int level + */ + function getLevel() { + return $this->parent === NULL ? 1 : $this->parent->getLevel()+1; + } + + /** + * Set the id for the NavPoint. + * + * The id must be unique, and is mandatory. + * + * @param string $id + */ + function setId($id) { + $this->id = is_string($id) ? trim($id) : NULL; + } + + /** + * Set the class to be used for this NavPoint. + * + * @param string $navClass + */ + function setNavClass($navClass) { + $this->navClass = isset($navClass) && is_string($navClass) ? trim($navClass) : NULL; + } + + /** + * Set the class to be used for this NavPoint. + * + * @param string $navClass + */ + function setNavHidden($isNavHidden) { + $this->isNavHidden = $isNavHidden === TRUE; + } + + /** + * Set the writing direction to be used for this NavPoint. + * + * @param string $writingDirection + */ + function setWritingDirection($writingDirection) { + $this->writingDirection = isset($writingDirection) && is_string($writingDirection) ? trim($writingDirection) : NULL; + } + + function getWritingDirection() { + return $this->writingDirection; + } + + /** + * Add child NavPoints for multi level NavMaps. + * + * @param NavPoint $navPoint + */ + function addNavPoint($navPoint) { + if ($navPoint != NULL && is_object($navPoint) && get_class($navPoint) === "NavPoint") { + $navPoint->setParent($this); + if ($navPoint->getWritingDirection() == NULL) { + $navPoint->setWritingDirection($this->writingDirection); + } + $this->navPoints[] = $navPoint; + return $navPoint; + } + return $this; + } + + /** + * + * Enter description here ... 
+ * + * @param string $nav + * @param int $playOrder + * @param int $level + * @return int + */ + function finalize(&$nav = "", &$playOrder = 0, $level = 0) { + $maxLevel = $level; + $levelAdjust = 0; + + if ($this->isNavHidden) { + return $maxLevel; + } + + if (isset($this->contentSrc)) { + $playOrder++; + + if ($this->id == NULL) { + $this->id = "navpoint-" . $playOrder; + } + $nav .= str_repeat("\t", $level) . "\t\tid . "\" playOrder=\"" . $playOrder . "\">\n" + . str_repeat("\t", $level) . "\t\t\t\n" + . str_repeat("\t", $level) . "\t\t\t\t" . $this->label . "\n" + . str_repeat("\t", $level) . "\t\t\t\n" + . str_repeat("\t", $level) . "\t\t\tcontentSrc . "\" />\n"; + } else { + $levelAdjust++; + } + + if (sizeof($this->navPoints) > 0) { + $maxLevel++; + foreach ($this->navPoints as $navPoint) { + $retLevel = $navPoint->finalize($nav, $playOrder, ($level+1+$levelAdjust)); + if ($retLevel > $maxLevel) { + $maxLevel = $retLevel; + } + } + } + + if (isset($this->contentSrc)) { + $nav .= str_repeat("\t", $level) . "\t\t\n"; + } + + return $maxLevel; + } + + /** + * + * Enter description here ... + * + * @param string $nav + * @param int $playOrder + * @param int $level + * @return int + */ + function finalizeEPub3(&$nav = "", &$playOrder = 0, $level = 0, $subLevelClass = NULL, $subLevelHidden = FALSE) { + $maxLevel = $level; + + if ($this->id == NULL) { + $this->id = "navpoint-" . $playOrder; + } + $indent = str_repeat("\t", $level) . "\t\t\t\t"; + + $nav .= $indent . "
  • id . "\""; + if (isset($this->writingDirection)) { + $nav .= " dir=\"" . $this->writingDirection . "\""; + } + $nav .= ">\n"; + + if (isset($this->contentSrc)) { + $nav .= $indent . "\tcontentSrc . "\">" . $this->label . "\n"; + } else { + $nav .= $indent . "\t" . $this->label . "\n"; + } + + if (sizeof($this->navPoints) > 0) { + $maxLevel++; + + $nav .= $indent . "\t
      navPoints as $navPoint) { + $retLevel = $navPoint->finalizeEPub3($nav, $playOrder, ($level+2), $subLevelClass, $subLevelHidden); + if ($retLevel > $maxLevel) { + $maxLevel = $retLevel; + } + } + $nav .= $indent . "\t
    \n"; + } + + $nav .= $indent . "
  • \n"; + + return $maxLevel; + } +} +?> \ No newline at end of file diff --git a/inc/3rdparty/libraries/PHPePub/EPub.OPF.php b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php new file mode 100644 index 00000000..803a2108 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php @@ -0,0 +1,1226 @@ + + * @copyright 2009-2014 A. Grandt + * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. + * @version 3.20 + */ +class Opf { + const _VERSION = 3.20; + + /* Core Media types. + * These types are the only guaranteed mime types any ePub reader must understand. + * Any other type muse define a fall back whose fallback chain will end in one of these. + */ + const TYPE_GIF = "image/gif"; + const TYPE_JPEG = "image/jpeg"; + const TYPE_PNG = "image/png"; + const TYPE_SVG = "image/svg+xml"; + const TYPE_XHTML = "application/xhtml+xml"; + const TYPE_DTBOOK = "application/x-dtbook+xml"; + const TYPE_CSS = "text/css"; + const TYPE_XML = "application/xml"; + const TYPE_OEB1_DOC = "text/x-oeb1-document"; // Deprecated + const TYPE_OEB1_CSS = "text/x-oeb1-css"; // Deprecated + const TYPE_NCX = "application/x-dtbncx+xml"; + + private $bookVersion = EPub::BOOK_VERSION_EPUB2; + private $ident = "BookId"; + + public $date = NULL; + public $metadata = NULL; + public $manifest = NULL; + public $spine = NULL; + public $guide = NULL; + + /** + * Class constructor. + * + * @return void + */ + function __construct($ident = "BookId", $bookVersion = EPub::BOOK_VERSION_EPUB2) { + $this->setIdent($ident); + $this->setVersion($bookVersion); + $this->metadata = new Metadata(); + $this->manifest = new Manifest(); + $this->spine = new Spine(); + $this->guide = new Guide(); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->bookVersion, $this->ident, $this->date, $this->metadata, $this->manifest, $this->spine, $this->guide); + } + + /** + * + * Enter description here ... 
+ * + * @param string $ident + */ + function setVersion($bookVersion) { + $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2; + } + + function isEPubVersion2() { + return $this->bookVersion === EPub::BOOK_VERSION_EPUB2; + } + + /** + * + * Enter description here ... + * + * @param string $ident + */ + function setIdent($ident = "BookId") { + $this->ident = is_string($ident) ? trim($ident) : "BookId"; + } + + /** + * + * Enter description here ... + * + * @return string + */ + function finalize() { + $opf = "\n" + . "ident . "\" version=\"" . $this->bookVersion . "\">\n"; + + $opf .= $this->metadata->finalize($this->bookVersion, $this->date); + $opf .= $this->manifest->finalize($this->bookVersion); + $opf .= $this->spine->finalize(); + + if ($this->guide->length() > 0) { + $opf .= $this->guide->finalize(); + } + + return $opf . "\n"; + } + + // Convenience functions: + + /** + * + * Enter description here ... + * + * @param string $title + * @param string $language + * @param string $identifier + * @param string $identifierScheme + */ + function initialize($title, $language, $identifier, $identifierScheme) { + $this->metadata->addDublinCore(new DublinCore("title", $title)); + $this->metadata->addDublinCore(new DublinCore("language", $language)); + + $dc = new DublinCore("identifier", $identifier); + $dc->addAttr("id", $this->ident); + $dc->addOpfAttr("scheme", $identifierScheme); + $this->metadata->addDublinCore($dc); + } + + /** + * + * Enter description here ... + * + * @param string $id + * @param string $href + * @param string $mediaType + */ + function addItem($id, $href, $mediaType, $properties = NULL) { + $this->manifest->addItem(new Item($id, $href, $mediaType, $properties)); + } + + /** + * + * Enter description here ... 
+ * + * @param string $idref + * @param bool $linear + */ + function addItemRef($idref, $linear = TRUE) { + $this->spine->addItemref(new Itemref($idref, $linear)); + } + + /** + * + * Enter description here ... + * + * @param string $type + * @param string $title + * @param string $href + */ + function addReference($type, $title, $href) { + $this->guide->addReference(new Reference($type, $title, $href)); + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $value + */ + function addDCMeta($name, $value) { + $this->metadata->addDublinCore(new DublinCore($name, $value)); + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $content + */ + function addMeta($name, $content) { + $this->metadata->addMeta($name, $content); + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $fileAs + * @param string $role Use the MarcCode constants + */ + function addCreator($name, $fileAs = NULL, $role = NULL) { + $dc = new DublinCore(DublinCore::CREATOR, trim($name)); + + if ($fileAs !== NULL) { + $dc->addOpfAttr("file-as", trim($fileAs)); + } + + if ($role !== NULL) { + $dc->addOpfAttr("role", trim($role)); + } + + $this->metadata->addDublinCore($dc); + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $fileAs + * @param string $role Use the MarcCode constants + */ + function addColaborator($name, $fileAs = NULL, $role = NULL) { + $dc = new DublinCore(DublinCore::CONTRIBUTOR, trim($name)); + + if ($fileAs !== NULL) { + $dc->addOpfAttr("file-as", trim($fileAs)); + } + + if ($role !== NULL) { + $dc->addOpfAttr("role", trim($role)); + } + + $this->metadata->addDublinCore($dc); + } +} + +/** + * ePub OPF Metadata structures + */ +class Metadata { + const _VERSION = 3.00; + + private $dc = array(); + private $meta = array(); + + /** + * Class constructor. 
+ * + * @return void + */ + function __construct() { + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->dc, $this->meta); + } + + /** + * + * Enter description here ... + * + * @param DublinCore $dc + */ + function addDublinCore($dc) { + if ($dc != NULL && is_object($dc) && get_class($dc) === "DublinCore") { + $this->dc[] = $dc; + } + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $content + */ + function addMeta($name, $content) { + $name = is_string($name) ? trim($name) : NULL; + if (isset($name)) { + $content = is_string($content) ? trim($content) : NULL; + } + if (isset($content)) { + $this->meta[] = array ($name => $content); + } + } + + /** + * + * @param string $bookVersion + * @param int $date + * @return string + */ + function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2, $date = NULL) { + $metadata = "\t\n"; + } else { + $metadata .= "\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n"; + if (!isset($date)) { + $date = time(); + } + $metadata .= "\t\t" . gmdate("Y-m-d\TH:i:s\Z", $date) . "\n"; + } + + foreach ($this->dc as $dc) { + $metadata .= $dc->finalize($bookVersion); + } + + foreach ($this->meta as $data) { + list($name, $content) = each($data); + $metadata .= "\t\t\n"; + } + + return $metadata . 
"\t\n"; + } +} + +/** + * ePub OPF Dublin Core (dc:) Metadata structures + */ +class DublinCore { + const _VERSION = 3.00; + + const CONTRIBUTOR = "contributor"; + const COVERAGE = "coverage"; + const CREATOR = "creator"; + const DATE = "date"; + const DESCRIPTION = "description"; + const FORMAT = "format"; + const IDENTIFIER = "identifier"; + const LANGUAGE = "language"; + const PUBLISHER = "publisher"; + const RELATION = "relation"; + const RIGHTS = "rights"; + const SOURCE = "source"; + const SUBJECT = "subject"; + const TITLE = "title"; + const TYPE = "type"; + + private $dcName = NULL; + private $dcValue = NULL; + private $attr = array(); + private $opfAttr = array(); + + /** + * Class constructor. + * + * @return void + */ + function __construct($name, $value) { + $this->setDc($name, $value); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->dcName, $this->dcValue, $this->attr, $this->opfAttr); + } + + /** + * + * Enter description here ... + * + * @param string $name + * @param string $value + */ + function setDc($name, $value) { + $this->dcName = is_string($name) ? trim($name) : NULL; + if (isset($this->dcName)) { + $this->dcValue = isset($value) ? (string)$value : NULL; + } + if (! isset($this->dcValue)) { + $this->dcName = NULL; + } + } + + /** + * + * Enter description here ... + * + * @param string $attrName + * @param string $attrValue + */ + function addAttr($attrName, $attrValue) { + $attrName = is_string($attrName) ? trim($attrName) : NULL; + if (isset($attrName)) { + $attrValue = is_string($attrValue) ? trim($attrValue) : NULL; + } + if (isset($attrValue)) { + $this->attr[$attrName] = $attrValue; + } + } + + /** + * + * Enter description here ... + * + * @param string $opfAttrName + * @param string $opfAttrValue + */ + function addOpfAttr($opfAttrName, $opfAttrValue) { + $opfAttrName = is_string($opfAttrName) ? 
trim($opfAttrName) : NULL; + if (isset($opfAttrName)) { + $opfAttrValue = is_string($opfAttrValue) ? trim($opfAttrValue) : NULL; + } + if (isset($opfAttrValue)) { + $this->opfAttr[$opfAttrName] = $opfAttrValue; + } + } + + + /** + * + * @param string $bookVersion + * @return string + */ + function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) { + $dc = "\t\tdcName; + + if (sizeof($this->attr) > 0) { + while (list($name, $content) = each($this->attr)) { + $dc .= " " . $name . "=\"" . $content . "\""; + } + } + + if ($bookVersion === EPub::BOOK_VERSION_EPUB2 && sizeof($this->opfAttr) > 0) { + while (list($name, $content) = each($this->opfAttr)) { + $dc .= " opf:" . $name . "=\"" . $content . "\""; + } + } + + return $dc . ">" . $this->dcValue . "dcName . ">\n"; + } +} + +/** + * ePub OPF Manifest structure + */ +class Manifest { + const _VERSION = 3.00; + + private $items = array(); + + /** + * Class constructor. + * + * @return void + */ + function __construct() { + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->items); + } + + /** + * + * Enter description here ... + * + * @param Item $item + */ + function addItem($item) { + if ($item != NULL && is_object($item) && get_class($item) === "Item") { + $this->items[] = $item; + } + } + + /** + * + * @param string $bookVersion + * @return string + */ + function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) { + $manifest = "\n\t\n"; + foreach ($this->items as $item) { + $manifest .= $item->finalize($bookVersion); + } + return $manifest . "\t\n"; + } +} + +/** + * ePub OPF Item structure + */ +class Item { + const _VERSION = 3.00; + + private $id = NULL; + private $href = NULL; + private $mediaType = NULL; + private $properties = NULL; + private $requiredNamespace = NULL; + private $requiredModules = NULL; + private $fallback = NULL; + private $fallbackStyle = NULL; + + /** + * Class constructor. 
+ * + * @return void + */ + function __construct($id, $href, $mediaType, $properties = NULL) { + $this->setId($id); + $this->setHref($href); + $this->setMediaType($mediaType); + $this->setProperties($properties); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->id, $this->href, $this->mediaType); + unset ($this->properties, $this->requiredNamespace, $this->requiredModules, $this->fallback, $this->fallbackStyle); + } + + /** + * + * Enter description here ... + * + * @param string $id + */ + function setId($id) { + $this->id = is_string($id) ? trim($id) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $href + */ + function setHref($href) { + $this->href = is_string($href) ? trim($href) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $mediaType + */ + function setMediaType($mediaType) { + $this->mediaType = is_string($mediaType) ? trim($mediaType) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $properties + */ + function setProperties($properties) { + $this->properties = is_string($properties) ? trim($properties) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $requiredNamespace + */ + function setRequiredNamespace($requiredNamespace) { + $this->requiredNamespace = is_string($requiredNamespace) ? trim($requiredNamespace) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $requiredModules + */ + function setRequiredModules($requiredModules) { + $this->requiredModules = is_string($requiredModules) ? trim($requiredModules) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $fallback + */ + function setfallback($fallback) { + $this->fallback = is_string($fallback) ? trim($fallback) : NULL; + } + + /** + * + * Enter description here ... 
+ * + * @param string $fallbackStyle + */ + function setFallbackStyle($fallbackStyle) { + $this->fallbackStyle = is_string($fallbackStyle) ? trim($fallbackStyle) : NULL; + } + + /** + * + * @param string $bookVersion + * @return string + */ + function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) { + $item = "\t\tid . "\" href=\"" . $this->href . "\" media-type=\"" . $this->mediaType . "\" "; + if ($bookVersion === EPub::BOOK_VERSION_EPUB3 && isset($this->properties)) { + $item .= "properties=\"" . $this->properties . "\" "; + } + if (isset($this->requiredNamespace)) { + $item .= "\n\t\t\trequired-namespace=\"" . $this->requiredNamespace . "\" "; + if (isset($this->requiredModules)) { + $item .= "required-modules=\"" . $this->requiredModules . "\" "; + } + } + if (isset($this->fallback)) { + $item .= "\n\t\t\tfallback=\"" . $this->fallback . "\" "; + } + if (isset($this->fallbackStyle)) { + $item .= "\n\t\t\tfallback-style=\"" . $this->fallbackStyle . "\" "; + } + return $item . "/>\n"; + } +} + +/** + * ePub OPF Spine structure + */ +class Spine { + const _VERSION = 1.00; + + private $itemrefs = array(); + private $toc = NULL; + + /** + * Class constructor. + * + * @return void + */ + function __construct($toc = "ncx") { + $this->setToc($toc); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->itemrefs, $this->toc); + } + + /** + * + * Enter description here ... + * + * @param string $toc + */ + function setToc($toc) { + $this->toc = is_string($toc) ? trim($toc) : NULL; + } + + /** + * + * Enter description here ... + * + * @param Itemref $itemref + */ + function addItemref($itemref) { + if ($itemref != NULL + && is_object($itemref) + && get_class($itemref) === "Itemref" + && !isset($this->itemrefs[$itemref->getIdref()])) { + $this->itemrefs[$itemref->getIdref()] = $itemref; + } + } + + /** + * + * Enter description here ... + * + * @return string + */ + function finalize() { + $spine = "\n\ttoc . 
"\">\n"; + foreach ($this->itemrefs as $itemref) { + $spine .= $itemref->finalize(); + } + return $spine . "\t\n"; + } +} + +/** + * ePub OPF ItemRef structure + */ +class Itemref { + const _VERSION = 3.00; + + private $idref = NULL; + private $linear = TRUE; + + /** + * Class constructor. + * + * @return void + */ + function __construct($idref, $linear = TRUE) { + $this->setIdref($idref); + $this->setLinear($linear); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->idref, $this->linear); + } + + /** + * + * Enter description here ... + * + * @param string $idref + */ + function setIdref($idref) { + $this->idref = is_string($idref) ? trim($idref) : NULL; + } + + /** + * + * Enter description here ... + * + * @return string $idref + */ + function getIdref() { + return $this->idref; + } + + /** + * + * Enter description here ... + * + * @param bool $linear + */ + function setLinear($linear = TRUE) { + $this->linear = $linear === TRUE; + } + + /** + * + * Enter description here ... + * + * @return string + */ + function finalize() { + $itemref = "\t\tidref . "\""; + if ($this->linear == FALSE) { + return $itemref .= " linear=\"no\" />\n"; + } + return $itemref . " />\n"; + } +} + +/** + * ePub OPF Guide structure + */ +class Guide { + const _VERSION = 3.00; + + private $references = array(); + + /** + * Class constructor. + * + * @return void + */ + function __construct() { + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->references); + } + + /** + * + * Enter description here ... + * + */ + function length() { + return sizeof($this->references); + } + + /** + * + * Enter description here ... + * + * @param Reference $reference + */ + function addReference($reference) { + if ($reference != NULL && is_object($reference) && get_class($reference) === "Reference") { + $this->references[] = $reference; + } + } + + /** + * + * Enter description here ... 
+ * + * @return string + */ + function finalize() { + $ref = ""; + if (sizeof($this->references) > 0) { + $ref = "\n\t\n"; + foreach ($this->references as $reference) { + $ref .= $reference->finalize(); + } + $ref .= "\t\n"; + } + return $ref; + } +} + +/** + * Reference constants + */ +class Reference { + const _VERSION = 1.00; + + /* REFERENCE types are derived from the "Chicago Manual of Style" + */ + + /** Acknowledgements page */ + const ACKNOWLEDGEMENTS = "acknowledgements"; + + /** Bibliography page */ + const BIBLIOGRAPHY = "bibliography"; + + /** Colophon page */ + const COLOPHON = "colophon"; + + /** Copyright page */ + const COPYRIGHT_PAGE = "copyright-page"; + + /** Dedication */ + const DEDICATION = "dedication"; + + /** Epigraph */ + const EPIGRAPH = "epigraph"; + + /** Foreword */ + const FOREWORD = "foreword"; + + /** Glossary page */ + const GLOSSARY = "glossary"; + + /** back-of-book style index */ + const INDEX = "index"; + + /** List of illustrations */ + const LIST_OF_ILLUSTRATIONS = "loi"; + + /** List of tables */ + const LIST_OF_TABLES = "lot"; + + /** Notes page */ + const NOTES = "notes"; + + /** Preface page */ + const PREFACE = "preface"; + + /** Table of contents */ + const TABLE_OF_CONTENTS = "toc"; + + /** Page with possibly title, author, publisher, and other metadata */ + const TITLE_PAGE = "titlepage"; + + /** First page of the book, ie. first page of the first chapter */ + const TEXT = "text"; + + // ****************** + // ePub3 constants + // ****************** + + // Document partitions + /** The publications cover(s), jacket information, etc. This is officially in ePub3, but works for ePub 2 as well */ + const COVER = "cover"; + + /** Preliminary material to the content body, such as tables of contents, dedications, etc. */ + const FRONTMATTER = "frontmatter"; + + /** The main (body) content of a document. 
*/ + const BODYMATTER = "bodymatter"; + + /** Ancillary material occurring after the document body, such as indices, appendices, etc. */ + const BACKMATTER = "backmatter"; + + + private $type = NULL; + private $title = NULL; + private $href = NULL; + + /** + * Class constructor. + * + * @param string $type + * @param string $title + * @param string $href + */ + function __construct($type, $title, $href) { + $this->setType($type); + $this->setTitle($title); + $this->setHref($href); + } + + /** + * Class destructor + * + * @return void + */ + function __destruct() { + unset ($this->type, $this->title, $this->href); + } + + /** + * + * Enter description here ... + * + * @param string $type + */ + function setType($type) { + $this->type = is_string($type) ? trim($type) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $title + */ + function setTitle($title) { + $this->title = is_string($title) ? trim($title) : NULL; + } + + /** + * + * Enter description here ... + * + * @param string $href + */ + function setHref($href) { + $this->href = is_string($href) ? trim($href) : NULL; + } + + /** + * + * Enter description here ... + * + * @return string + */ + function finalize() { + return "\t\ttype . "\" title=\"" . $this->title . "\" href=\"" . $this->href . "\" />\n"; + } +} + +/** + * Common Marc codes. + * Ref: http://www.loc.gov/marc/relators/ + */ +class MarcCode { + const _VERSION = 3.00; + + /** + * Adapter + * + * Use for a person who + * 1) reworks a musical composition, usually for a different medium, or + * 2) rewrites novels or stories for motion pictures or other audiovisual medium. + */ + const ADAPTER = "adp"; + + /** + * Annotator + * + * Use for a person who writes manuscript annotations on a printed item. 
+ */ + const ANNOTATOR = "ann"; + + /** + * Arranger + * + * Use for a person who transcribes a musical composition, usually for a different + * medium from that of the original; in an arrangement the musical substance remains + * essentially unchanged. + */ + const ARRANGER = "arr"; + + /** + * Artist + * + * Use for a person (e.g., a painter) who conceives, and perhaps also implements, + * an original graphic design or work of art, if specific codes (e.g., [egr], + * [etr]) are not desired. For book illustrators, prefer Illustrator [ill]. + */ + const ARTIST = "art"; + + /** + * Associated name + * + * Use as a general relator for a name associated with or found in an item or + * collection, or which cannot be determined to be that of a Former owner [fmo] + * or other designated relator indicative of provenance. + */ + const ASSOCIATED_NAME = "asn"; + + /** + * Author + * + * Use for a person or corporate body chiefly responsible for the intellectual + * or artistic content of a work. This term may also be used when more than one + * person or body bears such responsibility. + */ + const AUTHOR = "aut"; + + /** + * Author in quotations or text extracts + * + * Use for a person whose work is largely quoted or extracted in a works to which + * he or she did not contribute directly. Such quotations are found particularly + * in exhibition catalogs, collections of photographs, etc. + */ + const AUTHOR_IN_QUOTES = "aqt"; + + /** + * Author of afterword, colophon, etc. + * + * Use for a person or corporate body responsible for an afterword, postface, + * colophon, etc. but who is not the chief author of a work. + */ + const AUTHOR_OF_AFTERWORD = "aft"; + + /** + * Author of introduction, etc. + * + * Use for a person or corporate body responsible for an introduction, preface, + * foreword, or other critical matter, but who is not the chief author. 
+ */ + const AUTHOR_OF_INTRO = "aui"; + + /** + * Bibliographic antecedent + * + * Use for the author responsible for a work upon which the work represented by + * the catalog record is based. This can be appropriate for adaptations, sequels, + * continuations, indexes, etc. + */ + const BIB_ANTECEDENT = "ant"; + + /** + * Book producer + * + * Use for the person or firm responsible for the production of books and other + * print media, if specific codes (e.g., [bkd], [egr], [tyd], [prt]) are not desired. + */ + const BOOK_PRODUCER = "bkp"; + + /** + * Collaborator + * + * Use for a person or corporate body that takes a limited part in the elaboration + * of a work of another author or that brings complements (e.g., appendices, notes) + * to the work of another author. + */ + const COLABORATOR = "clb"; + + /** + * Commentator + * + * Use for a person who provides interpretation, analysis, or a discussion of the + * subject matter on a recording, motion picture, or other audiovisual medium. + * Compiler [com] Use for a person who produces a work or publication by selecting + * and putting together material from the works of various persons or bodies. + */ + const COMMENTATOR = "cmm"; + + /** + * Designer + * + * Use for a person or organization responsible for design if specific codes (e.g., + * [bkd], [tyd]) are not desired. + */ + const DESIGNER = "dsr"; + + /** + * Editor + * + * Use for a person who prepares for publication a work not primarily his/her own, + * such as by elucidating text, adding introductory or other critical matter, or + * technically directing an editorial staff. + */ + const EDITORT = "edt"; + + /** + * Illustrator + * + * Use for the person who conceives, and perhaps also implements, a design or + * illustration, usually to accompany a written text. + */ + const ILLUSTRATOR = "ill"; + + /** + * Lyricist + * + * Use for the writer of the text of a song. 
+ */ + const LYRICIST = "lyr"; + + /** + * Metadata contact + * + * Use for the person or organization primarily responsible for compiling and + * maintaining the original description of a metadata set (e.g., geospatial + * metadata set). + */ + const METADATA_CONTACT = "mdc"; + + /** + * Musician + * + * Use for the person who performs music or contributes to the musical content + * of a work when it is not possible or desirable to identify the function more + * precisely. + */ + const MUSICIAN = "mus"; + + /** + * Narrator + * + * Use for the speaker who relates the particulars of an act, occurrence, or + * course of events. + */ + const NARRATOR = "nrt"; + + /** + * Other + * + * Use for relator codes from other lists which have no equivalent in the MARC + * list or for terms which have not been assigned a code. + */ + const OTHER = "oth"; + + /** + * Photographer + * + * Use for the person or organization responsible for taking photographs, whether + * they are used in their original form or as reproductions. + */ + const PHOTOGRAPHER = "pht"; + + /** + * Printer + * + * Use for the person or organization who prints texts, whether from type or plates. + */ + const PRINTER = "prt"; + + /** + * Redactor + * + * Use for a person who writes or develops the framework for an item without + * being intellectually responsible for its content. + */ + const REDACTOR = "red"; + + /** + * Reviewer + * + * Use for a person or corporate body responsible for the review of book, motion + * picture, performance, etc. + */ + const REVIEWER = "rev"; + + /** + * Sponsor + * + * Use for the person or agency that issued a contract, or under whose auspices + * a work has been written, printed, published, etc. + */ + const SPONSOR = "spn"; + + /** + * Thesis advisor + * + * Use for the person under whose supervision a degree candidate develops and + * presents a thesis, memoir, or text of a dissertation. 
+ */ + const THESIS_ADVISOR = "ths"; + + /** + * Transcriber + * + * Use for a person who prepares a handwritten or typewritten copy from original + * material, including from dictated or orally recorded material. + */ + const TRANSCRIBER = "trc"; + + /** + * Translator + * + * Use for a person who renders a text from one language into another, or from + * an older form of a language into the modern form. + */ + const TRANSLATOR = "trl"; +} +?> diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php new file mode 100644 index 00000000..836c0512 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.php @@ -0,0 +1,2429 @@ + + * @copyright 2009-2014 A. Grandt + * @license GNU LGPL 2.1 + * @version 3.20 + * @link http://www.phpclasses.org/package/6115 + * @link https://github.com/Grandt/PHPePub + * @uses Zip.php version 1.50; http://www.phpclasses.org/browse/package/6110.html or https://github.com/Grandt/PHPZip + */ +class EPub { + const VERSION = 3.20; + const REQ_ZIP_VERSION = 1.60; + + const IDENTIFIER_UUID = 'UUID'; + const IDENTIFIER_URI = 'URI'; + const IDENTIFIER_ISBN = 'ISBN'; + + /** Ignore all external references, and do not process the file for these */ + const EXTERNAL_REF_IGNORE = 0; + /** Process the file for external references and add them to the book */ + const EXTERNAL_REF_ADD = 1; + /** Process the file for external references and add them to the book, but remove images, and img tags */ + const EXTERNAL_REF_REMOVE_IMAGES = 2; + /** Process the file for external references and add them to the book, but replace images, and img tags with [image] */ + const EXTERNAL_REF_REPLACE_IMAGES = 3; + + const DIRECTION_LEFT_TO_RIGHT = "ltr"; + const DIRECTION_RIGHT_TO_LEFT = "rtl"; + + const BOOK_VERSION_EPUB2 = "2.0"; + const BOOK_VERSION_EPUB3 = "3.0"; + + private $bookVersion = EPub::BOOK_VERSION_EPUB2; + + public $maxImageWidth = 768; + public $maxImageHeight = 1024; + + public $splitDefaultSize = 250000; + /** Gifs can 
crash some early ADE based readers, and are disabled by default. + * getImage will convert these if it can, unless this is set to TRUE. + */ + public $isGifImagesEnabled = FALSE; + public $isReferencesAddedToToc = TRUE; + + private $zip; + + private $title = ""; + private $language = "en"; + private $identifier = ""; + private $identifierType = ""; + private $description = ""; + private $author = ""; + private $authorSortKey = ""; + private $publisherName = ""; + private $publisherURL = ""; + private $date = 0; + private $rights = ""; + private $coverage = ""; + private $relation = ""; + private $sourceURL = ""; + + private $chapterCount = 0; + private $opf = NULL; + private $ncx = NULL; + private $isFinalized = FALSE; + private $isCoverImageSet = FALSE; + private $buildTOC = FALSE; + private $tocTitle = NULL; + private $tocFileName = NULL; + private $tocCSSClass = NULL; + private $tocAddReferences = FALSE; + private $tocCssFileName = NULL; + + private $fileList = array(); + private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT; + private $languageCode = "en"; + + /** + * Used for building the TOC. + * If this list is overwritten it MUST contain at least "text" as an element. + */ + public $referencesOrder = NULL; + + private $dateformat = 'Y-m-d\TH:i:s.000000P'; // ISO 8601 long + private $dateformatShort = 'Y-m-d'; // short date format to placate ePubChecker. 
+ private $headerDateFormat = "D, d M Y H:i:s T"; + + protected $isCurlInstalled; + protected $isGdInstalled; + protected $isExifInstalled; + protected $isFileGetContentsInstalled; + protected $isFileGetContentsExtInstalled; + + private $bookRoot = "OEBPS/"; + private $docRoot = NULL; + private $EPubMark = TRUE; + private $generator = ""; + + private $log = NULL; + public $isLogging = TRUE; + + public $encodeHTML = FALSE; + + private $mimetypes = array( + "js" => "application/x-javascript", "swf" => "application/x-shockwave-flash", "xht" => "application/xhtml+xml", "xhtml" => "application/xhtml+xml", "zip" => "application/zip", + "aif" => "audio/x-aiff", "aifc" => "audio/x-aiff", "aiff" => "audio/x-aiff", "au" => "audio/basic", "kar" => "audio/midi", "m3u" => "audio/x-mpegurl", "mid" => "audio/midi", "midi" => "audio/midi", "mp2" => "audio/mpeg", "mp3" => "audio/mpeg", "mpga" => "audio/mpeg", "oga" => "audio/ogg", "ogg" => "audio/ogg", "ra" => "audio/x-realaudio", "ram" => "audio/x-pn-realaudio", "rm" => "audio/x-pn-realaudio", "rpm" => "audio/x-pn-realaudio-plugin", "snd" => "audio/basic", "wav" => "audio/x-wav", + "bmp" => "image/bmp", "djv" => "image/vnd.djvu", "djvu" => "image/vnd.djvu", "gif" => "image/gif", "ief" => "image/ief", "jpe" => "image/jpeg", "jpeg" => "image/jpeg", "jpg" => "image/jpeg", "pbm" => "image/x-portable-bitmap", "pgm" => "image/x-portable-graymap", "png" => "image/png", "pnm" => "image/x-portable-anymap", "ppm" => "image/x-portable-pixmap", "ras" => "image/x-cmu-raster", "rgb" => "image/x-rgb", "tif" => "image/tif", "tiff" => "image/tiff", "wbmp" => "image/vnd.wap.wbmp", "xbm" => "image/x-xbitmap", "xpm" => "image/x-xpixmap", "xwd" => "image/x-windowdump", + "asc" => "text/plain", "css" => "text/css", "etx" => "text/x-setext", "htm" => "text/html", "html" => "text/html", "rtf" => "text/rtf", "rtx" => "text/richtext", "sgm" => "text/sgml", "sgml" => "text/sgml", "tsv" => "text/tab-seperated-values", "txt" => "text/plain", "wml" => 
"text/vnd.wap.wml", "wmls" => "text/vnd.wap.wmlscript", "xml" => "text/xml", "xsl" => "text/xml", + "avi" => "video/x-msvideo", "mov" => "video/quicktime", "movie" => "video/x-sgi-movie", "mp4" => "video/mp4", "mpe" => "video/mpeg", "mpeg" => "video/mpeg", "mpg" => "video/mpeg", "mxu" => "video/vnd.mpegurl", "ogv" => "video/ogg", "qt" => "video/quicktime", "webm" => "video/webm"); + + // These are the ONLY allowed types in that these are the ones ANY reader must support, any other MUST have the fallback attribute pointing to one of these. + private $coreMediaTypes = array("image/gif", "image/jpeg", "image/png", "image/svg+xml", "application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/css", "text/x-oeb1-css", "text/x-oeb1-document"); + + private $opsContentTypes = array("application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/x-oeb1-document"); + + private $forbiddenCharacters = array("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%"); + + private $htmlContentHeader = "\n\n\n\n\n\n\n"; + private $htmlContentFooter = "\n\n"; + + /** + * Class constructor. + * + * @return void + */ + function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) { + include_once("Zip.php"); + include_once("Logger.php"); + + $this->bookVersion = $bookVersion; + $this->writingDirection = $writingDirection; + $this->languageCode = $languageCode; + + $this->log = new Logger("EPub", $this->isLogging); + + /* Prepare Logging. Just in case it's used. later */ + if ($this->isLogging) { + $this->log->logLine("EPub class version....: " . self::VERSION); + $this->log->logLine("EPub req. Zip version.: " . self::REQ_ZIP_VERSION); + $this->log->logLine("Zip version...........: " . 
Zip::VERSION); + $this->log->dumpInstalledModules(); + } + + if (!defined("Zip::VERSION") || Zip::VERSION < self::REQ_ZIP_VERSION) { + die("

    EPub version " . self::VERSION . " requires Zip.php at version " . self::REQ_ZIP_VERSION . " or higher.
    You can obtain the latest version from http://www.phpclasses.org/browse/package/6110.html.

    "); + } + + include_once("EPubChapterSplitter.php"); + include_once("EPub.HtmlEntities.php"); + include_once("EPub.NCX.php"); + include_once("EPub.OPF.php"); + + $this->initialize(); + } + + /** + * Class destructor + * + * @return void + * @TODO make sure elements in the destructor match the current class elements + */ + function __destruct() { + unset($this->bookVersion, $this->maxImageWidth, $this->maxImageHeight); + unset($this->splitDefaultSize, $this->isGifImagesEnabled, $this->isReferencesAddedToToc); + unset($this->zip, $this->title, $this->language, $this->identifier, $this->identifierType); + unset($this->description, $this->author, $this->authorSortKey, $this->publisherName); + unset($this->publisherURL, $this->date, $this->rights, $this->coverage, $this->relation); + unset($this->sourceURL, $this->chapterCount, $this->opf, $this->ncx, $this->isFinalized); + unset($this->isCoverImageSet, $this->fileList, $this->writingDirection, $this->languageCode); + unset($this->referencesOrder, $this->dateformat, $this->dateformatShort, $this->headerDateFormat); + unset($this->isCurlInstalled, $this->isGdInstalled, $this->isExifInstalled); + unset($this->isFileGetContentsInstalled, $this->isFileGetContentsExtInstalled, $this->bookRoot); + unset($this->docRoot, $this->EPubMark, $this->generator, $this->log, $this->isLogging); + unset($this->encodeHTML, $this->mimetypes, $this->coreMediaTypes, $this->opsContentTypes); + unset($this->forbiddenCharacters, $this->htmlContentHeader, $this->htmlContentFooter); + unset($this->buildTOC, $this->tocTitle, $this->tocCSSClass, $this->tocAddReferences); + unset($this->tocFileName, $this->tocCssFileName); + } + + /** + * initialize defaults. 
+ */ + private function initialize() { + $this->referencesOrder = array( + Reference::COVER => "Cover Page", + Reference::TITLE_PAGE => "Title Page", + Reference::ACKNOWLEDGEMENTS => "Acknowledgements", + Reference::BIBLIOGRAPHY => "Bibliography", + Reference::COLOPHON => "Colophon", + Reference::COPYRIGHT_PAGE => "Copyright", + Reference::DEDICATION => "Dedication", + Reference::EPIGRAPH => "Epigraph", + Reference::FOREWORD => "Foreword", + Reference::TABLE_OF_CONTENTS => "Table of Contents", + Reference::NOTES => "Notes", + Reference::PREFACE => "Preface", + Reference::TEXT => "First Page", + Reference::LIST_OF_ILLUSTRATIONS => "List of Illustrations", + Reference::LIST_OF_TABLES => "List of Tables", + Reference::GLOSSARY => "Glossary", + Reference::INDEX => "Index"); + + $this->docRoot = filter_input(INPUT_SERVER, "DOCUMENT_ROOT") . "/"; + + $this->isCurlInstalled = extension_loaded('curl') && function_exists('curl_version'); + $this->isGdInstalled = extension_loaded('gd') && function_exists('gd_info'); + $this->isExifInstalled = extension_loaded('exif') && function_exists('exif_imagetype'); + $this->isFileGetContentsInstalled = function_exists('file_get_contents'); + $this->isFileGetContentsExtInstalled = $this->isFileGetContentsInstalled && ini_get('allow_url_fopen'); + + $this->zip = new Zip(); + $this->zip->setExtraField(FALSE); + $this->zip->addFile("application/epub+zip", "mimetype"); + $this->zip->setExtraField(TRUE); + $this->zip->addDirectory("META-INF"); + + $this->content = "\n\n\t\n\t\tbookRoot . "book.opf\" media-type=\"application/oebps-package+xml\" />\n\t\n\n"; + + if (!$this->isEPubVersion2()) { + $this->htmlContentHeader = "\n" + . "\n" + . "" + . "\n" + . "\n" + . "\n" + . 
"\n"; + } + + $this->zip->addFile($this->content, "META-INF/container.xml", 0, NULL, FALSE); + $this->content = NULL; + $this->ncx = new Ncx(NULL, NULL, NULL, $this->languageCode, $this->writingDirection); + $this->opf = new Opf(); + $this->ncx->setVersion($this->bookVersion); + $this->opf->setVersion($this->bookVersion); + $this->opf->addItem("ncx", "book.ncx", Ncx::MIMETYPE); + $this->chapterCount = 0; + } + + /** + * Add dynamically generated data as a file to the book. + * + * @param string $fileName Filename to use for the file, must be unique for the book. + * @param string $fileId Unique identifier for the file. + * @param string $fileData File data + * @param string $mimetype file mime type + * @return bool $success + */ + function addFile($fileName, $fileId, $fileData, $mimetype) { + if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) { + return FALSE; + } + + $fileName = $this->normalizeFileName($fileName); + + $compress = (strpos($mimetype, "image/") !== 0); + + $this->zip->addFile($fileData, $this->bookRoot.$fileName, 0, NULL, $compress); + $this->fileList[$fileName] = $fileName; + $this->opf->addItem($fileId, $fileName, $mimetype); + return TRUE; + } + + /** + * Add a large file directly from the filestystem to the book. + * + * @param string $fileName Filename to use for the file, must be unique for the book. + * @param string $fileId Unique identifier for the file. + * @param string $filePath File path + * @param string $mimetype file mime type + * @return bool $success + */ + function addLargeFile($fileName, $fileId, $filePath, $mimetype) { + if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) { + return FALSE; + } + $fileName = $this->normalizeFileName($fileName); + + if ($this->zip->addLargeFile($filePath, $this->bookRoot.$fileName)) { + $this->fileList[$fileName] = $fileName; + $this->opf->addItem($fileId, $fileName, $mimetype); + return TRUE; + } + return FALSE; + } + + /** + * Add a CSS file to the book. 
+ * + * @param string $fileName Filename to use for the CSS file, must be unique for the book. + * @param string $fileId Unique identifier for the file. + * @param string $fileData CSS data + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for processCSSExternalReferences for explanation. Default is EPub::EXTERNAL_REF_IGNORE. + * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE. + * + * @return bool $success + */ + function addCSSFile($fileName, $fileId, $fileData, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") { + if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) { + return FALSE; + } + $fileName = Zip::getRelativePath($fileName); + $fileName = preg_replace('#^[/\.]+#i', "", $fileName); + + if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) { + $cssDir = pathinfo($fileName); + $cssDir = preg_replace('#^[/\.]+#i', "", $cssDir["dirname"] . "/"); + if (!empty($cssDir)) { + $cssDir = preg_replace('#[^/]+/#i', "../", $cssDir); + } + + $this->processCSSExternalReferences($fileData, $externalReferences, $baseDir, $cssDir); + } + + $this->addFile($fileName, "css_" . $fileId, $fileData, "text/css"); + + return TRUE; + } + + /** + * Add a chapter to the book, as a chapter should not exceed 250kB, you can parse an array with multiple parts as $chapterData. + * These will still only show up as a single chapter in the book TOC. + * + * @param string $chapterName Name of the chapter, will be use din the TOC + * @param string $fileName Filename to use for the chapter, must be unique for the book. + * @param string $chapter Chapter text in XHTML or array $chapterData valid XHTML data for the chapter. File should NOT exceed 250kB. 
+ * @param bool $autoSplit Should the chapter be split if it exceeds the default split size? Default=FALSE, only used if $chapterData is a string. + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for processChapterExternalReferences for explanation. Default is EPub::EXTERNAL_REF_IGNORE. + * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE. + * @return mixed $success FALSE if the addition failed, else the new NavPoint. + */ + function addChapter($chapterName, $fileName, $chapterData = NULL, $autoSplit = FALSE, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") { + if ($this->isFinalized) { + return FALSE; + } + $fileName = Zip::getRelativePath($fileName); + $fileName = preg_replace('#^[/\.]+#i', "", $fileName); + + $chapter = $chapterData; + if ($autoSplit && is_string($chapterData) && mb_strlen($chapterData) > $this->splitDefaultSize) { + $splitter = new EPubChapterSplitter(); + + $chapterArray = $splitter->splitChapter($chapterData); + if (count($chapterArray) > 1) { + $chapter = $chapterArray; + } + } + + if (!empty($chapter) && is_string($chapter)) { + if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) { + $htmlDirInfo = pathinfo($fileName); + $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/"); + $this->processChapterExternalReferences($chapter, $externalReferences, $baseDir, $htmlDir); + } + + if ($this->encodeHTML === TRUE) { + $chapter = $this->encodeHtml($chapter); + } + + $this->chapterCount++; + $this->addFile($fileName, "chapter" . $this->chapterCount, $chapter, "application/xhtml+xml"); + $this->opf->addItemRef("chapter" . $this->chapterCount); + + $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . 
$this->chapterCount); + $this->ncx->addNavPoint($navPoint); + $this->ncx->chapterList[$chapterName] = $navPoint; + } else if (is_array($chapter)) { + $fileNameParts = pathinfo($fileName); + $extension = $fileNameParts['extension']; + $name = $fileNameParts['filename']; + + $partCount = 0; + $this->chapterCount++; + + $oneChapter = each($chapter); + while ($oneChapter) { + list($k, $v) = $oneChapter; + if ($this->encodeHTML === TRUE) { + $v = $this->encodeHtml($v); + } + + if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) { + $this->processChapterExternalReferences($v, $externalReferences, $baseDir); + } + $partCount++; + $partName = $name . "_" . $partCount; + $this->addFile($partName . "." . $extension, $partName, $v, "application/xhtml+xml"); + $this->opf->addItemRef($partName); + + $oneChapter = each($chapter); + } + $partName = $name . "_1." . $extension; + $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $partName, $partName); + $this->ncx->addNavPoint($navPoint); + + $this->ncx->chapterList[$chapterName] = $navPoint; + } else if (!isset($chapterData) && strpos($fileName, "#") > 0) { + $this->chapterCount++; + //$this->opf->addItemRef("chapter" . $this->chapterCount); + + $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount); + $this->ncx->addNavPoint($navPoint); + $this->ncx->chapterList[$chapterName] = $navPoint; + } else if (!isset($chapterData) && $fileName=="TOC.xhtml") { + $this->chapterCount++; + $this->opf->addItemRef("toc"); + + $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount); + $this->ncx->addNavPoint($navPoint); + $this->ncx->chapterList[$chapterName] = $navPoint; + } + return $navPoint; + } + + /** + * Add one chapter level. + * + * Subsequent chapters will be added to this level. 
+ * + * @param string $navTitle + * @param string $navId + * @param string $navClass + * @param int $isNavHidden + * @param string $writingDirection + * @return NavPoint The new NavPoint for that level. + */ + function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) { + return $this->ncx->subLevel($this->decodeHtmlEntities($navTitle), $navId, $navClass, $isNavHidden, $writingDirection); + } + + /** + * Step back one chapter level. + * + * Subsequent chapters will be added to this chapters parent level. + */ + function backLevel() { + $this->ncx->backLevel(); + } + + /** + * Step back to the root level. + * + * Subsequent chapters will be added to the rooot NavMap. + */ + function rootLevel() { + $this->ncx->rootLevel(); + } + + /** + * Step back to the given level. + * Useful for returning to a previous level from deep within the structure. + * Values below 2 will have the same effect as rootLevel() + * + * @param int $newLevel + */ + function setCurrentLevel($newLevel) { + $this->ncx->setCurrentLevel($newLevel); + } + + /** + * Get current level count. + * The indentation of the current structure point. + * + * @return current level count; + */ + function getCurrentLevel() { + return $this->ncx->getCurrentLevel(); + } + + /** + * Wrap ChapterContent with Head and Footer + * + * @param $content + * @return string $content + */ + private function wrapChapter($content) { + return $this->htmlContentHeader . "\n" . $content . "\n" . $this->htmlContentFooter; + } + + /** + * Reference pages is usually one or two pages for items such as Table of Contents, reference lists, Author notes or Acknowledgements. + * These do not show up in the regular navigation list. + * + * As they are supposed to be short. + * + * @param string $pageName Name of the chapter, will be use din the TOC + * @param string $fileName Filename to use for the chapter, must be unique for the book. 
+ * @param string $pageData Page content in XHTML. File should NOT exceed 250kB. + * @param string $reference Reference key + * @param int $externalReferences How to handle external references. See documentation for processChapterExternalReferences for explanation. Default is EPub::EXTERNAL_REF_IGNORE. + * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE. + * @return bool $success + */ + function addReferencePage($pageName, $fileName, $pageData, $reference, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") { + if ($this->isFinalized) { + return FALSE; + } + $fileName = Zip::getRelativePath($fileName); + $fileName = preg_replace('#^[/\.]+#i', "", $fileName); + + + if (!empty($pageData) && is_string($pageData)) { + if ($this->encodeHTML === TRUE) { + $pageData = $this->encodeHtml($pageData); + } + + $this->wrapChapter($pageData); + + if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) { + $htmlDirInfo = pathinfo($fileName); + $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/"); + $this->processChapterExternalReferences($pageData, $externalReferences, $baseDir, $htmlDir); + } + + $this->addFile($fileName, "ref_" . $reference, $pageData, "application/xhtml+xml"); + + if ($reference !== Reference::TABLE_OF_CONTENTS || !isset($this->ncx->referencesList[$reference])) { + $this->opf->addItemRef("ref_" . $reference, FALSE); + $this->opf->addReference($reference, $pageName, $fileName); + + $this->ncx->referencesList[$reference] = $fileName; + $this->ncx->referencesName[$reference] = $pageName; + } + return TRUE; + } + return TRUE; + } + + /** + * Add custom metadata to the book. + * + * It is up to the builder to make sure there are no collisions. Metadata are just key value pairs. 
+ * + * @param string $name + * @param string $content + */ + function addCustomMetadata($name, $content) { + $this->opf->addMeta($name, $content); + } + + /** + * Add DublinCore metadata to the book + * + * Use the DublinCore constants included in EPub, ie DublinCore::DATE + * + * @param string $dublinCore name + * @param string $value + */ + function addDublinCoreMetadata($dublinCoreConstant, $value) { + if ($this->isFinalized) { + return; + } + + $this->opf->addDCMeta($dublinCoreConstant, $this->decodeHtmlEntities($value)); + } + + /** + * Add a cover image to the book. + * If the $imageData is not set, the function assumes the $fileName is the path to the image file. + * + * The styling and structure of the generated XHTML is heavily inspired by the XHTML generated by Calibre. + * + * @param string $fileName Filename to use for the image, must be unique for the book. + * @param string $imageData Binary image data + * @param string $mimetype Image mimetype, such as "image/jpeg" or "image/png". + * @return bool $success + */ + function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL) { + if ($this->isFinalized || $this->isCoverImageSet || array_key_exists("CoverPage.html", $this->fileList)) { + return FALSE; + } + + if ($imageData == NULL) { + // assume $fileName is the valid file path. + if (!file_exists($fileName)) { + // Attempt to locate the file using the doc root. + $rp = realpath($this->docRoot . "/" . $fileName); + + if ($rp !== FALSE) { + // only assign the docroot path if it actually exists there. + $fileName = $rp; + } + } + $image = $this->getImage($fileName); + $imageData = $image['image']; + $mimetype = $image['mime']; + $fileName = preg_replace("#\.[^\.]+$#", "." . $image['ext'], $fileName); + } + + + $path = pathinfo($fileName); + $imgPath = "images/" . 
$path["basename"]; + + if (empty($mimetype) && file_exists($fileName)) { + list($width, $height, $type, $attr) = getimagesize($fileName); + $mimetype = image_type_to_mime_type($type); + } + if (empty($mimetype)) { + $ext = strtolower($path['extension']); + if ($ext == "jpg") { + $ext = "jpeg"; + } + $mimetype = "image/" . $ext; + } + + $coverPage = ""; + + if ($this->isEPubVersion2()) { + $coverPage = "\n" + . "\n" + . "\n" + . "\t\n" + . "\t\t\n" + . "\t\tCover Image\n" + . "\t\t\n" + . "\t\n" + . "\t\n" + . "\t\t
    \n" + . "\t\t\t\"Cover\n" + . "\t\t
    \n" + . "\t\n" + . "\n"; + } else { + $coverPage = "\n" + . "\n" + . "" + . "\t\n" + . "\t\tCover Image\n" + . "\t\t\n" + . "\t\n" + . "\t\n" + . "\t\t
    \n" + . "\t\t\t\"Cover\n" + . "\t\t
    \n" + . "\t\n" + . "\n"; + } + $coverPageCss = "@page, body, div, img {\n" + . "\tpadding: 0pt;\n" + . "\tmargin:0pt;\n" + . "}\n\nbody {\n" + . "\ttext-align: center;\n" + . "}\n"; + + $this->addCSSFile("Styles/CoverPage.css", "CoverPageCss", $coverPageCss); + $this->addFile($imgPath, "CoverImage", $imageData, $mimetype); + $this->addReferencePage("CoverPage", "CoverPage.xhtml", $coverPage, "cover"); + $this->isCoverImageSet = TRUE; + return TRUE; + } + + /** + * Process external references from a HTML to the book. The chapter itself is not stored. + * the HTML is scanned for <link..., <style..., and <img tags. + * Embedded CSS styles and links will also be processed. + * Script tags are not processed, as scripting should be avoided in e-books. + * + * EPub keeps track of added files, and duplicate files referenced across multiple + * chapters, are only added once. + * + * If the $doc is a string, it is assumed to be the content of an HTML file, + * else is it assumes to be a DOMDocument. + * + * Basedir is the root dir the HTML is supposed to "live" in, used to resolve + * relative references such as <img src="../images/image.png"/> + * + * $externalReferences determines how the function will handle external references. + * + * @param mixed &$doc (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * + * @return bool FALSE if uncuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). 
+ */ + protected function processChapterExternalReferences(&$doc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + + $backPath = preg_replace('#[^/]+/#i', "../", $htmlDir); + $isDocAString = is_string($doc); + $xmlDoc = NULL; + + if ($isDocAString) { + $xmlDoc = new DOMDocument(); + @$xmlDoc->loadHTML($doc); + } else { + $xmlDoc = $doc; + } + + $this->processChapterStyles($xmlDoc, $externalReferences, $baseDir, $htmlDir); + $this->processChapterLinks($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath); + $this->processChapterImages($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath); + $this->processChapterSources($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath); + + if ($isDocAString) { + //$html = $xmlDoc->saveXML(); + + $htmlNode = $xmlDoc->getElementsByTagName("html"); + $headNode = $xmlDoc->getElementsByTagName("head"); + $bodyNode = $xmlDoc->getElementsByTagName("body"); + + $htmlNS = ""; + for ($index = 0; $index < $htmlNode->item(0)->attributes->length; $index++) { + $nodeName = $htmlNode->item(0)->attributes->item($index)->nodeName; + $nodeValue = $htmlNode->item(0)->attributes->item($index)->nodeValue; + + if ($nodeName != "xmlns") { + $htmlNS .= " $nodeName=\"$nodeValue\""; + } + } + + $xml = new DOMDocument('1.0', "utf-8"); + $xml->lookupPrefix("http://www.w3.org/1999/xhtml"); + $xml->preserveWhiteSpace = FALSE; + $xml->formatOutput = TRUE; + + $xml2Doc = new DOMDocument('1.0', "utf-8"); + $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml"); + $xml2Doc->loadXML("\n\n\n\n"); + $html = $xml2Doc->getElementsByTagName("html")->item(0); + $html->appendChild($xml2Doc->importNode($headNode->item(0), TRUE)); + $html->appendChild($xml2Doc->importNode($bodyNode->item(0), TRUE)); + + // force pretty printing and correct formatting, should not be needed, but it is. 
+ $xml->loadXML($xml2Doc->saveXML()); + $doc = $xml->saveXML(); + + if (!$this->isEPubVersion2()) { + $doc = preg_replace('#^\s*\s*#im', '', $doc); + } + } + return TRUE; + } + + /** + * Process images referenced from an CSS file to the book. + * + * $externalReferences determins how the function will handle external references. + * + * @param string &$cssFile (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $cssDir The of the CSS file's directory from the root of the archive. + * + * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). + */ + protected function processCSSExternalReferences(&$cssFile, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $cssDir = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + + $backPath = preg_replace('#[^/]+/#i', "../", $cssDir); + $imgs = null; + preg_match_all('#url\s*\([\'\"\s]*(.+?)[\'\"\s]*\)#im', $cssFile, $imgs, PREG_SET_ORDER); + + $itemCount = count($imgs); + for ($idx = 0; $idx < $itemCount; $idx++) { + $img = $imgs[$idx]; + if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES || $externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) { + $cssFile = str_replace($img[0], "", $cssFile); + } else { + $source = $img[1]; + + $pathData = pathinfo($source); + $internalSrc = $pathData['basename']; + $internalPath = ""; + $isSourceExternal = FALSE; + + if ($this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $cssDir, $backPath)) { + $cssFile = str_replace($img[0], "url('" . $backPath . $internalPath . 
"')", $cssFile); + } else if ($isSourceExternal) { + $cssFile = str_replace($img[0], "", $cssFile); // External image is missing + } // else do nothing, if the image is local, and missing, assume it's been generated. + } + } + return TRUE; + } + + /** + * Process style tags in a DOMDocument. Styles will be passed as CSS files and reinserted into the document. + * + * @param DOMDocument &$xmlDoc (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * + * @return bool FALSE if uncuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). + */ + protected function processChapterStyles(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + // process inlined CSS styles in style tags. + $styles = $xmlDoc->getElementsByTagName("style"); + $styleCount = $styles->length; + for ($styleIdx = 0; $styleIdx < $styleCount; $styleIdx++) { + $style = $styles->item($styleIdx); + + $styleData = preg_replace('#[/\*\s]*\<\!\[CDATA\[[\s\*/]*#im', "", $style->nodeValue); + $styleData = preg_replace('#[/\*\s]*\]\]\>[\s\*/]*#im', "", $styleData); + + $this->processCSSExternalReferences($styleData, $externalReferences, $baseDir, $htmlDir); + $style->nodeValue = "\n" . trim($styleData) . "\n"; + } + return TRUE; + } + + /** + * Process link tags in a DOMDocument. Linked files will be loaded into the archive, and the link src will be rewritten to point to that location. + * Link types text/css will be passed as CSS files. 
+ * + * @param DOMDocument &$xmlDoc (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * @param string $backPath The path to get back to the root of the archive from $htmlDir. + * + * @return bool FALSE if uncuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). + */ + protected function processChapterLinks(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + // process link tags. + $links = $xmlDoc->getElementsByTagName("link"); + $linkCount = $links->length; + for ($linkIdx = 0; $linkIdx < $linkCount; $linkIdx++) { + $link = $links->item($linkIdx); + $source = $link->attributes->getNamedItem("href")->nodeValue; + $sourceData = NULL; + + $pathData = pathinfo($source); + $internalSrc = $pathData['basename']; + + if (preg_match('#^(http|ftp)s?://#i', $source) == 1) { + $urlinfo = parse_url($source); + + if (strpos($urlinfo['path'], $baseDir."/") !== FALSE) { + $internalSrc = substr($urlinfo['path'], strpos($urlinfo['path'], $baseDir."/") + strlen($baseDir) + 1); + } + + @$sourceData = getFileContents($source); + } else if (strpos($source, "/") === 0) { + @$sourceData = file_get_contents($this->docRoot . $source); + } else { + @$sourceData = file_get_contents($this->docRoot . $baseDir . "/" . 
$source); + } + + if (!empty($sourceData)) { + if (!array_key_exists($internalSrc, $this->fileList)) { + $mime = $link->attributes->getNamedItem("type")->nodeValue; + if (empty($mime)) { + $mime = "text/plain"; + } + if ($mime == "text/css") { + $this->processCSSExternalReferences($sourceData, $externalReferences, $baseDir, $htmlDir); + $this->addCSSFile($internalSrc, $internalSrc, $sourceData, EPub::EXTERNAL_REF_IGNORE, $baseDir); + $link->setAttribute("href", $backPath . $internalSrc); + } else { + $this->addFile($internalSrc, $internalSrc, $sourceData, $mime); + } + $this->fileList[$internalSrc] = $source; + } else { + $link->setAttribute("href", $backPath . $internalSrc); + } + } // else do nothing, if the link is local, and missing, assume it's been generated. + } + return TRUE; + } + + /** + * Process img tags in a DOMDocument. + * $externalReferences will determine what will happen to these images, and the img src will be rewritten accordingly. + * + * @param DOMDocument &$xmlDoc (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * @param string $backPath The path to get back to the root of the archive from $htmlDir. + * + * @return bool FALSE if uncuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). + */ + protected function processChapterImages(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + // process img tags. 
+ $postProcDomElememts = array(); + $images = $xmlDoc->getElementsByTagName("img"); + $itemCount = $images->length; + + for ($idx = 0; $idx < $itemCount; $idx++) { + $img = $images->item($idx); + + if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) { + $postProcDomElememts[] = $img; + } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) { + $altNode = $img->attributes->getNamedItem("alt"); + $alt = "image"; + if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) { + $alt = $altNode->nodeValue; + } + $postProcDomElememts[] = array($img, $this->createDomFragment($xmlDoc, "[" . $alt . "]")); + } else { + $source = $img->attributes->getNamedItem("src")->nodeValue; + + $parsedSource = parse_url($source); + $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($parsedSource['path'], PATHINFO_BASENAME))); + $internalPath = ""; + $isSourceExternal = FALSE; + + if ($this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) { + $img->setAttribute("src", $backPath . $internalPath); + } else if ($isSourceExternal) { + $postProcDomElememts[] = $img; // External image is missing + } // else do nothing, if the image is local, and missing, assume it's been generated. + } + } + + foreach ($postProcDomElememts as $target) { + if (is_array($target)) { + $target[0]->parentNode->replaceChild($target[1], $target[0]); + } else { + $target->parentNode->removeChild($target); + } + } + return TRUE; + } + + /** + * Process source tags in a DOMDocument. + * $externalReferences will determine what will happen to these images, and the img src will be rewritten accordingly. + * + * @param DOMDocument &$xmlDoc (referenced) + * @param int $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD. + * @param string $baseDir Default is "", meaning it is pointing to the document root. 
+ * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * @param string $backPath The path to get back to the root of the archive from $htmlDir. + * + * @return bool FALSE if uncuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE). + */ + protected function processChapterSources(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") { + if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) { + return FALSE; + } + + if ($this->bookVersion !== EPub::BOOK_VERSION_EPUB3) { + // ePub 2 does not support multimedia formats, and they must be removed. + $externalReferences = EPub::EXTERNAL_REF_REMOVE_IMAGES; + } + + $postProcDomElememts = array(); + $images = $xmlDoc->getElementsByTagName("source"); + $itemCount = $images->length; + for ($idx = 0; $idx < $itemCount; $idx++) { + $img = $images->item($idx); + if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) { + $postProcDomElememts[] = $img; + } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) { + $altNode = $img->attributes->getNamedItem("alt"); + $alt = "image"; + if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) { + $alt = $altNode->nodeValue; + } + $postProcDomElememts[] = array($img, $this->createDomFragment($xmlDoc, "[" . $alt . "]")); + } else { + $source = $img->attributes->getNamedItem("src")->nodeValue; + + $parsedSource = parse_url($source); + $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($parsedSource['path'], PATHINFO_BASENAME))); + $internalPath = ""; + $isSourceExternal = FALSE; + + if ($this->resolveMedia($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) { + $img->setAttribute("src", $backPath . 
$internalPath); + } else if ($isSourceExternal) { + $postProcDomElememts[] = $img; // External image is missing + } // else do nothing, if the image is local, and missing, assume it's been generated. + } + } + } + + /** + * Resolve an image src and determine it's target location and add it to the book. + * + * @param string $source Image Source link. + * @param string &$internalPath (referenced) Return value, will be set to the target path and name in the book. + * @param string &$internalSrc (referenced) Return value, will be set to the target name in the book. + * @param string &$isSourceExternal (referenced) Return value, will be set to TRUE if the image originated from a full URL. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * @param string $backPath The path to get back to the root of the archive from $htmlDir. + */ + protected function resolveImage($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") { + if ($this->isFinalized) { + return FALSE; + } + $imageData = NULL; + + if (preg_match('#^(http|ftp)s?://#i', $source) == 1) { + $urlinfo = parse_url($source); + $urlPath = pathinfo($urlinfo['path']); + + if (strpos($urlinfo['path'], $baseDir."/") !== FALSE) { + $internalSrc = $this->sanitizeFileName(urldecode(substr($urlinfo['path'], strpos($urlinfo['path'], $baseDir."/") + strlen($baseDir) + 1))); + } + $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlinfo["path"], PATHINFO_DIRNAME); + $isSourceExternal = TRUE; + $imageData = $this->getImage($source); + } else if (strpos($source, "/") === 0) { + $internalPath = pathinfo($source, PATHINFO_DIRNAME); + + $path = $source; + if (!file_exists($path)) { + $path = $this->docRoot . $path; + } + + $imageData = $this->getImage($path); + } else { + $internalPath = $htmlDir . "/" . 
preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME)); + + $path = $baseDir . "/" . $source; + if (!file_exists($path)) { + $path = $this->docRoot . $path; + } + + $imageData = $this->getImage($path); + } + if ($imageData !== FALSE) { + $iSrcInfo = pathinfo($internalSrc); + if (!empty($imageData['ext']) && $imageData['ext'] != $iSrcInfo['extension']) { + $internalSrc = $iSrcInfo['filename'] . "." . $imageData['ext']; + } + $internalPath = Zip::getRelativePath("images/" . $internalPath . "/" . $internalSrc); + if (!array_key_exists($internalPath, $this->fileList)) { + $this->addFile($internalPath, "i_" . $internalSrc, $imageData['image'], $imageData['mime']); + $this->fileList[$internalPath] = $source; + } + return TRUE; + } + return FALSE; + } + + /** + * Resolve a media src and determine it's target location and add it to the book. + * + * @param string $source Source link. + * @param string $internalPath (referenced) Return value, will be set to the target path and name in the book. + * @param string $internalSrc (referenced) Return value, will be set to the target name in the book. + * @param string $isSourceExternal (referenced) Return value, will be set to TRUE if the image originated from a full URL. + * @param string $baseDir Default is "", meaning it is pointing to the document root. + * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive. + * @param string $backPath The path to get back to the root of the archive from $htmlDir. 
+ */ + protected function resolveMedia($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") { + if ($this->isFinalized) { + return FALSE; + } + $mediaPath = NULL; + $tmpFile; + + if (preg_match('#^(http|ftp)s?://#i', $source) == 1) { + $urlinfo = parse_url($source); + + if (strpos($urlinfo['path'], $baseDir."/") !== FALSE) { + $internalSrc = substr($urlinfo['path'], strpos($urlinfo['path'], $baseDir."/") + strlen($baseDir) + 1); + } + $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlinfo["path"], PATHINFO_DIRNAME); + $isSourceExternal = TRUE; + $mediaPath = $this->getFileContents($source, true); + $tmpFile = $mediaPath; + } else if (strpos($source, "/") === 0) { + $internalPath = pathinfo($source, PATHINFO_DIRNAME); + + $mediaPath = $source; + if (!file_exists($mediaPath)) { + $mediaPath = $this->docRoot . $mediaPath; + } + } else { + $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME)); + + $mediaPath = $baseDir . "/" . $source; + if (!file_exists($mediaPath)) { + $mediaPath = $this->docRoot . $mediaPath; + } + } + + if ($mediaPath !== FALSE) { + $mime = $this->getMime($source); + $internalPath = Zip::getRelativePath("media/" . $internalPath . "/" . $internalSrc); + + if (!array_key_exists($internalPath, $this->fileList) && + $this->addLargeFile($internalPath, "m_" . $internalSrc, $mediaPath, $mime)) { + $this->fileList[$internalPath] = $source; + } + if (isset($tmpFile)) { + unlink($tmpFile); + } + return TRUE; + } + return FALSE; + } + + /** + * Get Book Chapter count. + * + * @access public + * @return number of chapters + */ + function getChapterCount() { + return $this->chapterCount; + } + + /** + * Book title, mandatory. + * + * Used for the dc:title metadata parameter in the OPF file as well as the DocTitle attribute in the NCX file. 
+ * + * @param string $title + * @access public + * @return bool $success + */ + function setTitle($title) { + if ($this->isFinalized) { + return FALSE; + } + $this->title = $title; + return TRUE; + } + + /** + * Get Book title. + * + * @access public + * @return $title + */ + function getTitle() { + return $this->title; + } + + /** + * Book language, mandatory + * + * Use the RFC3066 Language codes, such as "en", "da", "fr" etc. + * Defaults to "en". + * + * Used for the dc:language metadata parameter in the OPF file. + * + * @param string $language + * @access public + * @return bool $success + */ + function setLanguage($language) { + if ($this->isFinalized || mb_strlen($language) != 2) { + return FALSE; + } + $this->language = $language; + return TRUE; + } + + /** + * Get Book language. + * + * @access public + * @return $language + */ + function getLanguage() { + return $this->language; + } + + /** + * Unique book identifier, mandatory. + * Use the URI, or ISBN if available. + * + * An unambiguous reference to the resource within a given context. + * + * Recommended best practice is to identify the resource by means of a + * string conforming to a formal identification system. + * + * Used for the dc:identifier metadata parameter in the OPF file, as well + * as dtb:uid in the NCX file. + * + * Identifier type should only be: + * EPub::IDENTIFIER_URI + * EPub::IDENTIFIER_ISBN + * EPub::IDENTIFIER_UUID + * + * @param string $identifier + * @param string $identifierType + * @access public + * @return bool $success + */ + function setIdentifier($identifier, $identifierType) { + if ($this->isFinalized || ($identifierType !== EPub::IDENTIFIER_URI && $identifierType !== EPub::IDENTIFIER_ISBN && $identifierType !== EPub::IDENTIFIER_UUID)) { + return FALSE; + } + $this->identifier = $identifier; + $this->identifierType = $identifierType; + return TRUE; + } + + /** + * Get Book identifier. 
+ * + * @access public + * @return $identifier + */ + function getIdentifier() { + return $this->identifier; + } + + /** + * Get Book identifierType. + * + * @access public + * @return $identifierType + */ + function getIdentifierType() { + return $this->identifierType; + } + + /** + * Book description, optional. + * + * An account of the resource. + * + * Description may include but is not limited to: an abstract, a table of + * contents, a graphical representation, or a free-text account of the + * resource. + * + * Used for the dc:source metadata parameter in the OPF file + * + * @param string $description + * @access public + * @return bool $success + */ + function setDescription($description) { + if ($this->isFinalized) { + return FALSE; + } + $this->description = $description; + return TRUE; + } + + /** + * Get Book description. + * + * @access public + * @return $description + */ + function getDescription() { + return $this->description; + } + + /** + * Book author or creator, optional. + * The $authorSortKey is basically how the name is to be sorted, usually + * it's "Lastname, First names" where the $author is the straight + * "Firstnames Lastname" + * + * An entity primarily responsible for making the resource. + * + * Examples of a Creator include a person, an organization, or a service. + * Typically, the name of a Creator should be used to indicate the entity. + * + * Used for the dc:creator metadata parameter in the OPF file and the + * docAuthor attribure in the NCX file. + * The sort key is used for the opf:file-as attribute in dc:creator. + * + * @param string $author + * @param string $authorSortKey + * @access public + * @return bool $success + */ + function setAuthor($author, $authorSortKey) { + if ($this->isFinalized) { + return FALSE; + } + $this->author = $author; + $this->authorSortKey = $authorSortKey; + return TRUE; + } + + /** + * Get Book author. 
+ * + * @access public + * @return $author + */ + function getAuthor() { + return $this->author; + } + + /** + * Publisher Information, optional. + * + * An entity responsible for making the resource available. + * + * Examples of a Publisher include a person, an organization, or a service. + * Typically, the name of a Publisher should be used to indicate the entity. + * + * Used for the dc:publisher and dc:relation metadata parameters in the OPF file. + * + * @param string $publisherName + * @param string $publisherURL + * @access public + * @return bool $success + */ + function setPublisher($publisherName, $publisherURL) { + if ($this->isFinalized) { + return FALSE; + } + $this->publisherName = $publisherName; + $this->publisherURL = $publisherURL; + return TRUE; + } + + /** + * Get Book publisherName. + * + * @access public + * @return $publisherName + */ + function getPublisherName() { + return $this->publisherName; + } + + /** + * Get Book publisherURL. + * + * @access public + * @return $publisherURL + */ + function getPublisherURL() { + return $this->publisherURL; + } + + /** + * Release date, optional. If left blank, the time of the finalization will + * be used. + * + * A point or period of time associated with an event in the lifecycle of + * the resource. + * + * Date may be used to express temporal information at any level of + * granularity. Recommended best practice is to use an encoding scheme, + * such as the W3CDTF profile of ISO 8601 [W3CDTF]. + * + * Used for the dc:date metadata parameter in the OPF file + * + * @param long $timestamp + * @access public + * @return bool $success + */ + function setDate($timestamp) { + if ($this->isFinalized) { + return FALSE; + } + $this->date = $timestamp; + $this->opf->date = $timestamp; + return TRUE; + } + + /** + * Get Book date. + * + * @access public + * @return $date + */ + function getDate() { + return $this->date; + } + + /** + * Book (copy)rights, optional. 
*
* Information about rights held in and over the resource.
*
* Typically, rights information includes a statement about various
* property rights associated with the resource, including intellectual
* property rights.
*
* Used for the dc:rights metadata parameter in the OPF file
*
* @param string $rightsText
* @access public
* @return bool $success
*/
    public function setRights($rightsText) {
        if (!$this->isFinalized) {
            $this->rights = $rightsText;
            return TRUE;
        }
        return FALSE;
    }

    /**
     * Get the book rights statement.
     *
     * @access public
     * @return $rights
     */
    public function getRights() {
        return $this->rights;
    }

    /**
     * Add a book subject (the topic of the resource).
     *
     * Typically, the subject will be represented using keywords, key phrases,
     * or classification codes. Recommended best practice is to use a
     * controlled vocabulary. To describe the spatial or temporal topic of the
     * resource, use the Coverage element.
     *
     * Unlike the other setters this one writes straight to the OPF metadata
     * (DublinCore::SUBJECT) and returns nothing; it does nothing once the book
     * is finalized.
     *
     * @param string $subject
     */
    public function setSubject($subject) {
        if (!$this->isFinalized) {
            $plainSubject = $this->decodeHtmlEntities($subject);
            $this->opf->addDCMeta(DublinCore::SUBJECT, $plainSubject);
        }
    }

    /**
     * Set the optional book source URL.
     *
     * A related resource from which the described resource is derived, in
     * whole or in part. Recommended best practice is to identify the related
     * resource by means of a string conforming to a formal identification
     * system.
     *
     * Used for the dc:source metadata parameter in the OPF file
     *
     * @param string $sourceURL
     * @access public
     * @return bool TRUE when stored, FALSE when the book is already finalized.
     */
    public function setSourceURL($sourceURL) {
        if (!$this->isFinalized) {
            $this->sourceURL = $sourceURL;
            return TRUE;
        }
        return FALSE;
    }

    /**
     * Get the book source URL.
     *
     * @access public
     * @return $sourceURL
     */
    public function getSourceURL() {
        return $this->sourceURL;
    }

    /**
     * Coverage, optional.
*
* The spatial or temporal topic of the resource, the spatial applicability
* of the resource, or the jurisdiction under which the resource is relevant.
*
* Spatial topic and spatial applicability may be a named place or a location
* specified by its geographic coordinates. Temporal topic may be a named
* period, date, or date range. A jurisdiction may be a named administrative
* entity or a geographic place to which the resource applies. Recommended
* best practice is to use a controlled vocabulary such as the Thesaurus of
* Geographic Names [TGN]. Where appropriate, named places or time periods
* can be used in preference to numeric identifiers such as sets of
* coordinates or date ranges.
*
* Used for the dc:coverage metadata parameter in the OPF file
*
* Same as ->addDublinCoreMetadata(DublinCore::COVERAGE, $coverage);
*
* @param string $coverage
* @access public
* @return bool $success
*/
    public function setCoverage($coverage) {
        if (!$this->isFinalized) {
            $this->coverage = $coverage;
            return TRUE;
        }
        return FALSE;
    }

    /**
     * Get the book coverage.
     *
     * @access public
     * @return $coverage
     */
    public function getCoverage() {
        return $this->coverage;
    }

    /**
     * Set the book relation (a related resource).
     *
     * Recommended best practice is to identify the related resource by means
     * of a string conforming to a formal identification system.
     *
     * Returns nothing; the call is ignored once the book is finalized.
     *
     * @param string $relation
     */
    public function setRelation($relation) {
        if (!$this->isFinalized) {
            $this->relation = $relation;
        }
    }

    /**
     * Get the book relation.
     *
     * @return string The relation.
     */
    public function getRelation() {
        return $this->relation;
    }

    /**
     * Set book Generator.
     *
     * The generator is a meta tag added to the ncx file, it is not visible
     * from within the book, but is a kind of electronic watermark.
*
* @param string $generator
*/
    public function setGenerator($generator) {
        if (!$this->isFinalized) {
            $this->generator = $generator;
        }
    }

    /**
     * Get the book generator.
     *
     * @return string The generator identity string.
     */
    public function getGenerator() {
        return $this->generator;
    }

    /**
     * Switch the ePub date format to the short yyyy-mm-dd form, for
     * compliance with a bug in EpubCheck, prior to its version 1.1.
     *
     * The latest version of ePubCheck can be obtained here:
     * http://code.google.com/p/epubcheck/
     *
     * @access public
     * @return bool TRUE when applied, FALSE when the book is already finalized.
     */
    public function setShortDateFormat() {
        if (!$this->isFinalized) {
            $this->dateformat = $this->dateformatShort;
            return TRUE;
        }
        return FALSE;
    }

    /**
     * No longer supported: calling this terminates the script with a message.
     *
     * @deprecated
     */
    public function setIgnoreEmptyBuffer($ignoreEmptyBuffer = TRUE) {
        die ("Function was deprecated, functionality is no longer needed.");
    }

    /**
     * Set the title, id and class of the references (ePub 3 landmarks) section.
     *
     * Each string argument is trimmed before it is stored on the NCX object;
     * a non-string argument is replaced by a fallback ("Guide", "references"
     * and "references" respectively).
     *
     * @param string $referencesTitle
     * @param string $referencesId
     * @param string $referencesClass
     * @return bool TRUE when applied, FALSE when the book is already finalized.
     */
    public function setReferencesTitle($referencesTitle = "Guide", $referencesId = "", $referencesClass = "references") {
        if ($this->isFinalized) {
            return FALSE;
        }

        $title = "Guide";
        if (is_string($referencesTitle)) {
            $title = trim($referencesTitle);
        }

        $id = "references";
        if (is_string($referencesId)) {
            $id = trim($referencesId);
        }

        $class = "references";
        if (is_string($referencesClass)) {
            $class = trim($referencesClass);
        }

        $this->ncx->referencesTitle = $title;
        $this->ncx->referencesId = $id;
        $this->ncx->referencesClass = $class;
        return TRUE;
    }

    /**
     * Choose whether finalize() asks the NCX to add the reference pages
     * (it calls finalizeReferences() only when this flag is set).
     *
     * Only the boolean TRUE enables the flag; any other value disables it.
     *
     * @param bool $isReferencesAddedToToc
     * @return bool TRUE when applied, FALSE when the book is already finalized.
     */
    public function setisReferencesAddedToToc($isReferencesAddedToToc = TRUE) {
        if (!$this->isFinalized) {
            $this->isReferencesAddedToToc = ($isReferencesAddedToToc === TRUE);
            return TRUE;
        }
        return FALSE;
    }

    /**
     * Get Book status.
+ * + * @access public + * @return bool + */ + function isFinalized() { + return $this->isFinalized; + } + + /** + * Build the Table of Contents. This is not strictly necessary, as most eReaders will build it from the navigation structure in the .ncx file. + * + * @param string $cssFileName Include a link to this css file in the TOC html. + * @param string $tocCSSClass The TOC is a
    , if you need special formatting, you can add a css class for that div. Default is "toc". + * @param string $title Title of the Table of contents. Default is "Table of Contents". Use this for ie. languages other than English. + * @param bool $addReferences include reference pages in the TOC, using the $referencesOrder array to determine the order of the pages in the TOC. Default is TRUE. + * @param bool $addToIndex Add the TOC to the NCX index at the current leve/position. Default is FALSE + * @param string $tocFileName Change teh default name of the TOC file. The default is "TOC.xhtml" + */ + function buildTOC($cssFileName = NULL, $tocCSSClass = "toc", $title = "Table of Contents", $addReferences = TRUE, $addToIndex = FALSE, $tocFileName = "TOC.xhtml") { + if ($this->isFinalized) { + return FALSE; + } + $this->buildTOC = TRUE; + $this->tocTitle = $title; + $this->tocFileName = $this->normalizeFileName($tocFileName); + if (!empty($cssFileName)) { + $this->tocCSSFileName = $this->normalizeFileName($cssFileName); + } + $this->tocCSSClass = $tocCSSClass; + $this->tocAddReferences = $addReferences; + + $this->opf->addItemRef("ref_" . Reference::TABLE_OF_CONTENTS, FALSE); + $this->opf->addReference(Reference::TABLE_OF_CONTENTS, $title, $this->tocFileName); + + if ($addToIndex) { + $navPoint = new NavPoint($this->decodeHtmlEntities($title), $this->tocFileName, "ref_" . Reference::TABLE_OF_CONTENTS); + $this->ncx->addNavPoint($navPoint); + } else { + $this->ncx->referencesList[Reference::TABLE_OF_CONTENTS] = $this->tocFileName; + $this->ncx->referencesName[Reference::TABLE_OF_CONTENTS] = $title; + } + } + + private function finalizeTOC() { + if (!$this->buildTOC) { + return FALSE; + } + + if (empty($this->tocTitle)) { + $this->tocTitle = "Table of Contents"; + } + + $tocData = "\n"; + + if ($this->isEPubVersion2()) { + $tocData .= "\n" + . "\n" + . "\n\n"; + } else { + $tocData .= "\n" + . 
"\n\n"; + } + + if (!empty($this->tocCssFileName)) { + $tocData .= "tocCssFileName . "\" />\n"; + } + + $tocData .= "" . $this->tocTitle . "\n" + . "\n" + . "\n" + . "

    " . $this->tocTitle . "

    \ntocCSSClass)) { + $tocData .= " class=\"" . $this->tocCSSClass . "\""; + } + $tocData .= ">\n"; + + while (list($item, $descriptive) = each($this->referencesOrder)) { + if ($item === "text") { + while (list($chapterName, $navPoint) = each($this->ncx->chapterList)) { + $fileName = $navPoint->getContentSrc(); + $level = $navPoint->getLevel() -2; + $tocData .= "\t

    " . str_repeat("      ", $level) . "" . $chapterName . "

    \n"; + } + } else if ($this->tocAddReferences === TRUE) { + if (array_key_exists($item, $this->ncx->referencesList)) { + $tocData .= "\t

    ncx->referencesList[$item] . "\">" . $descriptive . "

    \n"; + } else if ($item === "toc") { + $tocData .= "\t

    " . $this->tocTitle . "

    \n"; + } else if ($item === "cover" && $this->isCoverImageSet) { + $tocData .= "\t

    " . $descriptive . "

    \n"; + } + } + } + $tocData .= "
    \n\n\n"; + + $this->addReferencePage($this->tocTitle, $this->tocFileName, $tocData, Reference::TABLE_OF_CONTENTS); + + } + + /** + * @return bool + */ + function isEPubVersion2() { + return $this->bookVersion === EPub::BOOK_VERSION_EPUB2; + } + + /** + * @param string $cssFileName + * @param string $title + * @return string + */ + function buildEPub3TOC($cssFileName = NULL, $title = "Table of Contents") { + $this->ncx->referencesOrder = $this->referencesOrder; + $this->ncx->setDocTitle($this->decodeHtmlEntities($this->title)); + return $this->ncx->finalizeEPub3($title, $cssFileName); + } + + /** + * @param string $fileName + * @param string $tocData + * @return bool + */ + function addEPub3TOC($fileName, $tocData) { + if ($this->isEPubVersion2() || $this->isFinalized || array_key_exists($fileName, $this->fileList)) { + return FALSE; + } + $fileName = Zip::getRelativePath($fileName); + $fileName = preg_replace('#^[/\.]+#i', "", $fileName); + + $this->zip->addFile($tocData, $this->bookRoot.$fileName); + + $this->fileList[$fileName] = $fileName; + $this->opf->addItem("toc", $fileName, "application/xhtml+xml", "nav"); + return TRUE; + } + + /** + * Check for mandatory parameters and finalize the e-book. + * Once finalized, the book is locked for further additions. 
*
* @return bool $success
*/
    public function finalize() {
        if ($this->isFinalized || $this->chapterCount == 0 || empty($this->title) || empty($this->language)) {
            return FALSE;
        }

        // Fill in defaults for the optional values that were never set.
        if (empty($this->identifier) || empty($this->identifierType)) {
            $this->setIdentifier($this->createUUID(4), EPub::IDENTIFIER_UUID);
        }

        if ($this->date == 0) {
            $this->date = time();
        }

        if (empty($this->sourceURL)) {
            $this->sourceURL = $this->getCurrentPageURL();
        }

        // The server URL is the default *publisher* URL. The previous code
        // assigned it to sourceURL, which discarded the page URL chosen just
        // above and left publisherURL empty.
        if (empty($this->publisherURL)) {
            $this->publisherURL = $this->getCurrentServerURL();
        }

        // Generate OPF data:
        $this->opf->setIdent("BookId");
        $this->opf->initialize($this->title, $this->language, $this->identifier, $this->identifierType);

        $DCdate = new DublinCore(DublinCore::DATE, gmdate($this->dateformat, $this->date));
        $DCdate->addOpfAttr("event", "publication");
        $this->opf->metadata->addDublinCore($DCdate);

        if (!empty($this->description)) {
            $this->opf->addDCMeta(DublinCore::DESCRIPTION, $this->decodeHtmlEntities($this->description));
        }

        if (!empty($this->publisherName)) {
            $this->opf->addDCMeta(DublinCore::PUBLISHER, $this->decodeHtmlEntities($this->publisherName));
        }

        if (!empty($this->publisherURL)) {
            $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->publisherURL));
        }

        if (!empty($this->author)) {
            $author = $this->decodeHtmlEntities($this->author);
            $this->opf->addCreator($author, $this->decodeHtmlEntities($this->authorSortKey), MarcCode::AUTHOR);
            $this->ncx->setDocAuthor($author);
        }

        if (!empty($this->rights)) {
            $this->opf->addDCMeta(DublinCore::RIGHTS, $this->decodeHtmlEntities($this->rights));
        }

        if (!empty($this->coverage)) {
            $this->opf->addDCMeta(DublinCore::COVERAGE, $this->decodeHtmlEntities($this->coverage));
        }

        if (!empty($this->sourceURL)) {
            $this->opf->addDCMeta(DublinCore::SOURCE, $this->sourceURL);
        }

        if (!empty($this->relation)) {
            $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->relation));
        }

        if ($this->isCoverImageSet) {
            $this->opf->addMeta("cover", "coverImage");
        }

        if (!empty($this->generator)) {
            $gen = $this->decodeHtmlEntities($this->generator);
            $this->opf->addMeta("generator", $gen);
            $this->ncx->addMetaEntry("dtb:generator", $gen);
        }

        if ($this->EPubMark) {
            $this->opf->addMeta("generator", "EPub (Version " . self::VERSION . ") by A. Grandt, http://www.phpclasses.org/package/6115");
        }

        // The first chapter becomes the "text" reference. each() no longer
        // exists in PHP 8, so read the first entry with key()/current() and
        // then advance the pointer exactly as each() used to, because later
        // code iterates the same array from its current position.
        reset($this->ncx->chapterList);
        $firstChapterName = key($this->ncx->chapterList);
        $firstChapterNavPoint = current($this->ncx->chapterList);
        next($this->ncx->chapterList);
        $firstChapterFileName = $firstChapterNavPoint->getContentSrc();
        $this->opf->addReference(Reference::TEXT, $this->decodeHtmlEntities($firstChapterName), $firstChapterFileName);

        $this->ncx->setUid($this->identifier);

        $this->ncx->setDocTitle($this->decodeHtmlEntities($this->title));

        $this->ncx->referencesOrder = $this->referencesOrder;
        if ($this->isReferencesAddedToToc) {
            $this->ncx->finalizeReferences();
        }

        $this->finalizeTOC();

        if (!$this->isEPubVersion2()) {
            $this->addEPub3TOC("epub3toc.xhtml", $this->buildEPub3TOC());
        }

        // Both documents are rendered first, then stored (OPF before NCX),
        // converting to UTF-8 only when the rendered text is not already UTF-8.
        $documents = array(
            "book.opf" => $this->fixEncoding($this->opf->finalize()),
            "book.ncx" => $this->fixEncoding($this->ncx->finalize()),
        );
        foreach ($documents as $documentName => $documentData) {
            if (mb_detect_encoding($documentData, 'UTF-8', true) !== "UTF-8") {
                $documentData = mb_convert_encoding($documentData, "UTF-8");
            }
            $this->zip->addFile($documentData, $this->bookRoot . $documentName);
        }

        // Drop the builders; the book is locked from here on.
        $this->opf = NULL;
        $this->ncx = NULL;

        $this->isFinalized = TRUE;
        return TRUE;
    }

    /**
     * Ensure the encoded string is a valid UTF-8 string.
*
* Note, that a mb_detect_encoding on the returned string will still return ASCII if the entire string is comprised of characters in the 1-127 range.
*
* Input that already is valid UTF-8 (which includes plain ASCII) is returned
* untouched; anything else is treated as ISO-8859-1 and converted, which is
* what the deprecated utf8_encode() call used here before did.
*
* @link: http://snippetdb.com/php/convert-string-to-utf-8-for-mysql
* @param string $in_str
* @return string converted string.
*/
    public function fixEncoding($in_str) {
        // mb_check_encoding alone decides: relying on mb_detect_encoding as
        // well made valid UTF-8 get re-encoded whenever the configured detect
        // order reported some other charset first.
        if (mb_check_encoding($in_str, "UTF-8")) {
            return $in_str;
        }
        return mb_convert_encoding($in_str, "UTF-8", "ISO-8859-1");
    }

    /**
     * Return the finalized book, finalizing it first when necessary.
     *
     * @return string with the book in binary form.
     */
    public function getBook() {
        if (!$this->isFinalized) {
            $this->finalize();
        }

        return $this->zip->getZipData();
    }

    /**
     * Remove disallowed characters from string to get a nearly safe filename.
     *
     * Every entry of $this->forbiddenCharacters is removed, each run of
     * whitespace and/or hyphens collapses into a single "-", and leading or
     * trailing ".", "-" and "_" characters are trimmed.
     *
     * @param string $fileName
     * @return mixed|string
     */
    public function sanitizeFileName($fileName) {
        $withoutForbidden = str_replace($this->forbiddenCharacters, '', $fileName);
        $hyphenated = preg_replace('/[\s-]+/', '-', $withoutForbidden);
        return trim($hyphenated, '.-_');
    }

    /**
     * Cleanup the filepath, and remove leading . and / characters.
     *
     * Sometimes, when a path is generated from multiple fragments,
     * you can get something like "../data/html/../images/image.jpeg"
     * ePub files don't work well with that, this will normalize that
     * example path to "data/images/image.jpeg"
     *
     * @param string $fileName
     * @return string normalized filename
     */
    public function normalizeFileName($fileName) {
        return preg_replace('#^[/\.]+#i', "", Zip::getRelativePath($fileName));
    }

    /**
     * Save the ePub file to local disk.
     *
     * @param string $fileName
     * @param string $baseDir If empty baseDir is absolute to server path, if omitted it's relative to script path
     * @return The sent file name if successful, FALSE if it failed.
*/
    public function saveBook($fileName, $baseDir = '.') {

        // Make fileName safe
        $fileName = $this->sanitizeFileName($fileName);

        // Finalize book, if it's not done already
        if (!$this->isFinalized) {
            $this->finalize();
        }

        // Append the extension unless the name already ends in ".epub"
        // (case-insensitive).
        if (strcasecmp(substr($fileName, -5), ".epub") !== 0) {
            $fileName .= ".epub";
        }

        // The archive is binary data, so open in binary mode.
        $fh = fopen($baseDir.'/'.$fileName, "wb");
        if (!$fh) {
            return FALSE;
        }

        $bookData = $this->getBook();
        $written = fwrite($fh, $bookData);
        fclose($fh);

        // A failed or short write used to be reported as success.
        if ($written === FALSE || $written < strlen($bookData)) {
            return FALSE;
        }

        return $fileName;
    }

    /**
     * Return the finalized book size, finalizing the book first when necessary.
     *
     * @return string
     */
    public function getBookSize() {
        if (!$this->isFinalized) {
            $this->finalize();
        }

        return $this->zip->getArchiveSize();
    }

    /**
     * Send the book as a zip download
     *
     * Sending will fail if the output buffer is in use. The
     * setIgnoreEmptyBuffer() override mentioned by older documentation is
     * deprecated and now terminates the script, so it cannot be used to
     * bypass that limit.
     *
     * Unlike saveBook(), the file name is not sanitized here.
     *
     * @param string $fileName The name of the book without the .epub at the end.
     * @return The sent file name if successful, FALSE if it failed.
     */
    public function sendBook($fileName) {
        if (!$this->isFinalized) {
            $this->finalize();
        }

        if (strcasecmp(substr($fileName, -5), ".epub") !== 0) {
            $fileName .= ".epub";
        }

        if (TRUE === $this->zip->sendZip($fileName, "application/epub+zip")) {
            return $fileName;
        }
        return FALSE;
    }

    /**
     * Generates an UUID.
     *
     * Default version (4) will generate a random UUID, version 3 will generate a URL based UUID.
     *
     * Added for convenience
     *
     * @param int $bookVersion UUID version to retrieve, See lib.uuid.manual.html for details.
     * @param string $url
     * @return string The formatted uuid
     */
    public function createUUID($bookVersion = 4, $url = NULL) {
        include_once("lib.uuid.php");
        return UUID::mint($bookVersion, $url, UUID::nsURL);
    }

    /**
     * Get the url of the current page.
* Example use: Default Source URL
*
* @return string Page URL.
*/
    public function getCurrentPageURL() {
        // The server URL always ends in "/" and REQUEST_URI normally starts
        // with one; strip the latter so the result has no "//" before the path.
        $requestUri = (string) filter_input(INPUT_SERVER, "REQUEST_URI");
        return $this->getCurrentServerURL() . ltrim($requestUri, '/');
    }

    /**
     * Get the url of the server.
     * Example use: Default Publisher URL
     *
     * The port is appended only when it is known and differs from the default
     * port of the scheme (80 for http, 443 for https).
     *
     * @return string Server URL, always ending in "/".
     */
    public function getCurrentServerURL() {
        $isHttps = filter_input(INPUT_SERVER, "HTTPS") === "on";
        $port = (string) filter_input(INPUT_SERVER, "SERVER_PORT");

        $serverURL = ($isHttps ? "https" : "http") . "://" . filter_input(INPUT_SERVER, "SERVER_NAME");

        // Previously any port other than "80" was appended, which produced
        // "https://host:443/" and, with no port available, "http://host:/".
        $defaultPort = $isHttps ? "443" : "80";
        if ($port !== "" && $port !== $defaultPort) {
            $serverURL .= ":" . $port;
        }
        return $serverURL . '/';
    }

    /**
     * Try to determine the mimetype of the file path.
     *
     * The file extension is looked up in $this->mimetypes.
     *
     * @param string $source Path
     * @return string mimetype, or FALSE when the extension is not listed.
     */
    public function getMime($source) {
        $extension = pathinfo($source, PATHINFO_EXTENSION);
        // An unknown extension used to raise an undefined-index notice and
        // yield NULL instead of the documented FALSE.
        if (isset($this->mimetypes[$extension])) {
            return $this->mimetypes[$extension];
        }
        return FALSE;
    }

    /**
     * Get an image from a file or url, return it resized if the image exceeds the $maxImageWidth or $maxImageHeight directives.
     *
     * The return value is an array.
     * ['width'] is the width of the image.
     * ['height'] is the height of the image.
     * ['mime'] is the mime type of the image. Re-encoded images stay PNG when the source is PNG and become jpeg otherwise.
     * ['image'] is the image data.
     * ['ext'] is the extension of the image file.
     *
     * @param string $source path or url to file.
* @return array|bool the array described above, or FALSE when the data cannot be read or decoded.
*/
    public function getImage($source) {
        $image = $this->getFileContents($source);
        if ($image === FALSE || strlen($image) === 0) {
            return FALSE;
        }

        // Decode once; the same handle is reused for measuring and resizing
        // (it used to be decoded twice and the first handle never released).
        $imageFile = imagecreatefromstring($image);
        if ($imageFile === FALSE) {
            return FALSE;
        }

        $width = imagesx($imageFile);
        $height = imagesy($imageFile);
        if ($width <= 0 || $height <= 0) {
            imagedestroy($imageFile);
            return FALSE;
        }

        // Mime detection: exif first, then the magic bytes, then the URL extension.
        $mime = "application/octet-stream";
        if ($this->isExifInstalled) {
            $type = @exif_imagetype($source);
            if ($type !== FALSE) {
                $mime = image_type_to_mime_type($type);
            }
        }
        if ($mime === "application/octet-stream") {
            $mime = $this->image_file_type_from_binary($image);
        }
        if ($mime === "application/octet-stream") {
            $mime = $this->getMimeTypeFromUrl($source);
        }

        $ratio = 1;
        $ext = "";

        if ($this->isGdInstalled) {
            if ($width > $this->maxImageWidth) {
                $ratio = $this->maxImageWidth/$width;
            }
            if ($height*$ratio > $this->maxImageHeight) {
                $ratio = $this->maxImageHeight/$height;
            }

            if ($ratio < 1 || empty($mime) || ($this->isGifImagesEnabled !== FALSE && $mime == "image/gif")) {
                // GD expects integer pixel sizes; truncate as the implicit
                // conversion did, but never go below one pixel.
                $targetWidth = max(1, (int) ($width*$ratio));
                $targetHeight = max(1, (int) ($height*$ratio));
                $resized = imagecreatetruecolor($targetWidth, $targetHeight);

                if ($mime == "image/png") {
                    // Keep the alpha channel when re-encoding PNG data.
                    imagealphablending($resized, false);
                    imagesavealpha($resized, true);
                    imagealphablending($imageFile, true);

                    imagecopyresampled($resized, $imageFile, 0, 0, 0, 0, $targetWidth, $targetHeight, $width, $height);
                    ob_start();
                    imagepng($resized, NULL, 9);
                    $image = ob_get_clean();

                    $ext = "png";
                } else {
                    imagecopyresampled($resized, $imageFile, 0, 0, 0, 0, $targetWidth, $targetHeight, $width, $height);
                    ob_start();
                    imagejpeg($resized, NULL, 80);
                    $image = ob_get_clean();

                    $mime = "image/jpeg";
                    $ext = "jpg";
                }
                imagedestroy($resized);
            }
        }
        imagedestroy($imageFile);

        if ($ext === "") {
            static $mimeToExt = array (
                'image/jpeg' => 'jpg',
                'image/gif' => 'gif',
                'image/png' => 'png'
            );

            if (isset($mimeToExt[$mime])) {
                $ext = $mimeToExt[$mime];
            }
        }

        $rv = array();
        $rv['width'] = $width*$ratio;
        $rv['height'] = $height*$ratio;
        $rv['mime'] = $mime;
        $rv['image'] = $image;
        $rv['ext'] = $ext;

        return $rv;
    }

    /**
     * Get file contents, using curl if available, else file_get_contents
     *
     * http(s) and ftp(s) sources are fetched with curl when it is installed;
     * only an HTTP 200 response with a non-empty result counts as success.
     * Other sources, or external ones when curl is missing, fall back to
     * file_get_contents() where that is permitted.
     *
     * Note that $toTempFile is honoured only on the curl path; the
     * file_get_contents() fallback always returns the data itself.
     *
     * @param string $source
     * @param bool $toTempFile download into a temporary file and return its path instead of the data.
     * @return string|bool the data (or temp file path), FALSE on failure.
     */
    public function getFileContents($source, $toTempFile = FALSE) {
        $isExternal = preg_match('#^(http|ftp)s?://#i', $source) == 1;

        if ($isExternal && $this->isCurlInstalled) {
            $ch = curl_init();
            $outFile = NULL;

            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_URL, str_replace(" ","%20",$source));
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_BUFFERSIZE, 4096);

            if ($toTempFile) {
                $outFile = tempnam(sys_get_temp_dir(), "EPub_v" . EPub::VERSION . "_");
                $fp = ($outFile !== FALSE) ? fopen($outFile, "w+b") : FALSE;
                if ($fp === FALSE) {
                    // No usable temp file: give up without leaving one behind.
                    curl_close($ch);
                    if ($outFile !== FALSE && is_file($outFile)) {
                        unlink($outFile);
                    }
                    return FALSE;
                }
                curl_setopt($ch, CURLOPT_FILE, $fp);

                $res = curl_exec($ch);
                $info = curl_getinfo($ch);

                curl_close($ch);
                fclose($fp);
            } else {
                $res = curl_exec($ch);
                $info = curl_getinfo($ch);

                curl_close($ch);
            }

            if ($info['http_code'] == 200 && $res != false) {
                if ($toTempFile) {
                    return $outFile;
                }
                return $res;
            }

            // Failed download: remove the temp file instead of leaking it.
            if ($toTempFile && is_file($outFile)) {
                unlink($outFile);
            }
            return FALSE;
        }

        if ($this->isFileGetContentsInstalled && (!$isExternal || $this->isFileGetContentsExtInstalled)) {
            @$data = file_get_contents($source);
            return $data;
        }
        return FALSE;
    }

    /**
     * get mime type from image data
     *
     * The leading bytes are matched against the signatures of JPEG, GIF, PNG,
     * BMP, TIFF and ILBM; the index of the capture group that matched selects
     * the mime type.
     *
     * By fireweasel found on http://stackoverflow.com/questions/2207095/get-image-mimetype-from-resource-in-php-gd
     * @staticvar array $type
     * @param object $binary
     * @return string mime type, "application/octet-stream" when nothing matches.
     */
    public function image_file_type_from_binary($binary) {
        $hits = array();
        if (!preg_match(
            '/\A(?:(\xff\xd8\xff)|(GIF8[79]a)|(\x89PNG\x0d\x0a)|(BM)|(\x49\x49(?:\x2a\x00|\x00\x4a))|(FORM.{4}ILBM))/',
            $binary, $hits)) {
            return 'application/octet-stream';
        }
        static $type = array (
            1 => 'image/jpeg',
            2 => 'image/gif',
            3 => 'image/png',
            4 => 'image/x-windows-bmp',
            5 => 'image/tiff',
            6 => 'image/x-ilbm',
        );
        // Trailing unmatched groups are not reported, so the last entry of
        // $hits is the group that matched.
        return $type[count($hits) - 1];
    }

    /**
     * Derive a mime type from the file extension of a URL.
     *
     * Everything from the last "?" onwards is ignored, and the extension is
     * the text after the last "." of what remains.
     *
     * @param string $source URL Source
     * @return string MimeType
     */
    public function getMimeTypeFromUrl($source) {
        $path = $source;

        $queryPos = strrpos($path, "?");
        if ($queryPos !== FALSE) {
            $path = substr($path, 0, $queryPos);
        }

        $dotPos = strrpos((string) $path, ".");
        if ($dotPos === FALSE) {
            return "application/octet-stream";
        }

        $ext = strtolower((string) substr($path, $dotPos + 1));
        return $this->getMimeTypeFromExtension($ext);
    }

    /**
     * Map a lower-case image file extension to its mime type.
     *
     * @param string $ext Extension
     * @return string MimeType, "application/octet-stream" for unknown extensions.
     */
    public function getMimeTypeFromExtension($ext) {
        switch ($ext) {
            case "jpg":
            case "jpe":
            case "jpeg":
                return 'image/jpeg';
            case "gif":
                return 'image/gif';
            case "png":
                return 'image/png';
            case "bmp":
                return 'image/x-windows-bmp';
            case "tif":
            case "tiff":
            case "cpt":
                return 'image/tiff';
            case "lbm":
            case "ilbm":
                return 'image/x-ilbm';
            default:
                return "application/octet-stream";
        }
    }

    /**
     * Encode html code to use html entities, safeguarding it from potential character encoding problems
     * This function is a bit different from the vanilla htmlentities function in that it does not encode html tags.
     *
     * Only the strtr() translation through $this->html_encoding_characters is
     * applied. An additional regexp-based ampersand escaping step, written by
     * user "busbyjon" in the PHP Manual discussion linked below, is disabled.
     * http://www.php.net/manual/en/function.htmlentities.php#90111
     *
     * @param string $string string to encode.
     * @return string the translated string.
     */
    public function encodeHtml($string) {
        return strtr($string, $this->html_encoding_characters);
    }

    /**
     * Helper function to create a DOM fragment with given markup.
     *
     * @author Adam Schmalhofer
     *
     * @param DOMDocument $dom
     * @param string $markup
     * @return DOMNode fragment in a node.
     */
    protected function createDomFragment($dom, $markup) {
        $node = $dom->createDocumentFragment();
        $node->appendXML($markup);
        return $node;
    }

    /**
     * Retrieve an array of file names currently added to the book.
     * $key is the filename used in the book
     * $value is the original filename, will be the same as $key for most entries
     *
     * @return array file list
     */
    public function getFileList() {
        return $this->fileList;
    }

    /**
     * No longer supported: calling this terminates the script with a message.
     *
     * @deprecated Use Zip::getRelativePath($relPath) instead.
     */
    public function relPath($relPath) {
        die ("Function was deprecated, use Zip::getRelativePath(\$relPath); instead");
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
+ * + * @param int $size segment size in bytes + * @return void + */ + function setSplitSize($size) { + $this->splitDefaultSize = (int)$size; + if ($size < 10240) { + $this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea. + } + } + + /** + * Get the chapter target size. + * + * @return $size + */ + function getSplitSize() { + return $this->splitDefaultSize; + } + + /** + * Remove all non essential html tags and entities. + * + * @global type $htmlEntities + * @param string $string + * @return string with the stripped entities. + */ + function decodeHtmlEntities($string) { + global $htmlEntities; + + $string = preg_replace('~\s*\s*~i', "\n", $string); + $string = preg_replace('~\s*\s*~i', "\n\n", $string); + $string = preg_replace('~<[^>]*>~', '', $string); + + $string = strtr($string, $htmlEntities); + + $string = str_replace('&', '&', $string); + $string = str_replace('&amp;', '&', $string); + $string = preg_replace('~&(#x*[a-fA-F0-9]+;)~', '&\1', $string); + $string = str_replace('<', '<', $string); + $string = str_replace('>', '>', $string); + + return $string; + } + + /** + * Simply remove all HTML tags, brute force and no finesse. + * + * @param string $string html + * @return string + */ + function html2text($string) { + return preg_replace('~<[^>]*>~', '', $string); + } + + /** + * @return string + */ + function getLog() { + return $this->log->getLog(); + } +} diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php new file mode 100644 index 00000000..1d44f238 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php @@ -0,0 +1,201 @@ + + * @copyright 2009-2014 A. 
Grandt + * @license GNU LGPL 2.1 + * @link http://www.phpclasses.org/package/6115 + * @link https://github.com/Grandt/PHPePub + * @version 3.20 + */ +class EPubChapterSplitter { + const VERSION = 3.20; + + private $splitDefaultSize = 250000; + private $bookVersion = EPub::BOOK_VERSION_EPUB2; + + /** + * + * Enter description here ... + * + * @param unknown_type $ident + */ + function setVersion($bookVersion) { + $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2; + } + + /** + * Set default chapter target size. + * Default is 250000 bytes, and minimum is 10240 bytes. + * + * @param $size segment size in bytes + * @return void + */ + function setSplitSize($size) { + $this->splitDefaultSize = (int)$size; + if ($size < 10240) { + $this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea. + } + } + + /** + * Get the chapter target size. + * + * @return $size + */ + function getSplitSize() { + return $this->splitDefaultSize; + } + + /** + * Split $chapter into multiple parts. + * + * The search string can either be a regular string or a PHP PECL Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php + * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given + * + * @param String $chapter XHTML file + * @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check. + * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern. + * + * @return array with 1 or more parts + */ + function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') { + $chapterData = array(); + $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1); + if ($splitOnSearchString && !$isSearchRegexp) { + $searchString = '#^<.+?>' . 
preg_quote($searchString, '#') . "#"; + } + + if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) { + return array($chapter); + } + + $xmlDoc = new DOMDocument(); + @$xmlDoc->loadHTML($chapter); + + $head = $xmlDoc->getElementsByTagName("head"); + $body = $xmlDoc->getElementsByTagName("body"); + + $htmlPos = stripos($chapter, "", $htmlPos); + $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n"; + if (strpos(trim($newXML), "\n" . $newXML; + } + $headerLength = strlen($newXML); + + $files = array(); + $chapterNames = array(); + $domDepth = 0; + $domPath = array(); + $domClonedPath = array(); + + $curFile = $xmlDoc->createDocumentFragment(); + $files[] = $curFile; + $curParent = $curFile; + $curSize = 0; + + $bodyLen = strlen($xmlDoc->saveXML($body->item(0))); + $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength; + + $partSize = $this->splitDefaultSize - $headLen; + + if ($bodyLen > $partSize) { + $parts = ceil($bodyLen / $partSize); + $partSize = ($bodyLen / $parts) - $headLen; + } + + $node = $body->item(0)->firstChild; + + do { + $nodeData = $xmlDoc->saveXML($node); + $nodeLen = strlen($nodeData); + + if ($nodeLen > $partSize && $node->hasChildNodes()) { + $domPath[] = $node; + $domClonedPath[] = $node->cloneNode(false); + $domDepth++; + + $node = $node->firstChild; + } + + $node2 = $node->nextSibling; + + if ($node != null && $node->nodeName != "#text") { + $doSplit = false; + if ($splitOnSearchString) { + $doSplit = preg_match($searchString, $nodeData) == 1; + if ($doSplit) { + $chapterNames[] = trim($nodeData); + } + } + + if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) { + $curFile = $xmlDoc->createDocumentFragment(); + $files[] = $curFile; + $curParent = $curFile; + if ($domDepth > 0) { + reset($domPath); + reset($domClonedPath); + $oneDomClonedPath = each($domClonedPath); + while ($oneDomClonedPath) { + list($k, $v) = $oneDomClonedPath; + $newParent = $v->cloneNode(false); + 
$curParent->appendChild($newParent); + $curParent = $newParent; + $oneDomClonedPath = each($domClonedPath); + } + } + $curSize = strlen($xmlDoc->saveXML($curFile)); + } + $curParent->appendChild($node->cloneNode(true)); + $curSize += $nodeLen; + } + + $node = $node2; + while ($node == null && $domDepth > 0) { + $domDepth--; + $node = end($domPath)->nextSibling; + array_pop($domPath); + array_pop($domClonedPath); + $curParent = $curParent->parentNode; + } + } while ($node != null); + + $curFile = null; + $curSize = 0; + + $xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding); + $xml->lookupPrefix("http://www.w3.org/1999/xhtml"); + $xml->preserveWhiteSpace = false; + $xml->formatOutput = true; + + for ($idx = 0; $idx < count($files); $idx++) { + $xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding); + $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml"); + $xml2Doc->loadXML($newXML); + $html = $xml2Doc->getElementsByTagName("html")->item(0); + $html->appendChild($xml2Doc->importNode($head->item(0), true)); + $body = $xml2Doc->createElement("body"); + $html->appendChild($body); + $body->appendChild($xml2Doc->importNode($files[$idx], true)); + + // force pretty printing and correct formatting, should not be needed, but it is. + $xml->loadXML($xml2Doc->saveXML()); + + $doc = $xml->saveXML(); + + if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) { + $doc = preg_replace('#^\s*\s*#im', '', $doc); + } + + $chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc; + } + + return $chapterData; + } +} +?> diff --git a/inc/3rdparty/libraries/PHPePub/Logger.php b/inc/3rdparty/libraries/PHPePub/Logger.php new file mode 100644 index 00000000..314019cb --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Logger.php @@ -0,0 +1,92 @@ + + * @copyright 2012-2013 A. Grandt + * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. 
+ * @version 1.00 + */ +class Logger { + const VERSION = 1.00; + + private $log = ""; + private $tStart; + private $tLast; + private $name = NULL; + private $isLogging = FALSE; + private $isDebugging = FALSE; + + /** + * Class constructor. + * + * @return void + */ + function __construct($name = NULL, $isLogging = FALSE) { + if ($name === NULL) { + $this->name = ""; + } else { + $this->name = $name . " : "; + } + $this->isLogging = $isLogging; + $this->start(); + } + + /** + * Class destructor + * + * @return void + * @TODO make sure elements in the destructor match the current class elements + */ + function __destruct() { + unset($this->log); + } + + function start() { + /* Prepare Logging. Just in case it's used. later */ + if ($this->isLogging) { + $this->tStart = gettimeofday(); + $this->tLast = $this->tStart; + $this->log = "

    Log: " . $this->name . "

    \n
    Started: " . gmdate("D, d M Y H:i:s T", $this->tStart['sec']) . "\n Δ Start ;  Δ Last  ;";
    +			$this->logLine("Start");
    +		}
    +    }
    +
    +    /**
    +     * Log which optional PHP capabilities are available at runtime.
    +     *
    +     * Writes one "Yes"/"No" line per capability through logLine(): the curl,
    +     * gd and exif extensions, file_get_contents(), and file_get_contents()
    +     * combined with the allow_url_fopen ini setting.
    +     * Does nothing when logging is disabled.
    +     *
    +     * @return void
    +     */
    +    function dumpInstalledModules() {
    +        if (!$this->isLogging) {
    +            return;
    +        }
    +
    +        $isCurlInstalled = extension_loaded('curl') && function_exists('curl_version');
    +        $isGdInstalled = extension_loaded('gd') && function_exists('gd_info');
    +        $isExifInstalled = extension_loaded('exif') && function_exists('exif_imagetype');
    +        $isFileGetContentsInstalled = function_exists('file_get_contents');
    +        // ini_get() can return the literal configured text (e.g. "Off" or
    +        // "false"), which is truthy as a plain string. Parse it as a boolean
    +        // so a disabled setting is never reported as "Yes".
    +        $isFileGetContentsExtInstalled = $isFileGetContentsInstalled
    +            && filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN);
    +
    +        $this->logLine("isCurlInstalled...............: " . ($isCurlInstalled ? "Yes" : "No"));
    +        $this->logLine("isGdInstalled.................: " . ($isGdInstalled ? "Yes" : "No"));
    +        $this->logLine("isExifInstalled...............: " . ($isExifInstalled ? "Yes" : "No"));
    +        $this->logLine("isFileGetContentsInstalled....: " . ($isFileGetContentsInstalled ? "Yes" : "No"));
    +        $this->logLine("isFileGetContentsExtInstalled.: " . ($isFileGetContentsExtInstalled ? "Yes" : "No"));
    +    }
    +
    +    function logLine($line) {
    +        if ($this->isLogging) {
    +            $tTemp = gettimeofday();
    +            $tS = $this->tStart['sec'] + (((int)($this->tStart['usec']/100))/10000);
    +            $tL = $this->tLast['sec'] + (((int)($this->tLast['usec']/100))/10000);
    +            $tT = $tTemp['sec'] + (((int)($tTemp['usec']/100))/10000);
    +
    +			$logline = sprintf("\n+%08.04f; +%08.04f; ", ($tT-$tS), ($tT-$tL)) . $this->name . $line;
    +            $this->log .= $logline;
    +            $this->tLast = $tTemp;
    +
    +		    if ($this->isDebugging) {
    +				echo "
    " . $logline . "\n
    \n"; + } + } + } + + function getLog() { + return $this->log; + } +} +?> \ No newline at end of file diff --git a/inc/3rdparty/libraries/PHPePub/Zip.php b/inc/3rdparty/libraries/PHPePub/Zip.php new file mode 100644 index 00000000..01e03566 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Zip.php @@ -0,0 +1,818 @@ + + * @copyright 2009-2014 A. Grandt + * @license GNU LGPL 2.1 + * @link http://www.phpclasses.org/package/6110 + * @link https://github.com/Grandt/PHPZip + * @version 1.60 + */ +class Zip { + const VERSION = 1.60; + + const ZIP_LOCAL_FILE_HEADER = "\x50\x4b\x03\x04"; // Local file header signature + const ZIP_CENTRAL_FILE_HEADER = "\x50\x4b\x01\x02"; // Central file header signature + const ZIP_END_OF_CENTRAL_DIRECTORY = "\x50\x4b\x05\x06\x00\x00\x00\x00"; //end of Central directory record + + const EXT_FILE_ATTR_DIR = 010173200020; // Permission 755 drwxr-xr-x = (((S_IFDIR | 0755) << 16) | S_DOS_D); + const EXT_FILE_ATTR_FILE = 020151000040; // Permission 644 -rw-r--r-- = (((S_IFREG | 0644) << 16) | S_DOS_A); + + const ATTR_VERSION_TO_EXTRACT = "\x14\x00"; // Version needed to extract + const ATTR_MADE_BY_VERSION = "\x1E\x03"; // Made By Version + + // Unix file types + const S_IFIFO = 0010000; // named pipe (fifo) + const S_IFCHR = 0020000; // character special + const S_IFDIR = 0040000; // directory + const S_IFBLK = 0060000; // block special + const S_IFREG = 0100000; // regular + const S_IFLNK = 0120000; // symbolic link + const S_IFSOCK = 0140000; // socket + + // setuid/setgid/sticky bits, the same as for chmod: + + const S_ISUID = 0004000; // set user id on execution + const S_ISGID = 0002000; // set group id on execution + const S_ISTXT = 0001000; // sticky bit + + // And of course, the other 12 bits are for the permissions, the same as for chmod: + // When addding these up, you can also just write the permissions as a simgle octal number + // ie. 0755. The leading 0 specifies octal notation. 
+ const S_IRWXU = 0000700; // RWX mask for owner + const S_IRUSR = 0000400; // R for owner + const S_IWUSR = 0000200; // W for owner + const S_IXUSR = 0000100; // X for owner + const S_IRWXG = 0000070; // RWX mask for group + const S_IRGRP = 0000040; // R for group + const S_IWGRP = 0000020; // W for group + const S_IXGRP = 0000010; // X for group + const S_IRWXO = 0000007; // RWX mask for other + const S_IROTH = 0000004; // R for other + const S_IWOTH = 0000002; // W for other + const S_IXOTH = 0000001; // X for other + const S_ISVTX = 0001000; // save swapped text even after use + + // Filetype, sticky and permissions are added up, and shifted 16 bits left BEFORE adding the DOS flags. + + // DOS file type flags, we really only use the S_DOS_D flag. + + const S_DOS_A = 0000040; // DOS flag for Archive + const S_DOS_D = 0000020; // DOS flag for Directory + const S_DOS_V = 0000010; // DOS flag for Volume + const S_DOS_S = 0000004; // DOS flag for System + const S_DOS_H = 0000002; // DOS flag for Hidden + const S_DOS_R = 0000001; // DOS flag for Read Only + + private $zipMemoryThreshold = 1048576; // Autocreate tempfile if the zip data exceeds 1048576 bytes (1 MB) + + private $zipData = NULL; + private $zipFile = NULL; + private $zipComment = NULL; + private $cdRec = array(); // central directory + private $offset = 0; + private $isFinalized = FALSE; + private $addExtraField = TRUE; + + private $streamChunkSize = 65536; + private $streamFilePath = NULL; + private $streamTimestamp = NULL; + private $streamFileComment = NULL; + private $streamFile = NULL; + private $streamData = NULL; + private $streamFileLength = 0; + private $streamExtFileAttr = null; + + /** + * Constructor. + * + * @param boolean $useZipFile Write temp zip data to tempFile? 
Default FALSE + */ + function __construct($useZipFile = FALSE) { + if ($useZipFile) { + $this->zipFile = tmpfile(); + } else { + $this->zipData = ""; + } + } + + function __destruct() { + if (is_resource($this->zipFile)) { + fclose($this->zipFile); + } + $this->zipData = NULL; + } + + /** + * Extra fields on the Zip directory records are Unix time codes needed for compatibility on the default Mac zip archive tool. + * These are enabled as default, as they do no harm elsewhere and only add 26 bytes per file added. + * + * @param bool $setExtraField TRUE (default) will enable adding of extra fields, anything else will disable it. + */ + function setExtraField($setExtraField = TRUE) { + $this->addExtraField = ($setExtraField === TRUE); + } + + /** + * Set Zip archive comment. + * + * @param string $newComment New comment. NULL to clear. + * @return bool $success + */ + public function setComment($newComment = NULL) { + if ($this->isFinalized) { + return FALSE; + } + $this->zipComment = $newComment; + + return TRUE; + } + + /** + * Set zip file to write zip data to. + * This will cause all present and future data written to this class to be written to this file. + * This can be used at any time, even after the Zip Archive have been finalized. Any previous file will be closed. + * Warning: If the given file already exists, it will be overwritten. + * + * @param string $fileName + * @return bool $success + */ + public function setZipFile($fileName) { + if (is_file($fileName)) { + unlink($fileName); + } + $fd=fopen($fileName, "x+b"); + if (is_resource($this->zipFile)) { + rewind($this->zipFile); + while (!feof($this->zipFile)) { + fwrite($fd, fread($this->zipFile, $this->streamChunkSize)); + } + + fclose($this->zipFile); + } else { + fwrite($fd, $this->zipData); + $this->zipData = NULL; + } + $this->zipFile = $fd; + + return TRUE; + } + + /** + * Add an empty directory entry to the zip archive. + * Basically this is only used if an empty directory is added. 
+ * + * @param string $directoryPath Directory Path and name to be added to the archive. + * @param int $timestamp (Optional) Timestamp for the added directory, if omitted or set to 0, the current time will be used. + * @param string $fileComment (Optional) Comment to be added to the archive for this directory. To use fileComment, timestamp must be given. + * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. + * @return bool $success + */ + public function addDirectory($directoryPath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_DIR) { + if ($this->isFinalized) { + return FALSE; + } + $directoryPath = str_replace("\\", "/", $directoryPath); + $directoryPath = rtrim($directoryPath, "/"); + + if (strlen($directoryPath) > 0) { + $this->buildZipEntry($directoryPath.'/', $fileComment, "\x00\x00", "\x00\x00", $timestamp, "\x00\x00\x00\x00", 0, 0, $extFileAttr); + return TRUE; + } + return FALSE; + } + + /** + * Add a file to the archive at the specified location and file name. + * + * @param string $data File data. + * @param string $filePath Filepath and name to be used in the archive. + * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. + * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. + * @param bool $compress (Optional) Compress file, if set to FALSE the file will only be stored. Default TRUE. + * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. 
+ * @return bool $success + */ + public function addFile($data, $filePath, $timestamp = 0, $fileComment = NULL, $compress = TRUE, $extFileAttr = self::EXT_FILE_ATTR_FILE) { + if ($this->isFinalized) { + return FALSE; + } + + if (is_resource($data) && get_resource_type($data) == "stream") { + $this->addLargeFile($data, $filePath, $timestamp, $fileComment, $extFileAttr); + return FALSE; + } + + $gzData = ""; + $gzType = "\x08\x00"; // Compression type 8 = deflate + $gpFlags = "\x00\x00"; // General Purpose bit flags for compression type 8 it is: 0=Normal, 1=Maximum, 2=Fast, 3=super fast compression. + $dataLength = strlen($data); + $fileCRC32 = pack("V", crc32($data)); + + if ($compress) { + $gzTmp = gzcompress($data); + $gzData = substr(substr($gzTmp, 0, strlen($gzTmp) - 4), 2); // gzcompress adds a 2 byte header and 4 byte CRC we can't use. + // The 2 byte header does contain useful data, though in this case the 2 parameters we'd be interrested in will always be 8 for compression type, and 2 for General purpose flag. + $gzLength = strlen($gzData); + } else { + $gzLength = $dataLength; + } + + if ($gzLength >= $dataLength) { + $gzLength = $dataLength; + $gzData = $data; + $gzType = "\x00\x00"; // Compression type 0 = stored + $gpFlags = "\x00\x00"; // Compression type 0 = stored + } + + if (!is_resource($this->zipFile) && ($this->offset + $gzLength) > $this->zipMemoryThreshold) { + $this->zipflush(); + } + + $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr); + + $this->zipwrite($gzData); + + return TRUE; + } + + /** + * Add the content to a directory. + * + * @author Adam Schmalhofer + * @author A. Grandt + * + * @param string $realPath Path on the file system. + * @param string $zipPath Filepath and name to be used in the archive. + * @param bool $recursive Add content recursively, default is TRUE. 
+ * @param bool $followSymlinks Follow and add symbolic links, if they are accessible, default is TRUE. + * @param array &$addedFiles Reference to the added files, this is used to prevent duplicates, efault is an empty array. + * If you start the function by parsing an array, the array will be populated with the realPath + * and zipPath kay/value pairs added to the archive by the function. + * @param bool $overrideFilePermissions Force the use of the file/dir permissions set in the $extDirAttr + * and $extFileAttr parameters. + * @param int $extDirAttr Permissions for directories. + * @param int $extFileAttr Permissions for files. + */ + public function addDirectoryContent($realPath, $zipPath, $recursive = TRUE, $followSymlinks = TRUE, &$addedFiles = array(), + $overrideFilePermissions = FALSE, $extDirAttr = self::EXT_FILE_ATTR_DIR, $extFileAttr = self::EXT_FILE_ATTR_FILE) { + if (file_exists($realPath) && !isset($addedFiles[realpath($realPath)])) { + if (is_dir($realPath)) { + if ($overrideFilePermissions) { + $this->addDirectory($zipPath, 0, null, $extDirAttr); + } else { + $this->addDirectory($zipPath, 0, null, self::getFileExtAttr($realPath)); + } + } + + $addedFiles[realpath($realPath)] = $zipPath; + + $iter = new DirectoryIterator($realPath); + foreach ($iter as $file) { + if ($file->isDot()) { + continue; + } + $newRealPath = $file->getPathname(); + $newZipPath = self::pathJoin($zipPath, $file->getFilename()); + + if (file_exists($newRealPath) && ($followSymlinks === TRUE || !is_link($newRealPath))) { + if ($file->isFile()) { + $addedFiles[realpath($newRealPath)] = $newZipPath; + if ($overrideFilePermissions) { + $this->addLargeFile($newRealPath, $newZipPath, 0, null, $extFileAttr); + } else { + $this->addLargeFile($newRealPath, $newZipPath, 0, null, self::getFileExtAttr($newRealPath)); + } + } else if ($recursive === TRUE) { + $this->addDirectoryContent($newRealPath, $newZipPath, $recursive, $followSymlinks, $addedFiles, $overrideFilePermissions, 
$extDirAttr, $extFileAttr); + } else { + if ($overrideFilePermissions) { + $this->addDirectory($zipPath, 0, null, $extDirAttr); + } else { + $this->addDirectory($zipPath, 0, null, self::getFileExtAttr($newRealPath)); + } + } + } + } + } + } + + /** + * Add a file to the archive at the specified location and file name. + * + * @param string $dataFile File name/path. + * @param string $filePath Filepath and name to be used in the archive. + * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. + * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. + * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. + * @return bool $success + */ + public function addLargeFile($dataFile, $filePath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_FILE) { + if ($this->isFinalized) { + return FALSE; + } + + if (is_string($dataFile) && is_file($dataFile)) { + $this->processFile($dataFile, $filePath, $timestamp, $fileComment, $extFileAttr); + } else if (is_resource($dataFile) && get_resource_type($dataFile) == "stream") { + $fh = $dataFile; + $this->openStream($filePath, $timestamp, $fileComment, $extFileAttr); + + while (!feof($fh)) { + $this->addStreamData(fread($fh, $this->streamChunkSize)); + } + $this->closeStream($this->addExtraField); + } + return TRUE; + } + + /** + * Create a stream to be used for large entries. + * + * @param string $filePath Filepath and name to be used in the archive. + * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. + * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. + * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. 
+ * @return bool $success + */ + public function openStream($filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) { + if (!function_exists('sys_get_temp_dir')) { + die ("ERROR: Zip " . self::VERSION . " requires PHP version 5.2.1 or above if large files are used."); + } + + if ($this->isFinalized) { + return FALSE; + } + + $this->zipflush(); + + if (strlen($this->streamFilePath) > 0) { + $this->closeStream(); + } + + $this->streamFile = tempnam(sys_get_temp_dir(), 'Zip'); + $this->streamData = fopen($this->streamFile, "wb"); + $this->streamFilePath = $filePath; + $this->streamTimestamp = $timestamp; + $this->streamFileComment = $fileComment; + $this->streamFileLength = 0; + $this->streamExtFileAttr = $extFileAttr; + + return TRUE; + } + + /** + * Add data to the open stream. + * + * @param string $data + * @return mixed length in bytes added or FALSE if the archive is finalized or there are no open stream. + */ + public function addStreamData($data) { + if ($this->isFinalized || strlen($this->streamFilePath) == 0) { + return FALSE; + } + + $length = fwrite($this->streamData, $data, strlen($data)); + if ($length != strlen($data)) { + die ("

    Length mismatch

    \n"); + } + $this->streamFileLength += $length; + + return $length; + } + + /** + * Close the current stream. + * + * @return bool $success + */ + public function closeStream() { + if ($this->isFinalized || strlen($this->streamFilePath) == 0) { + return FALSE; + } + + fflush($this->streamData); + fclose($this->streamData); + + $this->processFile($this->streamFile, $this->streamFilePath, $this->streamTimestamp, $this->streamFileComment, $this->streamExtFileAttr); + + $this->streamData = null; + $this->streamFilePath = null; + $this->streamTimestamp = null; + $this->streamFileComment = null; + $this->streamFileLength = 0; + $this->streamExtFileAttr = null; + + // Windows is a little slow at times, so a millisecond later, we can unlink this. + unlink($this->streamFile); + + $this->streamFile = null; + + return TRUE; + } + + private function processFile($dataFile, $filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) { + if ($this->isFinalized) { + return FALSE; + } + + $tempzip = tempnam(sys_get_temp_dir(), 'ZipStream'); + + $zip = new ZipArchive; + if ($zip->open($tempzip) === TRUE) { + $zip->addFile($dataFile, 'file'); + $zip->close(); + } + + $file_handle = fopen($tempzip, "rb"); + $stats = fstat($file_handle); + $eof = $stats['size']-72; + + fseek($file_handle, 6); + + $gpFlags = fread($file_handle, 2); + $gzType = fread($file_handle, 2); + fread($file_handle, 4); + $fileCRC32 = fread($file_handle, 4); + $v = unpack("Vval", fread($file_handle, 4)); + $gzLength = $v['val']; + $v = unpack("Vval", fread($file_handle, 4)); + $dataLength = $v['val']; + + $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr); + + fseek($file_handle, 34); + $pos = 34; + + while (!feof($file_handle) && $pos < $eof) { + $datalen = $this->streamChunkSize; + if ($pos + $this->streamChunkSize > $eof) { + $datalen = $eof-$pos; + } + $data = fread($file_handle, $datalen); + $pos += 
$datalen; + + $this->zipwrite($data); + } + + fclose($file_handle); + + unlink($tempzip); + } + + /** + * Close the archive. + * A closed archive can no longer have new files added to it. + * + * @return bool $success + */ + public function finalize() { + if (!$this->isFinalized) { + if (strlen($this->streamFilePath) > 0) { + $this->closeStream(); + } + $cd = implode("", $this->cdRec); + + $cdRecSize = pack("v", sizeof($this->cdRec)); + $cdRec = $cd . self::ZIP_END_OF_CENTRAL_DIRECTORY + . $cdRecSize . $cdRecSize + . pack("VV", strlen($cd), $this->offset); + if (!empty($this->zipComment)) { + $cdRec .= pack("v", strlen($this->zipComment)) . $this->zipComment; + } else { + $cdRec .= "\x00\x00"; + } + + $this->zipwrite($cdRec); + + $this->isFinalized = TRUE; + $this->cdRec = NULL; + + return TRUE; + } + return FALSE; + } + + /** + * Get the handle ressource for the archive zip file. + * If the zip haven't been finalized yet, this will cause it to become finalized + * + * @return zip file handle + */ + public function getZipFile() { + if (!$this->isFinalized) { + $this->finalize(); + } + + $this->zipflush(); + + rewind($this->zipFile); + + return $this->zipFile; + } + + /** + * Get the zip file contents + * If the zip haven't been finalized yet, this will cause it to become finalized + * + * @return zip data + */ + public function getZipData() { + if (!$this->isFinalized) { + $this->finalize(); + } + if (!is_resource($this->zipFile)) { + return $this->zipData; + } else { + rewind($this->zipFile); + $filestat = fstat($this->zipFile); + return fread($this->zipFile, $filestat['size']); + } + } + + /** + * Send the archive as a zip download + * + * @param String $fileName The name of the Zip archive, in ISO-8859-1 (or ASCII) encoding, ie. "archive.zip". Optional, defaults to NULL, which means that no ISO-8859-1 encoded file name will be specified. + * @param String $contentType Content mime type. Optional, defaults to "application/zip". 
+ * @param String $utf8FileName The name of the Zip archive, in UTF-8 encoding. Optional, defaults to NULL, which means that no UTF-8 encoded file name will be specified. + * @param bool $inline Use Content-Disposition with "inline" instead of "attached". Optional, defaults to FALSE. + * @return bool $success + */ + function sendZip($fileName = null, $contentType = "application/zip", $utf8FileName = null, $inline = false) { + if (!$this->isFinalized) { + $this->finalize(); + } + + $headerFile = null; + $headerLine = null; + if (!headers_sent($headerFile, $headerLine) or die("

    Error: Unable to send file $fileName. HTML Headers have already been sent from $headerFile in line $headerLine

    ")) { + if ((ob_get_contents() === FALSE || ob_get_contents() == '') or die("\n

    Error: Unable to send file $fileName. Output buffer contains the following text (typically warnings or errors):
    " . htmlentities(ob_get_contents()) . "

    ")) { + if (ini_get('zlib.output_compression')) { + ini_set('zlib.output_compression', 'Off'); + } + + header("Pragma: public"); + header("Last-Modified: " . gmdate("D, d M Y H:i:s T")); + header("Expires: 0"); + header("Accept-Ranges: bytes"); + header("Connection: close"); + header("Content-Type: " . $contentType); + $cd = "Content-Disposition: "; + if ($inline) { + $cd .= "inline"; + } else{ + $cd .= "attached"; + } + if ($fileName) { + $cd .= '; filename="' . $fileName . '"'; + } + if ($utf8FileName) { + $cd .= "; filename*=UTF-8''" . rawurlencode($utf8FileName); + } + header($cd); + header("Content-Length: ". $this->getArchiveSize()); + + if (!is_resource($this->zipFile)) { + echo $this->zipData; + } else { + rewind($this->zipFile); + + while (!feof($this->zipFile)) { + echo fread($this->zipFile, $this->streamChunkSize); + } + } + } + return TRUE; + } + return FALSE; + } + + /** + * Return the current size of the archive + * + * @return $size Size of the archive + */ + public function getArchiveSize() { + if (!is_resource($this->zipFile)) { + return strlen($this->zipData); + } + $filestat = fstat($this->zipFile); + + return $filestat['size']; + } + + /** + * Calculate the 2 byte dostime used in the zip entries. + * + * @param int $timestamp + * @return 2-byte encoded DOS Date + */ + private function getDosTime($timestamp = 0) { + $timestamp = (int)$timestamp; + $oldTZ = @date_default_timezone_get(); + date_default_timezone_set('UTC'); + $date = ($timestamp == 0 ? 
getdate() : getdate($timestamp)); + date_default_timezone_set($oldTZ); + if ($date["year"] >= 1980) { + return pack("V", (($date["mday"] + ($date["mon"] << 5) + (($date["year"]-1980) << 9)) << 16) | + (($date["seconds"] >> 1) + ($date["minutes"] << 5) + ($date["hours"] << 11))); + } + return "\x00\x00\x00\x00"; + } + + /** + * Build the Zip file structures + * + * @param string $filePath + * @param string $fileComment + * @param string $gpFlags + * @param string $gzType + * @param int $timestamp + * @param string $fileCRC32 + * @param int $gzLength + * @param int $dataLength + * @param int $extFileAttr Use self::EXT_FILE_ATTR_FILE for files, self::EXT_FILE_ATTR_DIR for Directories. + */ + private function buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr) { + $filePath = str_replace("\\", "/", $filePath); + $fileCommentLength = (empty($fileComment) ? 0 : strlen($fileComment)); + $timestamp = (int)$timestamp; + $timestamp = ($timestamp == 0 ? time() : $timestamp); + + $dosTime = $this->getDosTime($timestamp); + $tsPack = pack("V", $timestamp); + + $ux = "\x75\x78\x0B\x00\x01\x04\xE8\x03\x00\x00\x04\x00\x00\x00\x00"; + + if (!isset($gpFlags) || strlen($gpFlags) != 2) { + $gpFlags = "\x00\x00"; + } + + $isFileUTF8 = mb_check_encoding($filePath, "UTF-8") && !mb_check_encoding($filePath, "ASCII"); + $isCommentUTF8 = !empty($fileComment) && mb_check_encoding($fileComment, "UTF-8") && !mb_check_encoding($fileComment, "ASCII"); + if ($isFileUTF8 || $isCommentUTF8) { + $flag = 0; + $gpFlagsV = unpack("vflags", $gpFlags); + if (isset($gpFlagsV['flags'])) { + $flag = $gpFlagsV['flags']; + } + $gpFlags = pack("v", $flag | (1 << 11)); + } + + $header = $gpFlags . $gzType . $dosTime. $fileCRC32 + . 
pack("VVv", $gzLength, $dataLength, strlen($filePath)); // File name length + + $zipEntry = self::ZIP_LOCAL_FILE_HEADER; + $zipEntry .= self::ATTR_VERSION_TO_EXTRACT; + $zipEntry .= $header; + $zipEntry .= pack("v", ($this->addExtraField ? 28 : 0)); // Extra field length + $zipEntry .= $filePath; // FileName + // Extra fields + if ($this->addExtraField) { + $zipEntry .= "\x55\x54\x09\x00\x03" . $tsPack . $tsPack . $ux; + } + $this->zipwrite($zipEntry); + + $cdEntry = self::ZIP_CENTRAL_FILE_HEADER; + $cdEntry .= self::ATTR_MADE_BY_VERSION; + $cdEntry .= ($dataLength === 0 ? "\x0A\x00" : self::ATTR_VERSION_TO_EXTRACT); + $cdEntry .= $header; + $cdEntry .= pack("v", ($this->addExtraField ? 24 : 0)); // Extra field length + $cdEntry .= pack("v", $fileCommentLength); // File comment length + $cdEntry .= "\x00\x00"; // Disk number start + $cdEntry .= "\x00\x00"; // internal file attributes + $cdEntry .= pack("V", $extFileAttr); // External file attributes + $cdEntry .= pack("V", $this->offset); // Relative offset of local header + $cdEntry .= $filePath; // FileName + // Extra fields + if ($this->addExtraField) { + $cdEntry .= "\x55\x54\x05\x00\x03" . $tsPack . $ux; + } + if (!empty($fileComment)) { + $cdEntry .= $fileComment; // Comment + } + + $this->cdRec[] = $cdEntry; + $this->offset += strlen($zipEntry) + $gzLength; + } + + private function zipwrite($data) { + if (!is_resource($this->zipFile)) { + $this->zipData .= $data; + } else { + fwrite($this->zipFile, $data); + fflush($this->zipFile); + } + } + + private function zipflush() { + if (!is_resource($this->zipFile)) { + $this->zipFile = tmpfile(); + fwrite($this->zipFile, $this->zipData); + $this->zipData = NULL; + } + } + + /** + * Join $file to $dir path, and clean up any excess slashes. + * + * @param string $dir + * @param string $file + */ + public static function pathJoin($dir, $file) { + if (empty($dir) || empty($file)) { + return self::getRelativePath($dir . $file); + } + return self::getRelativePath($dir . 
'/' . $file); + } + + /** + * Clean up a path, removing any unnecessary elements such as /./, // or redundant ../ segments. + * If the path starts with a "/", it is deemed an absolute path and any /../ in the beginning is stripped off. + * The returned path will not end in a "/". + * + * Sometimes, when a path is generated from multiple fragments, + * you can get something like "../data/html/../images/image.jpeg" + * This will normalize that example path to "../data/images/image.jpeg" + * + * @param string $path The path to clean up + * @return string the clean path + */ + public static function getRelativePath($path) { + $path = preg_replace("#/+\.?/+#", "/", str_replace("\\", "/", $path)); + $dirs = explode("/", rtrim(preg_replace('#^(?:\./)+#', '', $path), '/')); + + $offset = 0; + $sub = 0; + $subOffset = 0; + $root = ""; + + if (empty($dirs[0])) { + $root = "/"; + $dirs = array_splice($dirs, 1); + } else if (preg_match("#[A-Za-z]:#", $dirs[0])) { + $root = strtoupper($dirs[0]) . "/"; + $dirs = array_splice($dirs, 1); + } + + $newDirs = array(); + foreach ($dirs as $dir) { + if ($dir !== "..") { + $subOffset--; + $newDirs[++$offset] = $dir; + } else { + $subOffset++; + if (--$offset < 0) { + $offset = 0; + if ($subOffset > $sub) { + $sub++; + } + } + } + } + + if (empty($root)) { + $root = str_repeat("../", $sub); + } + return $root . implode("/", array_slice($newDirs, 0, $offset)); + } + + /** + * Create the file permissions for a file or directory, for use in the extFileAttr parameters. + * + * @param int $owner Unix permisions for owner (octal from 00 to 07) + * @param int $group Unix permisions for group (octal from 00 to 07) + * @param int $other Unix permisions for others (octal from 00 to 07) + * @param bool $isFile + * @return EXTRERNAL_REF field. + */ + public static function generateExtAttr($owner = 07, $group = 05, $other = 05, $isFile = true) { + $fp = $isFile ? 
self::S_IFREG : self::S_IFDIR; + $fp |= (($owner & 07) << 6) | (($group & 07) << 3) | ($other & 07); + + return ($fp << 16) | ($isFile ? self::S_DOS_A : self::S_DOS_D); + } + + /** + * Get the file permissions for a file or directory, for use in the extFileAttr parameters. + * + * @param string $filename + * @return external ref field, or FALSE if the file is not found. + */ + public static function getFileExtAttr($filename) { + if (file_exists($filename)) { + $fp = fileperms($filename) << 16; + return $fp | (is_dir($filename) ? self::S_DOS_D : self::S_DOS_A); + } + return FALSE; + } +} +?> diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt new file mode 100644 index 00000000..9424a83e --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt @@ -0,0 +1,31 @@ + DrUUID RFC4122 library for PHP5 + by J. King (http://jkingweb.ca/) + Licensed under MIT license + + See http://jkingweb.ca/code/php/lib.uuid/ + for documentation + + Last revised 2010-02-15 + +Copyright (c) 2009 J. King + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.php b/inc/3rdparty/libraries/PHPePub/lib.uuid.php new file mode 100644 index 00000000..c6a8de52 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.php @@ -0,0 +1,314 @@ +string; + } + + public function __get($var) { + switch($var) { + case "bytes": + return $this->bytes; + case "hex": + return bin2hex($this->bytes); + case "string": + return $this->__toString(); + case "urn": + return "urn:uuid:".$this->__toString(); + case "version": + return ord($this->bytes[6]) >> 4; + case "variant": + $byte = ord($this->bytes[8]); + if ($byte >= self::varRes) { + return 3; + } + if ($byte >= self::varMS) { + return 2; + } + if ($byte >= self::varRFC) { + return 1; + } + return 0; + case "node": + if (ord($this->bytes[6])>>4==1) { + return bin2hex(substr($this->bytes,10)); + } else { + return NULL; + } + case "time": + if (ord($this->bytes[6])>>4==1) { + // Restore contiguous big-endian byte order + $time = bin2hex($this->bytes[6].$this->bytes[7].$this->bytes[4].$this->bytes[5].$this->bytes[0].$this->bytes[1].$this->bytes[2].$this->bytes[3]); + // Clear version flag + $time[0] = "0"; + // Do some reverse arithmetic to get a Unix timestamp + $time = (hexdec($time) - self::interval) / 10000000; + return $time; + } else { + return NULL; + } + default: + return NULL; + } + } + + protected function __construct($uuid) { + if (strlen($uuid) != 16) { + throw new UUIDException("Input must be a 128-bit integer."); + } + $this->bytes = $uuid; + // Optimize the most common use + $this->string = + bin2hex(substr($uuid,0,4))."-". + bin2hex(substr($uuid,4,2))."-". + bin2hex(substr($uuid,6,2))."-". + bin2hex(substr($uuid,8,2))."-". 
+ bin2hex(substr($uuid,10,6)); + } + + protected static function mintTime($node = NULL) { + /* Generates a Version 1 UUID. + These are derived from the time at which they were generated. */ + // Get time since Gregorian calendar reform in 100ns intervals + // This is exceedingly difficult because of PHP's (and pack()'s) + // integer size limits. + // Note that this will never be more accurate than to the microsecond. + $time = microtime(1) * 10000000 + self::interval; + // Convert to a string representation + $time = sprintf("%F", $time); + preg_match("/^\d+/", $time, $time); //strip decimal point + // And now to a 64-bit binary representation + $time = base_convert($time[0], 10, 16); + $time = pack("H*", str_pad($time, 16, "0", STR_PAD_LEFT)); + // Reorder bytes to their proper locations in the UUID + $uuid = $time[4].$time[5].$time[6].$time[7].$time[2].$time[3].$time[0].$time[1]; + // Generate a random clock sequence + $uuid .= self::randomBytes(2); + // set variant + $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC); + // set version + $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version1); + // Set the final 'node' parameter, a MAC address + if ($node) { + $node = self::makeBin($node, 6); + } + if (!$node) { + // If no node was provided or if the node was invalid, + // generate a random MAC address and set the multicast bit + $node = self::randomBytes(6); + $node[0] = pack("C", ord($node[0]) | 1); + } + $uuid .= $node; + return $uuid; + } + + protected static function mintRand() { + /* Generate a Version 4 UUID. + These are derived soly from random numbers. */ + // generate random fields + $uuid = self::randomBytes(16); + // set variant + $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC); + // set version + $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version4); + return $uuid; + } + + protected static function mintName($ver, $node, $ns) { + /* Generates a Version 3 or Version 5 UUID. 
+ These are derived from a hash of a name and its namespace, in binary form. */ + if (!$node) { + throw new UUIDException("A name-string is required for Version 3 or 5 UUIDs."); + } + // if the namespace UUID isn't binary, make it so + $ns = self::makeBin($ns, 16); + if (!$ns) { + throw new UUIDException("A binary namespace is required for Version 3 or 5 UUIDs."); + } + $uuid = null; + $version = self::version3; + switch($ver) { + case self::MD5: + $version = self::version3; + $uuid = md5($ns.$node,1); + break; + case self::SHA1: + $version = self::version5; + $uuid = substr(sha1($ns.$node,1),0, 16); + break; + } + // set variant + $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC); + // set version + $uuid[6] = chr(ord($uuid[6]) & self::clearVer | $version); + return ($uuid); + } + + protected static function makeBin($str, $len) { + /* Insure that an input string is either binary or hexadecimal. + Returns binary representation, or false on failure. */ + if ($str instanceof self) { + return $str->bytes; + } + if (strlen($str)==$len) { + return $str; + } else { + $str = preg_replace("/^urn:uuid:/is", "", $str); // strip URN scheme and namespace + } + $str = preg_replace("/[^a-f0-9]/is", "", $str); // strip non-hex characters + if (strlen($str) != ($len * 2)) { + return FALSE; + } else { + return pack("H*", $str); + } + } + + public static function initRandom() { + /* Look for a system-provided source of randomness, which is usually crytographically secure. + /dev/urandom is tried first simply out of bias for Linux systems. 
*/ + if (is_readable('/dev/urandom')) { + self::$randomSource = fopen('/dev/urandom', 'rb'); + self::$randomFunc = 'randomFRead'; + } + else if (class_exists('COM', 0)) { + try { + self::$randomSource = new COM('CAPICOM.Utilities.1'); // See http://msdn.microsoft.com/en-us/library/aa388182(VS.85).aspx + self::$randomFunc = 'randomCOM'; + } + catch(Exception $e) { + } + } + return self::$randomFunc; + } + + public static function randomBytes($bytes) { + return call_user_func(array('self', self::$randomFunc), $bytes); + } + + protected static function randomTwister($bytes) { + /* Get the specified number of random bytes, using mt_rand(). + Randomness is returned as a string of bytes. */ + $rand = ""; + for ($a = 0; $a < $bytes; $a++) { + $rand .= chr(mt_rand(0, 255)); + } + return $rand; + } + + protected static function randomFRead($bytes) { + /* Get the specified number of random bytes using a file handle + previously opened with UUID::initRandom(). + Randomness is returned as a string of bytes. */ + return fread(self::$randomSource, $bytes); + } + + protected static function randomCOM($bytes) { + /* Get the specified number of random bytes using Windows' + randomness source via a COM object previously created by UUID::initRandom(). + Randomness is returned as a string of bytes. 
*/ + return base64_decode(self::$randomSource->GetRandom($bytes,0)); // straight binary mysteriously doesn't work, hence the base64 + } +} + +class UUIDException extends Exception { +} -- cgit v1.2.3 From 72a857158c187206dae2eed08143d5743322cb0c Mon Sep 17 00:00:00 2001 From: tcit Date: Thu, 24 Apr 2014 03:08:31 +0200 Subject: Fixed a bug into PHPePub with special caracters --- inc/3rdparty/libraries/PHPePub/EPub.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php index 836c0512..0260ce4f 100644 --- a/inc/3rdparty/libraries/PHPePub/EPub.php +++ b/inc/3rdparty/libraries/PHPePub/EPub.php @@ -343,6 +343,7 @@ class EPub { } $fileName = Zip::getRelativePath($fileName); $fileName = preg_replace('#^[/\.]+#i', "", $fileName); + $fileName = $this->sanitizeFileName($fileName); $chapter = $chapterData; if ($autoSplit && is_string($chapterData) && mb_strlen($chapterData) > $this->splitDefaultSize) { @@ -1699,7 +1700,7 @@ class EPub { while (list($chapterName, $navPoint) = each($this->ncx->chapterList)) { $fileName = $navPoint->getContentSrc(); $level = $navPoint->getLevel() -2; - $tocData .= "\t

    " . str_repeat("      ", $level) . "" . $chapterName . "

    \n"; + $tocData .= "\t

    " . str_repeat("      ", $level) . "sanitizeFileName($fileName) . "\">" . $chapterName . "

    \n"; } } else if ($this->tocAddReferences === TRUE) { if (array_key_exists($item, $this->ncx->referencesList)) { -- cgit v1.2.3 From ef17914960191c6008fc70891544ae2182d70582 Mon Sep 17 00:00:00 2001 From: tcit Date: Thu, 24 Apr 2014 09:39:50 +0200 Subject: Fix for #664 - Missing source url attribute in RSS feeds --- inc/3rdparty/libraries/feedwriter/FeedItem.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php index 3487423f..54a56f22 100644 --- a/inc/3rdparty/libraries/feedwriter/FeedItem.php +++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php @@ -174,7 +174,8 @@ */ public function setSource($link) { - $this->setElement('source', $link); + $attributes = array('url'=>$link); + $this->setElement('source', "wallabag",$attributes); } /** -- cgit v1.2.3 From 827f5b42a667a9ac2ab68701c23885ae6e617907 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 24 Apr 2014 11:48:00 +0300 Subject: fix of rss headers problem --- inc/3rdparty/libraries/feedwriter/FeedWriter.php | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index 79639c0c..d708e99b 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php @@ -87,12 +87,25 @@ define('JSONP', 3, true); * @access public * @return void */ - public function genarateFeed() + public function genarateFeed($withHeaders = true) { - header('Content-type: text/xml; charset=UTF-8'); - // this line prevents Chrome 20 from prompting download - // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss - header('X-content-type-options: nosniff'); + if ($withHeaders) { + if ($this->version == RSS2) { + header('Content-type: 
text/xml; charset=UTF-8'); + // this line prevents Chrome 20 from prompting download + // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss + header('X-content-type-options: nosniff'); + } elseif ($this->version == JSON) { + header('Content-type: application/json; charset=UTF-8'); + } elseif ($this->version == JSONP) { + header('Content-type: application/javascript; charset=UTF-8'); + } + } + + if ($this->version == JSON || $this->version == JSONP) { + $this->json = new stdClass(); + } + $this->printHead(); $this->printChannels(); -- cgit v1.2.3 From 4877836b12cde621a9c6200ec460ce025384ea35 Mon Sep 17 00:00:00 2001 From: tcit Date: Wed, 7 May 2014 12:40:09 +0200 Subject: Many improvements to epub produced : better cover, better tags --- inc/3rdparty/libraries/PHPePub/EPub.php | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php index 0260ce4f..e120b341 100644 --- a/inc/3rdparty/libraries/PHPePub/EPub.php +++ b/inc/3rdparty/libraries/PHPePub/EPub.php @@ -574,7 +574,7 @@ class EPub { * @param string $mimetype Image mimetype, such as "image/jpeg" or "image/png". * @return bool $success */ - function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL) { + function setCover($fileName, $imageData = NULL, $mimetype = NULL, $coverText=NULL) { if ($this->isFinalized || $this->isCoverImageSet || array_key_exists("CoverPage.html", $this->fileList)) { return FALSE; } @@ -621,12 +621,13 @@ class EPub { . "\n" . "\t\n" . "\t\t\n" - . "\t\tCover Image\n" + . "\t\tCover\n" . "\t\t\n" . "\t\n" . "\t\n" + . "\t\t" . $coverText . "\n" . "\t\t
    \n" - . "\t\t\t\"Cover\n" + . "\t\t\t\"Cover\n" . "\t\t
    \n" . "\t\n" . "\n"; @@ -635,12 +636,13 @@ class EPub { . "\n" . "" . "\t\n" - . "\t\tCover Image\n" + . "\t\tCover\n" . "\t\t\n" . "\t\n" . "\t\n" . "\t\t
    \n" - . "\t\t\t\"Cover\n" + . "\t\t" . $coverText . "\n" + . "\t\t\t\"Cover\n" . "\t\t
    \n" . "\t\n" . "\n"; -- cgit v1.2.3 From f2b6b4e23064c40cde9e2ad5327499589dee503b Mon Sep 17 00:00:00 2001 From: tcit Date: Wed, 14 May 2014 22:03:16 +0200 Subject: Fix bugs and improved epub rendering --- inc/3rdparty/libraries/PHPePub/EPub.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php index e120b341..f1f41bd5 100644 --- a/inc/3rdparty/libraries/PHPePub/EPub.php +++ b/inc/3rdparty/libraries/PHPePub/EPub.php @@ -574,7 +574,7 @@ class EPub { * @param string $mimetype Image mimetype, such as "image/jpeg" or "image/png". * @return bool $success */ - function setCover($fileName, $imageData = NULL, $mimetype = NULL, $coverText=NULL) { + function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL,$bookTitle) { if ($this->isFinalized || $this->isCoverImageSet || array_key_exists("CoverPage.html", $this->fileList)) { return FALSE; } @@ -621,13 +621,13 @@ class EPub { . "\n" . "\t\n" . "\t\t\n" - . "\t\tCover\n" + . "\t\tCover Image\n" . "\t\t\n" . "\t\n" . "\t\n" - . "\t\t" . $coverText . "\n" + . "\t" . $bookTitle . "\n" . "\t\t
    \n" - . "\t\t\t\"Cover\n" + . "\t\t\t\"Cover\n" . "\t\t
    \n" . "\t\n" . "\n"; @@ -636,13 +636,13 @@ class EPub { . "\n" . "" . "\t\n" - . "\t\tCover\n" + . "\t\tCover Image\n" . "\t\t\n" . "\t\n" . "\t\n" . "\t\t
    \n" - . "\t\t" . $coverText . "\n" - . "\t\t\t\"Cover\n" + . "\t" . $bookTitle . "\n" + . "\t\t\t\"Cover\n" . "\t\t
    \n" . "\t\n" . "\n"; -- cgit v1.2.3 From 3ec62cf95ab4436923d4c665fad7aef226cbb822 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 22 May 2014 17:16:38 +0300 Subject: update to 3.2 version of full-text-rss, issue #694 --- .../content-extractor/ContentExtractor.php | 1455 +++++++------ .../libraries/content-extractor/SiteConfig.php | 681 +++--- inc/3rdparty/libraries/feedwriter/FeedItem.php | 100 +- inc/3rdparty/libraries/feedwriter/FeedWriter.php | 17 +- inc/3rdparty/libraries/html5/TreeBuilder.php | 13 +- .../libraries/humble-http-agent/CookieJar.php | 807 ++++--- .../humble-http-agent/HumbleHttpAgent.php | 1589 +++++++------- .../SimplePie_HumbleHttpAgent.php | 157 +- .../libraries/language-detect/LanguageDetect.php | 992 +++++---- inc/3rdparty/libraries/readability/Readability.php | 2274 ++++++++++---------- 10 files changed, 4097 insertions(+), 3988 deletions(-) mode change 100644 => 100755 inc/3rdparty/libraries/feedwriter/FeedItem.php (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php index ddd33bb5..21e693e7 100644 --- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php +++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php @@ -1,728 +1,727 @@ - true, - 'output-xhtml' => true, - 'logical-emphasis' => true, - 'show-body-only' => false, - 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', - 'new-inline-tags' => 'mark, time, meter, progress, data', - 'wrap' => 0, - 'drop-empty-paras' => true, - 'drop-proprietary-attributes' => false, - 'enclose-text' => true, - 'enclose-block-text' => true, - 'merge-divs' => true, - 'merge-spans' => true, - 'char-encoding' => 'utf8', - 'hide-comments' => true - ); - protected $html; - protected $config; - protected $title; - protected $author = array(); - protected $language; - protected $date; - protected 
$body; - protected $success = false; - protected $nextPageUrl; - public $allowedParsers = array('libxml', 'html5lib'); - public $fingerprints = array(); - public $readability; - public $debug = false; - public $debugVerbose = false; - - function __construct($path, $fallback=null) { - SiteConfig::set_config_path($path, $fallback); - } - - protected function debug($msg) { - if ($this->debug) { - $mem = round(memory_get_usage()/1024, 2); - $memPeak = round(memory_get_peak_usage()/1024, 2); - echo '* ',$msg; - if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; - echo "\n"; - ob_flush(); - flush(); - } - } - - public function reset() { - $this->html = null; - $this->readability = null; - $this->config = null; - $this->title = null; - $this->body = null; - $this->author = array(); - $this->language = null; - $this->date = null; - $this->nextPageUrl = null; - $this->success = false; - } - - public function findHostUsingFingerprints($html) { - $this->debug('Checking fingerprints...'); - $head = substr($html, 0, 8000); - foreach ($this->fingerprints as $_fp => $_fphost) { - $lookin = 'html'; - if (is_array($_fphost)) { - if (isset($_fphost['head']) && $_fphost['head']) { - $lookin = 'head'; - } - $_fphost = $_fphost['hostname']; - } - if (strpos($$lookin, $_fp) !== false) { - $this->debug("Found match: $_fphost"); - return $_fphost; - } - } - $this->debug('No fingerprint matches'); - return false; - } - - // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) - public function buildSiteConfig($url, $html='', $add_to_cache=true) { - // extract host name - $host = @parse_url($url, PHP_URL_HOST); - $host = strtolower($host); - if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); - // is merged version already cached? 
- if (SiteConfig::is_cached("$host.merged")) { - $this->debug("Returning cached and merged site config for $host"); - return SiteConfig::build("$host.merged"); - } - // let's build from site_config/custom/ and standard/ - $config = SiteConfig::build($host); - if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { - SiteConfig::add_to_cache($host, $config); - } - // if no match, use defaults - if (!$config) $config = new SiteConfig(); - // load fingerprint config? - if ($config->autodetect_on_failure()) { - // check HTML for fingerprints - if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { - if ($config_fingerprint = SiteConfig::build($_fphost)) { - $this->debug("Appending site config settings from $_fphost (fingerprint match)"); - $config->append($config_fingerprint); - if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { - //$config_fingerprint->cache_in_apc = true; - SiteConfig::add_to_cache($_fphost, $config_fingerprint); - } - } - } - } - // load global config? - if ($config->autodetect_on_failure()) { - if ($config_global = SiteConfig::build('global', true)) { - $this->debug('Appending site config settings from global.txt'); - $config->append($config_global); - if ($add_to_cache && !SiteConfig::is_cached('global')) { - //$config_global->cache_in_apc = true; - SiteConfig::add_to_cache('global', $config_global); - } - } - } - // store copy of merged config - if ($add_to_cache) { - // do not store in APC if wildcard match - $use_apc = ($host == $config->cache_key); - $config->cache_key = null; - SiteConfig::add_to_cache("$host.merged", $config, $use_apc); - } - return $config; - } - - // returns true on success, false on failure - // $smart_tidy indicates that if tidy is used and no results are produced, we will - // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time - // but it has problems of its own which we try to avoid with this option. 
- public function process($html, $url, $smart_tidy=true) { - $this->reset(); - $this->config = $this->buildSiteConfig($url, $html); - - // do string replacements - if (!empty($this->config->find_string)) { - if (count($this->config->find_string) == count($this->config->replace_string)) { - $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); - $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); - } else { - $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); - } - unset($_count); - } - - // use tidy (if it exists)? - // This fixes problems with some sites which would otherwise - // trouble DOMDocument's HTML parsing. (Although sometimes it - // makes matters worse, which is why you can override it in site config files.) - $tidied = false; - if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { - $this->debug('Using Tidy'); - $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); - if (tidy_clean_repair($tidy)) { - $original_html = $html; - $tidied = true; - $html = $tidy->value; - } - unset($tidy); - } - - // load and parse html - $_parser = $this->config->parser(); - if (!in_array($_parser, $this->allowedParsers)) { - $this->debug("HTML parser $_parser not listed, using libxml instead"); - $_parser = 'libxml'; - } - $this->debug("Attempting to parse HTML with $_parser"); - $this->readability = new Readability($html, $url, $_parser); - - // we use xpath to find elements in the given HTML document - // see http://en.wikipedia.org/wiki/XPath_1.0 - $xpath = new DOMXPath($this->readability->dom); - - // try to get next page link - foreach ($this->config->next_page_link as $pattern) { - $elems = @$xpath->evaluate($pattern, $this->readability->dom); - if (is_string($elems)) { - $this->nextPageUrl = trim($elems); - break; - } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { - foreach ($elems as $item) { 
- if ($item instanceof DOMElement && $item->hasAttribute('href')) { - $this->nextPageUrl = $item->getAttribute('href'); - break 2; - } elseif ($item instanceof DOMAttr && $item->value) { - $this->nextPageUrl = $item->value; - break 2; - } - } - } - } - - // try to get title - foreach ($this->config->title as $pattern) { - // $this->debug("Trying $pattern"); - $elems = @$xpath->evaluate($pattern, $this->readability->dom); - if (is_string($elems)) { - $this->title = trim($elems); - $this->debug('Title expression evaluated as string: '.$this->title); - $this->debug("...XPath match: $pattern"); - break; - } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { - $this->title = $elems->item(0)->textContent; - $this->debug('Title matched: '.$this->title); - $this->debug("...XPath match: $pattern"); - // remove title from document - try { - $elems->item(0)->parentNode->removeChild($elems->item(0)); - } catch (DOMException $e) { - // do nothing - } - break; - } - } - - // try to get author (if it hasn't already been set) - if (empty($this->author)) { - foreach ($this->config->author as $pattern) { - $elems = @$xpath->evaluate($pattern, $this->readability->dom); - if (is_string($elems)) { - if (trim($elems) != '') { - $this->author[] = trim($elems); - $this->debug('Author expression evaluated as string: '.trim($elems)); - $this->debug("...XPath match: $pattern"); - break; - } - } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { - foreach ($elems as $elem) { - if (!isset($elem->parentNode)) continue; - $this->author[] = trim($elem->textContent); - $this->debug('Author matched: '.trim($elem->textContent)); - } - if (!empty($this->author)) { - $this->debug("...XPath match: $pattern"); - break; - } - } - } - } - - // try to get language - $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); - foreach ($_lang_xpath as $pattern) { - $elems = @$xpath->evaluate($pattern, $this->readability->dom); - if (is_string($elems)) { - if 
(trim($elems) != '') { - $this->language = trim($elems); - $this->debug('Language matched: '.$this->language); - break; - } - } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { - foreach ($elems as $elem) { - if (!isset($elem->parentNode)) continue; - $this->language = trim($elem->textContent); - $this->debug('Language matched: '.$this->language); - } - if ($this->language) break; - } - } - - // try to get date - foreach ($this->config->date as $pattern) { - $elems = @$xpath->evaluate($pattern, $this->readability->dom); - if (is_string($elems)) { - $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); - } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { - $this->date = $elems->item(0)->textContent; - $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); - // remove date from document - // $elems->item(0)->parentNode->removeChild($elems->item(0)); - } - if (!$this->date) { - $this->date = null; - } else { - $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); - $this->debug("...XPath match: $pattern"); - break; - } - } - - // strip elements (using xpath expressions) - foreach ($this->config->strip as $pattern) { - $elems = @$xpath->query($pattern, $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' elements (strip)'); - for ($i=$elems->length-1; $i >= 0; $i--) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); - } - } - } - - // strip elements (using id and class attribute values) - foreach ($this->config->strip_id_or_class as $string) { - $string = strtr($string, array("'"=>'', '"'=>'')); - $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' 
elements (strip_id_or_class)'); - for ($i=$elems->length-1; $i >= 0; $i--) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); - } - } - } - - // strip images (using src attribute values) - foreach ($this->config->strip_image_src as $string) { - $string = strtr($string, array("'"=>'', '"'=>'')); - $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' image elements'); - for ($i=$elems->length-1; $i >= 0; $i--) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); - } - } - } - // strip elements using Readability.com and Instapaper.com ignore class names - // .entry-unrelated and .instapaper_ignore - // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines - // and http://blog.instapaper.com/post/730281947 - $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); - for ($i=$elems->length-1; $i >= 0; $i--) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); - } - } - - // strip elements that contain style="display: none;" - $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' 
elements with inline display:none style'); - for ($i=$elems->length-1; $i >= 0; $i--) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); - } - } - - // try to get body - foreach ($this->config->body as $pattern) { - $elems = @$xpath->query($pattern, $this->readability->dom); - // check for matches - if ($elems && $elems->length > 0) { - $this->debug('Body matched'); - $this->debug("...XPath match: $pattern"); - if ($elems->length == 1) { - $this->body = $elems->item(0); - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('...pruning content'); - $this->readability->prepArticle($this->body); - } - break; - } else { - $this->body = $this->readability->dom->createElement('div'); - $this->debug($elems->length.' body elems found'); - foreach ($elems as $elem) { - if (!isset($elem->parentNode)) continue; - $isDescendant = false; - foreach ($this->body->childNodes as $parent) { - if ($this->isDescendant($parent, $elem)) { - $isDescendant = true; - break; - } - } - if ($isDescendant) { - $this->debug('...element is child of another body element, skipping.'); - } else { - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('Pruning content'); - $this->readability->prepArticle($elem); - } - $this->debug('...element added to body'); - $this->body->appendChild($elem); - } - } - if ($this->body->hasChildNodes()) break; - } - } - } - - // auto detect? - $detect_title = $detect_body = $detect_author = $detect_date = false; - // detect title? - if (!isset($this->title)) { - if (empty($this->config->title) || $this->config->autodetect_on_failure()) { - $detect_title = true; - } - } - // detect body? - if (!isset($this->body)) { - if (empty($this->config->body) || $this->config->autodetect_on_failure()) { - $detect_body = true; - } - } - // detect author? 
- if (empty($this->author)) { - if (empty($this->config->author) || $this->config->autodetect_on_failure()) { - $detect_author = true; - } - } - // detect date? - if (!isset($this->date)) { - if (empty($this->config->date) || $this->config->autodetect_on_failure()) { - $detect_date = true; - } - } - - // check for hNews - if ($detect_title || $detect_body) { - // check for hentry - $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); - if ($elems && $elems->length > 0) { - $this->debug('hNews: found hentry'); - $hentry = $elems->item(0); - - if ($detect_title) { - // check for entry-title - $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); - if ($elems && $elems->length > 0) { - $this->title = $elems->item(0)->textContent; - $this->debug('hNews: found entry-title: '.$this->title); - // remove title from document - $elems->item(0)->parentNode->removeChild($elems->item(0)); - $detect_title = false; - } - } - - if ($detect_date) { - // check for time element with pubdate attribute - $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); - if ($elems && $elems->length > 0) { - $this->date = strtotime(trim($elems->item(0)->textContent)); - // remove date from document - //$elems->item(0)->parentNode->removeChild($elems->item(0)); - if ($this->date) { - $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); - $detect_date = false; - } else { - $this->date = null; - } - } - } - - if ($detect_author) { - // check for time element with pubdate attribute - $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); - if ($elems && $elems->length > 0) { - $author = $elems->item(0); 
- $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); - if ($fn && $fn->length > 0) { - foreach ($fn as $_fn) { - if (trim($_fn->textContent) != '') { - $this->author[] = trim($_fn->textContent); - $this->debug('hNews: found author: '.trim($_fn->textContent)); - } - } - } else { - if (trim($author->textContent) != '') { - $this->author[] = trim($author->textContent); - $this->debug('hNews: found author: '.trim($author->textContent)); - } - } - $detect_author = empty($this->author); - } - } - - // check for entry-content. - // according to hAtom spec, if there are multiple elements marked entry-content, - // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content - if ($detect_body) { - $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); - if ($elems && $elems->length > 0) { - $this->debug('hNews: found entry-content'); - if ($elems->length == 1) { - // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) - $e = $elems->item(0); - if (($e->tagName == 'img') || (trim($e->textContent) != '')) { - $this->body = $elems->item(0); - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('Pruning content'); - $this->readability->prepArticle($this->body); - } - $detect_body = false; - } else { - $this->debug('hNews: skipping entry-content - appears not to contain content'); - } - unset($e); - } else { - $this->body = $this->readability->dom->createElement('div'); - $this->debug($elems->length.' 
entry-content elems found'); - foreach ($elems as $elem) { - if (!isset($elem->parentNode)) continue; - $isDescendant = false; - foreach ($this->body->childNodes as $parent) { - if ($this->isDescendant($parent, $elem)) { - $isDescendant = true; - break; - } - } - if ($isDescendant) { - $this->debug('Element is child of another body element, skipping.'); - } else { - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('Pruning content'); - $this->readability->prepArticle($elem); - } - $this->debug('Element added to body'); - $this->body->appendChild($elem); - } - } - $detect_body = false; - } - } - } - } - } - - // check for elements marked with instapaper_title - if ($detect_title) { - // check for instapaper_title - $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); - if ($elems && $elems->length > 0) { - $this->title = $elems->item(0)->textContent; - $this->debug('Title found (.instapaper_title): '.$this->title); - // remove title from document - $elems->item(0)->parentNode->removeChild($elems->item(0)); - $detect_title = false; - } - } - // check for elements marked with instapaper_body - if ($detect_body) { - $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); - if ($elems && $elems->length > 0) { - $this->debug('body found (.instapaper_body)'); - $this->body = $elems->item(0); - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('Pruning content'); - $this->readability->prepArticle($this->body); - } - $detect_body = false; - } - } - - // Find author in rel="author" marked element - // We only use this if there's exactly one. - // If there's more than one, it could indicate more than - // one author, but it could also indicate that we're processing - // a page listing different articles with different authors. 
- if ($detect_author) { - $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); - if ($elems && $elems->length == 1) { - $author = trim($elems->item(0)->textContent); - if ($author != '') { - $this->debug("Author found (rel=\"author\"): $author"); - $this->author[] = $author; - $detect_author = false; - } - } - } - - // Find date in pubdate marked time element - // For the same reason given above, we only use this - // if there's exactly one element. - if ($detect_date) { - $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); - if ($elems && $elems->length == 1) { - $this->date = strtotime(trim($elems->item(0)->textContent)); - // remove date from document - //$elems->item(0)->parentNode->removeChild($elems->item(0)); - if ($this->date) { - $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); - $detect_date = false; - } else { - $this->date = null; - } - } - } - - // still missing title or body, so we detect using Readability - if ($detect_title || $detect_body) { - $this->debug('Using Readability'); - // clone body if we're only using Readability for title (otherwise it may interfere with body element) - if (isset($this->body)) $this->body = $this->body->cloneNode(true); - $success = $this->readability->init(); - } - if ($detect_title) { - $this->debug('Detecting title'); - $this->title = $this->readability->getTitle()->textContent; - } - if ($detect_body && $success) { - $this->debug('Detecting body'); - $this->body = $this->readability->getContent(); - if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { - $this->body = $this->body->firstChild; - } - // prune (clean up elements that may not be content) - if ($this->config->prune()) { - $this->debug('Pruning content'); - $this->readability->prepArticle($this->body); - } - } - if (isset($this->body)) { - // remove scripts - 
$this->readability->removeScripts($this->body); - // remove any h1-h6 elements that appear as first thing in the body - // and which match our title - if (isset($this->title) && ($this->title != '')) { - $firstChild = $this->body->firstChild; - while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { - $firstChild = $firstChild->nextSibling; - } - if (($firstChild->nodeType === XML_ELEMENT_NODE) - && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) - && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { - $this->body->removeChild($firstChild); - } - } - // prevent self-closing iframes - $elems = $this->body->getElementsByTagName('iframe'); - for ($i = $elems->length-1; $i >= 0; $i--) { - $e = $elems->item($i); - if (!$e->hasChildNodes()) { - $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); - } - } - // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ - // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src - // inside the data-lazy-src attribute. It also places the original image inside a noscript element - // next to the amended one. 
- $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); - for ($i = $elems->length-1; $i >= 0; $i--) { - $e = $elems->item($i); - // let's see if we can grab image from noscript - if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { - $_new_elem = $e->ownerDocument->createDocumentFragment(); - @$_new_elem->appendXML($e->nextSibling->innerHTML); - $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); - $e->parentNode->removeChild($e); - } else { - // Use data-lazy-src as src value - $e->setAttribute('src', $e->getAttribute('data-lazy-src')); - $e->removeAttribute('data-lazy-src'); - } - } - - $this->success = true; - } - - // if we've had no success and we've used tidy, there's a chance - // that tidy has messed up. So let's try again without tidy... - if (!$this->success && $tidied && $smart_tidy) { - $this->debug('Trying again without tidy'); - $this->process($original_html, $url, false); - } - - return $this->success; - } - - private function isDescendant(DOMElement $parent, DOMElement $child) { - $node = $child->parentNode; - while ($node != null) { - if ($node->isSameNode($parent)) return true; - $node = $node->parentNode; - } - return false; - } - - public function getContent() { - return $this->body; - } - - public function getTitle() { - return $this->title; - } - - public function getAuthors() { - return $this->author; - } - - public function getLanguage() { - return $this->language; - } - - public function getDate() { - return $this->date; - } - - public function getSiteConfig() { - return $this->config; - } - - public function getNextPageUrl() { - return $this->nextPageUrl; - } -} -?> \ No newline at end of file + true, + 'output-xhtml' => true, + 'logical-emphasis' => true, + 'show-body-only' => false, + 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', + 'new-inline-tags' => 'mark, time, meter, progress, data', + 'wrap' => 0, + 'drop-empty-paras' => 
true, + 'drop-proprietary-attributes' => false, + 'enclose-text' => true, + 'enclose-block-text' => true, + 'merge-divs' => true, + 'merge-spans' => true, + 'char-encoding' => 'utf8', + 'hide-comments' => true + ); + protected $html; + protected $config; + protected $title; + protected $author = array(); + protected $language; + protected $date; + protected $body; + protected $success = false; + protected $nextPageUrl; + public $allowedParsers = array('libxml', 'html5lib'); + public $fingerprints = array(); + public $readability; + public $debug = false; + public $debugVerbose = false; + + function __construct($path, $fallback=null) { + SiteConfig::set_config_path($path, $fallback); + } + + protected function debug($msg) { + if ($this->debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; + echo "\n"; + ob_flush(); + flush(); + } + } + + public function reset() { + $this->html = null; + $this->readability = null; + $this->config = null; + $this->title = null; + $this->body = null; + $this->author = array(); + $this->language = null; + $this->date = null; + $this->nextPageUrl = null; + $this->success = false; + } + + public function findHostUsingFingerprints($html) { + $this->debug('Checking fingerprints...'); + $head = substr($html, 0, 8000); + foreach ($this->fingerprints as $_fp => $_fphost) { + $lookin = 'html'; + if (is_array($_fphost)) { + if (isset($_fphost['head']) && $_fphost['head']) { + $lookin = 'head'; + } + $_fphost = $_fphost['hostname']; + } + if (strpos($$lookin, $_fp) !== false) { + $this->debug("Found match: $_fphost"); + return $_fphost; + } + } + $this->debug('No fingerprint matches'); + return false; + } + + // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) + public function buildSiteConfig($url, $html='', $add_to_cache=true) { + // extract host name + 
$host = @parse_url($url, PHP_URL_HOST); + $host = strtolower($host); + if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); + // is merged version already cached? + if (SiteConfig::is_cached("$host.merged")) { + $this->debug("Returning cached and merged site config for $host"); + return SiteConfig::build("$host.merged"); + } + // let's build from site_config/custom/ and standard/ + $config = SiteConfig::build($host); + if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { + SiteConfig::add_to_cache($host, $config); + } + // if no match, use defaults + if (!$config) $config = new SiteConfig(); + // load fingerprint config? + if ($config->autodetect_on_failure()) { + // check HTML for fingerprints + if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { + if ($config_fingerprint = SiteConfig::build($_fphost)) { + $this->debug("Appending site config settings from $_fphost (fingerprint match)"); + $config->append($config_fingerprint); + if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { + //$config_fingerprint->cache_in_apc = true; + SiteConfig::add_to_cache($_fphost, $config_fingerprint); + } + } + } + } + // load global config? + if ($config->autodetect_on_failure()) { + if ($config_global = SiteConfig::build('global', true)) { + $this->debug('Appending site config settings from global.txt'); + $config->append($config_global); + if ($add_to_cache && !SiteConfig::is_cached('global')) { + //$config_global->cache_in_apc = true; + SiteConfig::add_to_cache('global', $config_global); + } + } + } + // store copy of merged config + if ($add_to_cache) { + // do not store in APC if wildcard match + $use_apc = ($host == $config->cache_key); + $config->cache_key = null; + SiteConfig::add_to_cache("$host.merged", $config, $use_apc); + } + return $config; + } + + // returns true on success, false on failure + // $smart_tidy indicates that if tidy is used and no results are produced, we will + // try again without it. 
Tidy helps us deal with PHP's patchy HTML parsing most of the time + // but it has problems of its own which we try to avoid with this option. + public function process($html, $url, $smart_tidy=true) { + $this->reset(); + $this->config = $this->buildSiteConfig($url, $html); + + // do string replacements + if (!empty($this->config->find_string)) { + if (count($this->config->find_string) == count($this->config->replace_string)) { + $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); + $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); + } else { + $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); + } + unset($_count); + } + + // use tidy (if it exists)? + // This fixes problems with some sites which would otherwise + // trouble DOMDocument's HTML parsing. (Although sometimes it + // makes matters worse, which is why you can override it in site config files.) + $tidied = false; + if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { + $this->debug('Using Tidy'); + $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); + if (tidy_clean_repair($tidy)) { + $original_html = $html; + $tidied = true; + $html = $tidy->value; + } + unset($tidy); + } + + // load and parse html + $_parser = $this->config->parser(); + if (!in_array($_parser, $this->allowedParsers)) { + $this->debug("HTML parser $_parser not listed, using libxml instead"); + $_parser = 'libxml'; + } + $this->debug("Attempting to parse HTML with $_parser"); + $this->readability = new Readability($html, $url, $_parser); + + // we use xpath to find elements in the given HTML document + // see http://en.wikipedia.org/wiki/XPath_1.0 + $xpath = new DOMXPath($this->readability->dom); + + // try to get next page link + foreach ($this->config->next_page_link as $pattern) { + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if (is_string($elems)) { 
+ $this->nextPageUrl = trim($elems); + break; + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + foreach ($elems as $item) { + if ($item instanceof DOMElement && $item->hasAttribute('href')) { + $this->nextPageUrl = $item->getAttribute('href'); + break 2; + } elseif ($item instanceof DOMAttr && $item->value) { + $this->nextPageUrl = $item->value; + break 2; + } + } + } + } + + // try to get title + foreach ($this->config->title as $pattern) { + // $this->debug("Trying $pattern"); + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if (is_string($elems)) { + $this->title = trim($elems); + $this->debug('Title expression evaluated as string: '.$this->title); + $this->debug("...XPath match: $pattern"); + break; + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + $this->title = $elems->item(0)->textContent; + $this->debug('Title matched: '.$this->title); + $this->debug("...XPath match: $pattern"); + // remove title from document + try { + @$elems->item(0)->parentNode->removeChild($elems->item(0)); + } catch (DOMException $e) { + // do nothing + } + break; + } + } + + // try to get author (if it hasn't already been set) + if (empty($this->author)) { + foreach ($this->config->author as $pattern) { + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if (is_string($elems)) { + if (trim($elems) != '') { + $this->author[] = trim($elems); + $this->debug('Author expression evaluated as string: '.trim($elems)); + $this->debug("...XPath match: $pattern"); + break; + } + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + foreach ($elems as $elem) { + if (!isset($elem->parentNode)) continue; + $this->author[] = trim($elem->textContent); + $this->debug('Author matched: '.trim($elem->textContent)); + } + if (!empty($this->author)) { + $this->debug("...XPath match: $pattern"); + break; + } + } + } + } + + // try to get language + $_lang_xpath = array('//html[@lang]/@lang', 
'//meta[@name="DC.language"]/@content'); + foreach ($_lang_xpath as $pattern) { + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if (is_string($elems)) { + if (trim($elems) != '') { + $this->language = trim($elems); + $this->debug('Language matched: '.$this->language); + break; + } + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + foreach ($elems as $elem) { + if (!isset($elem->parentNode)) continue; + $this->language = trim($elem->textContent); + $this->debug('Language matched: '.$this->language); + } + if ($this->language) break; + } + } + + // try to get date + foreach ($this->config->date as $pattern) { + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if (is_string($elems)) { + $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + $this->date = $elems->item(0)->textContent; + $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); + // remove date from document + // $elems->item(0)->parentNode->removeChild($elems->item(0)); + } + if (!$this->date) { + $this->date = null; + } else { + $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); + $this->debug("...XPath match: $pattern"); + break; + } + } + + // strip elements (using xpath expressions) + foreach ($this->config->strip as $pattern) { + $elems = @$xpath->query($pattern, $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' 
elements (strip)'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } + } + + // strip elements (using id and class attribute values) + foreach ($this->config->strip_id_or_class as $string) { + $string = strtr($string, array("'"=>'', '"'=>'')); + $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } + } + + // strip images (using src attribute values) + foreach ($this->config->strip_image_src as $string) { + $string = strtr($string, array("'"=>'', '"'=>'')); + $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' image elements'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } + } + // strip elements using Readability.com and Instapaper.com ignore class names + // .entry-unrelated and .instapaper_ignore + // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines + // and http://blog.instapaper.com/post/730281947 + $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' 
.entry-unrelated,.instapaper_ignore elements'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } + + // strip elements that contain style="display: none;" + $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } + + // try to get body + foreach ($this->config->body as $pattern) { + $elems = @$xpath->query($pattern, $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Body matched'); + $this->debug("...XPath match: $pattern"); + if ($elems->length == 1) { + $this->body = $elems->item(0); + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('...pruning content'); + $this->readability->prepArticle($this->body); + } + break; + } else { + $this->body = $this->readability->dom->createElement('div'); + $this->debug($elems->length.' body elems found'); + foreach ($elems as $elem) { + if (!isset($elem->parentNode)) continue; + $isDescendant = false; + foreach ($this->body->childNodes as $parent) { + if ($this->isDescendant($parent, $elem)) { + $isDescendant = true; + break; + } + } + if ($isDescendant) { + $this->debug('...element is child of another body element, skipping.'); + } else { + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('Pruning content'); + $this->readability->prepArticle($elem); + } + $this->debug('...element added to body'); + $this->body->appendChild($elem); + } + } + if ($this->body->hasChildNodes()) break; + } + } + } + + // auto detect? + $detect_title = $detect_body = $detect_author = $detect_date = false; + // detect title? 
+ if (!isset($this->title)) { + if (empty($this->config->title) || $this->config->autodetect_on_failure()) { + $detect_title = true; + } + } + // detect body? + if (!isset($this->body)) { + if (empty($this->config->body) || $this->config->autodetect_on_failure()) { + $detect_body = true; + } + } + // detect author? + if (empty($this->author)) { + if (empty($this->config->author) || $this->config->autodetect_on_failure()) { + $detect_author = true; + } + } + // detect date? + if (!isset($this->date)) { + if (empty($this->config->date) || $this->config->autodetect_on_failure()) { + $detect_date = true; + } + } + + // check for hNews + if ($detect_title || $detect_body) { + // check for hentry + $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); + if ($elems && $elems->length > 0) { + $this->debug('hNews: found hentry'); + $hentry = $elems->item(0); + + if ($detect_title) { + // check for entry-title + $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); + if ($elems && $elems->length > 0) { + $this->title = $elems->item(0)->textContent; + $this->debug('hNews: found entry-title: '.$this->title); + // remove title from document + $elems->item(0)->parentNode->removeChild($elems->item(0)); + $detect_title = false; + } + } + + if ($detect_date) { + // check for time element with pubdate attribute + $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); + if ($elems && $elems->length > 0) { + $this->date = strtotime(trim($elems->item(0)->textContent)); + // remove date from document + //$elems->item(0)->parentNode->removeChild($elems->item(0)); + if ($this->date) { + $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); + $detect_date = false; + } else { + $this->date = null; + } + } + } + + if ($detect_author) { + // check for time element with 
pubdate attribute + $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); + if ($elems && $elems->length > 0) { + $author = $elems->item(0); + $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); + if ($fn && $fn->length > 0) { + foreach ($fn as $_fn) { + if (trim($_fn->textContent) != '') { + $this->author[] = trim($_fn->textContent); + $this->debug('hNews: found author: '.trim($_fn->textContent)); + } + } + } else { + if (trim($author->textContent) != '') { + $this->author[] = trim($author->textContent); + $this->debug('hNews: found author: '.trim($author->textContent)); + } + } + $detect_author = empty($this->author); + } + } + + // check for entry-content. + // according to hAtom spec, if there are multiple elements marked entry-content, + // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content + if ($detect_body) { + $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); + if ($elems && $elems->length > 0) { + $this->debug('hNews: found entry-content'); + if ($elems->length == 1) { + // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) + $e = $elems->item(0); + if (($e->tagName == 'img') || (trim($e->textContent) != '')) { + $this->body = $elems->item(0); + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('Pruning content'); + $this->readability->prepArticle($this->body); + } + $detect_body = false; + } else { + $this->debug('hNews: skipping entry-content - appears not to contain content'); + } + unset($e); + } else { + $this->body = $this->readability->dom->createElement('div'); + $this->debug($elems->length.' 
entry-content elems found'); + foreach ($elems as $elem) { + if (!isset($elem->parentNode)) continue; + $isDescendant = false; + foreach ($this->body->childNodes as $parent) { + if ($this->isDescendant($parent, $elem)) { + $isDescendant = true; + break; + } + } + if ($isDescendant) { + $this->debug('Element is child of another body element, skipping.'); + } else { + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('Pruning content'); + $this->readability->prepArticle($elem); + } + $this->debug('Element added to body'); + $this->body->appendChild($elem); + } + } + $detect_body = false; + } + } + } + } + } + + // check for elements marked with instapaper_title + if ($detect_title) { + // check for instapaper_title + $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); + if ($elems && $elems->length > 0) { + $this->title = $elems->item(0)->textContent; + $this->debug('Title found (.instapaper_title): '.$this->title); + // remove title from document + $elems->item(0)->parentNode->removeChild($elems->item(0)); + $detect_title = false; + } + } + // check for elements marked with instapaper_body + if ($detect_body) { + $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); + if ($elems && $elems->length > 0) { + $this->debug('body found (.instapaper_body)'); + $this->body = $elems->item(0); + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('Pruning content'); + $this->readability->prepArticle($this->body); + } + $detect_body = false; + } + } + + // Find author in rel="author" marked element + // We only use this if there's exactly one. + // If there's more than one, it could indicate more than + // one author, but it could also indicate that we're processing + // a page listing different articles with different authors. 
+ if ($detect_author) { + $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); + if ($elems && $elems->length == 1) { + $author = trim($elems->item(0)->textContent); + if ($author != '') { + $this->debug("Author found (rel=\"author\"): $author"); + $this->author[] = $author; + $detect_author = false; + } + } + } + + // Find date in pubdate marked time element + // For the same reason given above, we only use this + // if there's exactly one element. + if ($detect_date) { + $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); + if ($elems && $elems->length == 1) { + $this->date = strtotime(trim($elems->item(0)->textContent)); + // remove date from document + //$elems->item(0)->parentNode->removeChild($elems->item(0)); + if ($this->date) { + $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); + $detect_date = false; + } else { + $this->date = null; + } + } + } + + // still missing title or body, so we detect using Readability + if ($detect_title || $detect_body) { + $this->debug('Using Readability'); + // clone body if we're only using Readability for title (otherwise it may interfere with body element) + if (isset($this->body)) $this->body = $this->body->cloneNode(true); + $success = $this->readability->init(); + } + if ($detect_title) { + $this->debug('Detecting title'); + $this->title = $this->readability->getTitle()->textContent; + } + if ($detect_body && $success) { + $this->debug('Detecting body'); + $this->body = $this->readability->getContent(); + if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { + $this->body = $this->body->firstChild; + } + // prune (clean up elements that may not be content) + if ($this->config->prune()) { + $this->debug('Pruning content'); + $this->readability->prepArticle($this->body); + } + } + if (isset($this->body)) { + // remove scripts + 
$this->readability->removeScripts($this->body); + // remove any h1-h6 elements that appear as first thing in the body + // and which match our title + if (isset($this->title) && ($this->title != '')) { + $firstChild = $this->body->firstChild; + while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { + $firstChild = $firstChild->nextSibling; + } + if (($firstChild->nodeType === XML_ELEMENT_NODE) + && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) + && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { + $this->body->removeChild($firstChild); + } + } + // prevent self-closing iframes + $elems = $this->body->getElementsByTagName('iframe'); + for ($i = $elems->length-1; $i >= 0; $i--) { + $e = $elems->item($i); + if (!$e->hasChildNodes()) { + $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); + } + } + // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ + // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src + // inside the data-lazy-src attribute. It also places the original image inside a noscript element + // next to the amended one. 
+ $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); + for ($i = $elems->length-1; $i >= 0; $i--) { + $e = $elems->item($i); + // let's see if we can grab image from noscript + if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { + $_new_elem = $e->ownerDocument->createDocumentFragment(); + @$_new_elem->appendXML($e->nextSibling->innerHTML); + $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); + $e->parentNode->removeChild($e); + } else { + // Use data-lazy-src as src value + $e->setAttribute('src', $e->getAttribute('data-lazy-src')); + $e->removeAttribute('data-lazy-src'); + } + } + + $this->success = true; + } + + // if we've had no success and we've used tidy, there's a chance + // that tidy has messed up. So let's try again without tidy... + if (!$this->success && $tidied && $smart_tidy) { + $this->debug('Trying again without tidy'); + $this->process($original_html, $url, false); + } + + return $this->success; + } + + private function isDescendant(DOMElement $parent, DOMElement $child) { + $node = $child->parentNode; + while ($node != null) { + if ($node->isSameNode($parent)) return true; + $node = $node->parentNode; + } + return false; + } + + public function getContent() { + return $this->body; + } + + public function getTitle() { + return $this->title; + } + + public function getAuthors() { + return $this->author; + } + + public function getLanguage() { + return $this->language; + } + + public function getDate() { + return $this->date; + } + + public function getSiteConfig() { + return $this->config; + } + + public function getNextPageUrl() { + return $this->nextPageUrl; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php @@ -1,338 +1,343 @@ 
-tidy)) ? $this->tidy : $this->default_tidy; - return $this->tidy; - } - - // return bool or null - public function prune($use_default=true) { - if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; - return $this->prune; - } - - // return string or null - public function parser($use_default=true) { - if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; - return $this->parser; - } - - // return bool or null - public function autodetect_on_failure($use_default=true) { - if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; - return $this->autodetect_on_failure; - } - - public static function set_config_path($path, $fallback=null) { - self::$config_path = $path; - self::$config_path_fallback = $fallback; - } - - public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { - $key = strtolower($key); - if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); - if ($config->cache_key) $key = $config->cache_key; - self::$config_cache[$key] = $config; - if (self::$apc && $use_apc) { - self::debug("Adding site config to APC cache with key sc.$key"); - apc_add("sc.$key", $config); - } - self::debug("Cached site config with key $key"); - } - - public static function is_cached($key) { - $key = strtolower($key); - if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); - if (array_key_exists($key, self::$config_cache)) { - return true; - } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { - return true; - } - return false; - } - - public function append(SiteConfig $newconfig) { - // check for commands where we accept multiple statements (no test_url) - foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { - // append array elements for this 
config variable from $newconfig to this config - //$this->$var = $this->$var + $newconfig->$var; - $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); - } - // check for single statement commands - // we do not overwrite existing non null values - foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { - if ($this->$var === null) $this->$var = $newconfig->$var; - } - } - - // returns SiteConfig instance if an appropriate one is found, false otherwise - // if $exact_host_match is true, we will not look for wildcard config matches - // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists - public static function build($host, $exact_host_match=false) { - $host = strtolower($host); - if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); - if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; - // check for site configuration - $try = array($host); - // should we look for wildcard matches - if (!$exact_host_match) { - $split = explode('.', $host); - if (count($split) > 1) { - array_shift($split); - $try[] = '.'.implode('.', $split); - } - } - - // look for site config file in primary folder - self::debug(". looking for site config for $host in primary folder"); - foreach ($try as $h) { - if (array_key_exists($h, self::$config_cache)) { - self::debug("... site config for $h already loaded in this request"); - return self::$config_cache[$h]; - } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { - self::debug("... site config for $h in APC cache"); - return $sconfig; - } elseif (file_exists(self::$config_path."/$h.txt")) { - self::debug("... 
found site config ($h.txt)"); - $file_primary = self::$config_path."/$h.txt"; - $matched_name = $h; - break; - } - } - - // if we found site config, process it - if (isset($file_primary)) { - $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) return false; - $config = self::build_from_array($config_lines); - // if APC caching is available and enabled, mark this for cache - //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - - // if autodetec on failure is off (on by default) we do not need to look - // in secondary folder - if (!$config->autodetect_on_failure()) { - self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); - return $config; - } - } - - // look for site config file in secondary folder - if (isset(self::$config_path_fallback)) { - self::debug(". looking for site config for $host in secondary folder"); - foreach ($try as $h) { - if (file_exists(self::$config_path_fallback."/$h.txt")) { - self::debug("... found site config in secondary folder ($h.txt)"); - $file_secondary = self::$config_path_fallback."/$h.txt"; - $matched_name = $h; - break; - } - } - if (!isset($file_secondary)) { - self::debug("... no site config match in secondary folder"); - } - } - - // return false if no config file found - if (!isset($file_primary) && !isset($file_secondary)) { - self::debug("... 
no site config match for $host"); - return false; - } - - // return primary config if secondary not found - if (!isset($file_secondary) && isset($config)) { - return $config; - } - - // process secondary config file - $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) { - // failed to process secondary - if (isset($config)) { - // return primary config - return $config; - } else { - return false; - } - } - - // merge with primary and return - if (isset($config)) { - self::debug('. merging config files'); - $config->append(self::build_from_array($config_lines)); - return $config; - } else { - // return just secondary - $config = self::build_from_array($config_lines); - // if APC caching is available and enabled, mark this for cache - //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - return $config; - } - } - - public static function build_from_array(array $lines) { - $config = new SiteConfig(); - foreach ($lines as $line) { - $line = trim($line); - - // skip comments, empty lines - if ($line == '' || $line[0] == '#') continue; - - // get command - $command = explode(':', $line, 2); - // if there's no colon ':', skip this line - if (count($command) != 2) continue; - $val = trim($command[1]); - $command = trim($command[0]); - if ($command == '' || $val == '') continue; - - // check for commands where we accept multiple statements - if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { - array_push($config->$command, $val); - // check for single statement commands that evaluate to true or false - } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { - $config->$command = ($val == 'yes'); - // check for single statement commands stored as strings - } 
elseif (in_array($command, array('parser'))) { - $config->$command = $val; - // check for replace_string(find): replace - } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { - if (in_array($match[1], array('replace_string'))) { - $command = $match[1]; - array_push($config->find_string, $match[2]); - array_push($config->$command, $val); - } - } - } - return $config; - } -} -?> \ No newline at end of file +tidy)) ? $this->tidy : $this->default_tidy; + return $this->tidy; + } + + // return bool or null + public function prune($use_default=true) { + if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; + return $this->prune; + } + + // return string or null + public function parser($use_default=true) { + if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; + return $this->parser; + } + + // return bool or null + public function autodetect_on_failure($use_default=true) { + if ($use_default) return (isset($this->autodetect_on_failure)) ? 
$this->autodetect_on_failure : $this->default_autodetect_on_failure; + return $this->autodetect_on_failure; + } + + public static function set_config_path($path, $fallback=null) { + self::$config_path = $path; + self::$config_path_fallback = $fallback; + } + + public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { + $key = strtolower($key); + if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); + if ($config->cache_key) $key = $config->cache_key; + self::$config_cache[$key] = $config; + if (self::$apc && $use_apc) { + self::debug("Adding site config to APC cache with key sc.$key"); + apc_add("sc.$key", $config); + } + self::debug("Cached site config with key $key"); + } + + public static function is_cached($key) { + $key = strtolower($key); + if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); + if (array_key_exists($key, self::$config_cache)) { + return true; + } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { + return true; + } + return false; + } + + public function append(SiteConfig $newconfig) { + // check for commands where we accept multiple statements (no test_url) + foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { + // append array elements for this config variable from $newconfig to this config + //$this->$var = $this->$var + $newconfig->$var; + $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); + } + // check for single statement commands + // we do not overwrite existing non null values + foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { + if ($this->$var === null) $this->$var = $newconfig->$var; + } + // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) 
+ foreach (array('find_string', 'replace_string') as $var) { + // append array elements for this config variable from $newconfig to this config + //$this->$var = $this->$var + $newconfig->$var; + $this->$var = array_merge($this->$var, $newconfig->$var); + } + } + + // returns SiteConfig instance if an appropriate one is found, false otherwise + // if $exact_host_match is true, we will not look for wildcard config matches + // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists + public static function build($host, $exact_host_match=false) { + $host = strtolower($host); + if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); + if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; + // check for site configuration + $try = array($host); + // should we look for wildcard matches + if (!$exact_host_match) { + $split = explode('.', $host); + if (count($split) > 1) { + array_shift($split); + $try[] = '.'.implode('.', $split); + } + } + + // look for site config file in primary folder + self::debug(". looking for site config for $host in primary folder"); + foreach ($try as $h) { + if (array_key_exists($h, self::$config_cache)) { + self::debug("... site config for $h already loaded in this request"); + return self::$config_cache[$h]; + } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { + self::debug("... site config for $h in APC cache"); + return $sconfig; + } elseif (file_exists(self::$config_path."/$h.txt")) { + self::debug("... 
found site config ($h.txt)"); + $file_primary = self::$config_path."/$h.txt"; + $matched_name = $h; + break; + } + } + + // if we found site config, process it + if (isset($file_primary)) { + $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_lines || !is_array($config_lines)) return false; + $config = self::build_from_array($config_lines); + // if APC caching is available and enabled, mark this for cache + //$config->cache_in_apc = true; + $config->cache_key = $matched_name; + + // if autodetec on failure is off (on by default) we do not need to look + // in secondary folder + if (!$config->autodetect_on_failure()) { + self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); + return $config; + } + } + + // look for site config file in secondary folder + if (isset(self::$config_path_fallback)) { + self::debug(". looking for site config for $host in secondary folder"); + foreach ($try as $h) { + if (file_exists(self::$config_path_fallback."/$h.txt")) { + self::debug("... found site config in secondary folder ($h.txt)"); + $file_secondary = self::$config_path_fallback."/$h.txt"; + $matched_name = $h; + break; + } + } + if (!isset($file_secondary)) { + self::debug("... no site config match in secondary folder"); + } + } + + // return false if no config file found + if (!isset($file_primary) && !isset($file_secondary)) { + self::debug("... 
no site config match for $host"); + return false; + } + + // return primary config if secondary not found + if (!isset($file_secondary) && isset($config)) { + return $config; + } + + // process secondary config file + $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_lines || !is_array($config_lines)) { + // failed to process secondary + if (isset($config)) { + // return primary config + return $config; + } else { + return false; + } + } + + // merge with primary and return + if (isset($config)) { + self::debug('. merging config files'); + $config->append(self::build_from_array($config_lines)); + return $config; + } else { + // return just secondary + $config = self::build_from_array($config_lines); + // if APC caching is available and enabled, mark this for cache + //$config->cache_in_apc = true; + $config->cache_key = $matched_name; + return $config; + } + } + + public static function build_from_array(array $lines) { + $config = new SiteConfig(); + foreach ($lines as $line) { + $line = trim($line); + + // skip comments, empty lines + if ($line == '' || $line[0] == '#') continue; + + // get command + $command = explode(':', $line, 2); + // if there's no colon ':', skip this line + if (count($command) != 2) continue; + $val = trim($command[1]); + $command = trim($command[0]); + if ($command == '' || $val == '') continue; + + // check for commands where we accept multiple statements + if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { + array_push($config->$command, $val); + // check for single statement commands that evaluate to true or false + } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { + $config->$command = ($val == 'yes'); + // check for single statement commands stored as strings + } 
elseif (in_array($command, array('parser'))) { + $config->$command = $val; + // check for replace_string(find): replace + } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { + if (in_array($match[1], array('replace_string'))) { + $command = $match[1]; + array_push($config->find_string, $match[2]); + array_push($config->$command, $val); + } + } + } + return $config; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php old mode 100644 new mode 100755 index 54a56f22..40786598 --- a/inc/3rdparty/libraries/feedwriter/FeedItem.php +++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php @@ -1,7 +1,7 @@ version = $version; } /** * Set element (overwrites existing elements with $elementName) - * + * * @access public * @param srting The tag name of an element * @param srting The content of tag @@ -38,11 +38,11 @@ unset($this->elements[$elementName]); } $this->addElement($elementName, $content, $attributes); - } - + } + /** * Add an element to elements array - * + * * @access public * @param srting The tag name of an element * @param srting The content of tag @@ -61,11 +61,11 @@ $this->elements[$elementName][$i]['content'] = $content; $this->elements[$elementName][$i]['attributes'] = $attributes; } - + /** - * Set multiple feed elements from an array. + * Set multiple feed elements from an array. * Elements which have attributes cannot be added by this method - * + * * @access public * @param array array of elements in 'tagName' => 'tagContent' format. * @return void @@ -73,15 +73,15 @@ public function addElementArray($elementArray) { if(! 
is_array($elementArray)) return; - foreach ($elementArray as $elementName => $content) + foreach ($elementArray as $elementName => $content) { $this->addElement($elementName, $content); } } - + /** * Return the collection of elements in this feed item - * + * * @access public * @return array */ @@ -89,68 +89,74 @@ { return $this->elements; } - + // Wrapper functions ------------------------------------------------------ - + /** * Set the 'dscription' element of feed item - * + * * @access public * @param string The content of 'description' element * @return void */ - public function setDescription($description) + public function setDescription($description) { - $this->setElement('description', $description); + $tag = ($this->version == ATOM)? 'summary' : 'description'; + $this->setElement($tag, $description); } - + /** * @desc Set the 'title' element of feed item * @access public * @param string The content of 'title' element * @return void */ - public function setTitle($title) + public function setTitle($title) { - $this->setElement('title', $title); + $this->setElement('title', $title); } - + /** * Set the 'date' element of feed item - * + * * @access public * @param string The content of 'date' element * @return void */ - public function setDate($date) + public function setDate($date) { if(! 
is_numeric($date)) { $date = strtotime($date); } - - if($this->version == RSS2) + + if($this->version == ATOM) + { + $tag = 'updated'; + $value = date(DATE_ATOM, $date); + } + elseif($this->version == RSS2) { - $tag = 'pubDate'; - $value = date(DATE_RSS, $date); + $tag = 'pubDate'; + $value = date(DATE_RSS, $date); } - else + else { - $tag = 'dc:date'; - $value = date("Y-m-d", $date); + $tag = 'dc:date'; + $value = date("Y-m-d", $date); } - - $this->setElement($tag, $value); + + $this->setElement($tag, $value); } - + /** * Set the 'link' element of feed item - * + * * @access public * @param string The content of 'link' element * @return void */ - public function setLink($link) + public function setLink($link) { if($this->version == RSS2 || $this->version == RSS1) { @@ -161,27 +167,27 @@ { $this->setElement('link','',array('href'=>$link)); $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); - } - + } + } /** * Set the 'source' element of feed item - * + * * @access public * @param string The content of 'source' element * @return void */ - public function setSource($link) + public function setSource($link) { $attributes = array('url'=>$link); $this->setElement('source', "wallabag",$attributes); } - + /** * Set the 'encloser' element of feed item * For RSS 2.0 only - * + * * @access public * @param string The url attribute of encloser tag * @param string The length attribute of encloser tag @@ -193,6 +199,6 @@ $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); $this->setElement('enclosure','',$attributes); } - + } // end of class FeedItem ?> \ No newline at end of file diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index d708e99b..77755690 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php @@ -97,15 +97,12 @@ define('JSONP', 3, true); header('X-content-type-options: nosniff'); } elseif ($this->version == JSON) 
{ header('Content-type: application/json; charset=UTF-8'); + $this->json = new stdClass(); } elseif ($this->version == JSONP) { header('Content-type: application/javascript; charset=UTF-8'); + $this->json = new stdClass(); } } - - if ($this->version == JSON || $this->version == JSONP) { - $this->json = new stdClass(); - } - $this->printHead(); $this->printChannels(); @@ -116,6 +113,11 @@ define('JSONP', 3, true); } } + public function &getItems() + { + return $this->items; + } + /** * Create a new FeedItem. * @@ -199,7 +201,8 @@ define('JSONP', 3, true); */ public function setDescription($description) { - $this->setChannelElement('description', $description); + $tag = ($this->version == ATOM)? 'subtitle' : 'description'; + $this->setChannelElement($tag, $desciption); } /** @@ -244,7 +247,7 @@ define('JSONP', 3, true); { $out = ''."\n"; if ($this->xsl) $out .= 'xsl).'"?>' . PHP_EOL; - $out .= '' . PHP_EOL; + $out .= '' . PHP_EOL; echo $out; } elseif ($this->version == JSON || $this->version == JSONP) diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php index 2f5244f9..c4a48b21 100644 --- a/inc/3rdparty/libraries/html5/TreeBuilder.php +++ b/inc/3rdparty/libraries/html5/TreeBuilder.php @@ -134,6 +134,7 @@ class HTML5_TreeBuilder { // Namespaces for foreign content const NS_HTML = null; // to prevent DOM from requiring NS on everything + const NS_XHTML = 'http://www.w3.org/1999/xhtml'; const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; const NS_SVG = 'http://www.w3.org/2000/svg'; const NS_XLINK = 'http://www.w3.org/1999/xlink'; @@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder { } private function insertElement($token, $append = true) { - $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); + //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); + $namespaceURI = strpos($token['name'], ':') ? 
self::NS_XHTML : self::NS_HTML; + $el = $this->dom->createElementNS($namespaceURI, $token['name']); if (!empty($token['attr'])) { foreach($token['attr'] as $attr) { - if(!$el->hasAttribute($attr['name'])) { + + // mike@macgirvin.com 2011-11-17, check attribute name for + // validity (ignoring extenders and combiners) as illegal chars in names + // causes everything to abort + + $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']); + if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { $el->setAttribute($attr['name'], $attr['value']); } } diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php index 83e94f14..e4d5f495 100644 --- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php +++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php @@ -1,404 +1,403 @@ - - * - * This class should be used to handle cookies (storing cookies from HTTP response messages, and - * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org - * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ - * - * This class is mainly based on Cookies.pm from the libwww-perl collection . - * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. - * - * @version 0.5 - * @date 2011-03-15 - * @see http://php.net/HttpRequestPool - * @author Keyvan Minoukadeh - * @copyright 2011 Keyvan Minoukadeh - * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 - */ - -class CookieJar -{ - /** - * Cookies - array containing all cookies. - * - *
    -    * Cookies are stored like this:
    -    *   [domain][path][name] = array
    -    * where array is:
    -    *   0 => value, 1 => secure, 2 => expires
    -    * 
    - * @var array - * @access private - */ - public $cookies = array(); - public $debug = false; - - /** - * Constructor - */ - function __construct() { - } - - protected function debug($msg, $file=null, $line=null) { - if ($this->debug) { - $mem = round(memory_get_usage()/1024, 2); - $memPeak = round(memory_get_peak_usage()/1024, 2); - echo '* ',$msg; - if (isset($file, $line)) echo " ($file line $line)"; - echo ' - mem used: ',$mem," (peak: $memPeak)\n"; - ob_flush(); - flush(); - } - } - - /** - * Get matching cookies - * - * Only use this method if you cannot use add_cookie_header(), for example, if you want to use - * this cookie jar class without using the request class. - * - * @param array $param associative array containing 'domain', 'path', 'secure' keys - * @return string - * @see add_cookie_header() - */ - public function getMatchingCookies($url) - { - if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { - $param['domain'] = $parts['host']; - $param['path'] = $parts['path']; - $param['secure'] = (strtolower($parts['scheme']) == 'https'); - unset($parts); - } else { - return false; - } - // RFC 2965 notes: - // If multiple cookies satisfy the criteria above, they are ordered in - // the Cookie header such that those with more specific Path attributes - // precede those with less specific. Ordering with respect to other - // attributes (e.g., Domain) is unspecified. 
- $domain = $param['domain']; - if (strpos($domain, '.') === false) $domain .= '.local'; - $request_path = $param['path']; - if ($request_path == '') $request_path = '/'; - $request_secure = $param['secure']; - $now = time(); - $matched_cookies = array(); - // domain - find matching domains - $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); - while (strpos($domain, '.') !== false) { - if (isset($this->cookies[$domain])) { - $this->debug(' domain match found: '.$domain); - $cookies =& $this->cookies[$domain]; - } else { - $domain = $this->_reduce_domain($domain); - continue; - } - // paths - find matching paths starting from most specific - $this->debug(' - Finding matching paths for '.$request_path); - $paths = array_keys($cookies); - usort($paths, array($this, '_cmp_length')); - foreach ($paths as $path) { - // continue to next cookie if request path does not path-match cookie path - if (!$this->_path_match($request_path, $path)) continue; - // loop through cookie names - $this->debug(' path match found: '.$path); - foreach ($cookies[$path] as $name => $values) { - // if this cookie is secure but request isn't, continue to next cookie - if ($values[1] && !$request_secure) continue; - // if cookie is not a session cookie and has expired, continue to next cookie - if (is_int($values[2]) && ($values[2] < $now)) continue; - // cookie matches request - $this->debug(' cookie match: '.$name.'='.$values[0]); - $matched_cookies[] = $name.'='.$values[0]; - } - } - $domain = $this->_reduce_domain($domain); - } - // return cookies - return implode('; ', $matched_cookies); - } - - /** - * Parse Set-Cookie values. - * - * Only use this method if you cannot use extract_cookies(), for example, if you want to use - * this cookie jar class without using the response class. 
- * - * @param array $set_cookies array holding 1 or more "Set-Cookie" header values - * @param array $param associative array containing 'host', 'path' keys - * @return void - * @see extract_cookies() - */ - public function storeCookies($url, $set_cookies) - { - if (count($set_cookies) == 0) return; - $param = @parse_url($url); - if (!is_array($param) || !isset($param['host'])) return; - $request_host = $param['host']; - if (strpos($request_host, '.') === false) $request_host .= '.local'; - $request_path = @$param['path']; - if ($request_path == '') $request_path = '/'; - // - // loop through set-cookie headers - // - foreach ($set_cookies as $set_cookie) { - $this->debug('Parsing: '.$set_cookie); - // temporary cookie store (before adding to jar) - $tmp_cookie = array(); - $param = explode(';', $set_cookie); - // loop through params - for ($x=0; $x$key, 'value'=>$val); - continue; - } - $key = strtolower($key); - if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { - $tmp_cookie[$key] = $val; - } - } - // - // set cookie - // - // check domain - if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && - ($tmp_cookie['domain'] != ".$request_host")) { - $domain = $tmp_cookie['domain']; - if ((strpos($domain, '.') === false) && ($domain != 'local')) { - $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); - continue; - } - if (preg_match('/\.[0-9]+$/', $domain)) { - $this->debug(' - domain "'.$domain.'" appears to be an ip address'); - continue; - } - if (substr($domain, 0, 1) != '.') $domain = ".$domain"; - if (!$this->_domain_match($request_host, $domain)) { - $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); - continue; - } - } else { - // if domain is not specified in the set-cookie header, domain will default to - // the request host - $domain = $request_host; - } - // check path - if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { - $path = 
urldecode($tmp_cookie['path']); - if (!$this->_path_match($request_path, $path)) { - $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); - continue; - } - } else { - $path = $request_path; - $path = substr($path, 0, strrpos($path, '/')); - if ($path == '') $path = '/'; - } - // check if secure - $secure = (isset($tmp_cookie['secure'])) ? true : false; - // check expiry - if (isset($tmp_cookie['expires'])) { - if (($expires = strtotime($tmp_cookie['expires'])) < 0) { - $expires = null; - } - } else { - $expires = null; - } - // set cookie - $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); - } - } - - // return array of set-cookie values extracted from HTTP response headers (string $h) - public function extractCookies($h) { - $x = 0; - $lines = 0; - $headers = array(); - $last_match = false; - $h = explode("\n", $h); - foreach ($h as $line) { - $line = rtrim($line); - $lines++; - - $trimmed_line = trim($line); - if (isset($line_last)) { - // check if we have \r\n\r\n (indicating the end of headers) - // some servers will not use CRLF (\r\n), so we make CR (\r) optional. - // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { - // break; - // } - // As an alternative, we can check if the current trimmed line is empty - if ($trimmed_line == '') { - break; - } - - // check for continuation line... - // RFC 2616 Section 2.2 "Basic Rules": - // HTTP/1.1 header field values can be folded onto multiple lines if the - // continuation line begins with a space or horizontal tab. All linear - // white space, including folding, has the same semantics as SP. A - // recipient MAY replace any linear white space with a single SP before - // interpreting the field value or forwarding the message downstream. 
- if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { - // append to previous header value - $headers[$x-1] .= ' '.rtrim($match[1]); - continue; - } - } - $line_last = $line; - - // split header name and value - if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { - $headers[$x++] = rtrim($match[1]); - $last_match = true; - } else { - $last_match = false; - } - } - return $headers; - } - - /** - * Set Cookie - * @param string $domain - * @param string $path - * @param string $name cookie name - * @param string $value cookie value - * @param bool $secure - * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) - * @return void - */ - function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) - { - if ($domain == '') return; - if ($path == '') return; - if ($name == '') return; - // check if cookie needs to go - if (isset($expires) && ($expires <= 0)) { - if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); - return; - } - if ($value == '') return; - $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); - return; - } - - /** - * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. 
- * @param string $domain - * @param string $path - * @param string $name - * @return void - */ - function clear($domain=null, $path=null, $name=null) - { - if (!isset($domain)) { - $this->cookies = array(); - } elseif (!isset($path)) { - if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); - } elseif (!isset($name)) { - if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); - } elseif (isset($name)) { - if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); - } - } - - /** - * Compare string length - used for sorting - * @access private - * @return int - */ - function _cmp_length($a, $b) - { - $la = strlen($a); $lb = strlen($b); - if ($la == $lb) return 0; - return ($la > $lb) ? -1 : 1; - } - - /** - * Reduce domain - * @param string $domain - * @return string - * @access private - */ - function _reduce_domain($domain) - { - if ($domain == '') return ''; - if (substr($domain, 0, 1) == '.') return substr($domain, 1); - return substr($domain, strpos($domain, '.')); - } - - /** - * Path match - check if path1 path-matches path2 - * - * From RFC 2965: - * For two strings that represent paths, P1 and P2, P1 path-matches P2 - * if P2 is a prefix of P1 (including the case where P1 and P2 string- - * compare equal). Thus, the string /tec/waldo path-matches /tec. - * @param string $path1 - * @param string $path2 - * @return bool - * @access private - */ - function _path_match($path1, $path2) - { - return (substr($path1, 0, strlen($path2)) == $path2); - } - - /** - * Domain match - check if domain1 domain-matches domain2 - * - * A few extracts from RFC 2965: - * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com - * would be rejected, because H is y.x and contains a dot. - * - * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com - * would be accepted. 
- * - * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be - * rejected, because there is no embedded dot. - * - * - A Set-Cookie2 from request-host example for Domain=.local will - * be accepted, because the effective host name for the request- - * host is example.local, and example.local domain-matches .local. - * - * I'm ignoring the first point for now (must check to see how other browsers handle - * this rule for Set-Cookie headers) - * - * @param string $domain1 - * @param string $domain2 - * @return bool - * @access private - */ - function _domain_match($domain1, $domain2) - { - $domain1 = strtolower($domain1); - $domain2 = strtolower($domain2); - while (strpos($domain1, '.') !== false) { - if ($domain1 == $domain2) return true; - $domain1 = $this->_reduce_domain($domain1); - continue; - } - return false; - } -} -?> \ No newline at end of file + + * + * This class should be used to handle cookies (storing cookies from HTTP response messages, and + * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org + * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ + * + * This class is mainly based on Cookies.pm from the libwww-perl collection . + * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. + * + * @version 0.5 + * @date 2011-03-15 + * @see http://php.net/HttpRequestPool + * @author Keyvan Minoukadeh + * @copyright 2011 Keyvan Minoukadeh + * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 + */ + +class CookieJar +{ + /** + * Cookies - array containing all cookies. + * + *
    +    * Cookies are stored like this:
    +    *   [domain][path][name] = array
    +    * where array is:
    +    *   0 => value, 1 => secure, 2 => expires
    +    * 
    + * @var array + * @access private + */ + public $cookies = array(); + public $debug = false; + + /** + * Constructor + */ + function __construct() { + } + + protected function debug($msg, $file=null, $line=null) { + if ($this->debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + if (isset($file, $line)) echo " ($file line $line)"; + echo ' - mem used: ',$mem," (peak: $memPeak)\n"; + ob_flush(); + flush(); + } + } + + /** + * Get matching cookies + * + * Only use this method if you cannot use add_cookie_header(), for example, if you want to use + * this cookie jar class without using the request class. + * + * @param array $param associative array containing 'domain', 'path', 'secure' keys + * @return string + * @see add_cookie_header() + */ + public function getMatchingCookies($url) + { + if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { + $param['domain'] = $parts['host']; + $param['path'] = $parts['path']; + $param['secure'] = (strtolower($parts['scheme']) == 'https'); + unset($parts); + } else { + return false; + } + // RFC 2965 notes: + // If multiple cookies satisfy the criteria above, they are ordered in + // the Cookie header such that those with more specific Path attributes + // precede those with less specific. Ordering with respect to other + // attributes (e.g., Domain) is unspecified. 
+ $domain = $param['domain']; + if (strpos($domain, '.') === false) $domain .= '.local'; + $request_path = $param['path']; + if ($request_path == '') $request_path = '/'; + $request_secure = $param['secure']; + $now = time(); + $matched_cookies = array(); + // domain - find matching domains + $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); + while (strpos($domain, '.') !== false) { + if (isset($this->cookies[$domain])) { + $this->debug(' domain match found: '.$domain); + $cookies =& $this->cookies[$domain]; + } else { + $domain = $this->_reduce_domain($domain); + continue; + } + // paths - find matching paths starting from most specific + $this->debug(' - Finding matching paths for '.$request_path); + $paths = array_keys($cookies); + usort($paths, array($this, '_cmp_length')); + foreach ($paths as $path) { + // continue to next cookie if request path does not path-match cookie path + if (!$this->_path_match($request_path, $path)) continue; + // loop through cookie names + $this->debug(' path match found: '.$path); + foreach ($cookies[$path] as $name => $values) { + // if this cookie is secure but request isn't, continue to next cookie + if ($values[1] && !$request_secure) continue; + // if cookie is not a session cookie and has expired, continue to next cookie + if (is_int($values[2]) && ($values[2] < $now)) continue; + // cookie matches request + $this->debug(' cookie match: '.$name.'='.$values[0]); + $matched_cookies[] = $name.'='.$values[0]; + } + } + $domain = $this->_reduce_domain($domain); + } + // return cookies + return implode('; ', $matched_cookies); + } + + /** + * Parse Set-Cookie values. + * + * Only use this method if you cannot use extract_cookies(), for example, if you want to use + * this cookie jar class without using the response class. 
+ * + * @param array $set_cookies array holding 1 or more "Set-Cookie" header values + * @param array $param associative array containing 'host', 'path' keys + * @return void + * @see extract_cookies() + */ + public function storeCookies($url, $set_cookies) + { + if (count($set_cookies) == 0) return; + $param = @parse_url($url); + if (!is_array($param) || !isset($param['host'])) return; + $request_host = $param['host']; + if (strpos($request_host, '.') === false) $request_host .= '.local'; + $request_path = @$param['path']; + if ($request_path == '') $request_path = '/'; + // + // loop through set-cookie headers + // + foreach ($set_cookies as $set_cookie) { + $this->debug('Parsing: '.$set_cookie); + // temporary cookie store (before adding to jar) + $tmp_cookie = array(); + $param = explode(';', $set_cookie); + // loop through params + for ($x=0; $x$key, 'value'=>$val); + continue; + } + $key = strtolower($key); + if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { + $tmp_cookie[$key] = $val; + } + } + // + // set cookie + // + // check domain + if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && + ($tmp_cookie['domain'] != ".$request_host")) { + $domain = $tmp_cookie['domain']; + if ((strpos($domain, '.') === false) && ($domain != 'local')) { + $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); + continue; + } + if (preg_match('/\.[0-9]+$/', $domain)) { + $this->debug(' - domain "'.$domain.'" appears to be an ip address'); + continue; + } + if (substr($domain, 0, 1) != '.') $domain = ".$domain"; + if (!$this->_domain_match($request_host, $domain)) { + $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); + continue; + } + } else { + // if domain is not specified in the set-cookie header, domain will default to + // the request host + $domain = $request_host; + } + // check path + if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { + $path = 
urldecode($tmp_cookie['path']); + if (!$this->_path_match($request_path, $path)) { + $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); + continue; + } + } else { + $path = $request_path; + $path = substr($path, 0, strrpos($path, '/')); + if ($path == '') $path = '/'; + } + // check if secure + $secure = (isset($tmp_cookie['secure'])) ? true : false; + // check expiry + if (isset($tmp_cookie['expires'])) { + if (($expires = strtotime($tmp_cookie['expires'])) < 0) { + $expires = null; + } + } else { + $expires = null; + } + // set cookie + $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); + } + } + + // return array of set-cookie values extracted from HTTP response headers (string $h) + public function extractCookies($h) { + $x = 0; + $lines = 0; + $headers = array(); + $last_match = false; + $h = explode("\n", $h); + foreach ($h as $line) { + $line = rtrim($line); + $lines++; + + $trimmed_line = trim($line); + if (isset($line_last)) { + // check if we have \r\n\r\n (indicating the end of headers) + // some servers will not use CRLF (\r\n), so we make CR (\r) optional. + // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { + // break; + // } + // As an alternative, we can check if the current trimmed line is empty + if ($trimmed_line == '') { + break; + } + + // check for continuation line... + // RFC 2616 Section 2.2 "Basic Rules": + // HTTP/1.1 header field values can be folded onto multiple lines if the + // continuation line begins with a space or horizontal tab. All linear + // white space, including folding, has the same semantics as SP. A + // recipient MAY replace any linear white space with a single SP before + // interpreting the field value or forwarding the message downstream. 
+ if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { + // append to previous header value + $headers[$x-1] .= ' '.rtrim($match[1]); + continue; + } + } + $line_last = $line; + + // split header name and value + if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { + $headers[$x++] = rtrim($match[1]); + $last_match = true; + } else { + $last_match = false; + } + } + return $headers; + } + + /** + * Set Cookie + * @param string $domain + * @param string $path + * @param string $name cookie name + * @param string $value cookie value + * @param bool $secure + * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) + * @return void + */ + function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) + { + if ($domain == '') return; + if ($path == '') return; + if ($name == '') return; + // check if cookie needs to go + if (isset($expires) && ($expires <= 0)) { + if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); + return; + } + if ($value == '') return; + $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); + return; + } + + /** + * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. 
+ * @param string $domain + * @param string $path + * @param string $name + * @return void + */ + function clear($domain=null, $path=null, $name=null) + { + if (!isset($domain)) { + $this->cookies = array(); + } elseif (!isset($path)) { + if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); + } elseif (!isset($name)) { + if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); + } elseif (isset($name)) { + if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); + } + } + + /** + * Compare string length - used for sorting + * @access private + * @return int + */ + function _cmp_length($a, $b) + { + $la = strlen($a); $lb = strlen($b); + if ($la == $lb) return 0; + return ($la > $lb) ? -1 : 1; + } + + /** + * Reduce domain + * @param string $domain + * @return string + * @access private + */ + function _reduce_domain($domain) + { + if ($domain == '') return ''; + if (substr($domain, 0, 1) == '.') return substr($domain, 1); + return substr($domain, strpos($domain, '.')); + } + + /** + * Path match - check if path1 path-matches path2 + * + * From RFC 2965: + * For two strings that represent paths, P1 and P2, P1 path-matches P2 + * if P2 is a prefix of P1 (including the case where P1 and P2 string- + * compare equal). Thus, the string /tec/waldo path-matches /tec. + * @param string $path1 + * @param string $path2 + * @return bool + * @access private + */ + function _path_match($path1, $path2) + { + return (substr($path1, 0, strlen($path2)) == $path2); + } + + /** + * Domain match - check if domain1 domain-matches domain2 + * + * A few extracts from RFC 2965: + * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com + * would be rejected, because H is y.x and contains a dot. + * + * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com + * would be accepted. 
+ * + * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be + * rejected, because there is no embedded dot. + * + * - A Set-Cookie2 from request-host example for Domain=.local will + * be accepted, because the effective host name for the request- + * host is example.local, and example.local domain-matches .local. + * + * I'm ignoring the first point for now (must check to see how other browsers handle + * this rule for Set-Cookie headers) + * + * @param string $domain1 + * @param string $domain2 + * @return bool + * @access private + */ + function _domain_match($domain1, $domain2) + { + $domain1 = strtolower($domain1); + $domain2 = strtolower($domain2); + while (strpos($domain1, '.') !== false) { + if ($domain1 == $domain2) return true; + $domain1 = $this->_reduce_domain($domain1); + continue; + } + return false; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php @@ -1,779 +1,810 @@ -userAgentDefault = self::UA_BROWSER; - $this->referer = self::REF_GOOGLE; - // set the request method - if (in_array($method, array(1,2,4))) { - $this->method = $method; - } else { - if (class_exists('HttpRequestPool')) { - $this->method = self::METHOD_REQUEST_POOL; - } elseif (function_exists('curl_multi_init')) { - $this->method = self::METHOD_CURL_MULTI; - } else { - $this->method = self::METHOD_FILE_GET_CONTENTS; - } - } - if ($this->method == self::METHOD_CURL_MULTI) { - require_once(dirname(__FILE__).'/RollingCurl.php'); - } - // create cookie jar - $this->cookieJar = new CookieJar(); - // set request options (redirect must be 0) - $this->requestOptions = array( - 'timeout' => 15, - 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are 
creeping up over the web - // TODO: test onprogress? - ); - if (is_array($requestOptions)) { - $this->requestOptions = array_merge($this->requestOptions, $requestOptions); - } - $this->httpContext = array( - 'http' => array( - 'ignore_errors' => true, - 'timeout' => $this->requestOptions['timeout'], - 'max_redirects' => $this->requestOptions['redirect'], - 'header' => "Accept: */*\r\n" - ) - ); - } - - protected function debug($msg) { - if ($this->debug) { - $mem = round(memory_get_usage()/1024, 2); - $memPeak = round(memory_get_peak_usage()/1024, 2); - echo '* ',$msg; - if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; - echo "\n"; - ob_flush(); - flush(); - } - } - - protected function getUserAgent($url, $asArray=false) { - $host = @parse_url($url, PHP_URL_HOST); - if (strtolower(substr($host, 0, 4)) == 'www.') { - $host = substr($host, 4); - } - if ($host) { - $try = array($host); - $split = explode('.', $host); - if (count($split) > 1) { - array_shift($split); - $try[] = '.'.implode('.', $split); - } - foreach ($try as $h) { - if (isset($this->userAgentMap[$h])) { - $ua = $this->userAgentMap[$h]; - break; - } - } - } - if (!isset($ua)) $ua = $this->userAgentDefault; - if ($asArray) { - return array('User-Agent' => $ua); - } else { - return 'User-Agent: '.$ua; - } - } - - public function rewriteHashbangFragment($url) { - // return $url if there's no '#!' - if (strpos($url, '#!') === false) return $url; - // split $url and rewrite - // TODO: is SimplePie_IRI included? - $iri = new SimplePie_IRI($url); - $fragment = substr($iri->fragment, 1); // strip '!' 
- $iri->fragment = null; - if (isset($iri->query)) { - parse_str($iri->query, $query); - } else { - $query = array(); - } - $query['_escaped_fragment_'] = (string)$fragment; - $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites - return $iri->get_iri(); - } - - public function getUglyURL($url, $html) { - if ($html == '') return false; - $found = false; - foreach ($this->ajaxTriggers as $string) { - if (stripos($html, $string)) { - $found = true; - break; - } - } - if (!$found) return false; - $iri = new SimplePie_IRI($url); - if (isset($iri->query)) { - parse_str($iri->query, $query); - } else { - $query = array(); - } - $query['_escaped_fragment_'] = ''; - $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites - return $iri->get_iri(); - } - - public function removeFragment($url) { - $pos = strpos($url, '#'); - if ($pos === false) { - return $url; - } else { - return substr($url, 0, $pos); - } - } - - public function rewriteUrls($url) { - foreach ($this->rewriteUrls as $find => $action) { - if (strpos($url, $find) !== false) { - if (is_array($action)) { - return strtr($url, $action); - } - } - } - return $url; - } - - public function enableDebug($bool=true) { - $this->debug = (bool)$bool; - } - - public function minimiseMemoryUse($bool = true) { - $this->minimiseMemoryUse = $bool; - } - - public function setMaxParallelRequests($max) { - $this->maxParallelRequests = $max; - } - - public function validateUrl($url) { - $url = filter_var($url, FILTER_SANITIZE_URL); - $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); - // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) - if ($test === false) { - $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); - } - if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { - return $url; - } else { - return false; - } - } - - public function 
fetchAll(array $urls) { - $this->fetchAllOnce($urls, $isRedirect=false); - $redirects = 0; - while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { - $this->debug("Following redirects #$redirects..."); - $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); - } - } - - // fetch all URLs without following redirects - public function fetchAllOnce(array $urls, $isRedirect=false) { - if (!$isRedirect) $urls = array_unique($urls); - if (empty($urls)) return; - - ////////////////////////////////////////////////////// - // parallel (HttpRequestPool) - if ($this->method == self::METHOD_REQUEST_POOL) { - $this->debug('Starting parallel fetch (HttpRequestPool)'); - try { - while (count($urls) > 0) { - $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); - $subset = array_splice($urls, 0, $this->maxParallelRequests); - $pool = new HttpRequestPool(); - foreach ($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - $this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("......adding to pool"); - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { - $_meth = HttpRequest::METH_HEAD; - } else { - $_meth = HttpRequest::METH_GET; - unset($this->requests[$orig]['wrongGuess']); - } - $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); - // send cookies, if we have any - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpRequest->addHeaders(array('Cookie' => $cookies)); - } - //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); - $httpRequest->addHeaders($this->getUserAgent($req_url, true)); - // add referer for picky sites - $httpRequest->addheaders(array('Referer' => $this->referer)); - $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); - $this->requests[$orig]['original_url'] = $orig; - $pool->attach($httpRequest); - } - } - // did we get anything into the pool? - if (count($pool) > 0) { - $this->debug('Sending request...'); - try { - $pool->send(); - } catch (HttpRequestPoolException $e) { - // do nothing - } - $this->debug('Received responses'); - foreach($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - $request = $this->requests[$orig]['httpRequest']; - //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); - // getResponseHeader() doesn't return status line, so, for consistency... 
- $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); - // check content type - // TODO: use getResponseHeader('content-type') or getResponseInfo() - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - $_header_only_type = true; - $this->debug('Header only type returned'); - } else { - $this->requests[$orig]['body'] = $request->getResponseBody(); - $_header_only_type = false; - } - $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); - $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); - // is redirect? - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { - $redirectURL = $request->getResponseHeader('location'); - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. Valid URL: '.$redirectURL); - // store any cookies - $cookies = $request->getResponseHeader('set-cookie'); - if ($cookies && !is_array($cookies)) $cookies = array($cookies); - if ($cookies) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { - // the response content-type did not match our 'header only' types, - // but we'd issues a HEAD request because we assumed it would. So - // let's queue a proper GET request for this item... 
- $this->debug('Wrong guess at content-type, queing GET request'); - $this->requests[$orig]['wrongGuess'] = true; - $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - //die($url.' -multi- '.$request->getResponseInfo('effective_url')); - $pool->detach($request); - unset($this->requests[$orig]['httpRequest'], $request); - /* - if ($this->minimiseMemoryUse) { - if ($this->cache($url)) { - unset($this->requests[$url]); - } - } - */ - } - } - } - } catch (HttpException $e) { - $this->debug($e); - return false; - } - } - - ////////////////////////////////////////////////////////// - // parallel (curl_multi_*) - elseif ($this->method == self::METHOD_CURL_MULTI) { - $this->debug('Starting parallel fetch (curl_multi_*)'); - while (count($urls) > 0) { - $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); - $subset = array_splice($urls, 0, $this->maxParallelRequests); - $pool = new RollingCurl(array($this, 'handleCurlResponse')); - $pool->window_size = count($subset); - - foreach ($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - 
$this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("......adding to pool"); - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { - $_meth = 'HEAD'; - } else { - $_meth = 'GET'; - unset($this->requests[$orig]['wrongGuess']); - } - $headers = array(); - //$headers[] = 'User-Agent: '.$this->userAgent; - $headers[] = $this->getUserAgent($req_url); - // add referer for picky sites - $headers[] = 'Referer: '.$this->referer; - // send cookies, if we have any - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $headers[] = 'Cookie: '.$cookies; - } - $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( - CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], - CURLOPT_TIMEOUT => $this->requestOptions['timeout'] - )); - $httpRequest->set_original_url($orig); - $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); - $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? - $pool->add($httpRequest); - } - } - // did we get anything into the pool? 
- if (count($pool) > 0) { - $this->debug('Sending request...'); - $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] - $this->debug('Received responses'); - foreach($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - // $this->requests[$orig]['headers'] - // $this->requests[$orig]['body'] - // $this->requests[$orig]['effective_url'] - // check content type - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - $_header_only_type = true; - $this->debug('Header only type returned'); - } else { - $_header_only_type = false; - } - $status_code = $this->requests[$orig]['status_code']; - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { - $redirectURL = $this->requests[$orig]['location']; - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. Valid URL: '.$redirectURL); - // store any cookies - $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); - if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { - // the response content-type did not match our 'header only' types, - // but we'd issues a HEAD request because we assumed it would. So - // let's queue a proper GET request for this item... 
- $this->debug('Wrong guess at content-type, queing GET request'); - $this->requests[$orig]['wrongGuess'] = true; - $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - // die($url.' -multi- '.$request->getResponseInfo('effective_url')); - unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); - } - } - } - } - - ////////////////////////////////////////////////////// - // sequential (file_get_contents) - else { - $this->debug('Starting sequential fetch (file_get_contents)'); - $this->debug('Processing set of '.count($urls)); - foreach ($urls as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - $this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("Sending request for $url"); - $this->requests[$orig]['original_url'] = $orig; - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - // send cookies, if we have any - $httpContext = $this->httpContext; - $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; - // add referer for picky sites - $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; - } - if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { - $this->debug('Received response'); - // get status code - if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { - $this->debug('Error: no status code found'); - // TODO: handle error - no status code - } else { - $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); - // check content type - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - } else { - $this->requests[$orig]['body'] = $html; - } - $this->requests[$orig]['effective_url'] = $req_url; - $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; - unset($match); - // handle redirect - if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { - $this->requests[$orig]['location'] = trim($match[1]); - } - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { - $redirectURL = $this->requests[$orig]['location']; - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. 
Valid URL: '.$redirectURL); - // store any cookies - $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); - if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - } - } else { - $this->debug('Error retrieving URL'); - //print_r($req_url); - //print_r($http_response_header); - //print_r($html); - - // TODO: handle error - failed to retrieve URL - } - } - } - } - } - - public function handleCurlResponse($response, $info, $request) { - $orig = $request->url_original; - $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); - $this->requests[$orig]['body'] = substr($response, $info['header_size']); - $this->requests[$orig]['method'] = $request->method; - $this->requests[$orig]['effective_url'] = $info['url']; - $this->requests[$orig]['status_code'] = (int)$info['http_code']; - if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { - $this->requests[$orig]['location'] = trim($match[1]); - } - } - - protected function headersToString(array $headers, $associative=true) { - if (!$associative) { - return implode("\n", $headers); - } else { - $str = ''; - foreach ($headers as $key => $val) { - if (is_array($val)) { - foreach ($val as 
$v) $str .= "$key: $v\n"; - } else { - $str .= "$key: $val\n"; - } - } - return rtrim($str); - } - } - - public function get($url, $remove=false, $gzdecode=true) { - $url = "$url"; - if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { - $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); - $response = $this->requests[$url]; - /* - } elseif ($this->isCached($url)) { - $this->debug("URL already fetched - in disk cache ($url)"); - $response = $this->getCached($url); - $this->requests[$url] = $response; - */ - } else { - $this->debug("Fetching URL ($url)"); - $this->fetchAll(array($url)); - if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { - $response = $this->requests[$url]; - } else { - $this->debug("Request failed"); - $response = false; - } - } - /* - if ($this->minimiseMemoryUse && $response) { - $this->cache($url); - unset($this->requests[$url]); - } - */ - if ($remove && $response) unset($this->requests[$url]); - if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { - if ($html = gzdecode($response['body'])) { - $response['body'] = $html; - } - } - return $response; - } - - public function parallelSupport() { - return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); - } - - private function headerOnlyType($headers) { - if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { - // look for full mime type (e.g. image/jpeg) or just type (e.g. 
image) - $match[1] = strtolower(trim($match[1])); - $match[2] = strtolower(trim($match[2])); - foreach (array($match[1], $match[2]) as $mime) { - if (in_array($mime, $this->headerOnlyTypes)) return true; - } - } - return false; - } - - private function possibleUnsupportedType($url) { - $path = @parse_url($url, PHP_URL_PATH); - if ($path && strpos($path, '.') !== false) { - $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); - return in_array($ext, $this->headerOnlyClues); - } - return false; - } -} - -// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 -if (!function_exists('gzdecode')) { - function gzdecode($data,&$filename='',&$error='',$maxlength=null) - { - $len = strlen($data); - if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { - $error = "Not in GZIP format."; - return null; // Not GZIP format (See RFC 1952) - } - $method = ord(substr($data,2,1)); // Compression method - $flags = ord(substr($data,3,1)); // Flags - if ($flags & 31 != $flags) { - $error = "Reserved bits not allowed."; - return null; - } - // NOTE: $mtime may be negative (PHP integer limitations) - $mtime = unpack("V", substr($data,4,4)); - $mtime = $mtime[1]; - $xfl = substr($data,8,1); - $os = substr($data,8,1); - $headerlen = 10; - $extralen = 0; - $extra = ""; - if ($flags & 4) { - // 2-byte length prefixed EXTRA data in header - if ($len - $headerlen - 2 < 8) { - return false; // invalid - } - $extralen = unpack("v",substr($data,8,2)); - $extralen = $extralen[1]; - if ($len - $headerlen - 2 - $extralen < 8) { - return false; // invalid - } - $extra = substr($data,10,$extralen); - $headerlen += 2 + $extralen; - } - $filenamelen = 0; - $filename = ""; - if ($flags & 8) { - // C-style string - if ($len - $headerlen - 1 < 8) { - return false; // invalid - } - $filenamelen = strpos(substr($data,$headerlen),chr(0)); - if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { - return false; // invalid - } - $filename = 
substr($data,$headerlen,$filenamelen); - $headerlen += $filenamelen + 1; - } - $commentlen = 0; - $comment = ""; - if ($flags & 16) { - // C-style string COMMENT data in header - if ($len - $headerlen - 1 < 8) { - return false; // invalid - } - $commentlen = strpos(substr($data,$headerlen),chr(0)); - if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { - return false; // Invalid header format - } - $comment = substr($data,$headerlen,$commentlen); - $headerlen += $commentlen + 1; - } - $headercrc = ""; - if ($flags & 2) { - // 2-bytes (lowest order) of CRC32 on header present - if ($len - $headerlen - 2 < 8) { - return false; // invalid - } - $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; - $headercrc = unpack("v", substr($data,$headerlen,2)); - $headercrc = $headercrc[1]; - if ($headercrc != $calccrc) { - $error = "Header checksum failed."; - return false; // Bad header CRC - } - $headerlen += 2; - } - // GZIP FOOTER - $datacrc = unpack("V",substr($data,-8,4)); - $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); - $isize = unpack("V",substr($data,-4)); - $isize = $isize[1]; - // decompression: - $bodylen = $len-$headerlen-8; - if ($bodylen < 1) { - // IMPLEMENTATION BUG! - return null; - } - $body = substr($data,$headerlen,$bodylen); - $data = ""; - if ($bodylen > 0) { - switch ($method) { - case 8: - // Currently the only supported compression method: - $data = gzinflate($body,$maxlength); - break; - default: - $error = "Unknown compression method."; - return false; - } - } // zero-byte body content is allowed - // Verifiy CRC32 - $crc = sprintf("%u",crc32($data)); - $crcOK = $crc == $datacrc; - $lenOK = $isize == strlen($data); - if (!$lenOK || !$crcOK) { - $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? 
'' : 'Checksum FAILED.'); - return false; - } - return $data; - } -} -?> \ No newline at end of file +userAgentDefault = self::UA_BROWSER; + $this->referer = self::REF_GOOGLE; + // set the request method + if (in_array($method, array(1,2,4))) { + $this->method = $method; + } else { + if (class_exists('HttpRequestPool')) { + $this->method = self::METHOD_REQUEST_POOL; + } elseif (function_exists('curl_multi_init')) { + $this->method = self::METHOD_CURL_MULTI; + } else { + $this->method = self::METHOD_FILE_GET_CONTENTS; + } + } + if ($this->method == self::METHOD_CURL_MULTI) { + require_once(dirname(__FILE__).'/RollingCurl.php'); + } + // create cookie jar + $this->cookieJar = new CookieJar(); + // set request options (redirect must be 0) + $this->requestOptions = array( + 'timeout' => 15, + 'connecttimeout' => 15, + 'dns_cache_timeout' => 300, + 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web + // TODO: test onprogress? 
+ ); + if (is_array($requestOptions)) { + $this->requestOptions = array_merge($this->requestOptions, $requestOptions); + } + $this->httpContext = array( + 'http' => array( + 'ignore_errors' => true, + 'timeout' => $this->requestOptions['timeout'], + 'max_redirects' => $this->requestOptions['redirect'], + 'header' => "Accept: */*\r\n" + ) + ); + } + + protected function debug($msg) { + if ($this->debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; + echo "\n"; + ob_flush(); + flush(); + } + } + + protected function getUserAgent($url, $asArray=false) { + $host = @parse_url($url, PHP_URL_HOST); + if (strtolower(substr($host, 0, 4)) == 'www.') { + $host = substr($host, 4); + } + if ($host) { + $try = array($host); + $split = explode('.', $host); + if (count($split) > 1) { + array_shift($split); + $try[] = '.'.implode('.', $split); + } + foreach ($try as $h) { + if (isset($this->userAgentMap[$h])) { + $ua = $this->userAgentMap[$h]; + break; + } + } + } + if (!isset($ua)) $ua = $this->userAgentDefault; + if ($asArray) { + return array('User-Agent' => $ua); + } else { + return 'User-Agent: '.$ua; + } + } + + public function rewriteHashbangFragment($url) { + // return $url if there's no '#!' + if (strpos($url, '#!') === false) return $url; + // split $url and rewrite + // TODO: is SimplePie_IRI included? + $iri = new SimplePie_IRI($url); + $fragment = substr($iri->fragment, 1); // strip '!' 
+ $iri->fragment = null; + if (isset($iri->query)) { + parse_str($iri->query, $query); + } else { + $query = array(); + } + $query['_escaped_fragment_'] = (string)$fragment; + $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites + return $iri->get_iri(); + } + + public function getRedirectURLfromHTML($url, $html) { + $redirect_url = $this->getMetaRefreshURL($url, $html); + if (!$redirect_url) { + $redirect_url = $this->getUglyURL($url, $html); + } + return $redirect_url; + } + + public function getMetaRefreshURL($url, $html) { + if ($html == '') return false; + // + if (!preg_match('!]+)["\']*>!i', $html, $match)) { + return false; + } + $redirect_url = $match[1]; + if (preg_match('!^https?://!i', $redirect_url)) { + // already absolute + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); + return $redirect_url; + } + // absolutize redirect URL + $base = new SimplePie_IRI($url); + // remove '//' in URL path (causes URLs not to resolve properly) + if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); + if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); + return $absolute; + } + return false; + } + + public function getUglyURL($url, $html) { + if ($html == '') return false; + $found = false; + foreach ($this->ajaxTriggers as $string) { + if (stripos($html, $string)) { + $found = true; + break; + } + } + if (!$found) return false; + $iri = new SimplePie_IRI($url); + if (isset($iri->query)) { + parse_str($iri->query, $query); + } else { + $query = array(); + } + $query['_escaped_fragment_'] = ''; + $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites + $ugly_url = $iri->get_iri(); + $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); + return $ugly_url; + } + + public function 
removeFragment($url) { + $pos = strpos($url, '#'); + if ($pos === false) { + return $url; + } else { + return substr($url, 0, $pos); + } + } + + public function rewriteUrls($url) { + foreach ($this->rewriteUrls as $find => $action) { + if (strpos($url, $find) !== false) { + if (is_array($action)) { + return strtr($url, $action); + } + } + } + return $url; + } + + public function enableDebug($bool=true) { + $this->debug = (bool)$bool; + } + + public function minimiseMemoryUse($bool = true) { + $this->minimiseMemoryUse = $bool; + } + + public function setMaxParallelRequests($max) { + $this->maxParallelRequests = $max; + } + + public function validateUrl($url) { + $url = filter_var($url, FILTER_SANITIZE_URL); + $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) + if ($test === false) { + $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + } + if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { + return $url; + } else { + return false; + } + } + + public function fetchAll(array $urls) { + $this->fetchAllOnce($urls, $isRedirect=false); + $redirects = 0; + while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { + $this->debug("Following redirects #$redirects..."); + $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); + } + } + + // fetch all URLs without following redirects + public function fetchAllOnce(array $urls, $isRedirect=false) { + if (!$isRedirect) $urls = array_unique($urls); + if (empty($urls)) return; + + ////////////////////////////////////////////////////// + // parallel (HttpRequestPool) + if ($this->method == self::METHOD_REQUEST_POOL) { + $this->debug('Starting parallel fetch (HttpRequestPool)'); + try { + while (count($urls) > 0) { + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); + $subset = array_splice($urls, 0, 
$this->maxParallelRequests); + $pool = new HttpRequestPool(); + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("......adding to pool"); + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { + $_meth = HttpRequest::METH_HEAD; + } else { + $_meth = HttpRequest::METH_GET; + unset($this->requests[$orig]['wrongGuess']); + } + $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); + // send cookies, if we have any + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $httpRequest->addHeaders(array('Cookie' => $cookies)); + } + //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); + $httpRequest->addHeaders($this->getUserAgent($req_url, true)); + // add referer for picky sites + $httpRequest->addheaders(array('Referer' => $this->referer)); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; + $pool->attach($httpRequest); + } + } + // did we get anything into the pool? 
+ if (count($pool) > 0) { + $this->debug('Sending request...'); + try { + $pool->send(); + } catch (HttpRequestPoolException $e) { + // do nothing + } + $this->debug('Received responses'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + $request = $this->requests[$orig]['httpRequest']; + //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); + // getResponseHeader() doesn't return status line, so, for consistency... + $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); + // check content type + // TODO: use getResponseHeader('content-type') or getResponseInfo() + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + $_header_only_type = true; + $this->debug('Header only type returned'); + } else { + $this->requests[$orig]['body'] = $request->getResponseBody(); + $_header_only_type = false; + } + $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); + $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); + // is redirect? + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { + $redirectURL = $request->getResponseHeader('location'); + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + $cookies = $request->getResponseHeader('set-cookie'); + if ($cookies && !is_array($cookies)) $cookies = array($cookies); + if ($cookies) $this->cookieJar->storeCookies($url, $cookies); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. 
Invalid URL: '.$redirectURL); + } + } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { + // the response content-type did not match our 'header only' types, + // but we'd issues a HEAD request because we assumed it would. So + // let's queue a proper GET request for this item... + $this->debug('Wrong guess at content-type, queing GET request'); + $this->requests[$orig]['wrongGuess'] = true; + $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; + } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { + // check for + // for AJAX sites, e.g. Blogger with its dynamic views templates. + // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification + if (isset($this->requests[$orig]['body'])) { + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + if ($redirectURL) { + $this->redirectQueue[$orig] = $redirectURL; + } + } + } + //die($url.' 
-multi- '.$request->getResponseInfo('effective_url')); + $pool->detach($request); + unset($this->requests[$orig]['httpRequest'], $request); + /* + if ($this->minimiseMemoryUse) { + if ($this->cache($url)) { + unset($this->requests[$url]); + } + } + */ + } + } + } + } catch (HttpException $e) { + $this->debug($e); + return false; + } + } + + ////////////////////////////////////////////////////////// + // parallel (curl_multi_*) + elseif ($this->method == self::METHOD_CURL_MULTI) { + $this->debug('Starting parallel fetch (curl_multi_*)'); + while (count($urls) > 0) { + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); + $subset = array_splice($urls, 0, $this->maxParallelRequests); + $pool = new RollingCurl(array($this, 'handleCurlResponse')); + $pool->window_size = count($subset); + + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("......adding to pool"); + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { + $_meth = 'HEAD'; + } else { + $_meth = 'GET'; + unset($this->requests[$orig]['wrongGuess']); + } + $headers = array(); + //$headers[] = 'User-Agent: '.$this->userAgent; + $headers[] = $this->getUserAgent($req_url); + // add referer for picky sites + $headers[] = 'Referer: '.$this->referer; + // send cookies, if we have any + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $headers[] = 'Cookie: '.$cookies; + } + $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( + CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], + CURLOPT_TIMEOUT => $this->requestOptions['timeout'] + )); + $httpRequest->set_original_url($orig); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? + $pool->add($httpRequest); + } + } + // did we get anything into the pool? 
+ if (count($pool) > 0) { + $this->debug('Sending request...'); + $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] + $this->debug('Received responses'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + // $this->requests[$orig]['headers'] + // $this->requests[$orig]['body'] + // $this->requests[$orig]['effective_url'] + // check content type + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + $_header_only_type = true; + $this->debug('Header only type returned'); + } else { + $_header_only_type = false; + } + $status_code = $this->requests[$orig]['status_code']; + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); + if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { + // the response content-type did not match our 'header only' types, + // but we'd issues a HEAD request because we assumed it would. So + // let's queue a proper GET request for this item... 
+ $this->debug('Wrong guess at content-type, queing GET request'); + $this->requests[$orig]['wrongGuess'] = true; + $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; + } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { + // check for + // for AJAX sites, e.g. Blogger with its dynamic views templates. + // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification + if (isset($this->requests[$orig]['body'])) { + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + if ($redirectURL) { + $this->redirectQueue[$orig] = $redirectURL; + } + } + } + // die($url.' -multi- '.$request->getResponseInfo('effective_url')); + unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); + } + } + } + } + + ////////////////////////////////////////////////////// + // sequential (file_get_contents) + else { + $this->debug('Starting sequential fetch (file_get_contents)'); + $this->debug('Processing set of '.count($urls)); + foreach ($urls as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("Sending request for $url"); + $this->requests[$orig]['original_url'] = $orig; + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + // send cookies, if we have any + $httpContext = $this->httpContext; + $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; + // add referer for picky sites + $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; + } + if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { + $this->debug('Received response'); + // get status code + if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { + $this->debug('Error: no status code found'); + // TODO: handle error - no status code + } else { + $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); + // check content type + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + } else { + $this->requests[$orig]['body'] = $html; + } + $this->requests[$orig]['effective_url'] = $req_url; + $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; + unset($match); + // handle redirect + if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { + $this->requests[$orig]['location'] = trim($match[1]); + } + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. 
Valid URL: '.$redirectURL); + // store any cookies + $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); + if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { + // check for + // for AJAX sites, e.g. Blogger with its dynamic views templates. + // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification + if (isset($this->requests[$orig]['body'])) { + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + if ($redirectURL) { + $this->redirectQueue[$orig] = $redirectURL; + } + } + } + } + } else { + $this->debug('Error retrieving URL'); + //print_r($req_url); + //print_r($http_response_header); + //print_r($html); + + // TODO: handle error - failed to retrieve URL + } + } + } + } + } + + public function handleCurlResponse($response, $info, $request) { + $orig = $request->url_original; + $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); + $this->requests[$orig]['body'] = substr($response, $info['header_size']); + $this->requests[$orig]['method'] = $request->method; + $this->requests[$orig]['effective_url'] = $info['url']; + $this->requests[$orig]['status_code'] = (int)$info['http_code']; + if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { + $this->requests[$orig]['location'] = trim($match[1]); + } + } + + protected function headersToString(array $headers, $associative=true) { + if (!$associative) { + return implode("\n", $headers); + } else { + $str = ''; + foreach ($headers as $key => $val) { + if (is_array($val)) { + foreach ($val as $v) $str .= "$key: $v\n"; + } else { + $str .= "$key: $val\n"; + } + } + return 
rtrim($str); + } + } + + public function get($url, $remove=false, $gzdecode=true) { + $url = "$url"; + if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { + $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); + $response = $this->requests[$url]; + /* + } elseif ($this->isCached($url)) { + $this->debug("URL already fetched - in disk cache ($url)"); + $response = $this->getCached($url); + $this->requests[$url] = $response; + */ + } else { + $this->debug("Fetching URL ($url)"); + $this->fetchAll(array($url)); + if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { + $response = $this->requests[$url]; + } else { + $this->debug("Request failed"); + $response = false; + } + } + /* + if ($this->minimiseMemoryUse && $response) { + $this->cache($url); + unset($this->requests[$url]); + } + */ + if ($remove && $response) unset($this->requests[$url]); + if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { + if ($html = gzdecode($response['body'])) { + $response['body'] = $html; + } + } + return $response; + } + + public function parallelSupport() { + return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); + } + + private function headerOnlyType($headers) { + if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { + // look for full mime type (e.g. image/jpeg) or just type (e.g. 
image) + $match[1] = strtolower(trim($match[1])); + $match[2] = strtolower(trim($match[2])); + foreach (array($match[1], $match[2]) as $mime) { + if (in_array($mime, $this->headerOnlyTypes)) return true; + } + } + return false; + } + + private function possibleUnsupportedType($url) { + $path = @parse_url($url, PHP_URL_PATH); + if ($path && strpos($path, '.') !== false) { + $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); + return in_array($ext, $this->headerOnlyClues); + } + return false; + } +} + +// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 +if (!function_exists('gzdecode')) { + function gzdecode($data,&$filename='',&$error='',$maxlength=null) + { + $len = strlen($data); + if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { + $error = "Not in GZIP format."; + return null; // Not GZIP format (See RFC 1952) + } + $method = ord(substr($data,2,1)); // Compression method + $flags = ord(substr($data,3,1)); // Flags + if ($flags & 31 != $flags) { + $error = "Reserved bits not allowed."; + return null; + } + // NOTE: $mtime may be negative (PHP integer limitations) + $mtime = unpack("V", substr($data,4,4)); + $mtime = $mtime[1]; + $xfl = substr($data,8,1); + $os = substr($data,8,1); + $headerlen = 10; + $extralen = 0; + $extra = ""; + if ($flags & 4) { + // 2-byte length prefixed EXTRA data in header + if ($len - $headerlen - 2 < 8) { + return false; // invalid + } + $extralen = unpack("v",substr($data,8,2)); + $extralen = $extralen[1]; + if ($len - $headerlen - 2 - $extralen < 8) { + return false; // invalid + } + $extra = substr($data,10,$extralen); + $headerlen += 2 + $extralen; + } + $filenamelen = 0; + $filename = ""; + if ($flags & 8) { + // C-style string + if ($len - $headerlen - 1 < 8) { + return false; // invalid + } + $filenamelen = strpos(substr($data,$headerlen),chr(0)); + if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { + return false; // invalid + } + $filename = 
substr($data,$headerlen,$filenamelen); + $headerlen += $filenamelen + 1; + } + $commentlen = 0; + $comment = ""; + if ($flags & 16) { + // C-style string COMMENT data in header + if ($len - $headerlen - 1 < 8) { + return false; // invalid + } + $commentlen = strpos(substr($data,$headerlen),chr(0)); + if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { + return false; // Invalid header format + } + $comment = substr($data,$headerlen,$commentlen); + $headerlen += $commentlen + 1; + } + $headercrc = ""; + if ($flags & 2) { + // 2-bytes (lowest order) of CRC32 on header present + if ($len - $headerlen - 2 < 8) { + return false; // invalid + } + $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; + $headercrc = unpack("v", substr($data,$headerlen,2)); + $headercrc = $headercrc[1]; + if ($headercrc != $calccrc) { + $error = "Header checksum failed."; + return false; // Bad header CRC + } + $headerlen += 2; + } + // GZIP FOOTER + $datacrc = unpack("V",substr($data,-8,4)); + $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); + $isize = unpack("V",substr($data,-4)); + $isize = $isize[1]; + // decompression: + $bodylen = $len-$headerlen-8; + if ($bodylen < 1) { + // IMPLEMENTATION BUG! + return null; + } + $body = substr($data,$headerlen,$bodylen); + $data = ""; + if ($bodylen > 0) { + switch ($method) { + case 8: + // Currently the only supported compression method: + $data = gzinflate($body,$maxlength); + break; + default: + $error = "Unknown compression method."; + return false; + } + } // zero-byte body content is allowed + // Verifiy CRC32 + $crc = sprintf("%u",crc32($data)); + $crcOK = $crc == $datacrc; + $lenOK = $isize == strlen($data); + if (!$lenOK || !$crcOK) { + $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? 
'' : 'Checksum FAILED.'); + return false; + } + return $data; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php index ecd46d5f..c524a1ee 100644 --- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php @@ -1,79 +1,78 @@ -encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); - } - $this->url = $url; - $this->useragent = $useragent; - if (preg_match('/^http(s)?:\/\//i', $url)) - { - if (!is_array($headers)) - { - $headers = array(); - } - $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; - $headers2 = array(); - foreach ($headers as $key => $value) { - $headers2[] = "$key: $value"; - } - //TODO: allow for HTTP headers - // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); - - $response = self::$agent->get($url); - - if ($response === false || !isset($response['status_code'])) { - $this->error = 'failed to fetch URL'; - $this->success = false; - } else { - // The extra lines at the end are there to satisfy SimplePie's HTTP parser. - // The class expects a full HTTP message, whereas we're giving it only - // headers - the new lines indicate the start of the body. 
- $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); - if ($parser->parse()) { - $this->headers = $parser->headers; - //$this->body = $parser->body; - $this->body = $response['body']; - $this->status_code = $parser->status_code; - } - } - } - else - { - $this->error = 'invalid URL'; - $this->success = false; - } - } -} -?> \ No newline at end of file +encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); + } + $this->url = $url; + $this->useragent = $useragent; + if (preg_match('/^http(s)?:\/\//i', $url)) + { + if (!is_array($headers)) + { + $headers = array(); + } + $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; + $headers2 = array(); + foreach ($headers as $key => $value) { + $headers2[] = "$key: $value"; + } + //TODO: allow for HTTP headers + // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); + + $response = self::$agent->get($url); + + if ($response === false || !isset($response['status_code'])) { + $this->error = 'failed to fetch URL'; + $this->success = false; + } else { + // The extra lines at the end are there to satisfy SimplePie's HTTP parser. + // The class expects a full HTTP message, whereas we're giving it only + // headers - the new lines indicate the start of the body. 
+ $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); + if ($parser->parse()) { + $this->headers = $parser->headers; + //$this->body = $parser->body; + $this->body = $response['body']; + $this->status_code = $parser->status_code; + } + } + } + else + { + $this->error = 'invalid URL'; + $this->success = false; + } + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php index 09b11546..382d869c 100644 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php @@ -6,23 +6,24 @@ * Attempts to detect the language of a sample of text by correlating ranked * 3-gram frequencies to a table of 3-gram frequencies of known languages. * - * Implements a version of a technique originally proposed by Cavnar & Trenkle - * (1994): "N-Gram-Based Text Categorization" + * Implements a version of a technique originally proposed by Cavnar & Trenkle + * (1994): "N-Gram-Based Text Categorization" * - * PHP versions 4 and 5 + * PHP version 5 * - * @category Text - * @package Text_LanguageDetect - * @author Nicholas Pisarro - * @copyright 2005-2006 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD - * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ - * @link http://pear.php.net/package/Text_LanguageDetect/ - * @link http://langdetect.blogspot.com/ + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @copyright 2005-2006 Nicholas Pisarro + * @license http://www.debian.org/misc/bsd.license BSD + * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ + * @link http://pear.php.net/package/Text_LanguageDetect/ + * @link http://langdetect.blogspot.com/ */ -//require_once 'PEAR.php'; -require_once 'Parser.php'; +require_once 'LanguageDetect/Exception.php'; +require_once 'LanguageDetect/Parser.php'; 
+require_once 'LanguageDetect/ISO639.php'; /** * Language detection class @@ -41,9 +42,10 @@ require_once 'Parser.php'; * * echo "Supported languages:\n"; * - * $langs = $l->getLanguages(); - * if (PEAR::isError($langs)) { - * die($langs->getMessage()); + * try { + * $langs = $l->getLanguages(); + * } catch (Text_LanguageDetect_Exception $e) { + * die($e->getMessage()); * } * * sort($langs); @@ -54,38 +56,38 @@ require_once 'Parser.php'; * } * * - * @category Text - * @package Text_LanguageDetect - * @author Nicholas Pisarro - * @copyright 2005 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD - * @version Release: @package_version@ - * @todo allow users to generate their own language models + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @copyright 2005 Nicholas Pisarro + * @license http://www.debian.org/misc/bsd.license BSD + * @version Release: @package_version@ + * @link http://pear.php.net/package/Text_LanguageDetect/ + * @todo allow users to generate their own language models */ - class Text_LanguageDetect { - /** + /** * The filename that stores the trigram data for the detector * - * If this value starts with a slash (/) or a dot (.) the value of + * If this value starts with a slash (/) or a dot (.) the value of * $this->_data_dir will be ignored - * + * * @var string * @access private */ - var $_db_filename = './lang.dat'; + var $_db_filename = 'lang.dat'; /** * The filename that stores the unicode block definitions * - * If this value starts with a slash (/) or a dot (.) the value of + * If this value starts with a slash (/) or a dot (.) 
the value of * $this->_data_dir will be ignored - * + * * @var string * @access private */ - var $_unicode_db_filename = './unicode_blocks.dat'; + var $_unicode_db_filename = 'unicode_blocks.dat'; /** * The data directory @@ -99,11 +101,8 @@ class Text_LanguageDetect /** * The trigram data for comparison - * - * Will be loaded on start from $this->_db_filename * - * May be set to a PEAR_Error object if there is an error during its - * initialization + * Will be loaded on start from $this->_db_filename * * @var array * @access private @@ -120,7 +119,7 @@ class Text_LanguageDetect /** * The size of the trigram data arrays - * + * * @var int * @access private */ @@ -140,7 +139,7 @@ class Text_LanguageDetect /** * Whether or not to simulate perl's Language::Guess exactly - * + * * @access private * @var bool * @see setPerlCompatible() @@ -164,19 +163,25 @@ class Text_LanguageDetect */ var $_clusters; + /** + * Which type of "language names" are accepted and returned: + * + * 0 - language name ("english") + * 2 - 2-letter ISO 639-1 code ("en") + * 3 - 3-letter ISO 639-2 code ("eng") + */ + var $_name_mode = 0; + /** * Constructor * * Will attempt to load the language database. If it fails, you will get - * a PEAR_Error object returned when you try to use detect() - * + * an exception. 
*/ - function Text_LanguageDetect($db=null, $unicode_db=null) + function __construct() { - if (isset($db)) $this->_db_filename = $db; - if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; - $data = $this->_readdb($this->_db_filename); + $this->_checkTrigram($data['trigram']); $this->_lang_db = $data['trigram']; if (isset($data['trigram-unicodemap'])) { @@ -186,29 +191,32 @@ class Text_LanguageDetect // Not yet implemented: if (isset($data['trigram-clusters'])) { $this->_clusters = $data['trigram-clusters']; - } + } } /** * Returns the path to the location of the database * - * @access private - * @return string expected path to the language model database + * @param string $fname File name to load + * + * @return string expected path to the language model database + * @access private */ function _get_data_loc($fname) { - return $fname; + return dirname(__FILE__).'/'.$fname; } /** * Loads the language trigram database from filename * * Trigram datbase should be a serialize()'d array - * - * @access private - * @param string $fname the filename where the data is stored - * @return array the language model data - * @throws PEAR_Error + * + * @param string $fname the filename where the data is stored + * + * @return array the language model data + * @throws Text_LanguageDetect_Exception + * @access private */ function _readdb($fname) { @@ -217,79 +225,74 @@ class Text_LanguageDetect // input check if (!file_exists($fname)) { - throw new Exception('Language database does not exist.'); + throw new Text_LanguageDetect_Exception( + 'Language database does not exist: ' . $fname, + Text_LanguageDetect_Exception::DB_NOT_FOUND + ); } elseif (!is_readable($fname)) { - throw new Exception('Language database is not readable.'); + throw new Text_LanguageDetect_Exception( + 'Language database is not readable: ' . 
$fname, + Text_LanguageDetect_Exception::DB_NOT_READABLE + ); } - if (function_exists('file_get_contents')) { - return unserialize(file_get_contents($fname)); - } else { - // if you don't have file_get_contents(), - // then this is the next fastest way - ob_start(); - readfile($fname); - $contents = ob_get_contents(); - ob_end_clean(); - return unserialize($contents); - } + return unserialize(file_get_contents($fname)); } /** * Checks if this object is ready to detect languages - * - * @access private - * @param mixed &$err error object to be returned by reference, if any - * @return bool true if no errors + * + * @param array $trigram Trigram data from database + * + * @return void + * @access private */ - function _setup_ok(&$err) + function _checkTrigram($trigram) { - if (!is_array($this->_lang_db)) { + if (!is_array($trigram)) { if (ini_get('magic_quotes_runtime')) { - throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); - } else { - throw new Exception('Language database is not an array.'); + throw new Text_LanguageDetect_Exception( + 'Error loading database. 
Try turning magic_quotes_runtime off.', + Text_LanguageDetect_Exception::MAGIC_QUOTES + ); } - return false; - - } elseif (empty($this->_lang_db)) { - throw new Exception('Language database has no elements.'); - return false; - - } else { - return true; + throw new Text_LanguageDetect_Exception( + 'Language database is not an array.', + Text_LanguageDetect_Exception::DB_NOT_ARRAY + ); + } elseif (empty($trigram)) { + throw new Text_LanguageDetect_Exception( + 'Language database has no elements.', + Text_LanguageDetect_Exception::DB_EMPTY + ); } } /** * Omits languages * - * Pass this function the name of or an array of names of + * Pass this function the name of or an array of names of * languages that you don't want considered * - * If you're only expecting a limited set of languages, this can greatly + * If you're only expecting a limited set of languages, this can greatly * speed up processing * - * @access public - * @param mixed $omit_list language name or array of names to omit - * @param bool $include_only if true will include (rather than - * exclude) only those in the list - * @return int number of languages successfully deleted - * @throws PEAR_Error + * @param mixed $omit_list language name or array of names to omit + * @param bool $include_only if true will include (rather than + * exclude) only those in the list + * + * @return int number of languages successfully deleted + * @throws Text_LanguageDetect_Exception */ - function omitLanguages($omit_list, $include_only = false) + public function omitLanguages($omit_list, $include_only = false) { - - // setup check - if (!$this->_setup_ok($err)) { - return $err; - } - $deleted = 0; - // deleting the given languages + $omit_list = $this->_convertFromNameMode($omit_list); + if (!$include_only) { + // deleting the given languages if (!is_array($omit_list)) { $omit_list = strtolower($omit_list); // case desensitize if (isset($this->_lang_db[$omit_list])) { @@ -301,12 +304,12 @@ class Text_LanguageDetect if 
(isset($this->_lang_db[$omit_lang])) { unset($this->_lang_db[$omit_lang]); $deleted++; - } + } } } - // deleting all except the given languages } else { + // deleting all except the given languages if (!is_array($omit_list)) { $omit_list = array($omit_list); } @@ -327,7 +330,7 @@ class Text_LanguageDetect // reset the cluster cache if the number of languages changes // this will then have to be recalculated if (isset($this->_clusters) && $deleted > 0) { - unset($this->_clusters); + $this->_clusters = null; } return $deleted; @@ -339,49 +342,40 @@ class Text_LanguageDetect * * @access public * @return int the number of languages - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ function getLanguageCount() { - if (!$this->_setup_ok($err)) { - return $err; - } else { - return count($this->_lang_db); - } + return count($this->_lang_db); } /** - * Returns true if a given language exists + * Checks if the language with the given name exists in the database * - * If passed an array of names, will return true only if all exist + * @param mixed $lang Language name or array of language names * - * @access public - * @param mixed $lang language name or array of language names - * @return bool true if language model exists - * @throws PEAR_Error + * @return bool true if language model exists */ - function languageExists($lang) + public function languageExists($lang) { - if (!$this->_setup_ok($err)) { - return $err; - } else { - // string - if (is_string($lang)) { - return isset($this->_lang_db[strtolower($lang)]); - - // array - } elseif (is_array($lang)) { - foreach ($lang as $test_lang) { - if (!isset($this->_lang_db[strtolower($test_lang)])) { - return false; - } - } - return true; + $lang = $this->_convertFromNameMode($lang); - // other (error) - } else { - throw new Exception('Unknown type passed to languageExists()'); + if (is_string($lang)) { + return isset($this->_lang_db[strtolower($lang)]); + + } elseif (is_array($lang)) { + foreach ($lang as 
$test_lang) { + if (!isset($this->_lang_db[strtolower($test_lang)])) { + return false; + } } + return true; + + } else { + throw new Text_LanguageDetect_Exception( + 'Unsupported parameter type passed to languageExists()', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } } @@ -389,25 +383,24 @@ class Text_LanguageDetect * Returns the list of detectable languages * * @access public - * @return array the names of the languages known to this object - * @throws PEAR_Error + * @return array the names of the languages known to this object<<<<<<< + * @throws Text_LanguageDetect_Exception */ function getLanguages() { - if (!$this->_setup_ok($err)) { - return $err; - } else { - return array_keys($this->_lang_db); - } + return $this->_convertToNameMode( + array_keys($this->_lang_db) + ); } /** * Make this object behave like Language::Guess - * - * @access public - * @param bool $setting false to turn off perl compatibility + * + * @param bool $setting false to turn off perl compatibility + * + * @return void */ - function setPerlCompatible($setting = true) + public function setPerlCompatible($setting = true) { if (is_bool($setting)) { // input check $this->_perl_compatible = $setting; @@ -421,6 +414,21 @@ class Text_LanguageDetect } + /** + * Sets the way how language names are accepted and returned. + * + * @param integer $name_mode One of the following modes: + * 0 - language name ("english") + * 2 - 2-letter ISO 639-1 code ("en") + * 3 - 3-letter ISO 639-2 code ("eng") + * + * @return void + */ + function setNameMode($name_mode) + { + $this->_name_mode = $name_mode; + } + /** * Whether to use unicode block ranges in detection * @@ -429,10 +437,11 @@ class Text_LanguageDetect * in languages that use latin scripts. In other cases it should speed up * detection noticeably. 
* - * @access public - * @param bool $setting false to turn off + * @param bool $setting false to turn off + * + * @return void */ - function useUnicodeBlocks($setting = true) + public function useUnicodeBlocks($setting = true) { if (is_bool($setting)) { $this->_use_unicode_narrowing = $setting; @@ -442,15 +451,15 @@ class Text_LanguageDetect /** * Converts a piece of text into trigrams * - * Superceded by the Text_LanguageDetect_Parser class + * @param string $text text to convert * - * @access private - * @param string $text text to convert - * @return array array of trigram frequencies + * @return array array of trigram frequencies + * @access private + * @deprecated Superceded by the Text_LanguageDetect_Parser class */ function _trigram($text) { - $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); + $s = new Text_LanguageDetect_Parser($text); $s->prepareTrigram(); $s->prepareUnicode(false); $s->setPadStart(!$this->_perl_compatible); @@ -463,11 +472,12 @@ class Text_LanguageDetect * * Thresholds (cuts off) the list at $this->_threshold * - * @access protected - * @param array $arr array of trgram - * @return array ranks of trigrams + * @param array $arr array of trigram + * + * @return array ranks of trigrams + * @access protected */ - function _arr_rank(&$arr) + function _arr_rank($arr) { // sorts alphabetically first as a standard way of breaking rank ties @@ -494,14 +504,17 @@ class Text_LanguageDetect /** * Sorts an array by value breaking ties alphabetically - * - * @access private - * @param array &$arr the array to sort + * + * @param array &$arr the array to sort + * + * @return void + * @access private */ function _bub_sort(&$arr) { // should do the same as this perl statement: - // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } + // sort { $trigrams{$b} == $trigrams{$a} + // ? 
$a cmp $b : $trigrams{$b} <=> $trigrams{$a} } // needs to sort by both key and value at once // using the key to break ties for the value @@ -528,13 +541,14 @@ class Text_LanguageDetect /** * Sort function used by bubble sort * - * Callback function for usort(). + * Callback function for usort(). * - * @access private - * @param array first param passed by usort() - * @param array second param passed by usort() - * @return int 1 if $a is greater, -1 if not - * @see _bub_sort() + * @param array $a first param passed by usort() + * @param array $b second param passed by usort() + * + * @return int 1 if $a is greater, -1 if not + * @see _bub_sort() + * @access private */ function _sort_func($a, $b) { @@ -542,12 +556,12 @@ class Text_LanguageDetect list($a_key, $a_value) = $a; list($b_key, $b_value) = $b; - // if the values are the same, break ties using the key if ($a_value == $b_value) { + // if the values are the same, break ties using the key return strcmp($a_key, $b_key); - // if not, just sort normally } else { + // if not, just sort normally if ($a_value > $b_value) { return -1; } else { @@ -559,23 +573,24 @@ class Text_LanguageDetect } /** - * Calculates a linear rank-order distance statistic between two sets of + * Calculates a linear rank-order distance statistic between two sets of * ranked trigrams * - * Sums the differences in rank for each trigram. If the trigram does not + * Sums the differences in rank for each trigram. If the trigram does not * appear in both, consider it a difference of $this->_threshold. * * This distance measure was proposed by Cavnar & Trenkle (1994). Despite * its simplicity it has been shown to be highly accurate for language * identification tasks. 
* - * @access private - * @param array $arr1 the reference set of trigram ranks - * @param array $arr2 the target set of trigram ranks - * @return int the sum of the differences between the ranks of - * the two trigram sets + * @param array $arr1 the reference set of trigram ranks + * @param array $arr2 the target set of trigram ranks + * + * @return int the sum of the differences between the ranks of + * the two trigram sets + * @access private */ - function _distance(&$arr1, &$arr2) + function _distance($arr1, $arr2) { $sumdist = 0; @@ -598,14 +613,15 @@ class Text_LanguageDetect /** * Normalizes the score returned by _distance() - * + * * Different if perl compatible or not * - * @access private - * @param int $score the score from _distance() - * @param int $base_count the number of trigrams being considered - * @return float the normalized score - * @see _distance() + * @param int $score the score from _distance() + * @param int $base_count the number of trigrams being considered + * + * @return float the normalized score + * @see _distance() + * @access private */ function _normalize_score($score, $base_count = null) { @@ -630,29 +646,24 @@ class Text_LanguageDetect * * If perl compatible, the score is 300-0, 0 being most similar. * Otherwise, it's 0-1 with 1 being most similar. - * + * * The $sample text should be at least a few sentences in length; * should be ascii-7 or utf8 encoded, if another and the mbstring extension * is present it will try to detect and convert. However, experience has - * shown that mb_detect_encoding() *does not work very well* with at least + * shown that mb_detect_encoding() *does not work very well* with at least * some types of encoding. * - * @access public - * @param string $sample a sample of text to compare. - * @param int $limit if specified, return an array of the most likely - * $limit languages and their scores. 
- * @return mixed sorted array of language scores, blank array if no - * useable text was found, or PEAR_Error if error - * with the object setup - * @see _distance() - * @throws PEAR_Error + * @param string $sample a sample of text to compare. + * @param int $limit if specified, return an array of the most likely + * $limit languages and their scores. + * + * @return mixed sorted array of language scores, blank array if no + * useable text was found + * @see _distance() + * @throws Text_LanguageDetect_Exception */ - function detect($sample, $limit = 0) + public function detect($sample, $limit = 0) { - if (!$this->_setup_ok($err)) { - return $err; - } - // input check if (!Text_LanguageDetect_Parser::validateString($sample)) { return array(); @@ -660,36 +671,27 @@ class Text_LanguageDetect // check char encoding // (only if mbstring extension is compiled and PHP > 4.0.6) - if (function_exists('mb_detect_encoding') - && function_exists('mb_convert_encoding')) { - + if (function_exists('mb_detect_encoding') + && function_exists('mb_convert_encoding') + ) { // mb_detect_encoding isn't very reliable, to say the least - // detection should still work with a sufficient sample of ascii characters + // detection should still work with a sufficient sample + // of ascii characters $encoding = mb_detect_encoding($sample); // mb_detect_encoding() will return FALSE if detection fails // don't attempt conversion if that's the case - if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { - - if (function_exists('mb_list_encodings')) { - - // verify the encoding exists in mb_list_encodings - if (in_array($encoding, mb_list_encodings())) { - $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); - } - - // if the previous condition failed: - // somehow we detected an encoding that also we don't support - - } else { - // php 4 doesnt have mb_list_encodings() - // so attempt with error suppression - $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); 
+ if ($encoding != 'ASCII' && $encoding != 'UTF-8' + && $encoding !== false + ) { + // verify the encoding exists in mb_list_encodings + if (in_array($encoding, mb_list_encodings())) { + $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); } } } - $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($sample); $sample_obj->prepareTrigram(); if ($this->_use_unicode_narrowing) { $sample_obj->prepareUnicode(); @@ -713,7 +715,10 @@ class Text_LanguageDetect if (is_array($blocks)) { $present_blocks = array_keys($blocks); } else { - throw new Exception('Error during block detection'); + throw new Text_LanguageDetect_Exception( + 'Error during block detection', + Text_LanguageDetect_Exception::BLOCK_DETECTION + ); } $possible_langs = array(); @@ -731,30 +736,30 @@ class Text_LanguageDetect } // could also try an intersect operation rather than a union - // in other words, choose languages whose trigrams contain + // in other words, choose languages whose trigrams contain // ALL of the unicode blocks found in this sample // would improve speed but would be completely thrown off by an // unexpected character, like an umlaut appearing in english text $possible_langs = array_intersect( - array_keys($this->_lang_db), - array_unique($possible_langs) + array_keys($this->_lang_db), + array_unique($possible_langs) ); - // needs to intersect it with the keys of _lang_db in case + // needs to intersect it with the keys of _lang_db in case // languages have been omitted - // or just try 'em all } else { + // or just try 'em all $possible_langs = array_keys($this->_lang_db); } foreach ($possible_langs as $lang) { - $scores[$lang] = - $this->_normalize_score( - $this->_distance($this->_lang_db[$lang], $trigram_freqs), - $trigram_count); + $scores[$lang] = $this->_normalize_score( + $this->_distance($this->_lang_db[$lang], $trigram_freqs), + $trigram_count + ); } unset($sample_obj); 
@@ -772,7 +777,6 @@ class Text_LanguageDetect $limited_scores = array(); $i = 0; - foreach ($scores as $key => $value) { if ($i++ >= $limit) { break; @@ -781,9 +785,9 @@ class Text_LanguageDetect $limited_scores[$key] = $value; } - return $limited_scores; + return $this->_convertToNameMode($limited_scores, true); } else { - return $scores; + return $this->_convertToNameMode($scores, true); } } @@ -791,35 +795,33 @@ class Text_LanguageDetect * Returns only the most similar language to the text sample * * Calls $this->detect() and returns only the top result - * - * @access public - * @param string $sample text to detect the language of - * @return string the name of the most likely language - * or null if no language is similar - * @see detect() - * @throws PEAR_Error + * + * @param string $sample text to detect the language of + * + * @return string the name of the most likely language + * or null if no language is similar + * @see detect() + * @throws Text_LanguageDetect_Exception */ - function detectSimple($sample) + public function detectSimple($sample) { $scores = $this->detect($sample, 1); // if top language has the maximum possible score, // then the top score will have been picked at random - if ( !is_array($scores) - || empty($scores) - || current($scores) == $this->_max_score) { - + if (!is_array($scores) || empty($scores) + || current($scores) == $this->_max_score + ) { return null; - } else { - return ucfirst(key($scores)); + return key($scores); } } /** * Returns an array containing the most similar language and a confidence * rating - * + * * Confidence is a simple measure calculated from the similarity score * minus the similarity score from the next most similar language * divided by the highest possible score. Languages that have closely @@ -827,46 +829,43 @@ class Text_LanguageDetect * confidence scores. * * The similarity score answers the question "How likely is the text the - * returned language regardless of the other languages considered?" 
The + * returned language regardless of the other languages considered?" The * confidence score is one way of answering the question "how likely is the * text the detected language relative to the rest of the language model * set?" * * To see how similar languages are a priori, see languageSimilarity() - * - * @access public - * @param string $sample text for which language will be detected - * @return array most similar language, score and confidence rating - * or null if no language is similar - * @see detect() - * @throws PEAR_Error + * + * @param string $sample text for which language will be detected + * + * @return array most similar language, score and confidence rating + * or null if no language is similar + * @see detect() + * @throws Text_LanguageDetect_Exception */ - function detectConfidence($sample) + public function detectConfidence($sample) { $scores = $this->detect($sample, 2); - // if most similar language has the max score, it + // if most similar language has the max score, it // will have been picked at random - if ( !is_array($scores) - || empty($scores) - || current($scores) == $this->_max_score) { - + if (!is_array($scores) || empty($scores) + || current($scores) == $this->_max_score + ) { return null; } - $arr['language'] = ucfirst(key($scores)); + $arr['language'] = key($scores); $arr['similarity'] = current($scores); if (next($scores) !== false) { // if false then no next element // the goal is to return a higher value if the distance between // the similarity of the first score and the second score is high if ($this->_perl_compatible) { - - $arr['confidence'] = - (current($scores) - $arr['similarity']) / $this->_max_score; + $arr['confidence'] = (current($scores) - $arr['similarity']) + / $this->_max_score; } else { - $arr['confidence'] = $arr['similarity'] - current($scores); } @@ -882,32 +881,26 @@ class Text_LanguageDetect * Returns the distribution of unicode blocks in a given utf8 string * * For the block name of a single char, use 
unicodeBlockName() - * - * @access public - * @param string $str input string. Must be ascii or utf8 - * @param bool $skip_symbols if true, skip ascii digits, symbols and - * non-printing characters. Includes spaces, - * newlines and common punctutation characters. + * + * @param string $str input string. Must be ascii or utf8 + * @param bool $skip_symbols if true, skip ascii digits, symbols and + * non-printing characters. Includes spaces, + * newlines and common punctutation characters. + * * @return array - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ - function detectUnicodeBlocks($str, $skip_symbols) + public function detectUnicodeBlocks($str, $skip_symbols) { - // input check - if (!is_bool($skip_symbols)) { - throw new Exception('Second parameter must be boolean'); - } - - if (!is_string($str)) { - throw new Exception('First parameter was not a string'); - } + $skip_symbols = (bool)$skip_symbols; + $str = (string)$str; - $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($str); $sample_obj->prepareUnicode(); $sample_obj->prepareTrigram(false); $sample_obj->setUnicodeSkipSymbols($skip_symbols); $sample_obj->analyze(); - $blocks =& $sample_obj->getUnicodeBlocks(); + $blocks = $sample_obj->getUnicodeBlocks(); unset($sample_obj); return $blocks; } @@ -915,38 +908,37 @@ class Text_LanguageDetect /** * Returns the block name for a given unicode value * - * If passed a string, will assume it is being passed a UTF8-formatted + * If passed a string, will assume it is being passed a UTF8-formatted * character and will automatically convert. Otherwise it will assume it * is being passed a numeric unicode value. * * Make sure input is of the correct type! 
* - * @access public * @param mixed $unicode unicode value or utf8 char + * * @return mixed the block name string or false if not found - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ - function unicodeBlockName($unicode) { + public function unicodeBlockName($unicode) + { if (is_string($unicode)) { // assume it is being passed a utf8 char, so convert it - - // input check - if ($this->utf8strlen($unicode) > 1) { - throw new Exception('Pass this function only a single char'); + if (self::utf8strlen($unicode) > 1) { + throw new Text_LanguageDetect_Exception( + 'Pass a single char only to this method', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } - $unicode = $this->_utf8char2unicode($unicode); - if ($unicode == -1) { - throw new Exception('Malformatted char'); - } - - // input check } elseif (!is_int($unicode)) { - throw new Exception('Input must be of type string or int.'); + throw new Text_LanguageDetect_Exception( + 'Input must be of type string or int.', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } - $blocks =& $this->_read_unicode_block_db(); + $blocks = $this->_read_unicode_block_db(); $result = $this->_unicode_block_name($unicode, $blocks); @@ -964,14 +956,17 @@ class Text_LanguageDetect * the public interface for this function, which does input checks which * this function omits for speed. 
* - * @access protected - * @param int $unicode the unicode value - * @param array &$blocks the block database - * @param int $block_count the number of defined blocks in the database - * @see unicodeBlockName() + * @param int $unicode the unicode value + * @param array $blocks the block database + * @param int $block_count the number of defined blocks in the database + * + * @return mixed Block name, -1 if it failed + * @see unicodeBlockName() + * @access protected */ - function _unicode_block_name($unicode, &$blocks, $block_count = -1) { - // for a reference, see + function _unicode_block_name($unicode, $blocks, $block_count = -1) + { + // for a reference, see // http://www.unicode.org/Public/UNIDATA/Blocks.txt // assume that ascii characters are the most common @@ -994,35 +989,36 @@ class Text_LanguageDetect while ($low <= $high) { $mid = floor(($low + $high) / 2); - // if it's lower than the lower bound if ($unicode < $blocks[$mid][0]) { + // if it's lower than the lower bound $high = $mid - 1; - // if it's higher than the upper bound } elseif ($unicode > $blocks[$mid][1]) { + // if it's higher than the upper bound $low = $mid + 1; - // found it } else { + // found it return $blocks[$mid]; } } - // failed to find the block + // failed to find the block return -1; - // todo: differentiate when it's out of range or when it falls + // todo: differentiate when it's out of range or when it falls // into an unassigned range? 
} /** * Brings up the unicode block database * - * @access protected * @return array the database of unicode block definitions - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception + * @access protected */ - function &_read_unicode_block_db() { + function _read_unicode_block_db() + { // since the unicode definitions are always going to be the same, // might as well share the memory for the db with all other instances // of this class @@ -1037,29 +1033,27 @@ class Text_LanguageDetect /** * Calculate the similarities between the language models - * + * * Use this function to see how similar languages are to each other. * * If passed 2 language names, will return just those languages compared. * If passed 1 language name, will return that language compared to * all others. - * If passed none, will return an array of every language model compared + * If passed none, will return an array of every language model compared * to every other one. * - * @access public - * @param string $lang1 the name of the first language to be compared - * @param string $lang2 the name of the second language to be compared - * @return array scores of every language compared - * or the score of just the provided languages - * or null if one of the supplied languages does not exist - * @throws PEAR_Error + * @param string $lang1 the name of the first language to be compared + * @param string $lang2 the name of the second language to be compared + * + * @return array scores of every language compared + * or the score of just the provided languages + * or null if one of the supplied languages does not exist + * @throws Text_LanguageDetect_Exception */ - function languageSimilarity($lang1 = null, $lang2 = null) + public function languageSimilarity($lang1 = null, $lang2 = null) { - if (!$this->_setup_ok($err)) { - return $err; - } - + $lang1 = $this->_convertFromNameMode($lang1); + $lang2 = $this->_convertFromNameMode($lang2); if ($lang1 != null) { $lang1 = strtolower($lang1); @@ 
-1069,12 +1063,8 @@ class Text_LanguageDetect } if ($lang2 != null) { - - // can't only set the second param - if ($lang1 == null) { - return null; - // check if language model exists - } elseif (!isset($this->_lang_db[$lang2])) { + if (!isset($this->_lang_db[$lang2])) { + // check if language model exists return null; } @@ -1088,14 +1078,15 @@ class Text_LanguageDetect ) ); - - // compare just $lang1 to all languages } else { + // compare just $lang1 to all languages $return_arr = array(); foreach ($this->_lang_db as $key => $value) { - if ($key != $lang1) { // don't compare a language to itself + if ($key != $lang1) { + // don't compare a language to itself $return_arr[$key] = $this->_normalize_score( - $this->_distance($this->_lang_db[$lang1], $value)); + $this->_distance($this->_lang_db[$lang1], $value) + ); } } asort($return_arr); @@ -1104,30 +1095,27 @@ class Text_LanguageDetect } - // compare all languages to each other } else { + // compare all languages to each other $return_arr = array(); foreach (array_keys($this->_lang_db) as $lang1) { foreach (array_keys($this->_lang_db) as $lang2) { - // skip comparing languages to themselves - if ($lang1 != $lang2) { - - // don't re-calculate what's already been done - if (isset($return_arr[$lang2][$lang1])) { + if ($lang1 != $lang2) { - $return_arr[$lang1][$lang2] = - $return_arr[$lang2][$lang1]; + if (isset($return_arr[$lang2][$lang1])) { + // don't re-calculate what's already been done + $return_arr[$lang1][$lang2] + = $return_arr[$lang2][$lang1]; - // calculate } else { - - $return_arr[$lang1][$lang2] = - $this->_normalize_score( - $this->_distance( - $this->_lang_db[$lang1], - $this->_lang_db[$lang2] - ) + // calculate + $return_arr[$lang1][$lang2] + = $this->_normalize_score( + $this->_distance( + $this->_lang_db[$lang1], + $this->_lang_db[$lang2] + ) ); } @@ -1150,20 +1138,14 @@ class Text_LanguageDetect * * @access public * @return array language cluster data - * @throws PEAR_Error + * @throws 
Text_LanguageDetect_Exception * @see languageSimilarity() - * @deprecated this function will eventually be removed and placed into + * @deprecated this function will eventually be removed and placed into * the model generation class */ function clusterLanguages() { // todo: set the maximum number of clusters - - // setup check - if (!$this->_setup_ok($err)) { - return $err; - } - // return cached result, if any if (isset($this->_clusters)) { return $this->_clusters; @@ -1177,7 +1159,10 @@ class Text_LanguageDetect foreach ($langs as $lang) { if (!isset($this->_lang_db[$lang])) { - throw new Exception("missing $lang!\n"); + throw new Text_LanguageDetect_Exception( + "missing $lang!", + Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE + ); } } @@ -1186,7 +1171,9 @@ class Text_LanguageDetect $langs[$lang1] = $lang1; unset($langs[$old_key]); } - + + $result_data = $really_map = array(); + $i = 0; while (count($langs) > 2 && $i++ < 200) { $highest_score = -1; @@ -1194,18 +1181,22 @@ class Text_LanguageDetect $highest_key2 = ''; foreach ($langs as $lang1) { foreach ($langs as $lang2) { - if ( $lang1 != $lang2 - && $arr[$lang1][$lang2] > $highest_score) { + if ($lang1 != $lang2 + && $arr[$lang1][$lang2] > $highest_score + ) { $highest_score = $arr[$lang1][$lang2]; $highest_key1 = $lang1; $highest_key2 = $lang2; } } } - + if (!$highest_key1) { // should not ever happen - throw new Exception("no highest key? (step: $i)"); + throw new Text_LanguageDetect_Exception( + "no highest key? 
(step: $i)", + Text_LanguageDetect_Exception::NO_HIGHEST_KEY + ); } if ($highest_score == 0) { @@ -1217,7 +1208,7 @@ class Text_LanguageDetect $sum1 = array_sum($arr[$highest_key1]); $sum2 = array_sum($arr[$highest_key2]); - // use the score for the one that is most similar to the rest of + // use the score for the one that is most similar to the rest of // the field as the score for the group // todo: could try averaging or "centroid" method instead // seems like that might make more sense @@ -1248,7 +1239,7 @@ class Text_LanguageDetect $really_lang = $replaceme; while (isset($really_map[$really_lang])) { $really_lang = $really_map[$really_lang]; - } + } $really_map[$newkey] = $really_lang; @@ -1259,8 +1250,8 @@ class Text_LanguageDetect $arr[$key1][$newkey] = $arr[$key1][$key2]; unset($arr[$key1][$key2]); // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] - } - + } + if ($key1 == $replaceme) { $arr[$newkey][$key2] = $arr[$key1][$key2]; unset($arr[$key1][$key2]); @@ -1273,7 +1264,7 @@ class Text_LanguageDetect } } } - + unset($langs[$highest_key1]); unset($langs[$highest_key2]); @@ -1293,7 +1284,7 @@ class Text_LanguageDetect } $return_val = array( - 'open_forks' => $langs, + 'open_forks' => $langs, // the top level of clusters // clusters that are mutually exclusive // or specified by a specific maximum @@ -1323,11 +1314,11 @@ class Text_LanguageDetect * use, and it may disappear or its functionality may change in future * releases without notice. * - * This compares the sample text to top the top level of clusters. If the + * This compares the sample text to top the top level of clusters. If the * sample is similar to the cluster it will drop down and compare it to the * languages in the cluster, and so on until it hits a leaf node. 
* - * this should find the language in considerably fewer compares + * this should find the language in considerably fewer compares * (the equivalent of a binary search), however clusterLanguages() is costly * and the loss of accuracy from this technique is significant. * @@ -1337,15 +1328,14 @@ class Text_LanguageDetect * was very large, however in such cases some method of Bayesian inference * might be more helpful. * - * @see clusterLanguages() - * @access public - * @param string $str input string - * @return array language scores (only those compared) - * @throws PEAR_Error + * @param string $str input string + * + * @return array language scores (only those compared) + * @throws Text_LanguageDetect_Exception + * @see clusterLanguages() */ - function clusteredSearch($str) + public function clusteredSearch($str) { - // input check if (!Text_LanguageDetect_Parser::validateString($str)) { return array(); @@ -1359,7 +1349,7 @@ class Text_LanguageDetect $dendogram_data = $result['fork_data']; $dendogram_alias = $result['name_map']; - $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($str); $sample_obj->prepareTrigram(); $sample_obj->setPadStart(!$this->_perl_compatible); $sample_obj->analyze(); @@ -1372,7 +1362,7 @@ class Text_LanguageDetect } $i = 0; // counts the number of steps - + foreach ($dendogram_start as $lang) { if (isset($dendogram_alias[$lang])) { $lang_key = $dendogram_alias[$lang]; @@ -1382,7 +1372,8 @@ class Text_LanguageDetect $scores[$lang] = $this->_normalize_score( $this->_distance($this->_lang_db[$lang_key], $sample_result), - $sample_count); + $sample_count + ); $i++; } @@ -1411,7 +1402,8 @@ class Text_LanguageDetect $scores[$lang] = $this->_normalize_score( $this->_distance($this->_lang_db[$lang_key], $sample_result), - $sample_count); + $sample_count + ); //todo: does not need to do same comparison again } @@ -1428,8 +1420,8 @@ class 
Text_LanguageDetect $diff = $scores[$cur_key] - $scores[$loser_key]; - // $cur_key ({$dendogram_alias[$cur_key]}) wins - // over $loser_key ({$dendogram_alias[$loser_key]}) + // $cur_key ({$dendogram_alias[$cur_key]}) wins + // over $loser_key ({$dendogram_alias[$loser_key]}) // with a difference of $diff } @@ -1439,9 +1431,9 @@ class Text_LanguageDetect // which paths the algorithm decided to take along the tree // but sometimes the last item is only the second highest - if ( ($this->_perl_compatible && (end($scores) > prev($scores))) - || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { - + if (($this->_perl_compatible && (end($scores) > prev($scores))) + || (!$this->_perl_compatible && (end($scores) < prev($scores))) + ) { $real_last_score = current($scores); $real_last_key = key($scores); @@ -1449,7 +1441,7 @@ class Text_LanguageDetect unset($scores[$real_last_key]); $scores[$real_last_key] = $real_last_score; } - + if (!$this->_perl_compatible) { $scores = array_reverse($scores, true); @@ -1464,12 +1456,11 @@ class Text_LanguageDetect * * Returns the numbers of characters (not bytes) in a utf8 string * - * @static - * @access public - * @param string $str string to get the length of - * @return int number of chars + * @param string $str string to get the length of + * + * @return int number of chars */ - function utf8strlen($str) + public static function utf8strlen($str) { // utf8_decode() will convert unknown chars to '?', which is actually // ideal for counting. 
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect /** * Returns the unicode value of a utf8 char * - * @access protected - * @param string $char a utf8 (possibly multi-byte) char - * @return int unicode value or -1 if malformatted + * @param string $char a utf8 (possibly multi-byte) char + * + * @return int unicode value + * @access protected + * @link http://en.wikipedia.org/wiki/UTF-8 */ - function _utf8char2unicode($char) { - + function _utf8char2unicode($char) + { // strlen() here will actually get the binary length of a single char switch (strlen($char)) { - - // for a reference, see http://en.wikipedia.org/wiki/UTF-8 - - case 1: - // normal ASCII-7 byte - // 0xxxxxxx --> 0xxxxxxx - return ord($char{0}); - - case 2: - // 2 byte unicode - // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx - $z = (ord($char{0}) & 0x000001F) << 6; - $x = (ord($char{1}) & 0x0000003F); - - return ($z | $x); - - case 3: - // 3 byte unicode - // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx - $z = (ord($char{0}) & 0x0000000F) << 12; - $x1 = (ord($char{1}) & 0x0000003F) << 6; - $x2 = (ord($char{2}) & 0x0000003F); - - return ($z | $x1 | $x2); - - case 4: - // 4 byte unicode - // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> - // 000zzzzz xxxxxxxx xxxxxxxx - $z1 = (ord($char{0}) & 0x00000007) << 18; - $z2 = (ord($char{1}) & 0x0000003F) << 12; - $x1 = (ord($char{2}) & 0x0000003F) << 6; - $x2 = (ord($char{3}) & 0x0000003F); - - return ($z1 | $z2 | $x1 | $x2); - - default: - // error: malformatted char? 
- return -1; + case 1: + // normal ASCII-7 byte + // 0xxxxxxx --> 0xxxxxxx + return ord($char{0}); + + case 2: + // 2 byte unicode + // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx + $z = (ord($char{0}) & 0x000001F) << 6; + $x = (ord($char{1}) & 0x0000003F); + return ($z | $x); + + case 3: + // 3 byte unicode + // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx + $z = (ord($char{0}) & 0x0000000F) << 12; + $x1 = (ord($char{1}) & 0x0000003F) << 6; + $x2 = (ord($char{2}) & 0x0000003F); + return ($z | $x1 | $x2); + + case 4: + // 4 byte unicode + // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> + // 000zzzzz xxxxxxxx xxxxxxxx + $z1 = (ord($char{0}) & 0x00000007) << 18; + $z2 = (ord($char{1}) & 0x0000003F) << 12; + $x1 = (ord($char{2}) & 0x0000003F) << 6; + $x2 = (ord($char{3}) & 0x0000003F); + return ($z1 | $z2 | $x1 | $x2); } } @@ -1536,18 +1519,18 @@ class Text_LanguageDetect * utf8-safe fast character iterator * * Will get the next character starting from $counter, which will then be - * incremented. If a multi-byte char the bytes will be concatenated and + * incremented. If a multi-byte char the bytes will be concatenated and * $counter will be incremeted by the number of bytes in the char. 
* - * @access private - * @param string &$str the string being iterated over - * @param int &$counter the iterator, will increment by reference - * @param bool $special_convert whether to do special conversions - * @return char the next (possibly multi-byte) char from $counter + * @param string $str the string being iterated over + * @param int &$counter the iterator, will increment by reference + * @param bool $special_convert whether to do special conversions + * + * @return char the next (possibly multi-byte) char from $counter + * @access private */ - function _next_char(&$str, &$counter, $special_convert = false) + static function _next_char($str, &$counter, $special_convert = false) { - $char = $str{$counter++}; $ord = ord($char); @@ -1556,7 +1539,6 @@ class Text_LanguageDetect // normal ascii one byte char if ($ord <= 127) { - // special conversions needed for this package // (that only apply to regular ascii characters) // lower case, and convert all non-alphanumeric characters @@ -1571,8 +1553,8 @@ class Text_LanguageDetect return $char; - // multi-byte chars } elseif ($ord >> 5 == 6) { // two-byte char + // multi-byte chars $nextchar = $str{$counter++}; // get next byte // lower-casing of non-ascii characters is still incomplete @@ -1582,27 +1564,27 @@ class Text_LanguageDetect if ($ord == 195) { $nextord = ord($nextchar); $nextord_adj = $nextord + 64; - // for a reference, see + // for a reference, see // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html // À - Þ but not × - if ( $nextord_adj >= 192 - && $nextord_adj <= 222 - && $nextord_adj != 215) { - - $nextchar = chr($nextord + 32); + if ($nextord_adj >= 192 + && $nextord_adj <= 222 + && $nextord_adj != 215 + ) { + $nextchar = chr($nextord + 32); } - // lower case cyrillic alphabet } elseif ($ord == 208) { + // lower case cyrillic alphabet $nextord = ord($nextchar); // if A - Pe if ($nextord >= 144 && $nextord <= 159) { // lower case $nextchar = chr($nextord + 32); - // if Er - Ya } elseif 
($nextord >= 160 && $nextord <= 175) { + // if Er - Ya // lower case $char = chr(209); // == $ord++ $nextchar = chr($nextord - 32); @@ -1611,12 +1593,11 @@ class Text_LanguageDetect } // tag on next byte - return $char . $nextchar; - + return $char . $nextchar; } elseif ($ord >> 4 == 14) { // three-byte char - + // tag on next 2 bytes - return $char . $str{$counter++} . $str{$counter++}; + return $char . $str{$counter++} . $str{$counter++}; } elseif ($ord >> 3 == 30) { // four-byte char @@ -1628,8 +1609,85 @@ class Text_LanguageDetect } } -} + /** + * Converts an $language input parameter from the configured mode + * to the language name that is used internally. + * + * Works for strings and arrays. + * + * @param string|array $lang A language description ("english"/"en"/"eng") + * @param boolean $convertKey If $lang is an array, setting $key + * converts the keys to the language name. + * + * @return string|array Language name + */ + function _convertFromNameMode($lang, $convertKey = false) + { + if ($this->_name_mode == 0) { + return $lang; + } + + if ($this->_name_mode == 2) { + $method = 'code2ToName'; + } else { + $method = 'code3ToName'; + } + + if (is_string($lang)) { + return (string)Text_LanguageDetect_ISO639::$method($lang); + } + + $newlang = array(); + foreach ($lang as $key => $val) { + if ($convertKey) { + $newkey = (string)Text_LanguageDetect_ISO639::$method($key); + $newlang[$newkey] = $val; + } else { + $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val); + } + } + return $newlang; + } -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ + /** + * Converts an $language output parameter from the language name that is + * used internally to the configured mode. + * + * Works for strings and arrays. + * + * @param string|array $lang A language description ("english"/"en"/"eng") + * @param boolean $convertKey If $lang is an array, setting $key + * converts the keys to the language name. 
+ * + * @return string|array Language name + */ + function _convertToNameMode($lang, $convertKey = false) + { + if ($this->_name_mode == 0) { + return $lang; + } + + if ($this->_name_mode == 2) { + $method = 'nameToCode2'; + } else { + $method = 'nameToCode3'; + } + + if (is_string($lang)) { + return Text_LanguageDetect_ISO639::$method($lang); + } + + $newlang = array(); + foreach ($lang as $key => $val) { + if ($convertKey) { + $newkey = Text_LanguageDetect_ISO639::$method($key); + $newlang[$newkey] = $val; + } else { + $newlang[$key] = Text_LanguageDetect_ISO639::$method($val); + } + } + return $newlang; + } +} -?> +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index 2e8991cc..d0f09d74 100644 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php @@ -1,1138 +1,1138 @@ -init(); -echo $r->articleContent->innerHTML; -*/ - -class Readability -{ - public $version = '1.7.1-without-multi-page'; - public $convertLinksToFootnotes = false; - public $revertForcedParagraphElements = true; - public $articleTitle; - public $articleContent; - public $dom; - public $url = null; // optional - URL where HTML was retrieved - public $debug = false; - public $lightClean = true; // preserves more content (experimental) added 2012-09-19 - protected $body = null; // - protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later - protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. - protected $success = false; // indicates whether we were able to extract or not - - /** - * All of the regular expressions in use within readability. - * Defined up here so we don't instantiate them repeatedly in loops. 
- **/ - public $regexps = array( - 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', - 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', - 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', - 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', - 'replaceBrs' => '/(]*>[ \n\r\t]*){2,}/i', - 'replaceFonts' => '/<(\/?)font[^>]*>/i', - // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() - 'normalize' => '/\s{2,}/', - 'killBreaks' => '/((\s| ?)*){1,}/', - 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', - 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' - ); - - /* constants */ - const FLAG_STRIP_UNLIKELYS = 1; - const FLAG_WEIGHT_CLASSES = 2; - const FLAG_CLEAN_CONDITIONALLY = 4; - - /** - * Create instance of Readability - * @param string UTF-8 encoded string - * @param string (optional) URL associated with HTML (used for footnotes) - * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') - */ - function __construct($html, $url=null, $parser='libxml') - { - $this->url = $url; - /* Turn all double br's into p's */ - $html = preg_replace($this->regexps['replaceBrs'], '

    ', $html); - $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); - $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); - if (trim($html) == '') $html = ''; - if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { - // all good - } else { - $this->dom = new DOMDocument(); - $this->dom->preserveWhiteSpace = false; - @$this->dom->loadHTML($html); - } - $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); - } - - /** - * Get article title element - * @return DOMElement - */ - public function getTitle() { - return $this->articleTitle; - } - - /** - * Get article content element - * @return DOMElement - */ - public function getContent() { - return $this->articleContent; - } - - /** - * Runs readability. - * - * Workflow: - * 1. Prep the document by removing script tags, css, etc. - * 2. Build readability's DOM tree. - * 3. Grab the article content from the current dom tree. - * 4. Replace the current DOM tree with the new one. - * 5. Read peacefully. 
- * - * @return boolean true if we found content, false otherwise - **/ - public function init() - { - if (!isset($this->dom->documentElement)) return false; - $this->removeScripts($this->dom); - //die($this->getInnerHTML($this->dom->documentElement)); - - // Assume successful outcome - $this->success = true; - - $bodyElems = $this->dom->getElementsByTagName('body'); - if ($bodyElems->length > 0) { - if ($this->bodyCache == null) { - $this->bodyCache = $bodyElems->item(0)->innerHTML; - } - if ($this->body == null) { - $this->body = $bodyElems->item(0); - } - } - - $this->prepDocument(); - - //die($this->dom->documentElement->parentNode->nodeType); - //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); - //die($this->getInnerHTML($this->dom->documentElement)); - - /* Build readability's DOM tree */ - $overlay = $this->dom->createElement('div'); - $innerDiv = $this->dom->createElement('div'); - $articleTitle = $this->getArticleTitle(); - $articleContent = $this->grabArticle(); - - if (!$articleContent) { - $this->success = false; - $articleContent = $this->dom->createElement('div'); - $articleContent->setAttribute('id', 'readability-content'); - $articleContent->innerHTML = '

    Sorry, Readability was unable to parse this page for content.

    '; - } - - $overlay->setAttribute('id', 'readOverlay'); - $innerDiv->setAttribute('id', 'readInner'); - - /* Glue the structure of our document together. */ - $innerDiv->appendChild($articleTitle); - $innerDiv->appendChild($articleContent); - $overlay->appendChild($innerDiv); - - /* Clear the old HTML, insert the new content. */ - $this->body->innerHTML = ''; - $this->body->appendChild($overlay); - //document.body.insertBefore(overlay, document.body.firstChild); - $this->body->removeAttribute('style'); - - $this->postProcessContent($articleContent); - - // Set title and content instance variables - $this->articleTitle = $articleTitle; - $this->articleContent = $articleContent; - - return $this->success; - } - - /** - * Debug - */ - protected function dbg($msg) { - if ($this->debug) echo '* ',$msg, "\n"; - } - - /** - * Run any post-process modifications to article content as necessary. - * - * @param DOMElement - * @return void - */ - public function postProcessContent($articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { - $this->addFootnotes($articleContent); - } - } - - /** - * Get the article title as an H1. 
- * - * @return DOMElement - */ - protected function getArticleTitle() { - $curTitle = ''; - $origTitle = ''; - - try { - $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); - } catch(Exception $e) {} - - if (preg_match('/ [\|\-] /', $curTitle)) - { - $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); - } - } - else if (strpos($curTitle, ': ') !== false) - { - $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); - - if (count(explode(' ', $curTitle)) < 3) { - $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); - } - } - else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) - { - $hOnes = $this->dom->getElementsByTagName('h1'); - if($hOnes->length == 1) - { - $curTitle = $this->getInnerText($hOnes->item(0)); - } - } - - $curTitle = trim($curTitle); - - if (count(explode(' ', $curTitle)) <= 4) { - $curTitle = $origTitle; - } - - $articleTitle = $this->dom->createElement('h1'); - $articleTitle->innerHTML = $curTitle; - - return $articleTitle; - } - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ - protected function prepDocument() { - /** - * In some cases a body element can't be found (if the HTML is totally hosed for example) - * so we create a new body node and append it to the document. 
- */ - if ($this->body == null) - { - $this->body = $this->dom->createElement('body'); - $this->dom->documentElement->appendChild($this->body); - } - $this->body->setAttribute('id', 'readabilityBody'); - - /* Remove all style tags in head */ - $styleTags = $this->dom->getElementsByTagName('style'); - for ($i = $styleTags->length-1; $i >= 0; $i--) - { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); - } - - /* Turn all double br's into p's */ - /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ - //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '

    ').replace(readability.regexps.replaceFonts, '<$1span>'); - // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. - // Manipulating innerHTML as it's done in JS is not possible in PHP. - } - - /** - * For easier reading, convert this document to have footnotes at the bottom rather than inline links. - * @see http://www.roughtype.com/archives/2010/05/experiments_in.php - * - * @return void - **/ - public function addFootnotes($articleContent) { - $footnotesWrapper = $this->dom->createElement('div'); - $footnotesWrapper->setAttribute('id', 'readability-footnotes'); - $footnotesWrapper->innerHTML = '

    References

    '; - - $articleFootnotes = $this->dom->createElement('ol'); - $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); - $footnotesWrapper->appendChild($articleFootnotes); - - $articleLinks = $articleContent->getElementsByTagName('a'); - - $linkCount = 0; - for ($i = 0; $i < $articleLinks->length; $i++) - { - $articleLink = $articleLinks->item($i); - $footnoteLink = $articleLink->cloneNode(true); - $refLink = $this->dom->createElement('a'); - $footnote = $this->dom->createElement('li'); - $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); - if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); - //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, - $linkText = $this->getInnerText($articleLink); - - if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { - continue; - } - - $linkCount++; - - /** Add a superscript reference after the article link */ - $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); - $refLink->innerHTML = '[' . $linkCount . ']'; - $refLink->setAttribute('class', 'readability-DoNotFootnote'); - $refLink->setAttribute('style', 'color: inherit;'); - - //TODO: does this work or should we use DOMNode.isSameNode()? - if ($articleLink->parentNode->lastChild == $articleLink) { - $articleLink->parentNode->appendChild($refLink); - } else { - $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); - } - - $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); - $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); - - $footnote->innerHTML = '^ '; - - $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); - $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . 
$linkCount); - - $footnote->appendChild($footnoteLink); - if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; - - $articleFootnotes->appendChild($footnote); - } - - if ($linkCount > 0) { - $articleContent->appendChild($footnotesWrapper); - } - } - - /** - * Reverts P elements with class 'readability-styled' - * to text nodes - which is what they were before. - * - * @param DOMElement - * @return void - */ - function revertReadabilityStyledElements($articleContent) { - $xpath = new DOMXPath($articleContent->ownerDocument); - $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); - //$elems = $articleContent->getElementsByTagName('p'); - for ($i = $elems->length-1; $i >= 0; $i--) { - $e = $elems->item($i); - $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); - //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { - // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); - //} - } - } - - /** - * Prepare the article node for display. Clean out any inline styles, - * iframes, forms, strip extraneous

    tags, etc. - * - * @param DOMElement - * @return void - */ - function prepArticle($articleContent) { - $this->cleanStyles($articleContent); - $this->killBreaks($articleContent); - if ($this->revertForcedParagraphElements) { - $this->revertReadabilityStyledElements($articleContent); - } - - /* Clean out junk from the article content */ - $this->cleanConditionally($articleContent, 'form'); - $this->clean($articleContent, 'object'); - $this->clean($articleContent, 'h1'); - - /** - * If there is only one h2, they are probably using it - * as a header and not a subheader, so remove it since we already have a header. - ***/ - if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { - $this->clean($articleContent, 'h2'); - } - $this->clean($articleContent, 'iframe'); - - $this->cleanHeaders($articleContent); - - /* Do these last as the previous stuff may have removed junk that will affect these */ - $this->cleanConditionally($articleContent, 'table'); - $this->cleanConditionally($articleContent, 'ul'); - $this->cleanConditionally($articleContent, 'div'); - - /* Remove extra paragraphs */ - $articleParagraphs = $articleContent->getElementsByTagName('p'); - for ($i = $articleParagraphs->length-1; $i >= 0; $i--) - { - $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; - $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; - $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; - $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; - - if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') - { - $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); - } - } - - try { - $articleContent->innerHTML = preg_replace('/]*>\s*

    innerHTML); - //articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

    dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); - } - } - - /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. - * - * @param Element - * @return void - **/ - protected function initializeNode($node) { - $readability = $this->dom->createAttribute('readability'); - $readability->value = 0; // this is our contentScore - $node->setAttributeNode($readability); - - switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case - case 'DIV': - $readability->value += 5; - break; - - case 'PRE': - case 'TD': - case 'BLOCKQUOTE': - $readability->value += 3; - break; - - case 'ADDRESS': - case 'OL': - case 'UL': - case 'DL': - case 'DD': - case 'DT': - case 'LI': - case 'FORM': - $readability->value -= 3; - break; - - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'TH': - $readability->value -= 5; - break; - } - $readability->value += $this->getClassWeight($node); - } - - /*** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @return DOMElement - **/ - protected function grabArticle($page=null) { - $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); - if (!$page) $page = $this->dom; - $allElements = $page->getElementsByTagName('*'); - /** - * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs - * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) - * - * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 - * TODO: Shouldn't this be a reverse traversal? 
- **/ - $node = null; - $nodesToScore = array(); - for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { - //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { - //$node = $targetList->item($nodeIndex); - $tagName = strtoupper($node->tagName); - /* Remove unlikely candidates */ - if ($stripUnlikelyCandidates) { - $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); - if ( - preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && - !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && - $tagName != 'BODY' - ) - { - $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); - //$nodesToRemove[] = $node; - $node->parentNode->removeChild($node); - $nodeIndex--; - continue; - } - } - - if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { - $nodesToScore[] = $node; - } - - /* Turn all divs that don't have children block level elements into p's */ - if ($tagName == 'DIV') { - if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { - //$this->dbg('Altering div to p'); - $newNode = $this->dom->createElement('p'); - try { - $newNode->innerHTML = $node->innerHTML; - //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); - $node->parentNode->replaceChild($newNode, $node); - $nodeIndex--; - $nodesToScore[] = $node; // or $newNode? - } - catch(Exception $e) { - $this->dbg('Could not alter div to p, reverting back to div.: ' . 
$e); - } - } - else - { - /* EXPERIMENTAL */ - // TODO: change these p elements back to text nodes after processing - for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { - $childNode = $node->childNodes->item($i); - if ($childNode->nodeType == 3) { // XML_TEXT_NODE - //$this->dbg('replacing text node with a p tag with the same content.'); - $p = $this->dom->createElement('p'); - $p->innerHTML = $childNode->nodeValue; - $p->setAttribute('style', 'display: inline;'); - $p->setAttribute('class', 'readability-styled'); - $childNode->parentNode->replaceChild($p, $childNode); - } - } - } - } - } - - /** - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. - * - * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ - $candidates = array(); - for ($pt=0; $pt < count($nodesToScore); $pt++) { - $parentNode = $nodesToScore[$pt]->parentNode; - // $grandParentNode = $parentNode ? $parentNode->parentNode : null; - $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); - $innerText = $this->getInnerText($nodesToScore[$pt]); - - if (!$parentNode || !isset($parentNode->tagName)) { - continue; - } - - /* If this paragraph is less than 25 characters, don't even count it. */ - if(strlen($innerText) < 25) { - continue; - } - - /* Initialize readability data for the parent. */ - if (!$parentNode->hasAttribute('readability')) - { - $this->initializeNode($parentNode); - $candidates[] = $parentNode; - } - - /* Initialize readability data for the grandparent. */ - if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) - { - $this->initializeNode($grandParentNode); - $candidates[] = $grandParentNode; - } - - $contentScore = 0; - - /* Add a point for the paragraph itself as a base. 
*/ - $contentScore++; - - /* Add points for any commas within this paragraph */ - $contentScore += count(explode(',', $innerText)); - - /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ - $contentScore += min(floor(strlen($innerText) / 100), 3); - - /* Add the score to the parent. The grandparent gets half. */ - $parentNode->getAttributeNode('readability')->value += $contentScore; - - if ($grandParentNode) { - $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; - } - } - - /** - * After we've calculated scores, loop through all of the possible candidate nodes we found - * and find the one with the highest score. - **/ - $topCandidate = null; - for ($c=0, $cl=count($candidates); $c < $cl; $c++) - { - /** - * Scale the final candidates score based on link density. Good content should have a - * relatively small link density (5% or less) and be mostly unaffected by this operation. - **/ - $readability = $candidates[$c]->getAttributeNode('readability'); - $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); - - $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); - - if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { - $topCandidate = $candidates[$c]; - } - } - - /** - * If we still have no top candidate, just use the body as a last resort. - * We also have to copy the body node so it is something we can modify. - **/ - if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') - { - $topCandidate = $this->dom->createElement('div'); - if ($page instanceof DOMDocument) { - if (!isset($page->documentElement)) { - // we don't have a body either? what a mess! 
:) - } else { - $topCandidate->innerHTML = $page->documentElement->innerHTML; - $page->documentElement->innerHTML = ''; - $page->documentElement->appendChild($topCandidate); - } - } else { - $topCandidate->innerHTML = $page->innerHTML; - $page->innerHTML = ''; - $page->appendChild($topCandidate); - } - $this->initializeNode($topCandidate); - } - - /** - * Now that we have the top candidate, look through its siblings for content that might also be related. - * Things like preambles, content split by ads that we removed, etc. - **/ - $articleContent = $this->dom->createElement('div'); - $articleContent->setAttribute('id', 'readability-content'); - $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); - $siblingNodes = $topCandidate->parentNode->childNodes; - if (!isset($siblingNodes)) { - $siblingNodes = new stdClass; - $siblingNodes->length = 0; - } - - for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) - { - $siblingNode = $siblingNodes->item($s); - $append = false; - - $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); - - //dbg('Sibling has score ' . ($siblingNode->readability ? 
siblingNode.readability.contentScore : 'Unknown')); - - if ($siblingNode === $topCandidate) - // or if ($siblingNode->isSameNode($topCandidate)) - { - $append = true; - } - - $contentBonus = 0; - /* Give a bonus if sibling nodes and top candidates have the example same classname */ - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { - $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; - } - - if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) - { - $append = true; - } - - if (strtoupper($siblingNode->nodeName) == 'P') { - $linkDensity = $this->getLinkDensity($siblingNode); - $nodeContent = $this->getInnerText($siblingNode); - $nodeLength = strlen($nodeContent); - - if ($nodeLength > 80 && $linkDensity < 0.25) - { - $append = true; - } - else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) - { - $append = true; - } - } - - if ($append) - { - $this->dbg('Appending node: ' . $siblingNode->nodeName); - - $nodeToAppend = null; - $sibNodeName = strtoupper($siblingNode->nodeName); - if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { - /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - - $this->dbg('Altering siblingNode of ' . $sibNodeName . 
' to div.'); - $nodeToAppend = $this->dom->createElement('div'); - try { - $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); - $nodeToAppend->innerHTML = $siblingNode->innerHTML; - } - catch(Exception $e) - { - $this->dbg('Could not alter siblingNode to div, reverting back to original.'); - $nodeToAppend = $siblingNode; - $s--; - $sl--; - } - } else { - $nodeToAppend = $siblingNode; - $s--; - $sl--; - } - - /* To ensure a node does not interfere with readability styles, remove its classnames */ - $nodeToAppend->removeAttribute('class'); - - /* Append sibling and subtract from our list because it removes the node when you append to another node */ - $articleContent->appendChild($nodeToAppend); - } - } - - /** - * So we have all of the content that we need. Now we clean it up for presentation. - **/ - $this->prepArticle($articleContent); - - /** - * Now that we've gone through the full algorithm, check to see if we got any meaningful content. - * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher - * likelihood of finding the content, and the sieve approach gives us a higher likelihood of - * finding the -right- content. - **/ - if (strlen($this->getInnerText($articleContent, false)) < 250) - { - // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 - // in the meantime, we check and create an empty element if it's not there. 
- if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); - $this->body->innerHTML = $this->bodyCache; - - if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { - $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); - return $this->grabArticle($this->body); - } - else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { - $this->removeFlag(self::FLAG_WEIGHT_CLASSES); - return $this->grabArticle($this->body); - } - else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { - $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); - return $this->grabArticle($this->body); - } - else { - return false; - } - } - return $articleContent; - } - - /** - * Remove script tags from document - * - * @param DOMElement - * @return void - */ - public function removeScripts($doc) { - $scripts = $doc->getElementsByTagName('script'); - for($i = $scripts->length-1; $i >= 0; $i--) - { - $scripts->item($i)->parentNode->removeChild($scripts->item($i)); - } - } - - /** - * Get the inner text of a node. - * This also strips out any excess whitespace to be found. - * - * @param DOMElement $ - * @param boolean $normalizeSpaces (default: true) - * @return string - **/ - public function getInnerText($e, $normalizeSpaces=true) { - $textContent = ''; - - if (!isset($e->textContent) || $e->textContent == '') { - return ''; - } - - $textContent = trim($e->textContent); - - if ($normalizeSpaces) { - return preg_replace($this->regexps['normalize'], ' ', $textContent); - } else { - return $textContent; - } - } - - /** - * Get the number of times a string $s appears in the node $e. - * - * @param DOMElement $e - * @param string - what to count. Default is "," - * @return number (integer) - **/ - public function getCharCount($e, $s=',') { - return substr_count($this->getInnerText($e), $s); - } - - /** - * Remove the style attribute on every $e and under. 
- * - * @param DOMElement $e - * @return void - */ - public function cleanStyles($e) { - if (!is_object($e)) return; - $elems = $e->getElementsByTagName('*'); - foreach ($elems as $elem) { - $elem->removeAttribute('style'); - } - } - - /** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * - * @param DOMElement $e - * @return number (float) - */ - public function getLinkDensity($e) { - $links = $e->getElementsByTagName('a'); - $textLength = strlen($this->getInnerText($e)); - $linkLength = 0; - for ($i=0, $il=$links->length; $i < $il; $i++) - { - $linkLength += strlen($this->getInnerText($links->item($i))); - } - if ($textLength > 0) { - return $linkLength / $textLength; - } else { - return 0; - } - } - - /** - * Get an elements class/id weight. Uses regular expressions to tell if this - * element looks good or bad. - * - * @param DOMElement $e - * @return number (Integer) - */ - public function getClassWeight($e) { - if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { - return 0; - } - - $weight = 0; - - /* Look for a special classname */ - if ($e->hasAttribute('class') && $e->getAttribute('class') != '') - { - if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { - $weight -= 25; - } - if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { - $weight += 25; - } - } - - /* Look for a special ID */ - if ($e->hasAttribute('id') && $e->getAttribute('id') != '') - { - if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { - $weight -= 25; - } - if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { - $weight += 25; - } - } - return $weight; - } - - /** - * Remove extraneous break tags from a node. - * - * @param DOMElement $node - * @return void - */ - public function killBreaks($node) { - $html = $node->innerHTML; - $html = preg_replace($this->regexps['killBreaks'], '
    ', $html); - $node->innerHTML = $html; - } - - /** - * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.) - * - * Updated 2012-09-18 to preserve youtube/vimeo iframes - * - * @param DOMElement $e - * @param string $tag - * @return void - */ - public function clean($e, $tag) { - $targetList = $e->getElementsByTagName($tag); - $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); - - for ($y=$targetList->length-1; $y >= 0; $y--) { - /* Allow youtube and vimeo videos through as people usually want to see those. */ - if ($isEmbed) { - $attributeValues = ''; - for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { - $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) - } - - /* First, check the elements attributes to see if any of them contain youtube or vimeo */ - if (preg_match($this->regexps['video'], $attributeValues)) { - continue; - } - - /* Then check the elements inside this element for the same. */ - if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { - continue; - } - } - $targetList->item($y)->parentNode->removeChild($targetList->item($y)); - } - } - - /** - * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, - * link density, number of images & embeds, etc. - * - * @param DOMElement $e - * @param string $tag - * @return void - */ - public function cleanConditionally($e, $tag) { - if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { - return; - } - - $tagsList = $e->getElementsByTagName($tag); - $curTagsLength = $tagsList->length; - - /** - * Gather counts for other typical elements embedded within. - * Traverse backwards so we can remove nodes at the same time without effecting the traversal. - * - * TODO: Consider taking into account original contentScore here. 
- */ - for ($i=$curTagsLength-1; $i >= 0; $i--) { - $weight = $this->getClassWeight($tagsList->item($i)); - $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; - - $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); - - if ($weight + $contentScore < 0) { - $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); - } - else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { - /** - * If there are not very many commas, and the number of - * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. - **/ - $p = $tagsList->item($i)->getElementsByTagName('p')->length; - $img = $tagsList->item($i)->getElementsByTagName('img')->length; - $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; - $input = $tagsList->item($i)->getElementsByTagName('input')->length; - $a = $tagsList->item($i)->getElementsByTagName('a')->length; - - $embedCount = 0; - $embeds = $tagsList->item($i)->getElementsByTagName('embed'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { - if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; - } - } - $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); - for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { - if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; - } - } - - $linkDensity = $this->getLinkDensity($tagsList->item($i)); - $contentLength = strlen($this->getInnerText($tagsList->item($i))); - $toRemove = false; - - if ($this->lightClean) { - $this->dbg('Light clean...'); - if ( ($img > $p) && ($img > 4) ) { - $this->dbg(' more than 4 images and more 
image elements than paragraph elements'); - $toRemove = true; - } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many

  • elements, and parent is not
      or
        '); - $toRemove = true; - } else if ( $input > floor($p/3) ) { - $this->dbg(' too many elements'); - $toRemove = true; - } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { - $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); - $toRemove = true; - } else if($weight < 25 && $linkDensity > 0.2) { - $this->dbg(' weight smaller than 25 and link density above 0.2'); - $toRemove = true; - } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { - $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); - $toRemove = true; - } else if($embedCount > 3) { - $this->dbg(' more than 3 embeds'); - $toRemove = true; - } - } else { - $this->dbg('Standard clean...'); - if ( $img > $p ) { - $this->dbg(' more image elements than paragraph elements'); - $toRemove = true; - } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many
      1. elements, and parent is not
          or
            '); - $toRemove = true; - } else if ( $input > floor($p/3) ) { - $this->dbg(' too many elements'); - $toRemove = true; - } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { - $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); - $toRemove = true; - } else if($weight < 25 && $linkDensity > 0.2) { - $this->dbg(' weight smaller than 25 and link density above 0.2'); - $toRemove = true; - } else if($weight >= 25 && $linkDensity > 0.5) { - $this->dbg(' weight above 25 but link density greater than 0.5'); - $toRemove = true; - } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { - $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); - $toRemove = true; - } - } - - if ($toRemove) { - //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); - $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); - } - } - } - } - - /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. - * - * @param DOMElement $e - * @return void - */ - public function cleanHeaders($e) { - for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { - $headers = $e->getElementsByTagName('h' . 
$headerIndex); - for ($i=$headers->length-1; $i >=0; $i--) { - if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); - } - } - } - } - - public function flagIsActive($flag) { - return ($this->flags & $flag) > 0; - } - - public function addFlag($flag) { - $this->flags = $this->flags | $flag; - } - - public function removeFlag($flag) { - $this->flags = $this->flags & ~$flag; - } -} +init(); +echo $r->articleContent->innerHTML; +*/ + +class Readability +{ + public $version = '1.7.1-without-multi-page'; + public $convertLinksToFootnotes = false; + public $revertForcedParagraphElements = true; + public $articleTitle; + public $articleContent; + public $dom; + public $url = null; // optional - URL where HTML was retrieved + public $debug = false; + public $lightClean = true; // preserves more content (experimental) added 2012-09-19 + protected $body = null; // + protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later + protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. + protected $success = false; // indicates whether we were able to extract or not + + /** + * All of the regular expressions in use within readability. + * Defined up here so we don't instantiate them repeatedly in loops. 
+ **/ + public $regexps = array( + 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', + 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', + 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', + 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', + 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', + 'replaceBrs' => '/(]*>[ \n\r\t]*){2,}/i', + 'replaceFonts' => '/<(\/?)font[^>]*>/i', + // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() + 'normalize' => '/\s{2,}/', + 'killBreaks' => '/((\s| ?)*){1,}/', + 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', + 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' + ); + + /* constants */ + const FLAG_STRIP_UNLIKELYS = 1; + const FLAG_WEIGHT_CLASSES = 2; + const FLAG_CLEAN_CONDITIONALLY = 4; + + /** + * Create instance of Readability + * @param string UTF-8 encoded string + * @param string (optional) URL associated with HTML (used for footnotes) + * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') + */ + function __construct($html, $url=null, $parser='libxml') + { + $this->url = $url; + /* Turn all double br's into p's */ + $html = preg_replace($this->regexps['replaceBrs'], '

            ', $html); + $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); + $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); + if (trim($html) == '') $html = ''; + if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { + // all good + } else { + $this->dom = new DOMDocument(); + $this->dom->preserveWhiteSpace = false; + @$this->dom->loadHTML($html); + } + $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); + } + + /** + * Get article title element + * @return DOMElement + */ + public function getTitle() { + return $this->articleTitle; + } + + /** + * Get article content element + * @return DOMElement + */ + public function getContent() { + return $this->articleContent; + } + + /** + * Runs readability. + * + * Workflow: + * 1. Prep the document by removing script tags, css, etc. + * 2. Build readability's DOM tree. + * 3. Grab the article content from the current dom tree. + * 4. Replace the current DOM tree with the new one. + * 5. Read peacefully. 
+ * + * @return boolean true if we found content, false otherwise + **/ + public function init() + { + if (!isset($this->dom->documentElement)) return false; + $this->removeScripts($this->dom); + //die($this->getInnerHTML($this->dom->documentElement)); + + // Assume successful outcome + $this->success = true; + + $bodyElems = $this->dom->getElementsByTagName('body'); + if ($bodyElems->length > 0) { + if ($this->bodyCache == null) { + $this->bodyCache = $bodyElems->item(0)->innerHTML; + } + if ($this->body == null) { + $this->body = $bodyElems->item(0); + } + } + + $this->prepDocument(); + + //die($this->dom->documentElement->parentNode->nodeType); + //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); + //die($this->getInnerHTML($this->dom->documentElement)); + + /* Build readability's DOM tree */ + $overlay = $this->dom->createElement('div'); + $innerDiv = $this->dom->createElement('div'); + $articleTitle = $this->getArticleTitle(); + $articleContent = $this->grabArticle(); + + if (!$articleContent) { + $this->success = false; + $articleContent = $this->dom->createElement('div'); + $articleContent->setAttribute('id', 'readability-content'); + $articleContent->innerHTML = '

            Sorry, Readability was unable to parse this page for content.

            '; + } + + $overlay->setAttribute('id', 'readOverlay'); + $innerDiv->setAttribute('id', 'readInner'); + + /* Glue the structure of our document together. */ + $innerDiv->appendChild($articleTitle); + $innerDiv->appendChild($articleContent); + $overlay->appendChild($innerDiv); + + /* Clear the old HTML, insert the new content. */ + $this->body->innerHTML = ''; + $this->body->appendChild($overlay); + //document.body.insertBefore(overlay, document.body.firstChild); + $this->body->removeAttribute('style'); + + $this->postProcessContent($articleContent); + + // Set title and content instance variables + $this->articleTitle = $articleTitle; + $this->articleContent = $articleContent; + + return $this->success; + } + + /** + * Debug + */ + protected function dbg($msg) { + if ($this->debug) echo '* ',$msg, "\n"; + } + + /** + * Run any post-process modifications to article content as necessary. + * + * @param DOMElement + * @return void + */ + public function postProcessContent($articleContent) { + if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { + $this->addFootnotes($articleContent); + } + } + + /** + * Get the article title as an H1. 
+ * + * @return DOMElement + */ + protected function getArticleTitle() { + $curTitle = ''; + $origTitle = ''; + + try { + $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); + } catch(Exception $e) {} + + if (preg_match('/ [\|\-] /', $curTitle)) + { + $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); + + if (count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); + } + } + else if (strpos($curTitle, ': ') !== false) + { + $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); + + if (count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); + } + } + else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) + { + $hOnes = $this->dom->getElementsByTagName('h1'); + if($hOnes->length == 1) + { + $curTitle = $this->getInnerText($hOnes->item(0)); + } + } + + $curTitle = trim($curTitle); + + if (count(explode(' ', $curTitle)) <= 4) { + $curTitle = $origTitle; + } + + $articleTitle = $this->dom->createElement('h1'); + $articleTitle->innerHTML = $curTitle; + + return $articleTitle; + } + + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + * + * @return void + **/ + protected function prepDocument() { + /** + * In some cases a body element can't be found (if the HTML is totally hosed for example) + * so we create a new body node and append it to the document. 
+ */ + if ($this->body == null) + { + $this->body = $this->dom->createElement('body'); + $this->dom->documentElement->appendChild($this->body); + } + $this->body->setAttribute('id', 'readabilityBody'); + + /* Remove all style tags in head */ + $styleTags = $this->dom->getElementsByTagName('style'); + for ($i = $styleTags->length-1; $i >= 0; $i--) + { + $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + } + + /* Turn all double br's into p's */ + /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ + //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '

            ').replace(readability.regexps.replaceFonts, '<$1span>'); + // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. + // Manipulating innerHTML as it's done in JS is not possible in PHP. + } + + /** + * For easier reading, convert this document to have footnotes at the bottom rather than inline links. + * @see http://www.roughtype.com/archives/2010/05/experiments_in.php + * + * @return void + **/ + public function addFootnotes($articleContent) { + $footnotesWrapper = $this->dom->createElement('div'); + $footnotesWrapper->setAttribute('id', 'readability-footnotes'); + $footnotesWrapper->innerHTML = '

            References

            '; + + $articleFootnotes = $this->dom->createElement('ol'); + $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); + $footnotesWrapper->appendChild($articleFootnotes); + + $articleLinks = $articleContent->getElementsByTagName('a'); + + $linkCount = 0; + for ($i = 0; $i < $articleLinks->length; $i++) + { + $articleLink = $articleLinks->item($i); + $footnoteLink = $articleLink->cloneNode(true); + $refLink = $this->dom->createElement('a'); + $footnote = $this->dom->createElement('li'); + $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); + if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); + //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, + $linkText = $this->getInnerText($articleLink); + + if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { + continue; + } + + $linkCount++; + + /** Add a superscript reference after the article link */ + $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); + $refLink->innerHTML = '[' . $linkCount . ']'; + $refLink->setAttribute('class', 'readability-DoNotFootnote'); + $refLink->setAttribute('style', 'color: inherit;'); + + //TODO: does this work or should we use DOMNode.isSameNode()? + if ($articleLink->parentNode->lastChild == $articleLink) { + $articleLink->parentNode->appendChild($refLink); + } else { + $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); + } + + $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); + $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); + + $footnote->innerHTML = '^ '; + + $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); + $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . 
$linkCount); + + $footnote->appendChild($footnoteLink); + if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; + + $articleFootnotes->appendChild($footnote); + } + + if ($linkCount > 0) { + $articleContent->appendChild($footnotesWrapper); + } + } + + /** + * Reverts P elements with class 'readability-styled' + * to text nodes - which is what they were before. + * + * @param DOMElement + * @return void + */ + function revertReadabilityStyledElements($articleContent) { + $xpath = new DOMXPath($articleContent->ownerDocument); + $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); + //$elems = $articleContent->getElementsByTagName('p'); + for ($i = $elems->length-1; $i >= 0; $i--) { + $e = $elems->item($i); + $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); + //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { + // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); + //} + } + } + + /** + * Prepare the article node for display. Clean out any inline styles, + * iframes, forms, strip extraneous

            tags, etc. + * + * @param DOMElement + * @return void + */ + function prepArticle($articleContent) { + $this->cleanStyles($articleContent); + $this->killBreaks($articleContent); + if ($this->revertForcedParagraphElements) { + $this->revertReadabilityStyledElements($articleContent); + } + + /* Clean out junk from the article content */ + $this->cleanConditionally($articleContent, 'form'); + $this->clean($articleContent, 'object'); + $this->clean($articleContent, 'h1'); + + /** + * If there is only one h2, they are probably using it + * as a header and not a subheader, so remove it since we already have a header. + ***/ + if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { + $this->clean($articleContent, 'h2'); + } + $this->clean($articleContent, 'iframe'); + + $this->cleanHeaders($articleContent); + + /* Do these last as the previous stuff may have removed junk that will affect these */ + $this->cleanConditionally($articleContent, 'table'); + $this->cleanConditionally($articleContent, 'ul'); + $this->cleanConditionally($articleContent, 'div'); + + /* Remove extra paragraphs */ + $articleParagraphs = $articleContent->getElementsByTagName('p'); + for ($i = $articleParagraphs->length-1; $i >= 0; $i--) + { + $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; + $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; + $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; + $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; + + if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') + { + $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); + } + } + + try { + $articleContent->innerHTML = preg_replace('/]*>\s*

            innerHTML); + //articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

            dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); + } + } + + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. + * + * @param Element + * @return void + **/ + protected function initializeNode($node) { + $readability = $this->dom->createAttribute('readability'); + $readability->value = 0; // this is our contentScore + $node->setAttributeNode($readability); + + switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case + case 'DIV': + $readability->value += 5; + break; + + case 'PRE': + case 'TD': + case 'BLOCKQUOTE': + $readability->value += 3; + break; + + case 'ADDRESS': + case 'OL': + case 'UL': + case 'DL': + case 'DD': + case 'DT': + case 'LI': + case 'FORM': + $readability->value -= 3; + break; + + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'TH': + $readability->value -= 5; + break; + } + $readability->value += $this->getClassWeight($node); + } + + /*** + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * + * @return DOMElement + **/ + protected function grabArticle($page=null) { + $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); + if (!$page) $page = $this->dom; + $allElements = $page->getElementsByTagName('*'); + /** + * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs + * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) + * + * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 + * TODO: Shouldn't this be a reverse traversal? 
+ **/ + $node = null; + $nodesToScore = array(); + for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { + //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { + //$node = $targetList->item($nodeIndex); + $tagName = strtoupper($node->tagName); + /* Remove unlikely candidates */ + if ($stripUnlikelyCandidates) { + $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); + if ( + preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && + !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && + $tagName != 'BODY' + ) + { + $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); + //$nodesToRemove[] = $node; + $node->parentNode->removeChild($node); + $nodeIndex--; + continue; + } + } + + if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { + $nodesToScore[] = $node; + } + + /* Turn all divs that don't have children block level elements into p's */ + if ($tagName == 'DIV') { + if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { + //$this->dbg('Altering div to p'); + $newNode = $this->dom->createElement('p'); + try { + $newNode->innerHTML = $node->innerHTML; + //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); + $node->parentNode->replaceChild($newNode, $node); + $nodeIndex--; + $nodesToScore[] = $node; // or $newNode? + } + catch(Exception $e) { + $this->dbg('Could not alter div to p, reverting back to div.: ' . 
$e); + } + } + else + { + /* EXPERIMENTAL */ + // TODO: change these p elements back to text nodes after processing + for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { + $childNode = $node->childNodes->item($i); + if ($childNode->nodeType == 3) { // XML_TEXT_NODE + //$this->dbg('replacing text node with a p tag with the same content.'); + $p = $this->dom->createElement('p'); + $p->innerHTML = $childNode->nodeValue; + $p->setAttribute('style', 'display: inline;'); + $p->setAttribute('class', 'readability-styled'); + $childNode->parentNode->replaceChild($p, $childNode); + } + } + } + } + } + + /** + * Loop through all paragraphs, and assign a score to them based on how content-y they look. + * Then add their score to their parent node. + * + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. + **/ + $candidates = array(); + for ($pt=0; $pt < count($nodesToScore); $pt++) { + $parentNode = $nodesToScore[$pt]->parentNode; + // $grandParentNode = $parentNode ? $parentNode->parentNode : null; + $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); + $innerText = $this->getInnerText($nodesToScore[$pt]); + + if (!$parentNode || !isset($parentNode->tagName)) { + continue; + } + + /* If this paragraph is less than 25 characters, don't even count it. */ + if(strlen($innerText) < 25) { + continue; + } + + /* Initialize readability data for the parent. */ + if (!$parentNode->hasAttribute('readability')) + { + $this->initializeNode($parentNode); + $candidates[] = $parentNode; + } + + /* Initialize readability data for the grandparent. */ + if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) + { + $this->initializeNode($grandParentNode); + $candidates[] = $grandParentNode; + } + + $contentScore = 0; + + /* Add a point for the paragraph itself as a base. 
*/ + $contentScore++; + + /* Add points for any commas within this paragraph */ + $contentScore += count(explode(',', $innerText)); + + /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ + $contentScore += min(floor(strlen($innerText) / 100), 3); + + /* Add the score to the parent. The grandparent gets half. */ + $parentNode->getAttributeNode('readability')->value += $contentScore; + + if ($grandParentNode) { + $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; + } + } + + /** + * After we've calculated scores, loop through all of the possible candidate nodes we found + * and find the one with the highest score. + **/ + $topCandidate = null; + for ($c=0, $cl=count($candidates); $c < $cl; $c++) + { + /** + * Scale the final candidates score based on link density. Good content should have a + * relatively small link density (5% or less) and be mostly unaffected by this operation. + **/ + $readability = $candidates[$c]->getAttributeNode('readability'); + $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); + + $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); + + if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { + $topCandidate = $candidates[$c]; + } + } + + /** + * If we still have no top candidate, just use the body as a last resort. + * We also have to copy the body node so it is something we can modify. + **/ + if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') + { + $topCandidate = $this->dom->createElement('div'); + if ($page instanceof DOMDocument) { + if (!isset($page->documentElement)) { + // we don't have a body either? what a mess! 
:) + } else { + $topCandidate->innerHTML = $page->documentElement->innerHTML; + $page->documentElement->innerHTML = ''; + $page->documentElement->appendChild($topCandidate); + } + } else { + $topCandidate->innerHTML = $page->innerHTML; + $page->innerHTML = ''; + $page->appendChild($topCandidate); + } + $this->initializeNode($topCandidate); + } + + /** + * Now that we have the top candidate, look through its siblings for content that might also be related. + * Things like preambles, content split by ads that we removed, etc. + **/ + $articleContent = $this->dom->createElement('div'); + $articleContent->setAttribute('id', 'readability-content'); + $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); + $siblingNodes = $topCandidate->parentNode->childNodes; + if (!isset($siblingNodes)) { + $siblingNodes = new stdClass; + $siblingNodes->length = 0; + } + + for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) + { + $siblingNode = $siblingNodes->item($s); + $append = false; + + $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + + //dbg('Sibling has score ' . ($siblingNode->readability ? 
siblingNode.readability.contentScore : 'Unknown')); + + if ($siblingNode === $topCandidate) + // or if ($siblingNode->isSameNode($topCandidate)) + { + $append = true; + } + + $contentBonus = 0; + /* Give a bonus if sibling nodes and top candidates have the example same classname */ + if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { + $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; + } + + if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) + { + $append = true; + } + + if (strtoupper($siblingNode->nodeName) == 'P') { + $linkDensity = $this->getLinkDensity($siblingNode); + $nodeContent = $this->getInnerText($siblingNode); + $nodeLength = strlen($nodeContent); + + if ($nodeLength > 80 && $linkDensity < 0.25) + { + $append = true; + } + else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) + { + $append = true; + } + } + + if ($append) + { + $this->dbg('Appending node: ' . $siblingNode->nodeName); + + $nodeToAppend = null; + $sibNodeName = strtoupper($siblingNode->nodeName); + if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { + /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ + + $this->dbg('Altering siblingNode of ' . $sibNodeName . 
' to div.'); + $nodeToAppend = $this->dom->createElement('div'); + try { + $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); + $nodeToAppend->innerHTML = $siblingNode->innerHTML; + } + catch(Exception $e) + { + $this->dbg('Could not alter siblingNode to div, reverting back to original.'); + $nodeToAppend = $siblingNode; + $s--; + $sl--; + } + } else { + $nodeToAppend = $siblingNode; + $s--; + $sl--; + } + + /* To ensure a node does not interfere with readability styles, remove its classnames */ + $nodeToAppend->removeAttribute('class'); + + /* Append sibling and subtract from our list because it removes the node when you append to another node */ + $articleContent->appendChild($nodeToAppend); + } + } + + /** + * So we have all of the content that we need. Now we clean it up for presentation. + **/ + $this->prepArticle($articleContent); + + /** + * Now that we've gone through the full algorithm, check to see if we got any meaningful content. + * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher + * likelihood of finding the content, and the sieve approach gives us a higher likelihood of + * finding the -right- content. + **/ + if (strlen($this->getInnerText($articleContent, false)) < 250) + { + // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 + // in the meantime, we check and create an empty element if it's not there. 
+ if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); + $this->body->innerHTML = $this->bodyCache; + + if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { + $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); + return $this->grabArticle($this->body); + } + else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { + $this->removeFlag(self::FLAG_WEIGHT_CLASSES); + return $this->grabArticle($this->body); + } + else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); + return $this->grabArticle($this->body); + } + else { + return false; + } + } + return $articleContent; + } + + /** + * Remove script tags from document + * + * @param DOMElement + * @return void + */ + public function removeScripts($doc) { + $scripts = $doc->getElementsByTagName('script'); + for($i = $scripts->length-1; $i >= 0; $i--) + { + $scripts->item($i)->parentNode->removeChild($scripts->item($i)); + } + } + + /** + * Get the inner text of a node. + * This also strips out any excess whitespace to be found. + * + * @param DOMElement $ + * @param boolean $normalizeSpaces (default: true) + * @return string + **/ + public function getInnerText($e, $normalizeSpaces=true) { + $textContent = ''; + + if (!isset($e->textContent) || $e->textContent == '') { + return ''; + } + + $textContent = trim($e->textContent); + + if ($normalizeSpaces) { + return preg_replace($this->regexps['normalize'], ' ', $textContent); + } else { + return $textContent; + } + } + + /** + * Get the number of times a string $s appears in the node $e. + * + * @param DOMElement $e + * @param string - what to count. Default is "," + * @return number (integer) + **/ + public function getCharCount($e, $s=',') { + return substr_count($this->getInnerText($e), $s); + } + + /** + * Remove the style attribute on every $e and under. 
+ * + * @param DOMElement $e + * @return void + */ + public function cleanStyles($e) { + if (!is_object($e)) return; + $elems = $e->getElementsByTagName('*'); + foreach ($elems as $elem) { + $elem->removeAttribute('style'); + } + } + + /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * + * @param DOMElement $e + * @return number (float) + */ + public function getLinkDensity($e) { + $links = $e->getElementsByTagName('a'); + $textLength = strlen($this->getInnerText($e)); + $linkLength = 0; + for ($i=0, $il=$links->length; $i < $il; $i++) + { + $linkLength += strlen($this->getInnerText($links->item($i))); + } + if ($textLength > 0) { + return $linkLength / $textLength; + } else { + return 0; + } + } + + /** + * Get an elements class/id weight. Uses regular expressions to tell if this + * element looks good or bad. + * + * @param DOMElement $e + * @return number (Integer) + */ + public function getClassWeight($e) { + if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { + return 0; + } + + $weight = 0; + + /* Look for a special classname */ + if ($e->hasAttribute('class') && $e->getAttribute('class') != '') + { + if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { + $weight -= 25; + } + if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { + $weight += 25; + } + } + + /* Look for a special ID */ + if ($e->hasAttribute('id') && $e->getAttribute('id') != '') + { + if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { + $weight -= 25; + } + if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { + $weight += 25; + } + } + return $weight; + } + + /** + * Remove extraneous break tags from a node. + * + * @param DOMElement $node + * @return void + */ + public function killBreaks($node) { + $html = $node->innerHTML; + $html = preg_replace($this->regexps['killBreaks'], '
            ', $html); + $node->innerHTML = $html; + } + + /** + * Clean a node of all elements of type "tag". + * (Unless it's a youtube/vimeo video. People love movies.) + * + * Updated 2012-09-18 to preserve youtube/vimeo iframes + * + * @param DOMElement $e + * @param string $tag + * @return void + */ + public function clean($e, $tag) { + $targetList = $e->getElementsByTagName($tag); + $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); + + for ($y=$targetList->length-1; $y >= 0; $y--) { + /* Allow youtube and vimeo videos through as people usually want to see those. */ + if ($isEmbed) { + $attributeValues = ''; + for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { + $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) + } + + /* First, check the elements attributes to see if any of them contain youtube or vimeo */ + if (preg_match($this->regexps['video'], $attributeValues)) { + continue; + } + + /* Then check the elements inside this element for the same. */ + if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { + continue; + } + } + $targetList->item($y)->parentNode->removeChild($targetList->item($y)); + } + } + + /** + * Clean an element of all tags of type "tag" if they look fishy. + * "Fishy" is an algorithm based on content length, classnames, + * link density, number of images & embeds, etc. + * + * @param DOMElement $e + * @param string $tag + * @return void + */ + public function cleanConditionally($e, $tag) { + if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + return; + } + + $tagsList = $e->getElementsByTagName($tag); + $curTagsLength = $tagsList->length; + + /** + * Gather counts for other typical elements embedded within. + * Traverse backwards so we can remove nodes at the same time without effecting the traversal. + * + * TODO: Consider taking into account original contentScore here. 
+ */ + for ($i=$curTagsLength-1; $i >= 0; $i--) { + $weight = $this->getClassWeight($tagsList->item($i)); + $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; + + $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); + + if ($weight + $contentScore < 0) { + $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); + } + else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { + /** + * If there are not very many commas, and the number of + * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. + **/ + $p = $tagsList->item($i)->getElementsByTagName('p')->length; + $img = $tagsList->item($i)->getElementsByTagName('img')->length; + $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; + $input = $tagsList->item($i)->getElementsByTagName('input')->length; + $a = $tagsList->item($i)->getElementsByTagName('a')->length; + + $embedCount = 0; + $embeds = $tagsList->item($i)->getElementsByTagName('embed'); + for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { + $embedCount++; + } + } + $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); + for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { + $embedCount++; + } + } + + $linkDensity = $this->getLinkDensity($tagsList->item($i)); + $contentLength = strlen($this->getInnerText($tagsList->item($i))); + $toRemove = false; + + if ($this->lightClean) { + $this->dbg('Light clean...'); + if ( ($img > $p) && ($img > 4) ) { + $this->dbg(' more than 4 images and more 
image elements than paragraph elements'); + $toRemove = true; + } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { + $this->dbg(' too many

          1. elements, and parent is not
              or
                '); + $toRemove = true; + } else if ( $input > floor($p/3) ) { + $this->dbg(' too many elements'); + $toRemove = true; + } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) { + $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images'); + $toRemove = true; + } else if($weight < 25 && $linkDensity > 0.2) { + $this->dbg(' weight smaller than 25 and link density above 0.2'); + $toRemove = true; + } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { + $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); + $toRemove = true; + } else if($embedCount > 3) { + $this->dbg(' more than 3 embeds'); + $toRemove = true; + } + } else { + $this->dbg('Standard clean...'); + if ( $img > $p ) { + $this->dbg(' more image elements than paragraph elements'); + $toRemove = true; + } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { + $this->dbg(' too many
              1. elements, and parent is not
                  or
                    '); + $toRemove = true; + } else if ( $input > floor($p/3) ) { + $this->dbg(' too many elements'); + $toRemove = true; + } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { + $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); + $toRemove = true; + } else if($weight < 25 && $linkDensity > 0.2) { + $this->dbg(' weight smaller than 25 and link density above 0.2'); + $toRemove = true; + } else if($weight >= 25 && $linkDensity > 0.5) { + $this->dbg(' weight above 25 but link density greater than 0.5'); + $toRemove = true; + } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { + $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); + $toRemove = true; + } + } + + if ($toRemove) { + //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); + $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); + } + } + } + } + + /** + * Clean out spurious headers from an Element. Checks things like classnames and link density. + * + * @param DOMElement $e + * @return void + */ + public function cleanHeaders($e) { + for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { + $headers = $e->getElementsByTagName('h' . 
$headerIndex); + for ($i=$headers->length-1; $i >=0; $i--) { + if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { + $headers->item($i)->parentNode->removeChild($headers->item($i)); + } + } + } + } + + public function flagIsActive($flag) { + return ($this->flags & $flag) > 0; + } + + public function addFlag($flag) { + $this->flags = $this->flags | $flag; + } + + public function removeFlag($flag) { + $this->flags = $this->flags & ~$flag; + } +} ?> \ No newline at end of file -- cgit v1.2.3 From d18ff7d9565f982bc15c5930123992d44614e1e2 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Fri, 23 May 2014 19:25:48 +0300 Subject: two small unimportant forgotten changes to 3.2 version of full-text-rss, issue #694 --- inc/3rdparty/libraries/language-detect/Parser.php | 354 ---------------------- 1 file changed, 354 deletions(-) delete mode 100644 inc/3rdparty/libraries/language-detect/Parser.php (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/Parser.php deleted file mode 100644 index 7f15fa98..00000000 --- a/inc/3rdparty/libraries/language-detect/Parser.php +++ /dev/null @@ -1,354 +0,0 @@ -_db_filename = $db; - if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; - $this->_string = $string; - } - - /** - * Returns true if a string is suitable for parsing - * - * @static - * @access public - * @param string $str input string to test - * @return bool true if acceptable, false if not - */ - function validateString($str) { - if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { - return true; - } else { - return false; - } - } - - /** - * turn on/off trigram counting - * - * @access public - * @param bool $bool true for on, false for off - */ - function prepareTrigram($bool = true) - { - $this->_compile_trigram = $bool; - } - - /** - * turn on/off unicode block counting - * - * @access public - * 
@param bool $bool true for on, false for off - */ - function prepareUnicode($bool = true) - { - $this->_compile_unicode = $bool; - } - - /** - * turn on/off padding the beginning of the sample string - * - * @access public - * @param bool $bool true for on, false for off - */ - function setPadStart($bool = true) - { - $this->_trigram_pad_start = $bool; - } - - /** - * Should the unicode block counter skip non-alphabetical ascii chars? - * - * @access public - * @param bool $bool true for on, false for off - */ - function setUnicodeSkipSymbols($bool = true) - { - $this->_unicode_skip_symbols = $bool; - } - - /** - * Returns the trigram ranks for the text sample - * - * @access public - * @return array trigram ranks in the text sample - */ - function &getTrigramRanks() - { - return $this->_trigram_ranks; - } - - /** - * Return the trigram freqency table - * - * only used in testing to make sure the parser is working - * - * @access public - * @return array trigram freqencies in the text sample - */ - function &getTrigramFreqs() - { - return $this->_trigram; - } - - /** - * returns the array of unicode blocks - * - * @access public - * @return array unicode blocks in the text sample - */ - function &getUnicodeBlocks() - { - return $this->_unicode_blocks; - } - - /** - * Executes the parsing operation - * - * Be sure to call the set*() functions to set options and the - * prepare*() functions first to tell it what kind of data to compute - * - * Afterwards the get*() functions can be used to access the compiled - * information. 
- * - * @access public - */ - function analyze() - { - $len = strlen($this->_string); - $byte_counter = 0; - - - // unicode startup - if ($this->_compile_unicode) { - $blocks =& $this->_read_unicode_block_db(); - - $block_count = count($blocks); - - $skipped_count = 0; - $unicode_chars = array(); - } - - // trigram startup - if ($this->_compile_trigram) { - // initialize them as blank so the parser will skip the first two - // (since it skips trigrams with more than 2 contiguous spaces) - $a = ' '; - $b = ' '; - - // kludge - // if it finds a valid trigram to start and the start pad option is - // off, then set a variable that will be used to reduce this - // trigram after parsing has finished - if (!$this->_trigram_pad_start) { - $a = $this->_next_char($this->_string, $byte_counter, true); - - if ($a != ' ') { - $b = $this->_next_char($this->_string, $byte_counter, true); - $dropone = " $a$b"; - } - - $byte_counter = 0; - $a = ' '; - $b = ' '; - } - } - - while ($byte_counter < $len) { - $char = $this->_next_char($this->_string, $byte_counter, true); - - - // language trigram detection - if ($this->_compile_trigram) { - if (!($b == ' ' && ($a == ' ' || $char == ' '))) { - if (!isset($this->_trigram[$a . $b . $char])) { - $this->_trigram[$a . $b . $char] = 1; - } else { - $this->_trigram[$a . $b . 
$char]++; - } - } - - $a = $b; - $b = $char; - } - - // unicode block detection - if ($this->_compile_unicode) { - if ($this->_unicode_skip_symbols - && strlen($char) == 1 - && ($char < 'A' || $char > 'z' - || ($char > 'Z' && $char < 'a')) - && $char != "'") { // does not skip the apostrophe - // since it's included in the language - // models - - $skipped_count++; - continue; - } - - // build an array of all the characters - if (isset($unicode_chars[$char])) { - $unicode_chars[$char]++; - } else { - $unicode_chars[$char] = 1; - } - } - - // todo: add byte detection here - } - - // unicode cleanup - if ($this->_compile_unicode) { - foreach ($unicode_chars as $utf8_char => $count) { - $search_result = $this->_unicode_block_name( - $this->_utf8char2unicode($utf8_char), $blocks, $block_count); - - if ($search_result != -1) { - $block_name = $search_result[2]; - } else { - $block_name = '[Malformatted]'; - } - - if (isset($this->_unicode_blocks[$block_name])) { - $this->_unicode_blocks[$block_name] += $count; - } else { - $this->_unicode_blocks[$block_name] = $count; - } - } - } - - - // trigram cleanup - if ($this->_compile_trigram) { - // pad the end - if ($b != ' ') { - if (!isset($this->_trigram["$a$b "])) { - $this->_trigram["$a$b "] = 1; - } else { - $this->_trigram["$a$b "]++; - } - } - - // perl compatibility; Language::Guess does not pad the beginning - // kludge - if (isset($dropone)) { - if ($this->_trigram[$dropone] == 1) { - unset($this->_trigram[$dropone]); - } else { - $this->_trigram[$dropone]--; - } - } - - if (!empty($this->_trigram)) { - $this->_trigram_ranks = $this->_arr_rank($this->_trigram); - } else { - $this->_trigram_ranks = array(); - } - } - } -} - -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ - -?> -- cgit v1.2.3 From a50583fb97615f4c26cc84ee95d62f867a84b4e6 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Fri, 23 May 2014 19:27:17 +0300 Subject: last 3 important changes to 3.2 version of full-text-rss, issue #694 
--- .../language-detect/LanguageDetect/Exception.php | 57 ++++ .../language-detect/LanguageDetect/ISO639.php | 339 ++++++++++++++++++++ .../language-detect/LanguageDetect/Parser.php | 347 +++++++++++++++++++++ 3 files changed, 743 insertions(+) create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 00000000..196d994f --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php @@ -0,0 +1,57 @@ + + * @copyright 2011 Christian Weiske + * @license http://www.debian.org/misc/bsd.license BSD + * @version SVN: $Id$ + * @link http://pear.php.net/package/Text_LanguageDetect/ + */ + +/** + * Provides a mapping between the languages from lang.dat and the + * ISO 639-1 and ISO-639-2 codes. + * + * Note that this class contains only languages that exist in lang.dat. + * + * @category Text + * @package Text_LanguageDetect + * @author Christian Weiske + * @copyright 2011 Christian Weiske + * @license http://www.debian.org/misc/bsd.license BSD + * @link http://www.loc.gov/standards/iso639-2/php/code_list.php + */ +class Text_LanguageDetect_ISO639 +{ + /** + * Maps all language names from the language database to the + * ISO 639-1 2-letter language code. + * + * NULL indicates that there is no 2-letter code. 
+ * + * @var array + */ + public static $nameToCode2 = array( + 'albanian' => 'sq', + 'arabic' => 'ar', + 'azeri' => 'az', + 'bengali' => 'bn', + 'bulgarian' => 'bg', + 'cebuano' => null, + 'croatian' => 'hr', + 'czech' => 'cs', + 'danish' => 'da', + 'dutch' => 'nl', + 'english' => 'en', + 'estonian' => 'et', + 'farsi' => 'fa', + 'finnish' => 'fi', + 'french' => 'fr', + 'german' => 'de', + 'hausa' => 'ha', + 'hawaiian' => null, + 'hindi' => 'hi', + 'hungarian' => 'hu', + 'icelandic' => 'is', + 'indonesian' => 'id', + 'italian' => 'it', + 'kazakh' => 'kk', + 'kyrgyz' => 'ky', + 'latin' => 'la', + 'latvian' => 'lv', + 'lithuanian' => 'lt', + 'macedonian' => 'mk', + 'mongolian' => 'mn', + 'nepali' => 'ne', + 'norwegian' => 'no', + 'pashto' => 'ps', + 'pidgin' => null, + 'polish' => 'pl', + 'portuguese' => 'pt', + 'romanian' => 'ro', + 'russian' => 'ru', + 'serbian' => 'sr', + 'slovak' => 'sk', + 'slovene' => 'sl', + 'somali' => 'so', + 'spanish' => 'es', + 'swahili' => 'sw', + 'swedish' => 'sv', + 'tagalog' => 'tl', + 'turkish' => 'tr', + 'ukrainian' => 'uk', + 'urdu' => 'ur', + 'uzbek' => 'uz', + 'vietnamese' => 'vi', + 'welsh' => 'cy', + ); + + /** + * Maps all language names from the language database to the + * ISO 639-2 3-letter language code. 
+ * + * @var array + */ + public static $nameToCode3 = array( + 'albanian' => 'sqi', + 'arabic' => 'ara', + 'azeri' => 'aze', + 'bengali' => 'ben', + 'bulgarian' => 'bul', + 'cebuano' => 'ceb', + 'croatian' => 'hrv', + 'czech' => 'ces', + 'danish' => 'dan', + 'dutch' => 'nld', + 'english' => 'eng', + 'estonian' => 'est', + 'farsi' => 'fas', + 'finnish' => 'fin', + 'french' => 'fra', + 'german' => 'deu', + 'hausa' => 'hau', + 'hawaiian' => 'haw', + 'hindi' => 'hin', + 'hungarian' => 'hun', + 'icelandic' => 'isl', + 'indonesian' => 'ind', + 'italian' => 'ita', + 'kazakh' => 'kaz', + 'kyrgyz' => 'kir', + 'latin' => 'lat', + 'latvian' => 'lav', + 'lithuanian' => 'lit', + 'macedonian' => 'mkd', + 'mongolian' => 'mon', + 'nepali' => 'nep', + 'norwegian' => 'nor', + 'pashto' => 'pus', + 'pidgin' => 'crp', + 'polish' => 'pol', + 'portuguese' => 'por', + 'romanian' => 'ron', + 'russian' => 'rus', + 'serbian' => 'srp', + 'slovak' => 'slk', + 'slovene' => 'slv', + 'somali' => 'som', + 'spanish' => 'spa', + 'swahili' => 'swa', + 'swedish' => 'swe', + 'tagalog' => 'tgl', + 'turkish' => 'tur', + 'ukrainian' => 'ukr', + 'urdu' => 'urd', + 'uzbek' => 'uzb', + 'vietnamese' => 'vie', + 'welsh' => 'cym', + ); + + /** + * Maps ISO 639-1 2-letter language codes to the language names + * in the language database + * + * Not all languages have a 2 letter code, so some are missing + * + * @var array + */ + public static $code2ToName = array( + 'ar' => 'arabic', + 'az' => 'azeri', + 'bg' => 'bulgarian', + 'bn' => 'bengali', + 'cs' => 'czech', + 'cy' => 'welsh', + 'da' => 'danish', + 'de' => 'german', + 'en' => 'english', + 'es' => 'spanish', + 'et' => 'estonian', + 'fa' => 'farsi', + 'fi' => 'finnish', + 'fr' => 'french', + 'ha' => 'hausa', + 'hi' => 'hindi', + 'hr' => 'croatian', + 'hu' => 'hungarian', + 'id' => 'indonesian', + 'is' => 'icelandic', + 'it' => 'italian', + 'kk' => 'kazakh', + 'ky' => 'kyrgyz', + 'la' => 'latin', + 'lt' => 'lithuanian', + 'lv' => 'latvian', + 'mk' => 
'macedonian', + 'mn' => 'mongolian', + 'ne' => 'nepali', + 'nl' => 'dutch', + 'no' => 'norwegian', + 'pl' => 'polish', + 'ps' => 'pashto', + 'pt' => 'portuguese', + 'ro' => 'romanian', + 'ru' => 'russian', + 'sk' => 'slovak', + 'sl' => 'slovene', + 'so' => 'somali', + 'sq' => 'albanian', + 'sr' => 'serbian', + 'sv' => 'swedish', + 'sw' => 'swahili', + 'tl' => 'tagalog', + 'tr' => 'turkish', + 'uk' => 'ukrainian', + 'ur' => 'urdu', + 'uz' => 'uzbek', + 'vi' => 'vietnamese', + ); + + /** + * Maps ISO 639-2 3-letter language codes to the language names + * in the language database. + * + * @var array + */ + public static $code3ToName = array( + 'ara' => 'arabic', + 'aze' => 'azeri', + 'ben' => 'bengali', + 'bul' => 'bulgarian', + 'ceb' => 'cebuano', + 'ces' => 'czech', + 'crp' => 'pidgin', + 'cym' => 'welsh', + 'dan' => 'danish', + 'deu' => 'german', + 'eng' => 'english', + 'est' => 'estonian', + 'fas' => 'farsi', + 'fin' => 'finnish', + 'fra' => 'french', + 'hau' => 'hausa', + 'haw' => 'hawaiian', + 'hin' => 'hindi', + 'hrv' => 'croatian', + 'hun' => 'hungarian', + 'ind' => 'indonesian', + 'isl' => 'icelandic', + 'ita' => 'italian', + 'kaz' => 'kazakh', + 'kir' => 'kyrgyz', + 'lat' => 'latin', + 'lav' => 'latvian', + 'lit' => 'lithuanian', + 'mkd' => 'macedonian', + 'mon' => 'mongolian', + 'nep' => 'nepali', + 'nld' => 'dutch', + 'nor' => 'norwegian', + 'pol' => 'polish', + 'por' => 'portuguese', + 'pus' => 'pashto', + 'rom' => 'romanian', + 'rus' => 'russian', + 'slk' => 'slovak', + 'slv' => 'slovene', + 'som' => 'somali', + 'spa' => 'spanish', + 'sqi' => 'albanian', + 'srp' => 'serbian', + 'swa' => 'swahili', + 'swe' => 'swedish', + 'tgl' => 'tagalog', + 'tur' => 'turkish', + 'ukr' => 'ukrainian', + 'urd' => 'urdu', + 'uzb' => 'uzbek', + 'vie' => 'vietnamese', + ); + + /** + * Returns the 2-letter ISO 639-1 code for the given language name. + * + * @param string $lang English language name like "swedish" + * + * @return string Two-letter language code (e.g. 
"sv") or NULL if not found + */ + public static function nameToCode2($lang) + { + $lang = strtolower($lang); + if (!isset(self::$nameToCode2[$lang])) { + return null; + } + return self::$nameToCode2[$lang]; + } + + /** + * Returns the 3-letter ISO 639-2 code for the given language name. + * + * @param string $lang English language name like "swedish" + * + * @return string Three-letter language code (e.g. "swe") or NULL if not found + */ + public static function nameToCode3($lang) + { + $lang = strtolower($lang); + if (!isset(self::$nameToCode3[$lang])) { + return null; + } + return self::$nameToCode3[$lang]; + } + + /** + * Returns the language name for the given 2-letter ISO 639-1 code. + * + * @param string $code Two-letter language code (e.g. "sv") + * + * @return string English language name like "swedish" + */ + public static function code2ToName($code) + { + $lang = strtolower($code); + if (!isset(self::$code2ToName[$code])) { + return null; + } + return self::$code2ToName[$code]; + } + + /** + * Returns the language name for the given 3-letter ISO 639-2 code. + * + * @param string $code Three-letter language code (e.g. 
"swe") + * + * @return string English language name like "swedish" + */ + public static function code3ToName($code) + { + $lang = strtolower($code); + if (!isset(self::$code3ToName[$code])) { + return null; + } + return self::$code3ToName[$code]; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php new file mode 100644 index 00000000..fb0e1e20 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php @@ -0,0 +1,347 @@ +_string = $string; + } + + /** + * Returns true if a string is suitable for parsing + * + * @param string $str input string to test + * @return bool true if acceptable, false if not + */ + public static function validateString($str) { + if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { + return true; + } else { + return false; + } + } + + /** + * turn on/off trigram counting + * + * @access public + * @param bool $bool true for on, false for off + */ + function prepareTrigram($bool = true) + { + $this->_compile_trigram = $bool; + } + + /** + * turn on/off unicode block counting + * + * @access public + * @param bool $bool true for on, false for off + */ + function prepareUnicode($bool = true) + { + $this->_compile_unicode = $bool; + } + + /** + * turn on/off padding the beginning of the sample string + * + * @access public + * @param bool $bool true for on, false for off + */ + function setPadStart($bool = true) + { + $this->_trigram_pad_start = $bool; + } + + /** + * Should the unicode block counter skip non-alphabetical ascii chars? 
+ * + * @access public + * @param bool $bool true for on, false for off + */ + function setUnicodeSkipSymbols($bool = true) + { + $this->_unicode_skip_symbols = $bool; + } + + /** + * Returns the trigram ranks for the text sample + * + * @access public + * @return array trigram ranks in the text sample + */ + function &getTrigramRanks() + { + return $this->_trigram_ranks; + } + + /** + * Return the trigram freqency table + * + * only used in testing to make sure the parser is working + * + * @access public + * @return array trigram freqencies in the text sample + */ + function &getTrigramFreqs() + { + return $this->_trigram; + } + + /** + * returns the array of unicode blocks + * + * @access public + * @return array unicode blocks in the text sample + */ + function &getUnicodeBlocks() + { + return $this->_unicode_blocks; + } + + /** + * Executes the parsing operation + * + * Be sure to call the set*() functions to set options and the + * prepare*() functions first to tell it what kind of data to compute + * + * Afterwards the get*() functions can be used to access the compiled + * information. 
+ * + * @access public + */ + function analyze() + { + $len = strlen($this->_string); + $byte_counter = 0; + + + // unicode startup + if ($this->_compile_unicode) { + $blocks = $this->_read_unicode_block_db(); + $block_count = count($blocks); + + $skipped_count = 0; + $unicode_chars = array(); + } + + // trigram startup + if ($this->_compile_trigram) { + // initialize them as blank so the parser will skip the first two + // (since it skips trigrams with more than 2 contiguous spaces) + $a = ' '; + $b = ' '; + + // kludge + // if it finds a valid trigram to start and the start pad option is + // off, then set a variable that will be used to reduce this + // trigram after parsing has finished + if (!$this->_trigram_pad_start) { + $a = $this->_next_char($this->_string, $byte_counter, true); + + if ($a != ' ') { + $b = $this->_next_char($this->_string, $byte_counter, true); + $dropone = " $a$b"; + } + + $byte_counter = 0; + $a = ' '; + $b = ' '; + } + } + + while ($byte_counter < $len) { + $char = $this->_next_char($this->_string, $byte_counter, true); + + + // language trigram detection + if ($this->_compile_trigram) { + if (!($b == ' ' && ($a == ' ' || $char == ' '))) { + if (!isset($this->_trigram[$a . $b . $char])) { + $this->_trigram[$a . $b . $char] = 1; + } else { + $this->_trigram[$a . $b . 
$char]++; + } + } + + $a = $b; + $b = $char; + } + + // unicode block detection + if ($this->_compile_unicode) { + if ($this->_unicode_skip_symbols + && strlen($char) == 1 + && ($char < 'A' || $char > 'z' + || ($char > 'Z' && $char < 'a')) + && $char != "'") { // does not skip the apostrophe + // since it's included in the language + // models + + $skipped_count++; + continue; + } + + // build an array of all the characters + if (isset($unicode_chars[$char])) { + $unicode_chars[$char]++; + } else { + $unicode_chars[$char] = 1; + } + } + + // todo: add byte detection here + } + + // unicode cleanup + if ($this->_compile_unicode) { + foreach ($unicode_chars as $utf8_char => $count) { + $search_result = $this->_unicode_block_name( + $this->_utf8char2unicode($utf8_char), $blocks, $block_count); + + if ($search_result != -1) { + $block_name = $search_result[2]; + } else { + $block_name = '[Malformatted]'; + } + + if (isset($this->_unicode_blocks[$block_name])) { + $this->_unicode_blocks[$block_name] += $count; + } else { + $this->_unicode_blocks[$block_name] = $count; + } + } + } + + + // trigram cleanup + if ($this->_compile_trigram) { + // pad the end + if ($b != ' ') { + if (!isset($this->_trigram["$a$b "])) { + $this->_trigram["$a$b "] = 1; + } else { + $this->_trigram["$a$b "]++; + } + } + + // perl compatibility; Language::Guess does not pad the beginning + // kludge + if (isset($dropone)) { + if ($this->_trigram[$dropone] == 1) { + unset($this->_trigram[$dropone]); + } else { + $this->_trigram[$dropone]--; + } + } + + if (!empty($this->_trigram)) { + $this->_trigram_ranks = $this->_arr_rank($this->_trigram); + } else { + $this->_trigram_ranks = array(); + } + } + } +} + +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file -- cgit v1.2.3 From 3dc8d84229ed0f3ccd40b44420ed6e818a6edea9 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 29 May 2014 16:35:00 +0300 Subject: fix of uninitialized object warning, issue 
#710 --- inc/3rdparty/libraries/feedwriter/FeedWriter.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'inc/3rdparty/libraries') diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index 77755690..aa064afb 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php @@ -97,13 +97,16 @@ define('JSONP', 3, true); header('X-content-type-options: nosniff'); } elseif ($this->version == JSON) { header('Content-type: application/json; charset=UTF-8'); - $this->json = new stdClass(); } elseif ($this->version == JSONP) { header('Content-type: application/javascript; charset=UTF-8'); - $this->json = new stdClass(); } } + if ($this->version == JSON || $this->version == JSONP) { + $this->json = new stdClass(); + } + + $this->printHead(); $this->printChannels(); $this->printItems(); @@ -202,7 +205,7 @@ define('JSONP', 3, true); public function setDescription($description) { $tag = ($this->version == ATOM)? 'subtitle' : 'description'; - $this->setChannelElement($tag, $desciption); + $this->setChannelElement($tag, $description); } /** -- cgit v1.2.3