Merge pull request #712 from wallabag/dev1.7.0

1.7, call me "Premium version"
author: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
committer: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
commit: a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch)
tree: 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
parent: 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff)
parent: 8038b38802769031e050c753fc0a388a2276629e (diff)
download: wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz
wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst
wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip
1 files changed, 201 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
new file mode 100644
index 00000000..1d44f238
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
@@ -0,0 +1,201 @@
+<?php
+/**
+ * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
+ * The splitter uses the DOM to try to retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
+ * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
+ *
+ * @author A. Grandt <php@grandt.com>
+ * @copyright 2009-2014 A. Grandt
+ * @license GNU LGPL 2.1
+ * @link http://www.phpclasses.org/package/6115
+ * @link https://github.com/Grandt/PHPePub
+ * @version 3.20
+ */
+class EPubChapterSplitter {
+    const VERSION = 3.20;
+    /** @var int Target maximum size, in bytes, of each generated part. */
+    private $splitDefaultSize = 250000;
+    /** @var string EPub version; decides whether the DOCTYPE is stripped from generated parts. */
+    private $bookVersion = EPub::BOOK_VERSION_EPUB2;
+    /**
+     * Set the EPub version the chapters are being split for.
+     *
+     * splitChapter() removes the DOCTYPE declaration from every part when this
+     * value equals EPub::BOOK_VERSION_EPUB3.
+     *
+     * @param string $bookVersion Version identifier; it is trimmed before being stored.
+     *                            Any non-string value selects EPub::BOOK_VERSION_EPUB2.
+     * @return void
+     */
+    public function setVersion($bookVersion) {
+        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
+    }
+    /**
+     * Set default chapter target size.
+     * Default is 250000 bytes, and minimum is 10240 bytes.
+     *
+     * @param int $size segment size in bytes; values below the minimum are raised to it
+     * @return void
+     */
+    public function setSplitSize($size) {
+        // Clamp AFTER the integer cast. Comparing the raw argument lets non-numeric
+        // input (e.g. "abc" on PHP 8) slip past the minimum check and store 0.
+        // Making the file smaller than 10k is not a good idea.
+        $this->splitDefaultSize = max(10240, (int)$size);
+    }
+    /**
+     * Get the chapter target size.
+     *
+     * @return int target size in bytes
+     */
+    public function getSplitSize() {
+        return $this->splitDefaultSize;
+    }
+    /**
+     * Split $chapter into multiple parts.
+     *
+     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
+     * If the search string is a regular string, it is matched against nodes whose markup starts with a tag directly followed by that string.
+     *
+     * Without a search string, a chapter no larger than the split size is returned unchanged as a single part.
+     * Otherwise the body is walked node by node and a new part is started whenever the current part would
+     * exceed the computed part size (size mode) or a node matches the search string (search mode).
+     * Each part is a complete document: the original <html> opening tag, a copy of <head>, and a <body>
+     * holding that part's nodes.
+     *
+     * Known limitations kept from the original algorithm:
+     *  - text nodes that are direct children of <body> or of a container that was descended into are not copied;
+     *  - right after descending into an oversized container, the first child is still measured and matched
+     *    using the container's serialized markup.
+     *
+     * @param String $chapter XHTML file
+     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
+     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
+     *
+     * @return array with 1 or more parts. In size mode the keys are 0..n-1. In search mode a part is keyed by
+     *               the trimmed markup of the node that started it, or by its numeric index when no matching
+     *               node started it (e.g. content preceding the first match).
+     */
+    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
+        $chapterData = array();
+        // Treated as a regular expression when the string is wrapped in a repeated delimiter character,
+        // optionally followed by pattern modifiers.
+        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
+        if ($splitOnSearchString && !$isSearchRegexp) {
+            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
+        }
+        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
+            return array($chapter);
+        }
+        if (trim($chapter) === '') {
+            // Nothing to parse; DOMDocument::loadHTML() rejects empty input.
+            return array($chapter);
+        }
+        $xmlDoc = new DOMDocument();
+        // Parser warnings about imperfect markup are expected and deliberately silenced.
+        @$xmlDoc->loadHTML($chapter);
+        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
+        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
+        if ($bodyNode === null || $bodyNode->firstChild === null) {
+            // No body content to distribute over parts.
+            return array($chapter);
+        }
+        // Skeleton for every part: everything up to and including the original <html ...> tag.
+        $htmlPos = stripos($chapter, "<html");
+        $htmlEndPos = stripos($chapter, ">", $htmlPos);
+        $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
+        if (strpos(trim($newXML), "<?xml ") === FALSE) {
+            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
+        }
+        $headerLength = strlen($newXML);
+        $files = array();           // one DocumentFragment per output part
+        $chapterNames = array();    // part index => markup of the node that started the part
+        $domDepth = 0;
+        $domPath = array();         // source containers currently descended into
+        $domClonedPath = array();   // shallow clones of those containers, outermost first
+        $curFile = $xmlDoc->createDocumentFragment();
+        $files[] = $curFile;
+        $curParent = $curFile;
+        $curSize = 0;
+        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
+        $headLen = ($headNode === null ? 0 : strlen($xmlDoc->saveXML($headNode))) + $headerLength;
+        $partSize = $this->splitDefaultSize - $headLen;
+        if ($bodyLen > $partSize) {
+            // Spread the body evenly over the number of parts needed.
+            $parts = ceil($bodyLen / $partSize);
+            $partSize = ($bodyLen / $parts)  - $headLen;
+        }
+        $node = $bodyNode->firstChild;
+        do {
+            $nodeData = $xmlDoc->saveXML($node);
+            $nodeLen = strlen($nodeData);
+            if ($nodeLen > $partSize && $node->hasChildNodes()) {
+                // Too large to place as a whole: continue with its children instead.
+                $domPath[] = $node;
+                $domClonedPath[] = $node->cloneNode(false);
+                $domDepth++;
+                $node = $node->firstChild;
+            }
+            $node2 = $node->nextSibling;
+            if ($node != null && $node->nodeName != "#text") {
+                $doSplit = false;
+                if ($splitOnSearchString) {
+                    $doSplit = preg_match($searchString, $nodeData) == 1;
+                }
+                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
+                    $curFile = $xmlDoc->createDocumentFragment();
+                    $files[] = $curFile;
+                    $curParent = $curFile;
+                    // Rebuild the chain of open containers inside the new part.
+                    foreach ($domClonedPath as $ancestorClone) {
+                        $wrapper = $ancestorClone->cloneNode(false);
+                        $curParent->appendChild($wrapper);
+                        $curParent = $wrapper;
+                    }
+                    $curSize = strlen($xmlDoc->saveXML($curFile));
+                }
+                if ($doSplit) {
+                    // Record the name under the index of the part this node starts, so names
+                    // stay aligned with parts even when content precedes the first match.
+                    $chapterNames[count($files) - 1] = trim($nodeData);
+                }
+                $curParent->appendChild($node->cloneNode(true));
+                $curSize += $nodeLen;
+            }
+            $node = $node2;
+            while ($node === null && $domDepth > 0) {
+                // Ran out of siblings: climb back out of the container.
+                $domDepth--;
+                $node = array_pop($domPath)->nextSibling;
+                array_pop($domClonedPath);
+                // When no part was started while inside the container, $curParent is still
+                // the fragment itself, which has no parent; stay there instead of becoming null.
+                if ($curParent->parentNode !== null) {
+                    $curParent = $curParent->parentNode;
+                }
+            }
+        } while ($node !== null);
+        // xmlEncoding may be null; the constructor expects a string.
+        $encoding = (string)$xmlDoc->xmlEncoding;
+        $xml = new DOMDocument('1.0', $encoding);
+        $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
+        $xml->preserveWhiteSpace = false;
+        $xml->formatOutput = true;
+        foreach ($files as $idx => $fragment) {
+            $xml2Doc = new DOMDocument('1.0', $encoding);
+            $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
+            $xml2Doc->loadXML($newXML);
+            $html = $xml2Doc->getElementsByTagName("html")->item(0);
+            if ($headNode !== null) {
+                $html->appendChild($xml2Doc->importNode($headNode, true));
+            }
+            $partBody = $xml2Doc->createElement("body");
+            $html->appendChild($partBody);
+            $partBody->appendChild($xml2Doc->importNode($fragment, true));
+            // force pretty printing and correct formatting, should not be needed, but it is.
+            $xml->loadXML($xml2Doc->saveXML());
+            $doc = $xml->saveXML();
+            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
+                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
+            }
+            $key = ($splitOnSearchString && isset($chapterNames[$idx])) ? $chapterNames[$idx] : $idx;
+            $chapterData[$key] = $doc;
+        }
+        return $chapterData;
+    }
+}
+?>
author	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-05-29 18:54:06 +0200
committer	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-05-29 18:54:06 +0200
commit	a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch)
tree	80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
parent	96834a47b09985e1c82b82857fc108f20e8b8f2b (diff)
parent	8038b38802769031e050c753fc0a388a2276629e (diff)
download	wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip

diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php new file mode 100644 index 00000000..1d44f238 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
@@ -0,0 +1,201 @@
	1	<?php
	2	/**
	3	* Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
	4	* The splitter uses the DOM to try to retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
	5	* Split size is considered max target size. The actual size is the result of an even split across the resulting files.
	6	*
	7	* @author A. Grandt <php@grandt.com>
	8	* @copyright 2009-2014 A. Grandt
	9	* @license GNU LGPL 2.1
	10	* @link http://www.phpclasses.org/package/6115
	11	* @link https://github.com/Grandt/PHPePub
	12	* @version 3.20
	13	*/
	14	class EPubChapterSplitter {
	15	const VERSION = 3.20;
	16
	17	private $splitDefaultSize = 250000;
	18	private $bookVersion = EPub::BOOK_VERSION_EPUB2;
	19
	20	/**
	21	*
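	22	* Set the EPub version the chapters are split for.
	23	*
	24	* @param string $bookVersion EPub version identifier; non-string values fall back to EPub::BOOK_VERSION_EPUB2.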
	25	*/
	26	function setVersion($bookVersion) {
	27	$this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
	28	}
	29
	30	/**
	31	* Set default chapter target size.
	32	* Default is 250000 bytes, and minimum is 10240 bytes.
	33	*
	34	* @param $size segment size in bytes
	35	* @return void
	36	*/
	37	function setSplitSize($size) {
	38	$this->splitDefaultSize = (int)$size;
	39	if ($size < 10240) {
	40	$this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea.
	41	}
	42	}
	43
	44	/**
	45	* Get the chapter target size.
	46	*
	47	* @return $size
	48	*/
	49	function getSplitSize() {
	50	return $this->splitDefaultSize;
	51	}
	52
	53	/**
	54	* Split $chapter into multiple parts.
	55	*
	56	* The search string can either be a regular string or a PCRE (Perl Compatible Regular Expression) pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
	57	* If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
	58	*
	59	* @param String $chapter XHTML file
	60	* @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
	61	* @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
	62	*
	63	* @return array with 1 or more parts
	64	*/
	65	function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
	66	$chapterData = array();
	67	$isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
	68	if ($splitOnSearchString && !$isSearchRegexp) {
	69	$searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
	70	}
	71
	72	if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
	73	return array($chapter);
	74	}
	75
	76	$xmlDoc = new DOMDocument();
	77	@$xmlDoc->loadHTML($chapter);
	78
	79	$head = $xmlDoc->getElementsByTagName("head");
	80	$body = $xmlDoc->getElementsByTagName("body");
	81
	82	$htmlPos = stripos($chapter, "<html");
	83	$htmlEndPos = stripos($chapter, ">", $htmlPos);
	84	$newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
	85	if (strpos(trim($newXML), "<?xml ") === FALSE) {
	86	$newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
	87	}
	88	$headerLength = strlen($newXML);
	89
	90	$files = array();
	91	$chapterNames = array();
	92	$domDepth = 0;
	93	$domPath = array();
	94	$domClonedPath = array();
	95
	96	$curFile = $xmlDoc->createDocumentFragment();
	97	$files[] = $curFile;
	98	$curParent = $curFile;
	99	$curSize = 0;
	100
	101	$bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
	102	$headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;
	103
	104	$partSize = $this->splitDefaultSize - $headLen;
	105
	106	if ($bodyLen > $partSize) {
	107	$parts = ceil($bodyLen / $partSize);
	108	$partSize = ($bodyLen / $parts) - $headLen;
	109	}
	110
	111	$node = $body->item(0)->firstChild;
	112
	113	do {
	114	$nodeData = $xmlDoc->saveXML($node);
	115	$nodeLen = strlen($nodeData);
	116
	117	if ($nodeLen > $partSize && $node->hasChildNodes()) {
	118	$domPath[] = $node;
	119	$domClonedPath[] = $node->cloneNode(false);
	120	$domDepth++;
	121
	122	$node = $node->firstChild;
	123	}
	124
	125	$node2 = $node->nextSibling;
	126
	127	if ($node != null && $node->nodeName != "#text") {
	128	$doSplit = false;
	129	if ($splitOnSearchString) {
	130	$doSplit = preg_match($searchString, $nodeData) == 1;
	131	if ($doSplit) {
	132	$chapterNames[] = trim($nodeData);
	133	}
	134	}
	135
	136	if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
	137	$curFile = $xmlDoc->createDocumentFragment();
	138	$files[] = $curFile;
	139	$curParent = $curFile;
	140	if ($domDepth > 0) {
	141	reset($domPath);
	142	reset($domClonedPath);
	143	$oneDomClonedPath = each($domClonedPath);
	144	while ($oneDomClonedPath) {
	145	list($k, $v) = $oneDomClonedPath;
	146	$newParent = $v->cloneNode(false);
	147	$curParent->appendChild($newParent);
	148	$curParent = $newParent;
	149	$oneDomClonedPath = each($domClonedPath);
	150	}
	151	}
	152	$curSize = strlen($xmlDoc->saveXML($curFile));
	153	}
	154	$curParent->appendChild($node->cloneNode(true));
	155	$curSize += $nodeLen;
	156	}
	157
	158	$node = $node2;
	159	while ($node == null && $domDepth > 0) {
	160	$domDepth--;
	161	$node = end($domPath)->nextSibling;
	162	array_pop($domPath);
	163	array_pop($domClonedPath);
	164	$curParent = $curParent->parentNode;
	165	}
	166	} while ($node != null);
	167
	168	$curFile = null;
	169	$curSize = 0;
	170
	171	$xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
	172	$xml->lookupPrefix("http://www.w3.org/1999/xhtml");
	173	$xml->preserveWhiteSpace = false;
	174	$xml->formatOutput = true;
	175
	176	for ($idx = 0; $idx < count($files); $idx++) {
	177	$xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
	178	$xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
	179	$xml2Doc->loadXML($newXML);
	180	$html = $xml2Doc->getElementsByTagName("html")->item(0);
	181	$html->appendChild($xml2Doc->importNode($head->item(0), true));
	182	$body = $xml2Doc->createElement("body");
	183	$html->appendChild($body);
	184	$body->appendChild($xml2Doc->importNode($files[$idx], true));
	185
	186	// force pretty printing and correct formatting, should not be needed, but it is.
	187	$xml->loadXML($xml2Doc->saveXML());
	188
	189	$doc = $xml->saveXML();
	190
	191	if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
	192	$doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
	193	}
	194
	195	$chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc;
	196	}
	197
	198	return $chapterData;
	199	}
	200	}
	201	?>