]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
phinx for database migration
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / PHPePub / EPubChapterSplitter.php
CommitLineData
87090d8a 1<?php
2/**
3 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
4 * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
5 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
6 *
7 * @author A. Grandt <php@grandt.com>
8 * @copyright 2009-2014 A. Grandt
9 * @license GNU LGPL 2.1
10 * @link http://www.phpclasses.org/package/6115
11 * @link https://github.com/Grandt/PHPePub
12 * @version 3.20
13 */
class EPubChapterSplitter {
    const VERSION = 3.20;

    /** Target size, in bytes, of one split part (see setSplitSize()). */
    private $splitDefaultSize = 250000;

    /** Book version the parts are generated for; decides whether the DOCTYPE is stripped. */
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the EPub version the generated parts are produced for.
     *
     * A string value is trimmed and stored; any non-string value resets the
     * version to EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* values
     * @return void
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * The minimum is enforced on the integer value that is actually stored, so
     * input such as '12abc' (which casts to 12) cannot slip below the limit.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    public function setSplitSize($size) {
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(10240, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int target size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, the matching will be for nodes whose markup starts with a tag followed by the string given.
     *
     * When not splitting on a search string, a chapter that already fits in the
     * target size is returned untouched as a single-element array. Otherwise the
     * body is walked node by node; only non-text nodes are copied into the parts,
     * and the open ancestor elements are re-created at the start of each new part.
     * The code locates the "<html" start tag in $chapter to build the skeleton of
     * every part, so $chapter is expected to contain one.
     *
     * @param String $chapter             XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString        Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Keys are numeric indexes, or, when splitting on a search string,
     *               the markup of the matching node (numeric index when no name was recorded for a part).
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterData = array();
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) === 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            // Plain text: match nodes whose markup is an opening tag followed by the text.
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        @$xmlDoc->loadHTML($chapter);

        $head = $xmlDoc->getElementsByTagName("head");
        $body = $xmlDoc->getElementsByTagName("body");

        // Skeleton shared by every part: the original <html ...> start tag, closed immediately.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = stripos($chapter, ">", $htmlPos);
        $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === false) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $files = array();
        $chapterNames = array();
        $domDepth = 0;
        $domPath = array();       // original ancestors currently descended into
        $domClonedPath = array(); // shallow clones of those ancestors

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
        $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;

        $partSize = $this->splitDefaultSize - $headLen;

        if ($bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $node = $body->item(0)->firstChild;

        // Pre-checked loop: an empty body yields a single empty part instead of
        // dereferencing a null node.
        while ($node !== null) {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too large for one part: descend one level and process its children.
                // NOTE: $nodeData/$nodeLen keep describing the element just entered
                // for the remainder of this iteration.
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $nextNode = $node->nextSibling;

            if ($node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) === 1;
                    if ($doSplit) {
                        $chapterNames[] = trim($nodeData);
                    }
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    // Start a new part and rebuild the chain of open ancestors inside it.
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    if ($domDepth > 0) {
                        // foreach replaces each(), which no longer exists in PHP 8.
                        foreach ($domClonedPath as $clonedAncestor) {
                            $newParent = $clonedAncestor->cloneNode(false);
                            $curParent->appendChild($newParent);
                            $curParent = $newParent;
                        }
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }
                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $nextNode;
            while ($node === null && $domDepth > 0) {
                // Children exhausted: continue with the sibling of the ancestor.
                $domDepth--;
                $node = end($domPath)->nextSibling;
                array_pop($domPath);
                array_pop($domClonedPath);
                // A fragment has no parent; stay on the current parent in that
                // case rather than ending up with a null insertion point.
                if ($curParent->parentNode !== null) {
                    $curParent = $curParent->parentNode;
                }
            }
        }

        // xmlEncoding may be null; the DOMDocument constructor expects a string.
        $encoding = (string)$xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        foreach ($files as $idx => $fragment) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            $html->appendChild($xml2Doc->importNode($head->item(0), true));
            $partBody = $xml2Doc->createElement("body");
            $html->appendChild($partBody);
            $partBody->appendChild($xml2Doc->importNode($fragment, true));

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            // Fewer names than parts may have been recorded; fall back to the
            // numeric index instead of reading an undefined offset.
            if ($splitOnSearchString && isset($chapterNames[$idx])) {
                $key = $chapterNames[$idx];
            } else {
                $key = $idx;
            }
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }
}
201?>