diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
commit | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch) | |
tree | 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php | |
parent | 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff) | |
parent | 8038b38802769031e050c753fc0a388a2276629e (diff) | |
download | wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip |
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php')
-rw-r--r-- | inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php new file mode 100644 index 00000000..1d44f238 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php | |||
@@ -0,0 +1,201 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts. | ||
4 | * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts. | ||
5 | * Split size is considered max target size. The actual size is the result of an even split across the resulting files. | ||
6 | * | ||
7 | * @author A. Grandt <php@grandt.com> | ||
8 | * @copyright 2009-2014 A. Grandt | ||
9 | * @license GNU LGPL 2.1 | ||
10 | * @link http://www.phpclasses.org/package/6115 | ||
11 | * @link https://github.com/Grandt/PHPePub | ||
12 | * @version 3.20 | ||
13 | */ | ||
class EPubChapterSplitter {
    const VERSION = 3.20;

    /** Smallest split size, in bytes, that setSplitSize() will accept. */
    const MIN_SPLIT_SIZE = 10240;

    private $splitDefaultSize = 250000;
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Select the EPub version the generated parts are written for.
     *
     * A string value is trimmed and stored as given; any non-string value
     * resets the version to EPub::BOOK_VERSION_EPUB2. When the stored version
     * equals EPub::BOOK_VERSION_EPUB3, splitChapter() strips the DOCTYPE
     * declaration from every part it produces.
     *
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* values
     * @return void
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * The value is cast to an integer first and the minimum is applied to the
     * cast result, so non-numeric input can never produce a size below the
     * minimum.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    public function setSplitSize($size) {
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(self::MIN_SPLIT_SIZE, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int target size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as
     * defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, a node matches when its
     * serialized markup starts with one tag immediately followed by that string.
     *
     * When splitting by size, a chapter that already fits within the split size
     * is returned untouched as a single-element array. The same happens when
     * the chapter is empty or the parsed document has no body content.
     *
     * Keys of the returned array: the zero based part index when splitting by
     * size. When splitting on a search string, a part that starts at a matching
     * node is keyed by the trimmed markup of that node; a part without a
     * matching first node (content before the first match) is keyed by its
     * index. Parts whose matched markup is identical share a key, so the later
     * part replaces the earlier one.
     *
     * Text nodes encountered during the walk are not copied into the parts.
     *
     * @param String $chapter             XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString        Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        // A search string is treated as a ready-made pattern when it looks like
        // "<delimiter>...<same delimiter><modifiers>"; otherwise it is quoted
        // and anchored behind a single leading tag.
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if ($chapter === '' || (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize)) {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        // Parser warnings about imperfect markup are deliberately silenced; the
        // parse is best effort.
        @$xmlDoc->loadHTML($chapter);

        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
        if ($bodyNode === null || $bodyNode->firstChild === null) {
            // Nothing to walk: hand the chapter back rather than dereferencing null.
            return array($chapter);
        }

        // Skeleton for every part: everything up to and including the opening
        // <html ...> tag of the source, closed again, with an XML declaration.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = stripos($chapter, ">", $htmlPos);
        $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === FALSE) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
        // saveXML(null) would serialize the whole document, so a missing head
        // has to count as zero bytes.
        $headLen = ($headNode === null ? 0 : strlen($xmlDoc->saveXML($headNode))) + $headerLength;

        // Every part repeats the skeleton and the head, so only the remainder of
        // the budget is available for body content.
        $partSize = $this->splitDefaultSize - $headLen;
        if ($partSize > 0 && $bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed. Skipped
            // when the head already uses the whole budget, which would
            // otherwise divide by zero.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $files = array();          // one DOMDocumentFragment per part
        $partNames = array();      // part index => markup of the node that started it
        $domDepth = 0;
        $domPath = array();        // containers entered, outermost first
        $domClonedPath = array();  // shallow clones of those containers

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $node = $bodyNode->firstChild;

        do {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too big for one part: step into the container.
                // NOTE: $nodeData and $nodeLen keep describing the container,
                // not its first child; left as is so split points do not move.
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $nextNode = $node->nextSibling;

            if ($node->nodeName != "#text") {
                $matchedName = null;
                if ($splitOnSearchString && preg_match($searchString, $nodeData) == 1) {
                    $matchedName = trim($nodeData);
                }
                $doSplit = $matchedName !== null;

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    // Rebuild the chain of open containers inside the new part.
                    foreach ($domClonedPath as $ancestor) {
                        $newParent = $ancestor->cloneNode(false);
                        $curParent->appendChild($newParent);
                        $curParent = $newParent;
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // Bind the name to the part that actually receives the node.
                    $partNames[count($files) - 1] = $matchedName;
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $nextNode;
            while ($node == null && $domDepth > 0) {
                // Out of siblings: climb back out of the current container.
                $domDepth--;
                $node = array_pop($domPath)->nextSibling;
                array_pop($domClonedPath);
                if ($curParent->parentNode !== null) {
                    // The fragment itself has no parent; stay on it instead of
                    // ending up with a null insertion point.
                    $curParent = $curParent->parentNode;
                }
            }
        } while ($node != null);

        $encoding = (string)$xmlDoc->xmlEncoding;
        $chapterData = array();
        foreach ($files as $idx => $fragment) {
            $key = isset($partNames[$idx]) ? $partNames[$idx] : $idx;
            $chapterData[$key] = $this->buildPartDocument($newXML, $headNode, $fragment, $encoding);
        }

        return $chapterData;
    }

    /**
     * Wrap one body fragment into a complete, pretty printed document.
     *
     * @param string           $newXML   document skeleton: XML declaration plus an empty html element
     * @param DOMNode|null     $headNode head element to copy into the part, if the source had one
     * @param DOMNode          $fragment body content of the part
     * @param string           $encoding encoding handed to the DOMDocument constructors
     *
     * @return string serialized part
     */
    private function buildPartDocument($newXML, $headNode, $fragment, $encoding) {
        $partDoc = new DOMDocument('1.0', $encoding);
        $partDoc->loadXML($newXML);

        $html = $partDoc->getElementsByTagName("html")->item(0);
        if ($headNode !== null) {
            $html->appendChild($partDoc->importNode($headNode, true));
        }
        $body = $partDoc->createElement("body");
        $html->appendChild($body);
        if ($fragment->hasChildNodes()) {
            $body->appendChild($partDoc->importNode($fragment, true));
        }

        // force pretty printing and correct formatting, should not be needed, but it is.
        $formatter = new DOMDocument('1.0', $encoding);
        $formatter->preserveWhiteSpace = false;
        $formatter->formatOutput = true;
        $formatter->loadXML($partDoc->saveXML());

        $doc = $formatter->saveXML();

        if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
            $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
        }

        return $doc;
    }
}
201 | ?> | ||