]>
Commit | Line | Data |
---|---|---|
87090d8a | 1 | <?php |
/**
 * Split an HTML file into smaller HTML files, retaining the formatting and structure of the individual parts.
 * The splitter walks the DOM of the chapter body and, whenever a new part is started inside nested
 * elements, re-creates (shallow clones of) the enclosing elements in the new part.
 * Split size is considered the max target size. The actual size is the result of an even split across the resulting files.
 *
 * @author A. Grandt <php@grandt.com>
 * @copyright 2009-2014 A. Grandt
 * @license GNU LGPL 2.1
 * @link http://www.phpclasses.org/package/6115
 * @link https://github.com/Grandt/PHPePub
 * @version 3.20
 */
class EPubChapterSplitter {
    const VERSION = 3.20;

    /** Smallest accepted target size in bytes; smaller requests are raised to this value. */
    const MIN_SPLIT_SIZE = 10240;

    private $splitDefaultSize = 250000;
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the EPub version the generated parts are intended for.
     *
     * When the version equals EPub::BOOK_VERSION_EPUB3, splitChapter() strips the DOCTYPE
     * declaration from every generated part.
     *
     * @param string $bookVersion Version string; it is trimmed. Any non-string value selects EPub::BOOK_VERSION_EPUB2.
     * @return void
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * @param int $size segment size in bytes; values below the minimum (including non-numeric input) are raised to the minimum.
     * @return void
     */
    public function setSplitSize($size) {
        // Clamp the *converted* value: comparing the raw argument lets a non-numeric
        // string slip past the check on PHP 8 and end up as a size of 0.
        $this->splitDefaultSize = max(self::MIN_SPLIT_SIZE, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int target size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, a node matches when its markup starts with a tag that is followed by the string given.
     * The pattern is tested against the serialized markup of each non-text node.
     *
     * @param String $chapter XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries. Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for; can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Without search splitting the keys are 0..n-1. With search splitting
     *               each part is keyed by the trimmed markup of the node that started it; a part that was not
     *               started by a match (e.g. content before the first match) is keyed by its numeric index.
     *               NOTE: two matching nodes with identical markup produce the same key, so the later part
     *               replaces the earlier one in the returned array.
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        // Nothing to parse; DOMDocument::loadHTML() rejects an empty string on PHP 8.
        if (trim((string)$chapter) === '') {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        // Chapter markup is frequently not valid HTML 4; parser warnings are deliberately silenced.
        @$xmlDoc->loadHTML($chapter);

        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
        if ($bodyNode === null || !$bodyNode->hasChildNodes()) {
            // No body content to distribute; hand the chapter back untouched.
            return array($chapter);
        }

        $newXML = $this->buildSkeleton($chapter);
        $headerLength = strlen($newXML);

        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
        $headLen = $headerLength + ($headNode === null ? 0 : strlen($xmlDoc->saveXML($headNode)));

        // Every part repeats the prologue and the head, so only the remainder is available for body content.
        $partSize = $this->splitDefaultSize - $headLen;
        if ($partSize > 0 && $bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed instead of filling parts to the brim.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $files = array();    // one DOMDocumentFragment per part
        $names = array();    // per part: markup of the node that started it (search mode), or null
        $openPath = array(); // source elements the walk is currently inside of, outermost first

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $names[] = null;
        $curParent = $curFile; // insertion point inside the current part
        $curSize = 0;
        $hasContent = false;   // true once real content (not just wrappers) was added to the current part

        $node = $bodyNode->firstChild;
        while ($node !== null) {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too large for one part: open a shallow copy of the element in the current part
                // and distribute its children instead. The child is measured on the next pass.
                $wrapper = $node->cloneNode(false);
                $curParent->appendChild($wrapper);
                $curParent = $wrapper;
                $curSize += strlen($xmlDoc->saveXML($wrapper));
                $openPath[] = $node;

                $node = $node->firstChild;
                continue;
            }

            $isText = $node->nodeType === XML_TEXT_NODE;
            // Whitespace-only text between elements is dropped; text with content is kept.
            if (!$isText || trim($node->nodeValue) !== '') {
                // Text nodes are never chapter starts; only other nodes are matched.
                $startsChapter = $splitOnSearchString && !$isText && preg_match($searchString, $nodeData) == 1;
                $overflows = !$splitOnSearchString && $curSize + $nodeLen > $partSize;

                if ($hasContent && ($startsChapter || $overflows)) {
                    // Remove wrappers that were opened in the finished part but never received content.
                    while (!$curParent->isSameNode($curFile) && !$curParent->hasChildNodes()) {
                        $emptyWrapper = $curParent;
                        $curParent = $curParent->parentNode;
                        $curParent->removeChild($emptyWrapper);
                    }

                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $names[] = null;
                    $curParent = $curFile;
                    $curSize = 0;
                    $hasContent = false;

                    // Rebuild the chain of enclosing elements inside the new part.
                    foreach ($openPath as $ancestor) {
                        $wrapper = $ancestor->cloneNode(false);
                        $curParent->appendChild($wrapper);
                        $curParent = $wrapper;
                        $curSize += strlen($xmlDoc->saveXML($wrapper));
                    }
                }

                if ($startsChapter) {
                    // The current part is either brand new or still empty, so this node names it.
                    $names[count($names) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
                $hasContent = true;
            }

            $node = $node->nextSibling;
            // End of a container: continue after it, and close its wrapper in the current part.
            while ($node === null && count($openPath) > 0) {
                $closed = array_pop($openPath);
                $node = $closed->nextSibling;
                $curParent = $curParent->parentNode;
            }
        }

        // xmlEncoding may be null for HTML input; the constructor expects a string.
        $encoding = (string)$xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        $chapterData = array();
        foreach ($files as $idx => $fragment) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            if ($headNode !== null) {
                $html->appendChild($xml2Doc->importNode($headNode, true));
            }
            $body = $xml2Doc->createElement("body");
            $html->appendChild($body);
            if ($fragment->hasChildNodes()) {
                $body->appendChild($xml2Doc->importNode($fragment, true));
            }

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            $key = ($splitOnSearchString && $names[$idx] !== null) ? $names[$idx] : $idx;
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }

    /**
     * Build the empty document every part is based on: everything in $chapter up to and
     * including the opening <html ...> tag, followed by a closing </html>, with an XML
     * declaration prepended when the chapter does not carry one.
     *
     * @param String $chapter XHTML file
     * @return String empty XML document used as the shell of each part
     */
    private function buildSkeleton($chapter) {
        // Used when the chapter has no <html> tag to copy the prologue from.
        $skeleton = "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n</html>";

        $htmlPos = stripos($chapter, "<html");
        if ($htmlPos !== false) {
            $htmlEndPos = strpos($chapter, ">", $htmlPos);
            if ($htmlEndPos !== false) {
                $skeleton = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
            }
        }

        if (strpos(trim($skeleton), "<?xml ") === false) {
            $skeleton = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $skeleton;
        }

        return $skeleton;
    }
}
201 | ?> |