]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
3 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
4 * The splitter uses the DOM to try to retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
5 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
7 * @author A. Grandt <php@grandt.com>
8 * @copyright 2009-2014 A. Grandt
9 * @license GNU LGPL 2.1
10 * @link http://www.phpclasses.org/package/6115
11 * @link https://github.com/Grandt/PHPePub
14 class EPubChapterSplitter
{
17 private $splitDefaultSize = 250000 ;
18 private $bookVersion = EPub
:: BOOK_VERSION_EPUB2
;
/**
 * Set the book version used by this splitter.
 *
 * A string argument is stored after trimming surrounding whitespace; any
 * non-string argument falls back to EPub::BOOK_VERSION_EPUB2.
 *
 * @param mixed $bookVersion Requested book version; only strings are honoured.
 */
function setVersion($bookVersion) {
    // Anything that is not a string cannot name a version: use the EPUB2 default.
    if (!is_string($bookVersion)) {
        $this->bookVersion = EPub::BOOK_VERSION_EPUB2;
        return;
    }

    $this->bookVersion = trim($bookVersion);
}
/**
 * Set default chapter target size.
 *
 * Default is 250000 bytes, and minimum is 10240 bytes. A requested size
 * below the minimum is raised to 10240 rather than stored as given.
 *
 * @param int $size segment size in bytes; cast to int before use
 */
function setSplitSize($size) {
    $requestedSize = (int) $size;

    // Making the file smaller than 10k is not a good idea, so clamp to the
    // minimum instead of unconditionally overwriting the requested size.
    $this->splitDefaultSize = max($requestedSize, 10240);
}
/**
 * Report the chapter target size currently configured on this splitter.
 *
 * @return int target size in bytes (250000 unless changed via setSplitSize)
 */
function getSplitSize() {
    $targetSize = $this->splitDefaultSize;

    return $targetSize;
}
54 * Split $chapter into multiple parts.
56 * The search string can either be a regular string or a PHP PCRE regular expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
57 * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
59 * @param String $chapter XHTML file
60 * @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
61 * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
63 * @return array with 1 or more parts
65 function splitChapter ( $chapter , $splitOnSearchString = false , $searchString = '/^Chapter \\ /i' ) {
66 $chapterData = array ();
67 $isSearchRegexp = $splitOnSearchString && ( preg_match ( '#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m' , $searchString ) == 1 );
68 if ( $splitOnSearchString && ! $isSearchRegexp ) {
69 $searchString = '#^<.+?>' . preg_quote ( $searchString , '#' ) . "#" ;
72 if (! $splitOnSearchString && strlen ( $chapter ) <= $this- > splitDefaultSize
) {
73 return array ( $chapter );
76 $xmlDoc = new DOMDocument ();
77 @ $xmlDoc- > loadHTML ( $chapter );
79 $head = $xmlDoc- > getElementsByTagName ( "head" );
80 $body = $xmlDoc- > getElementsByTagName ( "body" );
82 $htmlPos = stripos ( $chapter , "<html" );
83 $htmlEndPos = stripos ( $chapter , ">" , $htmlPos );
84 $newXML = substr ( $chapter , 0 , $htmlEndPos +
1 ) . " \n </html>" ;
85 if ( strpos ( trim ( $newXML ), "<?xml " ) === FALSE ) {
86 $newXML = "<?xml version= \" 1.0 \" encoding= \" utf-8 \" ?> \n " . $newXML ;
88 $headerLength = strlen ( $newXML );
91 $chapterNames = array ();
94 $domClonedPath = array ();
96 $curFile = $xmlDoc- > createDocumentFragment ();
98 $curParent = $curFile ;
101 $bodyLen = strlen ( $xmlDoc- > saveXML ( $body- > item ( 0 )));
102 $headLen = strlen ( $xmlDoc- > saveXML ( $head- > item ( 0 ))) +
$headerLength ;
104 $partSize = $this- > splitDefaultSize
- $headLen ;
106 if ( $bodyLen > $partSize ) {
107 $parts = ceil ( $bodyLen / $partSize );
108 $partSize = ( $bodyLen / $parts ) - $headLen ;
111 $node = $body- > item ( 0 )-> firstChild
;
114 $nodeData = $xmlDoc- > saveXML ( $node );
115 $nodeLen = strlen ( $nodeData );
117 if ( $nodeLen > $partSize && $node- > hasChildNodes ()) {
119 $domClonedPath [] = $node- > cloneNode ( false );
122 $node = $node- > firstChild
;
125 $node2 = $node- > nextSibling
;
127 if ( $node != null && $node- > nodeName
!= "#text" ) {
129 if ( $splitOnSearchString ) {
130 $doSplit = preg_match ( $searchString , $nodeData ) == 1 ;
132 $chapterNames [] = trim ( $nodeData );
136 if ( $curSize > 0 && ( $doSplit || (! $splitOnSearchString && $curSize +
$nodeLen > $partSize ))) {
137 $curFile = $xmlDoc- > createDocumentFragment ();
139 $curParent = $curFile ;
142 reset ( $domClonedPath );
143 $oneDomClonedPath = each ( $domClonedPath );
144 while ( $oneDomClonedPath ) {
145 list ( $k , $v ) = $oneDomClonedPath ;
146 $newParent = $v- > cloneNode ( false );
147 $curParent- > appendChild ( $newParent );
148 $curParent = $newParent ;
149 $oneDomClonedPath = each ( $domClonedPath );
152 $curSize = strlen ( $xmlDoc- > saveXML ( $curFile ));
154 $curParent- > appendChild ( $node- > cloneNode ( true ));
155 $curSize +
= $nodeLen ;
159 while ( $node == null && $domDepth > 0 ) {
161 $node = end ( $domPath )-> nextSibling
;
163 array_pop ( $domClonedPath );
164 $curParent = $curParent- > parentNode
;
166 } while ( $node != null );
171 $xml = new DOMDocument ( '1.0' , $xmlDoc- > xmlEncoding
);
172 $xml- > lookupPrefix ( "http://www.w3.org/1999/xhtml" );
173 $xml- > preserveWhiteSpace
= false ;
174 $xml- > formatOutput
= true ;
176 for ( $idx = 0 ; $idx < count ( $files ); $idx ++
) {
177 $xml2Doc = new DOMDocument ( '1.0' , $xmlDoc- > xmlEncoding
);
178 $xml2Doc- > lookupPrefix ( "http://www.w3.org/1999/xhtml" );
179 $xml2Doc- > loadXML ( $newXML );
180 $html = $xml2Doc- > getElementsByTagName ( "html" )-> item ( 0 );
181 $html- > appendChild ( $xml2Doc- > importNode ( $head- > item ( 0 ), true ));
182 $body = $xml2Doc- > createElement ( "body" );
183 $html- > appendChild ( $body );
184 $body- > appendChild ( $xml2Doc- > importNode ( $files [ $idx ], true ));
186 // force pretty printing and correct formatting, should not be needed, but it is.
187 $xml- > loadXML ( $xml2Doc- > saveXML ());
189 $doc = $xml- > saveXML ();
191 if ( $this- > bookVersion
=== EPub
:: BOOK_VERSION_EPUB3
) {
192 $doc = preg_replace ( '#^\s*<!DOCTYPE\ .+?>\s*#im' , '' , $doc );
195 $chapterData [ $splitOnSearchString ? $chapterNames [ $idx ] : $idx ] = $doc ;