inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php

   1 <?php
   2 /**
   3  * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
   4  * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
   5  * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
   6  *
   7  * @author A. Grandt <php@grandt.com>
   8  * @copyright 2009-2014 A. Grandt
   9  * @license GNU LGPL 2.1
  10  * @link http://www.phpclasses.org/package/6115
  11  * @link https://github.com/Grandt/PHPePub
  12  * @version 3.20
  13  */
  14 class EPubChapterSplitter {
  15     const VERSION = 3.20;
  16
  17     private $splitDefaultSize = 250000;
  18     private $bookVersion = EPub::BOOK_VERSION_EPUB2;
  19
  20     /**
  21      *
  22      * Enter description here ...
  23      *
  24      * @param unknown_type $ident
  25      */
  26     function setVersion($bookVersion) {
  27         $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
  28     }
  29
  30         /**
  31      * Set default chapter target size.
  32      * Default is 250000 bytes, and minimum is 10240 bytes.
  33      *
  34      * @param $size segment size in bytes
  35      * @return void
  36      */
  37     function setSplitSize($size) {
  38         $this->splitDefaultSize = (int)$size;
  39         if ($size < 10240) {
  40             $this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea.
  41         }
  42     }
  43
  44     /**
  45      * Get the chapter target size.
  46      *
  47      * @return $size
  48      */
  49     function getSplitSize() {
  50         return $this->splitDefaultSize;
  51     }
  52
  53     /**
  54      * Split $chapter into multiple parts.
  55      *
  56      * The search string can either be a regular string or a PHP PECL Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
  57      * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
  58      *
  59      * @param String $chapter XHTML file
  60      * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
  61      * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
  62      *
  63      * @return array with 1 or more parts
  64      */
  65     function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
  66         $chapterData = array();
  67         $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
  68         if ($splitOnSearchString && !$isSearchRegexp) {
  69             $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
  70         }
  71
  72         if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
  73             return array($chapter);
  74         }
  75
  76         $xmlDoc = new DOMDocument();
  77         @$xmlDoc->loadHTML($chapter);
  78
  79         $head = $xmlDoc->getElementsByTagName("head");
  80         $body = $xmlDoc->getElementsByTagName("body");
  81
  82         $htmlPos = stripos($chapter, "<html");
  83         $htmlEndPos = stripos($chapter, ">", $htmlPos);
  84         $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
  85         if (strpos(trim($newXML), "<?xml ") === FALSE) {
  86             $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
  87         }
  88         $headerLength = strlen($newXML);
  89
  90         $files = array();
  91         $chapterNames = array();
  92         $domDepth = 0;
  93         $domPath = array();
  94         $domClonedPath = array();
  95
  96         $curFile = $xmlDoc->createDocumentFragment();
  97         $files[] = $curFile;
  98         $curParent = $curFile;
  99         $curSize = 0;
 100
 101         $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
 102         $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;
 103
 104         $partSize = $this->splitDefaultSize - $headLen;
 105
 106         if ($bodyLen > $partSize) {
 107             $parts = ceil($bodyLen / $partSize);
 108             $partSize = ($bodyLen / $parts)  - $headLen;
 109         }
 110
 111         $node = $body->item(0)->firstChild;
 112
 113         do {
 114             $nodeData = $xmlDoc->saveXML($node);
 115             $nodeLen = strlen($nodeData);
 116
 117             if ($nodeLen > $partSize && $node->hasChildNodes()) {
 118                 $domPath[] = $node;
 119                 $domClonedPath[] = $node->cloneNode(false);
 120                 $domDepth++;
 121
 122                 $node = $node->firstChild;
 123             }
 124
 125             $node2 = $node->nextSibling;
 126
 127             if ($node != null && $node->nodeName != "#text") {
 128                 $doSplit = false;
 129                 if ($splitOnSearchString) {
 130                     $doSplit = preg_match($searchString, $nodeData) == 1;
 131                     if ($doSplit) {
 132                         $chapterNames[] = trim($nodeData);
 133                     }
 134                 }
 135
 136                 if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
 137                     $curFile = $xmlDoc->createDocumentFragment();
 138                     $files[] = $curFile;
 139                     $curParent = $curFile;
 140                     if ($domDepth > 0) {
 141                         reset($domPath);
 142                         reset($domClonedPath);
 143                         $oneDomClonedPath = each($domClonedPath);
 144                         while ($oneDomClonedPath) {
 145                             list($k, $v) = $oneDomClonedPath;
 146                             $newParent = $v->cloneNode(false);
 147                             $curParent->appendChild($newParent);
 148                             $curParent = $newParent;
 149                             $oneDomClonedPath = each($domClonedPath);
 150                         }
 151                     }
 152                     $curSize = strlen($xmlDoc->saveXML($curFile));
 153                 }
 154                 $curParent->appendChild($node->cloneNode(true));
 155                 $curSize += $nodeLen;
 156             }
 157
 158             $node = $node2;
 159             while ($node == null && $domDepth > 0) {
 160                 $domDepth--;
 161                 $node = end($domPath)->nextSibling;
 162                 array_pop($domPath);
 163                 array_pop($domClonedPath);
 164                 $curParent = $curParent->parentNode;
 165             }
 166         } while ($node != null);
 167
 168         $curFile = null;
 169         $curSize = 0;
 170
 171         $xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
 172         $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
 173         $xml->preserveWhiteSpace = false;
 174         $xml->formatOutput = true;
 175
 176         for ($idx = 0; $idx < count($files); $idx++) {
 177             $xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
 178             $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
 179             $xml2Doc->loadXML($newXML);
 180             $html = $xml2Doc->getElementsByTagName("html")->item(0);
 181             $html->appendChild($xml2Doc->importNode($head->item(0), true));
 182             $body = $xml2Doc->createElement("body");
 183             $html->appendChild($body);
 184             $body->appendChild($xml2Doc->importNode($files[$idx], true));
 185
 186             // force pretty printing and correct formatting, should not be needed, but it is.
 187             $xml->loadXML($xml2Doc->saveXML());
 188
 189                         $doc = $xml->saveXML();
 190
 191                         if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
 192                                 $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
 193                         }
 194
 195             $chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc;
 196         }
 197
 198         return $chapterData;
 199     }
 200 }
 201 ?>