%PDF- %PDF-
Direktori : /www/varak.net/mail.varak.net/vendor/roundcube/rtf-html-php/src/Html/ |
Current File : //www/varak.net/mail.varak.net/vendor/roundcube/rtf-html-php/src/Html/HtmlFormatter.php |
<?php

namespace RtfHtmlPhp\Html;

use RtfHtmlPhp\Document;

/**
 * Renders a parsed RTF document tree as HTML.
 *
 * The formatter walks the groups, control words, control symbols and text
 * entries of a Document and appends markup to an internal output buffer.
 * When the document announces encapsulated HTML (\fromhtml), the original
 * markup is passed through and no <p>/<span> elements are generated.
 */
class HtmlFormatter
{
    /** @var string Output encoding ('HTML-ENTITIES' or an mbstring encoding name) */
    protected $encoding;

    /** @var int|null Font number announced by \deff, used when resetting a state */
    protected $defaultFont;

    /** @var bool TRUE once \fromhtml with a positive parameter has been seen */
    protected $fromhtml = false;

    /** @var array Map of tag name => open flag for the tags this class generates */
    protected $openedTags = [];

    /** @var string HTML produced so far */
    protected $output = '';

    /** @var State|null Copy of the state used for the currently opened span */
    protected $previousState;

    /** @var string|null Document-level source encoding (iconv name) */
    protected $rtfEncoding;

    /** @var State Current formatting state */
    protected $state;

    /** @var State[] Stack of states, one per nested group */
    protected $states = [];

    /**
     * Object constructor.
     *
     * By default, HtmlFormatter uses HTML_ENTITIES for code conversion.
     * You can optionally request a different encoding when creating
     * the HtmlFormatter instance.
     *
     * @param string $encoding Output encoding
     *
     * @throws \Exception When mbstring is missing or the encoding is unknown
     */
    public function __construct($encoding = 'HTML-ENTITIES')
    {
        if (!extension_loaded('mbstring')) {
            throw new \Exception("PHP mbstring extension not enabled");
        }

        if ($encoding != 'HTML-ENTITIES') {
            // Check if the encoding is recognized by the mbstring extension
            if (!in_array($encoding, mb_list_encodings(), true)) {
                throw new \Exception("Unsupported encoding: $encoding");
            }
        }

        $this->encoding = $encoding;
    }

    /**
     * Generates HTML output for the document
     *
     * @param Document $document The document
     *
     * @return string HTML content
     */
    public function format(Document $document)
    {
        // Clear current output
        $this->output = '';

        // Forget what a previously formatted document announced, so that a
        // reused formatter does not inherit its \fromhtml, \deff or charset
        $this->fromhtml = false;
        $this->defaultFont = null;
        $this->rtfEncoding = null;

        // Keep track of style modifications
        $this->previousState = null;

        // and create a stack of states
        $this->states = [];

        // Put an initial standard state onto the stack
        $this->state = new State();
        array_push($this->states, $this->state);

        // Keep track of opened html tags ('p' stays null until the
        // first paragraph has been created, see write())
        $this->openedTags = ['span' => false, 'p' => null];

        // Begin format
        $this->processGroup($document->root);

        // Instead of removing opened tags, we close them
        $this->output .= $this->openedTags['span'] ? '</span>' : ''; // @phpstan-ignore-line
        $this->output .= $this->openedTags['p'] ? '</p>' : ''; // @phpstan-ignore-line

        // Remove extra empty paragraph at the end
        // TODO: Find the real reason it's there and fix it
        $this->output = preg_replace('|<p></p>$|', '', $this->output);

        return $this->output;
    }

    /**
     * Registers a font definition.
     *
     * @param \RtfHtmlPhp\Group $fontGroup A group element with a font definition
     *
     * @return void
     */
    protected function loadFont(\RtfHtmlPhp\Group $fontGroup)
    {
        $fontNumber = 0;
        $font = new Font();

        // Loop through children of the font group. The font group
        // contains control words with the font number and charset,
        // and a control text with the font name.
        foreach ($fontGroup->children as $child) {
            if ($child instanceof \RtfHtmlPhp\ControlWord) {
                switch ($child->word) {
                    case 'f':
                        $fontNumber = $child->parameter;
                        break;

                    // Font family names
                    case 'froman':
                        $font->family = "serif";
                        break;
                    case 'fswiss':
                        $font->family = "sans-serif";
                        break;
                    case 'fmodern':
                        $font->family = "monospace";
                        break;
                    case 'fscript':
                        $font->family = "cursive";
                        break;
                    case 'fdecor':
                        $font->family = "fantasy";
                        break;

                    // Not mapped: fnil (default font), ftech (symbol),
                    // fbidi (bidirectional font)

                    case 'fcharset': // charset
                        $font->charset = $this->getEncodingFromCharset($child->parameter);
                        break;
                    case 'cpg': // code page
                        $font->codepage = $this->getEncodingFromCodepage($child->parameter);
                        break;
                    case 'fprq': // Font pitch
                        $font->fprq = $child->parameter;
                        break;
                }
            }

            // Control text contains the font name, if any
            if ($child instanceof \RtfHtmlPhp\Text) {
                // Drop the ';' delimiter only when it is really there; the
                // name and the delimiter arrive as separate texts when a
                // subgroup (e.g. an alternate font name) sits between them
                $name = rtrim(rtrim($child->text), ';');

                // A bare delimiter must not wipe a name stored earlier
                if ($name !== '' || !isset($font->name)) {
                    $font->name = $name;
                }
            }

            // Subgroups (\falt, \fontemb, \fontfile, \panose) and the '*'
            // control symbol are ignored.
        }

        State::setFont($fontNumber, $font);
    }

    /**
     * Loads every font definition found in a font table group.
     *
     * @param \RtfHtmlPhp\Group $fontTblGrp The \fonttbl group
     *
     * @return void
     */
    protected function extractFontTable($fontTblGrp)
    {
        // {' \fonttbl (<fontinfo> | ('{' <fontinfo> '}'))+ '}'
        // <fontnum><fontfamily><fcharset>?<fprq>?<panose>?
        // <nontaggedname>?<fontemb>?<codepage>? <fontname><fontaltname>? ';'

        // The Font Table group contains the control word "fonttbl" and some
        // subgroups. Go through the subgroups, ignoring the "fonttbl"
        // identifier.
        foreach ($fontTblGrp->children as $child) {
            // Ignore non-groups, which should be the fonttbl control word
            if (!($child instanceof \RtfHtmlPhp\Group)) {
                continue;
            }

            // Load the font specification in the subgroup
            $this->loadFont($child);
        }
    }

    /**
     * Builds the color table and stores it in State::$colortbl.
     *
     * Entries are '#rrggbb' strings. When the table starts with a bare
     * delimiter, entry 0 (the 'auto' color) is stored as 0.
     *
     * @param array $colorTblGrp Children of the \colortbl group
     *
     * @return void
     */
    protected function extractColorTable($colorTblGrp)
    {
        // {\colortbl;\red0\green0\blue0;}
        // Index 0 of the RTF color table is the 'auto' color
        $colortbl = [];
        $count = count($colorTblGrp);
        $color = '';
        $rgb = ['red' => 0, 'green' => 0, 'blue' => 0];
        $seen = false;

        // Child 0 is the 'colortbl' control word itself
        for ($i = 1; $i < $count; $i++) {
            $entry = $colorTblGrp[$i];

            if ($entry instanceof \RtfHtmlPhp\ControlWord) {
                // Pick the components by name. Reading three consecutive
                // control words positionally breaks as soon as another
                // word precedes them or one component is left out.
                if (isset($rgb[$entry->word])) {
                    $rgb[$entry->word] = (int) $entry->parameter;
                    $seen = true;
                }
            } elseif ($entry instanceof \RtfHtmlPhp\Text) {
                // This is a delimiter ';' closing the current entry
                if ($seen) {
                    $color = sprintf('#%02x%02x%02x', $rgb['red'], $rgb['green'], $rgb['blue']);
                    $rgb = ['red' => 0, 'green' => 0, 'blue' => 0];
                    $seen = false;
                }

                if ($i != 1) {
                    // Store the already extracted color
                    $colortbl[] = $color;
                } else {
                    // This is the 'auto' color
                    $colortbl[] = 0;
                }
            }
        }

        State::$colortbl = $colortbl;
    }

    /**
     * Collects the properties of a picture group and appends the markup
     * returned by Image::printImage() to the output.
     *
     * @param array $pictGrp Children of the \pict group
     *
     * @return void
     */
    protected function extractImage($pictGrp)
    {
        $image = new Image();

        foreach ($pictGrp as $child) {
            if ($child instanceof \RtfHtmlPhp\ControlWord) {
                switch ($child->word) {
                    // Picture format (wmetafile is not handled)
                    case "emfblip":
                        $image->format = 'emf';
                        break;
                    case "pngblip":
                        $image->format = 'png';
                        break;
                    case "jpegblip":
                        $image->format = 'jpeg';
                        break;
                    case "macpict":
                        $image->format = 'pict';
                        break;

                    // Picture size and scaling
                    case "picw":
                        $image->width = $child->parameter;
                        break;
                    case "pich":
                        $image->height = $child->parameter;
                        break;
                    case "picwgoal":
                        $image->goalWidth = $child->parameter;
                        break;
                    case "pichgoal":
                        $image->goalHeight = $child->parameter;
                        break;
                    case "picscalex":
                        $image->pcScaleX = $child->parameter;
                        break;
                    case "picscaley":
                        $image->pcScaleY = $child->parameter;
                        break;

                    // Binary or hexadecimal data?
                    case "bin":
                        $image->binarySize = $child->parameter;
                        break;
                }
            } elseif ($child instanceof \RtfHtmlPhp\Text) {
                // Store data
                $image->imageData = $child->text;
            }
        }

        // Output image
        $this->output .= $image->printImage();
    }

    /**
     * Formats one group.
     *
     * Groups with a special type are dispatched to their extractor or
     * skipped. Any other group gets its own copy of the current state,
     * which is dropped again once its children have been formatted.
     *
     * @param \RtfHtmlPhp\Group $group The group to process
     *
     * @return void
     */
    protected function processGroup($group)
    {
        // Special group processing
        switch ($group->getType()) {
            case "fonttbl": // Extract font table
                $this->extractFontTable($group);
                return;
            case "colortbl": // Extract color table
                $this->extractColorTable($group->children);
                return;
            case "stylesheet": // Stylesheet extraction not yet supported
                return;
            case "info": // Ignore document information
                return;
            case "pict":
                $this->extractImage($group->children);
                return;
            case "nonshppict": // Ignore alternative images
                return;
            case "*": // Process destination
                $this->processDestination($group->children);
                return;
        }

        // Push a new state onto the stack
        $this->state = clone $this->state;
        array_push($this->states, $this->state);

        foreach ($group->children as $child) {
            $this->formatEntry($child);
        }

        // Pop state from stack
        array_pop($this->states);
        $this->state = $this->states[count($this->states) - 1];
    }

    /**
     * Formats the content of a destination group ({\*\word ...}).
     *
     * Only \shppict and \htmltag are handled; every other destination
     * is dropped.
     *
     * @param array $dest Children of the destination group
     *
     * @return void
     */
    protected function processDestination($dest)
    {
        // Entry 0 is the '*' symbol; the destination name must follow it
        if (!isset($dest[1]) || !($dest[1] instanceof \RtfHtmlPhp\ControlWord)) {
            return;
        }

        $count = count($dest);

        if ($dest[1]->word == "shppict") {
            // Word 97 picture
            for ($i = 2; $i < $count; $i++) {
                $this->formatEntry($dest[$i]);
            }
        } elseif ($dest[1]->word == "htmltag") {
            for ($i = 2; $i < $count; $i++) {
                $entry = $dest[$i];

                if ($entry instanceof \RtfHtmlPhp\Text) {
                    // Text inside \htmltag is appended verbatim
                    $this->output .= $entry->text;
                } else {
                    $this->formatEntry($entry);
                }
            }
        }
    }

    /**
     * Dispatches an entry of the document tree to its formatter.
     *
     * @param object $entry Group, ControlWord, ControlSymbol or Text
     *
     * @return void
     */
    protected function formatEntry($entry)
    {
        if ($entry instanceof \RtfHtmlPhp\Group) {
            $this->processGroup($entry);
        } elseif ($entry instanceof \RtfHtmlPhp\ControlWord) {
            $this->formatControlWord($entry);
        } elseif ($entry instanceof \RtfHtmlPhp\ControlSymbol) {
            $this->formatControlSymbol($entry);
        } elseif ($entry instanceof \RtfHtmlPhp\Text) {
            $this->formatText($entry);
        }
    }

    /**
     * Applies a control word to the current state or to the output.
     *
     * Unknown control words are ignored.
     *
     * @param \RtfHtmlPhp\ControlWord $word The control word
     *
     * @return void
     */
    protected function formatControlWord($word)
    {
        switch ($word->word) {
            case 'fromhtml':
                $this->fromhtml = $word->parameter > 0;
                break;
            case 'htmlrtf':
                $this->state->htmlrtf = $word->parameter > 0;
                break;

            case 'plain': // Reset font formatting properties to default.
            case 'pard': // Reset to default paragraph properties.
                $this->state->reset($this->defaultFont);
                break;

            // Font formatting properties
            case 'b': // bold
                $this->state->bold = $word->parameter;
                break;
            case 'i': // italic
                $this->state->italic = $word->parameter;
                break;
            case 'ul': // underline
                $this->state->underline = $word->parameter;
                break;
            case 'ulnone': // no underline
                $this->state->underline = false;
                break;
            case 'strike': // strike-through
                $this->state->strike = $word->parameter;
                break;
            case 'v': // hidden
                $this->state->hidden = $word->parameter;
                break;
            case 'fs': // Font size
                $this->state->fontsize = ceil(($word->parameter / 24) * 16);
                break;
            case 'f': // Font
                $this->state->font = $word->parameter;
                break;
            case 'deff': // Store default font
                $this->defaultFont = $word->parameter;
                break;

            // Colors
            case 'cf':
            case 'chcfpat':
                $this->state->fontcolor = $word->parameter;
                break;
            case 'cb':
            case 'chcbpat':
                $this->state->background = $word->parameter;
                break;
            case 'highlight':
                $this->state->hcolor = $word->parameter;
                break;

            // Special characters. Encapsulated HTML receives the literal
            // character; generated HTML receives an ASCII entity, which is
            // valid whatever output encoding was requested.
            case 'lquote': // U+2018
                $this->write($this->fromhtml ? "‘" : "&lsquo;");
                break;
            case 'rquote': // U+2019
                $this->write($this->fromhtml ? "’" : "&rsquo;");
                break;
            case 'ldblquote': // U+201C
                $this->write($this->fromhtml ? "“" : "&ldquo;");
                break;
            case 'rdblquote': // U+201D
                $this->write($this->fromhtml ? "”" : "&rdquo;");
                break;
            case 'bullet': // U+2022
                $this->write($this->fromhtml ? "•" : "&bull;");
                break;
            case 'endash': // U+2013
                $this->write($this->fromhtml ? "–" : "&ndash;");
                break;
            case 'emdash': // U+2014
                $this->write($this->fromhtml ? "—" : "&mdash;");
                break;
            case 'enspace': // U+2002, spelled as UTF-8 bytes to stay unambiguous
                $this->write($this->fromhtml ? "\xE2\x80\x82" : "&ensp;");
                break;
            case 'emspace': // U+2003, spelled as UTF-8 bytes to stay unambiguous
                $this->write($this->fromhtml ? "\xE2\x80\x83" : "&emsp;");
                break;
            case 'tab': // Character value 9
                $this->write($this->fromhtml ? "\t" : " ");
                break;
            case 'line': // Line break; appended directly, bypassing write()
                $this->output .= $this->fromhtml ? "\n" : "<br/>";
                break;

            // Unicode characters
            case 'u':
                $uchar = $this->decodeUnicode($word->parameter);
                $this->write($uchar);
                break;

            // Paragraphs
            case 'par':
            case 'row':
                if ($this->fromhtml) {
                    $this->output .= "\n";
                    break;
                }
                // Close previously opened tags
                $this->closeTags();
                // Begin a new paragraph
                $this->openTag('p');
                break;

            // Code pages
            case 'ansi':
            case 'mac':
            case 'pc':
            case 'pca':
                $this->rtfEncoding = $this->getEncodingFromCodepage($word->word);
                break;
            case 'ansicpg':
                if ($word->parameter) {
                    $this->rtfEncoding = $this->getEncodingFromCodepage($word->parameter);
                }
                break;
        }
    }

    /**
     * Converts a character code into the output encoding.
     *
     * With the default $srcEnc the code is taken as a Unicode code point
     * (\uN). With any other $srcEnc the code is taken as one byte in that
     * encoding (\'hh) and converted with iconv; when no source encoding is
     * known or the conversion fails, the code itself is used as the code
     * point.
     *
     * @param int         $code   Character code
     * @param string|null $srcEnc Source encoding (iconv name)
     *
     * @return string The character in the output encoding
     */
    protected function decodeUnicode($code, $srcEnc = 'UTF-8')
    {
        // RTF stores \uN as a signed 16-bit number: code points above
        // 32767 are written as negative values.
        if ($code < 0) {
            $code += 65536;
        }

        $utf8 = false;

        // Without a known source encoding iconv() would silently use the
        // locale charset, so go straight to the raw-code fallback instead.
        if ($srcEnc != 'UTF-8' && $srcEnc !== null && $srcEnc !== '') {
            // Convert character to Unicode
            $utf8 = iconv($srcEnc, 'UTF-8', chr($code));
        }

        if ($this->encoding == 'HTML-ENTITIES') {
            $ord = $utf8 !== false ? $this->ordUtf8($utf8) : null;

            return $ord !== null ? "&#{$ord};" : "&#{$code};";
        }

        if ($this->encoding == 'UTF-8') {
            return $utf8 !== false
                ? $utf8
                : mb_convert_encoding("&#{$code};", $this->encoding, 'HTML-ENTITIES');
        }

        return $utf8 !== false
            ? mb_convert_encoding($utf8, $this->encoding, 'UTF-8')
            : mb_convert_encoding("&#{$code};", $this->encoding, 'HTML-ENTITIES');
    }

    /**
     * Appends text to the output, opening the paragraph and the styled
     * span it belongs to when needed.
     *
     * @param string $txt Text, already escaped and encoded for output
     *
     * @return void
     */
    protected function write($txt)
    {
        // Ignore regions that are not part of the original (encapsulated) HTML content
        if ($this->state->htmlrtf) {
            return;
        }

        // Encapsulated HTML carries its own markup
        if ($this->fromhtml) {
            $this->output .= $txt;
            return;
        }

        if (!isset($this->openedTags['p'])) {
            // Create the first paragraph
            $this->openTag('p');
        }

        // Create a new 'span' element only when a style change occurs.
        // 1st case: style change occurred
        // 2nd case: there is no change in style but the already created 'span'
        // element is somehow closed (ex. because of an end of paragraph)
        if (!$this->state->equals($this->previousState) || empty($this->openedTags['span'])) {
            // If applicable close previously opened 'span' tag
            $this->closeTag('span');

            $style = $this->state->printStyle();

            // Keep track of preceding style
            $this->previousState = clone $this->state;

            // Create style attribute and open span
            $attr = $style ? "style=\"{$style}\"" : "";
            $this->openTag('span', $attr);
        }

        $this->output .= $txt;
    }

    /**
     * Writes an opening tag and marks it as opened.
     *
     * @param string $tag  Tag name
     * @param string $attr Attribute string to put inside the tag
     *
     * @return void
     */
    protected function openTag($tag, $attr = '')
    {
        // Encapsulated HTML brings its own tags, do not generate any
        if ($this->fromhtml) {
            return;
        }

        $this->output .= $attr ? "<{$tag} {$attr}>" : "<{$tag}>";
        $this->openedTags[$tag] = true;
    }

    /**
     * Closes a tag that is currently marked as opened.
     *
     * When the output ends with the bare opening tag, the element is
     * empty: an empty 'p' is replaced by a line break and any other
     * empty element is removed.
     *
     * @param string $tag Tag name
     *
     * @return void
     */
    protected function closeTag($tag)
    {
        if ($this->fromhtml) {
            return;
        }

        if (empty($this->openedTags[$tag])) {
            return;
        }

        $opening = "<{$tag}>";
        $length = strlen($opening);

        // Check for empty html elements
        if (substr($this->output, -$length) == $opening) {
            $stripped = substr($this->output, 0, -$length);

            switch ($tag) {
                case 'p':
                    // Replace empty 'p' element with a line break
                    $this->output = $stripped . "<br>";
                    break;
                default:
                    // Delete empty elements
                    $this->output = $stripped;
                    break;
            }
        } else {
            $this->output .= "</{$tag}>";
        }

        $this->openedTags[$tag] = false;
    }

    /**
     * Closes all opened tags
     *
     * @return void
     */
    protected function closeTags()
    {
        foreach ($this->openedTags as $tag => $b) {
            $this->closeTag($tag);
        }
    }

    /**
     * Writes the character a control symbol stands for.
     *
     * Symbols other than the ones listed here are ignored.
     *
     * @param \RtfHtmlPhp\ControlSymbol $symbol The control symbol
     *
     * @return void
     */
    protected function formatControlSymbol($symbol)
    {
        switch ($symbol->symbol) {
            case '\'': // Character given by its hexadecimal code
                $enc = $this->getSourceEncoding();
                $uchar = $this->decodeUnicode($symbol->parameter, $enc);
                $this->write($uchar);
                break;
            case '~': // Non breaking space
                $this->write("&nbsp;");
                break;
            case '-': // Optional hyphen
                $this->write("&#173;");
                break;
            case '_': // Non breaking hyphen
                $this->write("&#8209;");
                break;
            case '{': // Escaped literal characters
            case '}':
            case '\\':
                $this->write($symbol->symbol);
                break;
        }
    }

    /**
     * Escapes a text entry and writes it in the output encoding.
     *
     * @param \RtfHtmlPhp\Text $text The text entry
     *
     * @return void
     */
    protected function formatText($text)
    {
        // Convert special characters to HTML entities
        $txt = htmlspecialchars($text->text, ENT_NOQUOTES, 'UTF-8');

        if ($this->encoding == 'HTML-ENTITIES') {
            $this->write($txt);
        } else {
            $this->write(mb_convert_encoding($txt, $this->encoding, 'UTF-8'));
        }
    }

    /**
     * Returns the encoding used to decode \'hh characters.
     *
     * The code page of the current font wins over its charset, which
     * wins over the document-level encoding.
     *
     * @return string|null Encoding name or NULL when none is known
     */
    protected function getSourceEncoding()
    {
        if (isset($this->state->font)) {
            if (isset(State::$fonttbl[$this->state->font]->codepage)) {
                return State::$fonttbl[$this->state->font]->codepage;
            }

            if (isset(State::$fonttbl[$this->state->font]->charset)) {
                return State::$fonttbl[$this->state->font]->charset;
            }
        }

        return $this->rtfEncoding;
    }

    /**
     * Convert RTF charset identifier into an encoding name (for iconv)
     *
     * @param int $charset Charset identifier
     *
     * @return string|null Encoding name or NULL on unknown charset
     */
    protected function getEncodingFromCharset($charset)
    {
        // Maps windows character sets to iconv encoding names
        $map = array(
            0   => 'CP1252', // ANSI: Western Europe
            1   => 'CP1252', //*Default
            2   => 'CP1252', //*Symbol
            3   => null,     // Invalid
            77  => 'MAC',    //*also [MacRoman]: Macintosh
            128 => 'CP932',  //*or [Shift_JIS]?: Japanese
            129 => 'CP949',  //*also [UHC]: Korean (Hangul)
            130 => 'CP1361', //*also [JOHAB]: Korean (Johab)
            134 => 'CP936',  //*or [GB2312]?: Simplified Chinese
            136 => 'CP950',  //*or [BIG5]?: Traditional Chinese
            161 => 'CP1253', // Greek
            162 => 'CP1254', // Turkish (latin 5)
            163 => 'CP1258', // Vietnamese
            177 => 'CP1255', // Hebrew
            178 => 'CP1256', // Simplified Arabic
            179 => 'CP1256', //*Traditional Arabic
            180 => 'CP1256', //*Arabic User
            181 => 'CP1255', //*Hebrew User
            186 => 'CP1257', // Baltic
            204 => 'CP1251', // Russian (Cyrillic)
            222 => 'CP874',  // Thai
            238 => 'CP1250', // Eastern European (latin 2)
            254 => 'CP437',  //*also [IBM437][437]: PC437
            255 => 'CP437',  //*OEM still PC437
        );

        if (isset($map[$charset])) {
            return $map[$charset];
        }

        return null;
    }

    /**
     * Convert RTF CodePage identifier into an encoding name (for iconv)
     *
     * @param string $cpg CodePage identifier
     *
     * @return string|null Encoding name or NULL on unknown CodePage
     */
    protected function getEncodingFromCodepage($cpg)
    {
        // Code pages 709, 710, 711 and 720 (Arabic variants) are left
        // out because iconv does not support them.
        $map = array(
            'ansi' => 'CP1252',
            'mac'  => 'MAC',
            'pc'   => 'CP437',
            'pca'  => 'CP850',
            437    => 'CP437',    // United States IBM
            708    => 'ASMO-708', // also [ISO-8859-6][ARABIC] Arabic
            819    => 'CP819',    // Windows 3.1 (US and Western Europe)
            850    => 'CP850',    // IBM multilingual
            852    => 'CP852',    // Eastern European
            860    => 'CP860',    // Portuguese
            862    => 'CP862',    // Hebrew
            863    => 'CP863',    // French Canadian
            864    => 'CP864',    // Arabic
            865    => 'CP865',    // Norwegian
            866    => 'CP866',    // Soviet Union
            874    => 'CP874',    // Thai
            932    => 'CP932',    // Japanese
            936    => 'CP936',    // Simplified Chinese
            949    => 'CP949',    // Korean
            950    => 'CP950',    // Traditional Chinese
            1250   => 'CP1250',   // Windows 3.1 (Eastern European)
            1251   => 'CP1251',   // Windows 3.1 (Cyrillic)
            1252   => 'CP1252',   // Western European
            1253   => 'CP1253',   // Greek
            1254   => 'CP1254',   // Turkish
            1255   => 'CP1255',   // Hebrew
            1256   => 'CP1256',   // Arabic
            1257   => 'CP1257',   // Baltic
            1258   => 'CP1258',   // Vietnamese
            1361   => 'CP1361',   // Johab
        );

        if (isset($map[$cpg])) {
            return $map[$cpg];
        }

        return null;
    }

    /**
     * Returns the code point of the first character of a UTF-8 string.
     *
     * Sequences of up to six bytes are decoded. Continuation bytes are
     * not checked for the 10xxxxxx form.
     *
     * @param string $chr UTF-8 encoded character
     *
     * @return int|null Code point, or NULL when the lead byte is invalid
     *                  or the sequence is shorter than the lead byte says
     */
    protected function ordUtf8($chr)
    {
        $lead = ord($chr);

        if ($lead <= 127) {
            return $lead;
        }

        // The lead byte gives the sequence length and the marker to strip
        if ($lead >= 192 && $lead <= 223) {
            $length = 2;
            $marker = 192;
        } elseif ($lead >= 224 && $lead <= 239) {
            $length = 3;
            $marker = 224;
        } elseif ($lead >= 240 && $lead <= 247) {
            $length = 4;
            $marker = 240;
        } elseif ($lead >= 248 && $lead <= 251) {
            $length = 5;
            $marker = 248;
        } elseif ($lead >= 252 && $lead <= 253) {
            $length = 6;
            $marker = 252;
        } else {
            // Stray continuation byte or 0xFE/0xFF
            return null;
        }

        // Never index past the end of a truncated sequence
        if (strlen($chr) < $length) {
            return null;
        }

        // Each continuation byte contributes its six low bits
        $code = $lead - $marker;
        for ($i = 1; $i < $length; $i++) {
            $code = $code * 64 + (ord($chr[$i]) - 128);
        }

        return $code;
    }
}