%PDF- %PDF-
Direktori : /www/varak.net/wiki.varak.net/vendor/wikimedia/remex-html/RemexHtml/ |
Current File : //www/varak.net/wiki.varak.net/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php |
<?php

namespace RemexHtml;

/**
 * Generate HTMLData.php. This can be executed e.g. with
 *
 * echo 'RemexHtml\GenerateDataFiles::run()' | hhvm bin/test.php
 *
 * The generator reads entities.json from this directory, combines it with the
 * static tables below and writes the result to HTMLData.php in this directory.
 */
class GenerateDataFiles {
	const NS_HTML = 'http://www.w3.org/1999/xhtml';
	const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
	const NS_SVG = 'http://www.w3.org/2000/svg';
	const NS_XLINK = 'http://www.w3.org/1999/xlink';
	const NS_XML = 'http://www.w3.org/XML/1998/namespace';
	const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';

	/**
	 * The only public entry point
	 */
	public static function run() {
		$instance = new self;
		$instance->execute();
	}

	/**
	 * This is the character entity mapping table copied from
	 * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references
	 *
	 * Each entry must stay on its own line: execute() parses this text with a
	 * multi-line regex anchored at the start of every line.
	 */
	private static $legacyNumericEntityData = <<<EOT
0x00 U+FFFD REPLACEMENT CHARACTER
0x80 U+20AC EURO SIGN (€)
0x82 U+201A SINGLE LOW-9 QUOTATION MARK (‚)
0x83 U+0192 LATIN SMALL LETTER F WITH HOOK (ƒ)
0x84 U+201E DOUBLE LOW-9 QUOTATION MARK („)
0x85 U+2026 HORIZONTAL ELLIPSIS (…)
0x86 U+2020 DAGGER (†)
0x87 U+2021 DOUBLE DAGGER (‡)
0x88 U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
0x89 U+2030 PER MILLE SIGN (‰)
0x8A U+0160 LATIN CAPITAL LETTER S WITH CARON (Š)
0x8B U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
0x8C U+0152 LATIN CAPITAL LIGATURE OE (Œ)
0x8E U+017D LATIN CAPITAL LETTER Z WITH CARON (Ž)
0x91 U+2018 LEFT SINGLE QUOTATION MARK (‘)
0x92 U+2019 RIGHT SINGLE QUOTATION MARK (’)
0x93 U+201C LEFT DOUBLE QUOTATION MARK (“)
0x94 U+201D RIGHT DOUBLE QUOTATION MARK (”)
0x95 U+2022 BULLET (•)
0x96 U+2013 EN DASH (–)
0x97 U+2014 EM DASH (—)
0x98 U+02DC SMALL TILDE (˜)
0x99 U+2122 TRADE MARK SIGN (™)
0x9A U+0161 LATIN SMALL LETTER S WITH CARON (š)
0x9B U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
0x9C U+0153 LATIN SMALL LIGATURE OE (œ)
0x9E U+017E LATIN SMALL LETTER Z WITH CARON (ž)
0x9F U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
EOT;

	/**
	 * This is the list of public identifier prefixes that cause quirks mode
	 * to be set, from § 8.2.5.4.1
	 */
	private static $quirkyPublicPrefixes = [
		"+//Silmaril//dtd html Pro v0r11 19970101//",
		"-//AS//DTD HTML 3.0 asWedit + extensions//",
		"-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
		"-//IETF//DTD HTML 2.0 Level 1//",
		"-//IETF//DTD HTML 2.0 Level 2//",
		"-//IETF//DTD HTML 2.0 Strict Level 1//",
		"-//IETF//DTD HTML 2.0 Strict Level 2//",
		"-//IETF//DTD HTML 2.0 Strict//",
		"-//IETF//DTD HTML 2.0//",
		"-//IETF//DTD HTML 2.1E//",
		"-//IETF//DTD HTML 3.0//",
		"-//IETF//DTD HTML 3.2 Final//",
		"-//IETF//DTD HTML 3.2//",
		"-//IETF//DTD HTML 3//",
		"-//IETF//DTD HTML Level 0//",
		"-//IETF//DTD HTML Level 1//",
		"-//IETF//DTD HTML Level 2//",
		"-//IETF//DTD HTML Level 3//",
		"-//IETF//DTD HTML Strict Level 0//",
		"-//IETF//DTD HTML Strict Level 1//",
		"-//IETF//DTD HTML Strict Level 2//",
		"-//IETF//DTD HTML Strict Level 3//",
		"-//IETF//DTD HTML Strict//",
		"-//IETF//DTD HTML//",
		"-//Metrius//DTD Metrius Presentational//",
		"-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
		"-//Microsoft//DTD Internet Explorer 2.0 HTML//",
		"-//Microsoft//DTD Internet Explorer 2.0 Tables//",
		"-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
		"-//Microsoft//DTD Internet Explorer 3.0 HTML//",
		"-//Microsoft//DTD Internet Explorer 3.0 Tables//",
		"-//Netscape Comm. Corp.//DTD HTML//",
		"-//Netscape Comm. Corp.//DTD Strict HTML//",
		"-//O'Reilly and Associates//DTD HTML 2.0//",
		"-//O'Reilly and Associates//DTD HTML Extended 1.0//",
		"-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
		"-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
		"-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
		"-//Spyglass//DTD HTML 2.0 Extended//",
		"-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
		"-//Sun Microsystems Corp.//DTD HotJava HTML//",
		"-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
		"-//W3C//DTD HTML 3 1995-03-24//",
		"-//W3C//DTD HTML 3.2 Draft//",
		"-//W3C//DTD HTML 3.2 Final//",
		"-//W3C//DTD HTML 3.2//",
		"-//W3C//DTD HTML 3.2S Draft//",
		"-//W3C//DTD HTML 4.0 Frameset//",
		"-//W3C//DTD HTML 4.0 Transitional//",
		"-//W3C//DTD HTML Experimental 19960712//",
		"-//W3C//DTD HTML Experimental 970421//",
		"-//W3C//DTD W3 HTML//",
		"-//W3O//DTD W3 HTML 3.0//",
		"-//WebTechs//DTD Mozilla HTML 2.0//",
		"-//WebTechs//DTD Mozilla HTML//",
	];

	/**
	 * Comma-separated element names per namespace. execute() splits each
	 * string on commas and trims every name, so the line wrapping inside the
	 * literals is not significant.
	 */
	private static $special = [
		self::NS_HTML => 'address, applet, area, article, aside, base, basefont,
			bgsound, blockquote, body, br, button, caption, center, col, colgroup,
			dd, details, dir, div, dl, dt, embed, fieldset, figcaption, figure,
			footer, form, frame, frameset, h1, h2, h3, h4, h5, h6, head, header,
			hr, html, iframe, img, input, li, link, listing, main, marquee, menu,
			menuitem, meta, nav, noembed, noframes, noscript, object, ol, p,
			param, plaintext, pre, script, section, select, source, style,
			summary, table, tbody, td, template, textarea, tfoot, th, thead,
			title, tr, track, ul, wbr, xmp',
		self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml',
		self::NS_SVG => 'foreignObject, desc, title',
	];

	// @codingStandardsIgnoreStart
	/**
	 * The NameStartChar production from XML 1.0, but with colon excluded since
	 * there's a lot of ways to break namespace validation, and we actually need
	 * this for local names
	 */
	private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]';

	/** The NameChar production from XML 1.0 */
	private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]';
	// @codingStandardsIgnoreEnd

	/**
	 * Quote a literal string for use inside a regex that is compiled with the
	 * "x" (extended) modifier and delimited by "~".
	 *
	 * preg_quote() alone is not sufficient in extended mode: unescaped
	 * whitespace is ignored by PCRE and an unescaped "#" starts a comment, so
	 * both are backslash-escaped here. Every other run of characters is passed
	 * through preg_quote() unchanged.
	 *
	 * @param string $text The literal text to match
	 * @return string The regex fragment
	 */
	private function quoteForExtendedRegex( $text ) {
		// With PREG_SPLIT_DELIM_CAPTURE the result alternates between plain
		// text (even indexes) and a single whitespace or "#" character (odd
		// indexes).
		$pieces = preg_split( '/([\s#])/', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
		$quoted = '';
		foreach ( $pieces as $index => $piece ) {
			if ( $index % 2 === 1 ) {
				$quoted .= '\\' . $piece;
			} else {
				$quoted .= preg_quote( $piece, '~' );
			}
		}
		return $quoted;
	}

	/**
	 * Build the body of a regex alternation matching any of the given literal
	 * strings. Each branch is placed on its own line, indented with two tabs,
	 * which is only valid when the final regex uses the "x" modifier.
	 *
	 * @param string[] $array The literal strings
	 * @param int $skip Number of leading bytes to drop from every string
	 *   before quoting it. The entity names are passed with 1 so that their
	 *   first character is omitted; the quirks-mode prefixes must be passed
	 *   with 0 so that their leading "+" or "-" is kept.
	 * @return string
	 */
	private function makeRegexAlternation( $array, $skip = 0 ) {
		$branches = [];
		foreach ( $array as $value ) {
			$literal = $skip > 0 ? substr( $value, $skip ) : $value;
			$branches[] = "\n\t\t" . $this->quoteForExtendedRegex( $literal );
		}
		return implode( '|', $branches );
	}

	/**
	 * Parse a character class written in the notation of the XML 1.0 grammar
	 * into a list of code point ranges.
	 *
	 * @param string $input Alternatives separated by "|". Each alternative is
	 *   a quoted single character, a [x-y] range of literal characters, a
	 *   #xHEX code point, a [#xHEX-#xHEX] range, or the name of a nonterminal.
	 * @param string[] $nonterminals Map of nonterminal name to its definition
	 *   in the same notation. Nonterminals are expanded one level only: the
	 *   map is not passed on to the recursive call.
	 * @return int[][] List of [ start, end ] pairs (inclusive), sorted by start
	 * @throws \Exception If an alternative is not recognised
	 */
	private function getCharRanges( $input, $nonterminals = [] ) {
		$ranges = [];

		foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) {
			if ( preg_match( '/^"(.)"$/', $case, $m ) ) {
				// Single ASCII character
				$ranges[] = [ ord( $m[1] ), ord( $m[1] ) ];
			} elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) {
				// ASCII range
				$ranges[] = [ ord( $m[1] ), ord( $m[2] ) ];
			} elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) {
				// Single encoded character
				$codepoint = intval( $m[1], 16 );
				$ranges[] = [ $codepoint, $codepoint ];
			} elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) {
				// Encoded range
				$ranges[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ];
			} elseif ( isset( $nonterminals[$case] ) ) {
				$ranges = array_merge( $ranges, $this->getCharRanges( $nonterminals[$case] ) );
			} else {
				throw new \Exception( "Invalid XML char case \"$case\"" );
			}
		}

		usort( $ranges, function ( $a, $b ) {
			return $a[0] - $b[0];
		} );
		return $ranges;
	}

	/**
	 * Build a flat table describing the complement of a character class, i.e.
	 * every code point from 0 to 0x10ffff that the class does NOT contain.
	 *
	 * The table is a flat list of 4-tuples: start, end, 0, 0xffffff.
	 * NOTE(review): this looks like the "convmap" layout accepted by
	 * mb_encode_numericentity(); confirm against the consumer of HTMLData.
	 *
	 * @param string $input Character class in XML grammar notation
	 * @param string[] $nonterminals See getCharRanges()
	 * @return int[]
	 */
	private function makeConvTable( $input, $nonterminals = [] ) {
		$ranges = $this->getCharRanges( $input, $nonterminals );
		$numRanges = count( $ranges );

		// Invert the ranges, produce a set complement
		$lastEndPlusOne = 0;
		$table = [];
		for ( $i = 0; $i < $numRanges; $i++ ) {
			$start = $ranges[$i][0];
			$end = $ranges[$i][1];
			// Merge consecutive and overlapping ranges. The list is sorted by
			// start, so anything starting at or before $end + 1 extends the
			// current range.
			for ( $j = $i + 1; $j < $numRanges; $j++ ) {
				if ( $ranges[$j][0] <= $end + 1 ) {
					$end = max( $end, $ranges[$j][1] );
					$i = $j;
				} else {
					break;
				}
			}

			// Emit the gap before this range. There is no gap when the very
			// first range starts at code point zero.
			if ( $start > $lastEndPlusOne ) {
				$table[] = $lastEndPlusOne;
				$table[] = $start - 1;
				$table[] = 0;
				$table[] = 0xffffff;
			}

			$lastEndPlusOne = $end + 1;
		}

		// Last range, unless the class already extends to the last code point
		if ( $lastEndPlusOne <= 0x10ffff ) {
			$table[] = $lastEndPlusOne;
			$table[] = 0x10ffff;
			$table[] = 0;
			$table[] = 0xffffff;
		}

		return $table;
	}

	/**
	 * Format a flat conversion table as PHP array source code, with one
	 * 4-tuple per line.
	 *
	 * @param int[] $table As returned by makeConvTable()
	 * @return string
	 */
	private function encodeConvTable( $table ) {
		return "[\n\t\t" . implode( ",\n\t\t", array_map(
			function ( $a ) {
				return implode( ', ', $a );
			},
			array_chunk( $table, 4 ) ) ) . ' ]';
	}

	/**
	 * Read entities.json, compute all the data tables and write HTMLData.php.
	 *
	 * @throws \Exception If entities.json is missing or cannot be parsed, or
	 *   if HTMLData.php cannot be written
	 */
	private function execute() {
		$entitiesJson = file_get_contents( __DIR__ . '/entities.json' );

		if ( $entitiesJson === false ) {
			throw new \Exception( "Please download entities.json from " .
				"https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" );
		}

		$decoded = json_decode( $entitiesJson );
		if ( !is_object( $decoded ) ) {
			throw new \Exception( "Unable to parse entities.json: " . json_last_error_msg() );
		}
		$entities = (array)$decoded;

		// The keys are used without their first character
		// (presumably the leading "&" -- confirm against entities.json)
		$entityTranslations = [];
		foreach ( $entities as $entity => $info ) {
			$entityTranslations[substr( $entity, 1 )] = $info->characters;
		}

		// Sort descending by length
		uksort( $entities, function ( $a, $b ) {
			if ( strlen( $a ) > strlen( $b ) ) {
				return -1;
			} elseif ( strlen( $a ) < strlen( $b ) ) {
				return 1;
			} else {
				return strcmp( $a, $b );
			}
		} );

		$entityRegex = $this->makeRegexAlternation( array_keys( $entities ), 1 );

		$matches = [];
		preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m',
			self::$legacyNumericEntityData, $matches, PREG_SET_ORDER );

		$legacyNumericEntities = [];
		foreach ( $matches as $match ) {
			$legacyNumericEntities[ intval( $match[1], 16 ) ] =
				\UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) );
		}

		// The prefixes are matched in full, including their first character
		$quirkyRegex =
			'~' .
			$this->makeRegexAlternation( self::$quirkyPublicPrefixes ) .
			'~xAi';

		$nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar );
		$nameCharConvTable = $this->makeConvTable( self::$nameChar,
			[ 'NameStartChar' => self::$nameStartChar ] );

		$encEntityRegex = var_export( $entityRegex, true );
		$encTranslations = var_export( $entityTranslations, true );
		$encLegacy = var_export( $legacyNumericEntities, true );
		$encQuirkyRegex = var_export( $quirkyRegex, true );
		$encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable );
		$encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable );

		$special = [];
		foreach ( self::$special as $ns => $str ) {
			foreach ( explode( ',', $str ) as $name ) {
				$special[$ns][trim( $name )] = true;
			}
		}
		$encSpecial = var_export( $special, true );

		// The opening tag is split so that this file does not itself contain
		// a literal PHP open tag inside the heredoc.
		$fileContents = '<' . <<<PHP
?php

/**
 * This data file is machine generated, see GenerateDataFiles.php
 */

namespace RemexHtml;

class HTMLData {
	const NS_HTML = 'http://www.w3.org/1999/xhtml';
	const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
	const NS_SVG = 'http://www.w3.org/2000/svg';
	const NS_XLINK = 'http://www.w3.org/1999/xlink';
	const NS_XML = 'http://www.w3.org/XML/1998/namespace';
	const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';

	static public \$special = $encSpecial;
	static public \$namedEntityRegex = $encEntityRegex;
	static public \$namedEntityTranslations = $encTranslations;
	static public \$legacyNumericEntities = $encLegacy;
	static public \$quirkyPrefixRegex = $encQuirkyRegex;
	static public \$nameStartCharConvTable = $encNameStartCharConvTable;
	static public \$nameCharConvTable = $encNameCharConvTable;
}

PHP;

		if ( file_put_contents( __DIR__ . '/HTMLData.php', $fileContents ) === false ) {
			throw new \Exception( "Unable to write " . __DIR__ . '/HTMLData.php' );
		}
	}
}