%PDF- %PDF-
| Direktori : /www/varak.net/wiki.varak.net/vendor/wikimedia/remex-html/RemexHtml/ |
| Current File : //www/varak.net/wiki.varak.net/vendor/wikimedia/remex-html/RemexHtml/GenerateDataFiles.php |
<?php
namespace RemexHtml;
/**
* Generate HTMLData.php. This can be executed e.g. with
*
* echo 'RemexHtml\GenerateDataFiles::run()' | hhvm bin/test.php
*/
class GenerateDataFiles {
/**
 * Namespace URIs.
 *
 * NS_HTML, NS_MATHML and NS_SVG are used as the keys of self::$special.
 * The same six constants are also written out literally in the HTMLData
 * class template emitted by execute().
 */
const NS_HTML = 'http://www.w3.org/1999/xhtml';
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NS_SVG = 'http://www.w3.org/2000/svg';
const NS_XLINK = 'http://www.w3.org/1999/xlink';
const NS_XML = 'http://www.w3.org/XML/1998/namespace';
const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
/**
 * Public entry point: create a generator instance and run it.
 *
 * All of the work happens in the private execute() method; this wrapper
 * exists so that the generator can be started with a single static call.
 */
public static function run() {
	( new self )->execute();
}
/**
 * This is the character entity mapping table copied from
 * https://www.w3.org/TR/2014/REC-html5-20141028/syntax.html#tokenizing-character-references
 *
 * Each row is "0x<hex number> U+<hex code point> <character name>".
 * execute() reads only the first two columns, using a regex anchored at
 * the start of each line, so every row must begin in the first column and
 * the trailing name is free-form. The parsed result maps the integer value
 * of the first column to the UTF-8 encoding of the second.
 */
private static $legacyNumericEntityData = <<<EOT
0x00 U+FFFD REPLACEMENT CHARACTER
0x80 U+20AC EURO SIGN (€)
0x82 U+201A SINGLE LOW-9 QUOTATION MARK (‚)
0x83 U+0192 LATIN SMALL LETTER F WITH HOOK (ƒ)
0x84 U+201E DOUBLE LOW-9 QUOTATION MARK („)
0x85 U+2026 HORIZONTAL ELLIPSIS (…)
0x86 U+2020 DAGGER (†)
0x87 U+2021 DOUBLE DAGGER (‡)
0x88 U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
0x89 U+2030 PER MILLE SIGN (‰)
0x8A U+0160 LATIN CAPITAL LETTER S WITH CARON (Š)
0x8B U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
0x8C U+0152 LATIN CAPITAL LIGATURE OE (Œ)
0x8E U+017D LATIN CAPITAL LETTER Z WITH CARON (Ž)
0x91 U+2018 LEFT SINGLE QUOTATION MARK (‘)
0x92 U+2019 RIGHT SINGLE QUOTATION MARK (’)
0x93 U+201C LEFT DOUBLE QUOTATION MARK (“)
0x94 U+201D RIGHT DOUBLE QUOTATION MARK (”)
0x95 U+2022 BULLET (•)
0x96 U+2013 EN DASH (–)
0x97 U+2014 EM DASH (—)
0x98 U+02DC SMALL TILDE (˜)
0x99 U+2122 TRADE MARK SIGN (™)
0x9A U+0161 LATIN SMALL LETTER S WITH CARON (š)
0x9B U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
0x9C U+0153 LATIN SMALL LIGATURE OE (œ)
0x9E U+017E LATIN SMALL LETTER Z WITH CARON (ž)
0x9F U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
EOT;
/**
 * This is the list of public identifier prefixes that cause quirks mode
 * to be set, from § 8.2.5.4.1
 *
 * execute() compiles this list into a single anchored, case-insensitive
 * regular expression that is stored as HTMLData::$quirkyPrefixRegex.
 * Every entry is a prefix, not a complete identifier.
 */
private static $quirkyPublicPrefixes = [
"+//Silmaril//dtd html Pro v0r11 19970101//",
"-//AS//DTD HTML 3.0 asWedit + extensions//",
"-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
"-//IETF//DTD HTML 2.0 Level 1//",
"-//IETF//DTD HTML 2.0 Level 2//",
"-//IETF//DTD HTML 2.0 Strict Level 1//",
"-//IETF//DTD HTML 2.0 Strict Level 2//",
"-//IETF//DTD HTML 2.0 Strict//",
"-//IETF//DTD HTML 2.0//",
"-//IETF//DTD HTML 2.1E//",
"-//IETF//DTD HTML 3.0//",
"-//IETF//DTD HTML 3.2 Final//",
"-//IETF//DTD HTML 3.2//",
"-//IETF//DTD HTML 3//",
"-//IETF//DTD HTML Level 0//",
"-//IETF//DTD HTML Level 1//",
"-//IETF//DTD HTML Level 2//",
"-//IETF//DTD HTML Level 3//",
"-//IETF//DTD HTML Strict Level 0//",
"-//IETF//DTD HTML Strict Level 1//",
"-//IETF//DTD HTML Strict Level 2//",
"-//IETF//DTD HTML Strict Level 3//",
"-//IETF//DTD HTML Strict//",
"-//IETF//DTD HTML//",
"-//Metrius//DTD Metrius Presentational//",
"-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
"-//Microsoft//DTD Internet Explorer 2.0 HTML//",
"-//Microsoft//DTD Internet Explorer 2.0 Tables//",
"-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
"-//Microsoft//DTD Internet Explorer 3.0 HTML//",
"-//Microsoft//DTD Internet Explorer 3.0 Tables//",
"-//Netscape Comm. Corp.//DTD HTML//",
"-//Netscape Comm. Corp.//DTD Strict HTML//",
"-//O'Reilly and Associates//DTD HTML 2.0//",
"-//O'Reilly and Associates//DTD HTML Extended 1.0//",
"-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
"-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
"-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
"-//Spyglass//DTD HTML 2.0 Extended//",
"-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
"-//Sun Microsystems Corp.//DTD HotJava HTML//",
"-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
"-//W3C//DTD HTML 3 1995-03-24//",
"-//W3C//DTD HTML 3.2 Draft//",
"-//W3C//DTD HTML 3.2 Final//",
"-//W3C//DTD HTML 3.2//",
"-//W3C//DTD HTML 3.2S Draft//",
"-//W3C//DTD HTML 4.0 Frameset//",
"-//W3C//DTD HTML 4.0 Transitional//",
"-//W3C//DTD HTML Experimental 19960712//",
"-//W3C//DTD HTML Experimental 970421//",
"-//W3C//DTD W3 HTML//",
"-//W3O//DTD W3 HTML 3.0//",
"-//WebTechs//DTD Mozilla HTML 2.0//",
"-//WebTechs//DTD Mozilla HTML//",
];
/**
 * Element names grouped by namespace URI, presumably the "special"
 * category of the HTML tree construction algorithm (TODO confirm against
 * the consumer of HTMLData::$special).
 *
 * Each value is one comma-separated string. execute() splits it on commas
 * and trims every name, so the line breaks and spaces inside the strings
 * are not significant. The generated structure is
 * [ namespace => [ element name => true ] ].
 */
private static $special = [
self::NS_HTML => 'address, applet, area, article, aside, base,
basefont, bgsound, blockquote, body, br, button, caption, center,
col, colgroup, dd, details, dir, div, dl, dt, embed, fieldset,
figcaption, figure, footer, form, frame, frameset, h1, h2, h3, h4,
h5, h6, head, header, hr, html, iframe, img, input, li, link,
listing, main, marquee, menu, menuitem, meta, nav, noembed,
noframes, noscript, object, ol, p, param, plaintext, pre, script,
section, select, source, style, summary, table, tbody, td, template,
textarea, tfoot, th, thead, title, tr, track, ul, wbr, xmp',
self::NS_MATHML => 'mi, mo, mn, ms, mtext, annotation-xml',
self::NS_SVG => 'foreignObject, desc, title',
];
// @codingStandardsIgnoreStart
/**
 * The NameStartChar production from XML 1.0, but with the colon excluded:
 * there are many ways to break namespace validation, and this production
 * is needed for local names.
 *
 * The string is parsed by getCharRanges(). Alternatives are separated by
 * "|" and each one is a quoted single character, a [x-y] range, a #xHEX
 * code point or a [#xHEX-#xHEX] range.
 */
private static $nameStartChar = '[A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]';
/**
 * The NameChar production from XML 1.0.
 *
 * "NameStartChar" is a nonterminal; execute() resolves it to
 * self::$nameStartChar when it builds the conversion table.
 */
private static $nameChar = 'NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]';
// @codingStandardsIgnoreEnd
/**
 * Build the body of a regex alternation from a list of strings.
 *
 * The first byte of every value is discarded, and the remainder is quoted
 * for use inside a pattern delimited by "~". Each alternative is preceded
 * by a newline and two tabs, which is pretty-printing only; the pattern
 * that embeds the result therefore has to ignore literal whitespace.
 *
 * @param string[] $array Values to convert into alternatives
 * @return string The alternatives joined by "|", without delimiters; an
 *   empty string when the list is empty
 */
private function makeRegexAlternation( $array ) {
	$alternatives = array_map( function ( $item ) {
		return "\n\t\t" . preg_quote( substr( $item, 1 ), '~' );
	}, $array );
	return implode( '|', $alternatives );
}
/**
 * Parse an XML-style character class production into code point ranges.
 *
 * The input is a list of alternatives separated by "|". Each alternative
 * is one of:
 *   - "c"             a single quoted byte
 *   - [a-z]           a range between two single bytes
 *   - #xHH            a single code point in upper-case hexadecimal
 *   - [#xHH-#xHH]     a range of code points in upper-case hexadecimal
 *   - Name            a nonterminal defined in $nonterminals
 *
 * Nonterminals are expanded recursively and may refer to other
 * nonterminals. While a nonterminal is being expanded it is removed from
 * the map passed further down, so a cyclic definition ends in the
 * "Invalid XML char case" exception instead of unbounded recursion.
 *
 * @param string $input The production to parse
 * @param array $nonterminals Map of nonterminal name to its production
 * @return array[] List of [ first, last ] inclusive code point pairs,
 *   sorted by first code point. Overlapping or adjacent ranges are not
 *   merged here.
 * @throws \Exception If an alternative has none of the forms above
 */
private function getCharRanges( $input, $nonterminals = [] ) {
	$ranges = [];
	foreach ( preg_split( '/\s*\|\s*/', $input ) as $case ) {
		if ( preg_match( '/^"(.)"$/', $case, $m ) ) {
			// Single ASCII character
			$ranges[] = [ ord( $m[1] ), ord( $m[1] ) ];
		} elseif ( preg_match( '/^\[(.)-(.)\]$/', $case, $m ) ) {
			// ASCII range
			$ranges[] = [ ord( $m[1] ), ord( $m[2] ) ];
		} elseif ( preg_match( '/^#x([0-9A-F]+)$/', $case, $m ) ) {
			// Single encoded character
			$codepoint = intval( $m[1], 16 );
			$ranges[] = [ $codepoint, $codepoint ];
		} elseif ( preg_match( '/^\[#x([0-9A-F]+)-#x([0-9A-F]+)\]$/', $case, $m ) ) {
			// Encoded range
			$ranges[] = [ intval( $m[1], 16 ), intval( $m[2], 16 ) ];
		} elseif ( isset( $nonterminals[$case] ) ) {
			// Expand the nonterminal with every other definition still
			// available, but without itself, so that nesting works and a
			// self-reference cannot recurse forever.
			$remaining = $nonterminals;
			unset( $remaining[$case] );
			$ranges = array_merge(
				$ranges,
				$this->getCharRanges( $nonterminals[$case], $remaining )
			);
		} else {
			throw new \Exception( "Invalid XML char case \"$case\"" );
		}
	}
	usort( $ranges, function ( $a, $b ) {
		return $a[0] - $b[0];
	} );
	return $ranges;
}
/**
 * Build a flat conversion table covering every code point that is NOT
 * matched by the given character class production.
 *
 * The ranges returned by getCharRanges() are merged where they touch or
 * overlap, and the gaps between them, from 0 up to 0x10ffff, are emitted.
 * Each gap contributes four consecutive integers to the result:
 * first code point, last code point, 0, 0xffffff. The meaning of the last
 * two values is defined by the consumer of the table, which is not part
 * of this class.
 *
 * A gap is only emitted when it is non-empty, so a production that starts
 * at code point 0 or ends at 0x10ffff does not produce an entry whose
 * first code point is greater than its last.
 *
 * @param string $input The production to invert
 * @param array $nonterminals Map of nonterminal name to its production
 * @return int[] Flat list whose length is a multiple of four
 * @throws \Exception If the production cannot be parsed
 */
private function makeConvTable( $input, $nonterminals = [] ) {
	$ranges = $this->getCharRanges( $input, $nonterminals );
	$rangeCount = count( $ranges );
	// First code point not yet known to be inside or before a range
	$nextUncovered = 0;
	$table = [];
	for ( $i = 0; $i < $rangeCount; $i++ ) {
		list( $start, $end ) = $ranges[$i];
		// Absorb every following range that touches or overlaps this one.
		// The ranges are sorted by start, so only the end can grow.
		while ( $i + 1 < $rangeCount && $ranges[$i + 1][0] <= $end + 1 ) {
			$i++;
			$end = max( $end, $ranges[$i][1] );
		}
		if ( $start > $nextUncovered ) {
			array_push( $table, $nextUncovered, $start - 1, 0, 0xffffff );
		}
		$nextUncovered = $end + 1;
	}
	// Everything after the last range, if anything is left
	if ( $nextUncovered <= 0x10ffff ) {
		array_push( $table, $nextUncovered, 0x10ffff, 0, 0xffffff );
	}
	return $table;
}
/**
 * Render a flat conversion table as PHP array source code.
 *
 * The values are grouped four to a line. Within a line they are separated
 * by ", "; the lines are separated by a comma, a newline and two tabs.
 *
 * @param int[] $table Flat table as returned by makeConvTable()
 * @return string PHP source text for a short-syntax array literal
 */
private function encodeConvTable( $table ) {
	$rows = [];
	foreach ( array_chunk( $table, 4 ) as $entry ) {
		$rows[] = implode( ', ', $entry );
	}
	return "[\n\t\t" . implode( ",\n\t\t", $rows ) . ' ]';
}
/**
 * Generate HTMLData.php in the directory of this file.
 *
 * Steps:
 *  - read entities.json from this directory and derive the named entity
 *    translation table and a regex alternation of all entity names,
 *    longest name first;
 *  - parse self::$legacyNumericEntityData into a map from number to
 *    UTF-8 string;
 *  - compile self::$quirkyPublicPrefixes into an anchored regex;
 *  - build the conversion tables for the XML name productions;
 *  - expand self::$special into a nested lookup array;
 *  - write everything into the HTMLData class template.
 *
 * @throws \Exception If entities.json cannot be read or does not decode
 *   to a JSON object, or if HTMLData.php cannot be written
 */
private function execute() {
	$entitiesJson = file_get_contents( __DIR__ . '/entities.json' );
	if ( $entitiesJson === false ) {
		throw new \Exception( "Please download entities.json from " .
			"https://www.w3.org/TR/2016/REC-html51-20161101/entities.json" );
	}
	$decodedEntities = json_decode( $entitiesJson );
	if ( !is_object( $decodedEntities ) ) {
		// Without this check a truncated or corrupt download would be cast
		// to an empty array and silently produce empty entity tables.
		throw new \Exception( "entities.json did not decode to a JSON object " .
			"(json_last_error: " . json_last_error() . ")" );
	}
	$entities = (array)$decodedEntities;
	$entityTranslations = [];
	foreach ( $entities as $entity => $info ) {
		// Keys in entities.json start with "&", which is not part of the name
		$entityTranslations[substr( $entity, 1 )] = $info->characters;
	}
	// Sort descending by length, so that in the alternation a longer name
	// is tried before any shorter name that is a prefix of it
	uksort( $entities, function ( $a, $b ) {
		if ( strlen( $a ) > strlen( $b ) ) {
			return -1;
		} elseif ( strlen( $a ) < strlen( $b ) ) {
			return 1;
		} else {
			return strcmp( $a, $b );
		}
	} );
	$entityRegex = $this->makeRegexAlternation( array_keys( $entities ) );
	$matches = [];
	preg_match_all( '/^0x([0-9A-F]+)\s+U\+([0-9A-F]+)/m',
		self::$legacyNumericEntityData, $matches, PREG_SET_ORDER );
	$legacyNumericEntities = [];
	foreach ( $matches as $match ) {
		$legacyNumericEntities[ intval( $match[1], 16 ) ] =
			\UtfNormal\Utils::codepointToUtf8( intval( $match[2], 16 ) );
	}
	// makeRegexAlternation() drops the first byte of every value, which is
	// right for the "&" of entity names but would remove the leading "+"
	// or "-" of the public identifiers, so the prefixes get their own
	// builder.
	$quirkyRegex = $this->makeQuirkyPrefixRegex( self::$quirkyPublicPrefixes );
	$nameStartCharConvTable = $this->makeConvTable( self::$nameStartChar );
	$nameCharConvTable = $this->makeConvTable( self::$nameChar,
		[ 'NameStartChar' => self::$nameStartChar ] );
	$encEntityRegex = var_export( $entityRegex, true );
	$encTranslations = var_export( $entityTranslations, true );
	$encLegacy = var_export( $legacyNumericEntities, true );
	$encQuirkyRegex = var_export( $quirkyRegex, true );
	$encNameStartCharConvTable = $this->encodeConvTable( $nameStartCharConvTable );
	$encNameCharConvTable = $this->encodeConvTable( $nameCharConvTable );
	$special = [];
	foreach ( self::$special as $ns => $str ) {
		foreach ( explode( ',', $str ) as $name ) {
			$special[$ns][trim( $name )] = true;
		}
	}
	$encSpecial = var_export( $special, true );
	// The opening tag is split in two so that this file does not itself
	// contain a second PHP open tag.
	$fileContents = '<' . <<<PHP
?php
/**
* This data file is machine generated, see GenerateDataFiles.php
*/
namespace RemexHtml;
class HTMLData {
const NS_HTML = 'http://www.w3.org/1999/xhtml';
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NS_SVG = 'http://www.w3.org/2000/svg';
const NS_XLINK = 'http://www.w3.org/1999/xlink';
const NS_XML = 'http://www.w3.org/XML/1998/namespace';
const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
static public \$special = $encSpecial;
static public \$namedEntityRegex = $encEntityRegex;
static public \$namedEntityTranslations = $encTranslations;
static public \$legacyNumericEntities = $encLegacy;
static public \$quirkyPrefixRegex = $encQuirkyRegex;
static public \$nameStartCharConvTable = $encNameStartCharConvTable;
static public \$nameCharConvTable = $encNameCharConvTable;
}
PHP;
	if ( file_put_contents( __DIR__ . '/HTMLData.php', $fileContents ) === false ) {
		throw new \Exception( "Unable to write " . __DIR__ . '/HTMLData.php' );
	}
}
/**
 * Compile a list of public identifier prefixes into the quirks mode regex.
 *
 * The pattern uses the "x" modifier so that it can be laid out one prefix
 * per line, which makes unescaped whitespace in the pattern insignificant.
 * The documented preg_quote() character list does not include the space,
 * so each prefix is split on spaces, the words are quoted separately and
 * then rejoined with an escaped space. This stays correct on a runtime
 * whose preg_quote() does escape whitespace, because no word contains any.
 *
 * @param string[] $prefixes Complete prefixes, including their first character
 * @return string Pattern with "~" delimiters and the modifiers x
 *   (extended), A (anchored at the start) and i (case-insensitive)
 */
private function makeQuirkyPrefixRegex( $prefixes ) {
	$alternatives = [];
	foreach ( $prefixes as $prefix ) {
		$words = array_map( function ( $word ) {
			return preg_quote( $word, '~' );
		}, explode( ' ', $prefix ) );
		$alternatives[] = "\n\t\t" . implode( '\\ ', $words );
	}
	return '~' . implode( '|', $alternatives ) . '~xAi';
}
}