%PDF- %PDF-
Direktori : /usr/share/nodejs/saxes/src/ |
Current File : //usr/share/nodejs/saxes/src/saxes.ts |
import * as ed5 from "xmlchars/xml/1.0/ed5"; import * as ed2 from "xmlchars/xml/1.1/ed2"; import * as NSed3 from "xmlchars/xmlns/1.0/ed3"; import isS = ed5.isS; import isChar10 = ed5.isChar; import isNameStartChar = ed5.isNameStartChar; import isNameChar = ed5.isNameChar; import S_LIST = ed5.S_LIST; import NAME_RE = ed5.NAME_RE; import isChar11 = ed2.isChar; import isNCNameStartChar = NSed3.isNCNameStartChar; import isNCNameChar = NSed3.isNCNameChar; import NC_NAME_RE = NSed3.NC_NAME_RE; const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"; const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"; const rootNS: Record<string, string> = { // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-assignment __proto__: null as any, xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE, }; const XML_ENTITIES: Record<string, string> = { // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-assignment __proto__: null as any, amp: "&", gt: ">", lt: "<", quot: "\"", apos: "'", }; // EOC: end-of-chunk const EOC = -1; const NL_LIKE = -2; const S_BEGIN = 0; // Initial state. const S_BEGIN_WHITESPACE = 1; // leading whitespace const S_DOCTYPE = 2; // <!DOCTYPE const S_DOCTYPE_QUOTE = 3; // <!DOCTYPE "//blah const S_DTD = 4; // <!DOCTYPE "//blah" [ ... const S_DTD_QUOTED = 5; // <!DOCTYPE "//blah" [ "foo const S_DTD_OPEN_WAKA = 6; const S_DTD_OPEN_WAKA_BANG = 7; const S_DTD_COMMENT = 8; // <!-- const S_DTD_COMMENT_ENDING = 9; // <!-- blah - const S_DTD_COMMENT_ENDED = 10; // <!-- blah -- const S_DTD_PI = 11; // <? const S_DTD_PI_ENDING = 12; // <?hi "there" ? const S_TEXT = 13; // general stuff const S_ENTITY = 14; // & and such const S_OPEN_WAKA = 15; // < const S_OPEN_WAKA_BANG = 16; // <!... const S_COMMENT = 17; // <!-- const S_COMMENT_ENDING = 18; // <!-- blah - const S_COMMENT_ENDED = 19; // <!-- blah -- const S_CDATA = 20; // <![CDATA[ something const S_CDATA_ENDING = 21; // ] const S_CDATA_ENDING_2 = 22; // ]] const S_PI_FIRST_CHAR = 23; // <?hi, first char const S_PI_REST = 24; // <?hi, rest of the name const S_PI_BODY = 25; // <?hi there const S_PI_ENDING = 26; // <?hi "there" ? const S_XML_DECL_NAME_START = 27; // <?xml const S_XML_DECL_NAME = 28; // <?xml foo const S_XML_DECL_EQ = 29; // <?xml foo= const S_XML_DECL_VALUE_START = 30; // <?xml foo= const S_XML_DECL_VALUE = 31; // <?xml foo="bar" const S_XML_DECL_SEPARATOR = 32; // <?xml foo="bar" const S_XML_DECL_ENDING = 33; // <?xml ... ? const S_OPEN_TAG = 34; // <strong const S_OPEN_TAG_SLASH = 35; // <strong / const S_ATTRIB = 36; // <a const S_ATTRIB_NAME = 37; // <a foo const S_ATTRIB_NAME_SAW_WHITE = 38; // <a foo _ const S_ATTRIB_VALUE = 39; // <a foo= const S_ATTRIB_VALUE_QUOTED = 40; // <a foo="bar const S_ATTRIB_VALUE_CLOSED = 41; // <a foo="bar" const S_ATTRIB_VALUE_UNQUOTED = 42; // <a foo=bar const S_CLOSE_TAG = 43; // </a const S_CLOSE_TAG_SAW_WHITE = 44; // </a > const TAB = 9; const NL = 0xA; const CR = 0xD; const SPACE = 0x20; const BANG = 0x21; const DQUOTE = 0x22; const AMP = 0x26; const SQUOTE = 0x27; const MINUS = 0x2D; const FORWARD_SLASH = 0x2F; const SEMICOLON = 0x3B; const LESS = 0x3C; const EQUAL = 0x3D; const GREATER = 0x3E; const QUESTION = 0x3F; const OPEN_BRACKET = 0x5B; const CLOSE_BRACKET = 0x5D; const NEL = 0x85; const LS = 0x2028; // Line Separator const isQuote = (c: number): boolean => c === DQUOTE || c === SQUOTE; const QUOTES = [DQUOTE, SQUOTE]; const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER]; const DTD_TERMINATOR = [...QUOTES, LESS, CLOSE_BRACKET]; const XML_DECL_NAME_TERMINATOR = [EQUAL, QUESTION, ...S_LIST]; const ATTRIB_VALUE_UNQUOTED_TERMINATOR = [...S_LIST, GREATER, AMP, LESS]; function nsPairCheck(parser: SaxesParser<SaxesOptions>, prefix: string, uri: string): void { switch (prefix) { case "xml": if (uri !== XML_NAMESPACE) { parser.fail(`xml prefix must be bound to ${XML_NAMESPACE}.`); } break; case "xmlns": if (uri !== XMLNS_NAMESPACE) { parser.fail(`xmlns prefix must be bound to ${XMLNS_NAMESPACE}.`); } break; default: } switch (uri) { case XMLNS_NAMESPACE: parser.fail(prefix === "" ? `the default namespace may not be set to ${uri}.` : `may not assign a prefix (even "xmlns") to the URI \ ${XMLNS_NAMESPACE}.`); break; case XML_NAMESPACE: switch (prefix) { case "xml": // Assinging the XML namespace to "xml" is fine. break; case "": parser.fail(`the default namespace may not be set to ${uri}.`); break; default: parser.fail("may not assign the xml namespace to another prefix."); } break; default: } } function nsMappingCheck(parser: SaxesParser<SaxesOptions>, mapping: Record<string, string>): void { for (const local of Object.keys(mapping)) { nsPairCheck(parser, local, mapping[local]); } } const isNCName = (name: string): boolean => NC_NAME_RE.test(name); const isName = (name: string): boolean => NAME_RE.test(name); const FORBIDDEN_START = 0; const FORBIDDEN_BRACKET = 1; const FORBIDDEN_BRACKET_BRACKET = 2; /** * The list of supported events. */ export const EVENTS = [ "xmldecl", "text", "processinginstruction", "doctype", "comment", "opentagstart", "attribute", "opentag", "closetag", "cdata", "error", "end", "ready", ] as const; const EVENT_NAME_TO_HANDLER_NAME: Record<EventName, string> = { xmldecl: "xmldeclHandler", text: "textHandler", processinginstruction: "piHandler", doctype: "doctypeHandler", comment: "commentHandler", opentagstart: "openTagStartHandler", attribute: "attributeHandler", opentag: "openTagHandler", closetag: "closeTagHandler", cdata: "cdataHandler", error: "errorHandler", end: "endHandler", ready: "readyHandler", }; /** * Event handler for the * * @param text The text data encountered by the parser. * */ export type XMLDeclHandler = (decl: XMLDecl) => void; /** * Event handler for text data. * * @param text The text data encountered by the parser. * */ export type TextHandler = (text: string) => void; /** * Event handler for processing instructions. * * @param data The target and body of the processing instruction. */ export type PIHandler = (data: { target: string; body: string }) => void; /** * Event handler for doctype. * * @param doctype The doctype contents. */ export type DoctypeHandler = (doctype: string) => void; /** * Event handler for comments. * * @param comment The comment contents. */ export type CommentHandler = (comment: string) => void; /** * Event handler for the start of an open tag. This is called as soon as we * have a tag name. * * @param tag The tag. */ export type OpenTagStartHandler<O> = (tag: StartTagForOptions<O>) => void; export type AttributeEventForOptions<O extends SaxesOptions> = O extends { xmlns: true } ? SaxesAttributeNSIncomplete : O extends { xmlns?: false | undefined } ? SaxesAttributePlain : SaxesAttribute; /** * Event handler for attributes. */ export type AttributeHandler<O> = (attribute: AttributeEventForOptions<O>) => void; /** * Event handler for an open tag. This is called when the open tag is * complete. (We've encountered the ">" that ends the open tag.) * * @param tag The tag. */ export type OpenTagHandler<O> = (tag: TagForOptions<O>) => void; /** * Event handler for a close tag. Note that for self-closing tags, this is * called right after ``opentag``. * * @param tag The tag. */ export type CloseTagHandler<O> = (tag: TagForOptions<O>) => void; /** * Event handler for a CDATA section. This is called when ending the * CDATA section. * * @param cdata The contents of the CDATA section. */ export type CDataHandler = (cdata: string) => void; /** * Event handler for the stream end. This is called when the stream has been * closed with ``close`` or by passing ``null`` to ``write``. */ export type EndHandler = () => void; /** * Event handler indicating parser readiness . This is called when the parser * is ready to parse a new document. */ export type ReadyHandler = () => void; /** * Event handler indicating an error. * * @param err The error that occurred. */ export type ErrorHandler = (err: Error) => void; export type EventName = (typeof EVENTS)[number]; export type EventNameToHandler<O, N extends EventName> = { "xmldecl": XMLDeclHandler; "text": TextHandler; "processinginstruction": PIHandler; "doctype": DoctypeHandler; "comment": CommentHandler; "opentagstart": OpenTagStartHandler<O>; "attribute": AttributeHandler<O>; "opentag": OpenTagHandler<O>; "closetag": CloseTagHandler<O>; "cdata": CDataHandler; "error": ErrorHandler; "end": EndHandler; "ready": ReadyHandler; }[N]; /** * This interface defines the structure of attributes when the parser is * processing namespaces (created with ``xmlns: true``). */ export interface SaxesAttributeNS { /** * The attribute's name. This is the combination of prefix and local name. * For instance ``a:b="c"`` would have ``a:b`` for name. */ name: string; /** * The attribute's prefix. For instance ``a:b="c"`` would have ``"a"`` for * ``prefix``. */ prefix: string; /** * The attribute's local name. For instance ``a:b="c"`` would have ``"b"`` for * ``local``. */ local: string; /** The namespace URI of this attribute. */ uri: string; /** The attribute's value. */ value: string; } /** * This is an attribute, as recorded by a parser which parses namespaces but * prior to the URI being resolvable. This is what is passed to the attribute * event handler. */ export type SaxesAttributeNSIncomplete = Exclude<SaxesAttributeNS, "uri">; /** * This interface defines the structure of attributes when the parser is * NOT processing namespaces (created with ``xmlns: false``). */ export interface SaxesAttributePlain { /** * The attribute's name. */ name: string; /** The attribute's value. */ value: string; } /** * A saxes attribute, with or without namespace information. */ export type SaxesAttribute = SaxesAttributeNS | SaxesAttributePlain; /** * This are the fields that MAY be present on a complete tag. */ export interface SaxesTag { /** * The tag's name. This is the combination of prefix and global name. For * instance ``<a:b>`` would have ``"a:b"`` for ``name``. */ name: string; /** * A map of attribute name to attributes. If namespaces are tracked, the * values in the map are attribute objects. Otherwise, they are strings. */ attributes: Record<string, SaxesAttributeNS> | Record<string, string>; /** * The namespace bindings in effect. */ ns?: Record<string, string>; /** * The tag's prefix. For instance ``<a:b>`` would have ``"a"`` for * ``prefix``. Undefined if we do not track namespaces. */ prefix?: string; /** * The tag's local name. For instance ``<a:b>`` would * have ``"b"`` for ``local``. Undefined if we do not track namespaces. */ local?: string; /** * The namespace URI of this tag. Undefined if we do not track namespaces. */ uri?: string; /** Whether the tag is self-closing (e.g. ``<foo/>``). */ isSelfClosing: boolean; } /** * This type defines the fields that are present on a tag object when * ``onopentagstart`` is called. This interface is namespace-agnostic. */ export type SaxesStartTag = Pick<SaxesTag, "name" | "attributes" | "ns">; /** * This type defines the fields that are present on a tag object when * ``onopentagstart`` is called on a parser that does not processes namespaces. */ export type SaxesStartTagPlain = Pick<SaxesStartTag, "name" | "attributes">; /** * This type defines the fields that are present on a tag object when * ``onopentagstart`` is called on a parser that does process namespaces. */ export type SaxesStartTagNS = Required<SaxesStartTag>; /** * This are the fields that are present on a complete tag produced by a parser * that does process namespaces. */ export type SaxesTagNS = Required<SaxesTag> & { attributes: Record<string, SaxesAttributeNS>; }; /** * This are the fields that are present on a complete tag produced by a parser * that does not process namespaces. */ export type SaxesTagPlain = Pick<SaxesTag, "name" | "attributes" | "isSelfClosing"> & { attributes: Record<string, string>; }; // This is an internal type used for holding tags while they are being built. type SaxesTagIncomplete = Omit<SaxesTag, "isSelfClosing"> & Partial<Pick<SaxesTag, "isSelfClosing">>; /** * An XML declaration. */ export interface XMLDecl { /** The version specified by the XML declaration. */ version?: string; /** The encoding specified by the XML declaration. */ encoding?: string; /** The value of the standalone parameter */ standalone?: string; } /** * A callback for resolving name prefixes. * * @param prefix The prefix to check. * * @returns The URI corresponding to the prefix, if any. */ export type ResolvePrefix = (prefix: string) => string | undefined; export interface CommonOptions { /** Whether to accept XML fragments. Unset means ``false``. */ fragment?: boolean; /** Whether to track positions. Unset means ``true``. */ position?: boolean; /** * A file name to use for error reporting. "File name" is a loose concept. You * could use a URL to some resource, or any descriptive name you like. */ fileName?: string; } export interface NSOptions { /** Whether to track namespaces. Unset means ``false``. */ xmlns?: boolean; /** * A plain object whose key, value pairs define namespaces known before * parsing the XML file. It is not legal to pass bindings for the namespaces * ``"xml"`` or ``"xmlns"``. */ additionalNamespaces?: Record<string, string>; /** * A function that will be used if the parser cannot resolve a namespace * prefix on its own. */ resolvePrefix?: ResolvePrefix; } export interface NSOptionsWithoutNamespaces extends NSOptions { xmlns?: false; // It makes no sense to set these if namespaces are not used. additionalNamespaces?: undefined; resolvePrefix?: undefined; } export interface NSOptionsWithNamespaces extends NSOptions { xmlns: true; // The other options are still optional. } export interface XMLVersionOptions { /** * The default XML version to use. If unspecified, and there is no XML * encoding declaration, the default version is "1.0". */ defaultXMLVersion?: "1.0" | "1.1"; /** * A flag indicating whether to force the XML version used for parsing to the * value of ``defaultXMLVersion``. When this flag is ``true``, * ``defaultXMLVersion`` must be specified. If unspecified, the default value * of this flag is ``false``. */ forceXMLVersion?: boolean; } export interface NoForcedXMLVersion extends XMLVersionOptions { forceXMLVersion?: false; // defaultXMLVersion stays the same. } export interface ForcedXMLVersion extends XMLVersionOptions { forceXMLVersion: true; // defaultXMLVersion becomes mandatory. defaultXMLVersion: Exclude<XMLVersionOptions["defaultXMLVersion"], undefined>; } /** * The entire set of options supported by saxes. */ export type SaxesOptions = CommonOptions & NSOptions & XMLVersionOptions; export type TagForOptions<O extends SaxesOptions> = O extends { xmlns: true } ? SaxesTagNS : O extends { xmlns?: false | undefined } ? SaxesTagPlain : SaxesTag; export type StartTagForOptions<O extends SaxesOptions> = O extends { xmlns: true } ? SaxesStartTagNS : O extends { xmlns?: false | undefined } ? SaxesStartTagPlain : SaxesStartTag; // eslint-disable-next-line @typescript-eslint/ban-types export class SaxesParser<O extends SaxesOptions = {}> { private readonly fragmentOpt: boolean; private readonly xmlnsOpt: boolean; private readonly trackPosition: boolean; private readonly fileName?: string; private readonly nameStartCheck: (c: number) => boolean; private readonly nameCheck: (c: number) => boolean; private readonly isName: (name: string) => boolean; private readonly ns!: Record<string, string>; private openWakaBang!: string; private text!: string; private name!: string; private piTarget!: string; private entity!: string; private q!: null | number; private tags!: SaxesTagIncomplete[]; private tag!: SaxesTagIncomplete | null; private topNS!: Record<string, string> | null; private chunk!: string; private chunkPosition!: number; private i!: number; // // We use prevI to allow "ungetting" the previously read code point. Note // however, that it is not safe to unget everything and anything. In // particular ungetting EOL characters will screw positioning up. // // Practically, you must not unget a code which has any side effect beyond // updating ``this.i`` and ``this.prevI``. Only EOL codes have such side // effects. // private prevI!: number; private carriedFromPrevious?: string; private forbiddenState!: number; private attribList!: (SaxesAttributeNSIncomplete | SaxesAttributePlain)[]; private state!: number; private reportedTextBeforeRoot!: boolean; private reportedTextAfterRoot!: boolean; private closedRoot!: boolean; private sawRoot!: boolean; private xmlDeclPossible!: boolean; private xmlDeclExpects!: string[]; private entityReturnState?: number; private processAttribs!: (this: this) => void; private positionAtNewLine!: number; private doctype!: boolean; private getCode!: () => number; private isChar!: (c: number) => boolean; private pushAttrib!: (name: string, value: string) => void; private _closed!: boolean; private currentXMLVersion!: string; private readonly stateTable: ((this: SaxesParser<O>) => void)[]; private xmldeclHandler?: XMLDeclHandler; private textHandler?: TextHandler; private piHandler?: PIHandler; private doctypeHandler?: DoctypeHandler; private commentHandler?: CommentHandler; private openTagStartHandler?: OpenTagStartHandler<O>; private openTagHandler?: OpenTagHandler<O>; private closeTagHandler?: CloseTagHandler<O>; private cdataHandler?: CDataHandler; private errorHandler?: ErrorHandler; private endHandler?: EndHandler; private readyHandler?: ReadyHandler; private attributeHandler?: AttributeHandler<O>; /** * Indicates whether or not the parser is closed. If ``true``, wait for * the ``ready`` event to write again. */ get closed(): boolean { return this._closed; } readonly opt: SaxesOptions; /** * The XML declaration for this document. */ xmlDecl!: XMLDecl; /** * The line number of the next character to be read by the parser. This field * is one-based. (The first line is numbered 1.) */ line!: number; /** * The column number of the next character to be read by the parser. * * This field is zero-based. (The first column is 0.) * * This field counts columns by *Unicode character*. Note that this *can* * be different from the index of the character in a JavaScript string due * to how JavaScript handles astral plane characters. * * See [[columnIndex]] for a number that corresponds to the JavaScript index. */ column!: number; /** * A map of entity name to expansion. */ ENTITIES!: Record<string, string>; /** * @param opt The parser options. */ constructor(opt?: O) { this.opt = opt ?? {}; this.fragmentOpt = !!(this.opt.fragment as boolean); const xmlnsOpt = this.xmlnsOpt = !!(this.opt.xmlns as boolean); this.trackPosition = this.opt.position !== false; this.fileName = this.opt.fileName; if (xmlnsOpt) { // This is the function we use to perform name checks on PIs and entities. // When namespaces are used, colons are not allowed in PI target names or // entity names. So the check depends on whether namespaces are used. See: // // https://www.w3.org/XML/xml-names-19990114-errata.html // NE08 // this.nameStartCheck = isNCNameStartChar; this.nameCheck = isNCNameChar; this.isName = isNCName; // eslint-disable-next-line @typescript-eslint/unbound-method this.processAttribs = this.processAttribsNS; // eslint-disable-next-line @typescript-eslint/unbound-method this.pushAttrib = this.pushAttribNS; // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-assignment this.ns = { __proto__: null as any, ...rootNS }; const additional = this.opt.additionalNamespaces; if (additional != null) { nsMappingCheck(this, additional); Object.assign(this.ns, additional); } } else { this.nameStartCheck = isNameStartChar; this.nameCheck = isNameChar; this.isName = isName; // eslint-disable-next-line @typescript-eslint/unbound-method this.processAttribs = this.processAttribsPlain; // eslint-disable-next-line @typescript-eslint/unbound-method this.pushAttrib = this.pushAttribPlain; } // // The order of the members in this table needs to correspond to the state // numbers given to the states that correspond to the methods being recorded // here. // this.stateTable = [ /* eslint-disable @typescript-eslint/unbound-method */ this.sBegin, this.sBeginWhitespace, this.sDoctype, this.sDoctypeQuote, this.sDTD, this.sDTDQuoted, this.sDTDOpenWaka, this.sDTDOpenWakaBang, this.sDTDComment, this.sDTDCommentEnding, this.sDTDCommentEnded, this.sDTDPI, this.sDTDPIEnding, this.sText, this.sEntity, this.sOpenWaka, this.sOpenWakaBang, this.sComment, this.sCommentEnding, this.sCommentEnded, this.sCData, this.sCDataEnding, this.sCDataEnding2, this.sPIFirstChar, this.sPIRest, this.sPIBody, this.sPIEnding, this.sXMLDeclNameStart, this.sXMLDeclName, this.sXMLDeclEq, this.sXMLDeclValueStart, this.sXMLDeclValue, this.sXMLDeclSeparator, this.sXMLDeclEnding, this.sOpenTag, this.sOpenTagSlash, this.sAttrib, this.sAttribName, this.sAttribNameSawWhite, this.sAttribValue, this.sAttribValueQuoted, this.sAttribValueClosed, this.sAttribValueUnquoted, this.sCloseTag, this.sCloseTagSawWhite, /* eslint-enable @typescript-eslint/unbound-method */ ]; this._init(); } _init(): void { this.openWakaBang = ""; this.text = ""; this.name = ""; this.piTarget = ""; this.entity = ""; this.q = null; this.tags = []; this.tag = null; this.topNS = null; this.chunk = ""; this.chunkPosition = 0; this.i = 0; this.prevI = 0; this.carriedFromPrevious = undefined; this.forbiddenState = FORBIDDEN_START; this.attribList = []; // The logic is organized so as to minimize the need to check // this.opt.fragment while parsing. const { fragmentOpt } = this; this.state = fragmentOpt ? S_TEXT : S_BEGIN; // We want these to be all true if we are dealing with a fragment. this.reportedTextBeforeRoot = this.reportedTextAfterRoot = this.closedRoot = this.sawRoot = fragmentOpt; // An XML declaration is intially possible only when parsing whole // documents. this.xmlDeclPossible = !fragmentOpt; this.xmlDeclExpects = ["version"]; this.entityReturnState = undefined; let { defaultXMLVersion } = this.opt; if (defaultXMLVersion === undefined) { if (this.opt.forceXMLVersion === true) { throw new Error("forceXMLVersion set but defaultXMLVersion is not set"); } defaultXMLVersion = "1.0"; } this.setXMLVersion(defaultXMLVersion); this.positionAtNewLine = 0; this.doctype = false; this._closed = false; this.xmlDecl = { version: undefined, encoding: undefined, standalone: undefined, }; this.line = 1; this.column = 0; this.ENTITIES = Object.create(XML_ENTITIES) as Record<string, string>; this.readyHandler?.(); } /** * The stream position the parser is currently looking at. This field is * zero-based. * * This field is not based on counting Unicode characters but is to be * interpreted as a plain index into a JavaScript string. */ get position(): number { return this.chunkPosition + this.i; } /** * The column number of the next character to be read by the parser. * * This field is zero-based. (The first column in a line is 0.) * * This field reports the index at which the next character would be in the * line if the line were represented as a JavaScript string. Note that this * *can* be different to a count based on the number of *Unicode characters* * due to how JavaScript handles astral plane characters. * * See [[column]] for a number that corresponds to a count of Unicode * characters. */ get columnIndex(): number { return this.position - this.positionAtNewLine; } /** * Set an event listener on an event. The parser supports one handler per * event type. If you try to set an event handler over an existing handler, * the old handler is silently overwritten. * * @param name The event to listen to. * * @param handler The handler to set. */ on<N extends EventName>(name: N, handler: EventNameToHandler<O, N>): void { // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access (this as any)[EVENT_NAME_TO_HANDLER_NAME[name]] = handler; } /** * Unset an event handler. * * @parma name The event to stop listening to. */ off(name: EventName): void { // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access (this as any)[EVENT_NAME_TO_HANDLER_NAME[name]] = undefined; } /** * Make an error object. The error object will have a message that contains * the ``fileName`` option passed at the creation of the parser. If position * tracking was turned on, it will also have line and column number * information. * * @param message The message describing the error to report. * * @returns An error object with a properly formatted message. */ makeError(message: string): Error { let msg = this.fileName ?? ""; if (this.trackPosition) { if (msg.length > 0) { msg += ":"; } msg += `${this.line}:${this.column}`; } if (msg.length > 0) { msg += ": "; } return new Error(msg + message); } /** * Report a parsing error. This method is made public so that client code may * check for issues that are outside the scope of this project and can report * errors. * * @param message The error to report. * * @returns this */ fail(message: string): this { const err = this.makeError(message); const handler = this.errorHandler; if (handler === undefined) { throw err; } else { handler(err); } return this; } /** * Write a XML data to the parser. * * @param chunk The XML data to write. * * @returns this */ // We do need object for the type here. Yes, it often causes problems // but not in this case. write(chunk: string | object | null): this { if (this.closed) { return this.fail("cannot write after close; assign an onready handler."); } let end = false; if (chunk === null) { // We cannot return immediately because carriedFromPrevious may need // processing. end = true; chunk = ""; } else if (typeof chunk === "object") { chunk = chunk.toString(); } // We checked if performing a pre-decomposition of the string into an array // of single complete characters (``Array.from(chunk)``) would be faster // than the current repeated calls to ``charCodeAt``. As of August 2018, it // isn't. (There may be Node-specific code that would perform faster than // ``Array.from`` but don't want to be dependent on Node.) if (this.carriedFromPrevious !== undefined) { // The previous chunk had char we must carry over. chunk = `${this.carriedFromPrevious}${chunk}`; this.carriedFromPrevious = undefined; } let limit = chunk.length; const lastCode = chunk.charCodeAt(limit - 1); if (!end && // A trailing CR or surrogate must be carried over to the next // chunk. (lastCode === CR || (lastCode >= 0xD800 && lastCode <= 0xDBFF))) { // The chunk ends with a character that must be carried over. We cannot // know how to handle it until we get the next chunk or the end of the // stream. So save it for later. this.carriedFromPrevious = chunk[limit - 1]; limit--; chunk = chunk.slice(0, limit); } const { stateTable } = this; this.chunk = chunk; this.i = 0; while (this.i < limit) { // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-argument stateTable[this.state].call(this as any); } this.chunkPosition += limit; return end ? this.end() : this; } /** * Close the current stream. Perform final well-formedness checks and reset * the parser tstate. * * @returns this */ close(): this { return this.write(null); } /** * Get a single code point out of the current chunk. This updates the current * position if we do position tracking. * * This is the algorithm to use for XML 1.0. * * @returns The character read. */ private getCode10(): number { const { chunk, i } = this; this.prevI = i; // Yes, we do this instead of doing this.i++. Doing it this way, we do not // read this.i again, which is a bit faster. this.i = i + 1; if (i >= chunk.length) { return EOC; } // Using charCodeAt and handling the surrogates ourselves is faster // than using codePointAt. const code = chunk.charCodeAt(i); this.column++; if (code < 0xD800) { if (code >= SPACE || code === TAB) { return code; } switch (code) { case NL: this.line++; this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: // We may get NaN if we read past the end of the chunk, which is fine. if (chunk.charCodeAt(i + 1) === NL) { // A \r\n sequence is converted to \n so we have to skip over the // next character. We already know it has a size of 1 so ++ is fine // here. this.i = i + 2; } // Otherwise, a \r is just converted to \n, so we don't have to skip // ahead. // In either case, \r becomes \n. this.line++; this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: // If we get here, then code < SPACE and it is not NL CR or TAB. this.fail("disallowed character."); return code; } } if (code > 0xDBFF) { // This is a specialized version of isChar10 that takes into account // that in this context code > 0xDBFF and code <= 0xFFFF. So it does not // test cases that don't need testing. if (!(code >= 0xE000 && code <= 0xFFFD)) { this.fail("disallowed character."); } return code; } const final = 0x10000 + ((code - 0xD800) * 0x400) + (chunk.charCodeAt(i + 1) - 0xDC00); this.i = i + 2; // This is a specialized version of isChar10 that takes into account that in // this context necessarily final >= 0x10000. if (final > 0x10FFFF) { this.fail("disallowed character."); } return final; } /** * Get a single code point out of the current chunk. This updates the current * position if we do position tracking. * * This is the algorithm to use for XML 1.1. * * @returns {number} The character read. */ private getCode11(): number { const { chunk, i } = this; this.prevI = i; // Yes, we do this instead of doing this.i++. Doing it this way, we do not // read this.i again, which is a bit faster. this.i = i + 1; if (i >= chunk.length) { return EOC; } // Using charCodeAt and handling the surrogates ourselves is faster // than using codePointAt. const code = chunk.charCodeAt(i); this.column++; if (code < 0xD800) { if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) || code === TAB) { return code; } switch (code) { case NL: // 0xA this.line++; this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: { // 0xD // We may get NaN if we read past the end of the chunk, which is // fine. const next = chunk.charCodeAt(i + 1); if (next === NL || next === NEL) { // A CR NL or CR NEL sequence is converted to NL so we have to skip // over the next character. We already know it has a size of 1. this.i = i + 2; } // Otherwise, a CR is just converted to NL, no skip. } /* yes, fall through */ case NEL: // 0x85 case LS: // Ox2028 this.line++; this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: this.fail("disallowed character."); return code; } } if (code > 0xDBFF) { // This is a specialized version of isCharAndNotRestricted that takes into // account that in this context code > 0xDBFF and code <= 0xFFFF. So it // does not test cases that don't need testing. if (!(code >= 0xE000 && code <= 0xFFFD)) { this.fail("disallowed character."); } return code; } const final = 0x10000 + ((code - 0xD800) * 0x400) + (chunk.charCodeAt(i + 1) - 0xDC00); this.i = i + 2; // This is a specialized version of isCharAndNotRestricted that takes into // account that in this context necessarily final >= 0x10000. if (final > 0x10FFFF) { this.fail("disallowed character."); } return final; } /** * Like ``getCode`` but with the return value normalized so that ``NL`` is * returned for ``NL_LIKE``. */ private getCodeNorm(): number { const c = this.getCode(); return c === NL_LIKE ? NL : c; } private unget(): void { this.i = this.prevI; this.column--; } /** * Capture characters into a buffer until encountering one of a set of * characters. * * @param chars An array of codepoints. Encountering a character in the array * ends the capture. (``chars`` may safely contain ``NL``.) * * @return The character code that made the capture end, or ``EOC`` if we hit * the end of the chunk. The return value cannot be NL_LIKE: NL is returned * instead. */ private captureTo(chars: number[]): number { let { i: start } = this; const { chunk } = this; // eslint-disable-next-line no-constant-condition while (true) { const c = this.getCode(); const isNLLike = c === NL_LIKE; const final = isNLLike ? NL : c; if (final === EOC || chars.includes(final)) { this.text += chunk.slice(start, this.prevI); return final; } if (isNLLike) { this.text += `${chunk.slice(start, this.prevI)}\n`; start = this.i; } } } /** * Capture characters into a buffer until encountering a character. * * @param char The codepoint that ends the capture. **NOTE ``char`` MAY NOT * CONTAIN ``NL``.** Passing ``NL`` will result in buggy behavior. * * @return ``true`` if we ran into the character. Otherwise, we ran into the * end of the current chunk. */ private captureToChar(char: number): boolean { let { i: start } = this; const { chunk } = this; // eslint-disable-next-line no-constant-condition while (true) { let c = this.getCode(); switch (c) { case NL_LIKE: this.text += `${chunk.slice(start, this.prevI)}\n`; start = this.i; c = NL; break; case EOC: this.text += chunk.slice(start); return false; default: } if (c === char) { this.text += chunk.slice(start, this.prevI); return true; } } } /** * Capture characters that satisfy ``isNameChar`` into the ``name`` field of * this parser. * * @return The character code that made the test fail, or ``EOC`` if we hit * the end of the chunk. The return value cannot be NL_LIKE: NL is returned * instead. */ private captureNameChars(): number { const { chunk, i: start } = this; // eslint-disable-next-line no-constant-condition while (true) { const c = this.getCode(); if (c === EOC) { this.name += chunk.slice(start); return EOC; } // NL is not a name char so we don't have to test specifically for it. if (!isNameChar(c)) { this.name += chunk.slice(start, this.prevI); return c === NL_LIKE ? NL : c; } } } /** * Skip white spaces. * * @return The character that ended the skip, or ``EOC`` if we hit * the end of the chunk. The return value cannot be NL_LIKE: NL is returned * instead. */ private skipSpaces(): number { // eslint-disable-next-line no-constant-condition while (true) { const c = this.getCodeNorm(); if (c === EOC || !isS(c)) { return c; } } } private setXMLVersion(version: string): void { this.currentXMLVersion = version; /* eslint-disable @typescript-eslint/unbound-method */ if (version === "1.0") { this.isChar = isChar10; this.getCode = this.getCode10; } else { this.isChar = isChar11; this.getCode = this.getCode11; } /* eslint-enable @typescript-eslint/unbound-method */ } // STATE ENGINE METHODS // This needs to be a state separate from S_BEGIN_WHITESPACE because we want // to be sure never to come back to this state later. private sBegin(): void { // We are essentially peeking at the first character of the chunk. Since // S_BEGIN can be in effect only when we start working on the first chunk, // the index at which we must look is necessarily 0. Note also that the // following test does not depend on decoding surrogates. // If the initial character is 0xFEFF, ignore it. if (this.chunk.charCodeAt(0) === 0xFEFF) { this.i++; this.column++; } this.state = S_BEGIN_WHITESPACE; } private sBeginWhitespace(): void { // We need to know whether we've encountered spaces or not because as soon // as we run into a space, an XML declaration is no longer possible. Rather // than slow down skipSpaces even in places where we don't care whether it // skipped anything or not, we check whether prevI is equal to the value of // i from before we skip spaces. const iBefore = this.i; const c = this.skipSpaces(); if (this.prevI !== iBefore) { this.xmlDeclPossible = false; } switch (c) { case LESS: this.state = S_OPEN_WAKA; // We could naively call closeText but in this state, it is not normal // to have text be filled with any data. if (this.text.length !== 0) { throw new Error("no-empty text at start"); } break; case EOC: break; default: this.unget(); this.state = S_TEXT; this.xmlDeclPossible = false; } } private sDoctype(): void { const c = this.captureTo(DOCTYPE_TERMINATOR); switch (c) { case GREATER: { this.doctypeHandler?.(this.text); this.text = ""; this.state = S_TEXT; this.doctype = true; // just remember that we saw it. break; } case EOC: break; default: this.text += String.fromCodePoint(c); if (c === OPEN_BRACKET) { this.state = S_DTD; } else if (isQuote(c)) { this.state = S_DOCTYPE_QUOTE; this.q = c; } } } private sDoctypeQuote(): void { const q = this.q!; if (this.captureToChar(q)) { this.text += String.fromCodePoint(q); this.q = null; this.state = S_DOCTYPE; } } private sDTD(): void { const c = this.captureTo(DTD_TERMINATOR); if (c === EOC) { return; } this.text += String.fromCodePoint(c); if (c === CLOSE_BRACKET) { this.state = S_DOCTYPE; } else if (c === LESS) { this.state = S_DTD_OPEN_WAKA; } else if (isQuote(c)) { this.state = S_DTD_QUOTED; this.q = c; } } private sDTDQuoted(): void { const q = this.q!; if (this.captureToChar(q)) { this.text += String.fromCodePoint(q); this.state = S_DTD; this.q = null; } } private sDTDOpenWaka(): void { const c = this.getCodeNorm(); this.text += String.fromCodePoint(c); switch (c) { case BANG: this.state = S_DTD_OPEN_WAKA_BANG; this.openWakaBang = ""; break; case QUESTION: this.state = S_DTD_PI; break; default: this.state = S_DTD; } } private sDTDOpenWakaBang(): void { const char = String.fromCodePoint(this.getCodeNorm()); const owb = this.openWakaBang += char; this.text += char; if (owb !== "-") { this.state = owb === "--" ? S_DTD_COMMENT : S_DTD; this.openWakaBang = ""; } } private sDTDComment(): void { if (this.captureToChar(MINUS)) { this.text += "-"; this.state = S_DTD_COMMENT_ENDING; } } private sDTDCommentEnding(): void { const c = this.getCodeNorm(); this.text += String.fromCodePoint(c); this.state = c === MINUS ? S_DTD_COMMENT_ENDED : S_DTD_COMMENT; } private sDTDCommentEnded(): void { const c = this.getCodeNorm(); this.text += String.fromCodePoint(c); if (c === GREATER) { this.state = S_DTD; } else { this.fail("malformed comment."); // <!-- blah -- bloo --> will be recorded as // a comment of " blah -- bloo " this.state = S_DTD_COMMENT; } } private sDTDPI(): void { if (this.captureToChar(QUESTION)) { this.text += "?"; this.state = S_DTD_PI_ENDING; } } private sDTDPIEnding(): void { const c = this.getCodeNorm(); this.text += String.fromCodePoint(c); if (c === GREATER) { this.state = S_DTD; } } private sText(): void { // // We did try a version of saxes where the S_TEXT state was split in two // states: one for text inside the root element, and one for text // outside. This was avoiding having to test this.tags.length to decide // what implementation to actually use. // // Peformance testing on gigabyte-size files did not show any advantage to // using the two states solution instead of the current one. Conversely, it // made the code a bit more complicated elsewhere. For instance, a comment // can appear before the root element so when a comment ended it was // necessary to determine whether to return to the S_TEXT state or to the // new text-outside-root state. // if (this.tags.length !== 0) { this.handleTextInRoot(); } else { this.handleTextOutsideRoot(); } } private sEntity(): void { // This is essentially a specialized version of captureToChar(SEMICOLON...) let { i: start } = this; const { chunk } = this; // eslint-disable-next-line no-labels, no-restricted-syntax loop: // eslint-disable-next-line no-constant-condition while (true) { switch (this.getCode()) { case NL_LIKE: this.entity += `${chunk.slice(start, this.prevI)}\n`; start = this.i; break; case SEMICOLON: { const { entityReturnState } = this; const entity = this.entity + chunk.slice(start, this.prevI); this.state = entityReturnState!; let parsed: string; if (entity === "") { this.fail("empty entity name."); parsed = "&;"; } else { parsed = this.parseEntity(entity); this.entity = ""; } if (entityReturnState !== S_TEXT || this.textHandler !== undefined) { this.text += parsed; } // eslint-disable-next-line no-labels break loop; } case EOC: this.entity += chunk.slice(start); // eslint-disable-next-line no-labels break loop; default: } } } private sOpenWaka(): void { // Reminder: a state handler is called with at least one character // available in the current chunk. So the first call to get code inside of // a state handler cannot return ``EOC``. That's why we don't test // for it. const c = this.getCode(); // either a /, ?, !, or text is coming next. if (isNameStartChar(c)) { this.state = S_OPEN_TAG; this.unget(); this.xmlDeclPossible = false; } else { switch (c) { case FORWARD_SLASH: this.state = S_CLOSE_TAG; this.xmlDeclPossible = false; break; case BANG: this.state = S_OPEN_WAKA_BANG; this.openWakaBang = ""; this.xmlDeclPossible = false; break; case QUESTION: this.state = S_PI_FIRST_CHAR; break; default: this.fail("disallowed character in tag name"); this.state = S_TEXT; this.xmlDeclPossible = false; } } } private sOpenWakaBang(): void { this.openWakaBang += String.fromCodePoint(this.getCodeNorm()); switch (this.openWakaBang) { case "[CDATA[": if (!this.sawRoot && !this.reportedTextBeforeRoot) { this.fail("text data outside of root node."); this.reportedTextBeforeRoot = true; } if (this.closedRoot && !this.reportedTextAfterRoot) { this.fail("text data outside of root node."); this.reportedTextAfterRoot = true; } this.state = S_CDATA; this.openWakaBang = ""; break; case "--": this.state = S_COMMENT; this.openWakaBang = ""; break; case "DOCTYPE": this.state = S_DOCTYPE; if (this.doctype || this.sawRoot) { this.fail("inappropriately located doctype declaration."); } this.openWakaBang = ""; break; default: // 7 happens to be the maximum length of the string that can possibly // match one of the cases above. if (this.openWakaBang.length >= 7) { this.fail("incorrect syntax."); } } } private sComment(): void { if (this.captureToChar(MINUS)) { this.state = S_COMMENT_ENDING; } } private sCommentEnding(): void { const c = this.getCodeNorm(); if (c === MINUS) { this.state = S_COMMENT_ENDED; this.commentHandler?.(this.text); this.text = ""; } else { this.text += `-${String.fromCodePoint(c)}`; this.state = S_COMMENT; } } private sCommentEnded(): void { const c = this.getCodeNorm(); if (c !== GREATER) { this.fail("malformed comment."); // <!-- blah -- bloo --> will be recorded as // a comment of " blah -- bloo " this.text += `--${String.fromCodePoint(c)}`; this.state = S_COMMENT; } else { this.state = S_TEXT; } } private sCData(): void { if (this.captureToChar(CLOSE_BRACKET)) { this.state = S_CDATA_ENDING; } } private sCDataEnding(): void { const c = this.getCodeNorm(); if (c === CLOSE_BRACKET) { this.state = S_CDATA_ENDING_2; } else { this.text += `]${String.fromCodePoint(c)}`; this.state = S_CDATA; } } private sCDataEnding2(): void { const c = this.getCodeNorm(); switch (c) { case GREATER: { this.cdataHandler?.(this.text); this.text = ""; this.state = S_TEXT; break; } case CLOSE_BRACKET: this.text += "]"; break; default: this.text += `]]${String.fromCodePoint(c)}`; this.state = S_CDATA; } } // We need this separate state to check the first character fo the pi target // with this.nameStartCheck which allows less characters than this.nameCheck. private sPIFirstChar(): void { const c = this.getCodeNorm(); // This is first because in the case where the file is well-formed this is // the branch taken. We optimize for well-formedness. if (this.nameStartCheck(c)) { this.piTarget += String.fromCodePoint(c); this.state = S_PI_REST; } else if (c === QUESTION || isS(c)) { this.fail("processing instruction without a target."); this.state = c === QUESTION ? S_PI_ENDING : S_PI_BODY; } else { this.fail("disallowed character in processing instruction name."); this.piTarget += String.fromCodePoint(c); this.state = S_PI_REST; } } private sPIRest(): void { // Capture characters into a piTarget while ``this.nameCheck`` run on the // character read returns true. const { chunk, i: start } = this; // eslint-disable-next-line no-constant-condition while (true) { const c = this.getCodeNorm(); if (c === EOC) { this.piTarget += chunk.slice(start); return; } // NL cannot satisfy this.nameCheck so we don't have to test specifically // for it. if (!this.nameCheck(c)) { this.piTarget += chunk.slice(start, this.prevI); const isQuestion = c === QUESTION; if (isQuestion || isS(c)) { if (this.piTarget === "xml") { if (!this.xmlDeclPossible) { this.fail( "an XML declaration must be at the start of the document."); } this.state = isQuestion ? S_XML_DECL_ENDING : S_XML_DECL_NAME_START; } else { this.state = isQuestion ? S_PI_ENDING : S_PI_BODY; } } else { this.fail("disallowed character in processing instruction name."); this.piTarget += String.fromCodePoint(c); } break; } } } private sPIBody(): void { if (this.text.length === 0) { const c = this.getCodeNorm(); if (c === QUESTION) { this.state = S_PI_ENDING; } else if (!isS(c)) { this.text = String.fromCodePoint(c); } } // The question mark character is not valid inside any of the XML // declaration name/value pairs. else if (this.captureToChar(QUESTION)) { this.state = S_PI_ENDING; } } private sPIEnding(): void { const c = this.getCodeNorm(); if (c === GREATER) { const { piTarget } = this; if (piTarget.toLowerCase() === "xml") { this.fail( "the XML declaration must appear at the start of the document."); } this.piHandler?.({ target: piTarget, body: this.text, }); this.piTarget = this.text = ""; this.state = S_TEXT; } else if (c === QUESTION) { // We ran into ?? as part of a processing instruction. We initially took // the first ? as a sign that the PI was ending, but it is not. So we have // to add it to the body but we take the new ? as a sign that the PI is // ending. this.text += "?"; } else { this.text += `?${String.fromCodePoint(c)}`; this.state = S_PI_BODY; } this.xmlDeclPossible = false; } private sXMLDeclNameStart(): void { const c = this.skipSpaces(); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { // It is valid to go to S_XML_DECL_ENDING from this state. this.state = S_XML_DECL_ENDING; return; } if (c !== EOC) { this.state = S_XML_DECL_NAME; this.name = String.fromCodePoint(c); } } private sXMLDeclName(): void { const c = this.captureTo(XML_DECL_NAME_TERMINATOR); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { this.state = S_XML_DECL_ENDING; this.name += this.text; this.text = ""; this.fail("XML declaration is incomplete."); return; } if (!(isS(c) || c === EQUAL)) { return; } this.name += this.text; this.text = ""; if (!this.xmlDeclExpects.includes(this.name)) { switch (this.name.length) { case 0: this.fail("did not expect any more name/value pairs."); break; case 1: this.fail(`expected the name ${this.xmlDeclExpects[0]}.`); break; default: this.fail(`expected one of ${this.xmlDeclExpects.join(", ")}`); } } this.state = c === EQUAL ? S_XML_DECL_VALUE_START : S_XML_DECL_EQ; } private sXMLDeclEq(): void { const c = this.getCodeNorm(); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { this.state = S_XML_DECL_ENDING; this.fail("XML declaration is incomplete."); return; } if (isS(c)) { return; } if (c !== EQUAL) { this.fail("value required."); } this.state = S_XML_DECL_VALUE_START; } private sXMLDeclValueStart(): void { const c = this.getCodeNorm(); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { this.state = S_XML_DECL_ENDING; this.fail("XML declaration is incomplete."); return; } if (isS(c)) { return; } if (!isQuote(c)) { this.fail("value must be quoted."); this.q = SPACE; } else { this.q = c; } this.state = S_XML_DECL_VALUE; } private sXMLDeclValue(): void { const c = this.captureTo([this.q!, QUESTION]); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { this.state = S_XML_DECL_ENDING; this.text = ""; this.fail("XML declaration is incomplete."); return; } if (c === EOC) { return; } const value = this.text; this.text = ""; switch (this.name) { case "version": { this.xmlDeclExpects = ["encoding", "standalone"]; const version = value; this.xmlDecl.version = version; // This is the test specified by XML 1.0 but it is fine for XML 1.1. if (!/^1\.[0-9]+$/.test(version)) { this.fail("version number must match /^1\\.[0-9]+$/."); } // When forceXMLVersion is set, the XML declaration is ignored. else if (!(this.opt.forceXMLVersion as boolean)) { this.setXMLVersion(version); } break; } case "encoding": if (!/^[A-Za-z][A-Za-z0-9._-]*$/.test(value)) { this.fail("encoding value must match \ /^[A-Za-z0-9][A-Za-z0-9._-]*$/."); } this.xmlDeclExpects = ["standalone"]; this.xmlDecl.encoding = value; break; case "standalone": if (value !== "yes" && value !== "no") { this.fail("standalone value must match \"yes\" or \"no\"."); } this.xmlDeclExpects = []; this.xmlDecl.standalone = value; break; default: // We don't need to raise an error here since we've already raised one // when checking what name was expected. } this.name = ""; this.state = S_XML_DECL_SEPARATOR; } private sXMLDeclSeparator(): void { const c = this.getCodeNorm(); // The question mark character is not valid inside any of the XML // declaration name/value pairs. if (c === QUESTION) { // It is valid to go to S_XML_DECL_ENDING from this state. this.state = S_XML_DECL_ENDING; return; } if (!isS(c)) { this.fail("whitespace required."); this.unget(); } this.state = S_XML_DECL_NAME_START; } private sXMLDeclEnding(): void { const c = this.getCodeNorm(); if (c === GREATER) { if (this.piTarget !== "xml") { this.fail("processing instructions are not allowed before root."); } else if (this.name !== "version" && this.xmlDeclExpects.includes("version")) { this.fail("XML declaration must contain a version."); } this.xmldeclHandler?.(this.xmlDecl); this.name = ""; this.piTarget = this.text = ""; this.state = S_TEXT; } else { // We got here because the previous character was a ?, but the question // mark character is not valid inside any of the XML declaration // name/value pairs. this.fail( "The character ? is disallowed anywhere in XML declarations."); } this.xmlDeclPossible = false; } private sOpenTag(): void { const c = this.captureNameChars(); if (c === EOC) { return; } const tag: SaxesTagIncomplete = this.tag = { name: this.name, attributes: Object.create(null) as Record<string, string>, }; this.name = ""; if (this.xmlnsOpt) { this.topNS = tag.ns = Object.create(null) as Record<string, string>; } this.openTagStartHandler?.(tag as StartTagForOptions<O>); this.sawRoot = true; if (!this.fragmentOpt && this.closedRoot) { this.fail("documents may contain only one root."); } switch (c) { case GREATER: this.openTag(); break; case FORWARD_SLASH: this.state = S_OPEN_TAG_SLASH; break; default: if (!isS(c)) { this.fail("disallowed character in tag name."); } this.state = S_ATTRIB; } } private sOpenTagSlash(): void { if (this.getCode() === GREATER) { this.openSelfClosingTag(); } else { this.fail("forward-slash in opening tag not followed by >."); this.state = S_ATTRIB; } } private sAttrib(): void { const c = this.skipSpaces(); if (c === EOC) { return; } if (isNameStartChar(c)) { this.unget(); this.state = S_ATTRIB_NAME; } else if (c === GREATER) { this.openTag(); } else if (c === FORWARD_SLASH) { this.state = S_OPEN_TAG_SLASH; } else { this.fail("disallowed character in attribute name."); } } private sAttribName(): void { const c = this.captureNameChars(); if (c === EQUAL) { this.state = S_ATTRIB_VALUE; } else if (isS(c)) { this.state = S_ATTRIB_NAME_SAW_WHITE; } else if (c === GREATER) { this.fail("attribute without value."); this.pushAttrib(this.name, this.name); this.name = this.text = ""; this.openTag(); } else if (c !== EOC) { this.fail("disallowed character in attribute name."); } } private sAttribNameSawWhite(): void { const c = this.skipSpaces(); switch (c) { case EOC: return; case EQUAL: this.state = S_ATTRIB_VALUE; break; default: this.fail("attribute without value."); // Should we do this??? // this.tag.attributes[this.name] = ""; this.text = ""; this.name = ""; if (c === GREATER) { this.openTag(); } else if (isNameStartChar(c)) { this.unget(); this.state = S_ATTRIB_NAME; } else { this.fail("disallowed character in attribute name."); this.state = S_ATTRIB; } } } private sAttribValue(): void { const c = this.getCodeNorm(); if (isQuote(c)) { this.q = c; this.state = S_ATTRIB_VALUE_QUOTED; } else if (!isS(c)) { this.fail("unquoted attribute value."); this.state = S_ATTRIB_VALUE_UNQUOTED; this.unget(); } } private sAttribValueQuoted(): void { // We deliberately do not use captureTo here. The specialized code we use // here is faster than using captureTo. const { q, chunk } = this; let { i: start } = this; // eslint-disable-next-line no-constant-condition while (true) { switch (this.getCode()) { case q: this.pushAttrib(this.name, this.text + chunk.slice(start, this.prevI)); this.name = this.text = ""; this.q = null; this.state = S_ATTRIB_VALUE_CLOSED; return; case AMP: this.text += chunk.slice(start, this.prevI); this.state = S_ENTITY; this.entityReturnState = S_ATTRIB_VALUE_QUOTED; return; case NL: case NL_LIKE: case TAB: this.text += `${chunk.slice(start, this.prevI)} `; start = this.i; break; case LESS: this.text += chunk.slice(start, this.prevI); this.fail("disallowed character."); return; case EOC: this.text += chunk.slice(start); return; default: } } } private sAttribValueClosed(): void { const c = this.getCodeNorm(); if (isS(c)) { this.state = S_ATTRIB; } else if (c === GREATER) { this.openTag(); } else if (c === FORWARD_SLASH) { this.state = S_OPEN_TAG_SLASH; } else if (isNameStartChar(c)) { this.fail("no whitespace between attributes."); this.unget(); this.state = S_ATTRIB_NAME; } else { this.fail("disallowed character in attribute name."); } } private sAttribValueUnquoted(): void { // We don't do anything regarding EOL or space handling for unquoted // attributes. We already have failed by the time we get here, and the // contract that saxes upholds states that upon failure, it is not safe to // rely on the data passed to event handlers (other than // ``onerror``). Passing "bad" data is not a problem. const c = this.captureTo(ATTRIB_VALUE_UNQUOTED_TERMINATOR); switch (c) { case AMP: this.state = S_ENTITY; this.entityReturnState = S_ATTRIB_VALUE_UNQUOTED; break; case LESS: this.fail("disallowed character."); break; case EOC: break; default: if (this.text.includes("]]>")) { this.fail("the string \"]]>\" is disallowed in char data."); } this.pushAttrib(this.name, this.text); this.name = this.text = ""; if (c === GREATER) { this.openTag(); } else { this.state = S_ATTRIB; } } } private sCloseTag(): void { const c = this.captureNameChars(); if (c === GREATER) { this.closeTag(); } else if (isS(c)) { this.state = S_CLOSE_TAG_SAW_WHITE; } else if (c !== EOC) { this.fail("disallowed character in closing tag."); } } private sCloseTagSawWhite(): void { switch (this.skipSpaces()) { case GREATER: this.closeTag(); break; case EOC: break; default: this.fail("disallowed character in closing tag."); } } // END OF STATE ENGINE METHODS private handleTextInRoot(): void { // This is essentially a specialized version of captureTo which is optimized // for performing the ]]> check. A previous version of this code, checked // ``this.text`` for the presence of ]]>. It simplified the code but was // very costly when character data contained a lot of entities to be parsed. // // Since we are using a specialized loop, we also keep track of the presence // of ]]> in text data. The sequence ]]> is forbidden to appear as-is. // let { i: start, forbiddenState } = this; const { chunk, textHandler: handler } = this; // eslint-disable-next-line no-labels, no-restricted-syntax scanLoop: // eslint-disable-next-line no-constant-condition while (true) { switch (this.getCode()) { case LESS: { this.state = S_OPEN_WAKA; if (handler !== undefined) { const { text } = this; const slice = chunk.slice(start, this.prevI); if (text.length !== 0) { handler(text + slice); this.text = ""; } else if (slice.length !== 0) { handler(slice); } } forbiddenState = FORBIDDEN_START; // eslint-disable-next-line no-labels break scanLoop; } case AMP: this.state = S_ENTITY; this.entityReturnState = S_TEXT; if (handler !== undefined) { this.text += chunk.slice(start, this.prevI); } forbiddenState = FORBIDDEN_START; // eslint-disable-next-line no-labels break scanLoop; case CLOSE_BRACKET: switch (forbiddenState) { case FORBIDDEN_START: forbiddenState = FORBIDDEN_BRACKET; break; case FORBIDDEN_BRACKET: forbiddenState = FORBIDDEN_BRACKET_BRACKET; break; case FORBIDDEN_BRACKET_BRACKET: break; default: throw new Error("impossible state"); } break; case GREATER: if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) { this.fail("the string \"]]>\" is disallowed in char data."); } forbiddenState = FORBIDDEN_START; break; case NL_LIKE: if (handler !== undefined) { this.text += `${chunk.slice(start, this.prevI)}\n`; } start = this.i; forbiddenState = FORBIDDEN_START; break; case EOC: if (handler !== undefined) { this.text += chunk.slice(start); } // eslint-disable-next-line no-labels break scanLoop; default: forbiddenState = FORBIDDEN_START; } } this.forbiddenState = forbiddenState; } private handleTextOutsideRoot(): void { // This is essentially a specialized version of captureTo which is optimized // for a specialized task. We keep track of the presence of non-space // characters in the text since these are errors when appearing outside the // document root element. let { i: start } = this; const { chunk, textHandler: handler } = this; let nonSpace = false; // eslint-disable-next-line no-labels, no-restricted-syntax outRootLoop: // eslint-disable-next-line no-constant-condition while (true) { const code = this.getCode(); switch (code) { case LESS: { this.state = S_OPEN_WAKA; if (handler !== undefined) { const { text } = this; const slice = chunk.slice(start, this.prevI); if (text.length !== 0) { handler(text + slice); this.text = ""; } else if (slice.length !== 0) { handler(slice); } } // eslint-disable-next-line no-labels break outRootLoop; } case AMP: this.state = S_ENTITY; this.entityReturnState = S_TEXT; if (handler !== undefined) { this.text += chunk.slice(start, this.prevI); } nonSpace = true; // eslint-disable-next-line no-labels break outRootLoop; case NL_LIKE: if (handler !== undefined) { this.text += `${chunk.slice(start, this.prevI)}\n`; } start = this.i; break; case EOC: if (handler !== undefined) { this.text += chunk.slice(start); } // eslint-disable-next-line no-labels break outRootLoop; default: if (!isS(code)) { nonSpace = true; } } } if (!nonSpace) { return; } // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags // to avoid reporting errors for every single character that is out of // place. if (!this.sawRoot && !this.reportedTextBeforeRoot) { this.fail("text data outside of root node."); this.reportedTextBeforeRoot = true; } if (this.closedRoot && !this.reportedTextAfterRoot) { this.fail("text data outside of root node."); this.reportedTextAfterRoot = true; } } private pushAttribNS(name: string, value: string): void { const { prefix, local } = this.qname(name); const attr = { name, prefix, local, value }; this.attribList.push(attr); this.attributeHandler?.(attr as AttributeEventForOptions<O>); if (prefix === "xmlns") { const trimmed = value.trim(); if (this.currentXMLVersion === "1.0" && trimmed === "") { this.fail("invalid attempt to undefine prefix in XML 1.0"); } this.topNS![local] = trimmed; nsPairCheck(this, local, trimmed); } else if (name === "xmlns") { const trimmed = value.trim(); this.topNS![""] = trimmed; nsPairCheck(this, "", trimmed); } } private pushAttribPlain(name: string, value: string): void { const attr = { name, value }; this.attribList.push(attr); this.attributeHandler?.(attr as AttributeEventForOptions<O>); } /** * End parsing. This performs final well-formedness checks and resets the * parser to a clean state. * * @returns this */ private end(): this { if (!this.sawRoot) { this.fail("document must contain a root element."); } const { tags } = this; while (tags.length > 0) { const tag = tags.pop()!; this.fail(`unclosed tag: ${tag.name}`); } if ((this.state !== S_BEGIN) && (this.state !== S_TEXT)) { this.fail("unexpected end."); } const { text } = this; if (text.length !== 0) { this.textHandler?.(text); this.text = ""; } this._closed = true; this.endHandler?.(); this._init(); return this; } /** * Resolve a namespace prefix. * * @param prefix The prefix to resolve. * * @returns The namespace URI or ``undefined`` if the prefix is not defined. */ resolve(prefix: string): string | undefined { let uri = this.topNS![prefix]; if (uri !== undefined) { return uri; } const { tags } = this; for (let index = tags.length - 1; index >= 0; index--) { uri = tags[index]!.ns![prefix]; if (uri !== undefined) { return uri; } } uri = this.ns[prefix]; if (uri !== undefined) { return uri; } return this.opt.resolvePrefix?.(prefix); } /** * Parse a qname into its prefix and local name parts. * * @param name The name to parse * * @returns */ private qname(name: string): { prefix: string; local: string } { // This is faster than using name.split(":"). const colon = name.indexOf(":"); if (colon === -1) { return { prefix: "", local: name }; } const local = name.slice(colon + 1); const prefix = name.slice(0, colon); if (prefix === "" || local === "" || local.includes(":")) { this.fail(`malformed name: ${name}.`); } return { prefix, local }; } private processAttribsNS(): void { const { attribList } = this; const tag = this.tag!; { // add namespace info to tag const { prefix, local } = this.qname(tag.name); tag.prefix = prefix; tag.local = local; const uri = tag.uri = this.resolve(prefix) ?? ""; if (prefix !== "") { if (prefix === "xmlns") { this.fail("tags may not have \"xmlns\" as prefix."); } if (uri === "") { this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`); tag.uri = prefix; } } } if (attribList.length === 0) { return; } const { attributes } = tag; const seen = new Set(); // Note: do not apply default ns to attributes: // http://www.w3.org/TR/REC-xml-names/#defaulting for (const attr of attribList as SaxesAttributeNSIncomplete[]) { const { name, prefix, local } = attr; let uri; let eqname; if (prefix === "") { uri = name === "xmlns" ? XMLNS_NAMESPACE : ""; eqname = name; } else { uri = this.resolve(prefix); // if there's any attributes with an undefined namespace, // then fail on them now. if (uri === undefined) { this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`); uri = prefix; } eqname = `{${uri}}${local}`; } if (seen.has(eqname)) { this.fail(`duplicate attribute: ${eqname}.`); } seen.add(eqname); attr.uri = uri; attributes[name] = attr; } this.attribList = []; } private processAttribsPlain(): void { const { attribList } = this; // eslint-disable-next-line prefer-destructuring const attributes = this.tag!.attributes; for (const { name, value } of attribList) { if (attributes[name] !== undefined) { this.fail(`duplicate attribute: ${name}.`); } attributes[name] = value; } this.attribList = []; } /** * Handle a complete open tag. This parser code calls this once it has seen * the whole tag. This method checks for well-formeness and then emits * ``onopentag``. */ private openTag(): void { this.processAttribs(); const { tags } = this; const tag = this.tag as SaxesTag; tag.isSelfClosing = false; // There cannot be any pending text here due to the onopentagstart that was // necessarily emitted before we get here. So we do not check text. this.openTagHandler?.(tag as TagForOptions<O>); tags.push(tag); this.state = S_TEXT; this.name = ""; } /** * Handle a complete self-closing tag. This parser code calls this once it has * seen the whole tag. This method checks for well-formeness and then emits * ``onopentag`` and ``onclosetag``. */ private openSelfClosingTag(): void { this.processAttribs(); const { tags } = this; const tag = this.tag as SaxesTag; tag.isSelfClosing = true; // There cannot be any pending text here due to the onopentagstart that was // necessarily emitted before we get here. So we do not check text. this.openTagHandler?.(tag as TagForOptions<O>); this.closeTagHandler?.(tag as TagForOptions<O>); const top = this.tag = tags[tags.length - 1] ?? null; if (top === null) { this.closedRoot = true; } this.state = S_TEXT; this.name = ""; } /** * Handle a complete close tag. This parser code calls this once it has seen * the whole tag. This method checks for well-formeness and then emits * ``onclosetag``. */ private closeTag(): void { const { tags, name } = this; // Our state after this will be S_TEXT, no matter what, and we can clear // tagName now. this.state = S_TEXT; this.name = ""; if (name === "") { this.fail("weird empty close tag."); this.text += "</>"; return; } const handler = this.closeTagHandler; let l = tags.length; while (l-- > 0) { const tag = this.tag = tags.pop() as SaxesTag; this.topNS = tag.ns!; handler?.(tag as TagForOptions<O>); if (tag.name === name) { break; } this.fail("unexpected close tag."); } if (l === 0) { this.closedRoot = true; } else if (l < 0) { this.fail(`unmatched closing tag: ${name}.`); this.text += `</${name}>`; } } /** * Resolves an entity. Makes any necessary well-formedness checks. * * @param entity The entity to resolve. * * @returns The parsed entity. */ private parseEntity(entity: string): string { // startsWith would be significantly slower for this test. if (entity[0] !== "#") { const defined = this.ENTITIES[entity]; if (defined !== undefined) { return defined; } this.fail(this.isName(entity) ? "undefined entity." : "disallowed character in entity name."); return `&${entity};`; } let num = NaN; if (entity[1] === "x" && /^#x[0-9a-f]+$/i.test(entity)) { num = parseInt(entity.slice(2), 16); } else if (/^#[0-9]+$/.test(entity)) { num = parseInt(entity.slice(1), 10); } // The character reference is required to match the CHAR production. if (!this.isChar(num)) { this.fail("malformed character entity."); return `&${entity};`; } return String.fromCodePoint(num); } }