// saxes.js
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const ed5 = require("xmlchars/xml/1.0/ed5");
const ed2 = require("xmlchars/xml/1.1/ed2");
const NSed3 = require("xmlchars/xmlns/1.0/ed3");

// Character predicates and name patterns taken from the xmlchars modules
// required above. ``isChar`` is aliased twice because the XML 1.0 and XML 1.1
// modules each export their own version of it.
const {
  isS,
  isChar: isChar10,
  isNameStartChar,
  isNameChar,
  S_LIST,
  NAME_RE,
} = ed5;
const { isChar: isChar11 } = ed2;
const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } = NSed3;

const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";

// Initial prefix-to-URI bindings. The object has no prototype so that only the
// prefixes listed here can be found on it.
const rootNS = {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  __proto__: null,
  xml: XML_NAMESPACE,
  xmlns: XMLNS_NAMESPACE,
};

// The predefined XML entities. Also created without a prototype.
const XML_ENTITIES = {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  __proto__: null,
  amp: "&",
  gt: ">",
  lt: "<",
  quot: "\"",
  apos: "'",
};

// EOC: end-of-chunk
const EOC = -1;
// NL_LIKE: returned by getCode10/getCode11 for newline sequences other than a
// plain NL (e.g. CR, CR NL).
const NL_LIKE = -2;

// Parser states. The numeric values double as indexes into the parser's
// ``stateTable``, so they must stay dense and in this order.
const S_BEGIN = 0; // Initial state.
const S_BEGIN_WHITESPACE = 1; // leading whitespace
const S_DOCTYPE = 2; // <!DOCTYPE
const S_DOCTYPE_QUOTE = 3; // <!DOCTYPE "//blah
const S_DTD = 4; // <!DOCTYPE "//blah" [ ...
const S_DTD_QUOTED = 5; // <!DOCTYPE "//blah" [ "foo
const S_DTD_OPEN_WAKA = 6;
const S_DTD_OPEN_WAKA_BANG = 7;
const S_DTD_COMMENT = 8; // <!--
const S_DTD_COMMENT_ENDING = 9; // <!-- blah -
const S_DTD_COMMENT_ENDED = 10; // <!-- blah --
const S_DTD_PI = 11; // <?
const S_DTD_PI_ENDING = 12; // <?hi "there" ?
const S_TEXT = 13; // general stuff
const S_ENTITY = 14; // & and such
const S_OPEN_WAKA = 15; // <
const S_OPEN_WAKA_BANG = 16; // <!...
53 const S_COMMENT = 17; // <!-- 54 const S_COMMENT_ENDING = 18; // <!-- blah - 55 const S_COMMENT_ENDED = 19; // <!-- blah -- 56 const S_CDATA = 20; // <![CDATA[ something 57 const S_CDATA_ENDING = 21; // ] 58 const S_CDATA_ENDING_2 = 22; // ]] 59 const S_PI_FIRST_CHAR = 23; // <?hi, first char 60 const S_PI_REST = 24; // <?hi, rest of the name 61 const S_PI_BODY = 25; // <?hi there 62 const S_PI_ENDING = 26; // <?hi "there" ? 63 const S_XML_DECL_NAME_START = 27; // <?xml 64 const S_XML_DECL_NAME = 28; // <?xml foo 65 const S_XML_DECL_EQ = 29; // <?xml foo= 66 const S_XML_DECL_VALUE_START = 30; // <?xml foo= 67 const S_XML_DECL_VALUE = 31; // <?xml foo="bar" 68 const S_XML_DECL_SEPARATOR = 32; // <?xml foo="bar" 69 const S_XML_DECL_ENDING = 33; // <?xml ... ? 70 const S_OPEN_TAG = 34; // <strong 71 const S_OPEN_TAG_SLASH = 35; // <strong / 72 const S_ATTRIB = 36; // <a 73 const S_ATTRIB_NAME = 37; // <a foo 74 const S_ATTRIB_NAME_SAW_WHITE = 38; // <a foo _ 75 const S_ATTRIB_VALUE = 39; // <a foo= 76 const S_ATTRIB_VALUE_QUOTED = 40; // <a foo="bar 77 const S_ATTRIB_VALUE_CLOSED = 41; // <a foo="bar" 78 const S_ATTRIB_VALUE_UNQUOTED = 42; // <a foo=bar 79 const S_CLOSE_TAG = 43; // </a 80 const S_CLOSE_TAG_SAW_WHITE = 44; // </a > 81 const TAB = 9; 82 const NL = 0xA; 83 const CR = 0xD; 84 const SPACE = 0x20; 85 const BANG = 0x21; 86 const DQUOTE = 0x22; 87 const AMP = 0x26; 88 const SQUOTE = 0x27; 89 const MINUS = 0x2D; 90 const FORWARD_SLASH = 0x2F; 91 const SEMICOLON = 0x3B; 92 const LESS = 0x3C; 93 const EQUAL = 0x3D; 94 const GREATER = 0x3E; 95 const QUESTION = 0x3F; 96 const OPEN_BRACKET = 0x5B; 97 const CLOSE_BRACKET = 0x5D; 98 const NEL = 0x85; 99 const LS = 0x2028; // Line Separator 100 const isQuote = (c) => c === DQUOTE || c === SQUOTE; 101 const QUOTES = [DQUOTE, SQUOTE]; 102 const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER]; 103 const DTD_TERMINATOR = [...QUOTES, LESS, CLOSE_BRACKET]; 104 const XML_DECL_NAME_TERMINATOR = [EQUAL, QUESTION, 
...S_LIST]; 105 const ATTRIB_VALUE_UNQUOTED_TERMINATOR = [...S_LIST, GREATER, AMP, LESS]; 106 function nsPairCheck(parser, prefix, uri) { 107 switch (prefix) { 108 case "xml": 109 if (uri !== XML_NAMESPACE) { 110 parser.fail(`xml prefix must be bound to ${XML_NAMESPACE}.`); 111 } 112 break; 113 case "xmlns": 114 if (uri !== XMLNS_NAMESPACE) { 115 parser.fail(`xmlns prefix must be bound to ${XMLNS_NAMESPACE}.`); 116 } 117 break; 118 default: 119 } 120 switch (uri) { 121 case XMLNS_NAMESPACE: 122 parser.fail(prefix === "" ? 123 `the default namespace may not be set to ${uri}.` : 124 `may not assign a prefix (even "xmlns") to the URI \ 125 ${XMLNS_NAMESPACE}.`); 126 break; 127 case XML_NAMESPACE: 128 switch (prefix) { 129 case "xml": 130 // Assinging the XML namespace to "xml" is fine. 131 break; 132 case "": 133 parser.fail(`the default namespace may not be set to ${uri}.`); 134 break; 135 default: 136 parser.fail("may not assign the xml namespace to another prefix."); 137 } 138 break; 139 default: 140 } 141 } 142 function nsMappingCheck(parser, mapping) { 143 for (const local of Object.keys(mapping)) { 144 nsPairCheck(parser, local, mapping[local]); 145 } 146 } 147 const isNCName = (name) => NC_NAME_RE.test(name); 148 const isName = (name) => NAME_RE.test(name); 149 const FORBIDDEN_START = 0; 150 const FORBIDDEN_BRACKET = 1; 151 const FORBIDDEN_BRACKET_BRACKET = 2; 152 /** 153 * The list of supported events. 
154 */ 155 exports.EVENTS = [ 156 "xmldecl", 157 "text", 158 "processinginstruction", 159 "doctype", 160 "comment", 161 "opentagstart", 162 "attribute", 163 "opentag", 164 "closetag", 165 "cdata", 166 "error", 167 "end", 168 "ready", 169 ]; 170 const EVENT_NAME_TO_HANDLER_NAME = { 171 xmldecl: "xmldeclHandler", 172 text: "textHandler", 173 processinginstruction: "piHandler", 174 doctype: "doctypeHandler", 175 comment: "commentHandler", 176 opentagstart: "openTagStartHandler", 177 attribute: "attributeHandler", 178 opentag: "openTagHandler", 179 closetag: "closeTagHandler", 180 cdata: "cdataHandler", 181 error: "errorHandler", 182 end: "endHandler", 183 ready: "readyHandler", 184 }; 185 class SaxesParser { 186 /** 187 * @param opt The parser options. 188 */ 189 constructor(opt) { 190 this.opt = opt !== null && opt !== void 0 ? opt : {}; 191 this.fragmentOpt = !!this.opt.fragment; 192 const xmlnsOpt = this.xmlnsOpt = !!this.opt.xmlns; 193 this.trackPosition = this.opt.position !== false; 194 this.fileName = this.opt.fileName; 195 if (xmlnsOpt) { 196 // This is the function we use to perform name checks on PIs and entities. 197 // When namespaces are used, colons are not allowed in PI target names or 198 // entity names. So the check depends on whether namespaces are used. 
See: 199 // 200 // https://www.w3.org/XML/xml-names-19990114-errata.html 201 // NE08 202 // 203 this.nameStartCheck = isNCNameStartChar; 204 this.nameCheck = isNCNameChar; 205 this.isName = isNCName; 206 // eslint-disable-next-line @typescript-eslint/unbound-method 207 this.processAttribs = this.processAttribsNS; 208 // eslint-disable-next-line @typescript-eslint/unbound-method 209 this.pushAttrib = this.pushAttribNS; 210 // eslint-disable-next-line @typescript-eslint/no-explicit-any 211 this.ns = Object.assign({ __proto__: null }, rootNS); 212 const additional = this.opt.additionalNamespaces; 213 if (additional != null) { 214 nsMappingCheck(this, additional); 215 Object.assign(this.ns, additional); 216 } 217 } 218 else { 219 this.nameStartCheck = isNameStartChar; 220 this.nameCheck = isNameChar; 221 this.isName = isName; 222 // eslint-disable-next-line @typescript-eslint/unbound-method 223 this.processAttribs = this.processAttribsPlain; 224 // eslint-disable-next-line @typescript-eslint/unbound-method 225 this.pushAttrib = this.pushAttribPlain; 226 } 227 // 228 // The order of the members in this table needs to correspond to the state 229 // numbers given to the states that correspond to the methods being recorded 230 // here. 
231 // 232 this.stateTable = [ 233 /* eslint-disable @typescript-eslint/unbound-method */ 234 this.sBegin, 235 this.sBeginWhitespace, 236 this.sDoctype, 237 this.sDoctypeQuote, 238 this.sDTD, 239 this.sDTDQuoted, 240 this.sDTDOpenWaka, 241 this.sDTDOpenWakaBang, 242 this.sDTDComment, 243 this.sDTDCommentEnding, 244 this.sDTDCommentEnded, 245 this.sDTDPI, 246 this.sDTDPIEnding, 247 this.sText, 248 this.sEntity, 249 this.sOpenWaka, 250 this.sOpenWakaBang, 251 this.sComment, 252 this.sCommentEnding, 253 this.sCommentEnded, 254 this.sCData, 255 this.sCDataEnding, 256 this.sCDataEnding2, 257 this.sPIFirstChar, 258 this.sPIRest, 259 this.sPIBody, 260 this.sPIEnding, 261 this.sXMLDeclNameStart, 262 this.sXMLDeclName, 263 this.sXMLDeclEq, 264 this.sXMLDeclValueStart, 265 this.sXMLDeclValue, 266 this.sXMLDeclSeparator, 267 this.sXMLDeclEnding, 268 this.sOpenTag, 269 this.sOpenTagSlash, 270 this.sAttrib, 271 this.sAttribName, 272 this.sAttribNameSawWhite, 273 this.sAttribValue, 274 this.sAttribValueQuoted, 275 this.sAttribValueClosed, 276 this.sAttribValueUnquoted, 277 this.sCloseTag, 278 this.sCloseTagSawWhite, 279 ]; 280 this._init(); 281 } 282 /** 283 * Indicates whether or not the parser is closed. If ``true``, wait for 284 * the ``ready`` event to write again. 285 */ 286 get closed() { 287 return this._closed; 288 } 289 _init() { 290 var _a; 291 this.openWakaBang = ""; 292 this.text = ""; 293 this.name = ""; 294 this.piTarget = ""; 295 this.entity = ""; 296 this.q = null; 297 this.tags = []; 298 this.tag = null; 299 this.topNS = null; 300 this.chunk = ""; 301 this.chunkPosition = 0; 302 this.i = 0; 303 this.prevI = 0; 304 this.carriedFromPrevious = undefined; 305 this.forbiddenState = FORBIDDEN_START; 306 this.attribList = []; 307 // The logic is organized so as to minimize the need to check 308 // this.opt.fragment while parsing. 309 const { fragmentOpt } = this; 310 this.state = fragmentOpt ? 
S_TEXT : S_BEGIN; 311 // We want these to be all true if we are dealing with a fragment. 312 this.reportedTextBeforeRoot = this.reportedTextAfterRoot = this.closedRoot = 313 this.sawRoot = fragmentOpt; 314 // An XML declaration is intially possible only when parsing whole 315 // documents. 316 this.xmlDeclPossible = !fragmentOpt; 317 this.xmlDeclExpects = ["version"]; 318 this.entityReturnState = undefined; 319 let { defaultXMLVersion } = this.opt; 320 if (defaultXMLVersion === undefined) { 321 if (this.opt.forceXMLVersion === true) { 322 throw new Error("forceXMLVersion set but defaultXMLVersion is not set"); 323 } 324 defaultXMLVersion = "1.0"; 325 } 326 this.setXMLVersion(defaultXMLVersion); 327 this.positionAtNewLine = 0; 328 this.doctype = false; 329 this._closed = false; 330 this.xmlDecl = { 331 version: undefined, 332 encoding: undefined, 333 standalone: undefined, 334 }; 335 this.line = 1; 336 this.column = 0; 337 this.ENTITIES = Object.create(XML_ENTITIES); 338 // eslint-disable-next-line no-unused-expressions 339 (_a = this.readyHandler) === null || _a === void 0 ? void 0 : _a.call(this); 340 } 341 /** 342 * The stream position the parser is currently looking at. This field is 343 * zero-based. 344 * 345 * This field is not based on counting Unicode characters but is to be 346 * interpreted as a plain index into a JavaScript string. 347 */ 348 get position() { 349 return this.chunkPosition + this.i; 350 } 351 /** 352 * The column number of the next character to be read by the parser. * 353 * This field is zero-based. (The first column in a line is 0.) 354 * 355 * This field reports the index at which the next character would be in the 356 * line if the line were represented as a JavaScript string. Note that this 357 * *can* be different to a count based on the number of *Unicode characters* 358 * due to how JavaScript handles astral plane characters. 359 * 360 * See [[column]] for a number that corresponds to a count of Unicode 361 * characters. 
362 */ 363 get columnIndex() { 364 return this.position - this.positionAtNewLine; 365 } 366 /** 367 * Set an event listener on an event. The parser supports one handler per 368 * event type. If you try to set an event handler over an existing handler, 369 * the old handler is silently overwritten. 370 * 371 * @param name The event to listen to. 372 * 373 * @param handler The handler to set. 374 */ 375 on(name, handler) { 376 // eslint-disable-next-line @typescript-eslint/no-explicit-any 377 this[EVENT_NAME_TO_HANDLER_NAME[name]] = handler; 378 } 379 /** 380 * Unset an event handler. 381 * 382 * @parma name The event to stop listening to. 383 */ 384 off(name) { 385 // eslint-disable-next-line @typescript-eslint/no-explicit-any 386 this[EVENT_NAME_TO_HANDLER_NAME[name]] = undefined; 387 } 388 /** 389 * Make an error object. The error object will have a message that contains 390 * the ``fileName`` option passed at the creation of the parser. If position 391 * tracking was turned on, it will also have line and column number 392 * information. 393 * 394 * @param message The message describing the error to report. 395 * 396 * @returns An error object with a properly formatted message. 397 */ 398 makeError(message) { 399 var _a; 400 let msg = (_a = this.fileName) !== null && _a !== void 0 ? _a : ""; 401 if (this.trackPosition) { 402 if (msg.length > 0) { 403 msg += ":"; 404 } 405 msg += `${this.line}:${this.column}`; 406 } 407 if (msg.length > 0) { 408 msg += ": "; 409 } 410 return new Error(msg + message); 411 } 412 /** 413 * Report a parsing error. This method is made public so that client code may 414 * check for issues that are outside the scope of this project and can report 415 * errors. 416 * 417 * @param message The error to report. 
418 * 419 * @returns this 420 */ 421 fail(message) { 422 const err = this.makeError(message); 423 const handler = this.errorHandler; 424 if (handler === undefined) { 425 throw err; 426 } 427 else { 428 handler(err); 429 } 430 return this; 431 } 432 /** 433 * Write a XML data to the parser. 434 * 435 * @param chunk The XML data to write. 436 * 437 * @returns this 438 */ 439 write(chunk) { 440 if (this.closed) { 441 return this.fail("cannot write after close; assign an onready handler."); 442 } 443 let end = false; 444 if (chunk === null) { 445 // We cannot return immediately because carriedFromPrevious may need 446 // processing. 447 end = true; 448 chunk = ""; 449 } 450 else if (typeof chunk === "object") { 451 chunk = chunk.toString(); 452 } 453 // We checked if performing a pre-decomposition of the string into an array 454 // of single complete characters (``Array.from(chunk)``) would be faster 455 // than the current repeated calls to ``charCodeAt``. As of August 2018, it 456 // isn't. (There may be Node-specific code that would perform faster than 457 // ``Array.from`` but don't want to be dependent on Node.) 458 if (this.carriedFromPrevious !== undefined) { 459 // The previous chunk had char we must carry over. 460 chunk = `${this.carriedFromPrevious}${chunk}`; 461 this.carriedFromPrevious = undefined; 462 } 463 let limit = chunk.length; 464 const lastCode = chunk.charCodeAt(limit - 1); 465 if (!end && 466 // A trailing CR or surrogate must be carried over to the next 467 // chunk. 468 (lastCode === CR || (lastCode >= 0xD800 && lastCode <= 0xDBFF))) { 469 // The chunk ends with a character that must be carried over. We cannot 470 // know how to handle it until we get the next chunk or the end of the 471 // stream. So save it for later. 
472 this.carriedFromPrevious = chunk[limit - 1]; 473 limit--; 474 chunk = chunk.slice(0, limit); 475 } 476 const { stateTable } = this; 477 this.chunk = chunk; 478 this.i = 0; 479 while (this.i < limit) { 480 // eslint-disable-next-line @typescript-eslint/no-explicit-any 481 stateTable[this.state].call(this); 482 } 483 this.chunkPosition += limit; 484 return end ? this.end() : this; 485 } 486 /** 487 * Close the current stream. Perform final well-formedness checks and reset 488 * the parser tstate. 489 * 490 * @returns this 491 */ 492 close() { 493 return this.write(null); 494 } 495 /** 496 * Get a single code point out of the current chunk. This updates the current 497 * position if we do position tracking. 498 * 499 * This is the algorithm to use for XML 1.0. 500 * 501 * @returns The character read. 502 */ 503 getCode10() { 504 const { chunk, i } = this; 505 this.prevI = i; 506 // Yes, we do this instead of doing this.i++. Doing it this way, we do not 507 // read this.i again, which is a bit faster. 508 this.i = i + 1; 509 if (i >= chunk.length) { 510 return EOC; 511 } 512 // Using charCodeAt and handling the surrogates ourselves is faster 513 // than using codePointAt. 514 const code = chunk.charCodeAt(i); 515 this.column++; 516 if (code < 0xD800) { 517 if (code >= SPACE || code === TAB) { 518 return code; 519 } 520 switch (code) { 521 case NL: 522 this.line++; 523 this.column = 0; 524 this.positionAtNewLine = this.position; 525 return NL; 526 case CR: 527 // We may get NaN if we read past the end of the chunk, which is fine. 528 if (chunk.charCodeAt(i + 1) === NL) { 529 // A \r\n sequence is converted to \n so we have to skip over the 530 // next character. We already know it has a size of 1 so ++ is fine 531 // here. 532 this.i = i + 2; 533 } 534 // Otherwise, a \r is just converted to \n, so we don't have to skip 535 // ahead. 536 // In either case, \r becomes \n. 
537 this.line++; 538 this.column = 0; 539 this.positionAtNewLine = this.position; 540 return NL_LIKE; 541 default: 542 // If we get here, then code < SPACE and it is not NL CR or TAB. 543 this.fail("disallowed character."); 544 return code; 545 } 546 } 547 if (code > 0xDBFF) { 548 // This is a specialized version of isChar10 that takes into account 549 // that in this context code > 0xDBFF and code <= 0xFFFF. So it does not 550 // test cases that don't need testing. 551 if (!(code >= 0xE000 && code <= 0xFFFD)) { 552 this.fail("disallowed character."); 553 } 554 return code; 555 } 556 const final = 0x10000 + ((code - 0xD800) * 0x400) + 557 (chunk.charCodeAt(i + 1) - 0xDC00); 558 this.i = i + 2; 559 // This is a specialized version of isChar10 that takes into account that in 560 // this context necessarily final >= 0x10000. 561 if (final > 0x10FFFF) { 562 this.fail("disallowed character."); 563 } 564 return final; 565 } 566 /** 567 * Get a single code point out of the current chunk. This updates the current 568 * position if we do position tracking. 569 * 570 * This is the algorithm to use for XML 1.1. 571 * 572 * @returns {number} The character read. 573 */ 574 getCode11() { 575 const { chunk, i } = this; 576 this.prevI = i; 577 // Yes, we do this instead of doing this.i++. Doing it this way, we do not 578 // read this.i again, which is a bit faster. 579 this.i = i + 1; 580 if (i >= chunk.length) { 581 return EOC; 582 } 583 // Using charCodeAt and handling the surrogates ourselves is faster 584 // than using codePointAt. 585 const code = chunk.charCodeAt(i); 586 this.column++; 587 if (code < 0xD800) { 588 if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) || 589 code === TAB) { 590 return code; 591 } 592 switch (code) { 593 case NL: // 0xA 594 this.line++; 595 this.column = 0; 596 this.positionAtNewLine = this.position; 597 return NL; 598 case CR: { // 0xD 599 // We may get NaN if we read past the end of the chunk, which is 600 // fine. 
601 const next = chunk.charCodeAt(i + 1); 602 if (next === NL || next === NEL) { 603 // A CR NL or CR NEL sequence is converted to NL so we have to skip 604 // over the next character. We already know it has a size of 1. 605 this.i = i + 2; 606 } 607 // Otherwise, a CR is just converted to NL, no skip. 608 } 609 /* yes, fall through */ 610 case NEL: // 0x85 611 case LS: // Ox2028 612 this.line++; 613 this.column = 0; 614 this.positionAtNewLine = this.position; 615 return NL_LIKE; 616 default: 617 this.fail("disallowed character."); 618 return code; 619 } 620 } 621 if (code > 0xDBFF) { 622 // This is a specialized version of isCharAndNotRestricted that takes into 623 // account that in this context code > 0xDBFF and code <= 0xFFFF. So it 624 // does not test cases that don't need testing. 625 if (!(code >= 0xE000 && code <= 0xFFFD)) { 626 this.fail("disallowed character."); 627 } 628 return code; 629 } 630 const final = 0x10000 + ((code - 0xD800) * 0x400) + 631 (chunk.charCodeAt(i + 1) - 0xDC00); 632 this.i = i + 2; 633 // This is a specialized version of isCharAndNotRestricted that takes into 634 // account that in this context necessarily final >= 0x10000. 635 if (final > 0x10FFFF) { 636 this.fail("disallowed character."); 637 } 638 return final; 639 } 640 /** 641 * Like ``getCode`` but with the return value normalized so that ``NL`` is 642 * returned for ``NL_LIKE``. 643 */ 644 getCodeNorm() { 645 const c = this.getCode(); 646 return c === NL_LIKE ? NL : c; 647 } 648 unget() { 649 this.i = this.prevI; 650 this.column--; 651 } 652 /** 653 * Capture characters into a buffer until encountering one of a set of 654 * characters. 655 * 656 * @param chars An array of codepoints. Encountering a character in the array 657 * ends the capture. (``chars`` may safely contain ``NL``.) 658 * 659 * @return The character code that made the capture end, or ``EOC`` if we hit 660 * the end of the chunk. The return value cannot be NL_LIKE: NL is returned 661 * instead. 
662 */ 663 captureTo(chars) { 664 let { i: start } = this; 665 const { chunk } = this; 666 // eslint-disable-next-line no-constant-condition 667 while (true) { 668 const c = this.getCode(); 669 const isNLLike = c === NL_LIKE; 670 const final = isNLLike ? NL : c; 671 if (final === EOC || chars.includes(final)) { 672 this.text += chunk.slice(start, this.prevI); 673 return final; 674 } 675 if (isNLLike) { 676 this.text += `${chunk.slice(start, this.prevI)}\n`; 677 start = this.i; 678 } 679 } 680 } 681 /** 682 * Capture characters into a buffer until encountering a character. 683 * 684 * @param char The codepoint that ends the capture. **NOTE ``char`` MAY NOT 685 * CONTAIN ``NL``.** Passing ``NL`` will result in buggy behavior. 686 * 687 * @return ``true`` if we ran into the character. Otherwise, we ran into the 688 * end of the current chunk. 689 */ 690 captureToChar(char) { 691 let { i: start } = this; 692 const { chunk } = this; 693 // eslint-disable-next-line no-constant-condition 694 while (true) { 695 let c = this.getCode(); 696 switch (c) { 697 case NL_LIKE: 698 this.text += `${chunk.slice(start, this.prevI)}\n`; 699 start = this.i; 700 c = NL; 701 break; 702 case EOC: 703 this.text += chunk.slice(start); 704 return false; 705 default: 706 } 707 if (c === char) { 708 this.text += chunk.slice(start, this.prevI); 709 return true; 710 } 711 } 712 } 713 /** 714 * Capture characters that satisfy ``isNameChar`` into the ``name`` field of 715 * this parser. 716 * 717 * @return The character code that made the test fail, or ``EOC`` if we hit 718 * the end of the chunk. The return value cannot be NL_LIKE: NL is returned 719 * instead. 720 */ 721 captureNameChars() { 722 const { chunk, i: start } = this; 723 // eslint-disable-next-line no-constant-condition 724 while (true) { 725 const c = this.getCode(); 726 if (c === EOC) { 727 this.name += chunk.slice(start); 728 return EOC; 729 } 730 // NL is not a name char so we don't have to test specifically for it. 
731 if (!isNameChar(c)) { 732 this.name += chunk.slice(start, this.prevI); 733 return c === NL_LIKE ? NL : c; 734 } 735 } 736 } 737 /** 738 * Skip white spaces. 739 * 740 * @return The character that ended the skip, or ``EOC`` if we hit 741 * the end of the chunk. The return value cannot be NL_LIKE: NL is returned 742 * instead. 743 */ 744 skipSpaces() { 745 // eslint-disable-next-line no-constant-condition 746 while (true) { 747 const c = this.getCodeNorm(); 748 if (c === EOC || !isS(c)) { 749 return c; 750 } 751 } 752 } 753 setXMLVersion(version) { 754 this.currentXMLVersion = version; 755 /* eslint-disable @typescript-eslint/unbound-method */ 756 if (version === "1.0") { 757 this.isChar = isChar10; 758 this.getCode = this.getCode10; 759 } 760 else { 761 this.isChar = isChar11; 762 this.getCode = this.getCode11; 763 } 764 /* eslint-enable @typescript-eslint/unbound-method */ 765 } 766 // STATE ENGINE METHODS 767 // This needs to be a state separate from S_BEGIN_WHITESPACE because we want 768 // to be sure never to come back to this state later. 769 sBegin() { 770 // We are essentially peeking at the first character of the chunk. Since 771 // S_BEGIN can be in effect only when we start working on the first chunk, 772 // the index at which we must look is necessarily 0. Note also that the 773 // following test does not depend on decoding surrogates. 774 // If the initial character is 0xFEFF, ignore it. 775 if (this.chunk.charCodeAt(0) === 0xFEFF) { 776 this.i++; 777 this.column++; 778 } 779 this.state = S_BEGIN_WHITESPACE; 780 } 781 sBeginWhitespace() { 782 // We need to know whether we've encountered spaces or not because as soon 783 // as we run into a space, an XML declaration is no longer possible. Rather 784 // than slow down skipSpaces even in places where we don't care whether it 785 // skipped anything or not, we check whether prevI is equal to the value of 786 // i from before we skip spaces. 
787 const iBefore = this.i; 788 const c = this.skipSpaces(); 789 if (this.prevI !== iBefore) { 790 this.xmlDeclPossible = false; 791 } 792 switch (c) { 793 case LESS: 794 this.state = S_OPEN_WAKA; 795 // We could naively call closeText but in this state, it is not normal 796 // to have text be filled with any data. 797 if (this.text.length !== 0) { 798 throw new Error("no-empty text at start"); 799 } 800 break; 801 case EOC: 802 break; 803 default: 804 this.unget(); 805 this.state = S_TEXT; 806 this.xmlDeclPossible = false; 807 } 808 } 809 sDoctype() { 810 var _a; 811 const c = this.captureTo(DOCTYPE_TERMINATOR); 812 switch (c) { 813 case GREATER: { 814 // eslint-disable-next-line no-unused-expressions 815 (_a = this.doctypeHandler) === null || _a === void 0 ? void 0 : _a.call(this, this.text); 816 this.text = ""; 817 this.state = S_TEXT; 818 this.doctype = true; // just remember that we saw it. 819 break; 820 } 821 case EOC: 822 break; 823 default: 824 this.text += String.fromCodePoint(c); 825 if (c === OPEN_BRACKET) { 826 this.state = S_DTD; 827 } 828 else if (isQuote(c)) { 829 this.state = S_DOCTYPE_QUOTE; 830 this.q = c; 831 } 832 } 833 } 834 sDoctypeQuote() { 835 const q = this.q; 836 if (this.captureToChar(q)) { 837 this.text += String.fromCodePoint(q); 838 this.q = null; 839 this.state = S_DOCTYPE; 840 } 841 } 842 sDTD() { 843 const c = this.captureTo(DTD_TERMINATOR); 844 if (c === EOC) { 845 return; 846 } 847 this.text += String.fromCodePoint(c); 848 if (c === CLOSE_BRACKET) { 849 this.state = S_DOCTYPE; 850 } 851 else if (c === LESS) { 852 this.state = S_DTD_OPEN_WAKA; 853 } 854 else if (isQuote(c)) { 855 this.state = S_DTD_QUOTED; 856 this.q = c; 857 } 858 } 859 sDTDQuoted() { 860 const q = this.q; 861 if (this.captureToChar(q)) { 862 this.text += String.fromCodePoint(q); 863 this.state = S_DTD; 864 this.q = null; 865 } 866 } 867 sDTDOpenWaka() { 868 const c = this.getCodeNorm(); 869 this.text += String.fromCodePoint(c); 870 switch (c) { 871 case BANG: 
872 this.state = S_DTD_OPEN_WAKA_BANG; 873 this.openWakaBang = ""; 874 break; 875 case QUESTION: 876 this.state = S_DTD_PI; 877 break; 878 default: 879 this.state = S_DTD; 880 } 881 } 882 sDTDOpenWakaBang() { 883 const char = String.fromCodePoint(this.getCodeNorm()); 884 const owb = this.openWakaBang += char; 885 this.text += char; 886 if (owb !== "-") { 887 this.state = owb === "--" ? S_DTD_COMMENT : S_DTD; 888 this.openWakaBang = ""; 889 } 890 } 891 sDTDComment() { 892 if (this.captureToChar(MINUS)) { 893 this.text += "-"; 894 this.state = S_DTD_COMMENT_ENDING; 895 } 896 } 897 sDTDCommentEnding() { 898 const c = this.getCodeNorm(); 899 this.text += String.fromCodePoint(c); 900 this.state = c === MINUS ? S_DTD_COMMENT_ENDED : S_DTD_COMMENT; 901 } 902 sDTDCommentEnded() { 903 const c = this.getCodeNorm(); 904 this.text += String.fromCodePoint(c); 905 if (c === GREATER) { 906 this.state = S_DTD; 907 } 908 else { 909 this.fail("malformed comment."); 910 // <!-- blah -- bloo --> will be recorded as 911 // a comment of " blah -- bloo " 912 this.state = S_DTD_COMMENT; 913 } 914 } 915 sDTDPI() { 916 if (this.captureToChar(QUESTION)) { 917 this.text += "?"; 918 this.state = S_DTD_PI_ENDING; 919 } 920 } 921 sDTDPIEnding() { 922 const c = this.getCodeNorm(); 923 this.text += String.fromCodePoint(c); 924 if (c === GREATER) { 925 this.state = S_DTD; 926 } 927 } 928 sText() { 929 // 930 // We did try a version of saxes where the S_TEXT state was split in two 931 // states: one for text inside the root element, and one for text 932 // outside. This was avoiding having to test this.tags.length to decide 933 // what implementation to actually use. 934 // 935 // Peformance testing on gigabyte-size files did not show any advantage to 936 // using the two states solution instead of the current one. Conversely, it 937 // made the code a bit more complicated elsewhere. 
For instance, a comment 938 // can appear before the root element so when a comment ended it was 939 // necessary to determine whether to return to the S_TEXT state or to the 940 // new text-outside-root state. 941 // 942 if (this.tags.length !== 0) { 943 this.handleTextInRoot(); 944 } 945 else { 946 this.handleTextOutsideRoot(); 947 } 948 } 949 sEntity() { 950 // This is essentially a specialized version of captureToChar(SEMICOLON...) 951 let { i: start } = this; 952 const { chunk } = this; 953 // eslint-disable-next-line no-labels, no-restricted-syntax 954 loop: 955 // eslint-disable-next-line no-constant-condition 956 while (true) { 957 switch (this.getCode()) { 958 case NL_LIKE: 959 this.entity += `${chunk.slice(start, this.prevI)}\n`; 960 start = this.i; 961 break; 962 case SEMICOLON: { 963 const { entityReturnState } = this; 964 const entity = this.entity + chunk.slice(start, this.prevI); 965 this.state = entityReturnState; 966 let parsed; 967 if (entity === "") { 968 this.fail("empty entity name."); 969 parsed = "&;"; 970 } 971 else { 972 parsed = this.parseEntity(entity); 973 this.entity = ""; 974 } 975 if (entityReturnState !== S_TEXT || this.textHandler !== undefined) { 976 this.text += parsed; 977 } 978 // eslint-disable-next-line no-labels 979 break loop; 980 } 981 case EOC: 982 this.entity += chunk.slice(start); 983 // eslint-disable-next-line no-labels 984 break loop; 985 default: 986 } 987 } 988 } 989 sOpenWaka() { 990 // Reminder: a state handler is called with at least one character 991 // available in the current chunk. So the first call to get code inside of 992 // a state handler cannot return ``EOC``. That's why we don't test 993 // for it. 994 const c = this.getCode(); 995 // either a /, ?, !, or text is coming next. 
996 if (isNameStartChar(c)) { 997 this.state = S_OPEN_TAG; 998 this.unget(); 999 this.xmlDeclPossible = false; 1000 } 1001 else { 1002 switch (c) { 1003 case FORWARD_SLASH: 1004 this.state = S_CLOSE_TAG; 1005 this.xmlDeclPossible = false; 1006 break; 1007 case BANG: 1008 this.state = S_OPEN_WAKA_BANG; 1009 this.openWakaBang = ""; 1010 this.xmlDeclPossible = false; 1011 break; 1012 case QUESTION: 1013 this.state = S_PI_FIRST_CHAR; 1014 break; 1015 default: 1016 this.fail("disallowed character in tag name"); 1017 this.state = S_TEXT; 1018 this.xmlDeclPossible = false; 1019 } 1020 } 1021 } 1022 sOpenWakaBang() { 1023 this.openWakaBang += String.fromCodePoint(this.getCodeNorm()); 1024 switch (this.openWakaBang) { 1025 case "[CDATA[": 1026 if (!this.sawRoot && !this.reportedTextBeforeRoot) { 1027 this.fail("text data outside of root node."); 1028 this.reportedTextBeforeRoot = true; 1029 } 1030 if (this.closedRoot && !this.reportedTextAfterRoot) { 1031 this.fail("text data outside of root node."); 1032 this.reportedTextAfterRoot = true; 1033 } 1034 this.state = S_CDATA; 1035 this.openWakaBang = ""; 1036 break; 1037 case "--": 1038 this.state = S_COMMENT; 1039 this.openWakaBang = ""; 1040 break; 1041 case "DOCTYPE": 1042 this.state = S_DOCTYPE; 1043 if (this.doctype || this.sawRoot) { 1044 this.fail("inappropriately located doctype declaration."); 1045 } 1046 this.openWakaBang = ""; 1047 break; 1048 default: 1049 // 7 happens to be the maximum length of the string that can possibly 1050 // match one of the cases above. 1051 if (this.openWakaBang.length >= 7) { 1052 this.fail("incorrect syntax."); 1053 } 1054 } 1055 } 1056 sComment() { 1057 if (this.captureToChar(MINUS)) { 1058 this.state = S_COMMENT_ENDING; 1059 } 1060 } 1061 sCommentEnding() { 1062 var _a; 1063 const c = this.getCodeNorm(); 1064 if (c === MINUS) { 1065 this.state = S_COMMENT_ENDED; 1066 // eslint-disable-next-line no-unused-expressions 1067 (_a = this.commentHandler) === null || _a === void 0 ? 
void 0 : _a.call(this, this.text); 1068 this.text = ""; 1069 } 1070 else { 1071 this.text += `-${String.fromCodePoint(c)}`; 1072 this.state = S_COMMENT; 1073 } 1074 } 1075 sCommentEnded() { 1076 const c = this.getCodeNorm(); 1077 if (c !== GREATER) { 1078 this.fail("malformed comment."); 1079 // <!-- blah -- bloo --> will be recorded as 1080 // a comment of " blah -- bloo " 1081 this.text += `--${String.fromCodePoint(c)}`; 1082 this.state = S_COMMENT; 1083 } 1084 else { 1085 this.state = S_TEXT; 1086 } 1087 } 1088 sCData() { 1089 if (this.captureToChar(CLOSE_BRACKET)) { 1090 this.state = S_CDATA_ENDING; 1091 } 1092 } 1093 sCDataEnding() { 1094 const c = this.getCodeNorm(); 1095 if (c === CLOSE_BRACKET) { 1096 this.state = S_CDATA_ENDING_2; 1097 } 1098 else { 1099 this.text += `]${String.fromCodePoint(c)}`; 1100 this.state = S_CDATA; 1101 } 1102 } 1103 sCDataEnding2() { 1104 var _a; 1105 const c = this.getCodeNorm(); 1106 switch (c) { 1107 case GREATER: { 1108 // eslint-disable-next-line no-unused-expressions 1109 (_a = this.cdataHandler) === null || _a === void 0 ? void 0 : _a.call(this, this.text); 1110 this.text = ""; 1111 this.state = S_TEXT; 1112 break; 1113 } 1114 case CLOSE_BRACKET: 1115 this.text += "]"; 1116 break; 1117 default: 1118 this.text += `]]${String.fromCodePoint(c)}`; 1119 this.state = S_CDATA; 1120 } 1121 } 1122 // We need this separate state to check the first character fo the pi target 1123 // with this.nameStartCheck which allows less characters than this.nameCheck. 1124 sPIFirstChar() { 1125 const c = this.getCodeNorm(); 1126 // This is first because in the case where the file is well-formed this is 1127 // the branch taken. We optimize for well-formedness. 1128 if (this.nameStartCheck(c)) { 1129 this.piTarget += String.fromCodePoint(c); 1130 this.state = S_PI_REST; 1131 } 1132 else if (c === QUESTION || isS(c)) { 1133 this.fail("processing instruction without a target."); 1134 this.state = c === QUESTION ? 
S_PI_ENDING : S_PI_BODY; 1135 } 1136 else { 1137 this.fail("disallowed character in processing instruction name."); 1138 this.piTarget += String.fromCodePoint(c); 1139 this.state = S_PI_REST; 1140 } 1141 } 1142 sPIRest() { 1143 // Capture characters into a piTarget while ``this.nameCheck`` run on the 1144 // character read returns true. 1145 const { chunk, i: start } = this; 1146 // eslint-disable-next-line no-constant-condition 1147 while (true) { 1148 const c = this.getCodeNorm(); 1149 if (c === EOC) { 1150 this.piTarget += chunk.slice(start); 1151 return; 1152 } 1153 // NL cannot satisfy this.nameCheck so we don't have to test specifically 1154 // for it. 1155 if (!this.nameCheck(c)) { 1156 this.piTarget += chunk.slice(start, this.prevI); 1157 const isQuestion = c === QUESTION; 1158 if (isQuestion || isS(c)) { 1159 if (this.piTarget === "xml") { 1160 if (!this.xmlDeclPossible) { 1161 this.fail("an XML declaration must be at the start of the document."); 1162 } 1163 this.state = isQuestion ? S_XML_DECL_ENDING : S_XML_DECL_NAME_START; 1164 } 1165 else { 1166 this.state = isQuestion ? S_PI_ENDING : S_PI_BODY; 1167 } 1168 } 1169 else { 1170 this.fail("disallowed character in processing instruction name."); 1171 this.piTarget += String.fromCodePoint(c); 1172 } 1173 break; 1174 } 1175 } 1176 } 1177 sPIBody() { 1178 if (this.text.length === 0) { 1179 const c = this.getCodeNorm(); 1180 if (c === QUESTION) { 1181 this.state = S_PI_ENDING; 1182 } 1183 else if (!isS(c)) { 1184 this.text = String.fromCodePoint(c); 1185 } 1186 } 1187 // The question mark character is not valid inside any of the XML 1188 // declaration name/value pairs. 
1189 else if (this.captureToChar(QUESTION)) { 1190 this.state = S_PI_ENDING; 1191 } 1192 } 1193 sPIEnding() { 1194 var _a; 1195 const c = this.getCodeNorm(); 1196 if (c === GREATER) { 1197 const { piTarget } = this; 1198 if (piTarget.toLowerCase() === "xml") { 1199 this.fail("the XML declaration must appear at the start of the document."); 1200 } 1201 // eslint-disable-next-line no-unused-expressions 1202 (_a = this.piHandler) === null || _a === void 0 ? void 0 : _a.call(this, { 1203 target: piTarget, 1204 body: this.text, 1205 }); 1206 this.piTarget = this.text = ""; 1207 this.state = S_TEXT; 1208 } 1209 else if (c === QUESTION) { 1210 // We ran into ?? as part of a processing instruction. We initially took 1211 // the first ? as a sign that the PI was ending, but it is not. So we have 1212 // to add it to the body but we take the new ? as a sign that the PI is 1213 // ending. 1214 this.text += "?"; 1215 } 1216 else { 1217 this.text += `?${String.fromCodePoint(c)}`; 1218 this.state = S_PI_BODY; 1219 } 1220 this.xmlDeclPossible = false; 1221 } 1222 sXMLDeclNameStart() { 1223 const c = this.skipSpaces(); 1224 // The question mark character is not valid inside any of the XML 1225 // declaration name/value pairs. 1226 if (c === QUESTION) { 1227 // It is valid to go to S_XML_DECL_ENDING from this state. 1228 this.state = S_XML_DECL_ENDING; 1229 return; 1230 } 1231 if (c !== EOC) { 1232 this.state = S_XML_DECL_NAME; 1233 this.name = String.fromCodePoint(c); 1234 } 1235 } 1236 sXMLDeclName() { 1237 const c = this.captureTo(XML_DECL_NAME_TERMINATOR); 1238 // The question mark character is not valid inside any of the XML 1239 // declaration name/value pairs. 
1240 if (c === QUESTION) { 1241 this.state = S_XML_DECL_ENDING; 1242 this.name += this.text; 1243 this.text = ""; 1244 this.fail("XML declaration is incomplete."); 1245 return; 1246 } 1247 if (!(isS(c) || c === EQUAL)) { 1248 return; 1249 } 1250 this.name += this.text; 1251 this.text = ""; 1252 if (!this.xmlDeclExpects.includes(this.name)) { 1253 switch (this.name.length) { 1254 case 0: 1255 this.fail("did not expect any more name/value pairs."); 1256 break; 1257 case 1: 1258 this.fail(`expected the name ${this.xmlDeclExpects[0]}.`); 1259 break; 1260 default: 1261 this.fail(`expected one of ${this.xmlDeclExpects.join(", ")}`); 1262 } 1263 } 1264 this.state = c === EQUAL ? S_XML_DECL_VALUE_START : S_XML_DECL_EQ; 1265 } 1266 sXMLDeclEq() { 1267 const c = this.getCodeNorm(); 1268 // The question mark character is not valid inside any of the XML 1269 // declaration name/value pairs. 1270 if (c === QUESTION) { 1271 this.state = S_XML_DECL_ENDING; 1272 this.fail("XML declaration is incomplete."); 1273 return; 1274 } 1275 if (isS(c)) { 1276 return; 1277 } 1278 if (c !== EQUAL) { 1279 this.fail("value required."); 1280 } 1281 this.state = S_XML_DECL_VALUE_START; 1282 } 1283 sXMLDeclValueStart() { 1284 const c = this.getCodeNorm(); 1285 // The question mark character is not valid inside any of the XML 1286 // declaration name/value pairs. 1287 if (c === QUESTION) { 1288 this.state = S_XML_DECL_ENDING; 1289 this.fail("XML declaration is incomplete."); 1290 return; 1291 } 1292 if (isS(c)) { 1293 return; 1294 } 1295 if (!isQuote(c)) { 1296 this.fail("value must be quoted."); 1297 this.q = SPACE; 1298 } 1299 else { 1300 this.q = c; 1301 } 1302 this.state = S_XML_DECL_VALUE; 1303 } 1304 sXMLDeclValue() { 1305 const c = this.captureTo([this.q, QUESTION]); 1306 // The question mark character is not valid inside any of the XML 1307 // declaration name/value pairs. 
1308 if (c === QUESTION) { 1309 this.state = S_XML_DECL_ENDING; 1310 this.text = ""; 1311 this.fail("XML declaration is incomplete."); 1312 return; 1313 } 1314 if (c === EOC) { 1315 return; 1316 } 1317 const value = this.text; 1318 this.text = ""; 1319 switch (this.name) { 1320 case "version": { 1321 this.xmlDeclExpects = ["encoding", "standalone"]; 1322 const version = value; 1323 this.xmlDecl.version = version; 1324 // This is the test specified by XML 1.0 but it is fine for XML 1.1. 1325 if (!/^1\.[0-9]+$/.test(version)) { 1326 this.fail("version number must match /^1\\.[0-9]+$/."); 1327 } 1328 // When forceXMLVersion is set, the XML declaration is ignored. 1329 else if (!this.opt.forceXMLVersion) { 1330 this.setXMLVersion(version); 1331 } 1332 break; 1333 } 1334 case "encoding": 1335 if (!/^[A-Za-z][A-Za-z0-9._-]*$/.test(value)) { 1336 this.fail("encoding value must match \ 1337 /^[A-Za-z0-9][A-Za-z0-9._-]*$/."); 1338 } 1339 this.xmlDeclExpects = ["standalone"]; 1340 this.xmlDecl.encoding = value; 1341 break; 1342 case "standalone": 1343 if (value !== "yes" && value !== "no") { 1344 this.fail("standalone value must match \"yes\" or \"no\"."); 1345 } 1346 this.xmlDeclExpects = []; 1347 this.xmlDecl.standalone = value; 1348 break; 1349 default: 1350 // We don't need to raise an error here since we've already raised one 1351 // when checking what name was expected. 1352 } 1353 this.name = ""; 1354 this.state = S_XML_DECL_SEPARATOR; 1355 } 1356 sXMLDeclSeparator() { 1357 const c = this.getCodeNorm(); 1358 // The question mark character is not valid inside any of the XML 1359 // declaration name/value pairs. 1360 if (c === QUESTION) { 1361 // It is valid to go to S_XML_DECL_ENDING from this state. 
1362 this.state = S_XML_DECL_ENDING; 1363 return; 1364 } 1365 if (!isS(c)) { 1366 this.fail("whitespace required."); 1367 this.unget(); 1368 } 1369 this.state = S_XML_DECL_NAME_START; 1370 } 1371 sXMLDeclEnding() { 1372 var _a; 1373 const c = this.getCodeNorm(); 1374 if (c === GREATER) { 1375 if (this.piTarget !== "xml") { 1376 this.fail("processing instructions are not allowed before root."); 1377 } 1378 else if (this.name !== "version" && 1379 this.xmlDeclExpects.includes("version")) { 1380 this.fail("XML declaration must contain a version."); 1381 } 1382 // eslint-disable-next-line no-unused-expressions 1383 (_a = this.xmldeclHandler) === null || _a === void 0 ? void 0 : _a.call(this, this.xmlDecl); 1384 this.name = ""; 1385 this.piTarget = this.text = ""; 1386 this.state = S_TEXT; 1387 } 1388 else { 1389 // We got here because the previous character was a ?, but the question 1390 // mark character is not valid inside any of the XML declaration 1391 // name/value pairs. 1392 this.fail("The character ? is disallowed anywhere in XML declarations."); 1393 } 1394 this.xmlDeclPossible = false; 1395 } 1396 sOpenTag() { 1397 var _a; 1398 const c = this.captureNameChars(); 1399 if (c === EOC) { 1400 return; 1401 } 1402 const tag = this.tag = { 1403 name: this.name, 1404 attributes: Object.create(null), 1405 }; 1406 this.name = ""; 1407 if (this.xmlnsOpt) { 1408 this.topNS = tag.ns = Object.create(null); 1409 } 1410 // eslint-disable-next-line no-unused-expressions 1411 (_a = this.openTagStartHandler) === null || _a === void 0 ? 
void 0 : _a.call(this, tag); 1412 this.sawRoot = true; 1413 if (!this.fragmentOpt && this.closedRoot) { 1414 this.fail("documents may contain only one root."); 1415 } 1416 switch (c) { 1417 case GREATER: 1418 this.openTag(); 1419 break; 1420 case FORWARD_SLASH: 1421 this.state = S_OPEN_TAG_SLASH; 1422 break; 1423 default: 1424 if (!isS(c)) { 1425 this.fail("disallowed character in tag name."); 1426 } 1427 this.state = S_ATTRIB; 1428 } 1429 } 1430 sOpenTagSlash() { 1431 if (this.getCode() === GREATER) { 1432 this.openSelfClosingTag(); 1433 } 1434 else { 1435 this.fail("forward-slash in opening tag not followed by >."); 1436 this.state = S_ATTRIB; 1437 } 1438 } 1439 sAttrib() { 1440 const c = this.skipSpaces(); 1441 if (c === EOC) { 1442 return; 1443 } 1444 if (isNameStartChar(c)) { 1445 this.unget(); 1446 this.state = S_ATTRIB_NAME; 1447 } 1448 else if (c === GREATER) { 1449 this.openTag(); 1450 } 1451 else if (c === FORWARD_SLASH) { 1452 this.state = S_OPEN_TAG_SLASH; 1453 } 1454 else { 1455 this.fail("disallowed character in attribute name."); 1456 } 1457 } 1458 sAttribName() { 1459 const c = this.captureNameChars(); 1460 if (c === EQUAL) { 1461 this.state = S_ATTRIB_VALUE; 1462 } 1463 else if (isS(c)) { 1464 this.state = S_ATTRIB_NAME_SAW_WHITE; 1465 } 1466 else if (c === GREATER) { 1467 this.fail("attribute without value."); 1468 this.pushAttrib(this.name, this.name); 1469 this.name = this.text = ""; 1470 this.openTag(); 1471 } 1472 else if (c !== EOC) { 1473 this.fail("disallowed character in attribute name."); 1474 } 1475 } 1476 sAttribNameSawWhite() { 1477 const c = this.skipSpaces(); 1478 switch (c) { 1479 case EOC: 1480 return; 1481 case EQUAL: 1482 this.state = S_ATTRIB_VALUE; 1483 break; 1484 default: 1485 this.fail("attribute without value."); 1486 // Should we do this??? 
1487 // this.tag.attributes[this.name] = ""; 1488 this.text = ""; 1489 this.name = ""; 1490 if (c === GREATER) { 1491 this.openTag(); 1492 } 1493 else if (isNameStartChar(c)) { 1494 this.unget(); 1495 this.state = S_ATTRIB_NAME; 1496 } 1497 else { 1498 this.fail("disallowed character in attribute name."); 1499 this.state = S_ATTRIB; 1500 } 1501 } 1502 } 1503 sAttribValue() { 1504 const c = this.getCodeNorm(); 1505 if (isQuote(c)) { 1506 this.q = c; 1507 this.state = S_ATTRIB_VALUE_QUOTED; 1508 } 1509 else if (!isS(c)) { 1510 this.fail("unquoted attribute value."); 1511 this.state = S_ATTRIB_VALUE_UNQUOTED; 1512 this.unget(); 1513 } 1514 } 1515 sAttribValueQuoted() { 1516 // We deliberately do not use captureTo here. The specialized code we use 1517 // here is faster than using captureTo. 1518 const { q, chunk } = this; 1519 let { i: start } = this; 1520 // eslint-disable-next-line no-constant-condition 1521 while (true) { 1522 switch (this.getCode()) { 1523 case q: 1524 this.pushAttrib(this.name, this.text + chunk.slice(start, this.prevI)); 1525 this.name = this.text = ""; 1526 this.q = null; 1527 this.state = S_ATTRIB_VALUE_CLOSED; 1528 return; 1529 case AMP: 1530 this.text += chunk.slice(start, this.prevI); 1531 this.state = S_ENTITY; 1532 this.entityReturnState = S_ATTRIB_VALUE_QUOTED; 1533 return; 1534 case NL: 1535 case NL_LIKE: 1536 case TAB: 1537 this.text += `${chunk.slice(start, this.prevI)} `; 1538 start = this.i; 1539 break; 1540 case LESS: 1541 this.text += chunk.slice(start, this.prevI); 1542 this.fail("disallowed character."); 1543 return; 1544 case EOC: 1545 this.text += chunk.slice(start); 1546 return; 1547 default: 1548 } 1549 } 1550 } 1551 sAttribValueClosed() { 1552 const c = this.getCodeNorm(); 1553 if (isS(c)) { 1554 this.state = S_ATTRIB; 1555 } 1556 else if (c === GREATER) { 1557 this.openTag(); 1558 } 1559 else if (c === FORWARD_SLASH) { 1560 this.state = S_OPEN_TAG_SLASH; 1561 } 1562 else if (isNameStartChar(c)) { 1563 this.fail("no 
whitespace between attributes."); 1564 this.unget(); 1565 this.state = S_ATTRIB_NAME; 1566 } 1567 else { 1568 this.fail("disallowed character in attribute name."); 1569 } 1570 } 1571 sAttribValueUnquoted() { 1572 // We don't do anything regarding EOL or space handling for unquoted 1573 // attributes. We already have failed by the time we get here, and the 1574 // contract that saxes upholds states that upon failure, it is not safe to 1575 // rely on the data passed to event handlers (other than 1576 // ``onerror``). Passing "bad" data is not a problem. 1577 const c = this.captureTo(ATTRIB_VALUE_UNQUOTED_TERMINATOR); 1578 switch (c) { 1579 case AMP: 1580 this.state = S_ENTITY; 1581 this.entityReturnState = S_ATTRIB_VALUE_UNQUOTED; 1582 break; 1583 case LESS: 1584 this.fail("disallowed character."); 1585 break; 1586 case EOC: 1587 break; 1588 default: 1589 if (this.text.includes("]]>")) { 1590 this.fail("the string \"]]>\" is disallowed in char data."); 1591 } 1592 this.pushAttrib(this.name, this.text); 1593 this.name = this.text = ""; 1594 if (c === GREATER) { 1595 this.openTag(); 1596 } 1597 else { 1598 this.state = S_ATTRIB; 1599 } 1600 } 1601 } 1602 sCloseTag() { 1603 const c = this.captureNameChars(); 1604 if (c === GREATER) { 1605 this.closeTag(); 1606 } 1607 else if (isS(c)) { 1608 this.state = S_CLOSE_TAG_SAW_WHITE; 1609 } 1610 else if (c !== EOC) { 1611 this.fail("disallowed character in closing tag."); 1612 } 1613 } 1614 sCloseTagSawWhite() { 1615 switch (this.skipSpaces()) { 1616 case GREATER: 1617 this.closeTag(); 1618 break; 1619 case EOC: 1620 break; 1621 default: 1622 this.fail("disallowed character in closing tag."); 1623 } 1624 } 1625 // END OF STATE ENGINE METHODS 1626 handleTextInRoot() { 1627 // This is essentially a specialized version of captureTo which is optimized 1628 // for performing the ]]> check. A previous version of this code, checked 1629 // ``this.text`` for the presence of ]]>. 
It simplified the code but was 1630 // very costly when character data contained a lot of entities to be parsed. 1631 // 1632 // Since we are using a specialized loop, we also keep track of the presence 1633 // of ]]> in text data. The sequence ]]> is forbidden to appear as-is. 1634 // 1635 let { i: start, forbiddenState } = this; 1636 const { chunk, textHandler: handler } = this; 1637 // eslint-disable-next-line no-labels, no-restricted-syntax 1638 scanLoop: 1639 // eslint-disable-next-line no-constant-condition 1640 while (true) { 1641 switch (this.getCode()) { 1642 case LESS: { 1643 this.state = S_OPEN_WAKA; 1644 if (handler !== undefined) { 1645 const { text } = this; 1646 const slice = chunk.slice(start, this.prevI); 1647 if (text.length !== 0) { 1648 handler(text + slice); 1649 this.text = ""; 1650 } 1651 else if (slice.length !== 0) { 1652 handler(slice); 1653 } 1654 } 1655 forbiddenState = FORBIDDEN_START; 1656 // eslint-disable-next-line no-labels 1657 break scanLoop; 1658 } 1659 case AMP: 1660 this.state = S_ENTITY; 1661 this.entityReturnState = S_TEXT; 1662 if (handler !== undefined) { 1663 this.text += chunk.slice(start, this.prevI); 1664 } 1665 forbiddenState = FORBIDDEN_START; 1666 // eslint-disable-next-line no-labels 1667 break scanLoop; 1668 case CLOSE_BRACKET: 1669 switch (forbiddenState) { 1670 case FORBIDDEN_START: 1671 forbiddenState = FORBIDDEN_BRACKET; 1672 break; 1673 case FORBIDDEN_BRACKET: 1674 forbiddenState = FORBIDDEN_BRACKET_BRACKET; 1675 break; 1676 case FORBIDDEN_BRACKET_BRACKET: 1677 break; 1678 default: 1679 throw new Error("impossible state"); 1680 } 1681 break; 1682 case GREATER: 1683 if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) { 1684 this.fail("the string \"]]>\" is disallowed in char data."); 1685 } 1686 forbiddenState = FORBIDDEN_START; 1687 break; 1688 case NL_LIKE: 1689 if (handler !== undefined) { 1690 this.text += `${chunk.slice(start, this.prevI)}\n`; 1691 } 1692 start = this.i; 1693 forbiddenState = 
FORBIDDEN_START; 1694 break; 1695 case EOC: 1696 if (handler !== undefined) { 1697 this.text += chunk.slice(start); 1698 } 1699 // eslint-disable-next-line no-labels 1700 break scanLoop; 1701 default: 1702 forbiddenState = FORBIDDEN_START; 1703 } 1704 } 1705 this.forbiddenState = forbiddenState; 1706 } 1707 handleTextOutsideRoot() { 1708 // This is essentially a specialized version of captureTo which is optimized 1709 // for a specialized task. We keep track of the presence of non-space 1710 // characters in the text since these are errors when appearing outside the 1711 // document root element. 1712 let { i: start } = this; 1713 const { chunk, textHandler: handler } = this; 1714 let nonSpace = false; 1715 // eslint-disable-next-line no-labels, no-restricted-syntax 1716 outRootLoop: 1717 // eslint-disable-next-line no-constant-condition 1718 while (true) { 1719 const code = this.getCode(); 1720 switch (code) { 1721 case LESS: { 1722 this.state = S_OPEN_WAKA; 1723 if (handler !== undefined) { 1724 const { text } = this; 1725 const slice = chunk.slice(start, this.prevI); 1726 if (text.length !== 0) { 1727 handler(text + slice); 1728 this.text = ""; 1729 } 1730 else if (slice.length !== 0) { 1731 handler(slice); 1732 } 1733 } 1734 // eslint-disable-next-line no-labels 1735 break outRootLoop; 1736 } 1737 case AMP: 1738 this.state = S_ENTITY; 1739 this.entityReturnState = S_TEXT; 1740 if (handler !== undefined) { 1741 this.text += chunk.slice(start, this.prevI); 1742 } 1743 nonSpace = true; 1744 // eslint-disable-next-line no-labels 1745 break outRootLoop; 1746 case NL_LIKE: 1747 if (handler !== undefined) { 1748 this.text += `${chunk.slice(start, this.prevI)}\n`; 1749 } 1750 start = this.i; 1751 break; 1752 case EOC: 1753 if (handler !== undefined) { 1754 this.text += chunk.slice(start); 1755 } 1756 // eslint-disable-next-line no-labels 1757 break outRootLoop; 1758 default: 1759 if (!isS(code)) { 1760 nonSpace = true; 1761 } 1762 } 1763 } 1764 if (!nonSpace) { 1765 
return; 1766 } 1767 // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags 1768 // to avoid reporting errors for every single character that is out of 1769 // place. 1770 if (!this.sawRoot && !this.reportedTextBeforeRoot) { 1771 this.fail("text data outside of root node."); 1772 this.reportedTextBeforeRoot = true; 1773 } 1774 if (this.closedRoot && !this.reportedTextAfterRoot) { 1775 this.fail("text data outside of root node."); 1776 this.reportedTextAfterRoot = true; 1777 } 1778 } 1779 pushAttribNS(name, value) { 1780 var _a; 1781 const { prefix, local } = this.qname(name); 1782 const attr = { name, prefix, local, value }; 1783 this.attribList.push(attr); 1784 // eslint-disable-next-line no-unused-expressions 1785 (_a = this.attributeHandler) === null || _a === void 0 ? void 0 : _a.call(this, attr); 1786 if (prefix === "xmlns") { 1787 const trimmed = value.trim(); 1788 if (this.currentXMLVersion === "1.0" && trimmed === "") { 1789 this.fail("invalid attempt to undefine prefix in XML 1.0"); 1790 } 1791 this.topNS[local] = trimmed; 1792 nsPairCheck(this, local, trimmed); 1793 } 1794 else if (name === "xmlns") { 1795 const trimmed = value.trim(); 1796 this.topNS[""] = trimmed; 1797 nsPairCheck(this, "", trimmed); 1798 } 1799 } 1800 pushAttribPlain(name, value) { 1801 var _a; 1802 const attr = { name, value }; 1803 this.attribList.push(attr); 1804 // eslint-disable-next-line no-unused-expressions 1805 (_a = this.attributeHandler) === null || _a === void 0 ? void 0 : _a.call(this, attr); 1806 } 1807 /** 1808 * End parsing. This performs final well-formedness checks and resets the 1809 * parser to a clean state. 
1810 * 1811 * @returns this 1812 */ 1813 end() { 1814 var _a, _b; 1815 if (!this.sawRoot) { 1816 this.fail("document must contain a root element."); 1817 } 1818 const { tags } = this; 1819 while (tags.length > 0) { 1820 const tag = tags.pop(); 1821 this.fail(`unclosed tag: ${tag.name}`); 1822 } 1823 if ((this.state !== S_BEGIN) && (this.state !== S_TEXT)) { 1824 this.fail("unexpected end."); 1825 } 1826 const { text } = this; 1827 if (text.length !== 0) { 1828 // eslint-disable-next-line no-unused-expressions 1829 (_a = this.textHandler) === null || _a === void 0 ? void 0 : _a.call(this, text); 1830 this.text = ""; 1831 } 1832 this._closed = true; 1833 // eslint-disable-next-line no-unused-expressions 1834 (_b = this.endHandler) === null || _b === void 0 ? void 0 : _b.call(this); 1835 this._init(); 1836 return this; 1837 } 1838 /** 1839 * Resolve a namespace prefix. 1840 * 1841 * @param prefix The prefix to resolve. 1842 * 1843 * @returns The namespace URI or ``undefined`` if the prefix is not defined. 1844 */ 1845 resolve(prefix) { 1846 var _a, _b; 1847 let uri = this.topNS[prefix]; 1848 if (uri !== undefined) { 1849 return uri; 1850 } 1851 const { tags } = this; 1852 for (let index = tags.length - 1; index >= 0; index--) { 1853 uri = tags[index].ns[prefix]; 1854 if (uri !== undefined) { 1855 return uri; 1856 } 1857 } 1858 uri = this.ns[prefix]; 1859 if (uri !== undefined) { 1860 return uri; 1861 } 1862 return (_b = (_a = this.opt).resolvePrefix) === null || _b === void 0 ? void 0 : _b.call(_a, prefix); 1863 } 1864 /** 1865 * Parse a qname into its prefix and local name parts. 1866 * 1867 * @param name The name to parse 1868 * 1869 * @returns 1870 */ 1871 qname(name) { 1872 // This is faster than using name.split(":"). 
1873 const colon = name.indexOf(":"); 1874 if (colon === -1) { 1875 return { prefix: "", local: name }; 1876 } 1877 const local = name.slice(colon + 1); 1878 const prefix = name.slice(0, colon); 1879 if (prefix === "" || local === "" || local.includes(":")) { 1880 this.fail(`malformed name: ${name}.`); 1881 } 1882 return { prefix, local }; 1883 } 1884 processAttribsNS() { 1885 var _a; 1886 const { attribList } = this; 1887 const tag = this.tag; 1888 { 1889 // add namespace info to tag 1890 const { prefix, local } = this.qname(tag.name); 1891 tag.prefix = prefix; 1892 tag.local = local; 1893 const uri = tag.uri = (_a = this.resolve(prefix)) !== null && _a !== void 0 ? _a : ""; 1894 if (prefix !== "") { 1895 if (prefix === "xmlns") { 1896 this.fail("tags may not have \"xmlns\" as prefix."); 1897 } 1898 if (uri === "") { 1899 this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`); 1900 tag.uri = prefix; 1901 } 1902 } 1903 } 1904 if (attribList.length === 0) { 1905 return; 1906 } 1907 const { attributes } = tag; 1908 const seen = new Set(); 1909 // Note: do not apply default ns to attributes: 1910 // http://www.w3.org/TR/REC-xml-names/#defaulting 1911 for (const attr of attribList) { 1912 const { name, prefix, local } = attr; 1913 let uri; 1914 let eqname; 1915 if (prefix === "") { 1916 uri = name === "xmlns" ? XMLNS_NAMESPACE : ""; 1917 eqname = name; 1918 } 1919 else { 1920 uri = this.resolve(prefix); 1921 // if there's any attributes with an undefined namespace, 1922 // then fail on them now. 
1923 if (uri === undefined) { 1924 this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`); 1925 uri = prefix; 1926 } 1927 eqname = `{${uri}}${local}`; 1928 } 1929 if (seen.has(eqname)) { 1930 this.fail(`duplicate attribute: ${eqname}.`); 1931 } 1932 seen.add(eqname); 1933 attr.uri = uri; 1934 attributes[name] = attr; 1935 } 1936 this.attribList = []; 1937 } 1938 processAttribsPlain() { 1939 const { attribList } = this; 1940 // eslint-disable-next-line prefer-destructuring 1941 const attributes = this.tag.attributes; 1942 for (const { name, value } of attribList) { 1943 if (attributes[name] !== undefined) { 1944 this.fail(`duplicate attribute: ${name}.`); 1945 } 1946 attributes[name] = value; 1947 } 1948 this.attribList = []; 1949 } 1950 /** 1951 * Handle a complete open tag. This parser code calls this once it has seen 1952 * the whole tag. This method checks for well-formeness and then emits 1953 * ``onopentag``. 1954 */ 1955 openTag() { 1956 var _a; 1957 this.processAttribs(); 1958 const { tags } = this; 1959 const tag = this.tag; 1960 tag.isSelfClosing = false; 1961 // There cannot be any pending text here due to the onopentagstart that was 1962 // necessarily emitted before we get here. So we do not check text. 1963 // eslint-disable-next-line no-unused-expressions 1964 (_a = this.openTagHandler) === null || _a === void 0 ? void 0 : _a.call(this, tag); 1965 tags.push(tag); 1966 this.state = S_TEXT; 1967 this.name = ""; 1968 } 1969 /** 1970 * Handle a complete self-closing tag. This parser code calls this once it has 1971 * seen the whole tag. This method checks for well-formeness and then emits 1972 * ``onopentag`` and ``onclosetag``. 1973 */ 1974 openSelfClosingTag() { 1975 var _a, _b, _c; 1976 this.processAttribs(); 1977 const { tags } = this; 1978 const tag = this.tag; 1979 tag.isSelfClosing = true; 1980 // There cannot be any pending text here due to the onopentagstart that was 1981 // necessarily emitted before we get here. 
So we do not check text. 1982 // eslint-disable-next-line no-unused-expressions 1983 (_a = this.openTagHandler) === null || _a === void 0 ? void 0 : _a.call(this, tag); 1984 // eslint-disable-next-line no-unused-expressions 1985 (_b = this.closeTagHandler) === null || _b === void 0 ? void 0 : _b.call(this, tag); 1986 const top = this.tag = (_c = tags[tags.length - 1]) !== null && _c !== void 0 ? _c : null; 1987 if (top === null) { 1988 this.closedRoot = true; 1989 } 1990 this.state = S_TEXT; 1991 this.name = ""; 1992 } 1993 /** 1994 * Handle a complete close tag. This parser code calls this once it has seen 1995 * the whole tag. This method checks for well-formeness and then emits 1996 * ``onclosetag``. 1997 */ 1998 closeTag() { 1999 const { tags, name } = this; 2000 // Our state after this will be S_TEXT, no matter what, and we can clear 2001 // tagName now. 2002 this.state = S_TEXT; 2003 this.name = ""; 2004 if (name === "") { 2005 this.fail("weird empty close tag."); 2006 this.text += "</>"; 2007 return; 2008 } 2009 const handler = this.closeTagHandler; 2010 let l = tags.length; 2011 while (l-- > 0) { 2012 const tag = this.tag = tags.pop(); 2013 this.topNS = tag.ns; 2014 // eslint-disable-next-line no-unused-expressions 2015 handler === null || handler === void 0 ? void 0 : handler(tag); 2016 if (tag.name === name) { 2017 break; 2018 } 2019 this.fail("unexpected close tag."); 2020 } 2021 if (l === 0) { 2022 this.closedRoot = true; 2023 } 2024 else if (l < 0) { 2025 this.fail(`unmatched closing tag: ${name}.`); 2026 this.text += `</${name}>`; 2027 } 2028 } 2029 /** 2030 * Resolves an entity. Makes any necessary well-formedness checks. 2031 * 2032 * @param entity The entity to resolve. 2033 * 2034 * @returns The parsed entity. 2035 */ 2036 parseEntity(entity) { 2037 // startsWith would be significantly slower for this test. 
2038 // eslint-disable-next-line @typescript-eslint/prefer-string-starts-ends-with 2039 if (entity[0] !== "#") { 2040 const defined = this.ENTITIES[entity]; 2041 if (defined !== undefined) { 2042 return defined; 2043 } 2044 this.fail(this.isName(entity) ? "undefined entity." : 2045 "disallowed character in entity name."); 2046 return `&${entity};`; 2047 } 2048 let num = NaN; 2049 if (entity[1] === "x" && /^#x[0-9a-f]+$/i.test(entity)) { 2050 num = parseInt(entity.slice(2), 16); 2051 } 2052 else if (/^#[0-9]+$/.test(entity)) { 2053 num = parseInt(entity.slice(1), 10); 2054 } 2055 // The character reference is required to match the CHAR production. 2056 if (!this.isChar(num)) { 2057 this.fail("malformed character entity."); 2058 return `&${entity};`; 2059 } 2060 return String.fromCodePoint(num); 2061 } 2062 } 2063 exports.SaxesParser = SaxesParser; 2064 //# sourceMappingURL=saxes.js.map