| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- // EntityEncoder.js
- import { trie1, trie2, trie3 } from './entityTries.js';
// Replacement strings indexed by char code — direct array access, no hashing.
// Only the five XML-unsafe ASCII code points are populated; every other slot
// is left empty, so consult IS_XML_UNSAFE before reading an entry.
const XML_UNSAFE_REPLACEMENT = new Array(128);
XML_UNSAFE_REPLACEMENT[38] = '&amp;'; // &
XML_UNSAFE_REPLACEMENT[60] = '&lt;'; // <
XML_UNSAFE_REPLACEMENT[62] = '&gt;'; // >
XML_UNSAFE_REPLACEMENT[34] = '&quot;'; // "
XML_UNSAFE_REPLACEMENT[39] = '&apos;'; // '

// Typed lookup table (one byte per ASCII code) for an O(1)
// "is this ASCII code XML-unsafe?" check. Must flag exactly the codes that
// have an entry in XML_UNSAFE_REPLACEMENT.
const IS_XML_UNSAFE = new Uint8Array(128);
IS_XML_UNSAFE[38] = 1; // &
IS_XML_UNSAFE[60] = 1; // <
IS_XML_UNSAFE[62] = 1; // >
IS_XML_UNSAFE[34] = 1; // "
IS_XML_UNSAFE[39] = 1; // '

// Fast pre-scan: bail out immediately if nothing needs encoding, i.e. the
// string has neither an XML-unsafe ASCII character nor any code unit >= U+0080.
// No `g`/`y` flag, so `.test()` is stateless and safe to share.
const NEEDS_PROCESSING = /[&<>"'\u0080-\uFFFF]/;
/**
 * Single-pass text encoder. It substitutes replacement strings for
 * XML-unsafe ASCII characters (via XML_UNSAFE_REPLACEMENT / IS_XML_UNSAFE) and
 * for non-ASCII sequences of one to three UTF-16 code units that are present
 * in the imported tries (`trie1`, `trie2`, `trie3`).
 *
 * The replacement counter lives on the instance and keeps accumulating across
 * `encode()` calls until `reset()` is called.
 */
export default class EntityEncoder {
  /**
   * @param {object} [options]
   * @param {boolean} [options.encodeXmlSafe=true] - Replace the ASCII
   *   characters flagged in IS_XML_UNSAFE. Disabled only by an explicit `false`.
   * @param {boolean} [options.encodeAllNamed=true] - Consult `trie1` for
   *   single-code-unit matches. Disabled only by an explicit `false`. The two-
   *   and three-unit lookups are not gated by this flag.
   * @param {number} [options.maxReplacements=0] - Upper bound on replacements
   *   counted across calls; `0` (or any falsy value) means no limit.
   */
  constructor(options = {}) {
    // A default parameter only covers `undefined`; tolerate `null` as well.
    const opts = options || {};
    this.encodeXmlSafe = opts.encodeXmlSafe !== false;
    this.encodeAllNamed = opts.encodeAllNamed !== false;
    this.maxReplacements = opts.maxReplacements || 0;
    this.replacementsCount = 0;
  }

  /**
   * Encode `str`, returning a new string with every match replaced.
   *
   * Non-string or empty input is returned as-is. The input is also returned
   * unchanged when the pre-scan finds nothing to encode, when the replacement
   * limit was already reached before this call, or when no replacement ends
   * up being made. Once the limit is hit mid-string, the rest of the input is
   * copied through verbatim.
   *
   * @param {*} str - Text to encode.
   * @returns {*} The encoded string, or the original value when untouched.
   */
  encode(str) {
    if (typeof str !== 'string' || str.length === 0) return str;
    if (!NEEDS_PROCESSING.test(str)) return str;

    const maxRep = this.maxReplacements;
    if (maxRep > 0 && this.replacementsCount >= maxRep) return str;

    // Hoist to locals — avoids `this` property lookups inside the hot loop.
    const encodeXmlSafe = this.encodeXmlSafe;
    const encodeAllNamed = this.encodeAllNamed;
    const len = str.length;

    let result = '';
    let last = 0; // start of the literal run not yet copied into `result`
    let i = 0;

    while (i < len) {
      const c0 = str.charCodeAt(i);
      let replacement;
      let advance = 1;

      if (c0 < 128) {
        if (encodeXmlSafe && IS_XML_UNSAFE[c0] === 1) {
          replacement = XML_UNSAFE_REPLACEMENT[c0];
        }
      } else {
        // Longest match wins: 3 units, then 2, then 1. Each multi-unit lookup
        // is attempted only when enough code units remain, so charCodeAt is
        // never asked for an index past the end of the string.
        if (i + 2 < len) {
          const mid3 = trie3.get(c0);
          if (mid3 !== undefined) {
            const inner3 = mid3.get(str.charCodeAt(i + 1));
            if (inner3 !== undefined) {
              const candidate = inner3.get(str.charCodeAt(i + 2));
              if (candidate !== undefined) {
                replacement = candidate;
                advance = 3;
              }
            }
          }
        }
        if (replacement === undefined && i + 1 < len) {
          const inner2 = trie2.get(c0);
          if (inner2 !== undefined) {
            const candidate = inner2.get(str.charCodeAt(i + 1));
            if (candidate !== undefined) {
              replacement = candidate;
              advance = 2;
            }
          }
        }
        if (replacement === undefined && encodeAllNamed) {
          replacement = trie1.get(c0);
        }
      }

      if (replacement === undefined) {
        i++;
        continue;
      }

      // Copy the pending literal run, then the replacement.
      result += str.substring(last, i) + replacement;
      i += advance;
      last = i;

      if (maxRep > 0) {
        this.replacementsCount++;
        if (this.replacementsCount >= maxRep) break;
      }
    }

    // `last` only moves when a replacement happened; 0 means nothing changed.
    if (last === 0) return str;

    // Flush the remaining literal suffix (also the unprocessed remainder when
    // the replacement limit stopped the scan early).
    if (last < len) result += str.substring(last);
    return result;
  }

  /** Reset the replacement counter so the `maxReplacements` budget starts over. */
  reset() {
    this.replacementsCount = 0;
  }
}
|