EntityEncoder.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  // EntityEncoder.js
  2. import { trie1, trie2, trie3 } from './entityTries.js';
  // Replacement strings indexed by char code — direct array access, no hashing.
  // Only the five XML-special ASCII characters get an entry; all other slots
  // stay empty and must not be read without consulting IS_XML_UNSAFE first.
  const XML_UNSAFE_REPLACEMENT = new Array(128);
  XML_UNSAFE_REPLACEMENT[38] = '&amp;'; // &  (must be escaped itself, otherwise output is not XML-safe)
  XML_UNSAFE_REPLACEMENT[60] = '&lt;'; // <
  XML_UNSAFE_REPLACEMENT[62] = '&gt;'; // >
  XML_UNSAFE_REPLACEMENT[34] = '&quot;'; // "
  XML_UNSAFE_REPLACEMENT[39] = '&apos;'; // '

  // Typed bitmask for O(1) "is this ASCII code XML-unsafe?" check.
  // Keep the set of flagged codes identical to the codes populated above.
  const IS_XML_UNSAFE = new Uint8Array(128);
  IS_XML_UNSAFE[38] = 1; // &
  IS_XML_UNSAFE[60] = 1; // <
  IS_XML_UNSAFE[62] = 1; // >
  IS_XML_UNSAFE[34] = 1; // "
  IS_XML_UNSAFE[39] = 1; // '

  // Fast pre-scan: bail out immediately if nothing needs encoding.
  // Matches any of the five XML-special characters or any UTF-16 code unit
  // >= 0x80. Deliberately has no `g`/`y` flag, so `.test()` is stateless.
  const NEEDS_PROCESSING = /[&<>"'\u0080-\uFFFF]/;
  /**
   * Encodes a string by replacing XML-special ASCII characters and non-ASCII
   * sequences found in the entity tries with entity strings.
   *
   * Lookup tables read by this class (all keyed by UTF-16 code unit via `.get`):
   *   - `trie1`: code unit -> entity string
   *   - `trie2`: first unit -> (second unit -> entity string)
   *   - `trie3`: first unit -> (second unit -> (third unit -> entity string))
   *
   * The replacement counter lives on the instance and keeps accumulating across
   * `encode()` calls; call `reset()` to clear it.
   */
  export default class EntityEncoder {
    /**
     * @param {object} [options]
     * @param {boolean} [options.encodeXmlSafe] - Any value other than `false`
     *   enables replacement of the ASCII characters flagged in IS_XML_UNSAFE.
     * @param {boolean} [options.encodeAllNamed] - Any value other than `false`
     *   enables single-code-unit lookups in `trie1`.
     *   NOTE(review): 2- and 3-unit matches (`trie2`/`trie3`) are NOT gated by
     *   this flag — confirm that is intended.
     * @param {number} [options.maxReplacements] - Upper bound on the total
     *   number of replacements across calls. Falsy or non-positive values
     *   disable the limit.
     */
    constructor(options = {}) {
      this.encodeXmlSafe = options.encodeXmlSafe !== false;
      this.encodeAllNamed = options.encodeAllNamed !== false;
      this.maxReplacements = options.maxReplacements || 0;
      this.replacementsCount = 0;
    }

    /**
     * Encode `str`, honoring the instance options and replacement budget.
     *
     * @param {*} str - Value to encode.
     * @returns {*} `str` itself when it is not a non-empty string, when the
     *   pre-scan finds nothing to encode, or when the replacement budget is
     *   already exhausted; otherwise the encoded string. Once the budget is
     *   hit mid-string, the remainder is appended unencoded.
     */
    encode(str) {
      if (typeof str !== 'string' || str.length === 0) return str;
      if (!NEEDS_PROCESSING.test(str)) return str;

      const maxRep = this.maxReplacements;
      if (maxRep > 0 && this.replacementsCount >= maxRep) return str;

      // Hoist to locals — avoids `this` property lookup inside the hot loop.
      const encodeXmlSafe = this.encodeXmlSafe;
      const encodeAllNamed = this.encodeAllNamed;
      const len = str.length;

      let result = '';
      let last = 0; // start of the literal run not yet copied into `result`
      let i = 0;
      let limitReached = false;

      // ── Main loop ─────────────────────────────────────────────────────────
      // Runs while i <= len - 3 so that i+1 and i+2 are always inside the
      // string and the 3-unit trie lookup needs no bounds check. The last two
      // characters are handled by the tail loop below.
      const mainEnd = len - 3;
      while (i <= mainEnd) {
        const c0 = str.charCodeAt(i);

        // ── ASCII branch ──────────────────────────────────────────────────
        if (c0 < 128) {
          if (encodeXmlSafe && IS_XML_UNSAFE[c0] === 1) {
            result += str.substring(last, i) + XML_UNSAFE_REPLACEMENT[c0];
            last = ++i;
            if (maxRep > 0 && ++this.replacementsCount >= maxRep) {
              limitReached = true;
              break;
            }
          } else {
            // Bulk-skip: advance to the next interesting position without
            // paying the outer loop overhead on every safe character.
            i++;
            while (i <= mainEnd) {
              const c = str.charCodeAt(i);
              if (c >= 128 || (encodeXmlSafe && IS_XML_UNSAFE[c] === 1)) break;
              i++;
            }
          }
          continue;
        }

        // ── Non-ASCII: integer-keyed trie lookup, longest match wins ──────
        let matchedEntity = null;
        let advance = 1;

        // 3-unit match
        const mid3 = trie3.get(c0);
        if (mid3 !== undefined) {
          const inner3 = mid3.get(str.charCodeAt(i + 1));
          if (inner3 !== undefined) {
            const candidate = inner3.get(str.charCodeAt(i + 2));
            if (candidate !== undefined) {
              matchedEntity = candidate;
              advance = 3;
            }
          }
        }

        // 2-unit match
        if (matchedEntity === null) {
          const inner2 = trie2.get(c0);
          if (inner2 !== undefined) {
            const candidate = inner2.get(str.charCodeAt(i + 1));
            if (candidate !== undefined) {
              matchedEntity = candidate;
              advance = 2;
            }
          }
        }

        // 1-unit match (only when encodeAllNamed is enabled)
        if (matchedEntity === null && encodeAllNamed) {
          const candidate = trie1.get(c0);
          if (candidate !== undefined) matchedEntity = candidate;
        }

        if (matchedEntity === null) {
          i++;
          continue;
        }

        result += str.substring(last, i) + matchedEntity;
        i += advance;
        last = i;
        if (maxRep > 0 && ++this.replacementsCount >= maxRep) {
          limitReached = true;
          break;
        }
      }

      // ── Tail: at most the last two characters ─────────────────────────────
      // No 3-unit match is possible here; the 2-unit lookup is bounds-checked.
      // Skipped entirely when the budget ran out in the main loop.
      while (!limitReached && i < len) {
        const c0 = str.charCodeAt(i);

        if (c0 < 128) {
          if (encodeXmlSafe && IS_XML_UNSAFE[c0] === 1) {
            result += str.substring(last, i) + XML_UNSAFE_REPLACEMENT[c0];
            last = ++i;
            if (maxRep > 0 && ++this.replacementsCount >= maxRep) break;
          } else {
            i++;
          }
          continue;
        }

        let matchedEntity = null;
        let advance = 1;

        if (i + 1 < len) {
          const inner2 = trie2.get(c0);
          if (inner2 !== undefined) {
            const candidate = inner2.get(str.charCodeAt(i + 1));
            if (candidate !== undefined) {
              matchedEntity = candidate;
              advance = 2;
            }
          }
        }

        if (matchedEntity === null && encodeAllNamed) {
          const candidate = trie1.get(c0);
          if (candidate !== undefined) matchedEntity = candidate;
        }

        if (matchedEntity === null) {
          i++;
          continue;
        }

        result += str.substring(last, i) + matchedEntity;
        i += advance;
        last = i;
        if (maxRep > 0 && ++this.replacementsCount >= maxRep) break;
      }

      // ── Flush any remaining literal suffix ────────────────────────────────
      if (last < len) result += str.substring(last);
      return result;
    }

    /** Clear the running replacement count so the budget starts over. */
    reset() {
      this.replacementsCount = 0;
    }
  }