commit 0b3180070c95a3e988a891353a0402ee545f0745 Author: SheetJS Date: Thu Apr 4 21:59:48 2024 -0400 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..add2ce5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules +misc diff --git a/README.md b/README.md new file mode 100644 index 0000000..87d850c --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# Regexide + +This is the source and supporting material for + +## HTML Generation + +`index.html` was generated with [`markdown-preview-enhanced`](https://open-vsx.org/extension/shd101wyy/markdown-preview-enhanced) diff --git a/bing.png b/bing.png new file mode 100644 index 0000000..de4d2b6 Binary files /dev/null and b/bing.png differ diff --git a/chatgpt.png b/chatgpt.png new file mode 100644 index 0000000..2fbf10a Binary files /dev/null and b/chatgpt.png differ diff --git a/data/js.csv b/data/js.csv new file mode 100644 index 0000000..705eb8d --- /dev/null +++ b/data/js.csv @@ -0,0 +1,9 @@ +repetitions,runtime (sec) +4096,0.031867 +8192,0.114731 +16384,0.439549 +32768,1.766 +65536,7.178 +131072,28.975 +262144,119.265 +524288,486.966 \ No newline at end of file diff --git a/data/js.png b/data/js.png new file mode 100644 index 0000000..ca57ec3 Binary files /dev/null and b/data/js.png differ diff --git a/data/js.raw b/data/js.raw new file mode 100644 index 0000000..5aed9dd --- /dev/null +++ b/data/js.raw @@ -0,0 +1,13 @@ +128: 0.068ms +256: 0.136ms +512: 0.537ms +1024: 2.083ms +2048: 8.241ms +4096: 31.867ms +8192: 114.731ms +16384: 439.549ms +32768: 1.766s +65536: 7.178s +131072: 28.975s +262144: 1:59.265 (m:ss.mmm) +524288: 8:06.966 (m:ss.mmm) diff --git a/data/js.xlsx b/data/js.xlsx new file mode 100644 index 0000000..4afe232 Binary files /dev/null and b/data/js.xlsx differ diff --git a/data/re2.csv b/data/re2.csv new file mode 100644 index 0000000..734669e --- /dev/null +++ b/data/re2.csv @@ -0,0 +1,9 @@ +repetitions,runtime (sec) +524288,0.009082 +1048576,0.020563 +2097152,0.038678 
+4194304,0.078328 +8388608,0.179104 +16777216,0.35633 +33554432,0.696656 +67108864,1.359 \ No newline at end of file diff --git a/data/re2.png b/data/re2.png new file mode 100644 index 0000000..0abc60f Binary files /dev/null and b/data/re2.png differ diff --git a/data/re2.raw b/data/re2.raw new file mode 100644 index 0000000..6211560 --- /dev/null +++ b/data/re2.raw @@ -0,0 +1,20 @@ +128: 0.055ms +256: 0.048ms +512: 0.046ms +1024: 0.049ms +2048: 0.06ms +4096: 0.079ms +8192: 0.218ms +16384: 0.281ms +32768: 0.585ms +65536: 1.036ms +131072: 3.019ms +262144: 4.759ms +524288: 9.082ms +1048576: 20.563ms +2097152: 38.678ms +4194304: 78.328ms +8388608: 179.104ms +16777216: 356.33ms +33554432: 696.656ms +67108864: 1.359s \ No newline at end of file diff --git a/data/regex.csv b/data/regex.csv new file mode 100644 index 0000000..82756f2 --- /dev/null +++ b/data/regex.csv @@ -0,0 +1,7 @@ +repetitions,runtime (sec) +16777216,1.503 +33554432,3.142 +67108864,6.220 +134217728,12.196 +268435456,25.131 +536870912,50.214 \ No newline at end of file diff --git a/data/regex.png b/data/regex.png new file mode 100644 index 0000000..bc9cd3b Binary files /dev/null and b/data/regex.png differ diff --git a/data/regex.raw b/data/regex.raw new file mode 100644 index 0000000..ac3fe0d --- /dev/null +++ b/data/regex.raw @@ -0,0 +1,8 @@ +16777216: 1.503218115s +33554432: 3.142173204s +67108864: 6.22026418s +134217728: 12.19631641s +268435456: 25.13115468s +536870912: 50.214314487s +1073741824: 101.206080608s +2147483648: 201.994993574s \ No newline at end of file diff --git a/data/regress.csv b/data/regress.csv new file mode 100644 index 0000000..cb61ffb --- /dev/null +++ b/data/regress.csv @@ -0,0 +1,7 @@ +repetitions,runtime (sec) +2048,0.086 +4096,0.286 +8192,1.130 +16384,4.429 +32768,18.436 +65536,73.369 \ No newline at end of file diff --git a/data/regress.png b/data/regress.png new file mode 100644 index 0000000..d7542a3 Binary files /dev/null and b/data/regress.png differ diff --git 
a/data/regress.raw b/data/regress.raw new file mode 100644 index 0000000..87d9ff7 --- /dev/null +++ b/data/regress.raw @@ -0,0 +1,10 @@ +128: 299.952µs +256: 1.192834ms +512: 4.780516ms +1024: 18.974845ms +2048: 86.106892ms +4096: 285.863802ms +8192: 1.129599808s +16384: 4.428627998s +32768: 18.436471062s +65536: 73.369061527s \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100755 index 0000000..f73bd7a Binary files /dev/null and b/favicon.ico differ diff --git a/gpt4.png b/gpt4.png new file mode 100644 index 0000000..ecea98f Binary files /dev/null and b/gpt4.png differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..f671977 --- /dev/null +++ b/index.html @@ -0,0 +1,491 @@ + + Regexide + + + + + + + + + + + + + +
+ +

Regexide Logo

+

This story begins with a simple question:

+
+

How do I remove XML comments in JavaScript?

+
+

The Internet hivemind converged on one general approach: regular expressions.

+

The most frequently recommended answer is:

+
str = str.replace(/<!--[\s\S]*?-->/g, ""); // bad, do not use
+                  ^^^^^^^^^^^^^^^^^^
+

There are known flaws with this family of regular expressions.

+

This discussion focuses on "Regexide", the act of identifying and replacing flawed regular expressions with other techniques that better reflect the intended effect.

+ + +

Why XML Comments matter

+

XML is a popular format for storing and sharing data. It was explicitly designed for people and programs to read and write data.[1] From spreadsheets to save states, most modern software and games parse and write XML.

+

XML comments are special notes that parsers should not treat as data. XML comments start with <!-- and end with -->.

+

Technically XML comments must not contain the string -- within the comment body. Many programs and people write invalid XML comments, so parsers will typically allow for nested --.

+

The following XML comment is technically invalid but accepted by many parsers:

+
<!-- I used to be a programmer like you,
+     then I took an <!-- in the Kleene -->
+

(Kleene wrote a seminal paper[2] on regular expressions.)

+

How the regular expression works

+

The regular expression body /<!--[\s\S]*?-->/ has three parts:

+

A) <!-- matches the four literal characters
+B) [\s\S]*? matches any number of characters
+C) --> matches the three literal characters

+

In (B), [\s\S] matches any character. The *? is a "non-greedy quantifier" that instructs the regular expression engine to take the shortest match.

+
Example of greedy and non-greedy matches (click to show) +

Consider the following string:

+
<!-- <!-- <!-- --> --> -->
+
+

The "greedy" /<!--[\s\S]*-->/ will match from the first <!-- to the last -->:

+
<!-- <!-- <!-- --> --> -->  /<!--[\s\S]*-->/
+
+

The non-greedy /<!--[\s\S]*?-->/ will match from the first <!-- to the first -->:

+
<!-- <!-- <!-- --> --> -->  /<!--[\s\S]*?-->/
+
+
+

The modern variant of the regular expression uses the /s flag:

+
str = str.replace(/<!--.*?-->/gs, ""); // even worse
+                  ^^^^^^^^^^^^^^
+

The /s flag modifies the . character class to include line terminators.

+

Usage in Open Source Projects

+

Many popular open source projects use problematic regular expressions.

+

Nunjucks used this regular expression within the striptags filter expression:

+
  let tags = /<\/?([a-z][a-z0-9]*)\b[^>]*>|<!--[\s\S]*?-->/gi;
+

PrettierJS used this regular expression in the build sequence:

+
const templateComments = template.match(/<!--.*?-->/gs);
+

RollupJS used this regular expression in the build sequence:

+
const bodyWithoutComments = data.body.replace(/<!--[\S\s]*?-->/g, '');
+

SheetJS used this regular expression in parsing:

+
str = str.replace(/<!--([\s\S]*?)-->/mg,"");
+

ViteJS used the nascent s flag to ensure . matches newline characters:

+
export const commentRE = /<!--.*?-->/gs
+
+  // Avoid matching the content of the comment
+  raw = raw.replace(commentRE, '<!---->')
+

VueJS 2 used regular expressions in processing:

+
text = text
+  .replace(/<!--([\s\S]*?)-->/g, '$1')
+  .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1');
+

WordPress used regular expressions in the word count calculator:

+
		HTMLcommentRegExp: /<!--[\s\S]*?-->/g,
+

Element Plus used a similar regular expression to match blocks starting with <del> and ending with </del>:

+
  const str = removeTag(value)
+    .replaceAll(/<del>.*<\/del>/g, '')
+    // ---------^^^^^^^^^^^^^^^^^^ -- start <del> end </del>
+

A rare consensus

+

Most resources recommend this approach.

+

Books recommend this approach. "Regular Expressions Cookbook"[3] section 9.9 explicitly recommends /<!--[\s\S]*?-->/ for matching XML comments.

+

StackOverflow Answers recommend this regular expression and variants such as /<!--[\s\S\n]*?-->/ (which are, for all practical purposes, equivalent).

+

ChatGPT4 has recommended the previous regular expression. It also generated code for a completely unrelated tag.

+

Bing AI proposed unrelated command line tools for JavaScript.

+
ChatGPT4 and Bing AI Screenshots (click to show) +

ChatGPT4 Incorrect interpretation

+

ChatGPT incorrect interpretation

+

ChatGPT4 Correct interpretation, solution uses vulnerable regular expression

+

ChatGPT correct interpretation

+

Bing AI Correct Interpretation, solution uses vulnerable regular expression

+

Bing AI correct interpretation

+
+

The Internet Failed Us

+

There are deep performance issues with the regular expression. To see this, consider a string that repeats the header part <!-- many times. In general, this type of string can be generated in JavaScript using String#repeat:

+
var string_repeated_65536_times = "<!--".repeat(65536);
+

The replace operation is surprisingly slow. Try the following snippet in a new browser window or NodeJS terminal:

+
// this loop doubles each time
+for(var n = 64; n < 1000000; n*=2) {
+  var s = "<!--".repeat(n); // generate repeated string
+  console.time(n);
+  s.replace(/<!--([\s\S]*?)-->/mg,""); // replace
+  console.timeEnd(n);
+}
+

Results are from local tests on a 2019 Intel i9 MacBook Pro. The following chart displays runtime in seconds (vertical axis) as a function of repetitions (horizontal axis). The quadratic trend line closely fits the data.

+javascript performance test - quadratic complexity +

Download the raw data as a CSV

+

When the number of repetitions doubled, the runtime roughly quadrupled. This is a "quadratic" relationship.

+

Why the regular expression is slow

+

The regular expression matches a string that starts with <!-- and ends with -->. Consider a function that repeatedly looks for the <!-- string and tries to find the first --> that appears afterwards. Computer scientists classify this algorithm as "Backtracking"[4]:

+
function match_all_regex_comments(str) {
+  const results = [];
+
+  /* look for the first instance of <!-- */
+  let start_index = str.indexOf("<!--");
+
+  /* this loop runs while start_index is valid */
+  while(start_index > -1) {
+
+    /* look for the first instance of --> after <!-- */
+    let end_index = str.indexOf("-->", start_index + 4);
+
+    /* if --> is found, then we have a match! */
+    if(end_index > -1) {
+
+      /* add to array */
+      results.push(str.slice(start_index, end_index + 3));
+
+      /* start scanning from the end of the `-->` */
+      start_index = str.indexOf("<!--", end_index + 3);
+    }
+
+    else {
+      /* jump to the next potential starting point */
+      start_index = str.indexOf("<!--", start_index + 1);
+    }
+  }
+
+  /* return the final list */
+  return results;
+}
+
+

Optimization

+

The keen-eyed reader will notice that the loop can be terminated once the search for --> fails (line 25 should be break;).

+

Engines designed for JavaScript regular expressions do not currently perform this optimization.

+

It can be shown that the runtime complexity of the modified algorithm is $\Theta(L+M)$, where $L$ is the string length and $M$ is the number of matches.

+
+

If --> is not in the string, the scan str.indexOf("-->", start_index + 4) will look at every character in the string starting from start_index + 4. In the worst case, with repeated <!--, the scan will start from index 4, then index 8, then index 12, etc.

+

The following diagram shows the first three scans when running the function against the string formed by repeating <!-- 5 times. The <!-- matches are highlighted in yellow and the scans for the --> are highlighted in blue.

+
<!--<!--<!--<!--<!--
+^^^^             (first  match of <!--     0 - 3)
+    ............ (scan for --> from index  4 to end)   L -  4 characters
+
+<!--<!--<!--<!--<!--
+    ^^^^         (second match of <!--     4 - 7)
+        ........ (scan for --> from index  8 to end)   L -  8 characters
+
+
+<!--<!--<!--<!--<!--
+        ^^^^     (third  match of <!--     8 - 11)
+            .... (scan for --> from index 12 to end)   L - 12 characters
+
+

For $N$ repetitions of <!--, the total string length is $L = 4N$. There will be $N$ matches of <!--.

+
Mathematical Analysis (click to show) +

The first scan will start on character 4 (end of the first match) and inspect $L-4$ characters (to the end of the string).

+

The second scan will start on character $2 \cdot 4 = 8$ and inspect $L-8$ characters.

+

In general, the $K$-th scan will start on character $4K$ and inspect $L - 4K$ characters.

+

The total number of characters scanned when looking for the end tag (line 11 in the code) is:

+

$$
\begin{array}{rl}
Scanned &= (L-4) + (L-4\cdot 2) + \ldots + (L-(N-1)\cdot 4) + (L - N\cdot 4) \\
&= N\cdot L - 4 \cdot 1 - 4 \cdot 2 - \ldots - 4 \cdot (N-1) - 4 \cdot N \\
&= N\cdot L - 4 \cdot \displaystyle\sum_{i=1}^{N} i \\
&= 4\cdot N^2 - 4 \cdot \dfrac{N \cdot (N+1)}{2} \\
&= 4 \cdot N^2 - 2 \cdot N^2 - 2 \cdot N = \dfrac{L^2}{8} - \dfrac{L}{2}
\end{array}
$$

+
+

In the worst case, the number of characters scanned is roughly proportional to the square of the length of the string. In "Big-O Notation", the complexity is $O(L^2)$. This is colloquially described as a "quadratic blowup".

+

Vulnerability

+

This is generally considered a vulnerability since relatively small data can cause browsers or servers to freeze for extended periods of time.

+

The official category for this weakness is "CWE-1333"[5] "Inefficient Regular Expression Complexity".

+

Some resources use the phrase "Catastrophic backtracking" to describe the issue.

+

A side note about Rust

+

Everyone writes high-performance code in Rust, right?

+

Rust does not have built-in support for regular expressions. Third-party libraries fill the gap.

+

The Rust regress[6] crate is designed for JavaScript regular expressions. It represents a true apples-to-apples comparison with JavaScript.

+
    let re = regress::Regex::new(r"<!--([\s\S]*?)-->").unwrap();
+    let mut str = "<!--<!--<!--";
+    let _match = re.find(str);
+
Complete Example (click to show) +
fn main() {
+    let re = regress::Regex::new(r"<!--([\s\S]*?)-->").unwrap();
+
+    /* construct string by repeating with itself */
+    let mut str = "<!--";
+    let mut _str = format!("{}{}", str, str);
+    let mut rept: u64 = 1;
+    for _i in 1..8 {
+        _str = format!("{}{}", str, str);
+        str = _str.as_str();
+        rept *= 2;
+    }
+
+    for _j in 1..11 {
+        /* test regular expression against string */
+        let start_time = std::time::Instant::now();
+        let _caps = re.find(str);
+        let elapsed_time = start_time.elapsed();
+        println!("{}: {:?}", rept, elapsed_time);
+
+        /* double string length by repeating with itself */
+        _str = format!("{}{}", str, str);
+        str = _str.as_str();
+        rept *= 2;
+    }
+}
+
+

Results are from local tests on a 2019 Intel i9 MacBook Pro. regress shows the same quadratic behavior as other JavaScript regular expression engines.

+rust regress performance test - quadratic complexity +

Download the raw data as a CSV

+

Workarounds

+

There are a few general approaches to address the issue.

+

Use a Different Engine

+

By limiting the supported featureset, other regular expression engines have stricter performance guarantees.

+

NodeJS

+

The re2[7] C++ engine sacrifices backreference and lookaround support for performance. There are bindings for many server-side programming languages.

+

The re2[8] NodeJS package is a native binding to the C++ engine and can be used in server-side environments. With modern versions of NodeJS, normal regular expressions can be wrapped with RE2:

+
var out = str.replace(new RE2(/<!--([\s\S]*?)-->/mg),""); // replace
+
Complete Example (click to show) +
var RE2 = require("re2");
+// this loop doubles each time
+for(var n = 64; n < 100000000; n*=2) {
+  var s = "<!--".repeat(n); // generate repeated string
+  console.time(n);
+  s.replace(new RE2(/<!--([\s\S]*?)-->/mg),""); // replace
+  console.timeEnd(n);
+}
+
+

The re2 implementation uses algorithms whose performance scales linearly with the size of the input.

+nodejs re2 performance test - linear complexity +

Download the raw data as a CSV

+

Rust

+

The Rust regex[9] crate sacrifices backreference and lookaround support for performance. It is the same tradeoff made by the re2 engine.

+

Since it does not use lookaround or backreferences, the original regular expression is compatible with the regex crate:

+
    let re = regex::Regex::new(r"<!--([\s\S]*?)-->").unwrap();
+    let mut str = "<!--<!--<!--";
+    let _match = re.find(str);
+
Complete Example (click to show) +
fn main() {
+    let re = regex::Regex::new(r"<!--([\s\S]*?)-->").unwrap();
+
+    /* construct string by repeating with itself */
+    let mut str = "<!--";
+    let mut _str = format!("{}{}", str, str);
+    let mut rept: u64 = 1;
+    for _i in 1..25 {
+        _str = format!("{}{}", str, str);
+        str = _str.as_str();
+        rept *= 2;
+    }
+
+    for _j in 1..11 {
+        /* find all matches */
+        let start_time = std::time::Instant::now();
+        let _caps = re.captures(str);
+        let elapsed_time = start_time.elapsed();
+        println!("{}: {:?}", rept, elapsed_time);
+
+        /* double string length by repeating with itself */
+        _str = format!("{}{}", str, str);
+        str = _str.as_str();
+        rept *= 2;
+    }
+}
+
+

The Rust regex implementation uses algorithms whose performance scales linearly with the size of the input.

+rust regex performance test - linear complexity +

Download the raw data as a CSV

+

Exogenous Constraints

+

Most problems have additional constraints. Addressing the constraints allows for more precise regular expressions with better performance.

+

For the problem of matching comments, various specifications impose limitations.

+

XML Comments

+

The XML 1.0 specification[10] disallows -- within comments.

+
(this comment is not valid in XML 1.0)
+<!-- - .... .. ... / -.-. --- -- -- . -. - / .. ... / .. -. ...- .- .-.. .. -.. -->
+

PrettierJS uses a regular expression in the MDX parser that enforces the XML constraint:

+
const COMMENT_REGEX = /<!---->|<!---?[^>-](?:-?[^-])*-->/;
+

Commonly-used regular expression engines can optimize for this pattern and avoid backtracking.

+
+

Spreadsheet Engines

+

The XML parser in Excel powering the Excel Workbook (XLSX) format expects proper XML comments with no -- in the comment body.

+

The XML parser in Excel powering the Excel 2003-2004 (SpreadsheetML) format allows -- in the comment body.

+
+

HTML Comments

+

The HTML5 standard[11] permits -- but forbids <!-- within comment text. For example, the following comment is not valid according to the standard:

+
<!-- I used to be a programmer like you, then I took an <!-- in the Kleene -->
+
+

yt-dlp uses a regular expression with a negative lookahead to ensure <!-- does not appear in the body:

+
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+

This expression allows -- but disallows <!-- in the comment body. In practice, it will match comments starting from the innermost <!--. Using the previous example:

+
<!-- I used to be a programmer like you, then I took an <!-- in the Kleene -->
+
+
+

Web Browsers

+

Web browsers generally allow <!-- in comments. Text between the first <!-- and the first --> is treated as a comment. For example, consider the following HTML:

+
<pre><!-- this is a nested comment <!-- --> --> more text</pre>
+     |                                    |^^^^^^^^^^^^^^ --- content
+     |  this is interpreted as a comment  |
+

This exact HTML code is added below:

+
 --> more text
+

Chromium and other browsers will display --> more text

+
+

Remove the Regular Expression

+

Regular expression operations can be reimplemented using standard string operations.

+

For example, the replacement

+
str = str.replace(/<!--([\s\S]*?)-->/, "");
+

can be rewritten with a loop. The core idea is to collect non-commented fragments:

+
function remove_xml_comments(str) {
+  const START = "<!--", END = "-->";
+  const results = [];
+  /* this index tracks the last analyzed character */
+  let last_index = 0;
+
+  /* look for the first instance of <!-- */
+  let start_index = str.indexOf(START);
+
+  /* this loop runs while start_index is valid */
+  while(start_index > -1) {
+
+    /* add the fragment that precedes the comment */
+    results.push(str.slice(last_index, start_index));
+    last_index = start_index;
+
+    /* look for the first instance of --> after <!-- */
+    let end_index = str.indexOf(END, start_index + START.length);
+
+    /* if --> is found, then we have a match! */
+    if(end_index > -1) {
+      /* skip the comment */
+      last_index = end_index + END.length;
+
+      /* search for next comment open tag */
+      start_index = str.indexOf(START, last_index);
+    }
+
+    /* if there is no end comment tag, stop processing */
+    else break;
+  }
+
+  /* add remaining part of string */
+  results.push(str.slice(last_index));
+
+  /* concatenate the fragments */
+  return results.join("");
+}
+

Validate Data

+

In the places where ViteJS used the vulnerable regular expression, the text was validated using a separate HTML parser.

+

It is still strongly recommended to replace the regular expression.

+

Limit to Trusted Data

+

PrettierJS and RollupJS use the vulnerable regular expression in internal scripts. The expressions are not used or added in websites. The data sources are trusted and malformed data can be corrected manually.

+

Special Thanks

+

Special thanks to Asadbek, Jardel, and members of the SheetJS team for early feedback.

+
+
+
    +
  1. See "Origin and Goals" in the Extensible Markup Language (XML) 1.0 specification. ↩︎

    +
  2. +
  3. The theoretical underpinnings of modern regular expressions were established in the working paper "Representation of Events in Nerve Nets and Finite Automata" ↩︎

    +
  4. +
  5. See "9.9 Remove XML-Style Comments" on the official site for the book. ↩︎

    +
  6. +
  7. See the Wikipedia article for "Backtracking" for more details and resources. ↩︎

    +
  8. +
  9. See the definition in the "CWE List" for more details and resources. ↩︎

    +
  10. +
  11. See the listing for regress crate for more details. ↩︎

    +
  12. +
  13. See the google/re2 project on GitHub for more details. ↩︎

    +
  14. +
  15. See the listing for the re2 NodeJS package for more details. ↩︎

    +
  16. +
  17. See the listing for regex crate for more details. ↩︎

    +
  18. +
  19. See "Comments" in the XML 1.0 specification. ↩︎

    +
  20. +
  21. See "Comments" in the WHATWG HTML Living Standard. ↩︎

    +
  22. +
+
+ +
+ + + + + + + + + + \ No newline at end of file diff --git a/index.md b/index.md new file mode 100644 index 0000000..f1d5474 --- /dev/null +++ b/index.md @@ -0,0 +1,595 @@ +--- +title: Regexide +html: + offline: false +export_on_save: + html: true + toc: true +toc: + depth_from: 1 + depth_to: 3 + ordered: true +--- + +

Regexide Logo

+ +This story begins with a simple question: + +!!! question How do I remove XML comments in JavaScript? + +The Internet hivemind converged on one general approach: regular expressions. + +The most frequently recommended answer is: + +```js +str = str.replace(//g, ""); // bad, do not use + ^^^^^^^^^^^^^^^^^^ +``` + +**There are known flaws with this family of regular expressions.** + +This discussion focuses on "Regexide", the act of identifying and replacing flawed regular expressions with other techniques that better reflect the intended effect. + +[TOC] + +## Why XML Comments matter + +XML is a popular format for storing and sharing data. It was explicitly designed for people and programs to read and write data.[^1] From spreadsheets to save states, most modern software and games parse and write XML. + +XML comments are special notes that parsers should not treat as data. XML comments start with ``. + +Technically XML comments must not contain the string `--` within the comment body. Many programs and people write invalid XML comments, so parsers will typically allow for nested `--`. + +The following XML comment is technically invalid but accepted by many parsers: + +```xml + +``` + +(Kleene wrote a seminal paper[^2] on regular expressions.) + +## How the regular expression works + +The regular expression body `//` has three parts: + +A) `` matches the three literal characters + +In (B), `[\s\S]` matches any character. The `*?` is a "non-greedy quantifier" that instructs the regular expression engine to take the shortest match. + +
Example of greedy and non-greedy matches (click to show) + +Consider the following string: + +
+<!-- <!-- <!-- --> --> -->
+
+ +The "greedy" `//` will match from the first ``: + +
+<!-- <!-- <!-- --> --> -->  /<!--[\s\S]*-->/
+
+ +The non-greedy `//` will match from the first ``: + +
+<!-- <!-- <!-- --> --> -->  /<!--[\s\S]*?-->/
+
+ +
+ +The modern variant of the regular expression uses the `/s` flag: + +```js +str = str.replace(//gs, ""); // even worse + ^^^^^^^^^^^^^^ +``` + +The `/s` flag modifies the `.` character class to include line terminators. + +### Usage in Open Source Projects + +Many popular open source projects use problematic regular expressions. + +[Nunjucks](https://github.com/mozilla/nunjucks/blob/ea0d6d5396d39d9eed1b864febb36fbeca908f23/nunjucks/src/filters.js#L491) used this regular expression within in the `striptags` filter expression: + +```js + let tags = /<\/?([a-z][a-z0-9]*)\b[^>]*>|/gi; +``` + +[PrettierJS](https://github.com/prettier/prettier/blob/45ad4668ebc133621c7f94e678ce399cab318068/scripts/lint-changelog.js#L51) used this regular expression in the build sequence: + +```js +const templateComments = template.match(//gs); +``` + +[RollupJS](https://github.com/rollup/rollup/blob/18372035f167ec104280e1e91ef795e4f7033f1e/scripts/release-helpers.js#L76) used this regular expression in the build sequence: + +```js +const bodyWithoutComments = data.body.replace(//g, ''); +``` + +[SheetJS](https://github.com/SheetJS/sheetjs/blob/master/xlsx.mjs#L18117) used this regular expression in parsing: + +```js +str = str.replace(//mg,""); +``` + +[ViteJS](https://github.com/vitejs/vite/blob/9fc5d9cb3a1b9df067e00959faa9da43ae03f776/packages/vite/src/node/optimizer/scan.ts#L259) used the nascent `s` flag to ensure `.` matches newline characters: + +```js +export const commentRE = //gs + + // Avoid matching the content of the comment + raw = raw.replace(commentRE, '') +``` + +[VueJS 2](https://github.com/vuejs/vue/blob/v2.2.3/dist/vue.esm.js#L7404) used regular expressions in processing: + +```js +text = text + .replace(//g, '$1') + .replace(//g, '$1'); +``` + +[WordPress](https://github.com/WordPress/WordPress/blob/master/wp-admin/js/word-count.js#L73) used regular expressions in the word count calculator: + +```js + HTMLcommentRegExp: //g, +``` + +[Element 
Plus](https://github.com/element-plus/element-plus/blob/4ac4750158fa634aa9da186111bce86c2898fda2/internal/build/src/tasks/helper.ts#L60) used a similar regular expression to match blocks starting with `` and ending with ``: + +```js + const str = removeTag(value) + .replaceAll(/.*<\/del>/g, '') + // ---------^^^^^^^^^^^^^^^^^^ -- start end +``` + +### A rare consensus + +Most resources recommend this approach. + +**Books** recommend this approach. "Regular Expressions Cookbook"[^3] section 9.9 explicitly recommends `//` for matching XML comments. + +**StackOverflow Answers** recommend this regular expression and variants such as `//` (which are, for all practical purposes, equivalent). + +**ChatGPT4** has recommended the previous regular expression. It also generated code for a complete unrelated tag. + +**Bing AI** proposed unrelated command line tools for JavaScript. + +
ChatGPT4 and Bing AI Screenshots (click to show) + +_ChatGPT4 Incorrect interpretation_ + +![ChatGPT incorrect interpretation](gpt4.png) + +_ChatGPT4 Correct interpretation, solution uses vulnerable regular expression_ + +![ChatGPT correct interpretation](chatgpt.png) + +_Bing AI Correct Interpretation, solution uses vulnerable regular expression_ + +![Bing AI correct interpretation](bing.png) + +
+ +## The Internet Failed Us + +There are deep performance issues with the regular expression. To see this, consider a string that repeats the header part `/mg,""); // replace + console.timeEnd(n); +} +``` + +Results are from local tests on a 2019 Intel i9 MacBook Pro. The following chart displays runtime in seconds (vertical axis) as a function of repetitions (horizontal axis). The quadratic trend line closely fits the data. + +javascript performance test - quadratic complexity + +[Download the raw data as a CSV](./data/js.csv) + +When the number of repetitions doubled, the runtime roughly quadrupled. This is a "quadratic" relationship. + +### Why the regular expression is slow + +The regular expression matches a string that starts with ``. Consider a function that repeatedly looks for the `` that appears afterwards. Computer scientists classify this algorithm as "Backtracking"[^4]: + +```js {.line-numbers} +function match_all_regex_comments(str) { + const results = []; + + /* look for the first instance of after ", start_index + 4); + + /* if --> is found, then we have a match! */ + if(end_index > -1) { + + /* add to array */ + results.push(str.slice(start_index, end_index + 3)); + + /* start scanning from the end of the `-->` */ + start_index = str.indexOf("` fails (line 25 should be `break;`). + + **Engines designed for JavaScript regular expressions do not currently perform this optimization.** + + It can be shown that the runtime complexity of the modified algorithm is $\Theta(L+M)$ where $L$ is the string length and $M$ is the number of matches + +If `-->` is not in the string, the scan `str.indexOf("-->", start_index + 4)` will look at every character in the string starting from `start_index + 4`. In the worst case, with repeated `` are highlighted in blue. + +
+<!--<!--<!--<!--<!--
+^^^^             (first  match of <!--     0 - 3)
+    ............ (scan for --> from index  4 to end)   L -  4 characters
+
+<!--<!--<!--<!--<!--
+    ^^^^         (second match of <!--     4 - 7)
+        ........ (scan for --> from index  8 to end)   L -  8 characters
+
+
+<!--<!--<!--<!--<!--
+        ^^^^     (third  match of <!--     8 - 11)
+            .... (scan for --> from index 12 to end)   L - 12 characters
+
+ +For $N$ repetitions of `").unwrap(); + let mut str = "").unwrap(); + + /* construct string by repeating with itself */ + let mut str = "/mg),""); // replace +``` + +
Complete Example (click to show) + +```js +var RE2 = require("re2"); +// this loop doubles each time +for(var n = 64; n < 100000000; n*=2) { + var s = "/mg),""); // replace + console.timeEnd(n); +} +``` + +
+ +The `re2` implementation uses algorithms whose performance scales linearly with the size of the input. + +nodejs re2 performance test - linear complexity + +[Download the raw data as a CSV](./data/re2.csv) + +#### Rust + +The Rust `regex`[^9] crate sacrifices support for performance. It is the same tradeoff made by the `re2` engine. + +Since it does not use lookaround or backreferences, the original regular expression is compatible with the `regex` crate: + +```rust + let re = regex::Regex::new(r"").unwrap(); + let mut str = "").unwrap(); + + /* construct string by repeating with itself */ + let mut str = " +``` + +[PrettierJS](https://github.com/prettier/prettier/blob/ff83d55d05e92ceef10ec0cb1c0272ab894a00a0/src/language-markdown/mdx.js#L28) uses a regular expression in the MDX parser that enforces the XML constraint: + +```js +const COMMENT_REGEX = /|/; +``` + +Commonly-used regular expression engines can optimize for this pattern and avoid backtracking. + +!!! info Spreadsheet Engines + + The XML parser in Excel powering the [Excel Workbook (XLSX) format](https://docs.sheetjs.com/docs/miscellany/formats/#excel-2007-xml-xlsxxlsm) expects proper XML comments with no `--` in the comment body. + + The XML parser in Excel powering the [Excel 2003-2004 (SpreadsheetML) format](https://docs.sheetjs.com/docs/miscellany/formats#excel-2003-2004-spreadsheetml) allows `--` in the comment body. + + +#### HTML Comments + +The HTML5 standard[^11] permits `--` but forbids `', '', html) +``` + +This expression allows `--` but disallows `` are treated as a comment. For example, consider the following HTML: + + ```html +
 --> more text
+ | |^^^^^^^^^^^^^^ --- content + | this is interpreted as a comment | + ``` + + This exact HTML code is added below: + +
 --> more text
+ + Chromium and other browsers will display `--> more text` + +### Remove the Regular Expression + +Regular expression operations can be reimplemented using standard string operations. + +For example, the replacement + +```js +str = str.replace(//, ""); +``` + +can be rewritten with a loop. The core idea is to collect non-commented fragments: + +```js {.line-numbers} +function remove_xml_comments(str) { + const START = ""; + const results = []; + /* this index tracks the last analyzed character */ + let last_index = 0; + + /* look for the first instance of after is found, then we have a match! */ + if(end_index > -1) { + /* skip the comment */ + last_index = end_index + END.length; + + /* search for next comment open tag */ + start_index = str.indexOf(START, last_index); + } + + /* if there is no end comment tag, stop processing */ + else break; + } + + /* add remaining part of string */ + results.push(str.slice(last_index)); + + /* concatenate the fragments */ + return results.join(""); +} +``` + +### Validate Data + +In the places where ViteJS used the vulnerable regular expression, the text was validated using a separate HTML parser. + +It is still strongly recommended to replace the regular expression. + +### Limit to Trusted Data + +PrettierJS and RollupJS use the vulnerable regular expression in internal scripts. The expressions are not used or added in websites. The data sources are trusted and malformed data can be corrected manually. + +## Special Thanks + +Special thanks to [Asadbek](https://asadbek.dev/), [Jardel](http://francoatmega.com/), and members of the [SheetJS team](https://sheetjs.com) for early feedback. + +[^1]: See ["Origin and Goals"](https://www.w3.org/TR/REC-xml/#sec-origin-goals) in the Extensible Markup Language (XML) 1.0 specification. 
+[^2]: The theoretical underpinnings of modern regular expressions were established in the working paper ["Representation of Events in Nerve Nets and Finite Automata"](https://www.rand.org/content/dam/rand/pubs/research_memoranda/2008/RM704.pdf). +[^3]: See ["9.9 Remove XML-Style Comments"](https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch09s10.html) on the official site for the book. +[^4]: See [the Wikipedia article for "Backtracking"](https://en.wikipedia.org/wiki/Backtracking) for more details and resources. +[^5]: See [the definition in the "CWE List"](https://cwe.mitre.org/data/definitions/1333.html) for more details and resources. +[^6]: See [the listing for the `regress` crate](https://crates.io/crates/regress) for more details. +[^7]: See [the `google/re2` project on GitHub](https://github.com/google/re2) for more details. +[^8]: See [the listing for the `re2` NodeJS package](https://www.npmjs.com/package/re2) for more details. +[^9]: See [the listing for the `regex` crate](https://crates.io/crates/regex) for more details. +[^10]: See ["Comments"](https://www.w3.org/TR/REC-xml/#sec-comments) in the XML 1.0 specification. +[^11]: See ["Comments"](https://html.spec.whatwg.org/multipage/syntax.html#comments) in the WHATWG HTML Living Standard. diff --git a/regexide.png b/regexide.png new file mode 100755 index 0000000..3e30ddb Binary files /dev/null and b/regexide.png differ diff --git a/rust/.gitignore b/rust/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/rust/.gitignore @@ -0,0 +1 @@ +/target diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000..eda4957 --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,159 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "ahash" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash", +] + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "proc-macro2" +version = "1.0.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "regress" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ed9969cad8051328011596bf549629f1b800cf1731e7964b1eef8dfc480d2c2" +dependencies = [ + "hashbrown", + "memchr", +] + +[[package]] +name = "rust" +version = "0.1.0" +dependencies = [ + "regex", + "regress", +] + +[[package]] +name = "syn" +version = "2.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..34861ad --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "rust" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +regex = "1.10.2" +regress = "0.7.1" diff --git a/rust/src/main.rs b/rust/src/main.rs new file mode 100644 index 0000000..ee75726 --- /dev/null +++ b/rust/src/main.rs @@ -0,0 +1,54 @@ +fn main_regex() { + let re = regex::Regex::new(r"").unwrap(); + + let mut str = "").unwrap(); + + let mut str = "