From 36e77f8cf9fe7c3a2fee6d3a7c510d6fb576ed8a Mon Sep 17 00:00:00 2001 From: Logan Houp Date: Fri, 5 Apr 2024 14:01:53 -0400 Subject: [PATCH] refactor for clarity --- index.md | 285 ++++++++++++++++++++++++------------------------------- 1 file changed, 122 insertions(+), 163 deletions(-) diff --git a/index.md b/index.md index f1d5474..c45d11e 100644 --- a/index.md +++ b/index.md @@ -11,36 +11,23 @@ toc: ordered: true --- -

Regexide Logo

+Regexide Logo -This story begins with a simple question: +

Denial of Service in Regular Expressions

-!!! question How do I remove XML comments in JavaScript? +This whole situation began with a simple question, "How do I remove XML comments in JavaScript?". The Internet hivemind converged on one general approach: regular expressions. -The Internet hivemind converged on one general approach: regular expressions. +This has one big problem. If you're not careful, parsing relatively small amounts of data could lead to browsers or servers freezing for extended periods of time. The official category for this weakness is "CWE-1333"[^5] "Inefficient Regular Expression Complexity". Some resources also use the phrase "Catastrophic backtracking" to describe the issue. -The most frequently recommended answer is: - -```js -str = str.replace(//g, ""); // bad, do not use - ^^^^^^^^^^^^^^^^^^ -``` - -**There are known flaws with this family of regular expressions.** - -This discussion focuses on "Regexide", the act of identifying and replacing flawed regular expressions with other techniques that better reflect the intended effect. +This discussion focuses on what we've come to call "Regexide", which we've defined as the act of identifying and replacing flawed regular expressions with other techniques that better reflect the intended effect. Let's look at a few examples. [TOC] -## Why XML Comments matter +## How the Regular Expression Works -XML is a popular format for storing and sharing data. It was explicitly designed for people and programs to read and write data.[^1] From spreadsheets to save states, most modern software and games parse and write XML. +For the purposes of this discussion, it's important to understand exactly what XML comments are. XML comments are special notes that parsers should not treat as data. XML comments start with `<!--` and end with `-->`. Technically XML comments must not contain the string `--` within the comment body. Many programs and people write invalid XML comments, so parsers will typically allow for nested `--`. 
-XML comments are special notes that parsers should not treat as data. XML comments start with ``. - -Technically XML comments must not contain the string `--` within the comment body. Many programs and people write invalid XML comments, so parsers will typically allow for nested `--`. - -The following XML comment is technically invalid but accepted by many parsers: +For example, the following XML comment is technically invalid but accepted by many parsers: ```xml /` has three parts: A) `/gs, ""); // even worse The `/s` flag modifies the `.` character class to include line terminators. -### Usage in Open Source Projects +## Usage in Open Source Projects -Many popular open source projects use problematic regular expressions. +Many popular open source projects use problematic regular expressions. The most frequently recommended answer is: + +```js +str = str.replace(//g, ""); // bad, do not use + ^^^^^^^^^^^^^^^^^^ +``` [Nunjucks](https://github.com/mozilla/nunjucks/blob/ea0d6d5396d39d9eed1b864febb36fbeca908f23/nunjucks/src/filters.js#L491) used this regular expression within in the `striptags` filter expression: ```js - let tags = /<\/?([a-z][a-z0-9]*)\b[^>]*>|/gi; +let tags = /<\/?([a-z][a-z0-9]*)\b[^>]*>|/gi ``` [PrettierJS](https://github.com/prettier/prettier/blob/45ad4668ebc133621c7f94e678ce399cab318068/scripts/lint-changelog.js#L51) used this regular expression in the build sequence: ```js -const templateComments = template.match(//gs); +const templateComments = template.match(//gs) ``` [RollupJS](https://github.com/rollup/rollup/blob/18372035f167ec104280e1e91ef795e4f7033f1e/scripts/release-helpers.js#L76) used this regular expression in the build sequence: ```js -const bodyWithoutComments = data.body.replace(//g, ''); +const bodyWithoutComments = data.body.replace(//g, "") ``` [SheetJS](https://github.com/SheetJS/sheetjs/blob/master/xlsx.mjs#L18117) used this regular expression in parsing: ```js -str = str.replace(//mg,""); +str = str.replace(//gm, "") ``` 
[ViteJS](https://github.com/vitejs/vite/blob/9fc5d9cb3a1b9df067e00959faa9da43ae03f776/packages/vite/src/node/optimizer/scan.ts#L259) used the nascent `s` flag to ensure `.` matches newline characters: @@ -123,16 +113,16 @@ str = str.replace(//mg,""); ```js export const commentRE = //gs - // Avoid matching the content of the comment - raw = raw.replace(commentRE, '') +// Avoid matching the content of the comment +raw = raw.replace(commentRE, "") ``` [VueJS 2](https://github.com/vuejs/vue/blob/v2.2.3/dist/vue.esm.js#L7404) used regular expressions in processing: ```js text = text - .replace(//g, '$1') - .replace(//g, '$1'); + .replace(//g, "$1") + .replace(//g, "$1") ``` [WordPress](https://github.com/WordPress/WordPress/blob/master/wp-admin/js/word-count.js#L73) used regular expressions in the word count calculator: @@ -144,24 +134,13 @@ text = text [Element Plus](https://github.com/element-plus/element-plus/blob/4ac4750158fa634aa9da186111bce86c2898fda2/internal/build/src/tasks/helper.ts#L60) used a similar regular expression to match blocks starting with `` and ending with ``: ```js - const str = removeTag(value) - .replaceAll(/.*<\/del>/g, '') - // ---------^^^^^^^^^^^^^^^^^^ -- start end +const str = removeTag(value).replaceAll(/.*<\/del>/g, "") +// ---------^^^^^^^^^^^^^^^^^^ -- start end ``` -### A rare consensus +## A Troubling Consensus -Most resources recommend this approach. - -**Books** recommend this approach. "Regular Expressions Cookbook"[^3] section 9.9 explicitly recommends `//` for matching XML comments. - -**StackOverflow Answers** recommend this regular expression and variants such as `//` (which are, for all practical purposes, equivalent). - -**ChatGPT4** has recommended the previous regular expression. It also generated code for a complete unrelated tag. - -**Bing AI** proposed unrelated command line tools for JavaScript. - -
ChatGPT4 and Bing AI Screenshots (click to show) +It's surprising to see that most resources recommend this approach. A prominent O'Reilly textbook, "Regular Expressions Cookbook"[^3], explicitly recommends `//` in section 9.9 for matching XML comments. **StackOverflow Answers** recommend this regular expression and variants such as `//` (which are, for all practical purposes, equivalent). **ChatGPT4** has also recommended the previous regular expression. It also generated code for a completely unrelated tag. 🙄 _ChatGPT4 Incorrect interpretation_ @@ -171,29 +150,33 @@ _ChatGPT4 Correct interpretation, solution uses vulnerable regular expression_ ![ChatGPT correct interpretation](chatgpt.png) +**Bing AI** proposed unrelated command line tools for JavaScript. + +
ChatGPT4 and Bing AI Screenshots (click to show) + _Bing AI Correct Interpretation, solution uses vulnerable regular expression_ ![Bing AI correct interpretation](bing.png)
-## The Internet Failed Us +## But Why is it So Slow? -There are deep performance issues with the regular expression. To see this, consider a string that repeats the header part `/mg,""); // replace - console.timeEnd(n); +for (var n = 64; n < 1000000; n *= 2) { + var s = "/gm, "") // replace + console.timeEnd(n) } ``` @@ -205,41 +188,35 @@ Results are from local tests on a 2019 Intel i9 MacBook Pro. The following chart When the number of repetitions doubled, the runtime roughly quadrupled. This is a "quadratic" relationship. -### Why the regular expression is slow - The regular expression matches a string that starts with ``. Consider a function that repeatedly looks for the `` that appears afterwards. Computer scientists classify this algorithm as "Backtracking"[^4]: ```js {.line-numbers} function match_all_regex_comments(str) { - const results = []; + const results = [] /* look for the first instance of after ", start_index + 4); + let end_index = str.indexOf("-->", start_index + 4) /* if --> is found, then we have a match! */ - if(end_index > -1) { - + if (end_index > -1) { /* add to array */ - results.push(str.slice(start_index, end_index + 3)); + results.push(str.slice(start_index, end_index + 3)) /* start scanning from the end of the `-->` */ - start_index = str.indexOf("").unwrap(); + let mut str = "").unwrap(); + + /* construct string by repeating with itself */ + let mut str = "` is not in the string, the scan `str.indexOf("-->", start_index + 4)` will look at every character in the string starting from `start_index + 4`. In the worst case, with repeated `` are highlighted in blue. @@ -296,68 +322,6 @@ $$ In the worst case, the number of characters scanned is roughly proportional to the square of the length of the string. In "Big-O Notation", the complexity is $O(L^2)$. This is colloquially described as a "quadratic blowup". 
-### Vulnerability - -This is generally considered a vulnerability since relatively small data can cause browsers or servers to freeze for extended periods of time. - -The official category for this weakness is "CWE-1333"[^5] "Inefficient Regular Expression Complexity". - -Some resources use the phrase "Catastrophic backtracking" to describe the issue. - -### A side note about Rust - -Everyone writes high-performance code in Rust, right? - -Rust does not have built-in support for regular expressions. Third-party libraries fill the gap. - -The Rust `regress`[^6] crate is designed for JavaScript regular expressions. It represents a true apples-to-apples comparison with JavaScript. - -```rust - let re = regress::Regex::new(r"").unwrap(); - let mut str = "").unwrap(); - - /* construct string by repeating with itself */ - let mut str = "/mg),""); // replace +var out = str.replace(new RE2(//gm), "") // replace ```
Complete Example (click to show) ```js -var RE2 = require("re2"); +var RE2 = require("re2") // this loop doubles each time -for(var n = 64; n < 100000000; n*=2) { - var s = "/mg),""); // replace - console.timeEnd(n); +for (var n = 64; n < 100000000; n *= 2) { + var s = "/gm), "") // replace + console.timeEnd(n) } ``` @@ -440,8 +404,6 @@ fn main() { } ``` -
- The Rust `regex` implementation uses algorithms whose performance scales linearly with the size of the input. rust regex performance test - linear complexity @@ -466,7 +428,7 @@ The XML 1.0 specification[^10] disallows `--` within comments. [PrettierJS](https://github.com/prettier/prettier/blob/ff83d55d05e92ceef10ec0cb1c0272ab894a00a0/src/language-markdown/mdx.js#L28) uses a regular expression in the MDX parser that enforces the XML constraint: ```js -const COMMENT_REGEX = /|/; +const COMMENT_REGEX = /|/ ``` Commonly-used regular expression engines can optimize for this pattern and avoid backtracking. @@ -477,7 +439,6 @@ Commonly-used regular expression engines can optimize for this pattern and avoid The XML parser in Excel powering the [Excel 2003-2004 (SpreadsheetML) format](https://docs.sheetjs.com/docs/miscellany/formats#excel-2003-2004-spreadsheetml) allows `--` in the comment body. - #### HTML Comments The HTML5 standard[^11] permits `--` but forbids `` are treated as a comment. For example, consider the following HTML: @@ -522,49 +482,48 @@ Regular expression operations can be reimplemented using standard string operati For example, the replacement ```js -str = str.replace(//, ""); +str = str.replace(//, "") ``` can be rewritten with a loop. The core idea is to collect non-commented fragments: ```js {.line-numbers} function remove_xml_comments(str) { - const START = ""; - const results = []; + const START = "" + const results = [] /* this index tracks the last analyzed character */ - let last_index = 0; + let last_index = 0 /* look for the first instance of after is found, then we have a match! 
*/ - if(end_index > -1) { + if (end_index > -1) { /* skip the comment */ - last_index = end_index + END.length; + last_index = end_index + END.length /* search for next comment open tag */ - start_index = str.indexOf(START, last_index); - } + start_index = str.indexOf(START, last_index) + } else break /* if there is no end comment tag, stop processing */ - else break; } /* add remaining part of string */ - results.push(str.slice(last_index)); + results.push(str.slice(last_index)) /* concatenate the fragments */ - return results.join(""); + return results.join("") } ``` -- 2.34.1