Skip to content

Commit 91ee342

Browse files
authored
Fix v-flag bugs (#85)
1 parent 947f2cd commit 91ee342

File tree

5 files changed

+128
-27
lines changed

5 files changed

+128
-27
lines changed

rewrite-pattern.js

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ function flatMap(array, callback) {
2121
return result;
2222
}
2323

24+
/**
 * Reports whether a Regenerate set contains at least one astral code point
 * (U+10000 or above).
 *
 * Only the last entry of the set's `data` array is inspected.
 * NOTE(review): this assumes `data` is regenerate's sorted, flat list of
 * half-open `[start, end)` pairs, so the last entry is one past the highest
 * code point in the set — confirm against the regenerate version in use.
 *
 * @param {{data: number[]}} regenerateData Regenerate set (any object exposing `data`).
 * @returns {boolean} `true` when the highest code point in the set is >= U+10000.
 */
function regenerateContainsAstral(regenerateData) {
	const data = regenerateData.data;
	// The final entry is an exclusive upper bound: a BMP-only set that reaches
	// U+FFFF ends with exactly 0x10000 and must not be reported as astral.
	return data.length >= 1 && data[data.length - 1] > 0x10000;
}
28+
2429
const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;
2530

2631
// Prepare a Regenerate set containing all code points, used for negative
@@ -330,7 +335,7 @@ const buildHandler = (action) => {
330335
}
331336
// The `default` clause is only here as a safeguard; it should never be
332337
// reached. Code coverage tools should ignore it.
333-
/* istanbul ignore next */
338+
/* node:coverage ignore next */
334339
default:
335340
throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
336341
}
@@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
414419
break;
415420
// The `default` clause is only here as a safeguard; it should never be
416421
// reached. Code coverage tools should ignore it.
417-
/* istanbul ignore next */
422+
/* node:coverage ignore next */
418423
default:
419424
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
420425
}
@@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
441446
case 'characterClassEscape':
442447
handlePositive.regSet(data, getCharacterClassEscapeSet(
443448
item.value,
444-
config.flags.unicode,
449+
config.flags.unicode || config.flags.unicodeSets,
445450
config.flags.ignoreCase
446451
));
447452
break;
@@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
465470
break;
466471
// The `default` clause is only here as a safeguard; it should never be
467472
// reached. Code coverage tools should ignore it.
468-
/* istanbul ignore next */
473+
/* node:coverage ignore next */
469474
default:
470475
throw new Error(`Unknown term type: ${ item.type }`);
471476
}
@@ -488,13 +493,15 @@ const processCharacterClass = (
488493
const negative = characterClassItem.negative;
489494
const { singleChars, transformed, longStrings } = computed;
490495
if (transformed) {
491-
const setStr = singleChars.toString(regenerateOptions);
496+
// If singleChars already contains an astral character, regenerate (bmpOnly: true) will create valid regex strings
497+
const bmpOnly = regenerateContainsAstral(singleChars);
498+
const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));
492499

493500
if (negative) {
494501
if (config.useUnicodeFlag) {
495502
update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
496503
} else {
497-
if (config.flags.unicode) {
504+
if (config.flags.unicode || config.flags.unicodeSets) {
498505
if (config.flags.ignoreCase) {
499506
const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
500507
// Assumption: singleChars do not contain lone surrogates.
@@ -518,10 +525,9 @@ const processCharacterClass = (
518525
);
519526
} else {
520527
// Generate negative set directly when case folding is not involved.
521-
update(
522-
characterClassItem,
523-
UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
524-
);
528+
const negativeSet = UNICODE_SET.clone().remove(singleChars);
529+
const bmpOnly = regenerateContainsAstral(negativeSet);
530+
update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
525531
}
526532
} else {
527533
update(characterClassItem, `(?!${setStr})[\\s\\S]`);
@@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => {
731737
break;
732738
// The `default` clause is only here as a safeguard; it should never be
733739
// reached. Code coverage tools should ignore it.
734-
/* istanbul ignore next */
740+
/* node:coverage ignore next */
735741
default:
736742
throw new Error(`Unknown term type: ${ item.type }`);
737743
}
@@ -835,7 +841,7 @@ const rewritePattern = (pattern, flags, options) => {
835841

836842
const regenerateOptions = {
837843
'hasUnicodeFlag': config.useUnicodeFlag,
838-
'bmpOnly': !config.flags.unicode
844+
'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
839845
};
840846

841847
const groups = {

tests/fixtures/character-class.js

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,25 +44,33 @@ const characterClassFixtures = [
4444
{
4545
pattern: '[^K]', // LATIN CAPITAL LETTER K
4646
flags: 'u',
47-
expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
47+
matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
48+
nonMatches: ["K"],
49+
expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
4850
options: { unicodeFlag: 'transform' }
4951
},
5052
{
5153
pattern: '[^k]', // LATIN SMALL LETTER K
5254
flags: 'u',
53-
expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
55+
matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
56+
nonMatches: ["k"],
57+
expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
5458
options: { unicodeFlag: 'transform' }
5559
},
5660
{
5761
pattern: '[^\u212a]', // KELVIN SIGN
5862
flags: 'u',
59-
expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
63+
matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
64+
nonMatches: ["\u212a"],
65+
expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
6066
options: { unicodeFlag: 'transform' }
6167
},
6268
{
6369
pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
6470
flags: 'u',
65-
expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
71+
matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
72+
nonMatches: ["\u{1D50E}"],
73+
expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
6674
options: { unicodeFlag: 'transform' }
6775
},
6876
{

tests/fixtures/unicode-set.js

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,21 @@ const unicodeSetFixtures = [
105105
},
106106
{
107107
pattern: '[^[a-z][f-h]]',
108-
expected: '(?:(?![a-z])[\\s\\S])',
108+
matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
109+
nonMatches: ["a", "z"],
110+
expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
109111
options: TRANSFORM_U
110112
},
111113
{
112114
pattern: '[[^a-z][f-h]]',
115+
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
116+
nonMatches: ["a", "z"],
113117
expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
114118
},
115119
{
116120
pattern: '[[^a-z][f-h]]',
121+
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
122+
nonMatches: ["a", "z"],
117123
expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
118124
options: TRANSFORM_U
119125
},
@@ -336,6 +342,13 @@ const unicodeSetFixtures = [
336342
{
337343
pattern: '[\\p{ASCII}&&\\p{Control}]',
338344
expected: '[\\0-\\x1F\\x7F]',
345+
},
346+
{
347+
pattern: '.',
348+
flags: 'sv',
349+
matches: ['\n'],
350+
options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' },
351+
expected: '[\\s\\S]'
339352
}
340353
];
341354

tests/fixtures/unicode.js

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ const unicodeFixtures = [
4444
{
4545
'pattern': '[\\s\\S]',
4646
'flags': FLAGS_WITH_UNICODE,
47-
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
47+
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
4848
},
4949
{
5050
'pattern': '\\d',
@@ -68,8 +68,9 @@ const unicodeFixtures = [
6868
},
6969
{
7070
'pattern': '[\\d\\D]',
71+
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
7172
'flags': FLAGS_WITH_UNICODE,
72-
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
73+
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
7374
},
7475
{
7576
'pattern': '\\w',
@@ -100,8 +101,9 @@ const unicodeFixtures = [
100101
},
101102
{
102103
'pattern': '[\\w\\W]',
104+
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
103105
'flags': FLAGS_WITH_UNICODE,
104-
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
106+
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
105107
},
106108
{
107109
'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
@@ -180,11 +182,14 @@ const unicodeFixtures = [
180182
},
181183
{
182184
'pattern': '[^a]',
185+
'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
186+
'nonMatches': ['a'],
183187
'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
184-
'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
188+
'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
185189
},
186190
{
187191
'pattern': '[^a]',
192+
'nonMatches': ['a', 'A'],
188193
'flags': FLAGS_WITH_UNICODE_WITH_I,
189194
'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
190195
},

tests/tests.js

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,30 @@ const { characterClassFixtures } = require("./fixtures/character-class.js");
1717
const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
1818
const { modifiersFixtures } = require("./fixtures/modifiers.js");
1919

20+
/** For node 6 compat */
21+
assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) });
22+
assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) });
23+
24+
/**
 * Compute the flags of the transpiled regex from the input flags and the
 * transform options.
 *
 * Each flag whose feature is set to `"transform"` is rewritten:
 * `v` becomes `u` (unicodeSetsFlag), `u` is dropped (unicodeFlag) and `s` is
 * dropped (dotAllFlag). The steps run in that order, so a `v` flag with both
 * unicodeSetsFlag and unicodeFlag transformed ends up removed entirely.
 *
 * @param {string} inputFlags Flags of the original pattern, e.g. `"sv"`.
 * @param {*} [options] Transform options; only `unicodeSetsFlag`,
 *   `unicodeFlag` and `dotAllFlag` are read. Defaults to no transforms.
 * @returns {string} Flags to use when constructing the transpiled RegExp.
 */
function getOutputFlags(inputFlags, options = {}) {
	let result = inputFlags;
	if (options.unicodeSetsFlag === "transform") {
		result = result.replace("v", "u");
	}
	if (options.unicodeFlag === "transform") {
		result = result.replace("u", "");
	}
	if (options.dotAllFlag === "transform") {
		result = result.replace("s", "");
	}
	return result;
}
43+
2044
describe('rewritePattern { unicodeFlag }', () => {
2145
const options = {
2246
'unicodeFlag': 'transform'
@@ -95,19 +119,19 @@ describe('unicodePropertyEscapes', () => {
95119
);
96120
assert.equal(
97121
rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features),
98-
'(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
122+
'(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
99123
);
100124
assert.equal(
101125
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features),
102-
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
126+
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
103127
);
104128
assert.equal(
105129
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
106130
'(?:_|\\uD811[\\uDC00-\\uDE46])'
107131
);
108132
assert.equal(
109133
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
110-
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
134+
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
111135
);
112136
assert.equal(
113137
rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features),
@@ -219,10 +243,10 @@ describe('unicodePropertyEscapes', () => {
219243
'[\\u{14400}-\\u{14646}]'
220244
);
221245
assert.equal(
222-
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
246+
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
223247
'unicodePropertyEscapes': 'transform',
224248
}),
225-
'[\\u{14400}-\\u{14646}]'
249+
'[\\0-\\u{143FF}\\u{14647}-\\u{10FFFF}]'
226250
);
227251
});
228252
it('should not transpile unicode property when unicodePropertyEscapes is not enabled', () => {
@@ -391,13 +415,50 @@ describe('character classes', () => {
391415
if (transpiled != '(?:' + expected + ')') {
392416
assert.strictEqual(transpiled, expected);
393417
}
418+
for (const match of fixture.matches || []) {
419+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
420+
assert.match(match, transpiledRegex);
421+
}
422+
for (const nonMatch of fixture.nonMatches || []) {
423+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
424+
assert.doesNotMatch(nonMatch, transpiledRegex);
425+
}
394426
});
395427
}
396428
});
397429

398430

399431

400432
describe('unicodeSets (v) flag', () => {
433+
// Re-use the unicode fixtures but replacing the input pattern's `u` flag with `v` flag
434+
for (const fixture of unicodeFixtures) {
435+
if (fixture.flags.includes("u")) {
436+
for (let flag of fixture.flags) {
437+
flag = flag.replace("u", "v");
438+
const { pattern, transpiled: expected } = fixture;
439+
const inputRE = `/${pattern}/${flag}`;
440+
it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => {
441+
const options = {
442+
unicodeSetsFlag: "transform",
443+
unicodeFlag: "transform",
444+
};
445+
const transpiled = rewritePattern(pattern, flag, options);
446+
if (transpiled != "(?:" + expected + ")") {
447+
assert.strictEqual(transpiled, expected);
448+
}
449+
for (const match of fixture.matches || []) {
450+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
451+
assert.match(match, transpiledRegex);
452+
}
453+
for (const nonMatch of fixture.nonMatches || []) {
454+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
455+
assert.doesNotMatch(nonMatch, transpiledRegex);
456+
}
457+
});
458+
}
459+
}
460+
}
461+
401462
if (IS_NODE_6) return;
402463

403464
for (const fixture of unicodeSetFixtures) {
@@ -421,12 +482,20 @@ describe('unicodeSets (v) flag', () => {
421482
}, throws);
422483
});
423484
} else {
485+
const transpiled = rewritePattern(pattern, flags, options);
424486
it(`rewrites \`${inputRE}\` correctly ${transformUnicodeFlag ? 'without ' : ''}using the u flag`, () => {
425-
const transpiled = rewritePattern(pattern, flags, options);
426487
if (transpiled != '(?:' + expected + ')') {
427488
assert.strictEqual(transpiled, expected);
428489
}
429490
});
491+
for (const match of fixture.matches || []) {
492+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
493+
assert.match(match, transpiledRegex);
494+
}
495+
for (const nonMatch of fixture.nonMatches || []) {
496+
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
497+
assert.doesNotMatch(nonMatch, transpiledRegex);
498+
}
430499
}
431500
}
432501

0 commit comments

Comments
 (0)