Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gregli-msft committed Mar 6, 2025
1 parent 78887be commit 4f54c71
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 132 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,13 @@ Table({FullMatch:"1",StartMatch:1})
>> MatchAll( "1a" & Char(13)&Char(10) & "2b" & Char(13)&Char(10) & "3c", "^\d", MatchOptions.Multiline )
Table({FullMatch:"1",StartMatch:1},{FullMatch:"2",StartMatch:5},{FullMatch:"3",StartMatch:9})

// tests for matching between a \r and \n

>> MatchAll( "a" & Char(13) & Char(10) & "a", "(?m)^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1},{FullMatch:"a",StartMatch:4})

// >> MatchAll( "a" & Char(13) & Char(10), "(?m)^(a|)" ) // gives a different answer on PCRE2
// ok
// >> MatchAll( "a" & Char(13) & Char(10), "(?m)^(a|)" ) // gives a different answer on PCRE2 (doesn't include last empty match), but not on Excel?
// Table({FullMatch:"a",StartMatch:Float(1)},{FullMatch:"",StartMatch:Float(4)})

>> MatchAll( Char(13) & Char(10) & "a", "(?m)^(a|)" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:3})
Expand All @@ -98,6 +100,100 @@ Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",Sta
>> MatchAll( Char(13) & Char(10) & "a", "(?m)(a|)$" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

>> MatchAll( "a" & Char(13) & Char(10) & "a", "^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( "a" & Char(13) & Char(10), "^(a|)" )
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( Char(13) & Char(10) & "a", "^(a|)" )
Table({FullMatch:"",StartMatch:1})

>> MatchAll( "a" & Char(13) & Char(10) & "a", "(a|)$" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:4},{FullMatch:"",StartMatch:5})

>> MatchAll( "a" & Char(13) & Char(10) , "(a|)$" ) // pcre2 repeated empty result
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",StartMatch:4})

>> MatchAll( Char(13) & Char(10) & "a", "(a|)$" )
Table({FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

// same tests with single newline \n

>> MatchAll( "a" & Char(10) & "a", "(?m)^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1},{FullMatch:"a",StartMatch:3})

// >> MatchAll( "a" & Char(10), "(?m)^(a|)" ) // gives a different answer on PCRE2 (doesn't include last empty match), but not on Excel?
// Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(10) & "a", "(?m)^(a|)" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:2})

>> MatchAll( "a" & Char(10) & "a", "(?m)(a|)$" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

>> MatchAll( "a" & Char(10), "(?m)(a|)$" ) // pcre2 repeated empty result
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(10) & "a", "(?m)(a|)$" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( "a" & Char(10) & "a", "^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( "a" & Char(10), "^(a|)" )
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( Char(10) & "a", "^(a|)" )
Table({FullMatch:"",StartMatch:1})

>> MatchAll( "a" & Char(10) & "a", "(a|)$" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

>> MatchAll( "a" & Char(10) , "(a|)$" ) // pcre2 repeated empty result
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(10) & "a", "(a|)$" )
Table({FullMatch:"a",StartMatch:2},{FullMatch:"",StartMatch:3})

// same tests with single newline \r

>> MatchAll( "a" & Char(13) & "a", "(?m)^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1},{FullMatch:"a",StartMatch:3})

// >> MatchAll( "a" & Char(13), "(?m)^(a|)" ) // gives a different answer on PCRE2 (doesn't include last empty match), but not on Excel?
// Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(13) & "a", "(?m)^(a|)" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:2})

>> MatchAll( "a" & Char(13) & "a", "(?m)(a|)$" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

>> MatchAll( "a" & Char(13) , "(?m)(a|)$" ) // pcre2 repeated empty result
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(13) & "a", "(?m)(a|)$" )
Table({FullMatch:"",StartMatch:1},{FullMatch:"a",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( "a" & Char(13) & "a", "^(a|)" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( "a" & Char(13), "^(a|)" )
Table({FullMatch:"a",StartMatch:1})

>> MatchAll( Char(13) & "a", "^(a|)" )
Table({FullMatch:"",StartMatch:1})

>> MatchAll( "a" & Char(13) & "a", "(a|)$" ) // matches after the \r and \n in JavaScript, not in PCRE2
Table({FullMatch:"a",StartMatch:3},{FullMatch:"",StartMatch:4})

>> MatchAll( "a" & Char(13), "(a|)$" ) // pcre2 repeated empty result
Table({FullMatch:"a",StartMatch:1},{FullMatch:"",StartMatch:2},{FullMatch:"",StartMatch:3})

>> MatchAll( Char(13) & "a", "(a|)$" )
Table({FullMatch:"a",StartMatch:2},{FullMatch:"",StartMatch:3})

>> ForAll( MatchAll( "
a
b
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,87 +15,14 @@ public class RegEx_JavaScript
// For example, no affordance is made for nested character classes or inline options on a subexpression, as those would have already been blocked.
// Stick to single ticks for strings to keep this easier to read and maintain here in C#.
public const string AlterRegex_JavaScript = @"
/**
 * Scans a regular expression for a caret or dollar anchor that is live, meaning
 * it is not escaped with a backslash, not inside a character class, not inside
 * an inline comment of the form (?#...), and not inside a free-spacing
 * end-of-line comment.
 *
 * Leading inline options such as (?mx) are merged with the flags argument
 * before scanning. Only the x (free spacing) option affects the scan; the
 * multiline option is deliberately not consulted, so an anchor is reported
 * regardless of whether m is set.
 *
 * Keep string literals in single ticks: this source is embedded in a C#
 * verbatim string.
 *
 * @param {string} regex - The regular expression source text.
 * @param {string} flags - Option letters supplied separately from the pattern.
 * @returns {boolean} true if a live caret or dollar anchor is present, otherwise false.
 */
function AlterRegex_NeedToMapNewlines(regex, flags)
{
    let index = 0;
    let effectiveFlags = flags;

    // Inline options are only recognized at the very start of the pattern.
    const inlineFlags = /^\(\?(?<flags>[imnsx]+)\)/.exec(regex);
    if (inlineFlags !== null)
    {
        effectiveFlags = flags.concat(inlineFlags.groups['flags']);
        index = inlineFlags[0].length; // resume scanning after the options group
    }

    const freeSpacing = effectiveFlags.includes('x');
    let openCharacterClass = false;

    for ( ; index < regex.length; index++)
    {
        switch (regex.charAt(index))
        {
            case '[':
                openCharacterClass = true;
                break;

            case ']':
                openCharacterClass = false;
                break;

            case '\\':
                // skip the escaped character so it is never read as an anchor or delimiter
                index++;
                break;

            case '^':
            case '$':
                // inside a character class these are negation or a literal, not anchors
                if (!openCharacterClass)
                {
                    return true;
                }
                break;

            case '(':
                if (regex.length - index > 2 && regex.charAt(index + 1) === '?' && regex.charAt(index + 2) === '#')
                {
                    // inline comment: eat characters until a close paren,
                    // it doesn't matter if it is escaped (consistent with .NET)
                    index++;
                    while (index < regex.length && regex.charAt(index) !== ')')
                    {
                        index++;
                    }
                }
                break;

            case '#':
                if (freeSpacing && !openCharacterClass)
                {
                    // free-spacing comment: eat characters until the end of the line;
                    // the loop increment then steps over the line terminator itself
                    index++;
                    while (index < regex.length && regex.charAt(index) !== '\r' && regex.charAt(index) !== '\n')
                    {
                        index++;
                    }
                }
                break;
        }
    }

    return false;
}
function AlterRegex_JavaScript(regex, flags, crCode, nlCode)
{
const otherNewLines = '';
var index = 0;
if (crCode > 0xffff || nlCode > 0xffff)
return [undefined, undefined];
const cr = '\\u'.concat(crCode.toString(16).padStart(4,'0'));
const nl = '\\u'.concat(nlCode.toString(16).padStart(4,'0'));
const cr = '\\r';
const nl = '\\n';
const inlineFlagsRE = /^\(\?(?<flags>[imnsx]+)\)/;
const inlineFlags = inlineFlagsRE.exec( regex );
Expand All @@ -117,7 +44,8 @@ function AlterRegex_JavaScript(regex, flags, crCode, nlCode)
// rebuilding from booleans avoids possible duplicate letters
// x has been handled in this function and does not need to be passed on (and would cause an error)
const alteredFlags = 'v'.concat((ignoreCase ? 'i' : ''), (multiline ? 'm' : ''), (dotAll ? 's' : ''));
// multiline is excluded as the definitions for caret and dollar above take this into account
const alteredFlags = 'v'.concat((ignoreCase ? 'i' : ''), (dotAll ? 's' : ''));
var openCharacterClass = false; // are we defining a character class?
var altered = '';
Expand Down Expand Up @@ -200,34 +128,6 @@ function AlterRegex_JavaScript(regex, flags, crCode, nlCode)
alteredToken = regex.charAt(index);
break;
case 'r':
alteredToken = cr;
break;
case 'n':
alteredToken = nl;
break;
case 'x':
xCode = regex.charAt(++index) + regex.charAt(++index);
if (xCode == '0d' || xCode == '0D' )
alteredToken = cr;
else if (xCode == '0a' || xCode == '0A' )
alteredToken = nl;
else
alteredToken = '\\x'.concat( xCode );
break;
case 'u':
xCode = regex.charAt(++index) + regex.charAt(++index) + regex.charAt(++index) + regex.charAt(++index);
if (xCode == '000d' || xCode == '000D' )
alteredToken = cr;
else if (xCode == '000a' || xCode == '000A' )
alteredToken = nl;
else
alteredToken = '\\u'.concat( xCode );
break;
default:
alteredToken = '\\'.concat(regex.charAt(index));
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ public void RunOne()

testRunner.AddFile(new Dictionary<string, bool>(), null, path);

// We can filter to just cases we want, set line above abcd
// We can filter to just cases we want, set line above
if (line > 0)
{
testRunner.Tests.RemoveAll(x => x.SourceLine != line);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,11 @@ internal static async Task<FormulaValue> MatchAsync(string subject, string patte
string js2 = @"
function MatchTest( subject, pattern, flags, matchAll )
{
const newLineMap = AlterRegex_NeedToMapNewlines( pattern, flags );
console.log( 'need to map: ' + newLineMap );
const nlCode = newLineMap ? 0xf8f0 : 10;
const crCode = newLineMap ? 0xf8f1 : 13;
const nl = String.fromCharCode( nlCode );
const cr = String.fromCharCode( crCode );
if (crCode != 13 || nlCode != 10)
subject = subject.replaceAll( '\r', cr ).replaceAll( '\n', nl );
const [alteredPattern, alteredFlags, endGuards] = AlterRegex_JavaScript( pattern, flags, crCode, nlCode );
const [alteredPattern, alteredFlags, endGuards] = AlterRegex_JavaScript( pattern, flags );
const regex = RegExp(alteredPattern, alteredFlags.concat(matchAll ? 'g' : ''));
const matches = matchAll ? [...subject.matchAll(regex)] : [subject.match(regex)];
console.log(alteredPattern); // useful to debug AlterRegex_JavaScript
console.log(encodeURI(subject));
// console.log(alteredPattern); // useful to debug AlterRegex_JavaScript
// console.log(encodeURI(subject));
console.log('%%begin%%');
if (matches.length != 0 && matches[0] != null)
{
Expand All @@ -125,16 +113,8 @@ function MatchTest( subject, pattern, flags, matchAll )
{
var o = new Object();
o.Index = match.index;
o.Named = new Object();
for (const prop in match.groups)
{
o.Named[prop] = (match.groups[prop] == undefined ? undefined : newLineMap ? match.groups[prop].replaceAll( cr, '\r' ).replaceAll( nl, '\n' ) : match.groups[prop] );
}
o.Numbered = new Array();
for (const subMatch of match)
{
o.Numbered.push( subMatch == undefined ? undefined : newLineMap ? subMatch.replaceAll( cr, '\r' ).replaceAll( nl, '\n' ) : subMatch );
}
o.Named = match.groups;
o.Numbered = match;
arr.push(o);
}
console.log(JSON.stringify(arr));
Expand Down

0 comments on commit 4f54c71

Please sign in to comment.