Skip to content

Commit

Permalink
Updates, changed IsMatch to be contains vs complete
Browse files Browse the repository at this point in the history
  • Loading branch information
gregli-msft committed Mar 9, 2025
1 parent 22a21bb commit 4644f91
Show file tree
Hide file tree
Showing 11 changed files with 162 additions and 61 deletions.
54 changes: 35 additions & 19 deletions src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,35 @@ public MatchAllFunction(RegexTypeCache regexTypeCache)

// These start with the codes that can come after a regular expression definition in Perl/JavaScript with "/a+/misx" and can also be used in "(?misx)a+".
// If possible, do not add lower case letters that are Power Fx specific to avoid future conflicts with the industry. We added ^, $, and N as Power Fx specific.
internal class MatchOptionCodes
internal class MatchOptionChar
{
public const char BeginsWith = '^'; // invented by us, adds a '^' at the front of the regex
public const char EndsWith = '$'; // invented by us, adds a '$' at the end of the regex
public const char Begins = '^'; // invented by us, adds a '^' at the front of the regex
public const char Ends = '$'; // invented by us, adds a '$' at the end of the regex
public const char IgnoreCase = 'i';
public const char Multiline = 'm';
public const char FreeSpacing = 'x'; // we don't support the double 'xx' mode
public const char DotAll = 's'; // otherwise known as "singleline" in other flavors, hence the 's', but note is not the opposite of "multiline"
public const char ExplicitCapture = 'n'; // default for Power Fx, can be asserted too for compatibility
public const char NumberedSubMatches = 'N'; // invented by us, opposite of ExplicitCapture and can't be used together
public const char Contains = 'c'; // invented by us, something to wrap ^ and $ around
}

public const char FreeSpacing = 'x'; // we don't support the double 'xx' mode
public const char DotAll = 's'; // otherwise known as "singleline" in other flavors, hence the 's', but note is not the opposite of "multiline"
public const char ExplicitCapture = 'n'; // default for Power Fx, can be asserted too for compatibility with inline option (not exposed through Power Fx enum)
public const char NumberedSubMatches = 'N'; // invented by us, opposite of ExplicitCapture and can't be used together
public const char ContainsBeginsEndsComplete = 'c'; // invented by us, something to wrap ^ and $ around
}

// Options are inserted as strings, and the MatchOptions enum values are based on those strings. Tests for an option use the chars in MatchOptionChar, above.
// The distinction matters when testing for an existing BeginsWith/EndsWith/Contains/Complete directive: each of these inserts the single "c" char, so one char test covers all of them.
// There isn't a good way to get a constant string from a constant char in C#, so the values are duplicated here, in close proximity to the chars.
/// <summary>
/// String forms of the regular expression match options. These are the values that get
/// inserted into an options string and that back the MatchOptions enum members.
/// </summary>
/// <remarks>
/// Declared static because the type only holds constants and is never meant to be instantiated.
/// BeginsWith, EndsWith, Contains, and Complete all include the "c" code, so the presence of any
/// of them can be detected by testing for that single character.
/// The values duplicate the char codes in MatchOptionChar; keep the two in sync.
/// </remarks>
internal static class MatchOptionString
{
    /// <summary>Invented by us; adds a '^' at the front of the regex.</summary>
    public const string BeginsWith = "^c";

    /// <summary>Invented by us; adds a '$' at the end of the regex.</summary>
    public const string EndsWith = "c$";

    /// <summary>Case-insensitive matching.</summary>
    public const string IgnoreCase = "i";

    /// <summary>Multiline mode.</summary>
    public const string Multiline = "m";

    /// <summary>Free-spacing mode; the double "xx" mode is not supported.</summary>
    public const string FreeSpacing = "x";

    /// <summary>
    /// Otherwise known as "singleline" in other flavors, hence the 's'.
    /// Note that it is not the opposite of "multiline".
    /// </summary>
    public const string DotAll = "s";

    /// <summary>Invented by us; the opposite of ExplicitCapture, and the two can't be used together.</summary>
    public const string NumberedSubMatches = "N";

    /// <summary>Invented by us; something to wrap '^' and '$' around.</summary>
    public const string Contains = "c";

    /// <summary>Invented by us; Contains with both the '^' and the '$' around it.</summary>
    public const string Complete = "^c$";
}

internal class BaseMatchFunction : BuiltinFunction
{
private readonly ConcurrentDictionary<string, Tuple<DType, bool, bool, bool>> _regexTypeCache;
Expand Down Expand Up @@ -153,7 +169,7 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp
errors.EnsureError(args[2], TexlStrings.ErrVariableRegExOptions);
return false;
}
else if (!context.Features.PowerFxV1CompatibilityRules && goodTypeAndConstant && (regularExpressionOptions.Contains(MatchOptionCodes.DotAll) || regularExpressionOptions.Contains(MatchOptionCodes.FreeSpacing)))
else if (!context.Features.PowerFxV1CompatibilityRules && goodTypeAndConstant && (regularExpressionOptions.Contains(MatchOptionChar.DotAll) || regularExpressionOptions.Contains(MatchOptionChar.FreeSpacing)))
{
// some options are not available pre-V1, we leave the enum value in place and compile time error
// we can't detect this if not a constant string, which is supported by pre-V1 but is very uncommon
Expand All @@ -165,7 +181,7 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp
if (!context.Features.PowerFxV1CompatibilityRules)
{
// only used for the following analysis and type creation, not modified in the IR
regularExpressionOptions += MatchOptionCodes.NumberedSubMatches;
regularExpressionOptions += MatchOptionChar.NumberedSubMatches;
}

string alteredOptions = regularExpressionOptions;
Expand All @@ -180,7 +196,7 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp
// - Regular expression pattern
// - NumberedSubMatches vs. Not
// if another MatchOption is added which impacts the return type, this will need to be updated
string regexCacheKey = this._cachePrefix + (alteredOptions.Contains(MatchOptionCodes.NumberedSubMatches) ? "N_" : "-_") + regularExpression;
string regexCacheKey = this._cachePrefix + (alteredOptions.Contains(MatchOptionChar.NumberedSubMatches) ? "N_" : "-_") + regularExpression;

// if the key is found in the cache, then the regular expression must have previously passed IsSupportedRegularExpression (or we are pre V1 and we don't check)
if (RegexCacheTypeLookup(regExNode, regexCacheKey, errors, ref returnType))
Expand Down Expand Up @@ -232,7 +248,7 @@ private bool RegexCacheTypeCreate(TexlNode regExNode, string regexCacheKey, stri
try
{
var regexDotNetOptions = RegexOptions.None;
if (alteredOptions.Contains(MatchOptionCodes.FreeSpacing))
if (alteredOptions.Contains(MatchOptionChar.FreeSpacing))
{
regexDotNetOptions |= RegexOptions.IgnorePatternWhitespace;

Expand Down Expand Up @@ -284,7 +300,7 @@ private bool RegexCacheTypeCreate(TexlNode regExNode, string regexCacheKey, stri
propertyNames.Add(new TypedName(DType.String, ColumnName_FullMatch));
}

if (!subMatchesHidden && alteredOptions.Contains(MatchOptionCodes.NumberedSubMatches))
if (!subMatchesHidden && alteredOptions.Contains(MatchOptionChar.NumberedSubMatches))
{
propertyNames.Add(new TypedName(DType.CreateTable(new TypedName(DType.String, ColumnName_Value)), ColumnName_SubMatches));
}
Expand Down Expand Up @@ -370,8 +386,8 @@ private void AddWarnings(TexlNode regExNode, IErrorContainer errors, bool hidesF

private bool IsSupportedRegularExpression(TexlNode regExNode, string regexPattern, string regexOptions, out string alteredOptions, IErrorContainer errors)
{
bool freeSpacing = regexOptions.Contains(MatchOptionCodes.FreeSpacing); // can also be set with inline mode modifier
bool numberedCpature = regexOptions.Contains(MatchOptionCodes.NumberedSubMatches); // can only be set here, no inline mode modifier
bool freeSpacing = regexOptions.Contains(MatchOptionChar.FreeSpacing); // can also be set with inline mode modifier
bool numberedCpature = regexOptions.Contains(MatchOptionChar.NumberedSubMatches); // can only be set here, no inline mode modifier

alteredOptions = regexOptions;

Expand Down Expand Up @@ -802,13 +818,13 @@ void CCRegExError(ErrorResourceKey errKey)
return false;
}

if (inlineOptions.Contains(MatchOptionCodes.ExplicitCapture) && numberedCpature)
if (inlineOptions.Contains(MatchOptionChar.ExplicitCapture) && numberedCpature)
{
RegExError(TexlStrings.ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches);
return false;
}

if (inlineOptions.Contains(MatchOptionCodes.FreeSpacing))
if (inlineOptions.Contains(MatchOptionChar.FreeSpacing))
{
freeSpacing = true;
}
Expand Down
18 changes: 9 additions & 9 deletions src/libraries/Microsoft.PowerFx.Core/Types/Enums/BuiltInEnums.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,15 @@ internal static class BuiltInEnums
DType.String,
new Dictionary<string, object>()
{
{ "BeginsWith", $"{MatchOptionCodes.BeginsWith}{MatchOptionCodes.Contains}" },
{ "EndsWith", $"{MatchOptionCodes.Contains}{MatchOptionCodes.EndsWith}" },
{ "Complete", $"{MatchOptionCodes.BeginsWith}{MatchOptionCodes.Contains}{MatchOptionCodes.EndsWith}" },
{ "Contains", $"{MatchOptionCodes.Contains}" },
{ "IgnoreCase", $"{MatchOptionCodes.IgnoreCase}" },
{ "Multiline", $"{MatchOptionCodes.Multiline}" },
{ "FreeSpacing", $"{MatchOptionCodes.FreeSpacing}" },
{ "DotAll", $"{MatchOptionCodes.DotAll}" },
{ "NumberedSubMatches", $"{MatchOptionCodes.NumberedSubMatches}" }
{ "BeginsWith", MatchOptionString.BeginsWith },
{ "EndsWith", MatchOptionString.EndsWith },
{ "Complete", MatchOptionString.Complete },
{ "Contains", MatchOptionString.Contains },
{ "IgnoreCase", MatchOptionString.IgnoreCase },
{ "Multiline", MatchOptionString.Multiline },
{ "FreeSpacing", MatchOptionString.FreeSpacing },
{ "DotAll", MatchOptionString.DotAll },
{ "NumberedSubMatches", MatchOptionString.NumberedSubMatches }
},
canConcatenateStronglyTyped: true);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ internal abstract class RegexCommonImplementation : IAsyncTexlFunction
protected const string STARTMATCH = "StartMatch";
protected const string SUBMATCHES = "SubMatches";

protected const string DefaultIsMatchOptions = "^c$";
protected const string DefaultMatchOptions = "c";
protected const string DefaultMatchAllOptions = "c";
protected const string DefaultIsMatchOptions = MatchOptionString.Contains;
protected const string DefaultMatchOptions = MatchOptionString.Contains;
protected const string DefaultMatchAllOptions = MatchOptionString.Contains;

public Task<FormulaValue> InvokeAsync(FormulaValue[] args, CancellationToken cancellationToken)
{
Expand Down Expand Up @@ -176,6 +176,12 @@ public Task<FormulaValue> InvokeAsync(FormulaValue[] args, CancellationToken can
default:
return Task.FromResult<FormulaValue>(args[2] is ErrorValue ? args[2] : CommonErrors.InvalidArgumentError(args[2].IRContext, RuntimeStringResources.ErrInvalidArgument));
}

// don't override complete/contains/beginswith/endswith if already given, all these options include Contains ("c")
if (!matchOptions.Contains(MatchOptionChar.ContainsBeginsEndsComplete))
{
matchOptions += DefaultRegexOptions;
}
}
else
{
Expand Down Expand Up @@ -228,13 +234,13 @@ public Task<FormulaValue> InvokeAsync(FormulaValue[] args, CancellationToken can
index = inlineOptions.Length;
}

bool freeSpacing = options.Contains(MatchOptionCodes.FreeSpacing);
bool multiline = options.Contains(MatchOptionCodes.Multiline);
bool ignoreCase = options.Contains(MatchOptionCodes.IgnoreCase);
bool dotAll = options.Contains(MatchOptionCodes.DotAll);
bool matchStart = options.Contains(MatchOptionCodes.BeginsWith);
bool matchEnd = options.Contains(MatchOptionCodes.EndsWith);
bool numberedSubMatches = options.Contains(MatchOptionCodes.NumberedSubMatches);
bool freeSpacing = options.Contains(MatchOptionChar.FreeSpacing);
bool multiline = options.Contains(MatchOptionChar.Multiline);
bool ignoreCase = options.Contains(MatchOptionChar.IgnoreCase);
bool dotAll = options.Contains(MatchOptionChar.DotAll);
bool matchStart = options.Contains(MatchOptionChar.BeginsWith);
bool matchEnd = options.Contains(MatchOptionChar.EndsWith);
bool numberedSubMatches = options.Contains(MatchOptionChar.NumberedSubMatches);

// Can't add options ^ and $ too early as there may be freespacing comments, centralize the logic here and call subfunctions
string AlterStart()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#SETUP: RegEx

>> IsMatch("Hi", "H")
>> IsMatch("Hi", "H", MatchOptions.Complete)
false

>> IsMatch("Hi", "Hi")
Expand Down Expand Up @@ -63,16 +63,26 @@ false
>> IsMatch(Blank(), Blank())
Errors: Error 17-24: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments.

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}", MatchOptions.Complete)
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}", MatchOptions.Complete)
false

// With Icelandic Eth
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}", MatchOptions.Complete)
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}", MatchOptions.Contains)
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}", MatchOptions.Contains)
false

// With Icelandic Eth
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}", MatchOptions.Contains)
true

>> IsMatch( "28", Concat( [2,8], Value ) )
Errors: Error 15-37: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments.

Expand Down Expand Up @@ -139,11 +149,21 @@ false
>> IsMatch("Hi", "I", MatchOptions.EndsWith & MatchOptions.IgnoreCase)
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}", MatchOptions.Contains)
>> IsMatch( "abcba", "abcba", MatchOptions.Complete )
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}", MatchOptions.Contains)
>> IsMatch( "abcba", "abcba", MatchOptions.BeginsWith & MatchOptions.EndsWith )
true

>> IsMatch( "abcba", "bcba", MatchOptions.BeginsWith & MatchOptions.EndsWith )
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}", MatchOptions.Contains)
>> IsMatch( "abcba", "bcba", MatchOptions.EndsWith )
true

>> IsMatch( "abcba", "abcb", MatchOptions.BeginsWith & MatchOptions.EndsWith )
false

>> IsMatch( "abcba", "abcb", MatchOptions.BeginsWith )
true

Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ true
>> IsMatch("Hi", "Hi", "foo")
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}", "^c$")
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}", "^c$")
false

// With Icelandic Eth
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}")
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}", "^c$")
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}", "c")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,16 @@ Errors: Error 15-19: Invalid regular expression: Unclosed groups, too few closin

>> IsMatch("""Hello world""", "\w+", If( Sqrt(4) > 0, MatchOptions.Contains, MatchOptions.Complete))
Errors: Error 34-96: MatchOptions must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments.

// newer IsMatch "contains" semantics by default

>> IsMatch( "ihi", "h" )
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}")
true

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}")
false


Original file line number Diff line number Diff line change
@@ -1,4 +1,25 @@
#SETUP: RegEx,disable:PowerFxV1CompatibilityRules
#SETUP: RegEx,disable:PowerFxV1CompatibilityRules

// Without V1, we don't run the full regular expression validation test, so this result comes from .NET

>> IsMatch("Foo", "J(")
Errors: Error 15-19: Invalid regular expression.|Error 0-7: The function 'IsMatch' has some invalid arguments.

// includes tests for IsMatch with the older "complete" semantics

>> IsMatch("ihi", "h")
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}")
false

>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` 1234567890", "\p{L}")
false

// With Icelandic Eth
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}")
false

// set options shouldn't override complete
>> IsMatch( "aa", "a", MatchOptions.IgnoreCase )
false
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Errors: Error 15-63: Invalid regular expression: Unclosed inline comment, starts
"test"

>> IsMatch( "ab", "(?x)a # b" )
false
true

>> IsMatch( "ab", "(?x)a # " &Char(13)& " b" )
true
Expand All @@ -93,8 +93,8 @@ true
>> IsMatch( "ab", "(?x)a # " &Char(13)&Char(10)& " b" ) // one is the newline, the other is just whitespace that is ignored
true

>> IsMatch( "ab", "(?x)a # " &Char(133)& " b" ) // \x85
false
// >> IsMatch( "ab", "(?x)a # " &Char(133)& " b" ) // \x85
// false

// Edge cases for removal during RE translations
>> Match( "1111111122221", "(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)\11", MatchOptions.NumberedSubMatches )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -423,12 +423,18 @@ Errors: Error 0-7: The function 'Boolean' has some invalid arguments.
>> Match( "a3d4", "\d")
{FullMatch:"3",StartMatch:2}

>> IsMatch( "a3d4", Match.Digit )
>> IsMatch( "a3d4", Match.Digit, MatchOptions.Complete )
false

>> IsMatch( "a3d4", "\d")
>> IsMatch( "a3d4", Match.Digit, MatchOptions.Contains )
true

>> IsMatch( "a3d4", "\d", MatchOptions.Complete )
false

>> IsMatch( "a3d4", "\d", MatchOptions.Contains )
true

>> MatchAll( "a3d4", Match.Digit )
Table({FullMatch:"3",StartMatch:2},{FullMatch:"4",StartMatch:4})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,12 +380,18 @@ Error({Kind:ErrorKind.InvalidArgument})
>> Match( "a3d4", "\d").FullMatch
"3"

>> IsMatch( "a3d4", Match.Digit )
>> IsMatch( "a3d4", Match.Digit, MatchOptions.Complete )
false

>> IsMatch( "a3d4", "\d")
>> IsMatch( "a3d4", Match.Digit, MatchOptions.Contains )
true

>> IsMatch( "a3d4", "\d", MatchOptions.Complete )
false

>> IsMatch( "a3d4", "\d", MatchOptions.Contains )
true

>> ForAll( MatchAll( "a3d4", Match.Digit ), {fm:FullMatch} )
Table({fm:"3"},{fm:"4"})

Expand Down
Loading

0 comments on commit 4644f91

Please sign in to comment.