Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gregli-msft committed Mar 8, 2025
1 parent ce00d9a commit 22a21bb
Show file tree
Hide file tree
Showing 15 changed files with 324 additions and 173 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,7 @@ internal static class TexlStrings
public static ErrorResourceKey ErrInvalidRegExLowHighQuantifierFlip = new ErrorResourceKey("ErrInvalidRegExLowHighQuantifierFlip");
public static ErrorResourceKey ErrInvalidRegExLookbehindTooManyChars = new ErrorResourceKey("ErrInvalidRegExLookbehindTooManyChars");
public static ErrorResourceKey ErrInvalidRegExNumberOverflow = new ErrorResourceKey("ErrInvalidRegExNumberOverflow");
public static ErrorResourceKey ErrInvalidRegExV1Options = new ErrorResourceKey("ErrInvalidRegExV1Options");

public static ErrorResourceKey ErrVariableRegEx = new ErrorResourceKey("ErrVariableRegEx");
public static ErrorResourceKey ErrVariableRegExOptions = new ErrorResourceKey("ErrVariableRegExOptions");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ namespace Microsoft.PowerFx
/// </summary>
internal class RegexTypeCache
{
// Key is ("tbl_" or "rec_" + regex expression)
// Key is ("bol_", "tbl_", or "rec_" + schema altering options + regex expression)
// DType can be null if we have validated the regular expression, but didn't need the type for IsMatch
// See Match.cs code for details
internal ConcurrentDictionary<string, Tuple<DType, bool, bool, bool>> Cache { get; }

Expand Down
208 changes: 130 additions & 78 deletions src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ namespace Microsoft.PowerFx.Core.Texl.Builtins
// IsMatch(text:s, regular_expression:s, [options:s])
internal class IsMatchFunction : BaseMatchFunction
{
public IsMatchFunction()
: base("IsMatch", TexlStrings.AboutIsMatch, DType.Boolean, null)
public IsMatchFunction(RegexTypeCache regexTypeCache)
: base("IsMatch", TexlStrings.AboutIsMatch, DType.Boolean, regexTypeCache)
{
}

Expand Down Expand Up @@ -93,7 +93,7 @@ public BaseMatchFunction(string functionName, TexlStrings.StringGetter aboutGett
{
if (regexTypeCache != null)
{
_cachePrefix = returnType.IsTable ? "tbl_" : "rec_";
_cachePrefix = returnType == DType.Boolean ? "bol_" : (returnType.IsTable ? "tbl_" : "rec_");
_regexTypeCache = regexTypeCache.Cache;
_regexCacheSize = regexTypeCache.CacheSize;
}
Expand Down Expand Up @@ -129,64 +129,105 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp
bool fValid = base.CheckTypes(context, args, argTypes, errors, out returnType, out nodeToCoercedTypeMap);
Contracts.Assert(returnType.IsRecord || returnType.IsTable || returnType == DType.Boolean);

string regularExpressionOptions = string.Empty;
var regExNode = args[1];

if ((argTypes[1].Kind != DKind.String && argTypes[1].Kind != DKind.OptionSetValue) || !BinderUtils.TryGetConstantValue(context, regExNode, out var regularExpression))
{
errors.EnsureError(regExNode, TexlStrings.ErrVariableRegEx);
return false;
}

if (context.Features.PowerFxV1CompatibilityRules && args.Length == 3 &&
((argTypes[2].Kind != DKind.String && argTypes[2].Kind != DKind.OptionSetValue) || !BinderUtils.TryGetConstantValue(context, args[2], out regularExpressionOptions)))
{
errors.EnsureError(args[2], TexlStrings.ErrVariableRegExOptions);
return false;
}

string regularExpressionOptions = string.Empty;

if (args.Length == 3)
{
var goodTypeAndConstant = false;

if (argTypes[2].Kind == DKind.String || argTypes[2].Kind == DKind.OptionSetValue)
{
goodTypeAndConstant = BinderUtils.TryGetConstantValue(context, args[2], out regularExpressionOptions);
}

if (context.Features.PowerFxV1CompatibilityRules && !goodTypeAndConstant)
{
errors.EnsureError(args[2], TexlStrings.ErrVariableRegExOptions);
return false;
}
else if (!context.Features.PowerFxV1CompatibilityRules && goodTypeAndConstant && (regularExpressionOptions.Contains(MatchOptionCodes.DotAll) || regularExpressionOptions.Contains(MatchOptionCodes.FreeSpacing)))
{
// Some options are not available pre-V1; the enum values are left in place and using them is reported as a compile-time error.
// This can't be detected if the options are not a constant string, which pre-V1 supports but is very uncommon.
errors.EnsureError(args[2], TexlStrings.ErrInvalidRegExV1Options, args[2]);
return false;
}
}

if (!context.Features.PowerFxV1CompatibilityRules)
{
{
// only used for the following analysis and type creation, not modified in the IR
regularExpressionOptions += MatchOptionCodes.NumberedSubMatches;
regularExpressionOptions += MatchOptionCodes.NumberedSubMatches;
}

string alteredOptions = regularExpressionOptions;

return fValid &&
(!context.Features.PowerFxV1CompatibilityRules || IsSupportedRegularExpression(regExNode, regularExpression, regularExpressionOptions, out alteredOptions, errors)) &&
(returnType == DType.Boolean || TryCreateReturnType(regExNode, regularExpression, alteredOptions, errors, ref returnType));
}

// Creates a typed result: [Match:s, Captures:*[Value:s], NamedCaptures:r[<namedCaptures>:s]]
private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, string alteredOptions, IErrorContainer errors, ref DType returnType)
{
Contracts.AssertValue(regexPattern);
string prefixedRegexPattern = this._cachePrefix + regexPattern;

if (_regexTypeCache != null && _regexTypeCache.ContainsKey(prefixedRegexPattern))
string alteredOptions = regularExpressionOptions;

if (!fValid)
{
return false;
}

// Cache entry can vary on:
// - Boolean (IsMatch) vs. Table (MatchAll) vs. Record (Match)
// - Regular expression pattern
// - NumberedSubMatches vs. Not
// if another MatchOption is added which impacts the return type, this will need to be updated
string regexCacheKey = this._cachePrefix + (alteredOptions.Contains(MatchOptionCodes.NumberedSubMatches) ? "N_" : "-_") + regularExpression;

// if the key is found in the cache, then the regular expression must have previously passed IsSupportedRegularExpression (or we are pre V1 and we don't check)
if (RegexCacheTypeLookup(regExNode, regexCacheKey, errors, ref returnType))
{
return true;
}

// cache miss, validate the regular expression, create the return type, and cache
if (!context.Features.PowerFxV1CompatibilityRules || IsSupportedRegularExpression(regExNode, regularExpression, regularExpressionOptions, out alteredOptions, errors))
{
return RegexCacheTypeCreate(regExNode, regexCacheKey, regularExpression, alteredOptions, errors, ref returnType);
}

return false;
}

private bool RegexCacheTypeLookup(TexlNode regExNode, string regexCacheKey, IErrorContainer errors, ref DType returnType)
{
if (_regexTypeCache != null && _regexTypeCache.ContainsKey(regexCacheKey))
{
var cachedType = _regexTypeCache[prefixedRegexPattern];
var cachedType = _regexTypeCache[regexCacheKey];
if (cachedType != null)
{
{
returnType = cachedType.Item1;
AddWarnings(regExNode, errors, cachedType.Item2, cachedType.Item3, cachedType.Item4);
return true;
}
AddWarnings(regExNode, errors, cachedType.Item2, cachedType.Item3, cachedType.Item4);
return true;
}
else
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
return false;
}
}
}

return false;
}

// Creates a typed result: [Match:s, Captures:*[Value:s], NamedCaptures:r[<namedCaptures>:s]]
private bool RegexCacheTypeCreate(TexlNode regExNode, string regexCacheKey, string regexPattern, string alteredOptions, IErrorContainer errors, ref DType returnType)
{
if (_regexTypeCache != null && _regexTypeCache.Count >= _regexCacheSize)
{
// To preserve memory during authoring, we clear the cache if it gets
// too large. This should only happen in a minority of cases and
// should have no impact on deployed apps.
_regexTypeCache.Clear();
}
}

try
{
Expand All @@ -198,70 +239,81 @@ private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, string
// In x mode, comment line endings are [\r\n], but .NET only supports \n. For our purposes here, we can just replace the \r.
regexPattern = regexPattern.Replace('\r', '\n');
}

// Always compile the regular expression with .NET, even when the return type isn't needed (Boolean), to ensure it is legal in .NET
var regex = new Regex(regexPattern, regexDotNetOptions);

if (returnType == DType.Boolean)
{
if (_regexTypeCache != null)
{
_regexTypeCache[regexCacheKey] = Tuple.Create((DType)null, false, false, false);
}
}
else
{
List<TypedName> propertyNames = new List<TypedName>();
bool fullMatchHidden = false, subMatchesHidden = false, startMatchHidden = false;

var regex = new Regex(regexPattern, regexDotNetOptions);
foreach (var captureName in regex.GetGroupNames())
{
if (int.TryParse(captureName, out _))
{
// Unnamed captures are returned as integers, ignoring them
continue;
}

List<TypedName> propertyNames = new List<TypedName>();
bool fullMatchHidden = false, subMatchesHidden = false, startMatchHidden = false;
if (captureName == ColumnName_FullMatch.Value)
{
fullMatchHidden = true;
}
else if (captureName == ColumnName_SubMatches.Value)
{
subMatchesHidden = true;
}
else if (captureName == ColumnName_StartMatch.Value)
{
startMatchHidden = true;
}

foreach (var captureName in regex.GetGroupNames())
{
if (int.TryParse(captureName, out _))
{
// Unnamed captures are returned as integers, ignoring them
continue;
propertyNames.Add(new TypedName(DType.String, DName.MakeValid(captureName, out _)));
}

if (captureName == ColumnName_FullMatch.Value)
if (!fullMatchHidden)
{
fullMatchHidden = true;
propertyNames.Add(new TypedName(DType.String, ColumnName_FullMatch));
}
else if (captureName == ColumnName_SubMatches.Value)

if (!subMatchesHidden && alteredOptions.Contains(MatchOptionCodes.NumberedSubMatches))
{
subMatchesHidden = true;
propertyNames.Add(new TypedName(DType.CreateTable(new TypedName(DType.String, ColumnName_Value)), ColumnName_SubMatches));
}
else if (captureName == ColumnName_StartMatch.Value)

if (!startMatchHidden)
{
startMatchHidden = true;
propertyNames.Add(new TypedName(DType.Number, ColumnName_StartMatch));
}

propertyNames.Add(new TypedName(DType.String, DName.MakeValid(captureName, out _)));
}

if (!fullMatchHidden)
{
propertyNames.Add(new TypedName(DType.String, ColumnName_FullMatch));
}

if (!subMatchesHidden && alteredOptions.Contains(MatchOptionCodes.NumberedSubMatches))
{
propertyNames.Add(new TypedName(DType.CreateTable(new TypedName(DType.String, ColumnName_Value)), ColumnName_SubMatches));
}

if (!startMatchHidden)
{
propertyNames.Add(new TypedName(DType.Number, ColumnName_StartMatch));
}

returnType = returnType.IsRecord
? DType.CreateRecord(propertyNames)
: DType.CreateTable(propertyNames);

AddWarnings(regExNode, errors, hidesFullMatch: fullMatchHidden, hidesSubMatches: subMatchesHidden, hidesStartMatch: startMatchHidden);

if (_regexTypeCache != null)
{
_regexTypeCache[prefixedRegexPattern] = Tuple.Create(returnType, fullMatchHidden, subMatchesHidden, startMatchHidden);
}
returnType = returnType.IsRecord
? DType.CreateRecord(propertyNames)
: DType.CreateTable(propertyNames);

AddWarnings(regExNode, errors, hidesFullMatch: fullMatchHidden, hidesSubMatches: subMatchesHidden, hidesStartMatch: startMatchHidden);

if (_regexTypeCache != null)
{
_regexTypeCache[regexCacheKey] = Tuple.Create(returnType, fullMatchHidden, subMatchesHidden, startMatchHidden);
}
}

return true;
}
catch (ArgumentException)
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
if (_regexTypeCache != null)
{
_regexTypeCache[prefixedRegexPattern] = null; // Cache to avoid evaluating again
_regexTypeCache[regexCacheKey] = null; // Cache to avoid evaluating again
}

return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ internal static Dictionary<TexlFunction, IAsyncTexlFunction> RegexFunctions(Time

return new Dictionary<TexlFunction, IAsyncTexlFunction>()
{
{ new IsMatchFunction(), new IsMatchImplementation(regexTimeout) },
{ new IsMatchFunction(regexCache), new IsMatchImplementation(regexTimeout) },
{ new MatchFunction(regexCache), new MatchImplementation(regexTimeout) },
{ new MatchAllFunction(regexCache), new MatchAllImplementation(regexTimeout) }
};
Expand Down
4 changes: 4 additions & 0 deletions src/strings/PowerFxResources.en-US.resx
Original file line number Diff line number Diff line change
Expand Up @@ -4687,6 +4687,10 @@
<value>Invalid regular expression: Number is too large, found "{0}".</value>
<comment>Error Message.</comment>
</data>
<data name="ErrorResource_ErrInvalidRegExV1Options_ShortMessage" xml:space="preserve">
<value>Invalid regular expression: MatchOptions.DotAll and MatchOptions.FreeSpacing are only available with Power Fx V1, found "{0}".</value>
<comment>{Locked=Power Fx V1}{Locked=MatchOptions.DotAll}{Locked=MatchOptions.FreeSpacing}Error Message.</comment>
</data>
<data name="ErrorResource_ErrVariableRegEx_ShortMessage" xml:space="preserve">
<value>Regular expression must be a constant value.</value>
<comment>Error Message.</comment>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#SETUP: RegEx,disable:PowerFxV1CompatibilityRules

>> IsMatch("Foo", "J(")
Error({Kind:ErrorKind.BadRegex})
Errors: Error 15-19: Invalid regular expression.|Error 0-7: The function 'IsMatch' has some invalid arguments.
Loading

0 comments on commit 22a21bb

Please sign in to comment.