Merge pull request #656 from Mbucari/master

Fix query parsing tags with underscores (#655)
This commit is contained in:
rmcrackan 2023-07-06 09:16:20 -04:00 committed by GitHub
commit 22a3dcbc1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 49 deletions

View File

@ -2,10 +2,11 @@
using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Analysis.Tokenattributes;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Text.RegularExpressions;
namespace LibationSearchEngine namespace LibationSearchEngine
{ {
internal static class QuerySanitizer internal static partial class QuerySanitizer
{ {
private static readonly HashSet<string> idTerms private static readonly HashSet<string> idTerms
= SearchEngine.FieldIndexRules.IdFieldNames = SearchEngine.FieldIndexRules.IdFieldNames
@ -23,11 +24,17 @@ namespace LibationSearchEngine
.Select(n => n.ToLowerInvariant()) .Select(n => n.ToLowerInvariant())
.ToHashSet(); .ToHashSet();
private static readonly Regex tagRegex = TagRegex();
internal static string Sanitize(string searchString, StandardAnalyzer analyzer) internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
{ {
if (string.IsNullOrWhiteSpace(searchString)) if (string.IsNullOrWhiteSpace(searchString))
return SearchEngine.ALL_QUERY; return SearchEngine.ALL_QUERY;
//Replace block tags with the proper tag query syntax
//eg: [foo] -> tags:foo
searchString = tagRegex.Replace(searchString, $"{SearchEngine.TAGS}:$1 ");
// range operator " TO " and bool operators " AND " and " OR " must be uppercase // range operator " TO " and bool operators " AND " and " OR " must be uppercase
searchString searchString
= searchString = searchString
@ -76,11 +83,6 @@ namespace LibationSearchEngine
addUnalteredToken(offset); addUnalteredToken(offset);
previousIsTags = false; previousIsTags = false;
} }
else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
{
//The term is a block tag. Add it to the part list
partList.Add($"{SearchEngine.TAGS}:{tagName}");
}
else if (double.TryParse(term, out var num)) else if (double.TryParse(term, out var num))
{ {
//Term is a number so pad it with zeros //Term is a number so pad it with zeros
@ -117,35 +119,7 @@ namespace LibationSearchEngine
partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset)); partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
} }
private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName) [GeneratedRegex(@"(?<!\\)\[\u0020*(\w+)\u0020*\]", RegexOptions.Compiled)]
{ private static partial Regex TagRegex();
tagName = null;
if (partList.Count == 0) return false;
var previous = partList[^1].TrimEnd();
//The opening '[' cannot be preceded by an escaping \
if (previous.Length == 0) return false;
if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
var next = searchString.Substring(offset.EndOffset);
if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;
tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
//Only legal tag characters are letters, numbers and underscores
//Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
foreach (var c in tagName)
{
if (!char.IsLetterOrDigit(c) && c != '_')
return false;
}
//Remove the leading '['
partList[^1] = previous[..^1];
//Ignore the trailing ']'
offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
return true;
}
} }
} }

View File

@ -39,6 +39,9 @@ namespace SearchEngineTests
[DataRow(" -[foo]", " -tags:foo ")] [DataRow(" -[foo]", " -tags:foo ")]
[DataRow("-[foo] ", "-tags:foo ")] [DataRow("-[foo] ", "-tags:foo ")]
[DataRow(" -[foo] ", " -tags:foo ")] [DataRow(" -[foo] ", " -tags:foo ")]
[DataRow("[foo_bar]", "tags:foo_bar ")]
[DataRow("-[foo_bar]", "-tags:foo_bar ")]
[DataRow("[foo_bar] [foo_bar2]", "tags:foo_bar tags:foo_bar2 ")]
// tag case irrelevant // tag case irrelevant
[DataRow("[FoO]", "tags:FoO ")] [DataRow("[FoO]", "tags:FoO ")]