Fix query parsing tags with underscores (#655)
This commit is contained in:
parent
fbc29dfb0a
commit
ec9d11cf52
@ -2,10 +2,11 @@
|
|||||||
using Lucene.Net.Analysis.Tokenattributes;
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
namespace LibationSearchEngine
|
namespace LibationSearchEngine
|
||||||
{
|
{
|
||||||
internal static class QuerySanitizer
|
internal static partial class QuerySanitizer
|
||||||
{
|
{
|
||||||
private static readonly HashSet<string> idTerms
|
private static readonly HashSet<string> idTerms
|
||||||
= SearchEngine.FieldIndexRules.IdFieldNames
|
= SearchEngine.FieldIndexRules.IdFieldNames
|
||||||
@ -23,11 +24,17 @@ namespace LibationSearchEngine
|
|||||||
.Select(n => n.ToLowerInvariant())
|
.Select(n => n.ToLowerInvariant())
|
||||||
.ToHashSet();
|
.ToHashSet();
|
||||||
|
|
||||||
|
private static readonly Regex tagRegex = TagRegex();
|
||||||
|
|
||||||
internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
|
internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(searchString))
|
if (string.IsNullOrWhiteSpace(searchString))
|
||||||
return SearchEngine.ALL_QUERY;
|
return SearchEngine.ALL_QUERY;
|
||||||
|
|
||||||
|
//Replace block tags with the proper tag query syntax
|
||||||
|
//e.g. [foo] -> tags:foo (followed by a trailing space, so adjacent tags like [a][b] stay separated)
|
||||||
|
searchString = tagRegex.Replace(searchString, $"{SearchEngine.TAGS}:$1 ");
|
||||||
|
|
||||||
// range operator " TO " and bool operators " AND " and " OR " must be uppercase
|
// range operator " TO " and bool operators " AND " and " OR " must be uppercase
|
||||||
searchString
|
searchString
|
||||||
= searchString
|
= searchString
|
||||||
@ -76,11 +83,6 @@ namespace LibationSearchEngine
|
|||||||
addUnalteredToken(offset);
|
addUnalteredToken(offset);
|
||||||
previousIsTags = false;
|
previousIsTags = false;
|
||||||
}
|
}
|
||||||
else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
|
|
||||||
{
|
|
||||||
//The term is a block tag. add it to the part list
|
|
||||||
partList.Add($"{SearchEngine.TAGS}:{tagName}");
|
|
||||||
}
|
|
||||||
else if (double.TryParse(term, out var num))
|
else if (double.TryParse(term, out var num))
|
||||||
{
|
{
|
||||||
//Term is a number so pad it with zeros
|
//Term is a number so pad it with zeros
|
||||||
@ -117,35 +119,7 @@ namespace LibationSearchEngine
|
|||||||
partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
|
partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
|
[GeneratedRegex(@"(?<!\\)\[\u0020*(\w+)\u0020*\]", RegexOptions.Compiled)]
|
||||||
{
|
private static partial Regex TagRegex();
|
||||||
tagName = null;
|
|
||||||
if (partList.Count == 0) return false;
|
|
||||||
|
|
||||||
var previous = partList[^1].TrimEnd();
|
|
||||||
|
|
||||||
//cannot be preceded by an escaping \
|
|
||||||
if (previous.Length == 0) return false;
|
|
||||||
if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
|
|
||||||
|
|
||||||
var next = searchString.Substring(offset.EndOffset);
|
|
||||||
if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;
|
|
||||||
|
|
||||||
tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
|
|
||||||
|
|
||||||
//Only legal tag characters are letters, numbers and underscores
|
|
||||||
//Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
|
|
||||||
foreach (var c in tagName)
|
|
||||||
{
|
|
||||||
if (!char.IsLetterOrDigit(c) && c != '_')
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
//Remove the leading '['
|
|
||||||
partList[^1] = previous[..^1];
|
|
||||||
//Ignore the trailing ']'
|
|
||||||
offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -30,18 +30,21 @@ namespace SearchEngineTests
|
|||||||
[DataRow(" ", "*:*")]
|
[DataRow(" ", "*:*")]
|
||||||
|
|
||||||
// tag surrounded by spaces
|
// tag surrounded by spaces
|
||||||
[DataRow("[foo]", "tags:foo")]
|
[DataRow("[foo]", "tags:foo ")]
|
||||||
[DataRow(" [foo]", " tags:foo")]
|
[DataRow(" [foo]", " tags:foo ")]
|
||||||
[DataRow(" [ foo ]", " tags:foo")]
|
[DataRow(" [ foo ]", " tags:foo ")]
|
||||||
[DataRow("[foo] ", "tags:foo ")]
|
[DataRow("[foo] ", "tags:foo ")]
|
||||||
[DataRow(" [foo] ", " tags:foo ")]
|
[DataRow(" [foo] ", " tags:foo ")]
|
||||||
[DataRow("-[foo]", "-tags:foo")]
|
[DataRow("-[foo]", "-tags:foo ")]
|
||||||
[DataRow(" -[foo]", " -tags:foo")]
|
[DataRow(" -[foo]", " -tags:foo ")]
|
||||||
[DataRow("-[foo] ", "-tags:foo ")]
|
[DataRow("-[foo] ", "-tags:foo ")]
|
||||||
[DataRow(" -[foo] ", " -tags:foo ")]
|
[DataRow(" -[foo] ", " -tags:foo ")]
|
||||||
|
[DataRow("[foo_bar]", "tags:foo_bar ")]
|
||||||
|
[DataRow("-[foo_bar]", "-tags:foo_bar ")]
|
||||||
|
[DataRow("[foo_bar] [foo_bar2]", "tags:foo_bar tags:foo_bar2 ")]
|
||||||
|
|
||||||
// tag case irrelevant
|
// tag case irrelevant
|
||||||
[DataRow("[FoO]", "tags:FoO")]
|
[DataRow("[FoO]", "tags:FoO ")]
|
||||||
|
|
||||||
// bool keyword surrounded by spaces
|
// bool keyword surrounded by spaces
|
||||||
[DataRow("israted", "israted:True")]
|
[DataRow("israted", "israted:True")]
|
||||||
@ -69,9 +72,9 @@ namespace SearchEngineTests
|
|||||||
[DataRow("liberated AND isRated:false", "liberated:True AND israted:false")]
|
[DataRow("liberated AND isRated:false", "liberated:True AND israted:false")]
|
||||||
|
|
||||||
// tag which happens to be a bool keyword >> parse as tag
|
// tag which happens to be a bool keyword >> parse as tag
|
||||||
[DataRow("[israted]", "tags:israted")]
|
[DataRow("[israted]", "tags:israted ")]
|
||||||
[DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")]
|
[DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")]
|
||||||
[DataRow("[tags][israted]", "tags:tagstags:israted")]
|
[DataRow("[tags][israted]", "tags:tags tags:israted ")]
|
||||||
|
|
||||||
// numbers with "to". TO all caps, numbers [8.2] format
|
// numbers with "to". TO all caps, numbers [8.2] format
|
||||||
[DataRow("1 to 10", "00000001.00 TO 00000010.00")]
|
[DataRow("1 to 10", "00000001.00 TO 00000010.00")]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user