From ec9d11cf52d1dd9a867c3c62d641148f388605e6 Mon Sep 17 00:00:00 2001 From: Mbucari Date: Wed, 5 Jul 2023 15:37:53 -0600 Subject: [PATCH] Fix query parsing tags with underscores (#655) --- Source/LibationSearchEngine/QuerySanitizer.cs | 46 ++++--------------- .../SearchEngineTests.cs | 29 ++++++------ 2 files changed, 26 insertions(+), 49 deletions(-) diff --git a/Source/LibationSearchEngine/QuerySanitizer.cs b/Source/LibationSearchEngine/QuerySanitizer.cs index f6020528..fab57e8f 100644 --- a/Source/LibationSearchEngine/QuerySanitizer.cs +++ b/Source/LibationSearchEngine/QuerySanitizer.cs @@ -2,10 +2,11 @@ using Lucene.Net.Analysis.Tokenattributes; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; namespace LibationSearchEngine { - internal static class QuerySanitizer + internal static partial class QuerySanitizer { private static readonly HashSet idTerms = SearchEngine.FieldIndexRules.IdFieldNames @@ -23,11 +24,17 @@ namespace LibationSearchEngine .Select(n => n.ToLowerInvariant()) .ToHashSet(); + private static readonly Regex tagRegex = TagRegex(); + internal static string Sanitize(string searchString, StandardAnalyzer analyzer) { if (string.IsNullOrWhiteSpace(searchString)) return SearchEngine.ALL_QUERY; + //Replace a block tags with tags with proper tag query syntax + //eg: [foo] -> tags:foo + searchString = tagRegex.Replace(searchString, $"{SearchEngine.TAGS}:$1 "); + // range operator " TO " and bool operators " AND " and " OR " must be uppercase searchString = searchString @@ -76,11 +83,6 @@ namespace LibationSearchEngine addUnalteredToken(offset); previousIsTags = false; } - else if (tryParseBlockTag(offset, partList, searchString, out var tagName)) - { - //The term is a block tag. add it to the part list - partList.Add($"{SearchEngine.TAGS}:{tagName}"); - } else if (double.TryParse(term, out var num)) { //Term is a number so pad it with zeros @@ -117,35 +119,7 @@ namespace LibationSearchEngine partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset)); } - private static bool tryParseBlockTag(IOffsetAttribute offset, List partList, string searchString, out string tagName) - { - tagName = null; - if (partList.Count == 0) return false; - - var previous = partList[^1].TrimEnd(); - - //cannot be preceeded by an escaping \ - if (previous.Length == 0) return false; - if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false; - - var next = searchString.Substring(offset.EndOffset); - if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false; - - tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset); - - //Only legal tag characters are letters, numbers and underscores - //Per DataLayer.UserDefinedItem.IllegalCharacterRegex() - foreach (var c in tagName) - { - if (!char.IsLetterOrDigit(c) && c != '_') - return false; - } - - //Remove the leading '[' - partList[^1] = previous[..^1]; - //Ignore the trailing ']' - offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1); - return true; - } + [GeneratedRegex(@"(?> parse as tag - [DataRow("[israted]", "tags:israted")] - [DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")] - [DataRow("[tags][israted]", "tags:tagstags:israted")] + [DataRow("[israted]", "tags:israted ")] + [DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")] + [DataRow("[tags][israted]", "tags:tags tags:israted ")] // numbers with "to". TO all caps, numbers [8.2] format [DataRow("1 to 10", "00000001.00 TO 00000010.00")]