diff --git a/Source/LibationSearchEngine/AsinAnalyzer.cs b/Source/LibationSearchEngine/AsinAnalyzer.cs deleted file mode 100644 index 0d60c42b..00000000 --- a/Source/LibationSearchEngine/AsinAnalyzer.cs +++ /dev/null @@ -1,81 +0,0 @@ -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Analysis; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace LibationSearchEngine -{ - internal class AsinAnalyzer : Analyzer - { - public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader) - { - return new AsinFilter(reader); - } - /// - /// Emits the entire input as a single token and removes - /// trailing .00 from strings that parsed to numbers - /// - /// Based on Lucene.Net.Analysis.KeywordTokenizer - /// - private class AsinFilter : Tokenizer - { - private bool done; - private int finalOffset; - private readonly ITermAttribute termAtt; - private readonly IOffsetAttribute offsetAtt; - private const int DEFAULT_BUFFER_SIZE = 256; - - public AsinFilter(System.IO.TextReader input) : base(input) - { - offsetAtt = AddAttribute(); - termAtt = AddAttribute(); - termAtt.ResizeTermBuffer(DEFAULT_BUFFER_SIZE); - } - public override bool IncrementToken() - { - var charReader = input as CharReader; - if (!done) - { - ClearAttributes(); - done = true; - int upto = 0; - char[] buffer = termAtt.TermBuffer(); - - while (true) - { - int length = charReader.Read(buffer, upto, buffer.Length - upto); - if (length == 0) - break; - upto += length; - if (upto == buffer.Length) - buffer = termAtt.ResizeTermBuffer(1 + buffer.Length); - } - - var termStr = new string(buffer, 0, upto); - if (termStr.EndsWith(".00")) - upto -= 3; - - termAtt.SetTermLength(upto); - finalOffset = CorrectOffset(upto); - offsetAtt.SetOffset(CorrectOffset(0), finalOffset); - return true; - } - return false; - } - public override void End() - { - // set final offset - offsetAtt.SetOffset(finalOffset, 
finalOffset); - } - - public override void Reset(System.IO.TextReader input) - { - base.Reset(input); - this.done = false; - } - } - } -} diff --git a/Source/LibationSearchEngine/LuceneRegex.cs b/Source/LibationSearchEngine/LuceneRegex.cs deleted file mode 100644 index 4033ee48..00000000 --- a/Source/LibationSearchEngine/LuceneRegex.cs +++ /dev/null @@ -1,103 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; - -namespace LibationSearchEngine -{ - internal static partial class LuceneRegex - { - #region pattern pieces - // negative lookbehind: cannot be preceeded by an escaping \ - const string NOT_ESCAPED = @"(? $@"\{c}").Aggregate((a, b) => a + b); - private static string WORD_CAPTURE { get; } = $@"([^\s{disallowedCharsEscaped}]+)"; - - // : with optional preceeding spaces. capture these so i don't accidentally replace a non-field name - const string FIELD_END = @"(\s*:)"; - - const string BEGIN_TAG = @"\["; - const string END_TAG = @"\]"; - - // space is forgiven at beginning and end of tag but not in the middle - // literal space character only. do NOT allow new lines, tabs, ... - const string OPTIONAL_SPACE_LITERAL = @"\u0020*"; - #endregion - - private static string tagPattern { get; } = NOT_ESCAPED + BEGIN_TAG + OPTIONAL_SPACE_LITERAL + WORD_CAPTURE + OPTIONAL_SPACE_LITERAL + END_TAG; - public static Regex TagRegex { get; } = new Regex(tagPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled); - - private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END; - public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled); - - /// - /// auto-pad numbers to 8 char.s. 
This will match int.s and dates (yyyyMMdd) - /// positive look behind: beginning space { [ : - /// positive look ahead: end space ] } - /// - - [GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled)] - public static partial Regex NumbersRegex(); - - /// - /// proper bools are single keywords which are turned into keyword:True - /// if bordered by colons or inside brackets, they are not stand-alone bool keywords - /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag: - /// [israted] - /// parseTag => tags:israted - /// replaceBools => tags:israted:True - /// or - /// [israted] - /// replaceBools => israted:True - /// parseTag => [israted:True] - /// also don't want to apply :True where the value already exists: - /// israted:false => israted:false:True - /// - /// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture - /// - private static string boolPattern_parameterized { get; } - = @" -### IMPORTANT: 'ignore whitespace' is only partially honored in character sets -### - new lines are ok -### - ANY leading whitespace is treated like actual matching spaces :( - - ### can't begin with colon. incorrect syntax - ### can't begin with open bracket: this signals the start of a tag -(? 
boolRegexDic { get; } = new Dictionary(); - public static Regex GetBoolRegex(string boolSearch) - { - if (boolRegexDic.TryGetValue(boolSearch, out var regex)) - return regex; - - var boolPattern = string.Format(boolPattern_parameterized, boolSearch); - regex = new Regex(boolPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled); - boolRegexDic.Add(boolSearch, regex); - - return regex; - } - } -}
diff --git a/Source/LibationSearchEngine/QuerySanitizer.cs b/Source/LibationSearchEngine/QuerySanitizer.cs
new file mode 100644
index 00000000..bd4acef0
--- /dev/null
+++ b/Source/LibationSearchEngine/QuerySanitizer.cs
@@ -0,0 +1,196 @@
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+
+namespace LibationSearchEngine
+{
+	/// <summary>
+	/// Rewrites a user-entered search string into the syntax the Lucene query parser expects:
+	/// upper-case operators, lower-case field names, zero-padded numbers,
+	/// "[tag]" blocks expanded to "tags:tag" and bare bool fields expanded to "field:True".
+	/// </summary>
+	internal static class QuerySanitizer
+	{
+		//Lower-cased names of the ID search fields
+		private static readonly HashSet<string> idTerms
+			= SearchEngine.idIndexRules.Keys
+			.Select(s => s.ToLowerInvariant())
+			.ToHashSet();
+
+		//Lower-cased names of the bool search fields
+		private static readonly HashSet<string> boolTerms
+			= SearchEngine.boolIndexRules.Keys
+			.Select(s => s.ToLowerInvariant())
+			.ToHashSet();
+
+		//Lower-cased names of every search field: string, number, ID and bool
+		private static readonly HashSet<string> fieldTerms
+			= SearchEngine.stringIndexRules.Keys
+			.Union(SearchEngine.numberIndexRules.Keys)
+			.Select(s => s.ToLowerInvariant())
+			.Union(idTerms)
+			.Union(boolTerms)
+			.ToHashSet();
+
+		/// <summary>
+		/// Tokenizes <paramref name="searchString"/> with <paramref name="analyzer"/> and rebuilds it,
+		/// rewriting tokens where needed and copying the text between tokens through (minus the brackets of block tags).
+		/// </summary>
+		/// <param name="searchString">The search text as entered by the user.</param>
+		/// <param name="analyzer">Analyzer used to split the search text into tokens.</param>
+		/// <returns>
+		/// The rewritten query, or <see cref="SearchEngine.ALL_QUERY"/> when
+		/// <paramref name="searchString"/> is null, empty or whitespace.
+		/// </returns>
+		internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
+		{
+			if (string.IsNullOrWhiteSpace(searchString))
+				return SearchEngine.ALL_QUERY;
+
+			// range operator " TO " and bool operators " AND " and " OR " must be uppercase
+			searchString
+				= searchString
+				.Replace(" to ", " TO ", System.StringComparison.OrdinalIgnoreCase)
+				.Replace(" and ", " AND ", System.StringComparison.OrdinalIgnoreCase)
+				.Replace(" or ", " OR ", System.StringComparison.OrdinalIgnoreCase);
+
+			using var tokenStream = analyzer.TokenStream(SearchEngine.ALL, new System.IO.StringReader(searchString));
+
+			var partList = new List<string>();
+			int previousEndOffset = 0;
+			bool previousIsBool = false, previousIsTags = false, previousIsAsin = false;
+
+			while (tokenStream.IncrementToken())
+			{
+				var term = tokenStream.GetAttribute<ITermAttribute>().Term;
+				var offset = tokenStream.GetAttribute<IOffsetAttribute>();
+
+				if (previousIsBool && !bool.TryParse(term, out _))
+				{
+					//The previous term was a boolean tag and this term is NOT a bool value
+					//Add the default ":True" bool and continue parsing the current term
+					partList.Add(":True");
+					previousIsBool = false;
+				}
+
+				//Add all text between the current token and the previous token
+				partList.Add(searchString.Substring(previousEndOffset, offset.StartOffset - previousEndOffset));
+
+				if (previousIsBool)
+				{
+					//The previous term was a boolean tag and this term is a bool value
+					addUnalteredToken(offset);
+					previousIsBool = false;
+				}
+				else if (previousIsAsin)
+				{
+					//The previous term was an ASIN field ID, so this term is an ASIN
+					partList.Add(term);
+					previousIsAsin = false;
+				}
+				else if (previousIsTags)
+				{
+					//This term is a tag. Do this check before checking if term is a defined field
+					//so that "tags:israted" does not parse as a bool
+					addUnalteredToken(offset);
+					previousIsTags = false;
+				}
+				else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
+				{
+					//The term is a block tag. add it to the part list
+					partList.Add($"{SearchEngine.TAGS}:{tagName}");
+				}
+				else if (tryParseNumber(term, out var num))
+				{
+					//Term is a number so pad it with zeros
+					partList.Add(num.ToLuceneString());
+				}
+				else if (fieldTerms.Contains(term))
+				{
+					//Term is a defined search field, add it.
+					//The StandardAnalyzer already converts all terms to lowercase
+					partList.Add(term);
+					previousIsBool = boolTerms.Contains(term);
+					previousIsAsin = idTerms.Contains(term);
+					previousIsTags = term == SearchEngine.TAGS;
+				}
+				else
+				{
+					//Term is any other user-defined constant value
+					addUnalteredToken(offset);
+				}
+
+				previousEndOffset = offset.EndOffset;
+			}
+
+			if (previousIsBool)
+				partList.Add(":True");
+
+			//Add ending non-token text
+			partList.Add(searchString.Substring(previousEndOffset, searchString.Length - previousEndOffset));
+
+			return string.Concat(partList);
+
+			//Add the full token exactly as it appears in the search string
+			void addUnalteredToken(IOffsetAttribute offset) =>
+				partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
+		}
+
+		/// <summary>
+		/// Parses <paramref name="term"/> as a finite number using invariant-culture rules, so the result
+		/// does not depend on the machine's regional settings and words such as "NaN" or "Infinity"
+		/// are not mistaken for numbers.
+		/// </summary>
+		/// <param name="term">The token text to parse.</param>
+		/// <param name="number">The parsed value; only meaningful when the method returns true.</param>
+		/// <returns>True if <paramref name="term"/> is a finite number; otherwise false.</returns>
+		private static bool tryParseNumber(string term, out double number)
+			=> double.TryParse(term, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out number)
+			&& double.IsFinite(number);
+
+		/// <summary>
+		/// Determines whether the current token is a block tag: a tag name wrapped in unescaped
+		/// square brackets, e.g. "[foo]" or "[ foo ]".
+		/// </summary>
+		/// <param name="offset">Offsets of the current token. On success its end offset is moved past the closing ']' so the caller skips it.</param>
+		/// <param name="partList">Parts built so far. On success the opening '[' is removed from the last part.</param>
+		/// <param name="searchString">The search string being tokenized.</param>
+		/// <param name="tagName">The tag name on success; otherwise null.</param>
+		/// <returns>True if the token is a block tag; otherwise false.</returns>
+		private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
+		{
+			tagName = null;
+			if (partList.Count == 0) return false;
+
+			var previous = partList[^1].TrimEnd();
+
+			//cannot be preceded by an escaping \
+			if (previous.Length == 0) return false;
+			if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
+
+			var next = searchString.Substring(offset.EndOffset);
+			if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;
+
+			var candidate = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
+
+			//Only legal tag characters are letters, numbers and underscores
+			//Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
+			foreach (var c in candidate)
+			{
+				if (!char.IsLetterOrDigit(c) && c != '_')
+					return false;
+			}
+
+			//Only publish the tag name once it has been validated
+			tagName = candidate;
+
+			//Remove the leading '['
+			partList[^1] = previous[..^1];
+			//Ignore the trailing ']'
+			offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
+			return true;
+		}
+	}
+}
diff --git a/Source/LibationSearchEngine/SearchEngine.cs b/Source/LibationSearchEngine/SearchEngine.cs index 83bde6e4..83d7d588
100644 --- a/Source/LibationSearchEngine/SearchEngine.cs +++ b/Source/LibationSearchEngine/SearchEngine.cs @@ -6,8 +6,8 @@ using System.Text.RegularExpressions; using DataLayer; using Dinah.Core; using LibationFileManager; -using Lucene.Net.Analysis; using Lucene.Net.Analysis.Standard; +using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Documents; using Lucene.Net.Index; using Lucene.Net.Search; @@ -32,18 +32,18 @@ namespace LibationSearchEngine public const string ALL_NARRATOR_NAMES = "NarratorNames"; public const string ALL_SERIES_NAMES = "SeriesNames"; - private static ReadOnlyDictionary> idIndexRules { get; } + internal static ReadOnlyDictionary> idIndexRules { get; } = new ReadOnlyDictionary>( new Dictionary> { - [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId, - ["ProductId"] = lb => lb.Book.AudibleProductId, - ["Id"] = lb => lb.Book.AudibleProductId, - ["ASIN"] = lb => lb.Book.AudibleProductId - } + [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId.ToLowerInvariant(), + ["ProductId"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(), + ["Id"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(), + ["ASIN"] = lb => lb.Book.AudibleProductId.ToLowerInvariant() + } ); - private static ReadOnlyDictionary> stringIndexRules { get; } + internal static ReadOnlyDictionary> stringIndexRules { get; } = new ReadOnlyDictionary>( new Dictionary> { @@ -75,7 +75,7 @@ namespace LibationSearchEngine } ); - private static ReadOnlyDictionary> numberIndexRules { get; } + internal static ReadOnlyDictionary> numberIndexRules { get; } = new ReadOnlyDictionary>( new Dictionary> { @@ -99,7 +99,7 @@ namespace LibationSearchEngine } ); - private static ReadOnlyDictionary> boolIndexRules { get; } + internal static ReadOnlyDictionary> boolIndexRules { get; } = new ReadOnlyDictionary>( new Dictionary> { @@ -353,112 +353,27 @@ namespace LibationSearchEngine #region search public SearchResultSet Search(string searchString) - { - 
Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString }); - searchString = FormatSearchQuery(searchString); + { + using var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30); + + Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString }); + searchString = QuerySanitizer.Sanitize(searchString, analyzer); Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString }); - var results = generalSearch(searchString); + var results = generalSearch(searchString, analyzer); Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() }); displayResults(results); return results; } - internal static string FormatSearchQuery(string searchString) - { - if (string.IsNullOrWhiteSpace(searchString)) - return ALL_QUERY; - - searchString = replaceBools(searchString); - - searchString = parseTag(searchString); - - // in ranges " TO " must be uppercase - searchString = searchString.Replace(" to ", " TO "); - - searchString = padNumbers(searchString); - - searchString = lowerFieldNames(searchString); - - return searchString; - } - - #region format query string - private static string parseTag(string tagSearchString) - { - var allMatches = LuceneRegex - .TagRegex - .Matches(tagSearchString) - .Cast() - .Select(a => a.ToString()) - .ToList(); - foreach (var match in allMatches) - tagSearchString = tagSearchString.Replace( - match, - TAGS + ":" + match.Trim('[', ']').Trim() - ); - - return tagSearchString; - } - - private static string replaceBools(string searchString) - { - foreach (var boolSearch in boolIndexRules.Keys) - searchString = - LuceneRegex.GetBoolRegex(boolSearch) - .Replace(searchString, @"$1:True"); - - return searchString; - } - - private static string padNumbers(string searchString) - { - var matches = LuceneRegex - .NumbersRegex() - .Matches(searchString) - .Cast() - .OrderByDescending(m => m.Index); - - foreach (var m in matches) - { - var 
replaceString = double.Parse(m.ToString()).ToLuceneString(); - searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index); - } - - return searchString; - } - - private static string lowerFieldNames(string searchString) - { - // fields are case specific - var allMatches = LuceneRegex - .FieldRegex - .Matches(searchString) - .Cast() - .Select(a => a.ToString()) - .ToList(); - - foreach (var match in allMatches) - searchString = searchString.Replace(match, match.ToLowerInvariant()); - - return searchString; - } - #endregion - - private SearchResultSet generalSearch(string searchString) + private SearchResultSet generalSearch(string searchString, StandardAnalyzer analyzer) { var defaultField = ALL; using var index = getIndex(); using var searcher = new IndexSearcher(index); - using var analyzer = new StandardAnalyzer(Version); - using var asinAnalyzer = new AsinAnalyzer(); - - var dic = idIndexRules.Keys.Select(k => new KeyValuePair(k.ToLowerInvariant(), asinAnalyzer)); - using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic); - - var query = perFieldAnalyzer.GetQuery(defaultField, searchString); + var query = analyzer.GetQuery(defaultField, searchString); // lucene doesn't allow only negations. 
eg this returns nothing: // -tags:hidden diff --git a/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs b/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs index 987f0f66..15be3d14 100644 --- a/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs +++ b/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs @@ -10,6 +10,7 @@ using Dinah.Core; using FluentAssertions; using FluentAssertions.Common; using LibationSearchEngine; +using Lucene.Net.Analysis.Standard; using Microsoft.VisualStudio.TestPlatform.Common.Filtering; using Microsoft.VisualStudio.TestTools.UnitTesting; using Moq; @@ -31,6 +32,7 @@ namespace SearchEngineTests // tag surrounded by spaces [DataRow("[foo]", "tags:foo")] [DataRow(" [foo]", " tags:foo")] + [DataRow(" [ foo ]", " tags:foo")] [DataRow("[foo] ", "tags:foo ")] [DataRow(" [foo] ", " tags:foo ")] [DataRow("-[foo]", "-tags:foo")] @@ -57,9 +59,12 @@ namespace SearchEngineTests // bool keyword with [:bool]. Do not add :True [DataRow("israted:True", "israted:True")] [DataRow("isRated:false", "israted:false")] + [DataRow("liberated AND isRated:false", "liberated:True AND israted:false")] // tag which happens to be a bool keyword >> parse as tag [DataRow("[israted]", "tags:israted")] + [DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")] + [DataRow("[tags][israted]", "tags:tagstags:israted")] // numbers with "to". TO all caps, numbers [8.2] format [DataRow("1 to 10", "00000001.00 TO 00000010.00")] @@ -72,6 +77,10 @@ namespace SearchEngineTests [DataRow("-isRATED", "-israted:True")] public void FormattingTest(string input, string output) - => SearchEngine.FormatSearchQuery(input).Should().Be(output); + { + using var analyzer = new StandardAnalyzer(SearchEngine.Version); + + QuerySanitizer.Sanitize(input, analyzer).Should().Be(output); + } } }