diff --git a/Source/LibationSearchEngine/AsinAnalyzer.cs b/Source/LibationSearchEngine/AsinAnalyzer.cs
deleted file mode 100644
index 0d60c42b..00000000
--- a/Source/LibationSearchEngine/AsinAnalyzer.cs
+++ /dev/null
@@ -1,81 +0,0 @@
-using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Analysis;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-
-namespace LibationSearchEngine
-{
- internal class AsinAnalyzer : Analyzer
- {
- public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
- {
- return new AsinFilter(reader);
- }
- ///
- /// Emits the entire input as a single token and removes
- /// trailing .00 from strings that parsed to numbers
- ///
- /// Based on Lucene.Net.Analysis.KeywordTokenizer
- ///
- private class AsinFilter : Tokenizer
- {
- private bool done;
- private int finalOffset;
- private readonly ITermAttribute termAtt;
- private readonly IOffsetAttribute offsetAtt;
- private const int DEFAULT_BUFFER_SIZE = 256;
-
- public AsinFilter(System.IO.TextReader input) : base(input)
- {
- offsetAtt = AddAttribute();
- termAtt = AddAttribute();
- termAtt.ResizeTermBuffer(DEFAULT_BUFFER_SIZE);
- }
- public override bool IncrementToken()
- {
- var charReader = input as CharReader;
- if (!done)
- {
- ClearAttributes();
- done = true;
- int upto = 0;
- char[] buffer = termAtt.TermBuffer();
-
- while (true)
- {
- int length = charReader.Read(buffer, upto, buffer.Length - upto);
- if (length == 0)
- break;
- upto += length;
- if (upto == buffer.Length)
- buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
- }
-
- var termStr = new string(buffer, 0, upto);
- if (termStr.EndsWith(".00"))
- upto -= 3;
-
- termAtt.SetTermLength(upto);
- finalOffset = CorrectOffset(upto);
- offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
- return true;
- }
- return false;
- }
- public override void End()
- {
- // set final offset
- offsetAtt.SetOffset(finalOffset, finalOffset);
- }
-
- public override void Reset(System.IO.TextReader input)
- {
- base.Reset(input);
- this.done = false;
- }
- }
- }
-}
diff --git a/Source/LibationSearchEngine/LuceneRegex.cs b/Source/LibationSearchEngine/LuceneRegex.cs
deleted file mode 100644
index 4033ee48..00000000
--- a/Source/LibationSearchEngine/LuceneRegex.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text.RegularExpressions;
-
-namespace LibationSearchEngine
-{
- internal static partial class LuceneRegex
- {
- #region pattern pieces
- // negative lookbehind: cannot be preceeded by an escaping \
- const string NOT_ESCAPED = @"(? $@"\{c}").Aggregate((a, b) => a + b);
- private static string WORD_CAPTURE { get; } = $@"([^\s{disallowedCharsEscaped}]+)";
-
- // : with optional preceeding spaces. capture these so i don't accidentally replace a non-field name
- const string FIELD_END = @"(\s*:)";
-
- const string BEGIN_TAG = @"\[";
- const string END_TAG = @"\]";
-
- // space is forgiven at beginning and end of tag but not in the middle
- // literal space character only. do NOT allow new lines, tabs, ...
- const string OPTIONAL_SPACE_LITERAL = @"\u0020*";
- #endregion
-
- private static string tagPattern { get; } = NOT_ESCAPED + BEGIN_TAG + OPTIONAL_SPACE_LITERAL + WORD_CAPTURE + OPTIONAL_SPACE_LITERAL + END_TAG;
- public static Regex TagRegex { get; } = new Regex(tagPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
-
- private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END;
- public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
-
- ///
- /// auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd)
- /// positive look behind: beginning space { [ :
- /// positive look ahead: end space ] }
- ///
-
- [GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled)]
- public static partial Regex NumbersRegex();
-
- ///
- /// proper bools are single keywords which are turned into keyword:True
- /// if bordered by colons or inside brackets, they are not stand-alone bool keywords
- /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
- /// [israted]
- /// parseTag => tags:israted
- /// replaceBools => tags:israted:True
- /// or
- /// [israted]
- /// replaceBools => israted:True
- /// parseTag => [israted:True]
- /// also don't want to apply :True where the value already exists:
- /// israted:false => israted:false:True
- ///
- /// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture
- ///
- private static string boolPattern_parameterized { get; }
- = @"
-### IMPORTANT: 'ignore whitespace' is only partially honored in character sets
-### - new lines are ok
-### - ANY leading whitespace is treated like actual matching spaces :(
-
- ### can't begin with colon. incorrect syntax
- ### can't begin with open bracket: this signals the start of a tag
-(? boolRegexDic { get; } = new Dictionary();
- public static Regex GetBoolRegex(string boolSearch)
- {
- if (boolRegexDic.TryGetValue(boolSearch, out var regex))
- return regex;
-
- var boolPattern = string.Format(boolPattern_parameterized, boolSearch);
- regex = new Regex(boolPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
- boolRegexDic.Add(boolSearch, regex);
-
- return regex;
- }
- }
-}
diff --git a/Source/LibationSearchEngine/QuerySanitizer.cs b/Source/LibationSearchEngine/QuerySanitizer.cs
new file mode 100644
index 00000000..bd4acef0
--- /dev/null
+++ b/Source/LibationSearchEngine/QuerySanitizer.cs
@@ -0,0 +1,153 @@
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace LibationSearchEngine
+{
+ internal static class QuerySanitizer
+ {
+ private static readonly HashSet<string> idTerms
+ = SearchEngine.idIndexRules.Keys
+ .Select(s => s.ToLowerInvariant())
+ .ToHashSet();
+
+ private static readonly HashSet<string> boolTerms
+ = SearchEngine.boolIndexRules.Keys
+ .Select(s => s.ToLowerInvariant())
+ .ToHashSet();
+
+ private static readonly HashSet<string> fieldTerms
+ = SearchEngine.stringIndexRules.Keys
+ .Union(SearchEngine.numberIndexRules.Keys)
+ .Select(s => s.ToLowerInvariant())
+ .Union(idTerms)
+ .Union(boolTerms)
+ .ToHashSet();
+
+ internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
+ {
+ if (string.IsNullOrWhiteSpace(searchString))
+ return SearchEngine.ALL_QUERY;
+
+ // range operator " TO " and bool operators " AND " and " OR " must be uppercase
+ searchString
+ = searchString
+ .Replace(" to ", " TO ", System.StringComparison.OrdinalIgnoreCase)
+ .Replace(" and ", " AND ", System.StringComparison.OrdinalIgnoreCase)
+ .Replace(" or ", " OR ", System.StringComparison.OrdinalIgnoreCase);
+
+ using var tokenStream = analyzer.TokenStream(SearchEngine.ALL, new System.IO.StringReader(searchString));
+
+ var partList = new List<string>();
+ int previousEndOffset = 0;
+ bool previousIsBool = false, previousIsTags = false, previousIsAsin = false;
+
+ while (tokenStream.IncrementToken())
+ {
+ var term = tokenStream.GetAttribute<ITermAttribute>().Term;
+ var offset = tokenStream.GetAttribute<IOffsetAttribute>();
+
+ if (previousIsBool && !bool.TryParse(term, out _))
+ {
+ //The previous term was a boolean tag and this term is NOT a bool value
+ //Add the default ":True" bool and continue parsing the current term
+ partList.Add(":True");
+ previousIsBool = false;
+ }
+
+ //Add all text between the current token and the previous token
+ partList.Add(searchString.Substring(previousEndOffset, offset.StartOffset - previousEndOffset));
+
+ if (previousIsBool)
+ {
+ //The previous term was a boolean tag and this term is a bool value
+ addUnalteredToken(offset);
+ previousIsBool = false;
+ }
+ else if (previousIsAsin)
+ {
+ //The previous term was an ASIN field ID, so this term is an ASIN
+ partList.Add(term);
+ previousIsAsin = false;
+ }
+ else if (previousIsTags)
+ {
+ //This term is a tag. Do this check before checking if term is a defined field
+ //so that "tags:israted" does not parse as a bool
+ addUnalteredToken(offset);
+ previousIsTags = false;
+ }
+ else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
+ {
+ //The term is a block tag. add it to the part list
+ partList.Add($"{SearchEngine.TAGS}:{tagName}");
+ }
+ else if (double.TryParse(term, out var num))
+ {
+ //Term is a number so pad it with zeros
+ partList.Add(num.ToLuceneString());
+ }
+ else if (fieldTerms.Contains(term))
+ {
+ //Term is a defined search field, add it.
+ //The StandardAnalyzer already converts all terms to lowercase
+ partList.Add(term);
+ previousIsBool = boolTerms.Contains(term);
+ previousIsAsin = idTerms.Contains(term);
+ previousIsTags = term == SearchEngine.TAGS;
+ }
+ else
+ {
+ //Term is any other user-defined constant value
+ addUnalteredToken(offset);
+ }
+
+ previousEndOffset = offset.EndOffset;
+ }
+
+ if (previousIsBool)
+ partList.Add(":True");
+
+ //Add ending non-token text
+ partList.Add(searchString.Substring(previousEndOffset, searchString.Length - previousEndOffset));
+
+ return string.Concat(partList);
+
+ //Add the full, unaltered token as well as all inter-token text
+ void addUnalteredToken(IOffsetAttribute offset) =>
+ partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
+ }
+
+ private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
+ {
+ tagName = null;
+ if (partList.Count == 0) return false;
+
+ var previous = partList[^1].TrimEnd();
+
+ //cannot be preceded by an escaping \
+ if (previous.Length == 0) return false;
+ if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
+
+ var next = searchString.Substring(offset.EndOffset);
+ if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;
+
+ tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
+
+ //Only legal tag characters are letters, numbers and underscores
+ //Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
+ foreach (var c in tagName)
+ {
+ if (!char.IsLetterOrDigit(c) && c != '_')
+ return false;
+ }
+
+ //Remove the leading '['
+ partList[^1] = previous[..^1];
+ //Ignore the trailing ']'
+ offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
+ return true;
+ }
+ }
+}
diff --git a/Source/LibationSearchEngine/SearchEngine.cs b/Source/LibationSearchEngine/SearchEngine.cs
index 83bde6e4..83d7d588 100644
--- a/Source/LibationSearchEngine/SearchEngine.cs
+++ b/Source/LibationSearchEngine/SearchEngine.cs
@@ -6,8 +6,8 @@ using System.Text.RegularExpressions;
using DataLayer;
using Dinah.Core;
using LibationFileManager;
-using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
@@ -32,18 +32,18 @@ namespace LibationSearchEngine
public const string ALL_NARRATOR_NAMES = "NarratorNames";
public const string ALL_SERIES_NAMES = "SeriesNames";
- private static ReadOnlyDictionary> idIndexRules { get; }
+ internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> idIndexRules { get; }
= new ReadOnlyDictionary>(
new Dictionary>
{
- [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId,
- ["ProductId"] = lb => lb.Book.AudibleProductId,
- ["Id"] = lb => lb.Book.AudibleProductId,
- ["ASIN"] = lb => lb.Book.AudibleProductId
- }
+ [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
+ ["ProductId"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
+ ["Id"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
+ ["ASIN"] = lb => lb.Book.AudibleProductId.ToLowerInvariant()
+ }
);
- private static ReadOnlyDictionary> stringIndexRules { get; }
+ internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> stringIndexRules { get; }
= new ReadOnlyDictionary>(
new Dictionary>
{
@@ -75,7 +75,7 @@ namespace LibationSearchEngine
}
);
- private static ReadOnlyDictionary> numberIndexRules { get; }
+ internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> numberIndexRules { get; }
= new ReadOnlyDictionary>(
new Dictionary>
{
@@ -99,7 +99,7 @@ namespace LibationSearchEngine
}
);
- private static ReadOnlyDictionary> boolIndexRules { get; }
+ internal static ReadOnlyDictionary<string, Func<LibraryBook, bool>> boolIndexRules { get; }
= new ReadOnlyDictionary>(
new Dictionary>
{
@@ -353,112 +353,27 @@ namespace LibationSearchEngine
#region search
public SearchResultSet Search(string searchString)
- {
- Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });
- searchString = FormatSearchQuery(searchString);
+ {
+ using var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
+
+ Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });
+ searchString = QuerySanitizer.Sanitize(searchString, analyzer);
Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString });
- var results = generalSearch(searchString);
+ var results = generalSearch(searchString, analyzer);
Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
displayResults(results);
return results;
}
- internal static string FormatSearchQuery(string searchString)
- {
- if (string.IsNullOrWhiteSpace(searchString))
- return ALL_QUERY;
-
- searchString = replaceBools(searchString);
-
- searchString = parseTag(searchString);
-
- // in ranges " TO " must be uppercase
- searchString = searchString.Replace(" to ", " TO ");
-
- searchString = padNumbers(searchString);
-
- searchString = lowerFieldNames(searchString);
-
- return searchString;
- }
-
- #region format query string
- private static string parseTag(string tagSearchString)
- {
- var allMatches = LuceneRegex
- .TagRegex
- .Matches(tagSearchString)
- .Cast()
- .Select(a => a.ToString())
- .ToList();
- foreach (var match in allMatches)
- tagSearchString = tagSearchString.Replace(
- match,
- TAGS + ":" + match.Trim('[', ']').Trim()
- );
-
- return tagSearchString;
- }
-
- private static string replaceBools(string searchString)
- {
- foreach (var boolSearch in boolIndexRules.Keys)
- searchString =
- LuceneRegex.GetBoolRegex(boolSearch)
- .Replace(searchString, @"$1:True");
-
- return searchString;
- }
-
- private static string padNumbers(string searchString)
- {
- var matches = LuceneRegex
- .NumbersRegex()
- .Matches(searchString)
- .Cast()
- .OrderByDescending(m => m.Index);
-
- foreach (var m in matches)
- {
- var replaceString = double.Parse(m.ToString()).ToLuceneString();
- searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index);
- }
-
- return searchString;
- }
-
- private static string lowerFieldNames(string searchString)
- {
- // fields are case specific
- var allMatches = LuceneRegex
- .FieldRegex
- .Matches(searchString)
- .Cast()
- .Select(a => a.ToString())
- .ToList();
-
- foreach (var match in allMatches)
- searchString = searchString.Replace(match, match.ToLowerInvariant());
-
- return searchString;
- }
- #endregion
-
- private SearchResultSet generalSearch(string searchString)
+ private SearchResultSet generalSearch(string searchString, StandardAnalyzer analyzer)
{
var defaultField = ALL;
using var index = getIndex();
using var searcher = new IndexSearcher(index);
- using var analyzer = new StandardAnalyzer(Version);
- using var asinAnalyzer = new AsinAnalyzer();
-
- var dic = idIndexRules.Keys.Select(k => new KeyValuePair(k.ToLowerInvariant(), asinAnalyzer));
- using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic);
-
- var query = perFieldAnalyzer.GetQuery(defaultField, searchString);
+ var query = analyzer.GetQuery(defaultField, searchString);
// lucene doesn't allow only negations. eg this returns nothing:
// -tags:hidden
diff --git a/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs b/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs
index 987f0f66..15be3d14 100644
--- a/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs
+++ b/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs
@@ -10,6 +10,7 @@ using Dinah.Core;
using FluentAssertions;
using FluentAssertions.Common;
using LibationSearchEngine;
+using Lucene.Net.Analysis.Standard;
using Microsoft.VisualStudio.TestPlatform.Common.Filtering;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Moq;
@@ -31,6 +32,7 @@ namespace SearchEngineTests
// tag surrounded by spaces
[DataRow("[foo]", "tags:foo")]
[DataRow(" [foo]", " tags:foo")]
+ [DataRow(" [ foo ]", " tags:foo")]
[DataRow("[foo] ", "tags:foo ")]
[DataRow(" [foo] ", " tags:foo ")]
[DataRow("-[foo]", "-tags:foo")]
@@ -57,9 +59,12 @@ namespace SearchEngineTests
// bool keyword with [:bool]. Do not add :True
[DataRow("israted:True", "israted:True")]
[DataRow("isRated:false", "israted:false")]
+ [DataRow("liberated AND isRated:false", "liberated:True AND israted:false")]
// tag which happens to be a bool keyword >> parse as tag
[DataRow("[israted]", "tags:israted")]
+ [DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")]
+ [DataRow("[tags][israted]", "tags:tagstags:israted")]
// numbers with "to". TO all caps, numbers [8.2] format
[DataRow("1 to 10", "00000001.00 TO 00000010.00")]
@@ -72,6 +77,10 @@ namespace SearchEngineTests
[DataRow("-isRATED", "-israted:True")]
public void FormattingTest(string input, string output)
- => SearchEngine.FormatSearchQuery(input).Should().Be(output);
+ {
+ using var analyzer = new StandardAnalyzer(SearchEngine.Version);
+
+ QuerySanitizer.Sanitize(input, analyzer).Should().Be(output);
+ }
}
}