Merge branch 'master' of https://github.com/rmcrackan/Libation

2023-06-11 17:03:55 -04:00 · 2023-06-11 17:03:55 -04:00 · a4dfdf80e4
commit a4dfdf80e4
parent d8c90bc745 46accddd2d
7 changed files with 198 additions and 290 deletions
--- a/Documentation/Advanced.md
+++ b/Documentation/Advanced.md
@ -28,6 +28,15 @@ To make upgrades and reinstalls easier, Libation separates all of its responsibi
 * Allow Libation to fix up audiobook metadata. After decrypting a title, Libation attempts to fix details like chapters and cover art. Some power users and/or control freaks prefer to manage this themselves. By unchecking this setting, Libation will only decrypt the book and will leave metadata as-is, warts and all.
 In addition to the options that are enabled if you allow Libation to "fix up" the audiobook, it does the following:
 * Adds the `TCOM` metadata tag for the narrators.
 * Sets the `©gen` metadata tag for the genres.
 * Unescapes the copyright symbol (replaces `&#169;` with `©`)
 * Replaces the recording copyright `(P)` string with `℗`
 * Replaces the chapter markers embedded in the aax file with the chapter markers retrieved from Audible's API.
 * Sets the embedded cover art image with the 500x500 px cover art retrieved from Audible
 ### Command Line Interface
 Libationcli.exe provides limited command-line access to Libation's functionality.
--- a/Source/LibationAvalonia/Controls/WheelComboBox.axaml.cs
+++ b/Source/LibationAvalonia/Controls/WheelComboBox.axaml.cs
@ -1,11 +1,10 @@
 using Avalonia.Controls;
 using Avalonia.Input;
 using Avalonia.Styling;
 using System;
 namespace LibationAvalonia.Controls
 {
-	public partial class WheelComboBox : ComboBox, IStyleable
+	public partial class WheelComboBox : ComboBox
 	{
 		protected override Type StyleKeyOverride => typeof(ComboBox);
--- a/Source/LibationSearchEngine/AsinAnalyzer.cs
+++ b/Source/LibationSearchEngine/AsinAnalyzer.cs
@ -1,81 +0,0 @@
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis;
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 namespace LibationSearchEngine
 {
 	/// <summary>
 	/// Analyzer whose token stream is an <see cref="AsinFilter"/>: the whole
 	/// field value is emitted as one token instead of being split into words.
 	/// </summary>
 	internal class AsinAnalyzer : Analyzer
 	{
 		/// <summary>
 		/// Creates the single-token stream for <paramref name="reader"/>.
 		/// <paramref name="fieldName"/> is not used.
 		/// </summary>
 		public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
 			=> new AsinFilter(reader);
 		/// <summary>
 		/// Emits the entire input as a single token and removes
 		/// a trailing ".00" from that token when present.
 		/// 
 		/// Based on Lucene.Net.Analysis.KeywordTokenizer
 		/// </summary>
 		private class AsinFilter : Tokenizer
 		{
 			private const int DEFAULT_BUFFER_SIZE = 256;
 			// Suffix stripped from the emitted term
 			private const int NUMERIC_SUFFIX_LENGTH = 3; // ".00"
 			private bool done;
 			private int finalOffset;
 			private readonly ITermAttribute termAtt;
 			private readonly IOffsetAttribute offsetAtt;
 			public AsinFilter(System.IO.TextReader input) : base(input)
 			{
 				offsetAtt = AddAttribute<IOffsetAttribute>();
 				termAtt = AddAttribute<ITermAttribute>();
 				termAtt.ResizeTermBuffer(DEFAULT_BUFFER_SIZE);
 			}
 			/// <summary>
 			/// Produces the one and only token on the first call after construction or
 			/// <see cref="Reset(System.IO.TextReader)"/>.
 			/// </summary>
 			/// <returns>true when the token was produced; false on every later call</returns>
 			public override bool IncrementToken()
 			{
 				if (done)
 					return false;
 				ClearAttributes();
 				done = true;
 				int upto = 0;
 				char[] buffer = termAtt.TermBuffer();
 				while (true)
 				{
 					// Read through the base reader itself. Casting it with "as CharReader"
 					// yields null for any other reader type and then throws
 					// NullReferenceException on the first read.
 					int length = input.Read(buffer, upto, buffer.Length - upto);
 					if (length <= 0)
 						break;
 					upto += length;
 					if (upto == buffer.Length)
 						buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
 				}
 				// Ordinal check on the raw buffer: no string allocation, no culture rules
 				if (upto >= NUMERIC_SUFFIX_LENGTH
 					&& buffer[upto - 3] == '.'
 					&& buffer[upto - 2] == '0'
 					&& buffer[upto - 1] == '0')
 					upto -= NUMERIC_SUFFIX_LENGTH;
 				termAtt.SetTermLength(upto);
 				finalOffset = CorrectOffset(upto);
 				offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
 				return true;
 			}
 			/// <summary>Reports the end position of the emitted token as the final offset.</summary>
 			public override void End()
 			{
 				// set final offset 
 				offsetAtt.SetOffset(finalOffset, finalOffset);
 			}
 			/// <summary>Switches to a new reader and allows one more token to be emitted.</summary>
 			public override void Reset(System.IO.TextReader input)
 			{
 				base.Reset(input);
 				this.done = false;
 			}
 		}
 	}
 }
--- a/Source/LibationSearchEngine/LuceneRegex.cs
+++ b/Source/LibationSearchEngine/LuceneRegex.cs
@ -1,103 +0,0 @@
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text.RegularExpressions;
 namespace LibationSearchEngine
 {
    /// <summary>
    /// Regular expressions for recognising the pieces of a user-entered search string:
    /// bracketed tags, field names, numbers and stand-alone bool keywords.
    /// </summary>
    internal static partial class LuceneRegex
    {
        #region pattern pieces
        //  negative lookbehind: cannot be preceded by an escaping \
        const string NOT_ESCAPED = @"(?<!\\)";
        // disallow spaces and lucene reserved characters
        //     + - && || ! ( ) { } [ ] ^ " ~ * ? : \
        // define chars
        // escape and concat
        // create regex. also disallow spaces
        private static char[] disallowedChars { get; } = new[] {
            '+', '-', '&', '|', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '\\' };
        // every char is prefixed with a backslash so it is literal inside the character class below
        private static string disallowedCharsEscaped { get; } = string.Concat(disallowedChars.Select(c => $@"\{c}"));
        private static string WORD_CAPTURE { get; } = $@"([^\s{disallowedCharsEscaped}]+)";
        // : with optional preceding spaces. capture these so i don't accidentally replace a non-field name
        const string FIELD_END = @"(\s*:)";
        const string BEGIN_TAG = @"\[";
        const string END_TAG = @"\]";
        // space is forgiven at beginning and end of tag but not in the middle
        // literal space character only. do NOT allow new lines, tabs, ...
        const string OPTIONAL_SPACE_LITERAL = @"\u0020*";
        #endregion
        // NOTE: the static members below are initialized in declaration order; each one only
        // uses members declared above it, so keep that order when editing.
        private static string tagPattern { get; } = NOT_ESCAPED + BEGIN_TAG + OPTIONAL_SPACE_LITERAL + WORD_CAPTURE + OPTIONAL_SPACE_LITERAL + END_TAG;
        /// <summary>Matches an unescaped <c>[tag]</c>; capture group 1 is the tag name without the optional padding spaces.</summary>
        public static Regex TagRegex { get; } = new Regex(tagPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
        private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END;
        /// <summary>Matches an unescaped <c>field:</c> prefix; group 1 is the field name, group 2 the optional spaces plus the colon.</summary>
        public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
        /// <summary>
        /// auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd)
        ///   positive look behind: beginning  space  {  [  :
        ///   positive look ahead: end  space  ]  }
        /// </summary>
        // RegexOptions.Compiled is not passed: the regex source generator ignores that flag
        [GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})")]
        public static partial Regex NumbersRegex();
        /// <summary>
        /// proper bools are single keywords which are turned into keyword:True
        /// if bordered by colons or inside brackets, they are not stand-alone bool keywords
        /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
        ///   [israted]
        ///     parseTag => tags:israted
        ///     replaceBools => tags:israted:True
        ///   or
        ///     [israted]
        ///       replaceBools => israted:True
        ///         parseTag => [israted:True]
        /// also don't want to apply :True where the value already exists:
        ///   israted:false => israted:false:True
        ///   
        /// despite using parens, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture
        /// </summary>
        private static string boolPattern_parameterized { get; }
            = @"
 ### IMPORTANT: 'ignore whitespace' is only partially honored in character sets
 ### - new lines are ok
 ### - ANY leading whitespace is treated like actual matching spaces  :(
                    ### can't begin with colon. incorrect syntax
                    ### can't begin with open bracket: this signals the start of a tag
 (?<!                # begin negative lookbehind
  [:\[]             #   char set: colon and open bracket, escaped
  \s*               #   optional space
 )                   # end negative lookbehind
 \b                  # word boundary
  ({0})             #   captured bool search keyword. this is the $1 reference used in regex.Replace
 \b                  # word boundary
                    ### can't end with colon. this signals that the bool's value already exists
                    ### can't begin with close bracket: this signals the end of a tag
 (?!                 # begin negative lookahead
  \s*               #   optional space
  [:\]]             #   char set: colon and close bracket, escaped
 )                   # end negative lookahead
 ";
        // Cache of the regexes built by GetBoolRegex, keyed by the keyword exactly as passed in.
        // NOTE(review): plain Dictionary with no synchronization - confirm GetBoolRegex is never called concurrently.
        private static Dictionary<string, Regex> boolRegexDic { get; } = new Dictionary<string, Regex>();
        /// <summary>
        /// Returns the case-insensitive regex that finds <paramref name="boolSearch"/> used as a
        /// stand-alone bool keyword. The regex is built on first request and cached.
        /// </summary>
        /// <param name="boolSearch">bool keyword to look for; it is matched literally</param>
        /// <returns>regex whose capture group 1 is the keyword as it appears in the searched text</returns>
        public static Regex GetBoolRegex(string boolSearch)
        {
            if (boolRegexDic.TryGetValue(boolSearch, out var regex))
                return regex;
            // Escape the keyword: regex metacharacters, whitespace or '#' in it would otherwise
            // be interpreted as pattern syntax (the pattern is compiled with IgnorePatternWhitespace)
            var boolPattern = string.Format(boolPattern_parameterized, Regex.Escape(boolSearch));
            regex = new Regex(boolPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
            boolRegexDic.Add(boolSearch, regex);
            return regex;
        }
    }
 }
--- a/Source/LibationSearchEngine/QuerySanitizer.cs
+++ b/Source/LibationSearchEngine/QuerySanitizer.cs
@ -0,0 +1,153 @@
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Tokenattributes;
 using System.Collections.Generic;
 using System.Linq;
 namespace LibationSearchEngine
 {
 	/// <summary>
 	/// Rewrites a user-entered search string before it is parsed as a query:
 	/// upper-cases the " TO ", " AND " and " OR " operators, lower-cases known field names,
 	/// appends ":True" to bool fields that have no value, turns [tag] into tags:tag
 	/// and converts numeric terms with ToLuceneString().
 	/// </summary>
 	internal static class QuerySanitizer
 	{
 		// Lower-cased names of the ID fields
 		private static readonly HashSet<string> idTerms
 				= SearchEngine.idIndexRules.Keys
 				.Select(s => s.ToLowerInvariant())
 				.ToHashSet();
 		// Lower-cased names of the bool fields
 		private static readonly HashSet<string> boolTerms
 				= SearchEngine.boolIndexRules.Keys
 				.Select(s => s.ToLowerInvariant())
 				.ToHashSet();
 		// Lower-cased names of every known field: string, number, ID and bool
 		private static readonly HashSet<string> fieldTerms
 				= SearchEngine.stringIndexRules.Keys
 				.Union(SearchEngine.numberIndexRules.Keys)
 				.Select(s => s.ToLowerInvariant())
 				.Union(idTerms)
 				.Union(boolTerms)
 				.ToHashSet();
 		/// <summary>
 		/// Tokenizes <paramref name="searchString"/> with <paramref name="analyzer"/> and rebuilds it
 		/// token by token, keeping all text between tokens (spaces, operators, punctuation) as typed.
 		/// </summary>
 		/// <param name="searchString">raw search text entered by the user</param>
 		/// <param name="analyzer">analyzer used to split the text into terms</param>
 		/// <returns>the rewritten query string, or <see cref="SearchEngine.ALL_QUERY"/> when the input is null or white space</returns>
 		internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
 		{
 			if (string.IsNullOrWhiteSpace(searchString))
 				return SearchEngine.ALL_QUERY;
 			// range operator " TO " and bool operators " AND " and " OR " must be uppercase
 			searchString
 				= searchString
 				.Replace(" to ", " TO ", System.StringComparison.OrdinalIgnoreCase)
 				.Replace(" and ", " AND ", System.StringComparison.OrdinalIgnoreCase)
 				.Replace(" or ", " OR ", System.StringComparison.OrdinalIgnoreCase);
 			using var tokenStream = analyzer.TokenStream(SearchEngine.ALL, new System.IO.StringReader(searchString));
 			//The attribute instances stay the same for the whole stream; only their values
 			//change with each IncrementToken(), so they are looked up once
 			var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
 			var offset = tokenStream.GetAttribute<IOffsetAttribute>();
 			var partList = new List<string>();
 			int previousEndOffset = 0;
 			bool previousIsBool = false, previousIsTags = false, previousIsAsin = false;
 			while (tokenStream.IncrementToken())
 			{
 				var term = termAttribute.Term;
 				if (previousIsBool && !bool.TryParse(term, out _))
 				{
 					//The previous term was a boolean tag and this term is NOT a bool value
 					//Add the default ":True" bool and continue parsing the current term
 					partList.Add(":True");
 					previousIsBool = false;
 				}
 				//Add all text between the current token and the previous token
 				partList.Add(searchString.Substring(previousEndOffset, offset.StartOffset - previousEndOffset));
 				if (previousIsBool)
 				{
 					//The previous term was a boolean tag and this term is a bool value
 					addUnalteredToken(offset);
 					previousIsBool = false;
 				}
 				else if (previousIsAsin)
 				{
 					//The previous term was an ASIN field ID, so this term is an ASIN
 					partList.Add(term);
 					previousIsAsin = false;
 				}
 				else if (previousIsTags)
 				{
 					//This term is a tag. Do this check before checking if term is a defined field
 					//so that "tags:israted" does not parse as a bool
 					addUnalteredToken(offset);
 					previousIsTags = false;
 				}
 				else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
 				{
 					//The term is a block tag. add it to the part list
 					partList.Add($"{SearchEngine.TAGS}:{tagName}");
 				}
 				else if (tryParseNumber(term, out var num))
 				{
 					//Term is a number so pad it with zeros
 					partList.Add(num.ToLuceneString());
 				}
 				else if (fieldTerms.Contains(term))
 				{
 					//Term is a defined search field, add it.
 					//The StandardAnalyzer already converts all terms to lowercase
 					partList.Add(term);
 					previousIsBool = boolTerms.Contains(term);
 					previousIsAsin = idTerms.Contains(term);
 					previousIsTags = term == SearchEngine.TAGS;
 				}
 				else
 				{
 					//Term is any other user-defined constant value
 					addUnalteredToken(offset);
 				}
 				//tryParseBlockTag may have moved EndOffset past the closing ']'
 				previousEndOffset = offset.EndOffset;
 			}
 			if (previousIsBool)
 				partList.Add(":True");
 			//Add ending non-token text
 			partList.Add(searchString.Substring(previousEndOffset, searchString.Length - previousEndOffset));
 			return string.Concat(partList);
 			//Add the token exactly as it appears in the search string (original casing, no inter-token text)
 			void addUnalteredToken(IOffsetAttribute offset) =>
 				partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
 		}
 		/// <summary>
 		/// Parses <paramref name="term"/> as a finite number using the invariant culture, so the
 		/// result does not depend on the decimal/group separators of the current culture.
 		/// NaN and infinity are rejected so that such words are handled as ordinary terms.
 		/// </summary>
 		private static bool tryParseNumber(string term, out double number)
 			=> double.TryParse(
 				term,
 				System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
 				System.Globalization.CultureInfo.InvariantCulture,
 				out number)
 			&& double.IsFinite(number);
 		/// <summary>
 		/// Checks whether the current token is written as a block tag, i.e. "[token]" with optional
 		/// white space inside the brackets and no escaping \ before the '['.
 		/// On success the '[' (and any white space after it) is removed from the last entry of
 		/// <paramref name="partList"/> and the end offset of <paramref name="offset"/> is moved
 		/// past the closing ']'.
 		/// </summary>
 		/// <param name="offset">offsets of the current token within <paramref name="searchString"/></param>
 		/// <param name="partList">parts produced so far; the last entry is the text preceding the token</param>
 		/// <param name="searchString">the full search string</param>
 		/// <param name="tagName">the token text when it is a block tag; otherwise null or the rejected token text</param>
 		/// <returns>true when the token is a block tag</returns>
 		private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
 		{
 			tagName = null;
 			if (partList.Count == 0) return false;
 			var previous = partList[^1].TrimEnd();
 			//cannot be preceded by an escaping \
 			if (previous.Length == 0) return false;
 			if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;
 			//The first non-white-space character after the token must be the closing ']'
 			int closeIndex = offset.EndOffset;
 			while (closeIndex < searchString.Length && char.IsWhiteSpace(searchString[closeIndex]))
 				closeIndex++;
 			if (closeIndex == searchString.Length || searchString[closeIndex] != ']') return false;
 			tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);
 			//Only legal tag characters are letters, numbers and underscores
 			//Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
 			foreach (var c in tagName)
 			{
 				if (!char.IsLetterOrDigit(c) && c != '_')
 					return false;
 			}
 			//Remove the leading '['
 			partList[^1] = previous[..^1];
 			//Ignore the trailing ']'
 			offset.SetOffset(offset.StartOffset, closeIndex + 1);
 			return true;
 		}
 	}
 }
--- a/Source/LibationSearchEngine/SearchEngine.cs
+++ b/Source/LibationSearchEngine/SearchEngine.cs
@ -6,8 +6,8 @@ using System.Text.RegularExpressions;
 using DataLayer;
 using Dinah.Core;
 using LibationFileManager;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Documents;
 using Lucene.Net.Index;
 using Lucene.Net.Search;
@ -32,18 +32,18 @@ namespace LibationSearchEngine
        public const string ALL_NARRATOR_NAMES = "NarratorNames";
        public const string ALL_SERIES_NAMES = "SeriesNames";
-        private static ReadOnlyDictionary<string, Func<LibraryBook, string>> idIndexRules { get; }
+        internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> idIndexRules { get; }
            = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
                new Dictionary<string, Func<LibraryBook, string>>
                {
-                    [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId,
+                    [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
-                    ["ProductId"] = lb => lb.Book.AudibleProductId,
+                    ["ProductId"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
-                    ["Id"] = lb => lb.Book.AudibleProductId,
+                    ["Id"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
-                    ["ASIN"] = lb => lb.Book.AudibleProductId
+                    ["ASIN"] = lb => lb.Book.AudibleProductId.ToLowerInvariant()
 				}
                );
-        private static ReadOnlyDictionary<string, Func<LibraryBook, string>> stringIndexRules { get; }
+		internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> stringIndexRules { get; }
            = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
                new Dictionary<string, Func<LibraryBook, string>>
                {
@ -75,7 +75,7 @@ namespace LibationSearchEngine
                }
                );
-        private static ReadOnlyDictionary<string, Func<LibraryBook, string>> numberIndexRules { get; }
+		internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> numberIndexRules { get; }
            = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
                new Dictionary<string, Func<LibraryBook, string>>
                {
@ -99,7 +99,7 @@ namespace LibationSearchEngine
                }
                );
-        private static ReadOnlyDictionary<string, Func<LibraryBook, bool>> boolIndexRules { get; }
+        internal static ReadOnlyDictionary<string, Func<LibraryBook, bool>> boolIndexRules { get; }
            = new ReadOnlyDictionary<string, Func<LibraryBook, bool>>(
                new Dictionary<string, Func<LibraryBook, bool>>
                {
@ -354,111 +354,26 @@ namespace LibationSearchEngine
        #region search
        public SearchResultSet Search(string searchString)
 		{
 			using var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
 			Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });
-            searchString = FormatSearchQuery(searchString);
+            searchString = QuerySanitizer.Sanitize(searchString, analyzer);
            Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString });
-            var results = generalSearch(searchString);
+            var results = generalSearch(searchString, analyzer);
            Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
            displayResults(results);
            return results;
        }
-        internal static string FormatSearchQuery(string searchString)
+        private SearchResultSet generalSearch(string searchString, StandardAnalyzer analyzer)
        {
            if (string.IsNullOrWhiteSpace(searchString))
                return ALL_QUERY;
            searchString = replaceBools(searchString);
            searchString = parseTag(searchString);
            // in ranges " TO " must be uppercase
            searchString = searchString.Replace(" to ", " TO ");
            searchString = padNumbers(searchString);
            searchString = lowerFieldNames(searchString);
            return searchString;
        }
        #region format query string
        private static string parseTag(string tagSearchString)
        {
            var allMatches = LuceneRegex
                .TagRegex
                .Matches(tagSearchString)
                .Cast<Match>()
                .Select(a => a.ToString())
                .ToList();
            foreach (var match in allMatches)
                tagSearchString = tagSearchString.Replace(
                    match,
                    TAGS + ":" + match.Trim('[', ']').Trim()
                    );
            return tagSearchString;
        }
        private static string replaceBools(string searchString)
        {
            foreach (var boolSearch in boolIndexRules.Keys)
                searchString =
                    LuceneRegex.GetBoolRegex(boolSearch)
                    .Replace(searchString, @"$1:True");
            return searchString;
        }
        private static string padNumbers(string searchString)
        {
            var matches = LuceneRegex
                .NumbersRegex()
                .Matches(searchString)
                .Cast<Match>()
                .OrderByDescending(m => m.Index);
            foreach (var m in matches)
            {
                var replaceString = double.Parse(m.ToString()).ToLuceneString();
                searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index);
            }
            return searchString;
        }
        private static string lowerFieldNames(string searchString)
        {
            // fields are case specific
            var allMatches = LuceneRegex
                .FieldRegex
                .Matches(searchString)
                .Cast<Match>()
                .Select(a => a.ToString())
                .ToList();
            foreach (var match in allMatches)
                searchString = searchString.Replace(match, match.ToLowerInvariant());
            return searchString;
        }
        #endregion
        private SearchResultSet generalSearch(string searchString)
        {
            var defaultField = ALL;
            using var index = getIndex();
            using var searcher = new IndexSearcher(index);
-            using var analyzer = new StandardAnalyzer(Version);
+			var query = analyzer.GetQuery(defaultField, searchString);
            using var asinAnalyzer = new AsinAnalyzer();
 			var dic = idIndexRules.Keys.Select(k => new KeyValuePair<string, Analyzer>(k.ToLowerInvariant(), asinAnalyzer));
 			using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic);
 			var query = perFieldAnalyzer.GetQuery(defaultField, searchString);
 			// lucene doesn't allow only negations. eg this returns nothing:
 			//     -tags:hidden
--- a/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs
+++ b/Source/_Tests/LibationSearchEngine.Tests/SearchEngineTests.cs
@ -10,6 +10,7 @@ using Dinah.Core;
 using FluentAssertions;
 using FluentAssertions.Common;
 using LibationSearchEngine;
 using Lucene.Net.Analysis.Standard;
 using Microsoft.VisualStudio.TestPlatform.Common.Filtering;
 using Microsoft.VisualStudio.TestTools.UnitTesting;
 using Moq;
@ -31,6 +32,7 @@ namespace SearchEngineTests
 		// tag surrounded by spaces
 		[DataRow("[foo]", "tags:foo")]
 		[DataRow("  [foo]", "  tags:foo")]
 		[DataRow("  [   foo   ]", "  tags:foo")]
 		[DataRow("[foo]  ", "tags:foo  ")]
 		[DataRow("  [foo]  ", "  tags:foo  ")]
 		[DataRow("-[foo]", "-tags:foo")]
@ -51,15 +53,25 @@ namespace SearchEngineTests
 		[DataRow("-israted  ", "-israted:True  ")]
 		[DataRow("  -israted  ", "  -israted:True  ")]
 		//ID Tags to lowercase and not parsed as numbers
 		[DataRow("id:0000000123", "id:0000000123")]
 		[DataRow("id:B000000123", "id:b000000123")]
 		[DataRow("ASIN:B000000123", "asin:b000000123")]
 		[DataRow("AudibleProductId:B000000123", "audibleproductid:b000000123")]
 		[DataRow("ProductId:B000000123", "productid:b000000123")]
 		// bool keyword. Append :True
 		[DataRow("israted", "israted:True")]
 		// bool keyword with [:bool]. Do not add :True
 		[DataRow("israted:True", "israted:True")]
 		[DataRow("isRated:false", "israted:false")]
 		[DataRow("liberated AND isRated:false", "liberated:True AND israted:false")]
 		// tag which happens to be a bool keyword >> parse as tag
 		[DataRow("[israted]", "tags:israted")]
 		[DataRow("[tags]    [israted] [tags] [tags]  [isliberated] [israted]   ", "tags:tags    tags:israted tags:tags tags:tags  tags:isliberated tags:israted   ")]
 		[DataRow("[tags][israted]", "tags:tagstags:israted")]
 		// numbers with "to". TO all caps, numbers [8.2] format
 		[DataRow("1 to 10", "00000001.00 TO 00000010.00")]
@ -72,6 +84,10 @@ namespace SearchEngineTests
 		[DataRow("-isRATED", "-israted:True")]
 		public void FormattingTest(string input, string output)
-			=> SearchEngine.FormatSearchQuery(input).Should().Be(output);
+		{
 			using var analyzer = new StandardAnalyzer(SearchEngine.Version);
 			QuerySanitizer.Sanitize(input, analyzer).Should().Be(output);
 		}
 	}
 }