This commit is contained in:
Robert McRackan 2023-06-11 17:03:55 -04:00
commit a4dfdf80e4
7 changed files with 198 additions and 290 deletions

View File

@ -28,6 +28,15 @@ To make upgrades and reinstalls easier, Libation separates all of its responsibi
* Allow Libation to fix up audiobook metadata. After decrypting a title, Libation attempts to fix details like chapters and cover art. Some power users and/or control freaks prefer to manage this themselves. By unchecking this setting, Libation will only decrypt the book and will leave metadata as-is, warts and all. * Allow Libation to fix up audiobook metadata. After decrypting a title, Libation attempts to fix details like chapters and cover art. Some power users and/or control freaks prefer to manage this themselves. By unchecking this setting, Libation will only decrypt the book and will leave metadata as-is, warts and all.
In addition to the options that are enabled if you allow Libation to "fix up" the audiobook, it does the following:
* Adds the `TCOM` metadata tag for the narrators.
* Sets the `©gen` metadata tag for the genres.
* Unescapes the copyright symbol (replace `&#169;` with `©`)
* Replaces the recording copyright `(P)` string with `℗`
* Replaces the chapter markers embedded in the aax file with the chapter markers retrieved from Audible's API.
* Sets the embedded cover art image with the 500x500 px cover art retrieved from Audible
### Command Line Interface ### Command Line Interface
Libationcli.exe allows limited access to Libation's functionalities as a CLI. Libationcli.exe allows limited access to Libation's functionalities as a CLI.

View File

@ -1,11 +1,10 @@
using Avalonia.Controls; using Avalonia.Controls;
using Avalonia.Input; using Avalonia.Input;
using Avalonia.Styling;
using System; using System;
namespace LibationAvalonia.Controls namespace LibationAvalonia.Controls
{ {
public partial class WheelComboBox : ComboBox, IStyleable public partial class WheelComboBox : ComboBox
{ {
protected override Type StyleKeyOverride => typeof(ComboBox); protected override Type StyleKeyOverride => typeof(ComboBox);

View File

@ -1,81 +0,0 @@
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LibationSearchEngine
{
    /// <summary>
    /// Analyzer whose token stream is an <see cref="AsinFilter"/>: the entire field value
    /// is emitted as one single token instead of being split into words.
    /// </summary>
    internal class AsinAnalyzer : Analyzer
    {
        /// <summary>
        /// Creates the tokenizer for <paramref name="reader"/>.
        /// <paramref name="fieldName"/> is not consulted; every field gets the same tokenizer.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            return new AsinFilter(reader);
        }

        /// <summary>
        /// Emits the entire input as a single token and removes
        /// trailing .00 from strings that parsed to numbers
        ///
        /// Based on Lucene.Net.Analysis.KeywordTokenizer
        /// </summary>
        private class AsinFilter : Tokenizer
        {
            // true once the single token has been produced for the current reader
            private bool done;
            // end offset of the emitted token; reported again by End()
            private int finalOffset;
            private readonly ITermAttribute termAtt;
            private readonly IOffsetAttribute offsetAtt;
            private const int DEFAULT_BUFFER_SIZE = 256;

            public AsinFilter(System.IO.TextReader input) : base(input)
            {
                offsetAtt = AddAttribute<IOffsetAttribute>();
                termAtt = AddAttribute<ITermAttribute>();
                termAtt.ResizeTermBuffer(DEFAULT_BUFFER_SIZE);
            }

            /// <summary>
            /// Reads the whole input into the term buffer and emits it as one token.
            /// </summary>
            /// <returns>true for the first call after construction/Reset, false afterwards.</returns>
            public override bool IncrementToken()
            {
                if (done)
                    return false;

                ClearAttributes();
                done = true;

                int upto = 0;
                char[] buffer = termAtt.TermBuffer();
                while (true)
                {
                    // Read through the tokenizer's own reader. The previous 'input as CharReader'
                    // cast produced null (and then a NullReferenceException) for any reader that
                    // is not a CharReader, e.g. one supplied through Reset.
                    // NOTE(review): confirm which reader types Reset can install in the Lucene.Net version in use.
                    int length = input.Read(buffer, upto, buffer.Length - upto);
                    if (length <= 0)
                        break;

                    upto += length;
                    if (upto == buffer.Length)
                        buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
                }

                // Drop a trailing ".00". Compared char-by-char so the check is ordinal
                // (culture independent) and does not allocate a temporary string.
                if (upto >= 3 && buffer[upto - 3] == '.' && buffer[upto - 2] == '0' && buffer[upto - 1] == '0')
                    upto -= 3;

                termAtt.SetTermLength(upto);
                finalOffset = CorrectOffset(upto);
                offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
                return true;
            }

            public override void End()
            {
                // set final offset
                offsetAtt.SetOffset(finalOffset, finalOffset);
            }

            /// <summary>Switches to a new reader and allows one more token to be emitted.</summary>
            public override void Reset(System.IO.TextReader input)
            {
                base.Reset(input);
                this.done = false;
            }
        }
    }
}

View File

@ -1,103 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace LibationSearchEngine
{
    /// <summary>
    /// Regular expressions used to recognise tags, field names, numbers and bool keywords
    /// inside a search string.
    /// </summary>
    internal static partial class LuceneRegex
    {
        #region pattern pieces

        // negative lookbehind: the match may not directly follow an escaping backslash
        const string NOT_ESCAPED = @"(?<!\\)";

        // A "word" may contain neither whitespace nor any lucene reserved character:
        //   + - && || ! ( ) { } [ ] ^ " ~ * ? : \
        // Steps: list the characters, backslash-escape each one, then build a negated char set from them.
        private static char[] disallowedChars { get; } = new char[]
        {
            '+', '-', '&', '|', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '\\'
        };

        private static string disallowedCharsEscaped { get; }
            = string.Concat(disallowedChars.Select(c => @"\" + c));

        private static string WORD_CAPTURE { get; }
            = @"([^\s" + disallowedCharsEscaped + "]+)";

        // a colon, optionally preceded by whitespace. captured so that only real field names get rewritten
        const string FIELD_END = @"(\s*:)";

        const string BEGIN_TAG = @"\[";
        const string END_TAG = @"\]";

        // spaces are tolerated right inside the brackets of a tag, never in the middle of it.
        // only the literal space character qualifies: no tabs, no new lines, ...
        const string OPTIONAL_SPACE_LITERAL = @"\u0020*";

        #endregion

        private static string tagPattern { get; }
            = $"{NOT_ESCAPED}{BEGIN_TAG}{OPTIONAL_SPACE_LITERAL}{WORD_CAPTURE}{OPTIONAL_SPACE_LITERAL}{END_TAG}";

        /// <summary>Matches an un-escaped bracketed tag; the tag name is capture 1.</summary>
        public static Regex TagRegex { get; }
            = new(tagPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        private static string fieldPattern { get; }
            = $"{NOT_ESCAPED}{WORD_CAPTURE}{FIELD_END}";

        /// <summary>Matches an un-escaped word followed by a colon; the word is capture 1, the colon part capture 2.</summary>
        public static Regex FieldRegex { get; }
            = new(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>
        /// Finds the numbers which get auto-padded to 8 characters. Matches integers and dates (yyyyMMdd)
        /// positive look behind: start of text, whitespace, { [ :
        /// positive look ahead: end of text, whitespace, ] }
        /// </summary>
        [GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled)]
        public static partial Regex NumbersRegex();

        /// <summary>
        /// A proper bool is a stand-alone keyword, which gets rewritten to keyword:True.
        /// A keyword that touches a colon or sits inside brackets is not a stand-alone bool.
        /// The negative lookbehind and lookahead guard against a bool keyword that is also a user-defined tag:
        ///   [israted]
        ///   parseTag => tags:israted
        ///   replaceBools => tags:israted:True
        /// or
        ///   [israted]
        ///   replaceBools => israted:True
        ///   parseTag => [israted:True]
        /// They also keep :True from being appended where a value is already present:
        ///   israted:false => israted:false:True
        ///
        /// Lookahead and lookbehind use parentheses but are zero-length assertions and capture nothing,
        /// so the bool keyword remains $1: the first and only capture.
        /// </summary>
        private static string boolPattern_parameterized { get; } = @"
### IMPORTANT: 'ignore whitespace' is only partially honored in character sets
### - new lines are ok
### - ANY leading whitespace is treated like actual matching spaces :(
### can't begin with colon. incorrect syntax
### can't begin with open bracket: this signals the start of a tag
(?<! # begin negative lookbehind
[:\[] # char set: colon and open bracket, escaped
\s* # optional space
) # end negative lookbehind
\b # word boundary
({0}) # captured bool search keyword. this is the $1 reference used in regex.Replace
\b # word boundary
### can't end with colon. this signals that the bool's value already exists
### can't begin with close bracket: this signals the end of a tag
(?! # begin negative lookahead
\s* # optional space
[:\]] # char set: colon and close bracket, escaped
) # end negative lookahead
";

        // one compiled regex per bool keyword, built on first request
        private static Dictionary<string, Regex> boolRegexDic { get; } = new Dictionary<string, Regex>();

        /// <summary>
        /// Returns the regex which finds <paramref name="boolSearch"/> used as a stand-alone bool keyword.
        /// The regex is created on the first request for a keyword and the same instance is returned afterwards.
        /// </summary>
        public static Regex GetBoolRegex(string boolSearch)
        {
            if (!boolRegexDic.TryGetValue(boolSearch, out var cached))
            {
                cached = new Regex(
                    string.Format(boolPattern_parameterized, boolSearch),
                    RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
                boolRegexDic.Add(boolSearch, cached);
            }

            return cached;
        }
    }
}

View File

@ -0,0 +1,153 @@
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
using System.Collections.Generic;
using System.Linq;
namespace LibationSearchEngine
{
    /// <summary>
    /// Rewrites a user-entered search string before it is handed to the query parser:
    /// upper-cases the TO/AND/OR operators, lower-cases known field names, appends ":True"
    /// to bare bool keywords, turns "[tag]" into "tags:tag" and zero-pads numbers.
    /// Text between tokens is copied through unchanged.
    /// </summary>
    internal static class QuerySanitizer
    {
        // lower-cased names of the ID fields
        private static readonly HashSet<string> idTerms
            = SearchEngine.idIndexRules.Keys
            .Select(s => s.ToLowerInvariant())
            .ToHashSet();

        // lower-cased names of the bool fields
        private static readonly HashSet<string> boolTerms
            = SearchEngine.boolIndexRules.Keys
            .Select(s => s.ToLowerInvariant())
            .ToHashSet();

        // lower-cased names of every known field: string, number, ID and bool
        private static readonly HashSet<string> fieldTerms
            = SearchEngine.stringIndexRules.Keys
            .Union(SearchEngine.numberIndexRules.Keys)
            .Select(s => s.ToLowerInvariant())
            .Union(idTerms)
            .Union(boolTerms)
            .ToHashSet();

        /// <summary>
        /// Produces the sanitized form of <paramref name="searchString"/>.
        /// </summary>
        /// <param name="searchString">The search text as typed by the user.</param>
        /// <param name="analyzer">Analyzer used to split the text into tokens.</param>
        /// <returns>
        /// <see cref="SearchEngine.ALL_QUERY"/> when the input is null, empty or whitespace;
        /// otherwise the rewritten search string.
        /// </returns>
        internal static string Sanitize(string searchString, StandardAnalyzer analyzer)
        {
            if (string.IsNullOrWhiteSpace(searchString))
                return SearchEngine.ALL_QUERY;

            // range operator " TO " and bool operators " AND " and " OR " must be uppercase
            searchString
                = searchString
                .Replace(" to ", " TO ", System.StringComparison.OrdinalIgnoreCase)
                .Replace(" and ", " AND ", System.StringComparison.OrdinalIgnoreCase)
                .Replace(" or ", " OR ", System.StringComparison.OrdinalIgnoreCase);

            using var tokenStream = analyzer.TokenStream(SearchEngine.ALL, new System.IO.StringReader(searchString));

            // The attribute objects belong to the stream and are refilled by every
            // IncrementToken() call, so they are looked up once instead of once per token.
            var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
            var offset = tokenStream.GetAttribute<IOffsetAttribute>();

            var partList = new List<string>();
            int previousEndOffset = 0;
            bool previousIsBool = false, previousIsTags = false, previousIsAsin = false;

            while (tokenStream.IncrementToken())
            {
                var term = termAttribute.Term;

                if (previousIsBool && !bool.TryParse(term, out _))
                {
                    //The previous term was a boolean tag and this term is NOT a bool value
                    //Add the default ":True" bool and continue parsing the current term
                    partList.Add(":True");
                    previousIsBool = false;
                }

                //Add all text between the current token and the previous token
                partList.Add(searchString.Substring(previousEndOffset, offset.StartOffset - previousEndOffset));

                if (previousIsBool)
                {
                    //The previous term was a boolean tag and this term is a bool value
                    addUnalteredToken(offset);
                    previousIsBool = false;
                }
                else if (previousIsAsin)
                {
                    //The previous term was an ASIN field ID, so this term is an ASIN
                    partList.Add(term);
                    previousIsAsin = false;
                }
                else if (previousIsTags)
                {
                    //This term is a tag. Do this check before checking if term is a defined field
                    //so that "tags:israted" does not parse as a bool
                    addUnalteredToken(offset);
                    previousIsTags = false;
                }
                else if (tryParseBlockTag(offset, partList, searchString, out var tagName))
                {
                    //The term is a block tag. add it to the part list
                    partList.Add($"{SearchEngine.TAGS}:{tagName}");
                }
                else if (tryParseNumber(term, out var num))
                {
                    //Term is a number so pad it with zeros
                    partList.Add(num.ToLuceneString());
                }
                else if (fieldTerms.Contains(term))
                {
                    //Term is a defined search field, add it.
                    //The StandardAnalyzer already converts all terms to lowercase
                    partList.Add(term);
                    previousIsBool = boolTerms.Contains(term);
                    previousIsAsin = idTerms.Contains(term);
                    previousIsTags = term == SearchEngine.TAGS;
                }
                else
                {
                    //Term is any other user-defined constant value
                    addUnalteredToken(offset);
                }

                //tryParseBlockTag may have moved the end offset past the closing ']'
                previousEndOffset = offset.EndOffset;
            }

            //The search string ended with a bare bool keyword
            if (previousIsBool)
                partList.Add(":True");

            //Add ending non-token text
            partList.Add(searchString.Substring(previousEndOffset, searchString.Length - previousEndOffset));

            return string.Concat(partList);

            //Add the token exactly as it appears in the search string (original casing)
            void addUnalteredToken(IOffsetAttribute offset) =>
                partList.Add(searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset));
        }

        /// <summary>
        /// Parses <paramref name="term"/> as a number using the invariant culture, so that the
        /// result does not depend on the machine's regional settings ('.' is always the decimal
        /// separator). Non-finite results are rejected so that the words "nan" and "infinity"
        /// are treated as ordinary search terms instead of being rewritten as numbers.
        /// </summary>
        private static bool tryParseNumber(string term, out double number)
            => double.TryParse(
                term,
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out number)
            && double.IsFinite(number);

        /// <summary>
        /// Checks whether the current token is a block tag: "[tagname]", optionally with
        /// whitespace just inside the brackets, and not preceded by an escaping backslash.
        /// On success the '[' is removed from the last entry of <paramref name="partList"/>
        /// and the end of <paramref name="offset"/> is moved past the closing ']'.
        /// </summary>
        /// <param name="offset">Offsets of the current token; modified on success.</param>
        /// <param name="partList">Parts emitted so far; its last entry is the text preceding the token.</param>
        /// <param name="searchString">The string the offsets refer to.</param>
        /// <param name="tagName">The token text as written in the search string. Only meaningful when true is returned.</param>
        /// <returns>true if the token is a block tag.</returns>
        private static bool tryParseBlockTag(IOffsetAttribute offset, List<string> partList, string searchString, out string tagName)
        {
            tagName = null;
            if (partList.Count == 0) return false;

            var previous = partList[^1].TrimEnd();

            //must directly follow an opening '['
            if (previous.Length == 0) return false;
            //cannot be preceeded by an escaping \
            if (previous[^1] != '[' || (previous.Length > 1 && previous[^2] == '\\')) return false;

            //must be followed (after optional whitespace) by the closing ']'
            var next = searchString.Substring(offset.EndOffset);
            if (next.Length == 0 || !next.TrimStart().StartsWith(']')) return false;

            tagName = searchString.Substring(offset.StartOffset, offset.EndOffset - offset.StartOffset);

            //Only legal tag characters are letters, numbers and underscores
            //Per DataLayer.UserDefinedItem.IllegalCharacterRegex()
            foreach (var c in tagName)
            {
                if (!char.IsLetterOrDigit(c) && c != '_')
                    return false;
            }

            //Remove the leading '['
            partList[^1] = previous[..^1];
            //Ignore the trailing ']'
            offset.SetOffset(offset.StartOffset, searchString.IndexOf(']', offset.EndOffset) + 1);
            return true;
        }
    }
}

View File

@ -6,8 +6,8 @@ using System.Text.RegularExpressions;
using DataLayer; using DataLayer;
using Dinah.Core; using Dinah.Core;
using LibationFileManager; using LibationFileManager;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard; using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents; using Lucene.Net.Documents;
using Lucene.Net.Index; using Lucene.Net.Index;
using Lucene.Net.Search; using Lucene.Net.Search;
@ -32,18 +32,18 @@ namespace LibationSearchEngine
public const string ALL_NARRATOR_NAMES = "NarratorNames"; public const string ALL_NARRATOR_NAMES = "NarratorNames";
public const string ALL_SERIES_NAMES = "SeriesNames"; public const string ALL_SERIES_NAMES = "SeriesNames";
private static ReadOnlyDictionary<string, Func<LibraryBook, string>> idIndexRules { get; } internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> idIndexRules { get; }
= new ReadOnlyDictionary<string, Func<LibraryBook, string>>( = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
new Dictionary<string, Func<LibraryBook, string>> new Dictionary<string, Func<LibraryBook, string>>
{ {
[nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId, [nameof(Book.AudibleProductId)] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
["ProductId"] = lb => lb.Book.AudibleProductId, ["ProductId"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
["Id"] = lb => lb.Book.AudibleProductId, ["Id"] = lb => lb.Book.AudibleProductId.ToLowerInvariant(),
["ASIN"] = lb => lb.Book.AudibleProductId ["ASIN"] = lb => lb.Book.AudibleProductId.ToLowerInvariant()
} }
); );
private static ReadOnlyDictionary<string, Func<LibraryBook, string>> stringIndexRules { get; } internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> stringIndexRules { get; }
= new ReadOnlyDictionary<string, Func<LibraryBook, string>>( = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
new Dictionary<string, Func<LibraryBook, string>> new Dictionary<string, Func<LibraryBook, string>>
{ {
@ -75,7 +75,7 @@ namespace LibationSearchEngine
} }
); );
private static ReadOnlyDictionary<string, Func<LibraryBook, string>> numberIndexRules { get; } internal static ReadOnlyDictionary<string, Func<LibraryBook, string>> numberIndexRules { get; }
= new ReadOnlyDictionary<string, Func<LibraryBook, string>>( = new ReadOnlyDictionary<string, Func<LibraryBook, string>>(
new Dictionary<string, Func<LibraryBook, string>> new Dictionary<string, Func<LibraryBook, string>>
{ {
@ -99,7 +99,7 @@ namespace LibationSearchEngine
} }
); );
private static ReadOnlyDictionary<string, Func<LibraryBook, bool>> boolIndexRules { get; } internal static ReadOnlyDictionary<string, Func<LibraryBook, bool>> boolIndexRules { get; }
= new ReadOnlyDictionary<string, Func<LibraryBook, bool>>( = new ReadOnlyDictionary<string, Func<LibraryBook, bool>>(
new Dictionary<string, Func<LibraryBook, bool>> new Dictionary<string, Func<LibraryBook, bool>>
{ {
@ -354,111 +354,26 @@ namespace LibationSearchEngine
#region search #region search
public SearchResultSet Search(string searchString) public SearchResultSet Search(string searchString)
{ {
using var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString }); Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });
searchString = FormatSearchQuery(searchString); searchString = QuerySanitizer.Sanitize(searchString, analyzer);
Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString }); Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString });
var results = generalSearch(searchString); var results = generalSearch(searchString, analyzer);
Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() }); Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
displayResults(results); displayResults(results);
return results; return results;
} }
internal static string FormatSearchQuery(string searchString) private SearchResultSet generalSearch(string searchString, StandardAnalyzer analyzer)
{
if (string.IsNullOrWhiteSpace(searchString))
return ALL_QUERY;
searchString = replaceBools(searchString);
searchString = parseTag(searchString);
// in ranges " TO " must be uppercase
searchString = searchString.Replace(" to ", " TO ");
searchString = padNumbers(searchString);
searchString = lowerFieldNames(searchString);
return searchString;
}
#region format query string
private static string parseTag(string tagSearchString)
{
var allMatches = LuceneRegex
.TagRegex
.Matches(tagSearchString)
.Cast<Match>()
.Select(a => a.ToString())
.ToList();
foreach (var match in allMatches)
tagSearchString = tagSearchString.Replace(
match,
TAGS + ":" + match.Trim('[', ']').Trim()
);
return tagSearchString;
}
private static string replaceBools(string searchString)
{
foreach (var boolSearch in boolIndexRules.Keys)
searchString =
LuceneRegex.GetBoolRegex(boolSearch)
.Replace(searchString, @"$1:True");
return searchString;
}
private static string padNumbers(string searchString)
{
var matches = LuceneRegex
.NumbersRegex()
.Matches(searchString)
.Cast<Match>()
.OrderByDescending(m => m.Index);
foreach (var m in matches)
{
var replaceString = double.Parse(m.ToString()).ToLuceneString();
searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index);
}
return searchString;
}
private static string lowerFieldNames(string searchString)
{
// fields are case specific
var allMatches = LuceneRegex
.FieldRegex
.Matches(searchString)
.Cast<Match>()
.Select(a => a.ToString())
.ToList();
foreach (var match in allMatches)
searchString = searchString.Replace(match, match.ToLowerInvariant());
return searchString;
}
#endregion
private SearchResultSet generalSearch(string searchString)
{ {
var defaultField = ALL; var defaultField = ALL;
using var index = getIndex(); using var index = getIndex();
using var searcher = new IndexSearcher(index); using var searcher = new IndexSearcher(index);
using var analyzer = new StandardAnalyzer(Version); var query = analyzer.GetQuery(defaultField, searchString);
using var asinAnalyzer = new AsinAnalyzer();
var dic = idIndexRules.Keys.Select(k => new KeyValuePair<string, Analyzer>(k.ToLowerInvariant(), asinAnalyzer));
using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic);
var query = perFieldAnalyzer.GetQuery(defaultField, searchString);
// lucene doesn't allow only negations. eg this returns nothing: // lucene doesn't allow only negations. eg this returns nothing:
// -tags:hidden // -tags:hidden

View File

@ -10,6 +10,7 @@ using Dinah.Core;
using FluentAssertions; using FluentAssertions;
using FluentAssertions.Common; using FluentAssertions.Common;
using LibationSearchEngine; using LibationSearchEngine;
using Lucene.Net.Analysis.Standard;
using Microsoft.VisualStudio.TestPlatform.Common.Filtering; using Microsoft.VisualStudio.TestPlatform.Common.Filtering;
using Microsoft.VisualStudio.TestTools.UnitTesting; using Microsoft.VisualStudio.TestTools.UnitTesting;
using Moq; using Moq;
@ -31,6 +32,7 @@ namespace SearchEngineTests
// tag surrounded by spaces // tag surrounded by spaces
[DataRow("[foo]", "tags:foo")] [DataRow("[foo]", "tags:foo")]
[DataRow(" [foo]", " tags:foo")] [DataRow(" [foo]", " tags:foo")]
[DataRow(" [ foo ]", " tags:foo")]
[DataRow("[foo] ", "tags:foo ")] [DataRow("[foo] ", "tags:foo ")]
[DataRow(" [foo] ", " tags:foo ")] [DataRow(" [foo] ", " tags:foo ")]
[DataRow("-[foo]", "-tags:foo")] [DataRow("-[foo]", "-tags:foo")]
@ -51,15 +53,25 @@ namespace SearchEngineTests
[DataRow("-israted ", "-israted:True ")] [DataRow("-israted ", "-israted:True ")]
[DataRow(" -israted ", " -israted:True ")] [DataRow(" -israted ", " -israted:True ")]
//ID Tags to lowercase and not parsed as numbers
[DataRow("id:0000000123", "id:0000000123")]
[DataRow("id:B000000123", "id:b000000123")]
[DataRow("ASIN:B000000123", "asin:b000000123")]
[DataRow("AudibleProductId:B000000123", "audibleproductid:b000000123")]
[DataRow("ProductId:B000000123", "productid:b000000123")]
// bool keyword. Append :True // bool keyword. Append :True
[DataRow("israted", "israted:True")] [DataRow("israted", "israted:True")]
// bool keyword with [:bool]. Do not add :True // bool keyword with [:bool]. Do not add :True
[DataRow("israted:True", "israted:True")] [DataRow("israted:True", "israted:True")]
[DataRow("isRated:false", "israted:false")] [DataRow("isRated:false", "israted:false")]
[DataRow("liberated AND isRated:false", "liberated:True AND israted:false")]
// tag which happens to be a bool keyword >> parse as tag // tag which happens to be a bool keyword >> parse as tag
[DataRow("[israted]", "tags:israted")] [DataRow("[israted]", "tags:israted")]
[DataRow("[tags] [israted] [tags] [tags] [isliberated] [israted] ", "tags:tags tags:israted tags:tags tags:tags tags:isliberated tags:israted ")]
[DataRow("[tags][israted]", "tags:tagstags:israted")]
// numbers with "to". TO all caps, numbers [8.2] format // numbers with "to". TO all caps, numbers [8.2] format
[DataRow("1 to 10", "00000001.00 TO 00000010.00")] [DataRow("1 to 10", "00000001.00 TO 00000010.00")]
@ -72,6 +84,10 @@ namespace SearchEngineTests
[DataRow("-isRATED", "-israted:True")] [DataRow("-isRATED", "-israted:True")]
public void FormattingTest(string input, string output) public void FormattingTest(string input, string output)
=> SearchEngine.FormatSearchQuery(input).Should().Be(output); {
using var analyzer = new StandardAnalyzer(SearchEngine.Version);
QuerySanitizer.Sanitize(input, analyzer).Should().Be(output);
}
} }
} }