From c77fe5d561ddbffd679f0a6bd9f7b9d8b0136ca6 Mon Sep 17 00:00:00 2001
From: Mbucari
Date: Thu, 8 Jun 2023 14:23:39 -0600
Subject: [PATCH] Add Asin query tokenizer

---
 Source/LibationSearchEngine/AsinAnalyzer.cs | 121 +++++++++++++++++++++
 Source/LibationSearchEngine/SearchEngine.cs |  19 +++--
 2 files changed, 133 insertions(+), 7 deletions(-)
 create mode 100644 Source/LibationSearchEngine/AsinAnalyzer.cs

diff --git a/Source/LibationSearchEngine/AsinAnalyzer.cs b/Source/LibationSearchEngine/AsinAnalyzer.cs
new file mode 100644
index 00000000..0d60c42b
--- /dev/null
+++ b/Source/LibationSearchEngine/AsinAnalyzer.cs
@@ -0,0 +1,121 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace LibationSearchEngine
+{
+	/// <summary>
+	/// Analyzer that tokenizes every field with <see cref="AsinFilter"/>,
+	/// so the whole input becomes a single term.
+	/// </summary>
+	internal class AsinAnalyzer : Analyzer
+	{
+		/// <summary>Creates the token stream for a field.</summary>
+		/// <param name="fieldName">Name of the field being analyzed (not used).</param>
+		/// <param name="reader">Reader supplying the text to analyze.</param>
+		/// <returns>A tokenizer that emits the entire input as one token.</returns>
+		public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
+		{
+			return new AsinFilter(reader);
+		}
+
+		/// <summary>
+		/// Emits the entire input as a single token and removes
+		/// trailing .00 from strings that parsed to numbers
+		///
+		/// Based on Lucene.Net.Analysis.KeywordTokenizer
+		/// </summary>
+		private sealed class AsinFilter : Tokenizer
+		{
+			private const int DefaultBufferSize = 256;
+			private const string NumericSuffix = ".00";
+
+			private readonly ITermAttribute termAtt;
+			private readonly IOffsetAttribute offsetAtt;
+			private bool done;
+			private int finalOffset;
+
+			public AsinFilter(System.IO.TextReader input) : base(input)
+			{
+				offsetAtt = AddAttribute<IOffsetAttribute>();
+				termAtt = AddAttribute<ITermAttribute>();
+				termAtt.ResizeTermBuffer(DefaultBufferSize);
+			}
+
+			/// <summary>
+			/// Produces the single token on the first call after construction or
+			/// <see cref="Reset(System.IO.TextReader)"/>, then returns false.
+			/// </summary>
+			public override bool IncrementToken()
+			{
+				if (done)
+					return false;
+
+				ClearAttributes();
+				done = true;
+
+				int upto = 0;
+				char[] buffer = termAtt.TermBuffer();
+
+				while (true)
+				{
+					// Read via the TextReader itself. Reset accepts any TextReader, so an
+					// "as CharReader" cast could produce null and crash on the Read call.
+					int length = input.Read(buffer, upto, buffer.Length - upto);
+					if (length <= 0)
+						break;
+
+					upto += length;
+					// Buffer is full: grow it so the next Read has room.
+					if (upto == buffer.Length)
+						buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
+				}
+
+				// Offsets describe the text that was read, so capture the end
+				// offset before the term is shortened.
+				finalOffset = CorrectOffset(upto);
+
+				if (EndsWithNumericSuffix(buffer, upto))
+					upto -= NumericSuffix.Length;
+
+				termAtt.SetTermLength(upto);
+				offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
+				return true;
+			}
+
+			public override void End()
+			{
+				// set final offset
+				offsetAtt.SetOffset(finalOffset, finalOffset);
+			}
+
+			public override void Reset(System.IO.TextReader input)
+			{
+				base.Reset(input);
+				done = false;
+			}
+
+			/// <summary>
+			/// Ordinal (culture-independent) check that the first <paramref name="length"/>
+			/// characters of <paramref name="buffer"/> end with <see cref="NumericSuffix"/>.
+			/// </summary>
+			private static bool EndsWithNumericSuffix(char[] buffer, int length)
+			{
+				int start = length - NumericSuffix.Length;
+				if (start < 0)
+					return false;
+
+				for (int i = 0; i < NumericSuffix.Length; i++)
+				{
+					if (buffer[start + i] != NumericSuffix[i])
+						return false;
+				}
+				return true;
+			}
+		}
+	}
+}
diff --git a/Source/LibationSearchEngine/SearchEngine.cs b/Source/LibationSearchEngine/SearchEngine.cs
index 436fe223..83bde6e4 100644
--- a/Source/LibationSearchEngine/SearchEngine.cs
+++ b/Source/LibationSearchEngine/SearchEngine.cs
@@ -6,6 +6,7 @@ using System.Text.RegularExpressions;
 using DataLayer;
 using Dinah.Core;
 using LibationFileManager;
+using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Documents;
 using Lucene.Net.Index;
@@ -452,15 +453,19 @@ namespace LibationSearchEngine
 			using var index = getIndex();
 			using var searcher = new IndexSearcher(index);
 			using var analyzer = new StandardAnalyzer(Version);
-			var query = analyzer.GetQuery(defaultField, searchString);
+			using var asinAnalyzer = new AsinAnalyzer();
+			var dic = idIndexRules.Keys.Select(k => new KeyValuePair<string, Analyzer>(k.ToLowerInvariant(), asinAnalyzer));
+			using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic);
 
 
-			// lucene doesn't allow only negations. eg this returns nothing:
-			// -tags:hidden
-			// work arounds: https://kb.ucla.edu/articles/pure-negation-query-in-lucene
-			// HOWEVER, doing this to any other type of query can cause EVERYTHING to be a match unless "Occur" is carefully set
-			// this should really check that all leaf nodes are MUST_NOT
-			if (query is BooleanQuery boolQuery)
+			var query = perFieldAnalyzer.GetQuery(defaultField, searchString);
+
+			// lucene doesn't allow only negations. eg this returns nothing:
+			// -tags:hidden
+			// work arounds: https://kb.ucla.edu/articles/pure-negation-query-in-lucene
+			// HOWEVER, doing this to any other type of query can cause EVERYTHING to be a match unless "Occur" is carefully set
+			// this should really check that all leaf nodes are MUST_NOT
+			if (query is BooleanQuery boolQuery)
 			{
 				var occurs = getOccurs_recurs(boolQuery);
 				if (occurs.Any() && occurs.All(o => o == Occur.MUST_NOT))