Add Asin query tokenizer

This commit is contained in:
Mbucari 2023-06-08 14:23:39 -06:00
parent 359d082ffd
commit c77fe5d561
2 changed files with 93 additions and 7 deletions

View File

@ -0,0 +1,81 @@
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LibationSearchEngine
{
internal class AsinAnalyzer : Analyzer
{
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
return new AsinFilter(reader);
}
/// <summary>
/// Emits the entire input as a single token and removes
/// trailing .00 from strings that parsed to numbers
///
/// Based on Lucene.Net.Analysis.KeywordTokenizer
/// </summary>
private class AsinFilter : Tokenizer
{
private bool done;
private int finalOffset;
private readonly ITermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;
private const int DEFAULT_BUFFER_SIZE = 256;
public AsinFilter(System.IO.TextReader input) : base(input)
{
offsetAtt = AddAttribute<IOffsetAttribute>();
termAtt = AddAttribute<ITermAttribute>();
termAtt.ResizeTermBuffer(DEFAULT_BUFFER_SIZE);
}
public override bool IncrementToken()
{
var charReader = input as CharReader;
if (!done)
{
ClearAttributes();
done = true;
int upto = 0;
char[] buffer = termAtt.TermBuffer();
while (true)
{
int length = charReader.Read(buffer, upto, buffer.Length - upto);
if (length == 0)
break;
upto += length;
if (upto == buffer.Length)
buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
}
var termStr = new string(buffer, 0, upto);
if (termStr.EndsWith(".00"))
upto -= 3;
termAtt.SetTermLength(upto);
finalOffset = CorrectOffset(upto);
offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
return true;
}
return false;
}
public override void End()
{
// set final offset
offsetAtt.SetOffset(finalOffset, finalOffset);
}
public override void Reset(System.IO.TextReader input)
{
base.Reset(input);
this.done = false;
}
}
}
}

View File

@ -6,6 +6,7 @@ using System.Text.RegularExpressions;
using DataLayer; using DataLayer;
using Dinah.Core; using Dinah.Core;
using LibationFileManager; using LibationFileManager;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard; using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents; using Lucene.Net.Documents;
using Lucene.Net.Index; using Lucene.Net.Index;
@ -452,8 +453,12 @@ namespace LibationSearchEngine
using var index = getIndex(); using var index = getIndex();
using var searcher = new IndexSearcher(index); using var searcher = new IndexSearcher(index);
using var analyzer = new StandardAnalyzer(Version); using var analyzer = new StandardAnalyzer(Version);
var query = analyzer.GetQuery(defaultField, searchString); using var asinAnalyzer = new AsinAnalyzer();
var dic = idIndexRules.Keys.Select(k => new KeyValuePair<string, Analyzer>(k.ToLowerInvariant(), asinAnalyzer));
using var perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer, dic);
var query = perFieldAnalyzer.GetQuery(defaultField, searchString);
// lucene doesn't allow only negations. eg this returns nothing: // lucene doesn't allow only negations. eg this returns nothing:
// -tags:hidden // -tags:hidden