* bug fix: when user creates a tag which is also a reserved bool word (eg: israted), searching for this tag breaks the search

* add unit tests for search engine
This commit is contained in:
Robert McRackan 2021-04-01 15:45:19 -04:00
parent abd00ff1df
commit 726b36de4d
6 changed files with 141 additions and 18 deletions

View File

@ -86,6 +86,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "0 Libation Tests", "0 Libat
EndProject EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InternalUtilities.Tests", "_Tests\InternalUtilities.Tests\InternalUtilities.Tests.csproj", "{8447C956-B03E-4F59-9DD4-877793B849D9}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InternalUtilities.Tests", "_Tests\InternalUtilities.Tests\InternalUtilities.Tests.csproj", "{8447C956-B03E-4F59-9DD4-877793B849D9}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LibationSearchEngine.Tests", "_Tests\LibationSearchEngine.Tests\LibationSearchEngine.Tests.csproj", "{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@ -208,6 +210,10 @@ Global
{8447C956-B03E-4F59-9DD4-877793B849D9}.Debug|Any CPU.Build.0 = Debug|Any CPU {8447C956-B03E-4F59-9DD4-877793B849D9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.ActiveCfg = Release|Any CPU {8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.Build.0 = Release|Any CPU {8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.Build.0 = Release|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE
@ -242,6 +248,7 @@ Global
{E7EFD64D-6630-4426-B09C-B6862A92E3FD} = {F0CBB7A7-D3FB-41FF-8F47-CF3F6A592249} {E7EFD64D-6630-4426-B09C-B6862A92E3FD} = {F0CBB7A7-D3FB-41FF-8F47-CF3F6A592249}
{F3B04A3A-20C8-4582-A54A-715AF6A5D859} = {8679CAC8-9164-4007-BDD2-F004810EDA14} {F3B04A3A-20C8-4582-A54A-715AF6A5D859} = {8679CAC8-9164-4007-BDD2-F004810EDA14}
{8447C956-B03E-4F59-9DD4-877793B849D9} = {67E66E82-5532-4440-AFB3-9FB1DF9DEF53} {8447C956-B03E-4F59-9DD4-877793B849D9} = {67E66E82-5532-4440-AFB3-9FB1DF9DEF53}
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98} = {67E66E82-5532-4440-AFB3-9FB1DF9DEF53}
EndGlobalSection EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {615E00ED-BAEF-4E8E-A92A-9B82D87942A9} SolutionGuid = {615E00ED-BAEF-4E8E-A92A-9B82D87942A9}

View File

@ -13,7 +13,7 @@
<!-- <PublishSingleFile>true</PublishSingleFile> --> <!-- <PublishSingleFile>true</PublishSingleFile> -->
<RuntimeIdentifier>win-x64</RuntimeIdentifier> <RuntimeIdentifier>win-x64</RuntimeIdentifier>
<Version>4.2.1.1</Version> <Version>4.2.2.1</Version>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>

View File

@ -42,5 +42,56 @@ namespace LibationSearchEngine
// positive look behind: beginning space { [ : // positive look behind: beginning space { [ :
// positive look ahead: end space ] } // positive look ahead: end space ] }
public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled); public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled);
/// <summary>
/// proper bools are single keywords which are turned into keyword:True
/// if bordered by colons or inside brackets, they are not stand-alone bool keywords
/// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
/// [israted]
/// parseTag => tags:israted
/// replaceBools => tags:israted:True
/// or
/// [israted]
/// replaceBools => israted:True
/// parseTag => [israted:True]
/// also don't want to apply :True where the value already exists:
/// israted:false => israted:false:True
///
/// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture
/// </summary>
private static string boolPattern_parameterized { get; }
= @"
(?<! # begin negative lookbehind
[ # begin char set
: # colon
\[ # open bracket, escaped
] # end char set
\s* # optional space
) # end negative lookbehind
\b # word boundary
({0}) # captured bool search keyword. this is the $1 reference used in regex.Replace
\b # word boundary
(?! # begin negative lookahead
\s* # optional space
[ # begin char set
: # colon
\] # close bracket, escaped
] # end char set
) # end negative lookahead
";
private static Dictionary<string, Regex> boolRegexDic { get; } = new Dictionary<string, Regex>();
public static Regex GetBoolRegex(string boolSearch)
{
if (boolRegexDic.TryGetValue(boolSearch, out var regex))
return regex;
var boolPattern = string.Format(boolPattern_parameterized, boolSearch);
regex = new Regex(boolPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
boolRegexDic.Add(boolSearch, regex);
return regex;
}
} }
} }

View File

@ -347,32 +347,33 @@ namespace LibationSearchEngine
public SearchResultSet Search(string searchString) public SearchResultSet Search(string searchString)
{ {
Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString }); Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });
searchString = FormatSearchQuery(searchString);
Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString });
var results = generalSearch(searchString);
Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
displayResults(results);
return results;
}
public static string FormatSearchQuery(string searchString)
{
if (string.IsNullOrWhiteSpace(searchString)) if (string.IsNullOrWhiteSpace(searchString))
searchString = ALL_QUERY; return ALL_QUERY;
#region apply formatting
searchString = parseTag(searchString);
searchString = replaceBools(searchString); searchString = replaceBools(searchString);
searchString = parseTag(searchString);
// in ranges " TO " must be uppercase // in ranges " TO " must be uppercase
searchString = searchString.Replace(" to ", " TO "); searchString = searchString.Replace(" to ", " TO ");
searchString = padNumbers(searchString); searchString = padNumbers(searchString);
searchString = lowerFieldNames(searchString); searchString = lowerFieldNames(searchString);
#endregion
Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString }); return searchString;
var results = generalSearch(searchString);
Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
displayResults(results);
return results;
} }
#region format query string #region format query string
@ -395,9 +396,10 @@ namespace LibationSearchEngine
private static string replaceBools(string searchString) private static string replaceBools(string searchString)
{ {
// negative look-ahead for optional spaces then colon. don't want to double-up. eg:"israted:false" => "israted:false:True"
foreach (var boolSearch in boolIndexRules.Keys) foreach (var boolSearch in boolIndexRules.Keys)
searchString = Regex.Replace(searchString, $@"\b({boolSearch})\b(?!\s*:)", @"$1:True", RegexOptions.IgnoreCase); searchString =
LuceneRegex.GetBoolRegex(boolSearch)
.Replace(searchString, @"$1:True");
return searchString; return searchString;
} }

View File

@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net5.0</TargetFramework>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.9.4" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.3" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.3" />
<PackageReference Include="coverlet.collector" Version="3.0.3">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\Dinah.Core\_Tests\TestCommon\TestCommon.csproj" />
<ProjectReference Include="..\..\LibationSearchEngine\LibationSearchEngine.csproj" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,39 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Dinah.Core;
using FluentAssertions;
using FluentAssertions.Common;
using LibationSearchEngine;
using Microsoft.VisualStudio.TestPlatform.Common.Filtering;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Moq;
using Moq.Protected;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using TestCommon;
namespace SearchEngineTests
{
[TestClass]
public class FormatSearchQuery
{
[TestMethod]
[DataRow(null, SearchEngine.ALL_QUERY)]
[DataRow("", SearchEngine.ALL_QUERY)]
[DataRow(" ", SearchEngine.ALL_QUERY)]
[DataRow("israted", "israted:True")]
[DataRow("israted:True", "israted:True")]
[DataRow("isRated:false", "israted:false")]
[DataRow("[israted]", "tags:israted")]
[DataRow("1 to 10", "00000001.00 TO 00000010.00")]
[DataRow("19990101 to 20001231", "19990101.00 TO 20001231.00")]
public void FormattingTest(string input, string output)
=> SearchEngine.FormatSearchQuery(input).Should().Be(output);
}
}