Use new .NET regular expression source generators

This commit is contained in:
Michael Bucari-Tovo 2023-02-10 18:20:27 -07:00
parent 5b4a4341ad
commit ee8c0ae27b
5 changed files with 107 additions and 92 deletions

View File

@ -1,11 +1,11 @@
using System;
using NPOI.XWPF.UserModel;
using System;
using System.Text.RegularExpressions;
namespace AppScaffolding
{
public record UpgradeProperties
public partial record UpgradeProperties
{
private static readonly Regex linkstripper = new Regex(@"\[(.*)\]\(.*\)");
public string ZipUrl { get; }
public string HtmlUrl { get; }
public string ZipName { get; }
@ -18,17 +18,10 @@ namespace AppScaffolding
HtmlUrl = htmlUrl;
ZipUrl = zipUrl;
LatestRelease = latestRelease;
Notes = stripMarkdownLinks(notes);
Notes = LinkStripRegex().Replace(notes, "$1");
}
private string stripMarkdownLinks(string body)
{
body = body.Replace(@"\", "");
var matches = linkstripper.Matches(body);
foreach (Match match in matches)
body = body.Replace(match.Groups[0].Value, match.Groups[1].Value);
return body;
}
/// <summary>
/// Matches a Markdown inline link of the form <c>[text](target)</c>; group 1 captures the link text,
/// so replacing a match with <c>$1</c> leaves only the text.
/// </summary>
/// <remarks>
/// Negated character classes are used instead of a greedy <c>.*</c> so that each match is confined
/// to a single link. With <c>.*</c>, a line containing several links was consumed as one match
/// running from the first '[' to the last ')'. Line breaks are excluded so a match never spans lines,
/// just as <c>.</c> never did.
/// </remarks>
[GeneratedRegex(@"\[([^\]\r\n]*)\]\([^)\r\n]*\)")]
private static partial Regex LinkStripRegex();
}
}

View File

@ -20,7 +20,7 @@ namespace DataLayer
PartialDownload = 0x1000
}
public class UserDefinedItem
public partial class UserDefinedItem
{
internal int BookId { get; private set; }
public Book Book { get; private set; }
@ -51,17 +51,22 @@ namespace DataLayer
/// <summary>
/// The individual tags contained in <see cref="Tags"/>, split on whitespace with empty entries removed.
/// Yields no elements when <see cref="Tags"/> is the empty string.
/// </summary>
// Array.Empty avoids allocating a fresh zero-length array on every access.
public IEnumerable<string> TagsEnumerated => Tags == "" ? Array.Empty<string>() : Tags.Split(null as char[], StringSplitOptions.RemoveEmptyEntries);
#region sanitize tags: space delimited. Inline/denormalized. Lower case. Alpha numeric and hyphen
// only legal chars are letters numbers underscores and separating whitespace
//
// technically, the only char.s which aren't easily supported are \ [ ]
// however, whitelisting is far safer than blacklisting (eg: new lines, non-printable character)
// it's easy to expand whitelist as needed
// for lucene, ToLower() isn't needed because search is case-inspecific. for here, it prevents duplicates
//
// there are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score
// full list of characters which must be escaped:
// + - && || ! ( ) { } [ ] ^ " ~ * ? : \
static Regex regex { get; } = new Regex(@"[^\w\d\s_]", RegexOptions.Compiled);
/// <summary>
/// Matches every character that is NOT legal in a tag.
/// The only legal characters are letters, numbers, underscores and separating whitespace.
/// <para>
/// Technically, the only characters which aren't easily supported are \ [ ]
/// However, whitelisting is far safer than blacklisting (eg: new lines, non-printable characters),
/// and it's easy to expand the whitelist as needed.
/// For lucene, ToLower() isn't needed because search is case-insensitive. Here, it prevents duplicates.
/// </para>
/// <para>
/// There are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score.
/// Full list of characters which must be escaped:
/// + - &amp;&amp; || ! ( ) { } [ ] ^ " ~ * ? : \
/// </para>
/// </summary>
[GeneratedRegex(@"[^\w\d\s_]")]
private static partial Regex IllegalCharacterRegex();
private static string sanitize(string input)
{
if (string.IsNullOrWhiteSpace(input))
@ -73,7 +78,7 @@ namespace DataLayer
// assume a hyphen is supposed to be an underscore
.Replace("-", "_");
var unique = regex
var unique = IllegalCharacterRegex()
// turn illegal characters into a space. this will also take care of turning new lines into spaces
.Replace(str, " ")
// split and remove excess spaces

View File

@ -19,7 +19,7 @@ namespace LibationFileManager
static abstract IEnumerable<TagCollection> TagCollections { get; }
}
public abstract class Templates
public abstract partial class Templates
{
public const string ERROR_FULL_PATH_IS_INVALID = @"No colons or full paths allowed. Eg: should not start with C:\";
public const string WARNING_NO_CHAPTER_NUMBER_TAG = "Should include chapter number tag in template used for naming files which are split by chapter. Ie: <ch#> or <ch# 0>";
@ -252,44 +252,29 @@ namespace LibationFileManager
#region Tag Formatters
//Format must have at least one of the string {T}, {F}, {M}, {L}, or {S}
private static readonly Regex FormatRegex = new(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)", RegexOptions.Compiled);
//Sort must have exactly one of the characters F, M, or L
private static readonly Regex SortRegex = new(@"[Ss]ort\(\s*?([FML])\s*?\)", RegexOptions.Compiled);
//Max must have a 1 or 2-digit number
private static readonly Regex MaxRegex = new(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)", RegexOptions.Compiled);
//Separator can be anything
private static readonly Regex SeparatorRegex = new(@"[Ss]eparator\((.*?)\)", RegexOptions.Compiled);
/// <summary>
/// Sort must have exactly one of the characters F, M, or L.
/// Matches <c>sort(X)</c> or <c>Sort(X)</c>, allowing whitespace around X; group 1 captures the letter.
/// </summary>
[GeneratedRegex(@"[Ss]ort\(\s*?([FML])\s*?\)")]
private static partial Regex NamesSortRegex();
/// <summary>
/// Format must have at least one of the strings {T}, {F}, {M}, {L}, or {S}.
/// Matches <c>format(...)</c> or <c>Format(...)</c>; group 1 captures the text between the parentheses.
/// </summary>
[GeneratedRegex(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)")]
private static partial Regex NamesFormatRegex();
/// <summary>
/// Separator can be anything.
/// Matches <c>separator(...)</c> or <c>Separator(...)</c>; group 1 captures the text up to the first
/// closing parenthesis (the capture is lazy, so the separator itself cannot contain ')').
/// </summary>
[GeneratedRegex(@"[Ss]eparator\((.*?)\)")]
private static partial Regex NamesSeparatorRegex();
/// <summary>
/// Max must have a 1 or 2-digit number.
/// Matches <c>max(N)</c> or <c>Max(N)</c>, allowing whitespace around N; group 1 captures the digits.
/// </summary>
[GeneratedRegex(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)")]
private static partial Regex NamesMaxRegex();
private static string NameListFormatter(ITemplateTag templateTag, IEnumerable<string> value, string formatString)
private static string NameListFormatter(ITemplateTag templateTag, IEnumerable<string> names, string formatString)
{
var names = value.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix));
var humanNames = names.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix));
var formatMatch = FormatRegex.Match(formatString);
string nameFormatString = formatMatch.Success ? formatMatch.Groups[1].Value : "{T} {F} {M} {L} {S}";
var sortedNames = sort(humanNames, formatString);
var nameFormatString = format(formatString, defaultValue: "{T} {F} {M} {L} {S}");
var separatorString = separator(formatString, defaultValue: ", ");
var maxNames = max(formatString, defaultValue: humanNames.Count());
var maxMatch = MaxRegex.Match(formatString);
int maxNames = maxMatch.Success && int.TryParse(maxMatch.Groups[1].Value, out var max) ? int.Max(1, max) : int.MaxValue;
var separatorMatch = SeparatorRegex.Match(formatString);
var separatorString = separatorMatch.Success ? separatorMatch.Groups[1].Value : ", ";
var sortMatch = SortRegex.Match(formatString);
var sortedNames
= sortMatch.Success
? (
sortMatch.Groups[1].Value == "F" ? names.OrderBy(n => n.First)
: sortMatch.Groups[1].Value == "M" ? names.OrderBy(n => n.Middle)
: sortMatch.Groups[1].Value == "L" ? names.OrderBy(n => n.Last)
: names
)
: names;
var formattedNames = string.Join(
separatorString,
sortedNames
.Take(int.Min(sortedNames.Count(), maxNames))
.Select(n => formatName(n, nameFormatString)));
var formattedNames = string.Join(separatorString, sortedNames.Take(maxNames).Select(n => formatName(n, nameFormatString)));
while (formattedNames.Contains(" "))
formattedNames = formattedNames.Replace(" ", " ");
@ -299,12 +284,40 @@ namespace LibationFileManager
// Normalizes a raw name string and drops any trailing " - <suffix>" part.
//   1. typographic apostrophes become plain ASCII apostrophes
//   2. " - Ret." is rewritten to ", Ret." so it is kept rather than treated as a suffix
//   3. everything from the first remaining " - " onward is removed, then the result is trimmed
static string removeSuffix(string namesString)
{
    // NOTE(review): the first char literal was empty in this copy of the source (lost in extraction);
    // U+2019 RIGHT SINGLE QUOTATION MARK is assumed from the '\'' replacement -- confirm against the repository.
    namesString = namesString.Replace('\u2019', '\'').Replace(" - Ret.", ", Ret.");

    // Ordinal search: the separator is a fixed ASCII token, not linguistic text.
    int dashIndex = namesString.IndexOf(" - ", StringComparison.Ordinal);

    // dashIndex > 0 (not >= 0): a string that starts with " - " is kept whole rather than emptied.
    return (dashIndex > 0 ? namesString[..dashIndex] : namesString).Trim();
}
// Orders the names by first, middle or last name according to the sort(F|M|L) directive
// in formatString. Without a sort directive the sequence is returned unchanged.
static IEnumerable<HumanName> sort(IEnumerable<HumanName> humanNames, string formatString)
{
    var directive = NamesSortRegex().Match(formatString);
    if (!directive.Success)
        return humanNames;

    return directive.Groups[1].Value switch
    {
        "F" => humanNames.OrderBy(n => n.First),
        "M" => humanNames.OrderBy(n => n.Middle),
        "L" => humanNames.OrderBy(n => n.Last),
        _ => humanNames
    };
}
// Extracts the name format from a format(...) directive in formatString,
// falling back to defaultValue when no such directive is present.
static string format(string formatString, string defaultValue)
{
    var directive = NamesFormatRegex().Match(formatString);
    if (!directive.Success)
        return defaultValue;

    return directive.Groups[1].Value;
}
// Extracts the separator text from a separator(...) directive in formatString,
// falling back to defaultValue when no such directive is present.
static string separator(string formatString, string defaultValue)
{
    var directive = NamesSeparatorRegex().Match(formatString);
    if (!directive.Success)
        return defaultValue;

    return directive.Groups[1].Value;
}
// Reads the name limit from a max(N) directive in formatString.
// Returns defaultValue when the directive is absent or its number cannot be parsed.
static int max(string formatString, int defaultValue)
{
    var directive = NamesMaxRegex().Match(formatString);
    if (!directive.Success || !int.TryParse(directive.Groups[1].Value, out var limit))
        return defaultValue;

    // A requested limit of 0 is raised to 1.
    return int.Max(1, limit);
}
static string formatName(HumanName humanName, string nameFormatString)
{
//Single-word names parse as first names. Use it as last name.

View File

@ -5,7 +5,7 @@ using System.Text.RegularExpressions;
namespace LibationSearchEngine
{
internal static class LuceneRegex
internal static partial class LuceneRegex
{
#region pattern pieces
// negative lookbehind: cannot be preceded by an escaping \
@ -38,10 +38,14 @@ namespace LibationSearchEngine
private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END;
public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
// auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd)
// positive look behind: beginning space { [ :
// positive look ahead: end space ] }
public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled);
/// <summary>
/// Matches numbers so they can be auto-padded to 8 characters. This will match ints and dates (yyyyMMdd).
/// <para>Positive lookbehind: the number must follow the beginning of the input, whitespace, { [ or :</para>
/// <para>Positive lookahead: the number must be followed by the end of the input, whitespace, ] or }</para>
/// </summary>
// RegexOptions.Compiled is intentionally not passed: the regex source generator ignores it.
[GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})")]
public static partial Regex NumbersRegex();
/// <summary>
/// proper bools are single keywords which are turned into keyword:True

View File

@ -402,7 +402,7 @@ namespace LibationSearchEngine
private static string padNumbers(string searchString)
{
var matches = LuceneRegex
.NumbersRegex
.NumbersRegex()
.Matches(searchString)
.Cast<Match>()
.OrderByDescending(m => m.Index);
@ -410,7 +410,7 @@ namespace LibationSearchEngine
foreach (var m in matches)
{
var replaceString = double.Parse(m.ToString()).ToLuceneString();
searchString = LuceneRegex.NumbersRegex.Replace(searchString, replaceString, 1, m.Index);
searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index);
}
return searchString;