diff --git a/Source/AppScaffolding/UpgradeProperties.cs b/Source/AppScaffolding/UpgradeProperties.cs index 506255b2..aa042b52 100644 --- a/Source/AppScaffolding/UpgradeProperties.cs +++ b/Source/AppScaffolding/UpgradeProperties.cs @@ -1,11 +1,10 @@ using System; using System.Text.RegularExpressions; namespace AppScaffolding { - public record UpgradeProperties + public partial record UpgradeProperties { - private static readonly Regex linkstripper = new Regex(@"\[(.*)\]\(.*\)"); public string ZipUrl { get; } public string HtmlUrl { get; } public string ZipName { get; } @@ -18,17 +17,12 @@ namespace AppScaffolding HtmlUrl = htmlUrl; ZipUrl = zipUrl; LatestRelease = latestRelease; - Notes = stripMarkdownLinks(notes); + // Drop markdown escape backslashes first, as the removed stripMarkdownLinks did, then reduce each [text](url) link to its text + Notes = LinkStripRegex().Replace(notes.Replace(@"\", ""), "$1"); } - private string stripMarkdownLinks(string body) - { - body = body.Replace(@"\", ""); - var matches = linkstripper.Matches(body); - foreach (Match match in matches) - body = body.Replace(match.Groups[0].Value, match.Groups[1].Value); - - return body; - } + /// Matches a markdown link [text](url), capturing the link text in group 1. Lazy quantifiers keep several links on one line as separate matches + [GeneratedRegex(@"\[(.*?)\]\(.*?\)")] + private static partial Regex LinkStripRegex(); } } diff --git a/Source/DataLayer/EfClasses/UserDefinedItem.cs b/Source/DataLayer/EfClasses/UserDefinedItem.cs index 91bf236b..d34c45e7 100644 --- a/Source/DataLayer/EfClasses/UserDefinedItem.cs +++ b/Source/DataLayer/EfClasses/UserDefinedItem.cs @@ -20,7 +20,7 @@ namespace DataLayer PartialDownload = 0x1000 } - public class UserDefinedItem + public partial class UserDefinedItem { internal int BookId { get; private set; } public Book Book { get; private set; } @@ -51,18 +51,23 @@ namespace DataLayer public IEnumerable TagsEnumerated => Tags == "" ? new string[0] : Tags.Split(null as char[], StringSplitOptions.RemoveEmptyEntries); #region sanitize tags: space delimited. Inline/denormalized. Lower case. 
Alpha numeric and hyphen - // only legal chars are letters numbers underscores and separating whitespace - // - // technically, the only char.s which aren't easily supported are \ [ ] - // however, whitelisting is far safer than blacklisting (eg: new lines, non-printable character) - // it's easy to expand whitelist as needed - // for lucene, ToLower() isn't needed because search is case-inspecific. for here, it prevents duplicates - // - // there are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score - // full list of characters which must be escaped: - // + - && || ! ( ) { } [ ] ^ " ~ * ? : \ - static Regex regex { get; } = new Regex(@"[^\w\d\s_]", RegexOptions.Compiled); - private static string sanitize(string input) + + /// + /// only legal chars are letters numbers underscores and separating whitespace + /// + /// technically, the only char.s which aren't easily supported are \ [ ] + /// however, whitelisting is far safer than blacklisting (eg: new lines, non-printable character) + /// it's easy to expand whitelist as needed + /// for lucene, ToLower() isn't needed because search is case-inspecific. for here, it prevents duplicates + /// + /// there are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score + /// full list of characters which must be escaped: + /// + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + /// + + [GeneratedRegex(@"[^\w\d\s_]")] + private static partial Regex IllegalCharacterRegex(); + private static string sanitize(string input) { if (string.IsNullOrWhiteSpace(input)) return ""; @@ -73,9 +78,9 @@ namespace DataLayer // assume a hyphen is supposed to be an underscore .Replace("-", "_"); - var unique = regex - // turn illegal characters into a space. this will also take care of turning new lines into spaces - .Replace(str, " ") + var unique = IllegalCharacterRegex() + // turn illegal characters into a space. 
this will also take care of turning new lines into spaces + .Replace(str, " ") // split and remove excess spaces .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries) // de-dup diff --git a/Source/LibationFileManager/Templates.cs b/Source/LibationFileManager/Templates.cs index 70be7bac..f714637e 100644 --- a/Source/LibationFileManager/Templates.cs +++ b/Source/LibationFileManager/Templates.cs @@ -19,7 +19,7 @@ namespace LibationFileManager static abstract IEnumerable TagCollections { get; } } - public abstract class Templates + public abstract partial class Templates { public const string ERROR_FULL_PATH_IS_INVALID = @"No colons or full paths allowed. Eg: should not start with C:\"; public const string WARNING_NO_CHAPTER_NUMBER_TAG = "Should include chapter number tag in template used for naming files which are split by chapter. Ie: or "; @@ -250,46 +250,31 @@ namespace LibationFileManager #endregion - #region Tag Formatters + #region Tag Formatters - //Format must have at least one of the string {T}, {F}, {M}, {L}, or {S} - private static readonly Regex FormatRegex = new(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)", RegexOptions.Compiled); - //Sort must have exactly one of the characters F, M, or L - private static readonly Regex SortRegex = new(@"[Ss]ort\(\s*?([FML])\s*?\)", RegexOptions.Compiled); - //Max must have a 1 or 2-digit number - private static readonly Regex MaxRegex = new(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)", RegexOptions.Compiled); - //Separator can be anything - private static readonly Regex SeparatorRegex = new(@"[Ss]eparator\((.*?)\)", RegexOptions.Compiled); + /// Sort must have exactly one of the characters F, M, or L + [GeneratedRegex(@"[Ss]ort\(\s*?([FML])\s*?\)")] + private static partial Regex NamesSortRegex(); + /// Format must have at least one of the string {T}, {F}, {M}, {L}, or {S} + [GeneratedRegex(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)")] + private static partial Regex NamesFormatRegex(); + /// Separator can be anything + 
[GeneratedRegex(@"[Ss]eparator\((.*?)\)")] + private static partial Regex NamesSeparatorRegex(); + /// Max must have a 1 or 2-digit number + [GeneratedRegex(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)")] + private static partial Regex NamesMaxRegex(); - private static string NameListFormatter(ITemplateTag templateTag, IEnumerable value, string formatString) + private static string NameListFormatter(ITemplateTag templateTag, IEnumerable names, string formatString) { - var names = value.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix)); - - var formatMatch = FormatRegex.Match(formatString); - string nameFormatString = formatMatch.Success ? formatMatch.Groups[1].Value : "{T} {F} {M} {L} {S}"; + var humanNames = names.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix)); - var maxMatch = MaxRegex.Match(formatString); - int maxNames = maxMatch.Success && int.TryParse(maxMatch.Groups[1].Value, out var max) ? int.Max(1, max) : int.MaxValue; + var sortedNames = sort(humanNames, formatString); + var nameFormatString = format(formatString, defaultValue: "{T} {F} {M} {L} {S}"); + var separatorString = separator(formatString, defaultValue: ", "); + var maxNames = max(formatString, defaultValue: int.MaxValue); - var separatorMatch = SeparatorRegex.Match(formatString); - var separatorString = separatorMatch.Success ? separatorMatch.Groups[1].Value : ", "; - - var sortMatch = SortRegex.Match(formatString); - var sortedNames - = sortMatch.Success - ? ( - sortMatch.Groups[1].Value == "F" ? names.OrderBy(n => n.First) - : sortMatch.Groups[1].Value == "M" ? names.OrderBy(n => n.Middle) - : sortMatch.Groups[1].Value == "L" ? 
names.OrderBy(n => n.Last) - : names - ) - : names; - - var formattedNames = string.Join( - separatorString, - sortedNames - .Take(int.Min(sortedNames.Count(), maxNames)) - .Select(n => formatName(n, nameFormatString))); + var formattedNames = string.Join(separatorString, sortedNames.Take(maxNames).Select(n => formatName(n, nameFormatString))); while (formattedNames.Contains(" ")) formattedNames = formattedNames.Replace(" ", " "); @@ -299,12 +284,40 @@ namespace LibationFileManager static string removeSuffix(string namesString) { namesString = namesString.Replace('’', '\'').Replace(" - Ret.", ", Ret."); - int dashIndex = namesString.IndexOf(" - "); - return (dashIndex > 0 ? namesString[..dashIndex] : namesString).Trim(); } + static IEnumerable sort(IEnumerable humanNames, string formatString) + { + var sortMatch = NamesSortRegex().Match(formatString); + return + sortMatch.Success + ? sortMatch.Groups[1].Value == "F" ? humanNames.OrderBy(n => n.First) + : sortMatch.Groups[1].Value == "M" ? humanNames.OrderBy(n => n.Middle) + : sortMatch.Groups[1].Value == "L" ? humanNames.OrderBy(n => n.Last) + : humanNames + : humanNames; + } + + static string format(string formatString, string defaultValue) + { + var formatMatch = NamesFormatRegex().Match(formatString); + return formatMatch.Success ? formatMatch.Groups[1].Value : defaultValue; + } + + static string separator(string formatString, string defaultValue) + { + var separatorMatch = NamesSeparatorRegex().Match(formatString); + return separatorMatch.Success ? separatorMatch.Groups[1].Value : defaultValue; + } + + static int max(string formatString, int defaultValue) + { + var maxMatch = NamesMaxRegex().Match(formatString); + return maxMatch.Success && int.TryParse(maxMatch.Groups[1].Value, out var max) ? int.Max(1, max) : defaultValue; + } + static string formatName(HumanName humanName, string nameFormatString) { //Single-word names parse as first names. Use it as last name. 
diff --git a/Source/LibationSearchEngine/LuceneRegex.cs b/Source/LibationSearchEngine/LuceneRegex.cs index cc055098..4033ee48 100644 --- a/Source/LibationSearchEngine/LuceneRegex.cs +++ b/Source/LibationSearchEngine/LuceneRegex.cs @@ -5,7 +5,7 @@ using System.Text.RegularExpressions; namespace LibationSearchEngine { - internal static class LuceneRegex + internal static partial class LuceneRegex { #region pattern pieces // negative lookbehind: cannot be preceeded by an escaping \ @@ -38,28 +38,32 @@ namespace LibationSearchEngine private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END; public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled); - // auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd) - // positive look behind: beginning space { [ : - // positive look ahead: end space ] } - public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled); + /// + /// auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd) + /// positive look behind: beginning space { [ : + /// positive look ahead: end space ] } + /// - /// - /// proper bools are single keywords which are turned into keyword:True - /// if bordered by colons or inside brackets, they are not stand-alone bool keywords - /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag: - /// [israted] - /// parseTag => tags:israted - /// replaceBools => tags:israted:True - /// or - /// [israted] - /// replaceBools => israted:True - /// parseTag => [israted:True] - /// also don't want to apply :True where the value already exists: - /// israted:false => israted:false:True - /// - /// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. 
therefore the bool search keyword is still $1 since it's the first and only capture - /// - private static string boolPattern_parameterized { get; } + [GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})")] + public static partial Regex NumbersRegex(); + + /// + /// proper bools are single keywords which are turned into keyword:True + /// if bordered by colons or inside brackets, they are not stand-alone bool keywords + /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag: + /// [israted] + /// parseTag => tags:israted + /// replaceBools => tags:israted:True + /// or + /// [israted] + /// replaceBools => israted:True + /// parseTag => [israted:True] + /// also don't want to apply :True where the value already exists: + /// israted:false => israted:false:True + /// + /// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture + /// + private static string boolPattern_parameterized { get; } = @" ### IMPORTANT: 'ignore whitespace' is only partially honored in character sets ### - new lines are ok @@ -95,5 +99,5 @@ namespace LibationSearchEngine return regex; } - } + } } diff --git a/Source/LibationSearchEngine/SearchEngine.cs b/Source/LibationSearchEngine/SearchEngine.cs index 3be589cb..a24266cd 100644 --- a/Source/LibationSearchEngine/SearchEngine.cs +++ b/Source/LibationSearchEngine/SearchEngine.cs @@ -402,7 +402,7 @@ namespace LibationSearchEngine private static string padNumbers(string searchString) { var matches = LuceneRegex - .NumbersRegex + .NumbersRegex() .Matches(searchString) .Cast() .OrderByDescending(m => m.Index); @@ -410,7 +410,7 @@ namespace LibationSearchEngine foreach (var m in matches) { var replaceString = double.Parse(m.ToString()).ToLuceneString(); - searchString = LuceneRegex.NumbersRegex.Replace(searchString, replaceString, 1, 
m.Index); + searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index); } return searchString;