Use new .NET regular expression source generators

2023-02-10 18:20:27 -07:00 · 2023-02-10 18:20:27 -07:00 · ee8c0ae27b
commit ee8c0ae27b
parent 5b4a4341ad
5 changed files with 107 additions and 92 deletions
--- a/Source/AppScaffolding/UpgradeProperties.cs
+++ b/Source/AppScaffolding/UpgradeProperties.cs
@ -1,11 +1,11 @@
-using System;
+using NPOI.XWPF.UserModel;
 using System;
 using System.Text.RegularExpressions;
 namespace AppScaffolding
 {
-	public record UpgradeProperties
+	public partial record UpgradeProperties
 	{
 		private static readonly Regex linkstripper = new Regex(@"\[(.*)\]\(.*\)");
 		public string ZipUrl { get; }
 		public string HtmlUrl { get; }
 		public string ZipName { get; }
@ -18,17 +18,10 @@ namespace AppScaffolding
 			HtmlUrl = htmlUrl;
 			ZipUrl = zipUrl;
 			LatestRelease = latestRelease;
-			Notes = stripMarkdownLinks(notes);
+			Notes = LinkStripRegex().Replace(notes, "$1");
 		}
 		private string stripMarkdownLinks(string body)
 		{
 			body = body.Replace(@"\", "");
 			var matches = linkstripper.Matches(body);
-			foreach (Match match in matches)
+		[GeneratedRegex(@"\[(.*)\]\(.*\)")]
-				body = body.Replace(match.Groups[0].Value, match.Groups[1].Value);
+		private static partial Regex LinkStripRegex();
 			return body;
 		}
 	}
 }
--- a/Source/DataLayer/EfClasses/UserDefinedItem.cs
+++ b/Source/DataLayer/EfClasses/UserDefinedItem.cs
@ -20,7 +20,7 @@ namespace DataLayer
        PartialDownload = 0x1000
    }
-    public class UserDefinedItem
+    public partial class UserDefinedItem
    {
        internal int BookId { get; private set; }
        public Book Book { get; private set; }
@ -51,18 +51,23 @@ namespace DataLayer
 		public IEnumerable<string> TagsEnumerated => Tags == "" ? new string[0] : Tags.Split(null as char[], StringSplitOptions.RemoveEmptyEntries);
 		#region sanitize tags: space delimited. Inline/denormalized. Lower case. Alpha numeric and hyphen
-		// only legal chars are letters numbers underscores and separating whitespace
+
-		//
+		/// <summary>
-		// technically, the only char.s which aren't easily supported are  \  [  ]
+		/// only legal chars are letters numbers underscores and separating whitespace
-		// however, whitelisting is far safer than blacklisting (eg: new lines, non-printable character)
+		/// 
-		// it's easy to expand whitelist as needed
+		/// technically, the only char.s which aren't easily supported are  \  [  ]
-		// for lucene, ToLower() isn't needed because search is case-inspecific. for here, it prevents duplicates
+		/// however, whitelisting is far safer than blacklisting (eg: new lines, non-printable character)
-		//
+		/// it's easy to expand whitelist as needed
-		// there are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score
+		/// for lucene, ToLower() isn't needed because search is case-inspecific. for here, it prevents duplicates
-		// full list of characters which must be escaped:
+		/// 
-		//   + - && || ! ( ) { } [ ] ^ " ~ * ? : \
+		/// there are also other allowed but misleading characters. eg: the ^ operator defines a 'boost' score
-		static Regex regex { get; } = new Regex(@"[^\w\d\s_]", RegexOptions.Compiled);
+		/// full list of characters which must be escaped:
-        private static string sanitize(string input)
+		///     + - && || ! ( ) { } [ ] ^ " ~ * ? : \
 		/// </summary>
 		[GeneratedRegex(@"[^\w\d\s_]")]
 		private static partial Regex IllegalCharacterRegex();
 		private static string sanitize(string input)
        {
            if (string.IsNullOrWhiteSpace(input))
                return "";
@ -73,9 +78,9 @@ namespace DataLayer
                // assume a hyphen is supposed to be an underscore
                .Replace("-", "_");
-            var unique = regex
+            var unique = IllegalCharacterRegex()
-                // turn illegal characters into a space. this will also take care of turning new lines into spaces
+				// turn illegal characters into a space. this will also take care of turning new lines into spaces
-                .Replace(str, " ")
+				.Replace(str, " ")
                // split and remove excess spaces
                .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                // de-dup
--- a/Source/LibationFileManager/Templates.cs
+++ b/Source/LibationFileManager/Templates.cs
@ -19,7 +19,7 @@ namespace LibationFileManager
 		static abstract IEnumerable<TagCollection> TagCollections { get; }
 	}
-	public abstract class Templates
+	public abstract partial class Templates
 	{
 		public const string ERROR_FULL_PATH_IS_INVALID = @"No colons or full paths allowed. Eg: should not start with C:\";
 		public const string WARNING_NO_CHAPTER_NUMBER_TAG = "Should include chapter number tag in template used for naming files which are split by chapter. Ie: <ch#> or <ch# 0>";
@ -250,46 +250,31 @@ namespace LibationFileManager
 		#endregion
-		#region Tag Formatters		
+		#region Tag Formatters
-		//Format must have at least one of the string {T}, {F}, {M}, {L}, or {S}
+		/// <summary> Sort must have exactly one of the characters F, M, or L </summary>
-		private static readonly Regex FormatRegex = new(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)", RegexOptions.Compiled);
+		[GeneratedRegex(@"[Ss]ort\(\s*?([FML])\s*?\)")]
-		//Sort must have exactly one of the characters F, M, or L
+		private static partial Regex NamesSortRegex();
-		private static readonly Regex SortRegex = new(@"[Ss]ort\(\s*?([FML])\s*?\)", RegexOptions.Compiled);
+		/// <summary> Format must have at least one of the string {T}, {F}, {M}, {L}, or {S} </summary>
-		//Max must have a 1 or 2-digit number
+		[GeneratedRegex(@"[Ff]ormat\((.*?(?:{[TFMLS]})+.*?)\)")]
-		private static readonly Regex MaxRegex = new(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)", RegexOptions.Compiled);
+		private static partial Regex NamesFormatRegex();
-		//Separator can be anything
+		/// <summary> Separator can be anything </summary>
-		private static readonly Regex SeparatorRegex = new(@"[Ss]eparator\((.*?)\)", RegexOptions.Compiled);
+		[GeneratedRegex(@"[Ss]eparator\((.*?)\)")]
 		private static partial Regex NamesSeparatorRegex();
 		/// <summary> Max must have a 1 or 2-digit number </summary>
 		[GeneratedRegex(@"[Mm]ax\(\s*?(\d{1,2})\s*?\)")]
 		private static partial Regex NamesMaxRegex();
-		private static string NameListFormatter(ITemplateTag templateTag, IEnumerable<string> value, string formatString)
+		private static string NameListFormatter(ITemplateTag templateTag, IEnumerable<string> names, string formatString)
 		{
-			var names = value.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix));
+			var humanNames = names.Select(n => new HumanName(removeSuffix(n), Prefer.FirstOverPrefix));
 			var formatMatch = FormatRegex.Match(formatString);
 			string nameFormatString = formatMatch.Success ? formatMatch.Groups[1].Value : "{T} {F} {M} {L} {S}";
-			var maxMatch = MaxRegex.Match(formatString);
+			var sortedNames = sort(humanNames, formatString);
-			int maxNames = maxMatch.Success && int.TryParse(maxMatch.Groups[1].Value, out var max) ? int.Max(1, max) : int.MaxValue;
+			var nameFormatString = format(formatString, defaultValue: "{T} {F} {M} {L} {S}");
 			var separatorString = separator(formatString, defaultValue: ", ");
 			var maxNames = max(formatString, defaultValue: humanNames.Count());
-			var separatorMatch = SeparatorRegex.Match(formatString);
+			var formattedNames = string.Join(separatorString, sortedNames.Take(maxNames).Select(n => formatName(n, nameFormatString)));
 			var separatorString = separatorMatch.Success ? separatorMatch.Groups[1].Value : ", ";
 			var sortMatch = SortRegex.Match(formatString);
 			var sortedNames
 				= sortMatch.Success
 				? (
 					  sortMatch.Groups[1].Value == "F" ? names.OrderBy(n => n.First)
 					: sortMatch.Groups[1].Value == "M" ? names.OrderBy(n => n.Middle)
 					: sortMatch.Groups[1].Value == "L" ? names.OrderBy(n => n.Last)
 					: names
 				)
 				: names;
 			var formattedNames = string.Join(
 					separatorString,
 					sortedNames
 					.Take(int.Min(sortedNames.Count(), maxNames))
 					.Select(n => formatName(n, nameFormatString)));
 			while (formattedNames.Contains("  "))
 				formattedNames = formattedNames.Replace("  ", " ");
@ -299,12 +284,40 @@ namespace LibationFileManager
 			// Normalizes a raw name string and drops any trailing " - <suffix>" part.
 			// Steps, in order:
 			//   1. the typographic apostrophe (’) becomes a plain single quote
 			//   2. " - Ret." is rewritten to ", Ret." so it is not cut off by step 3
 			//   3. everything from the first " - " onward is removed
 			//   4. surrounding whitespace is trimmed
 			static string removeSuffix(string namesString)
 			{
 				namesString = namesString.Replace('’', '\'').Replace(" - Ret.", ", Ret.");
 				// Ordinal search: the separator is a literal character sequence, and the
 				// Replace calls above are ordinal too, so keep the matching rules consistent
 				// and independent of the current culture.
 				int dashIndex = namesString.IndexOf(" - ", System.StringComparison.Ordinal);
 				// A separator at index 0 would leave nothing before it, so only cut when
 				// there is at least one character ahead of the separator.
 				return (dashIndex > 0 ? namesString[..dashIndex] : namesString).Trim();
 			}
 			// Orders the names according to an optional sort(...) directive in formatString.
 			// The captured key selects the field: F = First, M = Middle, L = Last.
 			// Without a matching directive the sequence is returned in its original order.
 			static IEnumerable<HumanName> sort(IEnumerable<HumanName> humanNames, string formatString)
 			{
 				var directive = NamesSortRegex().Match(formatString);
 				if (!directive.Success)
 					return humanNames;

 				return directive.Groups[1].Value switch
 				{
 					"F" => humanNames.OrderBy(n => n.First),
 					"M" => humanNames.OrderBy(n => n.Middle),
 					"L" => humanNames.OrderBy(n => n.Last),
 					_ => humanNames
 				};
 			}
 			// Returns the first capture group of the format(...) directive found in
 			// formatString, or defaultValue when no such directive matches.
 			static string format(string formatString, string defaultValue)
 			{
 				var directive = NamesFormatRegex().Match(formatString);
 				if (directive.Success)
 					return directive.Groups[1].Value;

 				return defaultValue;
 			}
 			// Returns the text captured from a separator(...) directive in formatString,
 			// or defaultValue when the directive is absent.
 			static string separator(string formatString, string defaultValue)
 			{
 				var directive = NamesSeparatorRegex().Match(formatString);
 				if (directive.Success)
 					return directive.Groups[1].Value;

 				return defaultValue;
 			}
 			// Reads the number from a max(...) directive in formatString.
 			// A parsed value below 1 is raised to 1; when the directive is missing or its
 			// number cannot be parsed, defaultValue is returned instead.
 			static int max(string formatString, int defaultValue)
 			{
 				var directive = NamesMaxRegex().Match(formatString);
 				if (!directive.Success || !int.TryParse(directive.Groups[1].Value, out var requested))
 					return defaultValue;

 				return int.Max(1, requested);
 			}
 			static string formatName(HumanName humanName, string nameFormatString)
 			{
 				//Single-word names parse as first names. Use it as last name.
--- a/Source/LibationSearchEngine/LuceneRegex.cs
+++ b/Source/LibationSearchEngine/LuceneRegex.cs
@ -5,7 +5,7 @@ using System.Text.RegularExpressions;
 namespace LibationSearchEngine
 {
-    internal static class LuceneRegex
+    internal static partial class LuceneRegex
    {
        #region pattern pieces
        //  negative lookbehind: cannot be preceded by an escaping \
@ -38,28 +38,32 @@ namespace LibationSearchEngine
        private static string fieldPattern { get; } = NOT_ESCAPED + WORD_CAPTURE + FIELD_END;
        public static Regex FieldRegex { get; } = new Regex(fieldPattern, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
-        // auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd)
+		/// <summary>
-        //   positive look behind: beginning  space  {  [  :
+		/// auto-pad numbers to 8 char.s. This will match int.s and dates (yyyyMMdd)
-        //   positive look ahead: end  space  ]  }
+		///   positive look behind: beginning  space  {  [  :
-        public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled);
+		///   positive look ahead: end  space  ]  }
 		/// </summary>
-        /// <summary>
+		[GeneratedRegex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled)]
-        /// proper bools are single keywords which are turned into keyword:True
+		public static partial Regex NumbersRegex();
-        /// if bordered by colons or inside brackets, they are not stand-alone bool keywords
+
-        /// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
+		/// <summary>
-        ///   [israted]
+		/// proper bools are single keywords which are turned into keyword:True
-        ///     parseTag => tags:israted
+		/// if bordered by colons or inside brackets, they are not stand-alone bool keywords
-        ///     replaceBools => tags:israted:True
+		/// the negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
-        ///   or
+		///   [israted]
-        ///     [israted]
+		///     parseTag => tags:israted
-        ///       replaceBools => israted:True
+		///     replaceBools => tags:israted:True
-        ///         parseTag => [israted:True]
+		///   or
-        /// also don't want to apply :True where the value already exists:
+		///     [israted]
-        ///   israted:false => israted:false:True
+		///       replaceBools => israted:True
-        ///   
+		///         parseTag => [israted:True]
-        /// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture
+		/// also don't want to apply :True where the value already exists:
-        /// </summary>
+		///   israted:false => israted:false:True
-        private static string boolPattern_parameterized { get; }
+		///   
 		/// despite using parans, lookahead and lookbehind are zero-length assertions which do not capture. therefore the bool search keyword is still $1 since it's the first and only capture
 		/// </summary>
 		private static string boolPattern_parameterized { get; }
            = @"
 ### IMPORTANT: 'ignore whitespace' is only partially honored in character sets
 ### - new lines are ok
@ -95,5 +99,5 @@ namespace LibationSearchEngine
            return regex;
        }
-    }
+	}
 }
--- a/Source/LibationSearchEngine/SearchEngine.cs
+++ b/Source/LibationSearchEngine/SearchEngine.cs
@ -402,7 +402,7 @@ namespace LibationSearchEngine
        private static string padNumbers(string searchString)
        {
            var matches = LuceneRegex
-                .NumbersRegex
+                .NumbersRegex()
                .Matches(searchString)
                .Cast<Match>()
                .OrderByDescending(m => m.Index);
@ -410,7 +410,7 @@ namespace LibationSearchEngine
            foreach (var m in matches)
            {
                var replaceString = double.Parse(m.ToString()).ToLuceneString();
-                searchString = LuceneRegex.NumbersRegex.Replace(searchString, replaceString, 1, m.Index);
+                searchString = LuceneRegex.NumbersRegex().Replace(searchString, replaceString, 1, m.Index);
            }
            return searchString;