using System;
using System.Collections.Generic;
using System.Linq;
using AudibleDotCom;
using Dinah.Core;
using DTOs;
using Newtonsoft.Json.Linq;
using Scraping.Selectors;

namespace Scraping.BookDetail
{
    /// <summary>Json.NET convenience extensions used by the book detail scraper.</summary>
    static class NewtonsoftExt
    {
        /// <summary>
        /// Casts <paramref name="jToken"/> to a string, trims leading and trailing
        /// whitespace, and returns the HTML-decoded result.
        /// </summary>
        /// <param name="jToken">Token whose string value is read.</param>
        /// <returns>The trimmed, HTML-decoded string value of the token.</returns>
        // NOTE(review): when the token is null (e.g. a missing property) the cast
        // presumably yields null and Trim() then throws NullReferenceException --
        // confirm callers only pass tokens that are present.
        public static string GetDecodedTokenString(this JToken jToken)
            => System.Net.WebUtility.HtmlDecode(((string)jToken).Trim());
    }

    /// <summary>
    /// Scrapes one book detail page: the page HTML is parsed once in the constructor
    /// and <see cref="ScrapePage"/> runs the static rule set against it.
    /// </summary>
    internal class BookDetailScraper
    {
        /// <summary>The page source handed to the constructor.</summary>
        private AudiblePageSource source { get; }

        /// <summary>Wrapper around the document node of the parsed HTML.</summary>
        private WebElement docRoot { get; }

        /// <summary>
        /// Stores <paramref name="pageSource"/> and loads its <c>Source</c> text into an
        /// HtmlAgilityPack document whose document node becomes <see cref="docRoot"/>.
        /// </summary>
        /// <param name="pageSource">Page whose <c>Source</c> HTML is parsed.</param>
        public BookDetailScraper(AudiblePageSource pageSource)
        {
            source = pageSource;

            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(source.Source);
            docRoot = new WebElement(doc.DocumentNode);
        }

        /// <summary>
        /// Rule family shared by all instances: rows are located with the XPath
        /// <c>/*</c> and the rule set contains <c>parseJson</c> and <c>parseSeries</c>.
        /// </summary>
        static RuleFamilyBD ruleFamily { get; } = new RuleFamilyBD
        {
            RowsLocator = By.XPath("/*"),
            Rules = new RuleSetBD
            {
                parseJson,
                parseSeries
            }
        };

        /// <summary>
        /// Creates a <see cref="BookDetailDTO"/> whose <c>ProductId</c> is the page id,
        /// runs every rule in <see cref="ruleFamily"/> against the page, and returns the DTO.
        /// </summary>
        /// <returns>The DTO after the rules have been run on it.</returns>
        public BookDetailDTO ScrapePage()
        {
            //debug//var sw = System.Diagnostics.Stopwatch.StartNew();

            var returnBookDetailDto = new BookDetailDTO { ProductId = source.PageId };

            // The rows locator is expected to match exactly one element (the whole page);
            // Single() enforces that expectation.
            var wholePage = ruleFamily.GetRows(docRoot).Single();
            ruleFamily.Rules.Run(wholePage, returnBookDetailDto);

            //debug//sw.Stop(); var ms = sw.ElapsedMilliseconds;

            return returnBookDetailDto;
        }

        static void parseJson(WebElement row, BookDetailDTO productItem)
        {
            // structured data is in the 2nd of the 3 json embedded sections