Libation/Scraping/UNTESTED/Library/LibraryScraper.cs
2019-10-21 09:25:56 -04:00

273 lines
13 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using AudibleDotCom;
using Dinah.Core;
using DTOs;
using FileManager;
using Scraping.Selectors;
namespace Scraping.Library
{
internal class LibraryScraper : LibraryScraperBase
{
// row_xpath_OLD = "//tr[td[@class='adbl-lib-multipart-toggle']]"
/// <summary>
/// The one rule family this scraper uses: a locator that selects the library rows on a page,
/// paired with the per-row rules produced by <see cref="getRuleSet"/>.
/// </summary>
private static RuleFamilyLib ruleFamily { get; } = new RuleFamilyLib
{
    // Selects every element whose id begins with 'adbl-library-content-row-'.
    // Multi-part rows are ignored, ie: only the first part of a multi-part title is returned.
    RowsLocator = By.XPath("//*[starts-with(@id,'adbl-library-content-row-')]"),

    Rules = getRuleSet()
};
// debug aid: outer html of the most recent row whose title was parsed successfully.
// written by the title rule in getRuleSet(); nothing in this class reads it
static string debug_lastGoodTitleOuterHtml;
/// <summary>
/// Builds the rules applied to each library row. Every entry reads one piece of the row's html
/// and writes the parsed value onto the product item.
/// </summary>
/// <remarks>
/// The ratings rule and <see cref="bookDownloadLink"/> read productItem.IsEpisodes, which is assigned by an
/// earlier entry in this list. NOTE(review): this presumably relies on rules running in declaration order --
/// confirm against RuleSetLib.
/// </remarks>
/// <returns>A new rule set; a fresh instance is created on every call.</returns>
protected static RuleSetLib getRuleSet()
    => new RuleSetLib
    {
        // product id
        (row, productItem) => productItem.ProductId = row.FindElements(By.XPath(".//input[@name='asin']")).First().Value.Trim(),

        // title: 1st td, 1st ul, 1st li, within h2
        // one-line equivalent without the debug capture:
        //   (row, productItem) => productItem.Title = row.FindElement(By.XPath("(./td[1]//ul)[1]/li[1]/h2")).Text.Trim(),
        (row, productItem) =>
        {
            var debug_attemptTitleScriptOuterHtml = row.Node.OuterHtml;
            try
            {
                productItem.Title = row.FindElement(By.XPath("(./td[1]//ul)[1]/li[1]/h2")).Text.Trim();
                debug_lastGoodTitleOuterHtml = debug_attemptTitleScriptOuterHtml;
            }
            catch
            {
                // debug aid only: gives a local to inspect at a breakpoint before the failure propagates
                var badTitle = debug_attemptTitleScriptOuterHtml;
                throw;
            }
        },

        // is episodes. these will not have a book download link or personal library ratings
        (row, productItem) => productItem.IsEpisodes = row.FindElements(By.XPath(".//a[starts-with(@href, '/a/library/subscription?')]")).Any(),

        // get picture id, download images
        (row, productItem) => {
            // picture id = last path segment of the image src, up to its first '.'
            productItem.PictureId = row
                .FindElements(By.ClassName("bc-image-inset-border"))
                .First()
                .GetAttribute("src")
                .Split('/').Last()
                .Split('.').First();
            PictureStorage.DownloadImages(productItem.PictureId);
        },

        // all text links
        new LocatedRuleSetLib(By.XPath(".//a[not(img)]")) {
            (link, productItem) => {
                var href = link.GetAttribute("href");
                if (href == null)
                    return;

                // authors
                var authorName = link.Text.Trim();
                string authorId;
                // with no id. DO NOT REPLACE THIS STEP. needed for valid early exit in 'else'
                if (href.Contains("/search?searchAuthor="))
                    authorId = null;
                // with id: last path segment before any query string
                else if (href.Contains("/author/"))
                    authorId = href
                        .Split('?')[0]
                        .Split('/').Last();
                else // not an author
                    return;

                productItem.Authors.Add((authorName, authorId));
            },
        },

        // series. id only; not name
        new LocatedRuleSetLib(By.XPath(".//a[text()='View Series']/@href")) {
            (link, productItem) => productItem.Series[link.GetAttribute("href").Replace("series?asin=", "")] = null
        },

        // pdf download link
        new LocatedRuleSetLib(By.ClassName("adbl-lib-action-pdf")) {
            (link, productItem) => productItem.SupplementUrls.Add(link.GetAttribute("href"))
        },

        bookDownloadLink,

        // date added to library
        (row, productItem) => {
            // only text that looks exactly like ##-##-## is treated as a date
            var dateAdded = row
                .FindElements(By.ClassName("bc-text"))
                .Select(l => l.Text.Trim())
                .Where(str => Regex.IsMatch(str, @"^\d\d-\d\d-\d\d$"))
                .Select(dateText => DateTime.ParseExact(dateText, "MM-dd-yy", System.Globalization.CultureInfo.InvariantCulture))
                .ToList();
            if (dateAdded.Any())
                productItem.DateAdded = dateAdded.First();
        },

        // my library ratings
        (row, productItem) => {
            if (productItem.IsEpisodes)
                return;

            productItem.MyUserRating_Overall = int.Parse(row
                .FindElement(By.ClassName("adbl-prod-rate-review-bar-overall")).GetAttribute("data-star-count"));
            productItem.MyUserRating_Performance = int.Parse(row
                .FindElement(By.ClassName("adbl-prod-rate-review-bar-performance")).GetAttribute("data-star-count"));
            productItem.MyUserRating_Story = int.Parse(row
                .FindElement(By.ClassName("adbl-prod-rate-review-bar-story")).GetAttribute("data-star-count"));
        },

        // 1st td (summary panel) (xpath uses 1-based indexes), top bullets
        // to get the first, use parentheses. it will parse w/o parens but will fall through to the 2nd unwanted ul
        new LocatedRuleSetLib(By.XPath("(./td[1]//ul)[1]/li")) {
            (li, productItem) => {
                var text = li.Text.Trim();

                // narrators
                if (!text.StartsWith("Narrated by:"))
                    return;

                var narratorNames = text.Replace("Narrated by:", "").Trim();
                productItem.Narrators = sanitizeContributorNames(narratorNames.Split(',')).ToArray();
            },
            (li, productItem) => {
                var text = li.Text.Trim();

                // parse time
                if (!text.StartsWith("Length:"))
                    return;
                if (!text.Contains(" hr") && !text.Contains(" min"))
                    return;

                var timeSplit = text
                    .Replace("Length:", "")
                    .Trim()
                    .Split(new string[] { " and " }, StringSplitOptions.RemoveEmptyEntries);

                // do the math for 1 item then add to productItem.
                // If we += directly to the productItem inside foreach(), then time will be doubled if this runs twice
                var tempLengthInMinutes = 0;
                foreach (var part in timeSplit)
                {
                    // seconds are not counted
                    if (part.Contains("sec"))
                        continue;

                    // strip the unit (and its plural 's') to leave just the number
                    var intPart = int.Parse(part.Replace("hr", "").Replace("min", "").Replace("s", "").Trim());
                    if (part.Contains("hr"))
                        intPart *= 60;
                    tempLengthInMinutes += intPart;
                }
                productItem.LengthInMinutes = tempLengthInMinutes;
            },
        },

        // 1st td (summary panel)
        // description
        new LocatedRuleSetLib(By.XPath("./td[1]//p")) {
            (p, productItem) => {
                var text = p.Text.Trim();
                if (!string.IsNullOrWhiteSpace(text))
                    productItem.Description = sanitizeDescription(text);
            },
        },

        // 1st td (summary panel)
        // 2nd set of bullets has product ratings
        new LocatedRuleSetLib(By.XPath("(./td[1]//ul)[2]/li")) {
            (li, productItem) => {
                // splitting on null assumes white space: https://docs.microsoft.com/en-us/dotnet/api/system.string.split
                var text = li.Text.Split(null as char[], StringSplitOptions.RemoveEmptyEntries);
                if (text.Length < 2)
                    return;

                // token 0 is the rating category, token 1 is the star value.
                // the star value is scraped text, so parse it culture-invariantly ('.' is always the decimal separator).
                // the vote count further along the line is not parsed: nothing stores it, and indexing it
                // would throw on lines that have a rating but fewer tokens
                var rating = float.Parse(text[1], System.Globalization.CultureInfo.InvariantCulture);

                switch (text[0])
                {
                    case "Overall": productItem.Product_OverallStars = rating; return;
                    case "Performance": productItem.Product_PerformanceStars = rating; return;
                    case "Story": productItem.Product_StoryStars = rating; return;
                }
            },
        },
    };
/// <summary>
/// Creates a library scraper for the given page source. The page source is handed straight to the base class.
/// </summary>
/// <param name="pageSource">The page to scrape.</param>
public LibraryScraper(AudiblePageSource pageSource)
    : base(pageSource)
{
}
/// <summary>
/// Scrapes the current page by running this class's single rule family over it.
/// </summary>
/// <returns>The result of scrapeRows for <see cref="ruleFamily"/>.</returns>
public override IEnumerable<LibraryDTO> ScrapeCurrentPage()
{
    // The two regions below are reference sketches only; neither is executed.

    #region // example for once per page rules
    //var onePerPageProductItemValues = scrapeRows(onePerPageRules).Single();
    //foreach (var productItem in scrapeRows(old_family))
    //{
    //    productItem.CustomerId = onePerPageProductItemValues.CustomerId;
    //    productItem.UserName = onePerPageProductItemValues.UserName;
    //    productItem.Last_DownloadCustId = onePerPageProductItemValues.Last_DownloadCustId;
    //    yield return productItem;
    //}
    #endregion

    #region // example for multiple rule sets
    // var i = 0;
    // foreach (var oldResult in scrapeRows(oldRuleFamily))
    // {
    //     i++;
    //     yield return oldResult;
    // }
    // if (i > 0)
    //     yield break;
    // foreach (var newResult in scrapeRows(newRuleFamily))
    //     yield return newResult;
    #endregion

    IEnumerable<LibraryDTO> scrapedRows = scrapeRows(ruleFamily);
    return scrapedRows;
}
// this is broken out into its own method as a proof of concept. it may also help with debugging
/// <summary>
/// Rule that reads the book download link from a library row and stores it in
/// productItem.DownloadBookLink. Does nothing for rows already flagged as episodes.
/// </summary>
/// <param name="row">The library row being scraped.</param>
/// <param name="productItem">The item being populated; IsEpisodes must already be set.</param>
/// <exception cref="InvalidOperationException">
/// The row has no element with class 'adbl-lib-action-download', or that element does not contain exactly one anchor.
/// </exception>
/// <exception cref="Exception">The download link points at the 'howtolisten' page.</exception>
static void bookDownloadLink(WebElement row, LibraryDTO productItem)
{
    if (productItem.IsEpisodes)
        return;

    var downloadLink = row.FindElements(By.ClassName("adbl-lib-action-download")).FirstOrDefault();

    // FirstOrDefault yields null when the row has no download element.
    // fail with a message that names the problem instead of a bare NullReferenceException
    if (downloadLink == null)
        throw new InvalidOperationException(
            $"Library row for product '{productItem.ProductId}' has no element with class 'adbl-lib-action-download'");

    // Node switches to HtmlAgilityPack style. could also have used xpath .//a (or ./a since it happens to be the immediate descendant)
    productItem.DownloadBookLink = downloadLink.Node
        .Descendants("a").Single()
        .Attributes["href"].Value;

    // check for
    //   href="/howtolisten"
    if (productItem.DownloadBookLink.ContainsInsensitive("howtolisten"))
        throw new Exception("BAD DOWNLOAD LINKS"
            + "\r\n" + "PROBLEM: Library download button is a link to the 'howtolisten' page"
            + "\r\n" + "SOLUTION: Toggle this checkbox: Accounts Details > Update Settings > Software Verification > Check for Audible Download Manager");
}
#region static scrape page helpers
// Strips "introduction" credits from a contributor name: an optional hyphen and optional parentheses
// around "introduction" or "introductions", case-insensitive, including the whitespace in front of it.
// readonly: the compiled instance is shared by every call and is never reassigned
private static readonly Regex removeIntroductionsRegex = new Regex(
    #region regex: remove "introduction" variants
    @"
# keep this. non-greedy
(.*?)
# non-capture. this is what to throw away
(?:
# this will capture
# (introduction)
# (introductions)
# - introduction
# - introductions
\s*\-?\s*\(?introductions?\)?
)?
", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled
    #endregion
    );
// input: names split out of a comma delimited list. possibly with " - introductions", "(introduction)", etc
// output: one cleaned-up name per input name, produced lazily
// room for improvement. all kinds of other things are tagged onto names with hyphens and parens. eg: "(cover illustration)", " - essay"
private static IEnumerable<string> sanitizeContributorNames(IEnumerable<string> names)
{
    return names.Select(rawName =>
    {
        // drop stray commas and outer whitespace first, then strip the "introduction" credit
        var withoutCommas = rawName.Replace(",", "");
        var trimmedName = withoutCommas.Trim();
        return removeIntroductionsRegex.Replace(trimmedName, "");
    });
}
/// <summary>
/// Normalizes typographic punctuation in a scraped description to plain ASCII and trims the result.
/// Curly single quotes become ', curly double quotes become ", and the ellipsis character becomes "...".
/// The same characters are also recognized in their mis-decoded form (UTF-8 bytes read as Windows-1252).
/// </summary>
/// <remarks>
/// Every search string is written with \u escapes so the patterns cannot be altered, or emptied, by an editor
/// or encoding change. An empty search string would make string.Replace throw ArgumentException.
/// </remarks>
/// <param name="desc">Description text; must not be null.</param>
/// <returns>The normalized, trimmed description.</returns>
private static string sanitizeDescription(string desc) => desc
    // mis-decoded forms: each is the 3-character sequence produced when the UTF-8 bytes are read as Windows-1252
    .Replace("\u00e2\u20ac\u2122", "'")   // mis-decoded U+2019 right single quote
    .Replace("\u00e2\u20ac\u02dc", "'")   // mis-decoded U+2018 left single quote
    .Replace("\u00e2\u20ac\u00a6", "...") // mis-decoded U+2026 ellipsis
    .Replace("\u00e2\u20ac\u0153", "\"")  // mis-decoded U+201C left double quote
    .Replace("\u00e2\u20ac\u009d", "\"")  // mis-decoded U+201D right double quote
    // correctly decoded forms
    .Replace("\u2019", "'")   // right single quote
    .Replace("\u2018", "'")   // left single quote
    .Replace("\u2026", "...") // ellipsis
    .Replace("\u201c", "\"")  // left double quote
    .Replace("\u201d", "\"")  // right double quote
    .Trim();
#endregion
}
}