273 lines
13 KiB
C#
273 lines
13 KiB
C#
using System;
|
||
using System.Collections.Generic;
|
||
using System.Linq;
|
||
using System.Text.RegularExpressions;
|
||
using AudibleDotCom;
|
||
using Dinah.Core;
|
||
using DTOs;
|
||
using FileManager;
|
||
using Scraping.Selectors;
|
||
|
||
namespace Scraping.Library
|
||
{
|
||
internal class LibraryScraper : LibraryScraperBase
|
||
{
|
||
// Earlier row locator, kept for reference: "//tr[td[@class='adbl-lib-multipart-toggle']]"

/// <summary>
/// The row locator and the per-row rules handed to <c>scrapeRows</c> by <see cref="ScrapeCurrentPage"/>.
/// Built once by the static initializer.
/// </summary>
static RuleFamilyLib ruleFamily { get; }
    = new RuleFamilyLib()
    {
        // Matches every element whose id starts with 'adbl-library-content-row-'.
        // Multi-part rows are ignored, i.e. only the first part of a multi-part title is returned.
        RowsLocator = By.XPath("//*[starts-with(@id,'adbl-library-content-row-')]"),

        // The rules applied to each located row.
        Rules = getRuleSet(),
    };
|
||
// Debugging aid: outer html of the most recent row whose title was read successfully.
static string debug_lastGoodTitleOuterHtml;

/// <summary>
/// Builds the set of rules that fill a library item from one library row:
/// product id, title, episode flag, picture, authors, series ids, pdf links,
/// book download link, date added, the user's own ratings, narrators, length,
/// description and the product's star ratings.
/// </summary>
/// <remarks>
/// Several rules read <c>productItem.IsEpisodes</c>, which is assigned by an earlier rule in this list.
/// NOTE(review): this presumably relies on rules being applied in the order listed - confirm in RuleSetLib.
/// All numeric parsing uses the invariant culture because the scraped text is machine formatted
/// ("4.5", "03-27-18") and must not depend on the locale of the machine running the scrape.
/// </remarks>
/// <returns>A new <c>RuleSetLib</c> containing the rules above.</returns>
protected static RuleSetLib getRuleSet()
    => new RuleSetLib
    {
        // product id
        (row, productItem) => productItem.ProductId = row.FindElements(By.XPath(".//input[@name='asin']")).First().Value.Trim(),

        // title: 1st td, 1st ul, 1st li, within h2
        (row, productItem) =>
        {
            var debug_attemptTitleScriptOuterHtml = row.Node.OuterHtml;
            try
            {
                productItem.Title = row.FindElement(By.XPath("(./td[1]//ul)[1]/li[1]/h2")).Text.Trim();

                debug_lastGoodTitleOuterHtml = debug_attemptTitleScriptOuterHtml;
            }
            catch
            {
                // presumably kept in a local so the failing row's html can be inspected from a debugger
                var badTitle = debug_attemptTitleScriptOuterHtml;

                throw;
            }
        },

        // is episodes. these will not have a book download link or personal library ratings
        (row, productItem) => productItem.IsEpisodes = row.FindElements(By.XPath(".//a[starts-with(@href, '/a/library/subscription?')]")).Any(),

        // get picture id, download images
        (row, productItem) => {
            // picture id = last path segment of the image src, up to its first '.'
            productItem.PictureId = row
                .FindElements(By.ClassName("bc-image-inset-border"))
                .First()
                .GetAttribute("src")
                .Split('/').Last()
                .Split('.').First();
            PictureStorage.DownloadImages(productItem.PictureId);
        },

        // all text links
        new LocatedRuleSetLib(By.XPath(".//a[not(img)]")) {
            (link, productItem) => {
                var href = link.GetAttribute("href");

                if (href == null)
                    return;

                // authors
                var authorName = link.Text.Trim();
                string authorId;

                // with no id. DO NOT REPLACE THIS STEP. needed for valid early exit in 'else'
                if (href.Contains("/search?searchAuthor="))
                    authorId = null;
                // with id: last path segment, query string removed
                else if (href.Contains("/author/"))
                    authorId = href
                        .Split('?')[0]
                        .Split('/').Last();
                else // not an author
                    return;

                productItem.Authors.Add((authorName, authorId));
            },
        },

        // series. id only; not name
        new LocatedRuleSetLib(By.XPath(".//a[text()='View Series']/@href")) {
            (link, productItem) => productItem.Series[link.GetAttribute("href").Replace("series?asin=", "")] = null
        },

        // pdf download link
        new LocatedRuleSetLib(By.ClassName("adbl-lib-action-pdf")) {
            (link, productItem) => productItem.SupplementUrls.Add(link.GetAttribute("href"))
        },

        bookDownloadLink,

        // date added to library
        (row, productItem) => {
            var dateAdded = row
                .FindElements(By.ClassName("bc-text"))
                .Select(l => l.Text.Trim())
                .Where(str => Regex.IsMatch(str, @"^\d\d-\d\d-\d\d$"))
                .Select(dateText => DateTime.ParseExact(dateText, "MM-dd-yy", System.Globalization.CultureInfo.InvariantCulture))
                .ToList();
            if (dateAdded.Any())
                productItem.DateAdded = dateAdded.First();
        },

        // my library ratings
        (row, productItem) => {
            if (productItem.IsEpisodes)
                return;

            productItem.MyUserRating_Overall = int.Parse(
                row.FindElement(By.ClassName("adbl-prod-rate-review-bar-overall")).GetAttribute("data-star-count"),
                System.Globalization.CultureInfo.InvariantCulture);
            productItem.MyUserRating_Performance = int.Parse(
                row.FindElement(By.ClassName("adbl-prod-rate-review-bar-performance")).GetAttribute("data-star-count"),
                System.Globalization.CultureInfo.InvariantCulture);
            productItem.MyUserRating_Story = int.Parse(
                row.FindElement(By.ClassName("adbl-prod-rate-review-bar-story")).GetAttribute("data-star-count"),
                System.Globalization.CultureInfo.InvariantCulture);
        },

        // 1st td (summary panel) (xpath uses 1-based indexes), top bullets
        // to get the first, use parentheses. it will parse w/o parens but will fall through to the 2nd unwanted ul
        new LocatedRuleSetLib(By.XPath("(./td[1]//ul)[1]/li")) {
            (li, productItem) => {
                var text = li.Text.Trim();

                // narrators
                if (!text.StartsWith("Narrated by:", StringComparison.Ordinal))
                    return;

                var narratorNames = text.Replace("Narrated by:", "").Trim();
                productItem.Narrators = sanitizeContributorNames(narratorNames.Split(',')).ToArray();
            },
            (li, productItem) => {
                var text = li.Text.Trim();

                // parse time
                if (!text.StartsWith("Length:", StringComparison.Ordinal))
                    return;
                if (!text.Contains(" hr") && !text.Contains(" min"))
                    return;

                var timeSplit = text
                    .Replace("Length:", "")
                    .Trim()
                    .Split(new string[] { " and " }, StringSplitOptions.RemoveEmptyEntries);
                // do the math for 1 item then add to productItem.
                // If we += directly to the productItem inside foreach(), then time will be doubled if this runs twice
                var tempLengthInMinutes = 0;
                foreach (var part in timeSplit)
                {
                    // seconds are not counted
                    if (part.Contains("sec"))
                        continue;

                    // strip the unit ("hr"/"hrs"/"min"/"mins") to leave the number
                    var intPart = int.Parse(
                        part.Replace("hr", "").Replace("min", "").Replace("s", "").Trim(),
                        System.Globalization.CultureInfo.InvariantCulture);
                    if (part.Contains("hr"))
                        intPart *= 60;
                    tempLengthInMinutes += intPart;
                }
                productItem.LengthInMinutes = tempLengthInMinutes;
            },
        },

        // 1st td (summary panel)
        // description
        new LocatedRuleSetLib(By.XPath("./td[1]//p")) {
            (p, productItem) => {
                var text = p.Text.Trim();
                if (!string.IsNullOrWhiteSpace(text))
                    productItem.Description = sanitizeDescription(text);
            },
        },

        // 1st td (summary panel)
        // 2nd set of bullets has product ratings
        new LocatedRuleSetLib(By.XPath("(./td[1]//ul)[2]/li")) {
            (li, productItem) => {
                // splitting on null assumes white space: https://docs.microsoft.com/en-us/dotnet/api/system.string.split
                var text = li.Text.Split(null as char[], StringSplitOptions.RemoveEmptyEntries);
                if (text.Length < 2)
                    return;

                // Only the label (token 0) and the star value (token 1) are stored.
                // The vote count used to be parsed from token 6 and then discarded, which made
                // any bullet with fewer than 7 tokens throw for a value nobody read.
                var rating = float.Parse(text[1], System.Globalization.CultureInfo.InvariantCulture);
                switch (text[0])
                {
                    case "Overall": productItem.Product_OverallStars = rating; return;
                    case "Performance": productItem.Product_PerformanceStars = rating; return;
                    case "Story": productItem.Product_StoryStars = rating; return;
                }
            },
        },
    };
|
||
|
||
/// <summary>
/// Creates a library scraper for the given page source.
/// The page source is forwarded to the base class; nothing else is initialized here.
/// </summary>
/// <param name="pageSource">Page source passed through to the base constructor.</param>
public LibraryScraper(AudiblePageSource pageSource)
    : base(pageSource)
{
}
|
||
|
||
/// <summary>
/// Scrapes the current page by running <c>scrapeRows</c> with this class's rule family.
/// </summary>
/// <returns>The items produced by <c>scrapeRows</c> for <c>ruleFamily</c>.</returns>
public override IEnumerable<LibraryDTO> ScrapeCurrentPage()
{
    #region // example for once per page rules
    //var onePerPageProductItemValues = scrapeRows(onePerPageRules).Single();
    //foreach (var productItem in scrapeRows(old_family))
    //{
    //    productItem.CustomerId = onePerPageProductItemValues.CustomerId;
    //    productItem.UserName = onePerPageProductItemValues.UserName;
    //    productItem.Last_DownloadCustId = onePerPageProductItemValues.Last_DownloadCustId;
    //    yield return productItem;
    //}
    #endregion

    #region // example for multiple rule sets
    //    var i = 0;
    //    foreach (var oldResult in scrapeRows(oldRuleFamily))
    //    {
    //        i++;
    //        yield return oldResult;
    //    }
    //    if (i > 0)
    //        yield break;
    //    foreach (var newResult in scrapeRows(newRuleFamily))
    //        yield return newResult;
    #endregion

    // only one rule family is in use today, so its rows are handed back as-is
    var scrapedItems = scrapeRows(ruleFamily);
    return scrapedItems;
}
|
||
|
||
/// <summary>
/// Rule that stores the book download link of a library row in <c>productItem.DownloadBookLink</c>.
/// It is broken out into its own method as a proof of concept; it may also help with debugging.
/// </summary>
/// <param name="row">The library row being scraped.</param>
/// <param name="productItem">The item being filled. Nothing is done when its <c>IsEpisodes</c> flag is set.</param>
/// <exception cref="InvalidOperationException">
/// The row contains no element with class 'adbl-lib-action-download'.
/// </exception>
/// <exception cref="Exception">
/// The download link points at the 'howtolisten' page instead of a real download.
/// </exception>
static void bookDownloadLink(WebElement row, LibraryDTO productItem)
{
    if (productItem.IsEpisodes)
        return;

    var downloadLink = row.FindElements(By.ClassName("adbl-lib-action-download")).FirstOrDefault();

    // fail with a diagnosable message instead of a NullReferenceException on the dereference below
    if (downloadLink is null)
        throw new InvalidOperationException(
            "Library row has no element with class 'adbl-lib-action-download'; cannot read the book download link");

    // .Node switches to HtmlAgilityPack style. could also have used xpath .//a (or ./a since it happens to be the immediate descendant)
    productItem.DownloadBookLink = downloadLink.Node
        .Descendants("a").Single()
        .Attributes["href"].Value;

    // check for
    //   href="/howtolisten"
    if (productItem.DownloadBookLink.ContainsInsensitive("howtolisten"))
        throw new Exception("BAD DOWNLOAD LINKS"
            + "\r\n" + "PROBLEM: Library download button is a link to the 'howtolisten' page"
            + "\r\n" + "SOLUTION: Toggle this checkbox: Accounts Details > Update Settings > Software Verification > Check for Audible Download Manager");
}
|
||
|
||
#region static scrape page helpers
|
||
// Matches an optional "introduction" credit such as "(introduction)", "(introductions)",
// " - introduction" or " - introductions", case-insensitively. Callers replace matches with ""
// to drop the credit; the empty matches it also produces are harmless in a replace-with-empty.
// readonly: the compiled instance is shared and must never be reassigned.
private static readonly Regex removeIntroductionsRegex = new Regex(
    #region regex: remove "introduction" variants
    @"
        # keep this. non-greedy
        (.*?)

        # non-capture. this is what to throw away
        (?:
            # this will capture
            #   (introduction)
            #   (introductions)
            #   - introduction
            #   - introductions
            \s*\-?\s*\(?introductions?\)?
        )?
    ", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled
    #endregion
);
|
||
/// <summary>
/// Cleans up contributor names taken from a comma delimited list.
/// For each name: commas are removed, surrounding white space is trimmed, and an
/// "introduction" credit (" - introductions", "(introduction)", etc) is stripped.
/// </summary>
/// <param name="names">The individual names, possibly carrying an introduction credit.</param>
/// <returns>A lazily evaluated sequence with one cleaned name per input name.</returns>
private static IEnumerable<string> sanitizeContributorNames(IEnumerable<string> names)
{
    // room for improvement. all kinds of other things are tagged onto names with hyphens and parens. eg: "(cover illustration)", " - essay"
    var cleanedNames =
        from rawName in names
        select removeIntroductionsRegex.Replace(rawName.Replace(",", "").Trim(), "");

    return cleanedNames;
}
|
||
|
||
/// <summary>
/// Normalizes typographic punctuation in a description to plain ASCII and trims the result.
/// The right single quote becomes ', the ellipsis becomes ..., and curly double quotes become ".
/// </summary>
/// <remarks>
/// Each character is handled in two forms: the real character, and the three-character
/// sequence that results when its UTF-8 bytes are decoded as Windows-1252.
/// Every literal is written with \u escapes so that re-encoding this source file cannot
/// silently turn the two forms into duplicates of each other.
/// </remarks>
/// <param name="desc">The raw description text. Must not be null.</param>
/// <returns>The normalized, trimmed description.</returns>
private static string sanitizeDescription(string desc) => desc
    // mis-decoded forms (UTF-8 bytes read as Windows-1252)
    .Replace("\u00e2\u20ac\u2122", "'")     // mis-decoded right single quote
    .Replace("\u00e2\u20ac\u00a6", "...")   // mis-decoded ellipsis
    .Replace("\u00e2\u20ac\u0153", "\"")    // mis-decoded left double quote
    .Replace("\u00e2\u20ac\u009d", "\"")    // mis-decoded right double quote
    // correctly decoded forms
    .Replace("\u2019", "'")                 // right single quote
    .Replace("\u2026", "...")               // ellipsis
    .Replace("\u201c", "\"")                // left double quote
    .Replace("\u201d", "\"")                // right double quote
    .Trim();
|
||
#endregion
|
||
}
|
||
}
|