Libation/Scraping/UNTESTED/Library/LibraryScraperBase.cs

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using AudibleDotCom;
using DTOs;
using Scraping.Selectors;
namespace Scraping.Library
{
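    // base class for scraping one page of the audible.com library:
    // the page's html is parsed once (html agility pack) and concrete subclasses
    // extract one LibraryDTO per library row via the shared helpers below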
    internal abstract class LibraryScraperBase
    {
        private AudiblePageSource source { get; }
        private WebElement docRoot { get; }
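        // parse the downloaded page source into an in-memory DOM; all scraping runs against docRoot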
        protected LibraryScraperBase(AudiblePageSource pageSource)
        {
            source = pageSource;
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(source.Source);
            docRoot = new WebElement(doc.DocumentNode);
        }
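        // implemented per page layout; returns one LibraryDTO per library row on the current page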
        public abstract IEnumerable<LibraryDTO> ScrapeCurrentPage();
        #region // parallelizing tests: selenium vs html agility pack
        // strategies tested:
        //   iterate:          foreach (var r in rows) extractLibraryDTO(r, ruleFamily, returnProductItems)
        //   yield:            foreach (var r in driver.FindElements(ruleFamily.RowsLocator).ToList()) yield return extractLibraryDTO(r, ruleFamily, returnProductItems)
        //   Parallel_ForEach: Parallel.ForEach(rows, r => extractLibraryDTO(r, ruleFamily, returnProductItems))
        //   WaitAll:          Task.WaitAll(tasks)
        //   AsParallel:       rows.AsParallel().Select(r => extractLibraryDTO(r, ruleFamily, returnProductItems))
        //
        // results, in milliseconds
        //
        // selenium: [1] slow [2] has a bottleneck which resists parallelization
        //   iterate:          394424 - 439711
        //   yield:            387854
        //   Parallel_ForEach: 345149 - 371547
        //   WaitAll:          363970
        //   AsParallel:       369904
        //
        // html agility pack
        //   iterate:          15024 - 19092   (55-60% of this time is downloading images)
        //   Parallel_ForEach:  4060 -  4271   <<< fastest; used below
        //   WaitAll:           3646 -  8702   (mostly ~6-8k)
        //   AsParallel:        4318 -  8378
        #endregion
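        // note: the WaitAll and AsParallel timings above came from test code that is not retained in this
        // class. the following is an untested sketch (hypothetical method names, not part of the original
        // file) of what those two variants could look like against the same extractLibraryDTO helper:
        //
        //   // one task per row; extractLibraryDTO already locks around the shared list
        //   private void scrape1row_waitAll(IEnumerable<WebElement> rows, RuleFamilyLib ruleFamily, List<LibraryDTO> returnProductItems)
        //   {
        //       var tasks = rows.Select(r => Task.Run(() => extractLibraryDTO(r, ruleFamily, returnProductItems))).ToArray();
        //       Task.WaitAll(tasks);
        //   }
        //
        //   // ToList() forces the parallel query to execute for its side effects
        //   private void scrape1row_asParallel(IEnumerable<WebElement> rows, RuleFamilyLib ruleFamily, List<LibraryDTO> returnProductItems)
        //       => rows.AsParallel().Select(r => extractLibraryDTO(r, ruleFamily, returnProductItems)).ToList();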
        // scrape every row on the current page. uses Parallel.ForEach -- the fastest strategy in the tests above
        protected IEnumerable<LibraryDTO> scrapeRows(RuleFamilyLib ruleFamily)
        {
            var sw = System.Diagnostics.Stopwatch.StartNew();

            var rows = ruleFamily.GetRows(docRoot).ToList();
            var returnProductItems = new List<LibraryDTO>();

            scrape1row_parallel(rows, ruleFamily, returnProductItems);
            //scrape1row_iterate(rows, ruleFamily, returnProductItems);

            // 'ms' is never read in code; it's here so elapsed time can be inspected in the debugger
            sw.Stop();
            var ms = sw.ElapsedMilliseconds;

            return returnProductItems;
        }
        // yield-based alternative kept from the tests above; not currently called.
        // lazily returns one LibraryDTO per row as it's extracted
        private IEnumerable<LibraryDTO> scrapeRows_YIELD(WebElement driver, RuleFamilyLib ruleFamily)
        {
            var returnProductItems = new List<LibraryDTO>();
            var rows = driver.FindElements(ruleFamily.RowsLocator).ToList();
            //rows = rows.Take(3).ToList(); // TOP3ONLY

            for (var i = 0; i < rows.Count; i++)
            {
                // break here to see which row we're on
                string currentRow = $"{i + 1} of {rows.Count}";

                var r = rows[i];
                yield return extractLibraryDTO(r, ruleFamily, returnProductItems);
            }
        }
        // sequential version, for comparison with the parallel version below
        private void scrape1row_iterate(IEnumerable<WebElement> rows, RuleFamilyLib ruleFamily, List<LibraryDTO> returnProductItems)
        {
            foreach (var r in rows)
                extractLibraryDTO(r, ruleFamily, returnProductItems);
        }

        // parallel version: the Parallel_ForEach winner from the tests above
        private void scrape1row_parallel(IEnumerable<WebElement> rows, RuleFamilyLib ruleFamily, List<LibraryDTO> returnProductItems)
            => Parallel.ForEach(rows, r => extractLibraryDTO(r, ruleFamily, returnProductItems));
        private object _locker { get; } = new object();

        // extract a single LibraryDTO from one library row by running the rule family against it
        private LibraryDTO extractLibraryDTO(WebElement row, RuleFamilyLib ruleFamily, List<LibraryDTO> returnProductItems)
        {
            var productItem = new LibraryDTO();
            ruleFamily.Rules.Run(row, productItem);

            // a local lock around List<T>.Add is slightly faster than ConcurrentBag
            // https://stackoverflow.com/questions/2950955/concurrentbagmytype-vs-listmytype/34016915#34016915
            lock (_locker)
                returnProductItems.Add(productItem);

            // returning the item is only needed for testing with yield
            return productItem;
        }
    }
}