using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using AudibleDotCom;
using DataLayer;
using Dinah.Core.ErrorHandling;
using DTOs;
using InternalUtilities;
using Scraping;
namespace ScrapingDomainServices
{
/// <summary>
/// book detail page:
/// - audible webpage => AudiblePageSource
/// - AudiblePageSource => declaw => htm file
/// - AudiblePageSource => scrape => DTO
/// - DTO => json file
/// - DTO => db
/// - update lucene
/// </summary>
public class ScrapeBookDetails : DownloadableBase
{
    /// <summary>
    /// Answer returned by <see cref="NoLongerAvailableAction"/>.
    /// Abort: stop and return an error status. MarkAsMissing: persist a DTO holding only the product id.
    /// Any other value (e.g. None): the original download exception is rethrown.
    /// </summary>
    public enum NoLongerAvailableEnum { None, Abort, MarkAsMissing }

    /// <summary>Returns product id of book which was successfully imported and re-indexed</summary>
    public event EventHandler<string> BookSuccessfullyImported;

    /// <summary>
    /// Hook for handling book no-longer-available. String 1: book title. String 2: book url.
    /// When this is null the download exception is not handled and propagates to the caller.
    /// </summary>
    public Func<string, string, NoLongerAvailableEnum> NoLongerAvailableAction { get; set; }

    /// <summary>A library book is eligible for processing only while its book has no details yet.</summary>
    /// <param name="libraryBook">Library entry to check.</param>
    /// <returns>True when <c>libraryBook.Book.HasBookDetails</c> is false.</returns>
    public override Task<bool> ValidateAsync(LibraryBook libraryBook)
        => Task.FromResult(!libraryBook.Book.HasBookDetails);

    /// <summary>
    /// Obtains the book details DTO from the cheapest available source (cached json, then cached htm,
    /// then a fresh download), persists it as json when it was not loaded from json, indexes it and
    /// raises <see cref="BookSuccessfullyImported"/>.
    /// </summary>
    /// <param name="libraryBook">Library entry whose book details are to be imported.</param>
    /// <returns>
    /// An empty <see cref="StatusHandler"/> on success, or one carrying an error message when
    /// <see cref="NoLongerAvailableAction"/> answers <see cref="NoLongerAvailableEnum.Abort"/>.
    /// </returns>
    /// <exception cref="System.Net.WebException">
    /// The download failed and the failure was not a ConnectionClosed handled by <see cref="NoLongerAvailableAction"/>.
    /// </exception>
    public override async Task<StatusHandler> ProcessItemAsync(LibraryBook libraryBook)
    {
        var productId = libraryBook.Book.AudibleProductId;

        #region // TEST CODE
        //productId = "B0787DGS2T"; // book with only 1 category, no sub category
        //productId = "B002V1OF70"; // mult series, more narrators here than in library
        //productId = "B0032N8Q58"; // abridged
        //productId = "B07GXW7KHG"; // categories in product details block. no narrators
        //productId = "B002ZEEDAW"; // categories above image
        //productId = "B075Y4SWJ8"; // lots of narrators, no 'abridged'
        #endregion

        BookDetailDTO bookDetailDTO;

        // if json file exists, then htm is irrelevant. important b/c in cases of no-longer-available items, json is generated but no htm
        var jsonFileInfo = FileManager.WebpageStorage.GetBookDetailJsonFileInfo(productId);
        if (jsonFileInfo.Exists)
        {
            var serialized = File.ReadAllText(jsonFileInfo.FullName);
            bookDetailDTO = Newtonsoft.Json.JsonConvert.DeserializeObject<BookDetailDTO>(serialized);
        }
        else
        {
            var htmFile = FileManager.WebpageStorage.GetBookDetailHtmFileInfo(productId);
            if (htmFile.Exists)
            {
                // htm exists, json doesn't. load existing htm
                var detailsAudiblePageSource = DataConverter.HtmFile_2_AudiblePageSource(htmFile.FullName);
                bookDetailDTO = AudibleScraper.ScrapeBookDetailsSource(detailsAudiblePageSource);
            }
            else
            {
                // no htm. download and parse
                var url = AudiblePage.Product.GetUrl(productId);
                using var webClient = await GetWebClientAsync($"Getting Book Details for {libraryBook.Book.Title}");
                try
                {
                    var source = await webClient.DownloadStringTaskAsync(url);
                    var detailsAudiblePageSource = new AudiblePageSource(AudiblePageType.ProductDetails, source, productId);

                    // good habit to persist htm before attempting to parse it. this way, if there's a parse error, we can test errors on a local copy
                    DataConverter.AudiblePageSource_2_HtmFile_Product(detailsAudiblePageSource);

                    bookDetailDTO = AudibleScraper.ScrapeBookDetailsSource(detailsAudiblePageSource);
                }
                // cannot continue if NoLongerAvailableAction is null,
                // else we'll be right back here next loop (and infinitely) with no failure condition.
                // any other failure is left unhandled by the filter and propagates unchanged
                catch (System.Net.WebException webEx)
                    when (webEx.Status == System.Net.WebExceptionStatus.ConnectionClosed && NoLongerAvailableAction != null)
                {
                    // url was built from this same product id, so it is reused for the hook
                    switch (NoLongerAvailableAction.Invoke(libraryBook.Book.Title, url))
                    {
                        case NoLongerAvailableEnum.Abort:
                            return new StatusHandler { "Cannot scrape book details. Aborting." };
                        case NoLongerAvailableEnum.MarkAsMissing:
                            // placeholder DTO: gets written to json below so the download is not retried
                            bookDetailDTO = new BookDetailDTO { ProductId = productId };
                            break;
                        default:
                            throw;
                    }
                }
            }

            DataConverter.Value_2_JsonFile(bookDetailDTO, jsonFileInfo.FullName);
        }

        await Indexer.IndexBookDetailsAsync(bookDetailDTO);

        BookSuccessfullyImported?.Invoke(this, productId);

        return new StatusHandler();
    }
}
}