WordpressEbookScraper2/Scraper/Scraper.cs

using System.Diagnostics;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;
using WordpressEboobScraper2.Proc;

namespace WordpressEboobScraper2.Scraper;

/** *************************************************** **/
/**                                                     **/
/**      WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS)      **/
/**                                                     **/
/** *************************************************** **/

public class Scraper
{

	// Book currently being processed. Assigned at the top of each loop iteration in Generate()/Verify();
	// every path property below reads its Foldername, so they must not be used before it is set.
	static EpubParameter ACTIVE_BOOK = null;

	// Upper bound on the number of chapters FindChapters() collects for one book.
	const int LIMIT = 1500;

	// Matches titles of the form "<digits> - <anything>" and captures the leading digits as group "n".
	readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

	// Raw page source keyed by lower-cased URL; persisted to / restored from WCACHE_FILE.
	Dictionary<string, string> webCache = new();

	// Per-book working directory (always ends with a directory separator).
	string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;

	// Final output locations below Config.BASE_DIR_OUT, one sub-folder per artifact type.
	string WCACHE_FILE       => Path.Combine(Config.BASE_DIR_OUT,  @"_cache" , ACTIVE_BOOK.Foldername + @".xml");
	string HTML_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"html"   , ACTIVE_BOOK.Foldername + @".html");
	string EPUB_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"epub"   , ACTIVE_BOOK.Foldername + @".epub");
	string MOBI_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"mobi"   , ACTIVE_BOOK.Foldername + @".mobi");

	// Intermediate artifacts inside the stash folder; copied to the *_OUT locations once complete.
	string HTML_FILE_STASH   => STASH_FOLDER + @"book.html";
	string ZIP_FILE_STASH    => STASH_FOLDER + @"book.zip";
	string EPUB_FILE_STASH   => STASH_FOLDER + @"book.epub";
	string MOBI_FILE_STASH   => STASH_FOLDER + @"book.mobi";

	// Per-chapter dump folders inside the stash folder (each ends with a directory separator).
	string QUERY_FOLDER      => STASH_FOLDER + @"query" + Path.DirectorySeparatorChar;      // full query result
	string HTML_FOLDER       => STASH_FOLDER + @"html" + Path.DirectorySeparatorChar;       // unprocessed chapter code
	string EPUB_FOLDER       => STASH_FOLDER + @"epub" + Path.DirectorySeparatorChar;       // processed epub chapter code

	//----------------------------------------------------------------------------------------------------//

	//----------------------------------------------------------------------------------------------------//

	/// <summary>
	/// Builds every book in <c>Config.BOOKS</c>: prints a banner, prepares the folders,
	/// crawls the chapters and writes the HTML and EPUB output (plus MOBI when
	/// <c>Config.CONVERT_MOBI</c> is set).
	/// </summary>
	public void Generate()
	{
		foreach (var book in Config.BOOKS)
		{
			ACTIVE_BOOK = book;

			// Banner: three blank lines, a framed headline, three blank lines.
			var headline = $" [PROCESSING BOOK]     {book.DisplayStr}     ";
			var frame    = new string('=', headline.Length);

			for (int i = 0; i < 3; i++) "".Dump();
			frame.Dump();
			headline.Dump();
			frame.Dump();
			for (int i = 0; i < 3; i++) "".Dump();

			Init();

			var chapters = FindChapters();

			WriteBookHTML(chapters);
			WriteEpub(chapters);

			if (Config.CONVERT_MOBI)
			{
				GenerateMobi();
			}
		}
	}

	/// <summary>
	/// Compares, for every book in <c>Config.BOOKS</c>, the cached chapter pages with
	/// their live versions. Prints a banner, loads the book's web cache and then runs
	/// <c>VerifyChapters</c>.
	/// </summary>
	public void Verify()
	{
		foreach (var book in Config.BOOKS)
		{
			ACTIVE_BOOK = book;

			// Banner: three blank lines, a framed headline, three blank lines.
			var headline = $" [VERIFYING BOOK]     {book.DisplayStr}     ";
			var frame    = new string('=', headline.Length);

			for (int i = 0; i < 3; i++) "".Dump();
			frame.Dump();
			headline.Dump();
			frame.Dump();
			for (int i = 0; i < 3; i++) "".Dump();

			LoadWebCache();

			VerifyChapters();
		}
	}

	/// <summary>
	/// Prepares the file system for the active book: empties a pre-existing stash folder
	/// (files inside its direct sub-folders plus the four book.* artifacts), (re)creates the
	/// stash and output folders, and loads the web cache when <c>Config.USE_WEBCACHE</c> is set.
	/// </summary>
	void Init()
	{
		if (Directory.Exists(STASH_FOLDER))
		{
			// Only files one level below the stash folder are removed; the folders themselves stay.
			foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
			{
				foreach (var file in Directory.EnumerateFiles(dir).ToList())
				{
					File.Delete(file);
				}
			}

			foreach (var artifact in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
			{
				if (File.Exists(artifact)) File.Delete(artifact);
			}
		}

		Directory.CreateDirectory(STASH_FOLDER);
		Directory.CreateDirectory(QUERY_FOLDER);
		Directory.CreateDirectory(HTML_FOLDER);
		Directory.CreateDirectory(EPUB_FOLDER);

		// Use Path.Combine exactly like WCACHE_FILE / *_FILE_OUT do, so the folders created here are
		// the ones those files are written to even when BASE_DIR_OUT has no trailing separator.
		foreach (var sub in new[] { @"_cache", @"html", @"epub", @"mobi" })
		{
			Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, sub));
		}

		if (Config.USE_WEBCACHE) LoadWebCache();
	}

	/// <summary>
	/// Writes all chapters into one HTML document (an &lt;h1&gt; with the entitized title followed by
	/// the chapter markup), stores it as the stash HTML file and copies it to the output folder.
	/// </summary>
	/// <param name="chapters">Chapters in reading order.</param>
	void WriteBookHTML(List<Chapter> chapters)
	{
		StringBuilder b = new StringBuilder();

		b.AppendLine("<!DOCTYPE html>");
		b.AppendLine("<html>");
		b.AppendLine("<body>");

		foreach (var currChapter in chapters)
		{
			b.AppendLine();
			b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
			b.AppendLine();
			b.AppendLine(currChapter.chapter);
		}

		// Close in reverse order of opening (the tags used to be swapped, producing malformed HTML).
		b.AppendLine("</body>");
		b.AppendLine("</html>");

		File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
		File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
	}

	/// <summary>
	/// Serializes the in-memory web cache to <c>WCACHE_FILE</c> as an XML list of
	/// <c>SerializableCacheEntry</c> (URL plus the page wrapped in a <c>GZippedString</c>).
	/// </summary>
	void SaveCache()
	{
		var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

		using var writer = new StreamWriter(WCACHE_FILE);

		var entries = new List<SerializableCacheEntry>();
		foreach (var pair in webCache)
		{
			entries.Add(new SerializableCacheEntry { URL = pair.Key, Content = new GZippedString { Value = pair.Value } });
		}

		serializer.Serialize(writer, entries);
	}

	/// <summary>
	/// Replaces the in-memory web cache with the content of the active book's cache file.
	/// When the file does not exist the cache is reset to empty, so entries loaded for a
	/// previously processed book are not carried over (and later saved) into this book's cache.
	/// </summary>
	void LoadWebCache()
	{
		if (!File.Exists(WCACHE_FILE))
		{
			webCache = new();
			return;
		}

		XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

		using TextReader reader = new StreamReader(WCACHE_FILE);

		var entries = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);

		// ToDictionary throws on duplicate URLs; SaveCache writes from a dictionary, so keys are unique.
		webCache = entries.ToDictionary(p => p.URL, p => p.Content.Value);
	}

	/// <summary>
	/// Crawls the active book starting at its <c>StartURL</c>, following the "next" link that
	/// <c>ProcessChapter</c> reports, until no further URL is queued or <c>LIMIT</c> chapters were
	/// collected. Pages come from the web cache when present, otherwise they are downloaded and
	/// the cache is saved immediately.
	/// </summary>
	/// <returns>The successfully processed chapters in reading order.</returns>
	List<Chapter> FindChapters()
	{
		var collected = new List<Chapter>();
		var pending   = new Stack<string>();

		using var client = new WebClient();
		client.Encoding = Encoding.UTF8;

		pending.Push(ACTIVE_BOOK.StartURL);

		while (pending.Count > 0 && collected.Count < LIMIT)
		{
			var url      = pending.Pop();
			var cacheKey = url.ToLower();
			var curr     = new Chapter { url = url };

			var buffered = webCache.TryGetValue(cacheKey, out var cachedSource);
			if (buffered)
			{
				curr.queryResult = cachedSource;
				"*(loaded from webcache)*".Dump();
			}
			else
			{
				curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				webCache[cacheKey] = curr.queryResult;
				SaveCache();
			}

			var outcome = ProcessChapter(curr, collected, s => s.Dump(), out var followUp);
			if (followUp != null) pending.Push(followUp);

			// A cached page without a successor may simply be stale (it was the newest chapter when
			// it was cached), so optionally fetch it again and re-run the processing.
			if (buffered && pending.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
			{
				"".Dump();
				"//==> *(auto-reload from live)*".Dump();
				"".Dump();

				curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				webCache[cacheKey] = curr.queryResult;
				SaveCache();

				outcome = ProcessChapter(curr, collected, s => s.Dump(), out var followUpLive);
				if (followUpLive != null) pending.Push(followUpLive);
			}

			switch (outcome)
			{
				case ProcessResult.SuccessNormal:
					"    ==> Chapter processed".Dump();
					collected.Add(curr);
					OutputChapter(curr, collected.Count);
					break;

				case ProcessResult.SkipChapter:
					"    ==> Skip this chapter".Dump();
					break;

				case ProcessResult.ReachedEnd:
					"    ==> End reached".Dump();
					break;
			}

			"".Dump();
		}

		return collected;
	}

	/// <summary>
	/// Walks the cached chapter chain of the active book and, for every cached page, downloads
	/// the live version, processes both and reports differences in process result, queued URL,
	/// next-chapter link, title and (whitespace-normalized) chapter text. For content differences
	/// it additionally dumps links to diff the versions and to store the live page in the cache.
	/// The walk stops at the first URL that is not in the web cache.
	/// </summary>
	void VerifyChapters()
	{
		using WebClient client = new WebClient();

		client.Encoding = Encoding.UTF8;
		Stack<string> buffer = new Stack<string>();
		buffer.Push(ACTIVE_BOOK.StartURL);

		while (buffer.Any())
		{
			var url = buffer.Pop();
			Chapter curr_buffer = new Chapter() { url = url };
			Chapter curr_live   = new Chapter() { url = url };

			// Message prefix; evaluated lazily because the title is only known after ProcessChapter ran.
			string Tag() => $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

			// Only cached pages can be verified.
			if (!webCache.TryGetValue(url.ToLower(), out var cachedSource)) continue;

			try
			{
				curr_buffer.queryResult = cachedSource;
				curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
			}
			catch (Exception e)
			{
				$"{Tag()} Live reload resulted in exception: {e.Message}".Dump();
				continue;
			}

			var is_diff = false;

			var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
			var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);

			// The walk always follows the cached version's link.
			if (next_buffer != null) buffer.Push(next_buffer);

			if (r_buffer != r_live) { $"{Tag()} Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }

			// Compare the queued URLs themselves (this check used to repeat the process-result
			// condition above and therefore never detected a differing URL on its own).
			if (next_buffer != next_live) { $"{Tag()} Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }

			if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"{Tag()} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
			if (curr_buffer.title != curr_live.title) { $"{Tag()} Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }

			if (curr_buffer.chapter.Value != curr_live.chapter.Value)
			{
				// Markup differs - only report when the visible text differs as well.
				var clean_buffer = GetChapterText(curr_buffer);
				var clean_live   = GetChapterText(curr_live);

				if (clean_buffer.Trim() != clean_live.Trim())
				{
					$"{Tag()} Different content: ".Dump();
					new Hyperlinq(() =>
					{

						var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
						var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
						File.WriteAllText(fa, curr_buffer.chapter.Value);
						File.WriteAllText(fb, curr_live.chapter.Value);
						Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");

					}, "[Compare Raw]").Dump();
					new Hyperlinq(() =>
					{

						var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
						var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
						File.WriteAllText(fa, clean_buffer);
						File.WriteAllText(fb, clean_live);
						Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");

					}, "[Compare Text]").Dump();
					new Hyperlinq(() =>
					{

						webCache[url.ToLower()] = curr_live.queryResult;
						SaveCache();

					}, "[Save new version to webcache]").Dump();

					is_diff = true;
				}
			}

			if (!is_diff) $"{Tag()} OK - No differences".Dump();

			if (is_diff) "".Dump();
		}
	}

	/// <summary>
	/// Compares two URLs while ignoring a leading "https://" or "http://" scheme.
	/// </summary>
	/// <param name="a">First URL; may be null.</param>
	/// <param name="b">Second URL; may be null.</param>
	/// <returns>
	/// True when both are equal (including both null) or differ only in the scheme prefix;
	/// false when exactly one of them is null.
	/// </returns>
	bool Relaxedurleq(string a, string b)
	{
		if (a == b) return true;

		// Exactly one side is missing (e.g. only one version has a next link) - not equal.
		if (a == null || b == null) return false;

		return StripScheme(a) == StripScheme(b);

		static string StripScheme(string u)
		{
			if (u.StartsWith("https://", StringComparison.Ordinal)) u = u.Substring("https://".Length);
			if (u.StartsWith("http://", StringComparison.Ordinal))  u = u.Substring("http://".Length);
			return u;
		}
	}

	/// <summary>
	/// Converts the chapter markup to plain text via <c>HTMLToText.ConvertHtml</c>, trims it and
	/// collapses every whitespace run into a single space.
	/// </summary>
	/// <param name="c">Chapter whose <c>chapter.Value</c> holds the markup.</param>
	/// <returns>The normalized text, or an empty string when the markup is null or blank.</returns>
	string GetChapterText(Chapter c)
	{
		var markup = c.chapter.Value;
		if (string.IsNullOrWhiteSpace(markup))
		{
			return string.Empty;
		}

		var text = HTMLToText.ConvertHtml(markup).Trim();

		return Regex.Replace(text, @"\s+", " ");
	}

	/// <summary>
	/// Parses the downloaded page in <c>curr.queryResult</c> and fills <c>curr</c>: title, source
	/// snapshot, prologue/epilogue/bonus flags, next-chapter URL and the cleaned chapter markup
	/// (navigation markers, share/meta/ad divs, duplicated title nodes, leading/trailing
	/// &lt;hr&gt; elements and author notes are removed).
	/// </summary>
	/// <param name="curr">Chapter to fill; <c>url</c> and <c>queryResult</c> must be set.</param>
	/// <param name="backBuffer">Chapters accepted so far; used for loop, epilogue and book-jump detection.</param>
	/// <param name="prt">Sink for progress messages.</param>
	/// <param name="forwardQueueNext">URL to visit next, or null when none was found or it was already visited.</param>
	/// <returns>
	/// <c>ReachedEnd</c> when the book loops back to its first title, follows an epilogue or jumps to
	/// another numbered book; <c>SkipChapter</c> for titles starting with "not a chapter - ";
	/// otherwise <c>SuccessNormal</c>.
	/// </returns>
	/// <exception cref="InvalidOperationException">The page has no recognizable content, chapter or title node.</exception>
	ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
	{
		forwardQueueNext = null;

		HtmlDocument doc = new HtmlDocument();
		doc.LoadHtml(curr.queryResult);

		// NOTE(review): in XPath an expression starting with "//" selects from the document root even
		// when it is evaluated on a sub-node, so the nodeContent/nodeChapter lookups below are
		// effectively document-wide - confirm this is intended before tightening them.

		#region Base

		var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
		if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

		// Fail with a message that names the page instead of a bare NullReferenceException further down.
		if (nodeContent == null) throw new InvalidOperationException($"No content node found in page '{curr.url}'");

		var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
		if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
		if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
		if (nodeNav == null) nodeNav = nodeContent;

		var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
		if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

		if (nodeChapter == null) throw new InvalidOperationException($"No chapter node found in page '{curr.url}'");

		#endregion

		#region Title

		var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
		if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
		if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
		if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
		if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

		if (titleNode == null) throw new InvalidOperationException($"No title node found in page '{curr.url}'");

		curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

		// All spellings under which this chapter's title may re-appear inside the chapter body.
		var titles = new List<string>();
		titles.Add(curr.title);

		if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
		{
			// The header only says "Chapter N..." (or nothing) - look for a better title in the body.
			var baseTitle  = curr.title;
			var titleLower = curr.title.ToLower();

			var suffix = Helper.TitleFmt(Regex.Match(titleLower, @"^chapter [0-9]+(.*)$").Groups[1].Value);

			var numMatch = Regex.Match(titleLower, @"^(chapter) ([0-9]+)");

			if (!numMatch.Success)
			{
				// Blank header title: there is no chapter number to search for (int.Parse on the
				// empty capture used to throw a FormatException here).
				prt(" [!!]  Warning cannot parse title");
			}
			else
			{
				var prefix1 = numMatch.Groups[0].Value;                          // as written, e.g. "chapter 007"
				var prefix2 = "chapter " + int.Parse(numMatch.Groups[2].Value);  // normalized, e.g. "chapter 7"

				titles.Add(prefix1);
				titles.Add(prefix2);

				var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
				var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
				var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
				var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));

				// Takes the text after the prefix as the new title, registers its spellings and
				// optionally removes the node it came from.
				void AdoptAltTitle(HtmlNode source, string prefix, bool removeNode)
				{
					var newtitle = Helper.TitleFmt(source.InnerText.Trim().Substring(prefix.Length));
					titles.Add(newtitle);
					curr.title = newtitle;
					titles.Add(prefix1 + newtitle);
					titles.Add(prefix2 + newtitle);
					titles.Add(prefix1 + " - " + newtitle);
					titles.Add(prefix2 + " - " + newtitle);

					if (removeNode)
					{
						source.Remove();
						prt("    > title node removed");
					}
				}

				if (altTitleNode1 != null)
				{
					AdoptAltTitle(altTitleNode1, prefix1, false);
				}
				else if (altTitleNode2 != null)
				{
					AdoptAltTitle(altTitleNode2, prefix2, false);
				}
				else if (altTitleNode3 != null)
				{
					AdoptAltTitle(altTitleNode3, prefix1, true);
				}
				else if (altTitleNode4 != null)
				{
					AdoptAltTitle(altTitleNode4, prefix2, true);
				}
				else if (suffix.Length > 2)
				{
					curr.title = suffix;
					titles.Add(suffix);
				}
				else
				{
					prt(" [!!]  Warning cannot parse title");
				}
			}

			// A header that already carries text after "Chapter N" wins over anything found above.
			if (suffix.Length > 2)
			{
				curr.title = baseTitle;
				titles.Add(baseTitle);
			}
		}

		// Strip a leading book name (plus separator characters) from the title.
		if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
			var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
			while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
			tit_alt = tit_alt.Trim();
			if (tit_alt.Length>2) curr.title = tit_alt;
		}

		#endregion

		curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

		if (backBuffer.Any() && backBuffer.First().title == curr.title)
		{
			prt("[!] Book loop found - skipping entry");
			return ProcessResult.ReachedEnd; // prevent book II loop
		}

		curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
		curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
		curr.isBonus    = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

		// For this book only "Epilogue II" ends it. Compared case-insensitively: the previous
		// comparison of a lower-cased title against "epilogue II" could never be true.
		if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => string.Equals(t, "epilogue II", StringComparison.OrdinalIgnoreCase));

		if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
		{
			prt("[!] Epilogue found - skipping entry");
			return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
		}

		prt(curr.title + "    (" + curr.url + ")");

		#region Next

		// Book jump: every regular chapter so far starts with the same "<n> - " number and the
		// current title starts with a different one.
		var regular = backBuffer.Where(b => !b.isSpecial).ToList();

		if (regular.Count > 4 &&
		    regular.Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
		    REX_NUMSTART.Match(regular[0].title).Success &&
		    REX_NUMSTART.Match(curr.title).Success &&
		    REX_NUMSTART.Match(regular[0].title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
		{
			prt("[!] Book jump found - skipping entry");
			return ProcessResult.ReachedEnd;
		}

		// Try progressively looser ways of finding the "next chapter" link.
		var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

		if (next == null)
			next = nodeContent.Descendants()
					  .Where(p => p.Name.ToLower() == "a")
					  .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
					  .Where(p => p.Attributes.Contains("href"))
					  .FirstOrDefault();

		if (next == null)
			next = nodeNav.Descendants()
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
				.FirstOrDefault();

		if (next == null)
			next = Helper.RecursiveDescendants(nodeContent)
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
				.Where(p => p.Attributes.Contains("href"))
				.FirstOrDefault();

		if (next == null)
			next = Helper.RecursiveDescendants(nodeContent)
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
				.FirstOrDefault();

		if (next != null)
		{
			var next_url = next.Attributes["href"].Value.Trim();

			if (next_url == "." || next_url == "/" || next_url == "./")
			{
				// Placeholder link - treat as "no next chapter".
				next=null;
			}
			else
			{
				if (next_url.StartsWith("//")) next_url = "http:" + next_url;

				if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);

				if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

				curr.next = next_url;

				// Only queue URLs that were not visited yet.
				if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
				{
					forwardQueueNext = next_url;
				}
			}

		}

		if (next == null) prt("    > (!) No next URL found");

		#endregion

		#region Chapter marker

		// Letters-and-digits-only forms (see NakedIdentity) of typical navigation link texts.
		var cpMarkerIdentities = new List<string>
		{
			"previousnext", "previouschapternextchapter",
			"firstnext", "firstchapternextchapter",
			"firstchapter", "previouslast",

			"previouschapterlastchapter",

			"previouschapter", "nextchapter", "lastchapter",

			"first", "previous", "next", "last"
		};

		foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > Chapter marker removed");
		}

		foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Contains(NakedIdentity(p))).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > Chapter marker removed");
		}

		foreach (var markerXPath in new[] { "//a", "//p" })
		{
			var candidates = nodeChapter.SelectNodes(markerXPath);
			if (candidates == null) continue;

			foreach (var node in candidates.Where(p => cpMarkerIdentities.Contains(NakedIdentity(p))).ToList())
			{
				node.Remove();
				prt("    > Chapter marker removed");
			}
		}

		#endregion

		// Removes those of the given nodes that are direct children of the chapter node and
		// reports the others as not removable. 'what' is the label used in the messages.
		void RemoveDirectChildren(IEnumerable<HtmlNode> nodes, string what)
		{
			if (nodes == null) return;

			foreach (var node in nodes)
			{
				if (nodeChapter.ChildNodes.Contains(node))
				{
					nodeChapter.RemoveChild(node);
					prt("    > " + what + " removed");
				}
				else
				{
					prt("    > " + what + " cannot be removed - skipping");
				}
			}
		}

		#region Share Div

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

		#endregion

		#region Meta Div

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

		#endregion

		#region Ad Blocking

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

		#endregion

		#region Title Paragraphs

		// First paragraph that merely repeats the title.
		var titleNodes1 = nodeChapter.SelectNodes(@"p");
		if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == Helper.TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
		{
			nodeChapter.RemoveChild(titleNodes1.First());
			prt("    > title node removed");
		}

		// Headings h1..h5 that repeat the title.
		for (int hval = 1; hval <= 5; hval++)
		{
			var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
			if (titleNodes2 != null)
			{
				foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower())))
				{
					if (nodeChapter.ChildNodes.Contains(node))
					{
						nodeChapter.RemoveChild(node);
						prt("    > title node removed");
					}
				}
			}
		}

		// Inline elements whose text could be the title.
		foreach (var titleXPath in new[] { @"//u", @"//span", @"//strong" })
		{
			var candidates = nodeChapter.SelectNodes(titleXPath);
			if (candidates == null) continue;

			foreach (var node in candidates.Where(n => titles.Any(t => CouldBeTitle(n, t))))
			{
				node.Remove();
				prt("    > title node removed");
			}
		}

		#endregion

		#region Remove <hr>'s

		// FirstOrDefault/LastOrDefault: a chapter without any element child must not throw.
		while (nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element) is { } firstElement && firstElement.Name.ToLower() == "hr")
		{
			nodeChapter.RemoveChild(firstElement);
			prt("    > header hr removed");
		}

		while (nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element) is { } lastElement && lastElement.Name.ToLower() == "hr")
		{
			nodeChapter.RemoveChild(lastElement);
			prt("    > footer hr removed");
		}

		#endregion

		#region Other (Author's Node)

		foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > authors note removed");
		}

		#endregion

		var chap_html = nodeChapter.InnerHtml.Trim();

		#region Fix raw <hr>
		// KOReader doesn't like <hr>

		chap_html = chap_html.Replace("<hr>", "<hr/>");

		#endregion

		curr.chapter = chap_html;


		if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

		return ProcessResult.SuccessNormal;
	}

	/// <summary>
	/// Dumps one chapter into the stash: the raw query result, the extracted source snapshot and
	/// a standalone HTML page (title heading plus cleaned chapter markup) for the epub folder.
	/// </summary>
	/// <param name="curr">Processed chapter.</param>
	/// <param name="index">One-based chapter number used as file name prefix.</param>
	void OutputChapter(Chapter curr, int index)
	{
		var number = string.Format("{0:000}", index);

		var queryFile = QUERY_FOLDER + number + "_" + Helper.Filenamify(curr.title) + ".html";
		File.WriteAllText(queryFile, curr.queryResult);

		var htmlFile = HTML_FOLDER + number + "_" + Helper.Filenamify(curr.title) + ".html";
		File.WriteAllText(htmlFile, curr.sourcecode, Encoding.UTF8);

		var page = new StringBuilder();
		page.AppendLine("<!DOCTYPE html>");
		page.AppendLine("<html>");
		page.AppendLine("<body>");
		page.AppendLine();
		page.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
		page.AppendLine();
		page.AppendLine(curr.chapter);
		page.AppendLine("</body>");
		page.AppendLine("</html>");

		// Here the complete file name (number, title and extension) is passed through Filenamify.
		var epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
		File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
	}

	/// <summary>
	/// Reduces a node's inner text to a comparison key: lower-cased, with the entities
	/// &amp;gt; &amp;lt; &amp;amp; &amp;quot; &amp;nbsp; removed and every character that is not a
	/// letter or digit dropped.
	/// </summary>
	/// <param name="raw">Node whose <c>InnerText</c> is reduced.</param>
	/// <returns>The lower-case letters-and-digits-only text.</returns>
	string NakedIdentity(HtmlNode raw)
	{
		var text = raw.InnerText.ToLower();

		foreach (var entity in new[] { "&gt;", "&lt;", "&amp;", "&quot;", "&nbsp;" })
		{
			text = text.Replace(entity, "");
		}

		var kept = new StringBuilder();
		foreach (var ch in text)
		{
			if (char.IsLetterOrDigit(ch))
			{
				kept.Append(char.ToLower(ch));
			}
		}

		return kept.ToString().Trim().ToLower();
	}

	/// <summary>
	/// Checks whether a node's text equals the given title after both were passed through
	/// <c>Helper.Striptease</c>, lower-cased, stripped of ':', '-', '(' and ')' and had every run
	/// of two or more whitespace characters removed.
	/// </summary>
	/// <param name="n">Candidate node.</param>
	/// <param name="title">Title spelling to compare against.</param>
	/// <returns>True when the normalized texts are identical.</returns>
	bool CouldBeTitle(HtmlNode n, string title)
	{
		static string Normalize(string stripped)
		{
			var s = stripped.ToLower();
			s = s.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", "");
			return Regex.Replace(s, @"\s\s+", "");
		}

		var nodeText  = Helper.Striptease(n);
		var titleText = Helper.Striptease(title);

		return Normalize(nodeText) == Normalize(titleText);
	}

	/// <summary>
	/// Builds the EPUB container for the active book: writes mimetype, container.xml,
	/// content.opf, toc.ncx and one HTML file per chapter into the stash zip, then copies the zip
	/// to the stash .epub and from there to the output folder.
	/// </summary>
	/// <param name="chapters">Chapters in reading order.</param>
	void WriteEpub(List<Chapter> chapters)
	{
		foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
		{
			if (File.Exists(stale)) File.Delete(stale);
		}

		Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

		// Scoped block: both streams must be disposed before the zip is copied below.
		{
			using var fs      = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite);
			using var zipbook = new ZipOutputStream(fs);

			WritePubString(zipbook, @"mimetype", GetEpubMimetype());
			WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
			WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
			WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

			for (int idx = 0; idx < chapters.Count; idx++)
			{
				var chapter   = chapters[idx];
				var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", idx + 1, Helper.Filenamify(chapter.title, true));

				WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, idx));
			}
		}

		File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);

		File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
	}

	/// <summary>
	/// Converts the stash EPUB to MOBI by running the external <c>ebook-convert</c> tool and copies
	/// the result to the output folder.
	/// </summary>
	/// <exception cref="Exception">The tool exited with a non-zero code; the message holds the exit code and its combined output.</exception>
	void GenerateMobi()
	{
		if (File.Exists(MOBI_FILE_STASH))
		{
			File.Delete(MOBI_FILE_STASH);
		}

		"Running ebook-convert for MOBI output".Dump();

		var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\"  --max-toc-links=0  --toc-threshold=9999";
		var pout = ProcessHelper.ProcExecute("ebook-convert", arguments);

		$"ebook-convert returned: {pout.ExitCode}".Dump();

		if (pout.ExitCode != 0)
		{
			throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);
		}

		File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
	}

	/// <summary>
	/// Adds a text entry to the zip stream, stored without compression.
	/// </summary>
	/// <param name="z">Target zip stream.</param>
	/// <param name="n">Entry name inside the archive.</param>
	/// <param name="c">Text content of the entry.</param>
	/// <param name="e">Encoding used for the content; UTF-8 when null.</param>
	void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
	{
		e ??= Encoding.UTF8;

		var entry = z.PutNextEntry(n);
		entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

		var payload = e.GetBytes(c);
		z.Write(payload, 0, payload.Length);
	}

	/// <summary>Content of the EPUB "mimetype" entry.</summary>
	string GetEpubMimetype() => "application/epub+zip";

	/// <summary>
	/// Builds META-INF/container.xml, which points to OEBPS/content.opf as the package root file.
	/// </summary>
	/// <returns>The trimmed XML text with an upper-case "UTF-8" encoding declaration, terminated by CRLF.</returns>
	string GetEpubContainerXML()
	{
		XNamespace ns = "urn:oasis:names:tc:opendocument:xmlns:container";

		var rootfile = new XElement(ns + "rootfile",
			new XAttribute("full-path", "OEBPS/content.opf"),
			new XAttribute("media-type", "application/oebps-package+xml"));

		var container = new XElement(ns + "container",
			new XAttribute("version", "1.0"),
			new XElement(ns + "rootfiles", rootfile));

		var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), container);

		using Utf8StringWriter writer = new Utf8StringWriter();
		doc.Save(writer);

		var xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");

		return xml.Trim() + "\r\n";
	}

	/// <summary>
	/// Builds the OPF 2.0 package document for the active book: Dublin Core metadata,
	/// a manifest with one XHTML item per chapter plus the NCX, the spine in chapter
	/// order, and an empty guide element.
	/// </summary>
	/// <param name="chapters">Chapters in reading order; the position (1-based) and the filenamified title form each item's href and id.</param>
	/// <returns>The serialized package document.</returns>
	string GetEpubContentOPF(List<Chapter> chapters)
	{
		XNamespace dc = "http://purl.org/dc/elements/1.1/";
		XNamespace opf = "http://www.idpf.org/2007/opf";

		// Dates are machine-readable metadata: format them with the invariant culture so a
		// non-Gregorian default calendar on the host cannot change the emitted year.
		var invariant = System.Globalization.CultureInfo.InvariantCulture;
		const string dateFormat = "yyyy'-'MM'-'dd";

		// Sample the clock once so every "today" value in the document agrees
		string today = DateTime.Now.ToString(dateFormat, invariant);

		// Manifest ids and spine idrefs must match exactly, so both come from this one helper
		string ItemId(int i) => string.Format("x{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true));

		var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

		var package = new XElement(opf + "package",
						new XAttribute("unique-identifier", "BookId"),
						new XAttribute("version", "2.0"));

		doc.Add(package);

		var meta = new XElement(opf + "metadata",
						new XAttribute(XNamespace.Xmlns + "dc", dc),
						new XAttribute(XNamespace.Xmlns + "opf", opf),
						new XElement(dc + "title", ACTIVE_BOOK.Title),
						new XElement(dc + "creator", ACTIVE_BOOK.Author),
						new XElement(dc + "identifier",
							new XAttribute("id", "BookId"),
							new XAttribute(opf + "scheme", "UUID"),
							"urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "publication"),
							ACTIVE_BOOK.Release.ToString(dateFormat, invariant)),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "modification"),
							today),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "creation"),
							today),
						new XElement(dc + "language", ACTIVE_BOOK.Language),
						new XElement(dc + "identifier",
							new XAttribute(opf + "scheme", "UUID"),
							ACTIVE_BOOK.ID_CAL.ToString("D")),
						new XElement(opf + "meta",
							new XAttribute("content", "1.0"),
							new XAttribute("name", "Wordpress_eBook_scraper_version")),
						new XElement(opf + "meta",
							new XAttribute("content", today),
							new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

		// Series metadata is only written when the book belongs to a series
		if (ACTIVE_BOOK.Series != null)
		{
			meta.Add(new XElement(opf + "meta",
							new XAttribute("content", ACTIVE_BOOK.Series),
							new XAttribute("name", "calibre:series")));
			meta.Add(new XElement(opf + "meta",
							new XAttribute("content", string.Format(invariant, "{0}.0", ACTIVE_BOOK.SeriesIndex)),
							new XAttribute("name", "calibre:series_index")));
		}

		package.Add(meta);

		var manifest = new XElement(opf + "manifest");
		for (int i = 0; i < chapters.Count; i++)
		{
			manifest.Add(new XElement(opf + "item",
				new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)))),
				new XAttribute("id", ItemId(i)),
				new XAttribute("media-type", "application/xhtml+xml")));
		}
		manifest.Add(new XElement(opf + "item",
			new XAttribute("href", "toc.ncx"),
			new XAttribute("id", "ncx"),
			new XAttribute("media-type", "application/x-dtbncx+xml")));

		package.Add(manifest);

		var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
		for (int i = 0; i < chapters.Count; i++)
		{
			spine.Add(new XElement(opf + "itemref",
						new XAttribute("idref", ItemId(i))));
		}

		package.Add(spine);

		package.Add(new XElement(opf + "guide"));

		using Utf8StringWriter writer = new Utf8StringWriter();

		doc.Save(writer);
		return writer.ToString();
	}

	/// <summary>
	/// Builds the NCX table of contents for the active book: a head section with the
	/// book's uid and depth/page-count metadata, a docTitle, and a flat navMap with one
	/// navPoint per chapter.
	/// </summary>
	/// <param name="chapters">Chapters in reading order; the position (1-based) becomes the navPoint id and playOrder.</param>
	/// <returns>The serialized NCX document.</returns>
	string GetEpubTOC(List<Chapter> chapters)
	{
		// NCX elements belong to the DAISY NCX namespace (matching the DTD declared below),
		// not to the OPF package namespace.
		XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

		var doc = new XDocument(
						new XDeclaration("1.0", "UTF-8", null),
						new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

		var root = new XElement(ncx + "ncx",
						new XAttribute("version", "2005-1"),
						new XElement(ncx + "head",
							new XElement(ncx + "meta",
								new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
								new XAttribute("name", "dtb:uid")),
							new XElement(ncx + "meta",
								new XAttribute("content", 1),
								new XAttribute("name", "dtb:depth")),
							new XElement(ncx + "meta",
								new XAttribute("content", 0),
								new XAttribute("name", "dtb:totalPageCount")),
							new XElement(ncx + "meta",
								new XAttribute("content", 0),
								new XAttribute("name", "dtb:maxPageNumber"))));

		doc.Add(root);

		// NOTE(review): docTitle is hard-coded to "Unknown"; ACTIVE_BOOK.Title may be the intended value - confirm
		root.Add(new XElement(ncx + "docTitle",
					new XElement(ncx + "text", "Unknown")));

		var nav = new XElement(ncx + "navMap");
		for (int i = 0; i < chapters.Count; i++)
		{
			// src is a URI reference; escape the file name the same way the OPF manifest
			// href is escaped so both documents address the chapter file identically.
			string src = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)));

			nav.Add(new XElement(ncx + "navPoint",
				new XAttribute("id", "navPoint-" + (i + 1)),
				new XAttribute("playOrder", i + 1),
				new XElement(ncx + "navLabel",
					new XElement(ncx + "text", chapters[i].title)),
				new XElement(ncx + "content",
					new XAttribute("src", src))));
		}

		root.Add(nav);

		using Utf8StringWriter writer = new Utf8StringWriter();

		doc.Save(writer);
		return writer.ToString();
	}

	// Wraps one chapter in a standalone XHTML 1.1 page.
	// The entitized chapter title is used for both <title> and the <h1> heading;
	// chapter.chapter is appended as-is, without any escaping.
	// `idx` is not used by this method.
	string GetEpubChapterFile(Chapter chapter, int idx)
	{
		string safeTitle = HtmlEntity.Entitize(chapter.title);

		var page = new StringBuilder()
			.AppendLine("<?xml version=\"1.0\" encoding=\"utf-8\"?>")
			.AppendLine("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\" > ")
			.AppendLine("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
			.AppendLine("<head>")
			.AppendLine("<title>" + safeTitle + "</title>")
			.AppendLine("</head>")
			.AppendLine("<body>")
			.AppendLine("<h1>" + safeTitle + "</h1>")
			.AppendLine(chapter.chapter)
			.AppendLine("</body>")
			.AppendLine("</html>");

		return page.ToString();
	}
}