WordpressEbookScraper2/Scraper/Scraper.cs

using System.Diagnostics;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;

namespace WordpressEboobScraper2.Scraper;

/** *************************************************** **/
/**                                                     **/
/**      WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS)      **/
/**                                                     **/
/** *************************************************** **/

class Scraper
{
		
	// Book currently being processed. Generate() and Verify() assign it before any per-book work;
	// every path property below is derived from its Foldername.
	static EpubParameter ACTIVE_BOOK = null;

	// Upper bound on the number of collected chapters checked by the crawl loops.
	const int LIMIT = 1500;

	// Matches titles of the form "<digits> - ..." and captures the leading digits as group "n".
	readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

	// Raw page source keyed by lower-cased URL; persisted to / restored from WCACHE_FILE.
	Dictionary<string, string> webCache = new Dictionary<string, string>();

	// Per-book working directory (always ends with a directory separator).
	string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;

	// Final output locations below Config.BASE_DIR_OUT, one file per book and format.
	string WCACHE_FILE       => Path.Combine(Config.BASE_DIR_OUT,  @"_cache" , ACTIVE_BOOK.Foldername + @".xml");
	string HTML_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"html"   , ACTIVE_BOOK.Foldername + @".html");
	string EPUB_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"epub"   , ACTIVE_BOOK.Foldername + @".epub");
	string MOBI_FILE_OUT     => Path.Combine(Config.BASE_DIR_OUT,  @"mobi"   , ACTIVE_BOOK.Foldername + @".mobi");

	// Intermediate build artifacts inside the stash folder; copied to the *_OUT paths when finished.
	string HTML_FILE_STASH   => STASH_FOLDER + @"book.html";
	string ZIP_FILE_STASH    => STASH_FOLDER + @"book.zip";
	string EPUB_FILE_STASH   => STASH_FOLDER + @"book.epub";
	string MOBI_FILE_STASH   => STASH_FOLDER + @"book.mobi";

	// Per-chapter dumps written by OutputChapter().
	string QUERY_FOLDER      => STASH_FOLDER + @"query" + Path.DirectorySeparatorChar;      // full query result
	string HTML_FOLDER       => STASH_FOLDER + @"html" + Path.DirectorySeparatorChar;       // unprocessed chapter code
	string EPUB_FOLDER       => STASH_FOLDER + @"epub" + Path.DirectorySeparatorChar;       // processed epub chapter code

	//----------------------------------------------------------------------------------------------------//

	//----------------------------------------------------------------------------------------------------//

	/// <summary>
	/// Scrapes every book in <c>Config.BOOKS</c>: prepares the working folders, collects the
	/// chapters, then writes the combined HTML and the EPUB (and the MOBI when
	/// <c>Config.CONVERT_MOBI</c> is set).
	/// </summary>
	public void Generate()
	{
		foreach (var book in Config.BOOKS)
		{
			ACTIVE_BOOK = book;

			// Banner: three blank lines, the headline framed by '=' rules, three blank lines.
			var headline = $" [PROCESSING BOOK]     {book.DisplayStr}     ";
			var rule = new string('=', headline.Length);

			for (var i = 0; i < 3; i++) "".Dump();
			rule.Dump();
			headline.Dump();
			rule.Dump();
			for (var i = 0; i < 3; i++) "".Dump();

			Init();

			var chapters = FindChapters();

			WriteBookHTML(chapters);
			WriteEpub(chapters);

			if (Config.CONVERT_MOBI)
			{
				GenerateMobi();
			}
		}
	}

	/// <summary>
	/// For every book in <c>Config.BOOKS</c>, loads the book's web cache and compares the cached
	/// chapters against the live pages (see <c>VerifyChapters</c>).
	/// </summary>
	public void Verify()
	{
		foreach (var book in Config.BOOKS)
		{
			ACTIVE_BOOK = book;

			// Banner: three blank lines, the headline framed by '=' rules, three blank lines.
			var headline = $" [VERIFYING BOOK]     {book.DisplayStr}     ";
			var rule = new string('=', headline.Length);

			for (var i = 0; i < 3; i++) "".Dump();
			rule.Dump();
			headline.Dump();
			rule.Dump();
			for (var i = 0; i < 3; i++) "".Dump();

			LoadWebCache();

			VerifyChapters();
		}
	}

	/// <summary>
	/// Prepares the file system for the active book: empties an existing stash folder (files in
	/// its direct sub-folders plus the four book.* artifacts), creates all stash and output
	/// directories, and loads the web cache when <c>Config.USE_WEBCACHE</c> is enabled.
	/// </summary>
	void Init()
	{
		if (Directory.Exists(STASH_FOLDER))
		{
			// Only files one level down are deleted; the sub-folders themselves are kept.
			foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
			{
				foreach (var file in Directory.EnumerateFiles(dir).ToList())
				{
					File.Delete(file);
				}
			}

			foreach (var artifact in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
			{
				if (File.Exists(artifact)) File.Delete(artifact);
			}
		}

		foreach (var folder in new[] { STASH_FOLDER, QUERY_FOLDER, HTML_FOLDER, EPUB_FOLDER })
		{
			Directory.CreateDirectory(folder);
		}

		foreach (var sub in new[] { @"_cache", @"html", @"epub", @"mobi" })
		{
			Directory.CreateDirectory(Config.BASE_DIR_OUT + sub + Path.DirectorySeparatorChar);
		}

		if (Config.USE_WEBCACHE)
		{
			LoadWebCache();
		}
	}

	/// <summary>
	/// Writes all chapters into one HTML document (an &lt;h1&gt; with the entitized title followed
	/// by the chapter markup) to the stash, then copies it to the output folder, overwriting any
	/// previous version.
	/// </summary>
	/// <param name="chapters">Chapters in reading order.</param>
	void WriteBookHTML(List<Chapter> chapters)
	{
		StringBuilder b = new StringBuilder();

		b.AppendLine("<!DOCTYPE html>");
		b.AppendLine("<html>");
		b.AppendLine("<body>");

		foreach (var currChapter in chapters)
		{
			b.AppendLine();
			b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
			b.AppendLine();
			b.AppendLine(currChapter.chapter);
		}

		// Close in reverse order of opening; the previous version emitted </html> before </body>.
		b.AppendLine("</body>");
		b.AppendLine("</html>");

		File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
		File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
	}

	/// <summary>
	/// Serializes the in-memory web cache as XML to <c>WCACHE_FILE</c>, one
	/// <c>SerializableCacheEntry</c> per cached URL.
	/// </summary>
	void SaveCache()
	{
		var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

		using (var writer = new StreamWriter(WCACHE_FILE))
		{
			var entries = webCache
				.Select(pair => new SerializableCacheEntry
				{
					URL = pair.Key,
					Content = new GZippedString { Value = pair.Value },
				})
				.ToList();

			serializer.Serialize(writer, entries);
		}
	}

	/// <summary>
	/// Replaces the in-memory web cache with the content of the active book's cache file.
	/// When the book has no cache file yet, the cache is reset to empty so that entries loaded
	/// for a previously processed book do not leak into this book's cache (and, through
	/// <c>SaveCache</c>, into its cache file).
	/// </summary>
	void LoadWebCache()
	{
		if (!File.Exists(WCACHE_FILE))
		{
			webCache = new Dictionary<string, string>();
			return;
		}

		XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
		using (TextReader reader = new StreamReader(WCACHE_FILE))
		{
			// Deserialize may yield null; treat that as an empty cache.
			var entries = (List<SerializableCacheEntry>)deserializer.Deserialize(reader)
			              ?? new List<SerializableCacheEntry>();

			webCache = entries.ToDictionary(p => p.URL, p => p.Content.Value);
		}
	}

	/// <summary>
	/// Walks the book from <c>ACTIVE_BOOK.StartURL</c>, following the "next" URL reported by
	/// <c>ProcessChapter</c>, until no further URL is queued or <c>LIMIT</c> chapters have been
	/// collected. Page source comes from the web cache when present, otherwise it is downloaded
	/// and the cache is saved. Every successfully processed chapter is also written to the stash.
	/// </summary>
	/// <returns>The collected chapters in reading order.</returns>
	List<Chapter> FindChapters()
	{
		var chapters = new List<Chapter>();

		using (var client = new WebClient())
		{
			client.Encoding = Encoding.UTF8;

			var pending = new Stack<string>();
			pending.Push(ACTIVE_BOOK.StartURL);

			while (pending.Any() && chapters.Count < LIMIT)
			{
				var url = pending.Pop();
				var cacheKey = url.ToLower();
				var curr = new Chapter() { url = url };

				var buffered = webCache.TryGetValue(cacheKey, out var cachedSource);
				if (buffered)
				{
					curr.queryResult = cachedSource;
					"*(loaded from webcache)*".Dump();
				}
				else
				{
					curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
					webCache[cacheKey] = curr.queryResult;
					SaveCache();
				}

				var outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var next_url);
				if (next_url != null) pending.Push(next_url);

				// A cached page that yields no follow-up URL may be stale (the next chapter could
				// have been published since), so it is optionally re-fetched and re-processed.
				var cachedDeadEnd = buffered && pending.Count == 0;
				if (cachedDeadEnd && Config.DO_LIVE_RELOAD_OF_LAST)
				{
					"".Dump();
					"//==> *(auto-reload from live)*".Dump();
					"".Dump();

					curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
					webCache[cacheKey] = curr.queryResult;
					SaveCache();

					outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var next_url_inner);
					if (next_url_inner != null) pending.Push(next_url_inner);
				}

				if (outcome == ProcessResult.SuccessNormal)
				{
					"    ==> Chapter processed".Dump();
					chapters.Add(curr);
					OutputChapter(curr, chapters.Count);
				}
				else if (outcome == ProcessResult.SkipChapter)
				{
					"    ==> Skip this chapter".Dump();
				}
				else if (outcome == ProcessResult.ReachedEnd)
				{
					"    ==> End reached".Dump();
				}

				"".Dump();
			}
		}

		return chapters;
	}

	/// <summary>
	/// Compares every cached chapter of the active book with its live version. Starting at
	/// <c>ACTIVE_BOOK.StartURL</c> and following the next-URL derived from the cached page, both
	/// versions are run through <c>ProcessChapter</c> and differences in process result, queued
	/// URL, next-chapter link, title and (whitespace-normalized) text are reported via Dump().
	/// For content differences, links are emitted to diff the two versions in
	/// <c>Config.COMPARE_PROG</c> or to store the live version in the web cache.
	/// </summary>
	void VerifyChapters()
	{
		// NOTE(review): nothing is ever added to this list in this method, so ProcessChapter always
		// receives an empty back buffer and the LIMIT check below never triggers.
		List<Chapter> result = new List<Chapter>();

		// URL equality that tolerates http/https differences and is safe when one side is null.
		bool SameUrl(string a, string b) => a == b || (a != null && b != null && Relaxedurleq(a, b));

		using (WebClient client = new WebClient())
		{
			client.Encoding = Encoding.UTF8;
			Stack<string> buffer = new Stack<string>();
			buffer.Push(ACTIVE_BOOK.StartURL);

			while (buffer.Any() && result.Count < LIMIT)
			{
				var url = buffer.Pop();
				Chapter curr_buffer = new Chapter() { url = url };
				Chapter curr_live   = new Chapter() { url = url };

				// Only pages that are present in the web cache can be verified.
				if (!webCache.TryGetValue(url.ToLower(), out var cachedSource)) continue;

				try
				{
					curr_buffer.queryResult = cachedSource;
					curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				}
				catch (Exception e)
				{
					$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
					continue;
				}

				var is_diff = false;

				var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
				var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);

				// The walk follows the cached version's link.
				if (next_buffer != null) buffer.Push(next_buffer);

				if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }

				// Compare the queued URLs themselves (this check used to repeat the process-result comparison).
				if (!SameUrl(next_buffer, next_live)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }

				if (!SameUrl(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
				if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }

				if (curr_buffer.chapter.Value != curr_live.chapter.Value)
				{
					// Markup-only changes are ignored: only the extracted text decides.
					var clean_buffer = GetChapterText(curr_buffer);
					var clean_live   = GetChapterText(curr_live);

					if (clean_buffer.Trim() != clean_live.Trim())
					{
						$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
						new Hyperlinq(() =>
						{

							var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
							var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
							File.WriteAllText(fa, curr_buffer.chapter.Value);
							File.WriteAllText(fb, curr_live.chapter.Value);
							Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");

						}, "[Compare Raw]").Dump();
						new Hyperlinq(() =>
						{

							var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
							var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
							File.WriteAllText(fa, clean_buffer);
							File.WriteAllText(fb, clean_live);
							Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");

						}, "[Compare Text]").Dump();
						new Hyperlinq(() =>
						{

							// NOTE(review): this runs when the link is clicked; SaveCache() then uses whatever
							// webCache / ACTIVE_BOOK are current at that time - confirm this is intended.
							webCache[url.ToLower()] = curr_live.queryResult;
							SaveCache();

						}, "[Save new version to webcache]").Dump();

						is_diff = true;
					}
				}

				if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();

				if (is_diff) "".Dump();
			}
		}
	}

	/// <summary>
	/// Compares two URLs while ignoring a leading "https://" / "http://" scheme.
	/// </summary>
	/// <param name="a">First URL; may be null.</param>
	/// <param name="b">Second URL; may be null.</param>
	/// <returns>
	/// True when both values are equal (including both null) or equal after the scheme prefix
	/// has been stripped; false otherwise, in particular when exactly one value is null.
	/// </returns>
	bool Relaxedurleq(string a, string b)
	{
		if (a == b) return true;

		// Exactly one side is null here (both-null was handled above): never equal.
		if (a == null || b == null) return false;

		return StripScheme(a) == StripScheme(b);

		static string StripScheme(string url)
		{
			// Ordinal comparison: URL schemes are not linguistic text.
			if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
			if (url.StartsWith("http://", StringComparison.Ordinal))  url = url.Substring("http://".Length);
			return url;
		}
	}

	/// <summary>
	/// Returns the chapter's markup converted to text via <c>HTMLToText.ConvertHtml</c>, trimmed,
	/// with every run of whitespace replaced by a single space. A blank chapter yields an empty string.
	/// </summary>
	string GetChapterText(Chapter c)
	{
		var markup = c.chapter.Value;
		if (string.IsNullOrWhiteSpace(markup))
		{
			return string.Empty;
		}

		var text = HTMLToText.ConvertHtml(c.chapter.Value).Trim();

		return Regex.Replace(text, @"\s+", " ");
	}

	/// <summary>
	/// Parses the downloaded page in <c>curr.queryResult</c> and fills in the chapter:
	/// <c>title</c>, <c>sourcecode</c>, the prologue/epilogue/bonus flags, <c>next</c> and the
	/// cleaned chapter markup in <c>chapter</c> (navigation markers, share/meta/ad blocks,
	/// duplicated title nodes, leading/trailing &lt;hr&gt; and author notes are stripped).
	/// </summary>
	/// <param name="curr">Chapter to process; <c>url</c> and <c>queryResult</c> must be set.</param>
	/// <param name="backBuffer">Chapters collected so far; used for loop, epilogue and book-jump detection.</param>
	/// <param name="prt">Sink for progress messages.</param>
	/// <param name="forwardQueue_next">
	/// Receives the URL of the next chapter when one was found and it is not the URL of a chapter
	/// in <paramref name="backBuffer"/>; otherwise null.
	/// </param>
	/// <returns>
	/// <c>ReachedEnd</c> when the page starts a loop, follows an epilogue or belongs to another book;
	/// <c>SkipChapter</c> when the title starts with "not a chapter - "; otherwise <c>SuccessNormal</c>.
	/// </returns>
	/// <exception cref="InvalidOperationException">The page has no recognizable content, title or chapter node.</exception>
	ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
	{
		forwardQueue_next = null;

		HtmlDocument doc = new HtmlDocument();
		doc.LoadHtml(curr.queryResult);

		#region Base

		// NOTE(review): XPath expressions beginning with "//" are evaluated from the document root
		// even when called on a sub-node, so the nodeContent.SelectSingleNode(@"//...") calls below
		// are presumably not restricted to nodeContent - confirm against HtmlAgilityPack docs.

		var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
		if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
		if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

		// Fail with a descriptive error instead of a NullReferenceException further down.
		if (nodeContent == null) throw new InvalidOperationException("No content node found in page: " + curr.url);

		var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
		if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
		if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
		if (nodeNav == null) nodeNav = nodeContent;

		var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
		if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

		#endregion

		#region Title

		var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
		if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
		if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
		if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
		if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

		if (titleNode == null) throw new InvalidOperationException("No title node found in page: " + curr.url);

		curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

		// All spellings under which the title may be repeated inside the chapter body;
		// used further down to strip duplicated title nodes and to classify the chapter.
		var titles = new List<string>();
		titles.Add(curr.title);

		if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
		{
			// Generic "Chapter N ..." title: look for a more descriptive one inside the chapter body.
			var baseTitle = curr.title;

			var suffix = Helper.TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value);

			// prefix1 keeps the number as written, prefix2 drops leading zeros.
			// NOTE(review): for a blank title the number group is empty and int.Parse throws - confirm whether blank titles can occur.
			var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value;
			var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value);

			titles.Add(prefix1);
			titles.Add(prefix2);

			// Makes newtitle the chapter title and records the spellings it may appear under.
			void AdoptAltTitle(string newtitle)
			{
				titles.Add(newtitle);
				curr.title = newtitle;
				titles.Add(prefix1 + newtitle);
				titles.Add(prefix2 + newtitle);
				titles.Add(prefix1 + " - " + newtitle);
				titles.Add(prefix2 + " - " + newtitle);
			}

			var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
			var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
			var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
			var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
			if (altTitleNode1 != null)
			{
				AdoptAltTitle(Helper.TitleFmt(altTitleNode1.InnerText.Trim().Substring(prefix1.Length)));
			}
			else if (altTitleNode2 != null)
			{
				AdoptAltTitle(Helper.TitleFmt(altTitleNode2.InnerText.Trim().Substring(prefix2.Length)));
			}
			else if (altTitleNode3 != null)
			{
				AdoptAltTitle(Helper.TitleFmt(altTitleNode3.InnerText.Trim().Substring(prefix1.Length)));

				altTitleNode3.Remove();
				prt("    > title node removed");
			}
			else if (altTitleNode4 != null)
			{
				AdoptAltTitle(Helper.TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length)));

				altTitleNode4.Remove();
				prt("    > title node removed");
			}
			else if (suffix.Length > 2)
			{
				curr.title = suffix;
				titles.Add(suffix);
			}
			else
			{
				prt(" [!!]  Warning cannot parse title");
			}

			// A page title that already carries text after the number wins over anything found above.
			if (suffix.Length > 2)
			{
				curr.title = baseTitle;
				titles.Add(baseTitle);
			}
		}

		// Strip a leading book name (plus separator characters) from the title when enough text remains.
		if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
			var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
			while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
			tit_alt = tit_alt.Trim();
			if (tit_alt.Length>2) curr.title = tit_alt;
		}

		#endregion

		curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

		if (backBuffer.Any() && backBuffer.First().title == curr.title)
		{
			prt("[!] Book loop found - skipping entry");
			return ProcessResult.ReachedEnd; // prevent book II loop
		}

		curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
		curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
		curr.isBonus    = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

		// For this book only "Epilogue II" counts as the epilogue. The comparison literal must be
		// lower-case because the title is lower-cased first (it used to be "epilogue II", which never matched).
		if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue ii");

		if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
		{
			prt("[!] Epilogue found - skipping entry");
			return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
		}

		prt(curr.title + "    (" + curr.url + ")");

		#region Next

		// Book-jump detection: more than four regular chapters, all titles sharing one leading
		// "<n> - " number, and the current title carrying a different number.
		if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && 
		    backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && 
			REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
			REX_NUMSTART.Match(curr.title).Success &&
			REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
		{
			prt("[!] Book jump found - skipping entry");
			return ProcessResult.ReachedEnd;
		}

		// Try progressively broader ways of locating the "next chapter" link.
		var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

		if (next == null)
			next = nodeContent.Descendants()
					  .Where(p => p.Name.ToLower() == "a")
						.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
						.Where(p => p.Attributes.Contains("href"))
					  .FirstOrDefault();

		if (next == null)
			next = nodeNav.Descendants()
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
				.FirstOrDefault();

		if (next == null)
			next = Helper.RecursiveDescendants(nodeContent)
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
				.Where(p => p.Attributes.Contains("href"))
				.FirstOrDefault();

		if (next == null)
			next = Helper.RecursiveDescendants(nodeContent)
				.Where(p => p.Name.ToLower() == "a")
				.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
				.FirstOrDefault();

		if (next != null)
		{
			var next_url = next.Attributes["href"].Value.Trim();

			if (next_url == "." || next_url == "/" || next_url == "./")
			{
				// Placeholder link - treat as "no next chapter".
				next=null;
			}
			else
			{
				// Resolve protocol-relative, host-relative and (WW only) document-relative links.
				if (next_url.StartsWith("//")) next_url = "http:" + next_url;

				if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);

				if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

				curr.next = next_url;
				if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
				{
					forwardQueue_next = next_url;
				}
			}

		}

		if (next == null) prt("    > (!) No next URL found");

		#endregion

		if (nodeChapter == null) throw new InvalidOperationException("No chapter node found in page: " + curr.url);

		#region Chapter marker

		var cpMarkerIdentities = new List<string> 
		{
			"previousnext", "previouschapternextchapter", 
			"firstnext", "firstchapternextchapter", 
			"firstchapter", "previouslast", 

			"previouschapterlastchapter", 

			"previouschapter", "nextchapter", "lastchapter", 

			"first", "previous", "next", "last"
		};

		// True when the node's text, reduced to letters and digits, is exactly a navigation marker.
		bool IsMarker(HtmlNode candidate) => cpMarkerIdentities.Contains(NakedIdentity(candidate));

		foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > Chapter marker removed");
		}

		foreach (var node in nodeChapter.ChildNodes.Where(IsMarker).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > Chapter marker removed");
		}

		foreach (var markerXPath in new[] { "//a", "//p" })
		{
			var candidates = nodeChapter.SelectNodes(markerXPath);
			if (candidates == null) continue;

			foreach (var node in candidates.Where(IsMarker).ToList())
			{
				node.Remove();
				prt("    > Chapter marker removed");
			}
		}

		#endregion

		// Removes those of the given nodes that are direct children of the chapter node and
		// reports the ones that are not.
		void RemoveDirectChildren(IEnumerable<HtmlNode> nodes, string what)
		{
			if (nodes == null) return;

			foreach (var node in nodes)
			{
				if (nodeChapter.ChildNodes.Contains(node))
				{
					nodeChapter.RemoveChild(node);
					prt("    > " + what + " removed");
				}
				else
				{
					prt("    > " + what + " cannot be removed - skipping");
				}
			}
		}

		#region Share Div

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

		#endregion

		#region Meta Div

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

		#endregion

		#region Ad Blocking

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

		RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

		#endregion

		#region Title Paragraphs

		// The first paragraph is dropped when it merely repeats the title.
		var titleNodes1 = nodeChapter.SelectNodes(@"p");
		if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == Helper.TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
		{
			nodeChapter.RemoveChild(titleNodes1.First());
			prt("    > title node removed");
		}

		for (int hval = 1; hval <= 5; hval++)
		{
			var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
			if (titleNodes2 != null)
			{
				foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower())))
				{
					if (nodeChapter.ChildNodes.Contains(node))
					{
						nodeChapter.RemoveChild(node);
						prt("    > title node removed");
					}
				}
			}
		}

		foreach (var inlineXPath in new[] { @"//u", @"//span", @"//strong" })
		{
			var inlineNodes = nodeChapter.SelectNodes(inlineXPath);
			if (inlineNodes == null || !inlineNodes.Any()) continue;

			foreach (var titleLike in inlineNodes.Where(n => titles.Any(t => CouldBeTitle(n, t))))
			{
				titleLike.Remove();
				prt("    > title node removed");
			}
		}

		#endregion

		#region Remove <hr>'s

		// FirstOrDefault/LastOrDefault: a chapter without any element children must not throw here.
		while (nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element) is HtmlNode leadingHr && leadingHr.Name.ToLower() == "hr")
		{
			nodeChapter.RemoveChild(leadingHr);
			prt("    > header hr removed");
		}

		while (nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element) is HtmlNode trailingHr && trailingHr.Name.ToLower() == "hr")
		{
			nodeChapter.RemoveChild(trailingHr);
			prt("    > footer hr removed");
		}

		#endregion

		#region Other (Author's Node)

		foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
		{
			nodeChapter.RemoveChild(node);
			prt("    > authors note removed");
		}

		#endregion

		var chap_html = nodeChapter.InnerHtml.Trim();

		#region Fix raw <hr>
		// KOReader doesn't like <hr>

		chap_html = chap_html.Replace("<hr>", "<hr/>");

		#endregion

		curr.chapter = chap_html;


		if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

		return ProcessResult.SuccessNormal;
	}

	/// <summary>
	/// Writes the three per-chapter stash files: the raw query result, the extracted source
	/// markup, and a minimal HTML page (title heading plus cleaned chapter) for the EPUB folder.
	/// </summary>
	/// <param name="curr">Processed chapter.</param>
	/// <param name="index">1-based chapter number, used as a zero-padded file name prefix.</param>
	void OutputChapter(Chapter curr, int index)
	{
		var dumpName = string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html";

		File.WriteAllText(QUERY_FOLDER + dumpName, curr.queryResult);
		File.WriteAllText(HTML_FOLDER + dumpName, curr.sourcecode, Encoding.UTF8);

		var page = new StringBuilder();
		page.AppendLine("<!DOCTYPE html>");
		page.AppendLine("<html>");
		page.AppendLine("<body>");
		page.AppendLine();
		page.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
		page.AppendLine();
		page.AppendLine(curr.chapter);
		page.AppendLine("</body>");
		page.AppendLine("</html>");

		// Here the whole "<index>_<title>.html" name is passed through Filenamify, not just the title.
		var epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
		File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
	}

	/// <summary>
	/// Reduces a node's inner text to a lower-case string of letters and digits only: the
	/// entities &amp;gt; &amp;lt; &amp;amp; &amp;quot; &amp;nbsp; are dropped first, then every
	/// character that is not a letter or digit.
	/// </summary>
	string NakedIdentity(HtmlNode raw)
	{
		var text = raw.InnerText.ToLower();

		foreach (var entity in new[] { "&gt;", "&lt;", "&amp;", "&quot;", "&nbsp;" })
		{
			text = text.Replace(entity, "");
		}

		var identity = new StringBuilder();
		foreach (var c in text.ToCharArray())
		{
			if (char.IsLetterOrDigit(c))
			{
				identity.Append(char.ToLower(c));
			}
		}

		return identity.ToString().Trim().ToLower();
	}

	/// <summary>
	/// Decides whether a node's text is the given title, comparing both after
	/// <c>Helper.Striptease</c>, lower-casing, removal of ':', '-', '(' and ')' and removal of
	/// every run of two or more whitespace characters.
	/// </summary>
	bool CouldBeTitle(HtmlNode n, string title)
	{
		var nodeText = Helper.Striptease(n);
		var titleText = Helper.Striptease(title);

		return Normalize(nodeText) == Normalize(titleText);

		static string Normalize(string value)
		{
			var bare = value.ToLower();

			foreach (var punctuation in new[] { ":", "-", "(", ")" })
			{
				bare = bare.Replace(punctuation, "");
			}

			return Regex.Replace(bare, @"\s\s+", "");
		}
	}

	/// <summary>
	/// Builds the EPUB for the active book: writes mimetype, container.xml, content.opf, toc.ncx
	/// and one HTML file per chapter into a zip in the stash, copies the zip to book.epub and
	/// then to the output folder (overwriting an existing file there).
	/// </summary>
	/// <param name="chapters">Chapters in reading order.</param>
	void WriteEpub(List<Chapter> chapters)
	{
		foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
		{
			if (File.Exists(stale)) File.Delete(stale);
		}

		Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

		// The zip must be closed before it is copied, hence the explicit using scope.
		using (var archive = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
		using (var zipbook = new ZipOutputStream(archive))
		{
			WritePubString(zipbook, @"mimetype", GetEpubMimetype());
			WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
			WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
			WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

			var position = 0;
			foreach (var chapter in chapters)
			{
				// File names are numbered from 1, GetEpubChapterFile receives the 0-based position.
				var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", position + 1, Helper.Filenamify(chapter.title, true));
				WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, position));
				position++;
			}
		}

		File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);

		File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
	}

	/// <summary>
	/// Converts the stashed EPUB to MOBI by running the external <c>ebook-convert</c> tool and
	/// copies the result to the output folder.
	/// </summary>
	/// <exception cref="Exception">The tool returned a non-zero exit code; the message carries the code and the combined output.</exception>
	void GenerateMobi()
	{
		if (File.Exists(MOBI_FILE_STASH))
		{
			File.Delete(MOBI_FILE_STASH);
		}

		"Running ebook-convert for MOBI output".Dump();

		var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\"  --max-toc-links=0  --toc-threshold=9999";
		var pout = ProcessHelper.ProcExecute("ebook-convert", arguments);

		$"ebook-convert returned: {pout.ExitCode}".Dump();

		if (pout.ExitCode != 0)
		{
			throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);
		}

		File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
	}

	/// <summary>
	/// Adds a text entry to the zip stream, stored without compression.
	/// </summary>
	/// <param name="z">Target zip stream.</param>
	/// <param name="n">Entry name inside the archive.</param>
	/// <param name="c">Text content of the entry.</param>
	/// <param name="e">Encoding for the content; UTF-8 when null.</param>
	void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
	{
		var encoding = e ?? Encoding.UTF8;

		var entry = z.PutNextEntry(n);
		entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

		var payload = encoding.GetBytes(c);
		z.Write(payload, 0, payload.Length);
	}

	/// <summary>Content of the EPUB "mimetype" entry.</summary>
	string GetEpubMimetype() => "application/epub+zip";

	/// <summary>
	/// Builds the content of META-INF/container.xml, which points to OEBPS/content.opf as the
	/// package root file.
	/// </summary>
	/// <returns>The XML document text, trimmed and terminated by a single CRLF.</returns>
	string GetEpubContainerXML()
	{
		XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

		var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null),
					new XElement(container + "container",
						new XAttribute("version", "1.0"),
						new XElement(container + "rootfiles",
							new XElement(container + "rootfile",
								new XAttribute("full-path", "OEBPS/content.opf"),
								new XAttribute("media-type", "application/oebps-package+xml")))));

		using (Utf8StringWriter writer = new Utf8StringWriter())
		{
			doc.Save(writer);
			var r = writer.ToString();

			// The saved declaration may spell the encoding in lower case; force the upper-case form.
			r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
			return r.Trim() + "\r\n";
		}
	}

	/// <summary>
	/// Builds the OPF 2.0 package document (metadata, manifest, spine and an empty guide)
	/// for the active book.
	/// </summary>
	/// <param name="chapters">
	/// Chapters in reading order. Chapter <c>i</c> (zero-based) is referenced as
	/// <c>Text/{i+1:000}_{filenamified title}.html</c>.
	/// </param>
	/// <returns>The serialized package XML.</returns>
	string GetEpubContentOPF(List<Chapter> chapters)
	{
		XNamespace dc = "http://purl.org/dc/elements/1.1/";
		XNamespace opf = "http://www.idpf.org/2007/opf";

		// Machine-readable date: format with the invariant culture so that a current culture
		// with a non-Gregorian calendar cannot change the emitted year. Evaluated once so all
		// "today" fields agree.
		var today = DateTime.Now.ToString("yyyy'-'MM'-'dd", System.Globalization.CultureInfo.InvariantCulture);

		var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

		var package = new XElement(opf + "package",
						new XAttribute("unique-identifier", "BookId"),
						new XAttribute("version", "2.0"));

		doc.Add(package);

		// NOTE(review): Release is still formatted with the current culture because its type
		// is not visible here - if it is a DateTime, pass the invariant culture as well.
		var meta = new XElement(opf + "metadata",
						new XAttribute(XNamespace.Xmlns + "dc", dc),
						new XAttribute(XNamespace.Xmlns + "opf", opf),
						new XElement(dc + "title", ACTIVE_BOOK.Title),
						new XElement(dc + "creator", ACTIVE_BOOK.Author),
						new XElement(dc + "identifier",
							new XAttribute("id", "BookId"),
							new XAttribute(opf + "scheme", "UUID"),
							"urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "publication"),
							ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "modification"),
							today),
						new XElement(dc + "date",
							new XAttribute(opf + "event", "creation"),
							today),
						new XElement(dc + "language", ACTIVE_BOOK.Language),
						new XElement(dc + "identifier",
							new XAttribute(opf + "scheme", "UUID"),
							ACTIVE_BOOK.ID_CAL.ToString("D")),
						new XElement(opf + "meta",
							new XAttribute("content", "1.0"),
							new XAttribute("name", "Wordpress_eBook_scraper_version")),
						new XElement(opf + "meta",
							new XAttribute("content", today),
							new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

		// Series information is optional and only written when a series is set.
		if (ACTIVE_BOOK.Series != null)
		{
			meta.Add(new XElement(opf + "meta",
							new XAttribute("content", ACTIVE_BOOK.Series),
							new XAttribute("name", "calibre:series")));
			meta.Add(new XElement(opf + "meta",
							new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
							new XAttribute("name", "calibre:series_index")));
		}

		package.Add(meta);

		var manifest = new XElement(opf + "manifest");
		var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));

		for (int i = 0; i < chapters.Count; i++)
		{
			// Derive the name once so the manifest id and the spine idref can never drift apart.
			var fileName = Helper.Filenamify(chapters[i].title, true);
			var itemId = string.Format("x{0:000}_{1}.html", i + 1, fileName);

			// The href is a URI reference to a single path segment, so reserved characters
			// such as '#' or '?' must be percent-encoded too. Uri.EscapeUriString (obsolete)
			// left them untouched, which would turn part of the file name into a fragment/query.
			manifest.Add(new XElement(opf + "item",
				new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeDataString(fileName))),
				new XAttribute("id", itemId),
				new XAttribute("media-type", "application/xhtml+xml")));

			spine.Add(new XElement(opf + "itemref",
				new XAttribute("idref", itemId)));
		}

		// The NCX table of contents is part of the manifest but not of the reading order.
		manifest.Add(new XElement(opf + "item",
			new XAttribute("href", "toc.ncx"),
			new XAttribute("id", "ncx"),
			new XAttribute("media-type", "application/x-dtbncx+xml")));

		package.Add(manifest);
		package.Add(spine);
		package.Add(new XElement(opf + "guide"));

		using (Utf8StringWriter writer = new Utf8StringWriter())
		{
			doc.Save(writer);
			return writer.ToString();
		}
	}

	/// <summary>
	/// Builds the NCX table of contents with one flat navPoint per chapter.
	/// </summary>
	/// <param name="chapters">
	/// Chapters in reading order. Chapter <c>i</c> (zero-based) gets play order <c>i+1</c>
	/// and links to <c>Text/{i+1:000}_{filenamified title}.html</c>.
	/// </param>
	/// <returns>The serialized NCX XML.</returns>
	string GetEpubTOC(List<Chapter> chapters)
	{
		// NCX elements live in the DAISY NCX namespace. The previous code put them into the
		// OPF package namespace, so the file was not a valid NCX document.
		XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

		var doc = new XDocument(
						new XDeclaration("1.0", "UTF-8", null),
						new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

		var root = new XElement(ncx + "ncx",
						new XAttribute("version", "2005-1"),
						new XElement(ncx + "head",
							new XElement(ncx + "meta",
								new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
								new XAttribute("name", "dtb:uid")),
							new XElement(ncx + "meta",
								new XAttribute("content", 1),
								new XAttribute("name", "dtb:depth")),
							new XElement(ncx + "meta",
								new XAttribute("content", 0),
								new XAttribute("name", "dtb:totalPageCount")),
							new XElement(ncx + "meta",
								new XAttribute("content", 0),
								new XAttribute("name", "dtb:maxPageNumber"))));

		doc.Add(root);

		// NOTE(review): the document title is hard-coded; consider using the book title instead.
		root.Add(new XElement(ncx + "docTitle",
					new XElement(ncx + "text", "Unknown")));

		var nav = new XElement(ncx + "navMap");
		for (int i = 0; i < chapters.Count; i++)
		{
			var fileName = Helper.Filenamify(chapters[i].title, true);

			// src is a URI reference: percent-encode the file name so spaces and reserved
			// characters ('#', '?', ...) cannot break the link.
			var src = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeDataString(fileName));

			nav.Add(new XElement(ncx + "navPoint",
				new XAttribute("id", "navPoint-" + (i + 1)),
				new XAttribute("playOrder", i + 1),
				new XElement(ncx + "navLabel",
					new XElement(ncx + "text", chapters[i].title)),
				new XElement(ncx + "content",
					new XAttribute("src", src))));
		}

		root.Add(nav);

		using (Utf8StringWriter writer = new Utf8StringWriter())
		{
			doc.Save(writer);
			return writer.ToString();
		}
	}

	/// <summary>
	/// Wraps a chapter's markup in an XHTML 1.1 page whose title and top-level heading
	/// are the entitized chapter title.
	/// </summary>
	/// <param name="chapter">Chapter providing <c>title</c> and the body markup <c>chapter</c>.</param>
	/// <param name="idx">Not used by this method.</param>
	/// <returns>The complete page; every line ends with the platform newline.</returns>
	string GetEpubChapterFile(Chapter chapter, int idx)
	{
		// NOTE(review): confirm that Entitize also escapes '&' and '<' in titles; if it
		// does not, such titles produce malformed XHTML here.
		var heading = HtmlEntity.Entitize(chapter.title);

		return new StringBuilder()
			.AppendLine(@"<?xml version=""1.0"" encoding=""utf-8""?>")
			.AppendLine(@"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ")
			.AppendLine(@"<html xmlns=""http://www.w3.org/1999/xhtml"">")
			.AppendLine(@"<head>")
			.AppendLine("<title>" + heading + "</title>")
			.AppendLine(@"</head>")
			.AppendLine(@"<body>")
			.AppendLine("<h1>" + heading + "</h1>")
			.AppendLine(chapter.chapter)
			.AppendLine(@"</body>")
			.AppendLine(@"</html>")
			.ToString();
	}

	/// <summary>
	/// Immutable result of an external process run: the command line, the exit code and
	/// the captured output streams.
	/// </summary>
	public readonly struct ProcessOutput
	{
		/// <summary>Command line as passed to the constructor.</summary>
		public readonly string Command;
		/// <summary>Exit code of the process.</summary>
		public readonly int ExitCode;
		/// <summary>Captured standard output.</summary>
		public readonly string StdOut;
		/// <summary>Captured standard error.</summary>
		public readonly string StdErr;
		/// <summary>Captured standard output and standard error combined.</summary>
		public readonly string StdCombined;

		public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
			=> (Command, ExitCode, StdOut, StdErr, StdCombined) = (cmd, ex, stdout, stderr, stdcom);

		/// <summary>
		/// Multi-line summary: command, exit code, then stdout and stderr sections.
		/// The combined stream is not included.
		/// </summary>
		public override string ToString()
		{
			return string.Join("\n",
				Command,
				"=> " + ExitCode,
				"",
				"[stdout]",
				StdOut,
				"",
				"[stderr]",
				StdErr);
		}
	}

	/// <summary>Helper for running external programs and capturing their output.</summary>
	public static class ProcessHelper
	{
		/// <summary>
		/// Runs <paramref name="command"/> without a shell or window, blocks until it exits
		/// and returns its exit code together with the captured stdout/stderr.
		/// </summary>
		/// <param name="command">Executable to start.</param>
		/// <param name="arguments">Argument string passed to the executable.</param>
		/// <param name="workingDirectory">Working directory; an empty string is used when null.</param>
		/// <returns>
		/// The result. Output lines are joined with "\n". <c>StdCombined</c> contains the lines
		/// of both streams in the order the handlers received them. Line breaks in
		/// <paramref name="arguments"/> are shown as <c>\r</c>/<c>\n</c> in the reported command.
		/// </returns>
		public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
		{
			// Process owns an OS handle: dispose it once the result has been read.
			using var process = new Process
			{
				StartInfo =
					{
						FileName = command,
						Arguments = arguments,
						WorkingDirectory = workingDirectory ?? string.Empty,
						UseShellExecute = false,
						RedirectStandardOutput = true,
						RedirectStandardError = true,
						CreateNoWindow = true,
						ErrorDialog = false,
					}
			};

			var builderOut = new StringBuilder();
			var builderErr = new StringBuilder();
			var builderBoth = new StringBuilder();

			// The stdout and stderr handlers are raised on thread-pool threads and may run at
			// the same time. Both write to builderBoth and StringBuilder is not thread-safe,
			// so every append happens under this lock.
			var gate = new object();

			// Appends a line, separated by "\n" from whatever the builder already holds.
			static void AppendLine(StringBuilder target, string line)
			{
				if (target.Length == 0) target.Append(line);
				else target.Append("\n" + line);
			}

			// Records one received line in the stream's own builder and in the combined one.
			void Collect(StringBuilder own, string line)
			{
				if (line == null) return; // null signals end of stream

				lock (gate)
				{
					AppendLine(own, line);
					AppendLine(builderBoth, line);
				}
			}

			process.OutputDataReceived += (sender, args) => Collect(builderOut, args.Data);
			process.ErrorDataReceived += (sender, args) => Collect(builderErr, args.Data);

			process.Start();

			process.BeginOutputReadLine();
			process.BeginErrorReadLine();

			// The parameterless overload also waits until the redirected streams have been
			// fully processed, so the builders are complete afterwards.
			process.WaitForExit();

			var shownCommand = $"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}";

			return new ProcessOutput(shownCommand, process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
		}
	}
	/// <summary>
	/// Converts HTML into plain text: block-level elements become line breaks, runs of
	/// whitespace are collapsed and entities are decoded. Comments, <c>nav</c> elements and
	/// the text inside <c>script</c>/<c>style</c> are dropped.
	/// </summary>
	public static class HTMLToText
	{
		// Self-closing <link/>, <style/> and <script/> tags.
		private static readonly Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
		// <link>/<style>/<script> elements whose content contains no further angle brackets.
		private static readonly Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
		// Two or more consecutive whitespace characters (collapsed to one space on output).
		private static readonly Regex REX_MULTISPACE = new Regex(@"\s{2,}", RegexOptions.Compiled);

		/// <summary>Whitespace state carried along while walking the DOM.</summary>
		private class PreceedingDomTextInfo
		{
			public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
			{
				IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
			}
			// False until text has been written in the current context; leading whitespace is trimmed while false.
			public bool WritePrecedingWhiteSpace { get; set; }
			// True when the last written text ended with whitespace.
			public bool LastCharWasSpace { get; set; }
			// Shared by all contexts of one conversion; set once any text was written.
			public readonly BoolWrapper IsFirstTextOfDocWritten;
			// Set for list containers (1 for "ol", 0 otherwise); not read anywhere in this class.
			public int ListIndex { get; set; }
		}

		/// <summary>Mutable bool holder so several contexts can share a single flag.</summary>
		private class BoolWrapper
		{
			public BoolWrapper() { }
			public bool Value { get; set; }
			public static implicit operator bool(BoolWrapper boolWrapper)
			{
				return boolWrapper.Value;
			}
			public static implicit operator BoolWrapper(bool boolWrapper)
			{
				return new BoolWrapper { Value = boolWrapper };
			}
		}

		/// <summary>Loads the HTML file at <paramref name="path"/> and converts it to plain text.</summary>
		public static string Convert(string path)
		{
			HtmlDocument doc = new HtmlDocument();
			doc.Load(path);
			return ConvertDoc(doc);
		}

		/// <summary>
		/// Converts an HTML string to plain text. Simple link/style/script tags are replaced
		/// by a space before the markup is parsed.
		/// </summary>
		public static string ConvertHtml(string html)
		{
			HtmlDocument doc = new HtmlDocument();
			html = REX_TAG1.Replace(html, " ");
			html = REX_TAG2.Replace(html, " ");
			doc.LoadHtml(html);
			return ConvertDoc(doc);
		}

		/// <summary>Converts an already parsed document to plain text.</summary>
		public static string ConvertDoc(HtmlDocument doc)
		{
			using (StringWriter sw = new StringWriter())
			{
				ConvertTo(doc.DocumentNode, sw);
				return sw.ToString();
			}
		}

		// Converts all children of the node, in document order, with the given state.
		private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
		{
			foreach (HtmlNode subnode in node.ChildNodes)
			{
				ConvertTo(subnode, outText, textInfo);
			}
		}

		/// <summary>Writes the plain-text form of <paramref name="node"/> to <paramref name="outText"/>.</summary>
		public static void ConvertTo(HtmlNode node, TextWriter outText)
		{
			ConvertTo(node, outText, new PreceedingDomTextInfo(false));
		}

		// Recursive worker: writes the text of one node and keeps the whitespace state up to date.
		private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
		{
			string html;
			switch (node.NodeType)
			{
				case HtmlNodeType.Comment:
					// don't output comments
					break;
				case HtmlNodeType.Document:
					ConvertContentTo(node, outText, textInfo);
					break;
				case HtmlNodeType.Text:
					// script and style must not be output
					string parentName = node.ParentNode.Name;
					if ((parentName == "script") || (parentName == "style"))
					{
						break;
					}
					// get text
					html = ((HtmlTextNode)node).Text;
					// is it in fact a special closing node output as text?
					if (HtmlNode.IsOverlappedClosingElement(html)) break;

					// nothing to write for an empty text node
					if (html.Length == 0) break;

					// skip an XML declaration that ended up as a text node; ordinal comparison
					// keeps the check independent of the current culture
					string trimmed = html.Trim();
					if (trimmed.StartsWith("<?xml", StringComparison.OrdinalIgnoreCase) && trimmed.EndsWith("?>", StringComparison.Ordinal)) break;

					if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
					{
						// leading whitespace would start a line or double an already written space
						html = html.TrimStart();
						if (html.Length == 0) { break; }
						textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
					}
					outText.Write(HtmlEntity.DeEntitize(REX_MULTISPACE.Replace(html.TrimEnd(), " ")));
					// trailing whitespace of the node is represented by exactly one space
					textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
					if (textInfo.LastCharWasSpace)
					{
						outText.Write(' ');
					}
					break;
				case HtmlNodeType.Element:
					string endElementString = null;
					bool isInline;
					bool skip = false;
					int listIndex = 0;
					switch (node.Name)
					{
						case "nav":
							skip = true;
							isInline = false;
							break;
						case "body":
						case "section":
						case "article":
						case "aside":
						case "h1":
						case "h2":
						case "header":
						case "footer":
						case "address":
						case "main":
						case "div":
						case "span":
						case "p": // stylistic - adjust as you tend to use
							if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
							endElementString = "\r\n";
							isInline = false;
							break;
						case "br":
							outText.Write("\r\n");
							skip = true;
							textInfo.WritePrecedingWhiteSpace = false;
							isInline = true;
							break;
						case "a":
							isInline = true;
							break;
						case "li":
							isInline = false;
							break;
						case "ol":
							listIndex = 1;
							goto case "ul";
						case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
							endElementString = "\r\n";
							isInline = false;
							break;
						case "img": //inline-block in reality
							isInline = true;
							break;
						default:
							isInline = true;
							break;
					}
					if (!skip && node.HasChildNodes)
					{
						// block elements start a fresh whitespace context, inline elements continue the current one
						ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
					}
					if (endElementString != null)
					{
						outText.Write(endElementString);
					}
					break;
			}
		}
	}
}