using System.Diagnostics;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;
using WordpressEboobScraper2.Proc;

namespace WordpressEboobScraper2.Scraper;

/** *************************************************** **/
/**                                                     **/
/**      WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS)      **/
/**                                                     **/
/** *************************************************** **/

/// <summary>
/// Walks a web serial chapter by chapter (following "next" links from
/// <c>ACTIVE_BOOK.StartURL</c>), cleans every chapter page and writes the
/// result as a combined HTML file, an EPUB and optionally a MOBI.
/// </summary>
/// <remarks>
/// NOTE(review): the text this class was reviewed from had generic type
/// arguments and HTML-looking string content stripped. Generic arguments, the
/// regex group name and the literals that could not compile otherwise were
/// restored from their usages; literals that were merely emptied (for example
/// <c>AppendLine("")</c>) are kept as seen and flagged where they occur.
/// Confirm all of them against version control.
/// </remarks>
public class Scraper
{
    // Book currently being processed; set by Generate()/Verify() before any other member is used.
    static EpubParameter ACTIVE_BOOK = null;

    // Upper bound on the number of chapters collected by FindChapters().
    const int LIMIT = 1500;

    // Matches titles of the form "<digits> - ..."; group "n" holds the leading number.
    readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

    // Lower-cased URL -> downloaded page source.
    Dictionary<string, string> webCache = new();

    string STASH_FOLDER    => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;

    string WCACHE_FILE     => Path.Combine(Config.BASE_DIR_OUT, @"_cache", ACTIVE_BOOK.Foldername + @".xml");

    string HTML_FILE_OUT   => Path.Combine(Config.BASE_DIR_OUT, @"html", ACTIVE_BOOK.Foldername + @".html");
    string EPUB_FILE_OUT   => Path.Combine(Config.BASE_DIR_OUT, @"epub", ACTIVE_BOOK.Foldername + @".epub");
    string MOBI_FILE_OUT   => Path.Combine(Config.BASE_DIR_OUT, @"mobi", ACTIVE_BOOK.Foldername + @".mobi");

    string HTML_FILE_STASH => STASH_FOLDER + @"book.html";
    string ZIP_FILE_STASH  => STASH_FOLDER + @"book.zip";
    string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub";
    string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi";

    string QUERY_FOLDER    => STASH_FOLDER + @"query" + Path.DirectorySeparatorChar; // full query result
    string HTML_FOLDER     => STASH_FOLDER + @"html" + Path.DirectorySeparatorChar;  // unprocessed chapter code
    string EPUB_FOLDER     => STASH_FOLDER + @"epub" + Path.DirectorySeparatorChar;  // processed epub chapter code

    //----------------------------------------------------------------------------------------------------//
    //----------------------------------------------------------------------------------------------------//

    /// <summary>
    /// Builds HTML, EPUB and (when <c>Config.CONVERT_MOBI</c> is set) MOBI output
    /// for every book in <c>Config.BOOKS</c>.
    /// </summary>
    public void Generate()
    {
        foreach (var bb in Config.BOOKS)
        {
            ACTIVE_BOOK = bb;

            DumpBanner($" [PROCESSING BOOK] {bb.DisplayStr} ");

            Init();

            List<Chapter> chapters = FindChapters();

            WriteBookHTML(chapters);
            WriteEpub(chapters);

            if (Config.CONVERT_MOBI) GenerateMobi();
        }
    }

    /// <summary>
    /// Compares the cached pages of every book in <c>Config.BOOKS</c> with a
    /// fresh download and reports the differences.
    /// </summary>
    public void Verify()
    {
        foreach (var bb in Config.BOOKS)
        {
            ACTIVE_BOOK = bb;

            DumpBanner($" [VERIFYING BOOK] {bb.DisplayStr} ");

            LoadWebCache();
            VerifyChapters();
        }
    }

    /// <summary>Dumps <paramref name="caption"/> framed by '=' rulers and three blank lines on each side.</summary>
    static void DumpBanner(string caption)
    {
        var ruler = new string('=', caption.Length);

        "".Dump();
        "".Dump();
        "".Dump();
        ruler.Dump();
        caption.Dump();
        ruler.Dump();
        "".Dump();
        "".Dump();
        "".Dump();
    }

    /// <summary>
    /// Clears the stash of the active book (files inside its sub-folders and
    /// the four book.* artifacts), creates all stash/output folders and loads
    /// the web cache when <c>Config.USE_WEBCACHE</c> is set.
    /// </summary>
    void Init()
    {
        if (Directory.Exists(STASH_FOLDER))
        {
            Directory.EnumerateDirectories(STASH_FOLDER).ToList().ForEach(d => Directory.EnumerateFiles(d).ToList().ForEach(File.Delete));

            if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH);
            if (File.Exists(ZIP_FILE_STASH))  File.Delete(ZIP_FILE_STASH);
            if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
            if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);
        }

        Directory.CreateDirectory(STASH_FOLDER);
        Directory.CreateDirectory(QUERY_FOLDER);
        Directory.CreateDirectory(HTML_FOLDER);
        Directory.CreateDirectory(EPUB_FOLDER);

        Directory.CreateDirectory(Config.BASE_DIR_OUT + @"_cache" + Path.DirectorySeparatorChar);
        Directory.CreateDirectory(Config.BASE_DIR_OUT + @"html" + Path.DirectorySeparatorChar);
        Directory.CreateDirectory(Config.BASE_DIR_OUT + @"epub" + Path.DirectorySeparatorChar);
        Directory.CreateDirectory(Config.BASE_DIR_OUT + @"mobi" + Path.DirectorySeparatorChar);

        if (Config.USE_WEBCACHE) LoadWebCache();
    }

    /// <summary>
    /// Writes all chapters into one HTML file in the stash and copies it to
    /// <c>HTML_FILE_OUT</c>.
    /// </summary>
    /// <param name="chapters">Chapters in reading order.</param>
    void WriteBookHTML(List<Chapter> chapters)
    {
        StringBuilder b = new StringBuilder();

        // NOTE(review): the empty literals below (and the two after the loop) most
        // likely carried document markup that was stripped from the reviewed text;
        // they are kept as seen - confirm against version control.
        b.AppendLine("");
        b.AppendLine("");
        b.AppendLine("");

        foreach (var currChapter in chapters)
        {
            b.AppendLine();
            // NOTE(review): heading tag restored as <h1> (GenerateMobi builds its TOC from //h:h1) - confirm.
            b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
            b.AppendLine();
            b.AppendLine(currChapter.chapter);
        }

        b.AppendLine("");
        b.AppendLine("");

        File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
        File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
    }

    /// <summary>Serializes the whole web cache to <c>WCACHE_FILE</c> as XML.</summary>
    void SaveCache()
    {
        var xs = new XmlSerializer(typeof(List<SerializableCacheEntry>));

        using (var writer = new StreamWriter(WCACHE_FILE))
        {
            var entries = webCache
                .Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString { Value = p.Value } })
                .ToList();

            xs.Serialize(writer, entries);
        }
    }

    /// <summary>
    /// Replaces the in-memory web cache with the content of <c>WCACHE_FILE</c>;
    /// does nothing when that file does not exist.
    /// </summary>
    void LoadWebCache()
    {
        if (!File.Exists(WCACHE_FILE)) return;

        XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

        using TextReader reader = new StreamReader(WCACHE_FILE);

        var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);

        webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
    }

    /// <summary>Downloads <paramref name="url"/>, stores the page in the web cache and persists the cache.</summary>
    string DownloadAndCache(WebClient client, string url)
    {
        var page = client.DownloadString(Uri.UnescapeDataString(url));

        webCache[url.ToLower()] = page;
        SaveCache();

        return page;
    }

    /// <summary>
    /// Follows the chapter chain starting at <c>ACTIVE_BOOK.StartURL</c> until no
    /// further URL is queued or <c>LIMIT</c> chapters were collected. Pages come
    /// from the web cache when present, otherwise they are downloaded and cached.
    /// </summary>
    /// <returns>All successfully processed chapters in reading order.</returns>
    List<Chapter> FindChapters()
    {
        List<Chapter> result = new List<Chapter>();

        using WebClient client = new WebClient();
        client.Encoding = Encoding.UTF8;

        Stack<string> buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);

        while (buffer.Any() && result.Count < LIMIT)
        {
            var url = buffer.Pop();
            Chapter curr = new Chapter() { url = url };

            // Single lookup instead of ContainsKey + indexer.
            var buffered = webCache.TryGetValue(url.ToLower(), out var cachedPage);
            if (buffered)
            {
                curr.queryResult = cachedPage;
                "*(loaded from webcache)*".Dump();
            }
            else
            {
                curr.queryResult = DownloadAndCache(client, url);
            }

            var r = ProcessChapter(curr, result, s => s.Dump(), out var next_url);
            if (next_url != null) buffer.Push(next_url);

            // The cached copy of the last known chapter may predate the publication
            // of its successor, so it is fetched again from the live site.
            if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
            {
                "".Dump();
                "//==> *(auto-reload from live)*".Dump();
                "".Dump();

                curr.queryResult = DownloadAndCache(client, url);

                r = ProcessChapter(curr, result, s => s.Dump(), out var next_url_inner);
                if (next_url_inner != null) buffer.Push(next_url_inner);
            }

            if (r == ProcessResult.SuccessNormal)
            {
                " ==> Chapter processed".Dump();
                result.Add(curr);
                OutputChapter(curr, result.Count);
            }
            else if (r == ProcessResult.SkipChapter)
            {
                " ==> Skip this chapter".Dump();
            }
            else if (r == ProcessResult.ReachedEnd)
            {
                " ==> End reached".Dump();
            }

            "".Dump();
        }

        return result;
    }

    /// <summary>
    /// Walks the cached chapter chain of the active book, downloads every page
    /// again and dumps each difference between the cached and the live version
    /// (process result, queued URL, next-chapter link, title, content).
    /// The walk stops at the first URL that is not in the web cache.
    /// </summary>
    void VerifyChapters()
    {
        using WebClient client = new WebClient();
        client.Encoding = Encoding.UTF8;

        Stack<string> buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);

        while (buffer.Any())
        {
            var url = buffer.Pop();

            Chapter curr_buffer = new Chapter() { url = url };
            Chapter curr_live = new Chapter() { url = url };

            if (!webCache.TryGetValue(url.ToLower(), out var cachedPage)) continue;

            try
            {
                curr_buffer.queryResult = cachedPage;
                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            }
            catch (Exception e)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
                continue;
            }

            var is_diff = false;

            var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => { }, out var next_buffer);
            var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => { }, out var next_live);

            if (next_buffer != null) buffer.Push(next_buffer);

            // Built after processing because ProcessChapter fills in the title.
            var tag = $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

            if (r_buffer != r_live)
            {
                $"{tag} Different Process result: {r_buffer} <> {r_live}".Dump();
                is_diff = true;
            }

            // Fix: this check used to repeat (r_buffer != r_live), so a changed
            // queue URL was never detected on its own.
            if (!Relaxedurleq(next_buffer, next_live))
            {
                $"{tag} Different push URL: {next_buffer} <> {next_live}".Dump();
                is_diff = true;
            }

            if (!Relaxedurleq(curr_buffer.next, curr_live.next))
            {
                $"{tag} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
                is_diff = true;
            }

            if (curr_buffer.title != curr_live.title)
            {
                $"{tag} Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
                is_diff = true;
            }

            if (curr_buffer.chapter.Value != curr_live.chapter.Value)
            {
                var clean_buffer = GetChapterText(curr_buffer);
                var clean_live = GetChapterText(curr_live);

                // Markup-only changes are ignored; only differing text is reported.
                if (clean_buffer.Trim() != clean_live.Trim())
                {
                    $"{tag} Different content: ".Dump();

                    new Hyperlinq(() => OpenCompare(curr_buffer.chapter.Value, curr_live.chapter.Value), "[Compare Raw]").Dump();
                    new Hyperlinq(() => OpenCompare(clean_buffer, clean_live), "[Compare Text]").Dump();
                    new Hyperlinq(() =>
                    {
                        webCache[url.ToLower()] = curr_live.queryResult;
                        SaveCache();
                    }, "[Save new version to webcache]").Dump();

                    is_diff = true;
                }
            }

            if (!is_diff) $"{tag} OK - No differences".Dump();

            if (is_diff) "".Dump();
        }

        // Writes both versions to temp files and opens them in Config.COMPARE_PROG.
        static void OpenCompare(string bufferText, string liveText)
        {
            var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
            var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");

            File.WriteAllText(fa, bufferText);
            File.WriteAllText(fb, liveText);

            Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
        }
    }

    /// <summary>
    /// Compares two URLs while ignoring a leading "https://" / "http://".
    /// </summary>
    /// <returns>
    /// <c>true</c> when both are equal (including both <c>null</c>) or differ only
    /// in the scheme; <c>false</c> otherwise, in particular when exactly one is <c>null</c>.
    /// </returns>
    bool Relaxedurleq(string a, string b)
    {
        if (a == b) return true;

        // Fix: one side is null when only one version of the page has a next
        // link; this used to throw a NullReferenceException.
        if (a == null || b == null) return false;

        return StripScheme(a) == StripScheme(b);

        static string StripScheme(string u)
        {
            if (u.StartsWith("https://", StringComparison.Ordinal)) u = u.Substring("https://".Length);
            if (u.StartsWith("http://", StringComparison.Ordinal)) u = u.Substring("http://".Length);
            return u;
        }
    }

    /// <summary>
    /// Returns the chapter body converted to plain text (via
    /// <c>HTMLToText.ConvertHtml</c>) with every whitespace run collapsed to one
    /// space; an empty string when the chapter body is blank.
    /// </summary>
    string GetChapterText(Chapter c)
    {
        if (string.IsNullOrWhiteSpace(c.chapter.Value)) return string.Empty;

        var clean = HTMLToText.ConvertHtml(c.chapter.Value).Trim();

        return Regex.Replace(clean, @"\s+", " ");
    }

    /// <summary>
    /// Parses the downloaded page in <c>curr.queryResult</c>: locates the content
    /// and title nodes, determines the next-chapter URL, strips navigation
    /// markers, share/meta/ad blocks, duplicated titles, leading/trailing
    /// rulers and author notes, and stores the result in <paramref name="curr"/>
    /// (<c>title</c>, <c>sourcecode</c>, <c>next</c>, <c>chapter</c>, flags).
    /// </summary>
    /// <param name="curr">Chapter to fill; <c>url</c> and <c>queryResult</c> must be set.</param>
    /// <param name="backBuffer">Chapters already accepted for this book, in order.</param>
    /// <param name="prt">Sink for progress messages.</param>
    /// <param name="forwardQueueNext">
    /// URL to visit next, or <c>null</c> when none was found or it was already visited.
    /// </param>
    /// <returns>
    /// <c>ReachedEnd</c> when a book loop, an earlier epilogue or a book jump is
    /// detected, <c>SkipChapter</c> for "not a chapter - " titles, otherwise
    /// <c>SuccessNormal</c>.
    /// </returns>
    ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<string> prt, out string forwardQueueNext)
    {
        forwardQueueNext = null;

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(curr.queryResult);

        #region Base

        // Note: XPath expressions starting with "//" search the whole document,
        // also when they are evaluated on a sub-node.
        var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
        nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
        nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
        nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
        if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

        var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
        nodeNav ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
        nodeNav ??= nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
        nodeNav ??= nodeContent;

        var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
        if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

        #endregion

        #region Title

        var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
        titleNode ??= nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
        titleNode ??= nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
        if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
        if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

        curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

        // Every spelling under which the title may be repeated inside the chapter body.
        var titles = new List<string> { curr.title };

        if (string.IsNullOrWhiteSpace(curr.title))
        {
            // Fix: a blank title used to fall into the "chapter N" branch below
            // and crash in int.Parse("") with a FormatException.
            prt(" [!!] Warning cannot parse title");
        }
        else if (Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)") is { Success: true } numbered)
        {
            // The page title is only "Chapter N..."; look for a real title inside the body.
            var baseTitle = curr.title;

            var suffix = Helper.TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value);

            var prefix1 = numbered.Groups[0].Value;                               // number as written
            var prefix2 = "chapter " + int.Parse(numbered.Groups[2].Value);       // number without leading zeros

            titles.Add(prefix1);
            titles.Add(prefix2);

            bool StartsWithPrefix(HtmlNode p, string prefix)
            {
                var text = p.InnerText.Trim();
                return text.ToLower().StartsWith(prefix) && text.Length - prefix.Length > 2;
            }

            // NOTE(review): both literals were damaged in the reviewed text; "<p>" is
            // a best guess for the first one, "<br" is implied by how the text was cut.
            bool HasBlockMarkup(HtmlNode p) => p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br");

            var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && StartsWithPrefix(p, prefix1));
            var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && StartsWithPrefix(p, prefix2));
            var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && StartsWithPrefix(p, prefix1) && !HasBlockMarkup(p));
            var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && StartsWithPrefix(p, prefix2) && !HasBlockMarkup(p));

            // NOTE(review): only the altTitleNode4 branch survived in the reviewed
            // text; nodes 1-3 are handled the same way by symmetry (node 1/3 with
            // prefix1, node 2/4 with prefix2) - confirm against version control.
            var candidates = new (HtmlNode Node, string Prefix)[]
            {
                (altTitleNode1, prefix1),
                (altTitleNode2, prefix2),
                (altTitleNode3, prefix1),
                (altTitleNode4, prefix2),
            };
            var hit = candidates.FirstOrDefault(c => c.Node != null);

            if (hit.Node != null)
            {
                var newtitle = Helper.TitleFmt(hit.Node.InnerText.Trim().Substring(hit.Prefix.Length));

                titles.Add(newtitle);
                curr.title = newtitle;

                titles.Add(prefix1 + newtitle);
                titles.Add(prefix2 + newtitle);
                titles.Add(prefix1 + " - " + newtitle);
                titles.Add(prefix2 + " - " + newtitle);

                hit.Node.Remove();
                prt(" > title node removed");
            }
            else if (suffix.Length > 2)
            {
                curr.title = suffix;
                titles.Add(suffix);
            }
            else
            {
                prt(" [!!] Warning cannot parse title");
            }

            // A page title that already carries text after the number always wins.
            if (suffix.Length > 2)
            {
                curr.title = baseTitle;
                titles.Add(baseTitle);
            }
        }

        if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower()))
        {
            // Drop a leading book name plus the separator characters that follow it.
            var tit_alt = curr.title
                .Substring(ACTIVE_BOOK.Foldername.Length)
                .TrimStart(' ', '\t', '-', ',', ':', '.', '_', ';')
                .Trim();

            if (tit_alt.Length > 2) curr.title = tit_alt;
        }

        #endregion

        // NOTE(review): both literals probably wrapped the markup in document tags
        // that were stripped from the reviewed text; kept as seen.
        curr.sourcecode = "\r\n\r\n\r\n" + nodeContent.OuterHtml + "\r\n\r\n\r\n";

        if (backBuffer.Any() && backBuffer.First().title == curr.title)
        {
            prt("[!] Book loop found - skipping entry");
            return ProcessResult.ReachedEnd; // prevent book II loop
        }

        curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType != Site.Royalroad);
        curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
        curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

        // Fix: the lower-cased title was compared with "epilogue II" and could never match.
        if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue ii");

        if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
        {
            prt("[!] Epilogue found - skipping entry");
            return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
        }

        prt(curr.title + " (" + curr.url + ")");

        #region Next

        // Book jump: more than four regular chapters so far, all sharing one
        // leading number, and the current title starts with a different number.
        var regular = backBuffer.Where(b => !b.isSpecial).ToList();
        if (regular.Count > 4)
        {
            var numbers = regular
                .Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; })
                .Distinct()
                .ToList();

            var currNumber = REX_NUMSTART.Match(curr.title);

            if (numbers.Count == 1 && numbers[0] != null && currNumber.Success && numbers[0] != currNumber.Groups["n"].Value)
            {
                prt("[!] Book jump found - skipping entry");
                return ProcessResult.ReachedEnd;
            }
        }

        static bool IsAnchor(HtmlNode p) => p.Name.ToLower() == "a";
        static bool HasNextText(HtmlNode p) => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next";
        static bool HasRelNext(HtmlNode p) => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next");

        // Candidate sources in order of preference.
        var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");
        next ??= nodeContent.Descendants().Where(IsAnchor).Where(HasNextText).Where(p => p.Attributes.Contains("href")).FirstOrDefault();
        next ??= nodeNav.Descendants().Where(IsAnchor).Where(HasRelNext).FirstOrDefault();
        next ??= Helper.RecursiveDescendants(nodeContent).Where(IsAnchor).Where(HasNextText).Where(p => p.Attributes.Contains("href")).FirstOrDefault();
        next ??= Helper.RecursiveDescendants(nodeContent).Where(IsAnchor).Where(HasRelNext).FirstOrDefault();

        if (next != null)
        {
            var next_url = next.Attributes["href"].Value.Trim();

            if (next_url == "." || next_url == "/" || next_url == "./")
            {
                // Placeholder link pointing back at the site - treat as "no next chapter".
                next = null;
            }
            else
            {
                if (next_url.StartsWith("//")) next_url = "http:" + next_url;

                if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);

                if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

                curr.next = next_url;

                // Only queue URLs that were not visited before.
                if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
                {
                    forwardQueueNext = next_url;
                }
            }
        }

        if (next == null) prt(" > (!) No next URL found");

        #endregion

        #region Chapter marker

        var cpMarkerIdentities = new HashSet<string>
        {
            "previousnext",
            "previouschapternextchapter",
            "firstnext",
            "firstchapternextchapter",
            "firstchapter",
            "previouslast",
            "previouschapterlastchapter",
            "previouschapter",
            "nextchapter",
            "lastchapter",
            "first",
            "previous",
            "next",
            "last"
        };

        bool IsMarker(HtmlNode p) => cpMarkerIdentities.Contains(NakedIdentity(p));

        bool IsShortNavText(HtmlNode p)
        {
            if (p.InnerText.Trim().Length >= 24) return false;

            var lower = p.InnerText.ToLower();
            return lower.Contains("previous chapter") || lower.Contains("next chapter") || lower.Contains("last chapter") || lower.Contains("first chapter");
        }

        foreach (var node in nodeChapter.ChildNodes.Where(IsShortNavText).ToList())
        {
            nodeChapter.RemoveChild(node);
            prt(" > Chapter marker removed");
        }

        foreach (var node in nodeChapter.ChildNodes.Where(IsMarker).ToList())
        {
            nodeChapter.RemoveChild(node);
            prt(" > Chapter marker removed");
        }

        foreach (var xpath in new[] { "//a", "//p" })
        {
            var list = nodeChapter.SelectNodes(xpath);
            if (list == null) continue;

            foreach (var node in list.Where(IsMarker).ToList())
            {
                node.Remove();
                prt(" > Chapter marker removed");
            }
        }

        #endregion

        // Removes the given nodes when they are direct children of the chapter node.
        void RemoveDirectChildren(IEnumerable<HtmlNode> nodes, string label)
        {
            if (nodes == null) return;

            foreach (var node in nodes)
            {
                if (nodeChapter.ChildNodes.Contains(node))
                {
                    nodeChapter.RemoveChild(node);
                    prt(" > " + label + " removed");
                }
                else
                {
                    prt(" > " + label + " cannot be removed - skipping");
                }
            }
        }

        #region Share Div

        RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

        #endregion

        #region Meta Div

        RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

        #endregion

        #region Ad Blocking

        RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

        RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

        RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

        #endregion

        #region Title Paragraphs

        bool IsTitleText(HtmlNode node) => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower());

        var titleNodes1 = nodeChapter.SelectNodes(@"p");
        if (titleNodes1 != null && titleNodes1.Any() && IsTitleText(titleNodes1.First()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
        {
            nodeChapter.RemoveChild(titleNodes1.First());
            prt(" > title node removed");
        }

        for (int hval = 1; hval <= 5; hval++)
        {
            var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
            if (titleNodes2 == null) continue;

            foreach (var node in titleNodes2.Where(IsTitleText))
            {
                if (nodeChapter.ChildNodes.Contains(node))
                {
                    nodeChapter.RemoveChild(node);
                    prt(" > title node removed");
                }
            }
        }

        // Inline elements that repeat the title anywhere in the document.
        foreach (var xpath in new[] { @"//u", @"//span", @"//strong" })
        {
            var inlineNodes = nodeChapter.SelectNodes(xpath);
            if (inlineNodes == null || !inlineNodes.Any()) continue;

            foreach (var t in inlineNodes.Where(n => titles.Any(tt => CouldBeTitle(n, tt))))
            {
                t.Remove();
                prt(" > title node removed");
            }
        }

        #endregion

        #region Remove <hr>'s

        // Fix: FirstOrDefault/LastOrDefault instead of First/Last, which threw an
        // InvalidOperationException once the chapter had no element children left.
        HtmlNode FirstElement() => nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element);
        HtmlNode LastElement() => nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element);

        for (var head = FirstElement(); head != null && head.Name.ToLower() == "hr"; head = FirstElement())
        {
            nodeChapter.RemoveChild(head);
            prt(" > header hr removed");
        }

        for (var tail = LastElement(); tail != null && tail.Name.ToLower() == "hr"; tail = LastElement())
        {
            nodeChapter.RemoveChild(tail);
            prt(" > footer hr removed");
        }

        #endregion

        #region Other (Author's Node)

        foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
        {
            nodeChapter.RemoveChild(node);
            prt(" > authors note removed");
        }

        #endregion

        var chap_html = nodeChapter.InnerHtml.Trim();

        #region Fix raw <br>

        // KOReader doesn't like <br>
        // NOTE(review): both literals were lost in the reviewed text; "<br>" -> "<br/>"
        // is inferred from the region name and the comment above - confirm.
        chap_html = chap_html.Replace("<br>", "<br/>");

        #endregion

        curr.chapter = chap_html;

        if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

        return ProcessResult.SuccessNormal;
    }

    /// <summary>
    /// Writes the raw query result, the extracted source markup and the cleaned
    /// chapter page of <paramref name="curr"/> into the stash folders.
    /// </summary>
    /// <param name="curr">Processed chapter.</param>
    /// <param name="index">1-based chapter number, used as three-digit file name prefix.</param>
    void OutputChapter(Chapter curr, int index)
    {
        var stem = string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html";

        File.WriteAllText(QUERY_FOLDER + stem, curr.queryResult);
        File.WriteAllText(HTML_FOLDER + stem, curr.sourcecode, Encoding.UTF8);

        StringBuilder b = new StringBuilder();

        // NOTE(review): the empty literals most likely carried document markup that
        // was stripped from the reviewed text; kept as seen - confirm.
        b.AppendLine("");
        b.AppendLine("");
        b.AppendLine("");
        b.AppendLine();
        // NOTE(review): heading tag restored as <h1> - confirm.
        b.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(curr.chapter);
        b.AppendLine("");
        b.AppendLine("");

        File.WriteAllText(Path.Combine(EPUB_FOLDER, Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title))), b.ToString(), Encoding.UTF8);
    }

    /// <summary>
    /// Reduces the inner text of <paramref name="raw"/> to its lower-cased letters
    /// and digits, after dropping a few HTML entities whose names would otherwise
    /// survive as letters.
    /// </summary>
    string NakedIdentity(HtmlNode raw)
    {
        // The entity names are restored: replacing the bare characters would be a
        // no-op because the letter/digit filter below removes them anyway.
        var text = raw.InnerText
            .ToLower()
            .Replace("&gt;", "")
            .Replace("&lt;", "")
            .Replace("&amp;", "")
            .Replace("&quot;", "")
            .Replace("&nbsp;", "");

        var kept = text
            .ToCharArray()
            .Where(c => char.IsLetterOrDigit(c))
            .Select(c => char.ToLower(c));

        return string.Join(string.Empty, kept).Trim().ToLower();
    }

    /// <summary>
    /// Tells whether the text of <paramref name="n"/> equals <paramref name="title"/>
    /// after <c>Helper.Striptease</c>, lower-casing, removal of ':', '-', '(' and ')'
    /// and removal of whitespace runs of two or more characters.
    /// </summary>
    bool CouldBeTitle(HtmlNode n, string title)
    {
        return Normalize(Helper.Striptease(n)) == Normalize(Helper.Striptease(title));

        static string Normalize(string s)
        {
            s = s.ToLower();
            s = s.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", "");
            return Regex.Replace(s, @"\s\s+", "");
        }
    }

    /// <summary>
    /// Packs the EPUB (mimetype, container, OPF, NCX and one file per chapter)
    /// into the stash and copies it to <c>EPUB_FILE_OUT</c>.
    /// </summary>
    void WriteEpub(List<Chapter> chapters)
    {
        if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
        if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);

        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

        using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
        using (var zipbook = new ZipOutputStream(fs))
        {
            // The mimetype entry is written first.
            WritePubString(zipbook, @"mimetype", GetEpubMimetype());

            WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());

            WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));

            WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

            for (int i = 0; i < chapters.Count; i++)
            {
                var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true));

                WritePubString(zipbook, entryName, GetEpubChapterFile(chapters[i], i));
            }
        }

        File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
        File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
    }

    /// <summary>
    /// Converts the stashed EPUB to MOBI with the external <c>ebook-convert</c>
    /// tool and copies the result to <c>MOBI_FILE_OUT</c>.
    /// </summary>
    /// <exception cref="Exception">The tool returned a non-zero exit code.</exception>
    void GenerateMobi()
    {
        if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);

        "Running ebook-convert for MOBI output".Dump();

        var pout = ProcessHelper.ProcExecute("ebook-convert", $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999");

        $"ebook-convert returned: {pout.ExitCode}".Dump();

        if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);

        File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
    }

    /// <summary>Adds an uncompressed entry <paramref name="n"/> with text <paramref name="c"/> to the archive.</summary>
    /// <param name="z">Target archive stream.</param>
    /// <param name="n">Entry name inside the archive.</param>
    /// <param name="c">Entry content.</param>
    /// <param name="e">Text encoding; UTF-8 when <c>null</c>.</param>
    void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
    {
        e ??= Encoding.UTF8;

        var entry = z.PutNextEntry(n);
        entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

        byte[] payload = e.GetBytes(c);
        z.Write(payload, 0, payload.Length);
    }

    /// <summary>Content of the EPUB <c>mimetype</c> entry.</summary>
    string GetEpubMimetype()
    {
        return "application/epub+zip";
    }

    /// <summary>Builds <c>META-INF/container.xml</c>, which points at <c>OEBPS/content.opf</c>.</summary>
    string GetEpubContainerXML()
    {
        XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

        var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null),
            new XElement(container + "container",
                new XAttribute("version", "1.0"),
                new XElement(container + "rootfiles",
                    new XElement(container + "rootfile",
                        new XAttribute("full-path", "OEBPS/content.opf"),
                        new XAttribute("media-type", "application/oebps-package+xml")))));

        using Utf8StringWriter writer = new Utf8StringWriter();

        doc.Save(writer);

        // Upper-case the encoding name in the XML declaration.
        var r = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");

        return r.Trim() + "\r\n";
    }

    /// <summary>
    /// Builds the OPF package document: metadata of the active book, a manifest
    /// item and a spine entry per chapter, and the NCX manifest item.
    /// </summary>
    string GetEpubContentOPF(List<Chapter> chapters)
    {
        XNamespace dc = "http://purl.org/dc/elements/1.1/";
        XNamespace opf = "http://www.idpf.org/2007/opf";

        var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

        var package = new XElement(opf + "package",
            new XAttribute("unique-identifier", "BookId"),
            new XAttribute("version", "2.0"));

        doc.Add(package);

        var meta = new XElement(opf + "metadata",
            new XAttribute(XNamespace.Xmlns + "dc", dc),
            new XAttribute(XNamespace.Xmlns + "opf", opf),
            new XElement(dc + "title", ACTIVE_BOOK.Title),
            new XElement(dc + "creator", ACTIVE_BOOK.Author),
            new XElement(dc + "identifier",
                new XAttribute("id", "BookId"),
                new XAttribute(opf + "scheme", "UUID"),
                "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
            new XElement(dc + "date",
                new XAttribute(opf + "event", "publication"),
                ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")),
            new XElement(dc + "date",
                new XAttribute(opf + "event", "modification"),
                DateTime.Now.ToString("yyyy'-'MM'-'dd")),
            new XElement(dc + "date",
                new XAttribute(opf + "event", "creation"),
                DateTime.Now.ToString("yyyy'-'MM'-'dd")),
            new XElement(dc + "language", ACTIVE_BOOK.Language),
            new XElement(dc + "identifier",
                new XAttribute(opf + "scheme", "UUID"),
                ACTIVE_BOOK.ID_CAL.ToString("D")),
            new XElement(opf + "meta",
                new XAttribute("content", "1.0"),
                new XAttribute("name", "Wordpress_eBook_scraper_version")),
            new XElement(opf + "meta",
                new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")),
                new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

        if (ACTIVE_BOOK.Series != null)
        {
            meta.Add(new XElement(opf + "meta",
                new XAttribute("content", ACTIVE_BOOK.Series),
                new XAttribute("name", "calibre:series")));

            meta.Add(new XElement(opf + "meta",
                new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
                new XAttribute("name", "calibre:series_index")));
        }

        package.Add(meta);

        var manifest = new XElement(opf + "manifest");

        for (int i = 0; i < chapters.Count; i++)
        {
            var safeName = Helper.Filenamify(chapters[i].title, true);

            manifest.Add(new XElement(opf + "item",
                new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(safeName))),
                new XAttribute("id", string.Format("x{0:000}_{1}.html", i + 1, safeName)),
                new XAttribute("media-type", "application/xhtml+xml")));
        }

        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", "toc.ncx"),
            new XAttribute("id", "ncx"),
            new XAttribute("media-type", "application/x-dtbncx+xml")));

        package.Add(manifest);

        var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));

        for (int i = 0; i < chapters.Count; i++)
        {
            spine.Add(new XElement(opf + "itemref",
                new XAttribute("idref", string.Format("x{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true)))));
        }

        package.Add(spine);

        package.Add(new XElement(opf + "guide"));

        using Utf8StringWriter writer = new Utf8StringWriter();

        doc.Save(writer);
        return writer.ToString();
    }

    /// <summary>Builds the NCX table of contents with one navPoint per chapter.</summary>
    string GetEpubTOC(List<Chapter> chapters)
    {
        // Fix: NCX elements belong to the DAISY NCX namespace; the OPF namespace
        // was used here before.
        XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

        var doc = new XDocument(
            new XDeclaration("1.0", "UTF-8", null),
            new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

        var root = new XElement(ncx + "ncx",
            new XAttribute("version", "2005-1"),
            new XElement(ncx + "head",
                new XElement(ncx + "meta",
                    new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                    new XAttribute("name", "dtb:uid")),
                new XElement(ncx + "meta",
                    new XAttribute("content", 1),
                    new XAttribute("name", "dtb:depth")),
                new XElement(ncx + "meta",
                    new XAttribute("content", 0),
                    new XAttribute("name", "dtb:totalPageCount")),
                new XElement(ncx + "meta",
                    new XAttribute("content", 0),
                    new XAttribute("name", "dtb:maxPageNumber"))));

        doc.Add(root);

        root.Add(new XElement(ncx + "docTitle",
            new XElement(ncx + "text", "Unknown")));

        var nav = new XElement(ncx + "navMap");

        for (int i = 0; i < chapters.Count; i++)
        {
            nav.Add(new XElement(ncx + "navPoint",
                new XAttribute("id", "navPoint-" + (i + 1)),
                new XAttribute("playOrder", i + 1),
                new XElement(ncx + "navLabel",
                    new XElement(ncx + "text", chapters[i].title)),
                new XElement(ncx + "content",
                    new XAttribute("src", string.Format("Text/{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true))))));
        }

        root.Add(nav);

        using Utf8StringWriter writer = new Utf8StringWriter();

        doc.Save(writer);
        return writer.ToString();
    }

    /// <summary>Builds the page stored in the EPUB for one chapter.</summary>
    /// <param name="chapter">Chapter whose title and cleaned body are written.</param>
    /// <param name="idx">Zero-based chapter index (currently unused).</param>
    string GetEpubChapterFile(Chapter chapter, int idx)
    {
        StringBuilder xml = new StringBuilder();

        // NOTE(review): every empty / blank literal in this method most likely
        // carried XHTML boilerplate (declaration, doctype, html/head/title/body
        // tags) that was stripped from the reviewed text; kept as seen - confirm
        // against version control.
        xml.AppendLine(@"");
        xml.AppendLine(@" ");
        xml.AppendLine(@"");
        xml.AppendLine(@"");
        xml.AppendLine("" + HtmlEntity.Entitize(chapter.title) + "");
        xml.AppendLine(@"");
        xml.AppendLine(@"");
        // NOTE(review): heading tag restored as <h1> (GenerateMobi builds its TOC from //h:h1) - confirm.
        xml.AppendLine("<h1>" + HtmlEntity.Entitize(chapter.title) + "</h1>");

        xml.AppendLine(chapter.chapter);

        xml.AppendLine(@"");
        xml.AppendLine(@"");

        return xml.ToString();
    }
}