1
0
WordpressEbookScraper2/Scraper/Scraper.cs

1098 lines
36 KiB
C#
Raw Normal View History

using System.Diagnostics;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;
2023-10-03 16:13:37 +02:00
using WordpressEboobScraper2.Proc;
namespace WordpressEboobScraper2.Scraper;
/** *************************************************** **/
/** **/
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
/** **/
/** *************************************************** **/
2023-10-03 16:13:37 +02:00
public class Scraper
{
// Book currently being processed; assigned by Generate()/Verify() before any path property below is read.
static EpubParameter ACTIVE_BOOK = null;

// Upper bound on the number of chapters FindChapters() collects for a single book.
const int LIMIT = 1500;

// Matches titles that start with a number followed by a dash ("12 - Something"); group "n" is the number.
// Static so the compiled expression is built once instead of once per Scraper instance.
static readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

// Downloaded pages, keyed by the lower-cased URL.
Dictionary<string, string> webCache = new();

// Per-book working directory (always ends with a directory separator).
string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;

// Final output locations, one sub folder per artifact type below Config.BASE_DIR_OUT.
string WCACHE_FILE => Path.Combine(Config.BASE_DIR_OUT, @"_cache", ACTIVE_BOOK.Foldername + @".xml");
string HTML_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"html", ACTIVE_BOOK.Foldername + @".html");
string EPUB_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"epub", ACTIVE_BOOK.Foldername + @".epub");
string MOBI_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"mobi", ACTIVE_BOOK.Foldername + @".mobi");

// Intermediate artifacts inside the stash folder.
string HTML_FILE_STASH => STASH_FOLDER + @"book.html";
string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip";
string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub";
string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi";

string QUERY_FOLDER => STASH_FOLDER + @"query" + Path.DirectorySeparatorChar; // full query result
string HTML_FOLDER => STASH_FOLDER + @"html" + Path.DirectorySeparatorChar; // unprocessed chapter code
string EPUB_FOLDER => STASH_FOLDER + @"epub" + Path.DirectorySeparatorChar; // processed epub chapter code
//----------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------------------------------------------//
/// <summary>
/// Builds every book listed in Config.BOOKS: prints a banner, prepares the folders,
/// collects the chapters and writes the HTML and EPUB output (plus MOBI when
/// Config.CONVERT_MOBI is set).
/// </summary>
public void Generate()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        for (var i = 0; i < 3; i++) "".Dump();

        var banner = $" [PROCESSING BOOK] {book.DisplayStr} ";
        var rule = new string('=', banner.Length);
        rule.Dump();
        banner.Dump();
        rule.Dump();

        for (var i = 0; i < 3; i++) "".Dump();

        Init();

        List<Chapter> chapters = FindChapters();
        WriteBookHTML(chapters);
        WriteEpub(chapters);

        if (Config.CONVERT_MOBI)
        {
            GenerateMobi();
        }
    }
}
/// <summary>
/// For every book in Config.BOOKS: prints a banner, loads the book's web cache and
/// compares the cached pages against the live site via VerifyChapters().
/// </summary>
public void Verify()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        for (var i = 0; i < 3; i++) "".Dump();

        var banner = $" [VERIFYING BOOK] {book.DisplayStr} ";
        var rule = new string('=', banner.Length);
        rule.Dump();
        banner.Dump();
        rule.Dump();

        for (var i = 0; i < 3; i++) "".Dump();

        LoadWebCache();
        VerifyChapters();
    }
}
/// <summary>
/// Prepares the file system for the active book: empties an existing stash folder
/// (files directly inside its sub folders plus the four book.* artifacts), creates the
/// stash and output folders and (re)initialises the web cache.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(dir).ToList())
            {
                File.Delete(file);
            }
        }

        if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH);
        if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);
        if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
        if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);
    }

    Directory.CreateDirectory(STASH_FOLDER);
    Directory.CreateDirectory(QUERY_FOLDER);
    Directory.CreateDirectory(HTML_FOLDER);
    Directory.CreateDirectory(EPUB_FOLDER);

    // Built with Path.Combine exactly like WCACHE_FILE / *_FILE_OUT, so the folders created
    // here are the ones those paths point into even if BASE_DIR_OUT has no trailing separator.
    Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, @"_cache"));
    Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, @"html"));
    Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, @"epub"));
    Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, @"mobi"));

    // Start every book with an empty cache; otherwise the pages of the previous book stay
    // in memory and SaveCache() writes them into this book's cache file.
    webCache = new();
    if (Config.USE_WEBCACHE) LoadWebCache();
}
/// <summary>
/// Writes all chapters into one HTML document in the stash folder and copies it to
/// HTML_FILE_OUT (overwriting an existing file).
/// </summary>
/// <param name="chapters">Chapters in reading order; each contributes an &lt;h1&gt; title and its body.</param>
void WriteBookHTML(List<Chapter> chapters)
{
    StringBuilder b = new StringBuilder();

    b.AppendLine("<!DOCTYPE html>");
    b.AppendLine("<html>");
    b.AppendLine("<body>");

    foreach (var currChapter in chapters)
    {
        b.AppendLine();
        b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(currChapter.chapter);
    }

    // Close in the reverse order of opening (the previous version emitted </html> before </body>).
    b.AppendLine("</body>");
    b.AppendLine("</html>");

    File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
    File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
/// <summary>
/// Serializes the in-memory web cache as an XML list of SerializableCacheEntry
/// (URL plus GZippedString content) to WCACHE_FILE.
/// </summary>
void SaveCache()
{
    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using var writer = new StreamWriter(WCACHE_FILE);

    var entries = new List<SerializableCacheEntry>();
    foreach (var pair in webCache)
    {
        entries.Add(new SerializableCacheEntry
        {
            URL = pair.Key,
            Content = new GZippedString { Value = pair.Value },
        });
    }

    serializer.Serialize(writer, entries);
}
/// <summary>
/// Replaces the in-memory web cache with the content of WCACHE_FILE.
/// When the file does not exist (or holds no list) the cache ends up empty, so entries
/// loaded for a previously processed book never leak into the active one.
/// </summary>
void LoadWebCache()
{
    var loaded = new Dictionary<string, string>();
    webCache = loaded;

    if (!File.Exists(WCACHE_FILE)) return;

    XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using TextReader reader = new StreamReader(WCACHE_FILE);
    var entries = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
    if (entries == null) return;

    foreach (var entry in entries)
    {
        // Indexer instead of ToDictionary: a duplicated URL in the file overwrites the
        // earlier entry instead of aborting the whole load with an ArgumentException.
        loaded[entry.URL] = entry.Content.Value;
    }
}
/// <summary>
/// Walks the book starting at ACTIVE_BOOK.StartURL, following the "next" URL reported by
/// ProcessChapter, until no URL is left or LIMIT chapters were collected.
/// Pages come from the web cache when present, otherwise they are downloaded and cached.
/// If the last page of the chain came from the cache and Config.DO_LIVE_RELOAD_OF_LAST is
/// set, that page is downloaded again and re-processed.
/// </summary>
/// <returns>The successfully processed chapters in reading order.</returns>
List<Chapter> FindChapters()
{
    var chapters = new List<Chapter>();

    using WebClient client = new WebClient();
    client.Encoding = Encoding.UTF8;

    var pending = new Stack<string>();
    pending.Push(ACTIVE_BOOK.StartURL);

    while (pending.Count > 0 && chapters.Count < LIMIT)
    {
        var url = pending.Pop();
        var cacheKey = url.ToLower();
        var curr = new Chapter { url = url };

        // Downloads the current page, stores it in the cache and persists the cache.
        void DownloadAndCache()
        {
            curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            webCache[cacheKey] = curr.queryResult;
            SaveCache();
        }

        var buffered = webCache.TryGetValue(cacheKey, out var cachedPage);
        if (buffered)
        {
            curr.queryResult = cachedPage;
            "*(loaded from webcache)*".Dump();
        }
        else
        {
            DownloadAndCache();
        }

        var outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var next_url);
        if (next_url != null) pending.Push(next_url);

        // A cached page without successor may be outdated: fetch the live version once more.
        if (buffered && pending.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
        {
            "".Dump();
            "//==> *(auto-reload from live)*".Dump();
            "".Dump();

            DownloadAndCache();

            outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var next_url_inner);
            if (next_url_inner != null) pending.Push(next_url_inner);
        }

        switch (outcome)
        {
            case ProcessResult.SuccessNormal:
                " ==> Chapter processed".Dump();
                chapters.Add(curr);
                OutputChapter(curr, chapters.Count);
                break;

            case ProcessResult.SkipChapter:
                " ==> Skip this chapter".Dump();
                break;

            case ProcessResult.ReachedEnd:
                " ==> End reached".Dump();
                break;
        }

        "".Dump();
    }

    return chapters;
}
/// <summary>
/// Follows the cached chapter chain of the active book and, for every cached page, downloads
/// the live page, processes both and reports differences in process result, queued URL,
/// next link, title and chapter content. For content differences three Hyperlinq actions are
/// printed: compare raw HTML, compare extracted text, and store the live page in the cache.
/// URLs that are not in the cache, or whose download fails, are skipped.
/// </summary>
void VerifyChapters()
{
    using WebClient client = new WebClient();
    client.Encoding = Encoding.UTF8;

    Stack<string> buffer = new Stack<string>();
    buffer.Push(ACTIVE_BOOK.StartURL);

    while (buffer.Any())
    {
        var url = buffer.Pop();

        Chapter curr_buffer = new Chapter() { url = url };
        Chapter curr_live = new Chapter() { url = url };

        // Only pages that exist in the cache can be verified.
        if (!webCache.ContainsKey(url.ToLower())) continue;

        try
        {
            curr_buffer.queryResult = webCache[url.ToLower()];
            curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
        }
        catch (Exception e)
        {
            $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
            continue;
        }

        var is_diff = false;

        var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
        var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);

        // The walk follows the cached chain, not the live one.
        if (next_buffer != null) buffer.Push(next_buffer);

        // Prefix for all messages of this chapter (the title is known after processing).
        var tag = $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

        if (r_buffer != r_live) { $"{tag} Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }

        // Fixed: this check used to repeat the process-result condition above, so a changed
        // forward URL was only reported when the process result differed as well.
        if (next_buffer != next_live) { $"{tag} Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }

        if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"{tag} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
        if (curr_buffer.title != curr_live.title) { $"{tag} Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }

        if (curr_buffer.chapter.Value != curr_live.chapter.Value)
        {
            var clean_buffer = GetChapterText(curr_buffer);
            var clean_live = GetChapterText(curr_live);

            // Markup-only changes are ignored; only differing text counts as a difference.
            if (clean_buffer.Trim() != clean_live.Trim())
            {
                $"{tag} Different content: ".Dump();

                new Hyperlinq(() =>
                {
                    var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                    var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                    File.WriteAllText(fa, curr_buffer.chapter.Value);
                    File.WriteAllText(fb, curr_live.chapter.Value);
                    Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                }, "[Compare Raw]").Dump();

                new Hyperlinq(() =>
                {
                    var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                    var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                    File.WriteAllText(fa, clean_buffer);
                    File.WriteAllText(fb, clean_live);
                    Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                }, "[Compare Text]").Dump();

                new Hyperlinq(() =>
                {
                    webCache[url.ToLower()] = curr_live.queryResult;
                    SaveCache();
                }, "[Save new version to webcache]").Dump();

                is_diff = true;
            }
        }

        if (!is_diff) $"{tag} OK - No differences".Dump();

        if (is_diff) "".Dump();
    }
}
/// <summary>
/// Compares two URLs while ignoring a leading "https://" / "http://" scheme.
/// </summary>
/// <param name="a">First URL, may be null.</param>
/// <param name="b">Second URL, may be null.</param>
/// <returns>
/// true when both values are equal (including both null) or differ only in the scheme prefix;
/// false otherwise, in particular when exactly one of them is null.
/// </returns>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;

    // Exactly one side is null here (e.g. one page version has no next link): not equal.
    // Without this guard the StartsWith calls below throw a NullReferenceException.
    if (a == null || b == null) return false;

    return StripScheme(a) == StripScheme(b);

    // Removes the scheme prefixes in the same order as before: first https://, then http://.
    static string StripScheme(string url)
    {
        if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
        if (url.StartsWith("http://", StringComparison.Ordinal)) url = url.Substring("http://".Length);
        return url;
    }
}
/// <summary>
/// Returns the chapter body converted to text by HTMLToText.ConvertHtml, trimmed and with
/// every run of whitespace collapsed to one space. Blank chapter content yields an empty string.
/// </summary>
string GetChapterText(Chapter c)
{
    var html = c.chapter.Value;
    if (string.IsNullOrWhiteSpace(html))
    {
        return string.Empty;
    }

    var text = HTMLToText.ConvertHtml(html).Trim();
    return Regex.Replace(text, @"\s+", " ");
}
2023-10-03 16:13:37 +02:00
/// <summary>
/// Parses the downloaded page in <c>curr.queryResult</c> and fills <c>curr</c>
/// (title, sourcecode, isEpilogue/isPrologue/isBonus, next, chapter).
/// The chapter body is cleaned in place: navigation markers, share/meta/ad blocks, repeated
/// title nodes, leading/trailing &lt;hr&gt; elements and author notes are removed.
/// </summary>
/// <param name="curr">Chapter to fill; <c>url</c> and <c>queryResult</c> must be set.</param>
/// <param name="backBuffer">Chapters accepted so far; used for loop, epilogue and book-jump detection.</param>
/// <param name="prt">Receives progress and warning messages.</param>
/// <param name="forwardQueueNext">URL of the following chapter, or null when none was found or it is already in <paramref name="backBuffer"/>.</param>
/// <returns>
/// ReachedEnd when a loop, a finished epilogue or a book jump is detected, SkipChapter for titles
/// starting with "not a chapter - ", SuccessNormal otherwise.
/// </returns>
/// <exception cref="InvalidOperationException">The content, chapter or title node cannot be located in the page.</exception>
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
{
    forwardQueueNext = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    #region Base

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    // Fail with a readable message instead of a NullReferenceException further down.
    if (nodeContent == null) throw new InvalidOperationException("Cannot locate the content node in " + curr.url);

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    if (nodeNav == null) nodeNav = nodeContent;

    // NOTE(review): expressions starting with "//" are kept exactly as before; they appear to be
    // evaluated against the whole document rather than the context node - confirm before changing.
    var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
    if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

    if (nodeChapter == null) throw new InvalidOperationException("Cannot locate the chapter node in " + curr.url);

    #endregion

    #region Title

    var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

    if (titleNode == null) throw new InvalidOperationException("Cannot locate the title node in " + curr.url);

    curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

    // Every spelling under which the title may be repeated inside the chapter body.
    var titles = new List<string>();
    titles.Add(curr.title);

    if (string.IsNullOrWhiteSpace(curr.title))
    {
        // A blank title has no "chapter N" prefix; the prefix logic below would end in
        // int.Parse("") and throw a FormatException.
        prt(" [!!] Warning cannot parse title");
    }
    else if (Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
    {
        // The header only says "Chapter N[ ...]": look for a more descriptive title in the body.
        var baseTitle = curr.title;

        var suffix = Helper.TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value);

        var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value;
        // Same prefix with the number normalised (leading zeros removed).
        var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value);

        titles.Add(prefix1);
        titles.Add(prefix2);

        var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
        var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
        var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
        var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));

        // Takes the text after the prefix as the new title, registers its spellings and
        // optionally removes the node from the document.
        void UseAltTitle(HtmlNode node, string prefix, bool removeNode)
        {
            var newtitle = Helper.TitleFmt(node.InnerText.Trim().Substring(prefix.Length));

            titles.Add(newtitle);
            curr.title = newtitle;

            titles.Add(prefix1 + newtitle);
            titles.Add(prefix2 + newtitle);
            titles.Add(prefix1 + " - " + newtitle);
            titles.Add(prefix2 + " - " + newtitle);

            if (removeNode)
            {
                node.Remove();
                prt(" > title node removed");
            }
        }

        if (altTitleNode1 != null)
        {
            UseAltTitle(altTitleNode1, prefix1, false);
        }
        else if (altTitleNode2 != null)
        {
            UseAltTitle(altTitleNode2, prefix2, false);
        }
        else if (altTitleNode3 != null)
        {
            UseAltTitle(altTitleNode3, prefix1, true);
        }
        else if (altTitleNode4 != null)
        {
            UseAltTitle(altTitleNode4, prefix2, true);
        }
        else if (suffix.Length > 2)
        {
            curr.title = suffix;
            titles.Add(suffix);
        }
        else
        {
            prt(" [!!] Warning cannot parse title");
        }

        // When the header title already carries text after the number, the complete header
        // title is restored here, overriding whatever was chosen above.
        if (suffix.Length > 2)
        {
            curr.title = baseTitle;
            titles.Add(baseTitle);
        }
    }

    // Strip a leading book name (plus separators) if enough title is left afterwards.
    if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
        var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
        while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
        tit_alt = tit_alt.Trim();
        if (tit_alt.Length>2) curr.title = tit_alt;
    }

    #endregion

    curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

    if (backBuffer.Any() && backBuffer.First().title == curr.title)
    {
        prt("[!] Book loop found - skipping entry");
        return ProcessResult.ReachedEnd; // prevent book II loop
    }

    curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
    curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
    curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

    // Fixed: the lower-cased title was compared with the mixed-case literal "epilogue II",
    // which can never match; compare case-insensitively instead.
    if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => string.Equals(t, "epilogue II", StringComparison.OrdinalIgnoreCase));

    if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
    {
        prt("[!] Epilogue found - skipping entry");
        return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
    }

    prt(curr.title + " (" + curr.url + ")");

    #region Next

    // Book jump: more than four regular chapters share one leading number ("3 - ...") and
    // the current title starts with a different number.
    var regularChapters = backBuffer.Where(b => !b.isSpecial).ToList();
    if (regularChapters.Count > 4 &&
        regularChapters.Select(bb => { var m = REX_NUMSTART.Match(bb.title); return m.Success ? m.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
        REX_NUMSTART.Match(regularChapters.First().title).Success &&
        REX_NUMSTART.Match(curr.title).Success &&
        REX_NUMSTART.Match(regularChapters.First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
    {
        prt("[!] Book jump found - skipping entry");
        return ProcessResult.ReachedEnd;
    }

    var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

    if (next == null)
        next = nodeContent.Descendants()
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = nodeNav.Descendants()
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
            .FirstOrDefault();

    if (next != null)
    {
        // Not every query above guarantees an href attribute; a link without one is treated
        // like a link to the page itself instead of throwing a NullReferenceException.
        var next_url = next.GetAttributeValue("href", string.Empty).Trim();

        if (next_url == "" || next_url == "." || next_url == "/" || next_url == "./")
        {
            next=null;
        }
        else
        {
            if (next_url.StartsWith("//")) next_url = "http:" + next_url;
            if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);
            if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

            curr.next = next_url;

            // Only queue URLs that were not visited yet.
            if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
            {
                forwardQueueNext = next_url;
            }
        }
    }

    if (next == null) prt(" > (!) No next URL found");

    #endregion

    #region Chapter marker

    var cpMarkerIdentities = new List<string>
    {
        "previousnext", "previouschapternextchapter",
        "firstnext", "firstchapternextchapter",
        "firstchapter", "previouslast",
        "previouschapterlastchapter",
        "previouschapter", "nextchapter", "lastchapter",
        "first", "previous", "next", "last"
    };

    foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    var alist = nodeChapter.SelectNodes("//a");
    if (alist != null)
    {
        foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
        {
            node.Remove();
            prt(" > Chapter marker removed");
        }
    }

    var plist = nodeChapter.SelectNodes("//p");
    if (plist != null)
    {
        foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
        {
            node.Remove();
            prt(" > Chapter marker removed");
        }
    }

    #endregion

    // Removes those of the given nodes that are direct children of the chapter node and
    // reports every node under the given label; other nodes are reported as skipped.
    void RemoveDirectChildren(IEnumerable<HtmlNode> nodes, string label)
    {
        if (nodes == null) return;

        foreach (var node in nodes)
        {
            if (nodeChapter.ChildNodes.Contains(node))
            {
                nodeChapter.RemoveChild(node);
                prt(" > " + label + " removed");
            }
            else
            {
                prt(" > " + label + " cannot be removed - skipping");
            }
        }
    }

    #region Share Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

    #endregion

    #region Meta Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

    #endregion

    #region Ad Blocking

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

    var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]");
    if (adNodes3 != null)
    {
        RemoveDirectChildren(adNodes3.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");
    }

    #endregion

    #region Title Paragraphs

    // First paragraph that merely repeats the title.
    var titleNodes1 = nodeChapter.SelectNodes(@"p");
    if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == Helper.TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
    {
        nodeChapter.RemoveChild(titleNodes1.First());
        prt(" > title node removed");
    }

    // Headings h1..h5 that repeat the title.
    for (int hval = 1; hval <= 5; hval++)
    {
        var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
        if (titleNodes2 != null)
        {
            foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower())))
            {
                if (nodeChapter.ChildNodes.Contains(node))
                {
                    nodeChapter.RemoveChild(node);
                    prt(" > title node removed");
                }
            }
        }
    }

    // Inline elements (<u>, <span>, <strong>) whose text could be the title.
    foreach (var xpath in new[] { @"//u", @"//span", @"//strong" })
    {
        var inlineNodes = nodeChapter.SelectNodes(xpath);
        if (inlineNodes != null && inlineNodes.Any())
        {
            foreach (var t in inlineNodes.Where(n => titles.Any(t => CouldBeTitle(n, t))))
            {
                t.Remove();
                prt(" > title node removed");
            }
        }
    }

    #endregion

    #region Remove <hr>'s

    // FirstOrDefault/LastOrDefault: a chapter without element children must not throw.
    HtmlNode FirstElementChild() => nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element);
    HtmlNode LastElementChild() => nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element);

    for (var hr = FirstElementChild(); hr != null && hr.Name.ToLower() == "hr"; hr = FirstElementChild())
    {
        nodeChapter.RemoveChild(hr);
        prt(" > header hr removed");
    }

    for (var hr = LastElementChild(); hr != null && hr.Name.ToLower() == "hr"; hr = LastElementChild())
    {
        nodeChapter.RemoveChild(hr);
        prt(" > footer hr removed");
    }

    #endregion

    #region Other (Author's Node)

    foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > authors note removed");
    }

    #endregion

    var chap_html = nodeChapter.InnerHtml.Trim();

    #region Fix raw <hr>

    // KOReader doesn't like <hr>
    chap_html = chap_html.Replace("<hr>", "<hr/>");

    #endregion

    curr.chapter = chap_html;

    if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

    return ProcessResult.SuccessNormal;
}
/// <summary>
/// Writes the three per-chapter stash files: the raw query result (QUERY_FOLDER), the extracted
/// source code (HTML_FOLDER) and a minimal HTML page with title and chapter body (EPUB_FOLDER).
/// </summary>
/// <param name="curr">The processed chapter.</param>
/// <param name="index">Chapter number used as three-digit file name prefix.</param>
void OutputChapter(Chapter curr, int index)
{
    var rawName = string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html";

    File.WriteAllText(QUERY_FOLDER + rawName, curr.queryResult);
    File.WriteAllText(HTML_FOLDER + rawName, curr.sourcecode, Encoding.UTF8);

    var page = new StringBuilder();
    page.AppendLine("<!DOCTYPE html>")
        .AppendLine("<html>")
        .AppendLine("<body>")
        .AppendLine()
        .AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>")
        .AppendLine()
        .AppendLine(curr.chapter)
        .AppendLine("</body>")
        .AppendLine("</html>");

    // Here the complete name (number, title and extension) goes through Filenamify.
    var epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
    File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
}
/// <summary>
/// Reduces the node's inner text to its lower-cased letters and digits, after removing the
/// entities &amp;gt;, &amp;lt;, &amp;amp;, &amp;quot; and &amp;nbsp;.
/// </summary>
string NakedIdentity(HtmlNode raw)
{
    var text = raw.InnerText.ToLower();

    foreach (var entity in new[] { "&gt;", "&lt;", "&amp;", "&quot;", "&nbsp;" })
    {
        text = text.Replace(entity, "");
    }

    var identity = new StringBuilder();
    foreach (var c in text)
    {
        if (char.IsLetterOrDigit(c))
        {
            identity.Append(char.ToLower(c));
        }
    }

    return identity.ToString().Trim().ToLower();
}
/// <summary>
/// Checks whether the node's text equals the given title after both went through
/// Helper.Striptease, were lower-cased, lost the characters ':', '-', '(' and ')' and had
/// every run of two or more whitespace characters removed.
/// </summary>
bool CouldBeTitle(HtmlNode n, string title)
{
    static string Normalize(string value)
    {
        value = value.ToLower();

        foreach (var ignored in new[] { ":", "-", "(", ")" })
        {
            value = value.Replace(ignored, "");
        }

        return Regex.Replace(value, @"\s\s+", "");
    }

    var nodeText = Normalize(Helper.Striptease(n));
    var titleText = Normalize(Helper.Striptease(title));

    return nodeText == titleText;
}
/// <summary>
/// Packs the book into ZIP_FILE_STASH (mimetype, container.xml, content.opf, toc.ncx and one
/// HTML file per chapter), copies the archive to EPUB_FILE_STASH and from there to EPUB_FILE_OUT.
/// </summary>
void WriteEpub(List<Chapter> chapters)
{
    foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
    {
        if (File.Exists(stale)) File.Delete(stale);
    }

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Block-scoped usings: the archive has to be closed before it is copied below.
    using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    using (var zipbook = new ZipOutputStream(fs))
    {
        WritePubString(zipbook, @"mimetype", GetEpubMimetype());

        WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());

        WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));

        WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

        var number = 0;
        foreach (var chapter in chapters)
        {
            number++;
            var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", number, Helper.Filenamify(chapter.title, true));
            WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, number - 1));
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
/// <summary>
/// Converts EPUB_FILE_STASH to MOBI_FILE_STASH by running "ebook-convert" and copies the result
/// to MOBI_FILE_OUT. Throws when the tool returns a non-zero exit code.
/// </summary>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH))
    {
        File.Delete(MOBI_FILE_STASH);
    }

    "Running ebook-convert for MOBI output".Dump();

    var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999";
    var pout = ProcessHelper.ProcExecute("ebook-convert", arguments);

    $"ebook-convert returned: {pout.ExitCode}".Dump();

    if (pout.ExitCode != 0)
    {
        throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);
    }

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
/// <summary>
/// Adds an entry named <paramref name="n"/> with compression level None to the zip stream
/// and writes the text <paramref name="c"/> into it.
/// </summary>
/// <param name="z">Target zip stream.</param>
/// <param name="n">Entry name inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Text encoding; UTF-8 when null.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    var encoding = e ?? Encoding.UTF8;

    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    byte[] payload = encoding.GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
/// <summary>Content of the archive's "mimetype" entry.</summary>
string GetEpubMimetype() => "application/epub+zip";
/// <summary>
/// Builds META-INF/container.xml, which points to OEBPS/content.opf as root file.
/// The result is trimmed, ends with CRLF and declares its encoding as upper-case "UTF-8".
/// </summary>
string GetEpubContainerXML()
{
    XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

    var rootfile = new XElement(container + "rootfile",
        new XAttribute("full-path", "OEBPS/content.opf"),
        new XAttribute("media-type", "application/oebps-package+xml"));

    var root = new XElement(container + "container",
        new XAttribute("version", "1.0"),
        new XElement(container + "rootfiles", rootfile));

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), root);

    using Utf8StringWriter writer = new Utf8StringWriter();
    doc.Save(writer);

    var xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
    return xml.Trim() + "\r\n";
}
/// <summary>
/// Builds OEBPS/content.opf (package version 2.0): metadata of the active book, a manifest
/// with one item per chapter plus toc.ncx, the spine in chapter order and an empty guide.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));
    doc.Add(package);

    // --- metadata ---
    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf));

    meta.Add(new XElement(dc + "title", ACTIVE_BOOK.Title));
    meta.Add(new XElement(dc + "creator", ACTIVE_BOOK.Author));
    meta.Add(new XElement(dc + "identifier",
        new XAttribute("id", "BookId"),
        new XAttribute(opf + "scheme", "UUID"),
        "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")));
    meta.Add(new XElement(dc + "date",
        new XAttribute(opf + "event", "publication"),
        ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")));
    meta.Add(new XElement(dc + "date",
        new XAttribute(opf + "event", "modification"),
        DateTime.Now.ToString("yyyy'-'MM'-'dd")));
    meta.Add(new XElement(dc + "date",
        new XAttribute(opf + "event", "creation"),
        DateTime.Now.ToString("yyyy'-'MM'-'dd")));
    meta.Add(new XElement(dc + "language", ACTIVE_BOOK.Language));
    meta.Add(new XElement(dc + "identifier",
        new XAttribute(opf + "scheme", "UUID"),
        ACTIVE_BOOK.ID_CAL.ToString("D")));
    meta.Add(new XElement(opf + "meta",
        new XAttribute("content", "1.0"),
        new XAttribute("name", "Wordpress_eBook_scraper_version")));
    meta.Add(new XElement(opf + "meta",
        new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")),
        new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));

        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }

    package.Add(meta);

    // --- manifest and spine, filled in one pass over the chapters ---
    var manifest = new XElement(opf + "manifest");
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));

    var number = 0;
    foreach (var chapter in chapters)
    {
        number++;

        var fileTitle = Helper.Filenamify(chapter.title, true);
        var itemId = string.Format("x{0:000}_{1}.html", number, fileTitle);

        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", number, Uri.EscapeUriString(fileTitle))),
            new XAttribute("id", itemId),
            new XAttribute("media-type", "application/xhtml+xml")));

        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", itemId)));
    }

    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));

    package.Add(manifest);
    package.Add(spine);
    package.Add(new XElement(opf + "guide"));

    using Utf8StringWriter writer = new Utf8StringWriter();
    doc.Save(writer);
    return writer.ToString();
}
/// <summary>
/// Builds OEBPS/toc.ncx: the head metadata, the document title and one navPoint per chapter
/// that links to the chapter file under Text/.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
string GetEpubTOC(List<Chapter> chapters)
{
    // Fixed: the elements used to be created in the OPF namespace
    // (http://www.idpf.org/2007/opf); NCX documents use the DAISY NCX namespace.
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            new XElement(ncx + "meta",
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));

    doc.Add(root);

    // Use the book title (as in content.opf) instead of the former constant "Unknown",
    // which is kept only as fallback.
    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", ACTIVE_BOOK.Title ?? "Unknown")));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        // Escaped exactly like the manifest href in content.opf so both refer to the same file.
        var src = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)));

        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", src))));
    }

    root.Add(nav);

    using Utf8StringWriter writer = new Utf8StringWriter();
    doc.Save(writer);
    return writer.ToString();
}
/// <summary>
/// Wraps one chapter into an XHTML 1.1 page: the entitized title is used for both
/// &lt;title&gt; and &lt;h1&gt;, followed by the chapter body.
/// </summary>
/// <param name="chapter">Chapter to render.</param>
/// <param name="idx">Chapter index; not used by this method.</param>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    var safeTitle = HtmlEntity.Entitize(chapter.title);

    var xml = new StringBuilder();
    xml.AppendLine(@"<?xml version=""1.0"" encoding=""utf-8""?>")
       .AppendLine(@"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ")
       .AppendLine(@"<html xmlns=""http://www.w3.org/1999/xhtml"">")
       .AppendLine(@"<head>")
       .AppendLine("<title>" + safeTitle + "</title>")
       .AppendLine(@"</head>")
       .AppendLine(@"<body>")
       .AppendLine("<h1>" + safeTitle + "</h1>")
       .AppendLine(chapter.chapter)
       .AppendLine(@"</body>")
       .AppendLine(@"</html>");

    return xml.ToString();
}
}