1098 lines
36 KiB
C#
1098 lines
36 KiB
C#
using System.Diagnostics;
|
|
using System.Net;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Xml.Linq;
|
|
using System.Xml.Serialization;
|
|
using HtmlAgilityPack;
|
|
using Ionic.Zip;
|
|
using WordpressEboobScraper2.Proc;
|
|
|
|
namespace WordpressEboobScraper2.Scraper;
|
|
|
|
/** *************************************************** **/
|
|
/** **/
|
|
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
|
|
/** **/
|
|
/** *************************************************** **/
|
|
|
|
public class Scraper
|
|
{
|
|
|
|
// Book currently being processed; every path property below is derived from it.
static EpubParameter ACTIVE_BOOK;

// Upper bound on the number of chapters collected for one book.
const int LIMIT = 1500;

// Matches titles that begin with a number followed by a dash; the number is captured as group "n".
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

// Downloaded page content keyed by lower-cased URL.
Dictionary<string, string> webCache = new Dictionary<string, string>();

// Per-book working directory (ends with a directory separator).
string STASH_FOLDER => $"{Config.BASE_DIR_STASH}{ACTIVE_BOOK.Foldername}{Path.DirectorySeparatorChar}";

// Final output files, one sub-directory per format below the output base directory.
string WCACHE_FILE   => Path.Combine(Config.BASE_DIR_OUT, @"_cache", $"{ACTIVE_BOOK.Foldername}.xml");
string HTML_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"html", $"{ACTIVE_BOOK.Foldername}.html");
string EPUB_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"epub", $"{ACTIVE_BOOK.Foldername}.epub");
string MOBI_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"mobi", $"{ACTIVE_BOOK.Foldername}.mobi");

// Intermediate files inside the per-book working directory.
string HTML_FILE_STASH => $"{STASH_FOLDER}book.html";
string ZIP_FILE_STASH  => $"{STASH_FOLDER}book.zip";
string EPUB_FILE_STASH => $"{STASH_FOLDER}book.epub";
string MOBI_FILE_STASH => $"{STASH_FOLDER}book.mobi";

// Per-chapter dump directories inside the working directory (each ends with a directory separator).
string QUERY_FOLDER => $"{STASH_FOLDER}query{Path.DirectorySeparatorChar}"; // full query result
string HTML_FOLDER  => $"{STASH_FOLDER}html{Path.DirectorySeparatorChar}";  // unprocessed chapter code
string EPUB_FOLDER  => $"{STASH_FOLDER}epub{Path.DirectorySeparatorChar}";  // processed epub chapter code
|
|
|
|
//----------------------------------------------------------------------------------------------------//
|
|
|
|
//----------------------------------------------------------------------------------------------------//
|
|
|
|
/// <summary>
/// Runs the full pipeline for every book in <c>Config.BOOKS</c>: prepare the working
/// directories, collect the chapters, then write the HTML book, the EPUB and
/// (when <c>Config.CONVERT_MOBI</c> is set) the MOBI file.
/// </summary>
public void Generate()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, the headline framed by '=' rules, three blank lines.
        var headline = $" [PROCESSING BOOK] {book.DisplayStr} ";
        var rule = new string('=', headline.Length);

        for (var i = 0; i < 3; i++) "".Dump();
        rule.Dump();
        headline.Dump();
        rule.Dump();
        for (var i = 0; i < 3; i++) "".Dump();

        Init();

        var chapters = FindChapters();

        WriteBookHTML(chapters);
        WriteEpub(chapters);

        if (Config.CONVERT_MOBI)
        {
            GenerateMobi();
        }
    }
}
|
|
|
|
/// <summary>
/// For every book in <c>Config.BOOKS</c>: loads the book's web cache and compares the
/// cached chapters against the live pages.
/// </summary>
public void Verify()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, the headline framed by '=' rules, three blank lines.
        var headline = $" [VERIFYING BOOK] {book.DisplayStr} ";
        var rule = new string('=', headline.Length);

        for (var i = 0; i < 3; i++) "".Dump();
        rule.Dump();
        headline.Dump();
        rule.Dump();
        for (var i = 0; i < 3; i++) "".Dump();

        LoadWebCache();

        VerifyChapters();
    }
}
|
|
|
|
/// <summary>
/// Prepares the file system for the active book: clears leftovers from a previous run in the
/// stash folder, creates the stash and output directories and, when <c>Config.USE_WEBCACHE</c>
/// is set, loads the web cache.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Empty every sub-directory of the stash (files only, the directories themselves stay).
        foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(dir).ToList())
            {
                File.Delete(file);
            }
        }

        foreach (var stale in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
        {
            if (File.Exists(stale)) File.Delete(stale);
        }
    }

    Directory.CreateDirectory(STASH_FOLDER);
    Directory.CreateDirectory(QUERY_FOLDER);
    Directory.CreateDirectory(HTML_FOLDER);
    Directory.CreateDirectory(EPUB_FOLDER);

    // The output files (WCACHE_FILE, HTML_FILE_OUT, ...) are built with Path.Combine, so the
    // directories must be built the same way; plain concatenation yields a different path
    // whenever BASE_DIR_OUT has no trailing separator.
    foreach (var sub in new[] { @"_cache", @"html", @"epub", @"mobi" })
    {
        Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, sub));
    }

    if (Config.USE_WEBCACHE) LoadWebCache();
}
|
|
|
|
/// <summary>
/// Writes all chapters into one HTML document (an <c>h1</c> title followed by the chapter
/// markup for each chapter), stores it in the stash and copies it to the output folder.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteBookHTML(List<Chapter> chapters)
{
    StringBuilder b = new StringBuilder();

    b.AppendLine("<!DOCTYPE html>");
    b.AppendLine("<html>");
    b.AppendLine("<body>");

    foreach (var currChapter in chapters)
    {
        b.AppendLine();
        b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(currChapter.chapter);
    }

    // Close in reverse order of opening: body first, then html.
    b.AppendLine("</body>");
    b.AppendLine("</html>");

    File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
    File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
|
|
|
|
/// <summary>
/// Serializes the in-memory web cache as XML into the cache file of the active book.
/// Each entry stores the URL and the page content wrapped in a <c>GZippedString</c>.
/// </summary>
void SaveCache()
{
    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using var writer = new StreamWriter(WCACHE_FILE);

    var entries = new List<SerializableCacheEntry>();
    foreach (var pair in webCache)
    {
        entries.Add(new SerializableCacheEntry
        {
            URL = pair.Key,
            Content = new GZippedString { Value = pair.Value },
        });
    }

    serializer.Serialize(writer, entries);
}
|
|
|
|
/// <summary>
/// Replaces the in-memory web cache with the content of the active book's cache file.
/// When the file does not exist the cache is reset to empty, so entries loaded for a
/// previously processed book are not carried over (and later saved) into this book's cache.
/// </summary>
void LoadWebCache()
{
    if (!File.Exists(WCACHE_FILE))
    {
        webCache = new Dictionary<string, string>();
        return;
    }

    XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using TextReader reader = new StreamReader(WCACHE_FILE);

    var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);

    webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
}
|
|
|
|
/// <summary>
/// Walks the book starting at <c>ACTIVE_BOOK.StartURL</c>, following the "next" link that
/// <see cref="ProcessChapter"/> reports, until no further URL is queued or <c>LIMIT</c>
/// chapters were collected. Pages come from the web cache when present, otherwise they are
/// downloaded and the cache is saved. Accepted chapters are written out via
/// <see cref="OutputChapter"/>.
/// </summary>
/// <returns>The accepted chapters in reading order.</returns>
List<Chapter> FindChapters()
{
    var collected = new List<Chapter>();

    using var http = new WebClient();
    http.Encoding = Encoding.UTF8;

    var pending = new Stack<string>();
    pending.Push(ACTIVE_BOOK.StartURL);

    // Downloads the chapter's page, stores it in the web cache and persists the cache.
    void RefreshFromLive(Chapter target)
    {
        target.queryResult = http.DownloadString(Uri.UnescapeDataString(target.url));
        webCache[target.url.ToLower()] = target.queryResult;
        SaveCache();
    }

    while (pending.Count > 0 && collected.Count < LIMIT)
    {
        var url = pending.Pop();
        var curr = new Chapter { url = url };

        var fromCache = webCache.TryGetValue(url.ToLower(), out var cachedPage);
        if (fromCache)
        {
            curr.queryResult = cachedPage;
            "*(loaded from webcache)*".Dump();
        }
        else
        {
            RefreshFromLive(curr);
        }

        var outcome = ProcessChapter(curr, collected, s => s.Dump(), out var next_url);
        if (next_url != null) pending.Push(next_url);

        // The cached copy of the last known chapter has no successor yet: re-fetch it live,
        // because a newer chapter may have been published since it was cached.
        if (fromCache && pending.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
        {
            "".Dump();
            "//==> *(auto-reload from live)*".Dump();
            "".Dump();

            RefreshFromLive(curr);

            outcome = ProcessChapter(curr, collected, s => s.Dump(), out var next_url_inner);
            if (next_url_inner != null) pending.Push(next_url_inner);
        }

        if (outcome == ProcessResult.SuccessNormal)
        {
            " ==> Chapter processed".Dump();
            collected.Add(curr);
            OutputChapter(curr, collected.Count);
        }
        else if (outcome == ProcessResult.SkipChapter)
        {
            " ==> Skip this chapter".Dump();
        }
        else if (outcome == ProcessResult.ReachedEnd)
        {
            " ==> End reached".Dump();
        }

        "".Dump();
    }

    return collected;
}
|
|
|
|
/// <summary>
/// Walks the cached chapters of the active book (following the "next" link of the cached
/// version), downloads each page again and reports every difference between the cached and
/// the live version: process result, queued URL, next link, title and chapter text.
/// Traversal stops at the first URL that is not in the web cache.
/// </summary>
void VerifyChapters()
{
    using WebClient client = new WebClient();

    client.Encoding = Encoding.UTF8;
    Stack<string> buffer = new Stack<string>();
    buffer.Push(ACTIVE_BOOK.StartURL);

    // Relaxedurleq dereferences its arguments; either side is null when that version of the
    // page has no next link, so guard before delegating.
    bool SameUrl(string a, string b) => a == b || (a != null && b != null && Relaxedurleq(a, b));

    // Writes both texts to temp files and opens them in the configured compare program.
    void LaunchCompare(string bufferText, string liveText)
    {
        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
        File.WriteAllText(fa, bufferText);
        File.WriteAllText(fb, liveText);
        Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
    }

    while (buffer.Any())
    {
        var url = buffer.Pop();

        // Only chapters that exist in the cache can be verified.
        if (!webCache.TryGetValue(url.ToLower(), out var cachedPage)) continue;

        Chapter curr_buffer = new Chapter() { url = url };
        Chapter curr_live = new Chapter() { url = url };

        // Message prefix; evaluated lazily because the title is only known after processing.
        string Tag() => $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

        try
        {
            curr_buffer.queryResult = cachedPage;
            curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
        }
        catch (Exception e)
        {
            $"{Tag()} Live reload resulted in exception: {e.Message}".Dump();
            continue;
        }

        var is_diff = false;

        var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
        var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);

        if (next_buffer != null) buffer.Push(next_buffer);

        if (r_buffer != r_live) { $"{Tag()} Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }

        // Compare the queued URLs themselves (previously this re-tested the process results).
        if (!SameUrl(next_buffer, next_live)) { $"{Tag()} Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }

        if (!SameUrl(curr_buffer.next, curr_live.next)) { $"{Tag()} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
        if (curr_buffer.title != curr_live.title) { $"{Tag()} Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }

        if (curr_buffer.chapter.Value != curr_live.chapter.Value)
        {
            // Markup differs; only report when the visible text differs as well.
            var clean_buffer = GetChapterText(curr_buffer);
            var clean_live = GetChapterText(curr_live);

            if (clean_buffer.Trim() != clean_live.Trim())
            {
                $"{Tag()} Different content: ".Dump();

                new Hyperlinq(() => LaunchCompare(curr_buffer.chapter.Value, curr_live.chapter.Value), "[Compare Raw]").Dump();

                new Hyperlinq(() => LaunchCompare(clean_buffer, clean_live), "[Compare Text]").Dump();

                // NOTE(review): this action uses whatever webCache/ACTIVE_BOOK are current when it
                // is invoked - confirm it cannot run after processing has moved on to another book.
                new Hyperlinq(() =>
                {
                    webCache[url.ToLower()] = curr_live.queryResult;
                    SaveCache();
                }, "[Save new version to webcache]").Dump();

                is_diff = true;
            }
        }

        if (!is_diff) $"{Tag()} OK - No differences".Dump();

        if (is_diff) "".Dump();
    }
}
|
|
|
|
/// <summary>
/// Compares two URLs while ignoring a leading <c>https://</c> / <c>http://</c> scheme.
/// </summary>
/// <param name="a">First URL; may be null.</param>
/// <param name="b">Second URL; may be null.</param>
/// <returns>
/// True when both values are equal (including both null) or equal after the scheme prefix is
/// stripped; false otherwise, in particular when exactly one value is null.
/// </returns>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;

    // Exactly one side is null here (both-null was handled above): never equal.
    if (a == null || b == null) return false;

    return StripScheme(a) == StripScheme(b);

    // Removes a leading "https://" and then a leading "http://" (same order as before).
    static string StripScheme(string url)
    {
        if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
        if (url.StartsWith("http://", StringComparison.Ordinal)) url = url.Substring("http://".Length);
        return url;
    }
}
|
|
|
|
/// <summary>
/// Converts the chapter markup to text via <c>HTMLToText.ConvertHtml</c>, trims it and
/// collapses every whitespace run into a single space.
/// </summary>
/// <param name="c">Chapter whose <c>chapter.Value</c> holds the markup.</param>
/// <returns>The normalized text, or an empty string when the markup is null/blank.</returns>
string GetChapterText(Chapter c)
{
    if (string.IsNullOrWhiteSpace(c.chapter.Value))
    {
        return string.Empty;
    }

    var text = HTMLToText.ConvertHtml(c.chapter.Value).Trim();

    return Regex.Replace(text, @"\s+", " ");
}
|
|
|
|
/// <summary>
/// Parses the page stored in <c>curr.queryResult</c> and fills in the chapter's title,
/// source code, classification flags, next link and cleaned chapter markup.
/// </summary>
/// <param name="curr">Chapter to process; <c>queryResult</c> and <c>url</c> must be set.</param>
/// <param name="backBuffer">Chapters accepted so far; used to detect loops, a finished book and book jumps.</param>
/// <param name="prt">Receives progress and warning messages.</param>
/// <param name="forwardQueueNext">
/// URL of the following chapter when one was found whose URL is not already in
/// <paramref name="backBuffer"/>; otherwise null.
/// </param>
/// <returns>
/// <c>ReachedEnd</c> when a loop, a passed epilogue or a book jump is detected,
/// <c>SkipChapter</c> when the title starts with "not a chapter - ", otherwise <c>SuccessNormal</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The page contains no recognizable content, title or chapter node.
/// </exception>
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
{
    forwardQueueNext = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    #region Base

    // NOTE(review): XPath expressions starting with "//" are evaluated against the whole
    // document even when called on a sub-node - presumably intended here; confirm.

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    // Everything below dereferences nodeContent; fail with a message that names the page.
    if (nodeContent == null) throw new InvalidOperationException("No content node found in '" + curr.url + "'");

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    if (nodeNav == null) nodeNav = nodeContent;

    var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
    if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

    // Not checked right away: the end-of-book exits further down must still work for pages
    // without a chapter body, so the check runs only where the node is actually needed.
    void RequireChapterNode()
    {
        if (nodeChapter == null) throw new InvalidOperationException("No chapter body node found in '" + curr.url + "'");
    }

    #endregion

    #region Title

    var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

    if (titleNode == null) throw new InvalidOperationException("No title node found in '" + curr.url + "'");

    curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

    // All spellings under which the title may be repeated inside the chapter body.
    var titles = new List<string>();
    titles.Add(curr.title);

    if (string.IsNullOrWhiteSpace(curr.title))
    {
        // A blank title carries no chapter number to search the body with
        // (parsing the missing number used to throw).
        prt(" [!!] Warning cannot parse title");
    }
    else if (Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
    {
        // Title is only "Chapter N ..." - look for a more descriptive title in the body.
        RequireChapterNode();

        var baseTitle = curr.title;
        var lowerTitle = curr.title.ToLower();

        var suffix = Helper.TitleFmt(Regex.Match(lowerTitle, @"^chapter [0-9]+(.*)$").Groups[1].Value);

        var numMatch = Regex.Match(lowerTitle, @"^(chapter) ([0-9]+)");

        // prefix1: number as written; prefix2: number without leading zeros.
        var prefix1 = numMatch.Groups[0].Value;
        var prefix2 = int.TryParse(numMatch.Groups[2].Value, out var chapterNo) ? "chapter " + chapterNo : prefix1;

        titles.Add(prefix1);
        titles.Add(prefix2);

        var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
        var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
        var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
        var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));

        // Takes the text after the matched prefix as new title, registers all spellings and
        // optionally removes the node the title came from.
        void AdoptAltTitle(HtmlNode source, string usedPrefix, bool removeSource)
        {
            var newtitle = Helper.TitleFmt(source.InnerText.Trim().Substring(usedPrefix.Length));
            titles.Add(newtitle);
            curr.title = newtitle;
            titles.Add(prefix1 + newtitle);
            titles.Add(prefix2 + newtitle);
            titles.Add(prefix1 + " - " + newtitle);
            titles.Add(prefix2 + " - " + newtitle);

            if (removeSource)
            {
                source.Remove();
                prt(" > title node removed");
            }
        }

        if (altTitleNode1 != null)
        {
            AdoptAltTitle(altTitleNode1, prefix1, false);
        }
        else if (altTitleNode2 != null)
        {
            AdoptAltTitle(altTitleNode2, prefix2, false);
        }
        else if (altTitleNode3 != null)
        {
            AdoptAltTitle(altTitleNode3, prefix1, true);
        }
        else if (altTitleNode4 != null)
        {
            AdoptAltTitle(altTitleNode4, prefix2, true);
        }
        else if (suffix.Length > 2)
        {
            curr.title = suffix;
            titles.Add(suffix);
        }
        else
        {
            prt(" [!!] Warning cannot parse title");
        }

        // NOTE(review): a usable suffix restores the original title and overrides whatever the
        // branches above selected - kept as in the original; confirm this is intended.
        if (suffix.Length > 2)
        {
            curr.title = baseTitle;
            titles.Add(baseTitle);
        }
    }

    // Strip a leading book name (plus separator characters) from the title.
    if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
        var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
        while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
        tit_alt = tit_alt.Trim();
        if (tit_alt.Length>2) curr.title = tit_alt;
    }

    #endregion

    curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

    if (backBuffer.Any() && backBuffer.First().title == curr.title)
    {
        prt("[!] Book loop found - skipping entry");
        return ProcessResult.ReachedEnd; // prevent book II loop
    }

    curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
    curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
    curr.isBonus    = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

    // The title is lower-cased before comparing, so the literal must be lower case as well.
    if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue ii");

    if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
    {
        prt("[!] Epilogue found - skipping entry");
        return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
    }

    prt(curr.title + " (" + curr.url + ")");

    #region Next

    // Book jump: all regular chapters so far share one leading number and the current
    // title starts with a different one.
    var regularChapters = backBuffer.Where(b => !b.isSpecial).ToList();

    if (regularChapters.Count > 4 &&
        regularChapters.Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
        REX_NUMSTART.Match(regularChapters[0].title).Success &&
        REX_NUMSTART.Match(curr.title).Success &&
        REX_NUMSTART.Match(regularChapters[0].title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
    {
        prt("[!] Book jump found - skipping entry");
        return ProcessResult.ReachedEnd;
    }

    // Try progressively broader strategies to locate the link to the next chapter.
    var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

    if (next == null)
        next = nodeContent.Descendants()
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = nodeNav.Descendants()
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(p => p.Name.ToLower() == "a")
            .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
            .FirstOrDefault();

    if (next != null)
    {
        var next_url = next.Attributes["href"].Value.Trim();

        if (next_url == "." || next_url == "/" || next_url == "./")
        {
            // Link points back to the site itself - treat as "no next chapter".
            next=null;
        }
        else
        {
            if (next_url.StartsWith("//")) next_url = "http:" + next_url;

            if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);

            if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

            curr.next = next_url;
            if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
            {
                forwardQueueNext = next_url;
            }
        }
    }

    if (next == null) prt(" > (!) No next URL found");

    #endregion

    // From here on the chapter body is modified in place.
    RequireChapterNode();

    #region Chapter marker

    var cpMarkerIdentities = new List<string>
    {
        "previousnext", "previouschapternextchapter",
        "firstnext", "firstchapternextchapter",
        "firstchapter", "previouslast",

        "previouschapterlastchapter",

        "previouschapter", "nextchapter", "lastchapter",

        "first", "previous", "next", "last"
    };

    foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    var alist = nodeChapter.SelectNodes("//a");
    if (alist != null)
    {
        foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
        {
            node.Remove();
            prt(" > Chapter marker removed");
        }
    }

    var plist = nodeChapter.SelectNodes("//p");
    if (plist != null)
    {
        foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
        {
            node.Remove();
            prt(" > Chapter marker removed");
        }
    }

    #endregion

    // Removes every candidate that is a direct child of the chapter node and reports the
    // ones that are not; a null candidate list (no XPath match) is ignored.
    void RemoveDirectChildren(IEnumerable<HtmlNode> candidates, string label)
    {
        if (candidates == null) return;

        foreach (var candidate in candidates)
        {
            if (nodeChapter.ChildNodes.Contains(candidate))
            {
                nodeChapter.RemoveChild(candidate);
                prt(" > " + label + " removed");
            }
            else
            {
                prt(" > " + label + " cannot be removed - skipping");
            }
        }
    }

    #region Share Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

    #endregion

    #region Meta Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

    #endregion

    #region Ad Blocking

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

    #endregion

    #region Title Paragraphs

    var titleNodes1 = nodeChapter.SelectNodes(@"p");
    if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == Helper.TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
    {
        nodeChapter.RemoveChild(titleNodes1.First());
        prt(" > title node removed");
    }

    for (int hval = 1; hval <= 5; hval++)
    {
        var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
        if (titleNodes2 != null)
        {
            foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower())))
            {
                if (nodeChapter.ChildNodes.Contains(node))
                {
                    nodeChapter.RemoveChild(node);
                    prt(" > title node removed");
                }
            }
        }
    }

    var titleNodes3 = nodeChapter.SelectNodes(@"//u");
    if (titleNodes3 != null && titleNodes3.Any())
    {
        var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t)));
        foreach (var t in xTitleNodes3)
        {
            t.Remove();
            prt(" > title node removed");
        }
    }

    var titleNodes4 = nodeChapter.SelectNodes(@"//span");
    if (titleNodes4 != null && titleNodes4.Any())
    {
        var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t)));
        foreach (var t in xTitleNodes4)
        {
            t.Remove();
            prt(" > title node removed");
        }
    }

    var titleNodes5 = nodeChapter.SelectNodes(@"//strong");
    if (titleNodes5 != null && titleNodes5.Any())
    {
        var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t)));
        foreach (var t in xTitleNodes5)
        {
            t.Remove();
            prt(" > title node removed");
        }
    }

    #endregion

    #region Remove <hr>'s

    // FirstOrDefault/LastOrDefault: the chapter may contain no element children at all
    // (or only <hr> elements, which are all removed here).
    HtmlNode FirstElement() => nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element);
    HtmlNode LastElement() => nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element);

    for (var leading = FirstElement(); leading != null && leading.Name.ToLower() == "hr"; leading = FirstElement())
    {
        nodeChapter.RemoveChild(leading);
        prt(" > header hr removed");
    }

    for (var trailing = LastElement(); trailing != null && trailing.Name.ToLower() == "hr"; trailing = LastElement())
    {
        nodeChapter.RemoveChild(trailing);
        prt(" > footer hr removed");
    }

    #endregion

    #region Other (Author's Node)

    foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > authors note removed");
    }

    #endregion

    var chap_html = nodeChapter.InnerHtml.Trim();

    #region Fix raw <hr>
    // KOReader doesn't like <hr>

    chap_html = chap_html.Replace("<hr>", "<hr/>");

    #endregion

    curr.chapter = chap_html;

    if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

    return ProcessResult.SuccessNormal;
}
|
|
|
|
/// <summary>
/// Writes the three per-chapter stash files: the raw query result, the extracted source
/// code and a standalone HTML page (title heading plus chapter markup) for the EPUB folder.
/// </summary>
/// <param name="curr">Processed chapter.</param>
/// <param name="index">Chapter number used as zero-padded (three digits) file name prefix.</param>
void OutputChapter(Chapter curr, int index)
{
    var numberedName = $"{index:000}_{Helper.Filenamify(curr.title)}.html";

    File.WriteAllText(QUERY_FOLDER + numberedName, curr.queryResult);

    File.WriteAllText(HTML_FOLDER + numberedName, curr.sourcecode, Encoding.UTF8);

    var page = new StringBuilder();
    page.AppendLine("<!DOCTYPE html>")
        .AppendLine("<html>")
        .AppendLine("<body>")
        .AppendLine()
        .AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>")
        .AppendLine()
        .AppendLine(curr.chapter)
        .AppendLine("</body>")
        .AppendLine("</html>");

    // Here the complete "NNN_title.html" string is passed through Filenamify, not just the title.
    var epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
    File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
}
|
|
|
|
/// <summary>
/// Reduces a node's inner text to a comparison key: lower-cased, with the HTML entities
/// <c>&amp;gt; &amp;lt; &amp;amp; &amp;quot; &amp;nbsp;</c> removed and every character that
/// is not a letter or digit dropped (e.g. "&amp;lt; Previous Chapter" becomes "previouschapter").
/// </summary>
/// <param name="raw">Node whose <c>InnerText</c> is normalized.</param>
/// <returns>The letters-and-digits-only key.</returns>
string NakedIdentity(HtmlNode raw)
{
    // The entities have to be removed by name: filtering alone would keep their
    // letters ("gt", "nbsp", ...) in the key.
    var text = raw
        .InnerText
        .ToLower()
        .Replace("&gt;", "")
        .Replace("&lt;", "")
        .Replace("&amp;", "")
        .Replace("&quot;", "")
        .Replace("&nbsp;", "");

    return string.Join(string.Empty,
            text
                .ToCharArray()
                .Where(c => char.IsLetterOrDigit(c))
                .Select(c => char.ToLower(c))).Trim()
        .ToLower();
}
|
|
|
|
/// <summary>
/// Checks whether a node's text equals the given title after both were passed through
/// <c>Helper.Striptease</c>, lower-cased, stripped of <c>: - ( )</c> and of every run of
/// two or more whitespace characters.
/// </summary>
/// <param name="n">Candidate node.</param>
/// <param name="title">Title to compare against.</param>
/// <returns>True when both normalized texts are identical.</returns>
bool CouldBeTitle(HtmlNode n, string title)
{
    static string Normalize(string text)
    {
        var lowered = text.ToLower();
        var bare = lowered.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", "");
        return Regex.Replace(bare, @"\s\s+", "");
    }

    var nodeText = Helper.Striptease(n);
    var titleText = Helper.Striptease(title);

    return Normalize(nodeText) == Normalize(titleText);
}
|
|
|
|
/// <summary>
/// Builds the EPUB container as a zip file in the stash (mimetype, container.xml,
/// content.opf, toc.ncx and one HTML file per chapter), copies it to the stash EPUB name
/// and then to the output folder.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteEpub(List<Chapter> chapters)
{
    foreach (var leftover in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
    {
        if (File.Exists(leftover)) File.Delete(leftover);
    }

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (var fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    using (var zipbook = new ZipOutputStream(fs))
    {
        WritePubString(zipbook, @"mimetype", GetEpubMimetype());
        WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
        WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
        WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

        // Entry names are numbered from 1, GetEpubChapterFile receives the zero-based index.
        var zeroBased = 0;
        foreach (var chapter in chapters)
        {
            var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", zeroBased + 1, Helper.Filenamify(chapter.title, true));
            WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, zeroBased));
            zeroBased++;
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);

    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
|
|
|
|
/// <summary>
/// Converts the stash EPUB to MOBI by running the external <c>ebook-convert</c> program
/// and copies the result to the output folder.
/// </summary>
/// <exception cref="Exception">
/// <c>ebook-convert</c> exited with a non-zero code; the message carries the exit code and
/// the combined process output.
/// </exception>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH))
    {
        File.Delete(MOBI_FILE_STASH);
    }

    "Running ebook-convert for MOBI output".Dump();

    var conversion = ProcessHelper.ProcExecute(
        "ebook-convert",
        $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999");

    $"ebook-convert returned: {conversion.ExitCode}".Dump();

    if (conversion.ExitCode != 0)
    {
        throw new Exception(conversion.ExitCode + "\n\n\n\n" + conversion.StdCombined);
    }

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
|
|
|
|
/// <summary>
/// Adds one text entry to the zip stream. The entry's compression level is set to
/// <c>None</c>.
/// </summary>
/// <param name="z">Zip stream that receives the entry.</param>
/// <param name="n">Entry name inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Encoding for the content; UTF-8 when null.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    var encoding = e ?? Encoding.UTF8;

    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    var payload = encoding.GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
|
|
|
|
/// <summary>Content of the EPUB <c>mimetype</c> entry.</summary>
/// <returns>The literal <c>application/epub+zip</c>.</returns>
string GetEpubMimetype() => "application/epub+zip";
|
|
|
|
/// <summary>
/// Builds <c>META-INF/container.xml</c>: a <c>container</c> document whose single
/// <c>rootfile</c> points to <c>OEBPS/content.opf</c>.
/// </summary>
/// <returns>The XML text with an upper-case <c>UTF-8</c> encoding declaration, terminated by CRLF.</returns>
string GetEpubContainerXML()
{
    XNamespace ns = "urn:oasis:names:tc:opendocument:xmlns:container";

    var rootfile = new XElement(ns + "rootfile",
        new XAttribute("full-path", "OEBPS/content.opf"),
        new XAttribute("media-type", "application/oebps-package+xml"));

    var container = new XElement(ns + "container",
        new XAttribute("version", "1.0"),
        new XElement(ns + "rootfiles", rootfile));

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), container);

    using var writer = new Utf8StringWriter();
    doc.Save(writer);

    // Force the upper-case spelling of the encoding name in the declaration.
    var xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");

    return xml.Trim() + "\r\n";
}
|
|
|
|
/// <summary>
/// Builds the OPF package document (version 2.0) for the active book: Dublin Core
/// metadata, a manifest with one item per chapter plus <c>toc.ncx</c>, a spine listing
/// the chapters in order, and an empty guide element.
/// </summary>
/// <param name="chapters">Chapters in reading order; the position (1-based) becomes the file-name prefix.</param>
/// <returns>The serialized package XML.</returns>
string GetEpubContentOPF(List<Chapter> chapters)
{
    // All dates in the package are machine-readable, so they are formatted with the
    // invariant culture; the current culture could otherwise substitute another calendar.
    const string dateFormat = "yyyy'-'MM'-'dd";
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    // Sample the clock once so every "now" value in the document agrees.
    string today = DateTime.Now.ToString(dateFormat, invariant);

    // File-name part of every chapter, computed once and shared by manifest and spine.
    var chapterNames = new List<string>(chapters.Count);
    for (int i = 0; i < chapters.Count; i++)
    {
        chapterNames.Add(Helper.Filenamify(chapters[i].title, true));
    }

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));

    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        // Referenced by the package's unique-identifier attribute.
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "publication"),
            ACTIVE_BOOK.Release.ToString(dateFormat, invariant)),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "modification"),
            today),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "creation"),
            today),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        new XElement(opf + "meta",
            new XAttribute("content", "1.0"),
            new XAttribute("name", "Wordpress_eBook_scraper_version")),
        new XElement(opf + "meta",
            new XAttribute("content", today),
            new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    // Series information is optional and only emitted when a series is set.
    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format(invariant, "{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }

    package.Add(meta);

    var manifest = new XElement(opf + "manifest");
    for (int i = 0; i < chapters.Count; i++)
    {
        // The href is a URI reference, so the file name is percent-encoded as a single
        // path segment. EscapeDataString also encodes '#' and '?', which would otherwise
        // be read as fragment/query delimiters (Uri.EscapeUriString, used before, is
        // obsolete and leaves them untouched). The id keeps the unescaped name.
        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format(invariant, "Text/{0:000}_{1}.html", i + 1, Uri.EscapeDataString(chapterNames[i]))),
            new XAttribute("id", string.Format(invariant, "x{0:000}_{1}.html", i + 1, chapterNames[i])),
            new XAttribute("media-type", "application/xhtml+xml")));
    }
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));

    package.Add(manifest);

    // The spine references the manifest items by id, in chapter order.
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
    for (int i = 0; i < chapters.Count; i++)
    {
        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", string.Format(invariant, "x{0:000}_{1}.html", i + 1, chapterNames[i]))));
    }

    package.Add(spine);

    package.Add(new XElement(opf + "guide"));

    using Utf8StringWriter writer = new Utf8StringWriter();

    doc.Save(writer);
    return writer.ToString();
}
|
|
|
|
/// <summary>
/// Builds the NCX table of contents for the active book: a head section with the book
/// uid and depth/page meta entries, the document title, and a flat nav map with one
/// nav point per chapter.
/// </summary>
/// <param name="chapters">Chapters in reading order; the position (1-based) becomes play order and file-name prefix.</param>
/// <returns>The serialized NCX XML.</returns>
string GetEpubTOC(List<Chapter> chapters)
{
    // NCX elements belong to the DAISY NCX namespace named by the doctype below,
    // not to the OPF package namespace.
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            // Same "urn:uuid:" value that the package document uses for its BookId identifier.
            new XElement(ncx + "meta",
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));

    doc.Add(root);

    // Use the real book title; fall back to the former placeholder when none is set.
    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", ACTIVE_BOOK.Title ?? "Unknown")));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        int number = i + 1;

        // src is a URI reference: percent-encode the file name as one path segment so
        // it is escaped in the same way as the manifest href of the package document.
        string fileName = Uri.EscapeDataString(Helper.Filenamify(chapters[i].title, true));

        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + number),
            new XAttribute("playOrder", number),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", string.Format(invariant, "Text/{0:000}_{1}.html", number, fileName)))));
    }

    root.Add(nav);

    using Utf8StringWriter writer = new Utf8StringWriter();

    doc.Save(writer);
    return writer.ToString();
}
|
|
|
|
/// <summary>
/// Renders one chapter as a standalone XHTML document: XML prolog, XHTML 1.1 doctype,
/// a head carrying the chapter title, and a body with the title as <c>h1</c> followed
/// by the chapter markup.
/// </summary>
/// <param name="chapter">Chapter whose <c>title</c> and <c>chapter</c> fields are emitted.</param>
/// <param name="idx">Not read by this method.</param>
/// <returns>The document text; each line is terminated by <see cref="Environment.NewLine"/>.</returns>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    // NOTE(review): confirm that this Entitize overload also escapes '&', '<' and '>';
    // if it does not, a title containing them would produce malformed XHTML.
    var heading = HtmlEntity.Entitize(chapter.title);

    string[] lines =
    {
        @"<?xml version=""1.0"" encoding=""utf-8""?>",
        @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ",
        @"<html xmlns=""http://www.w3.org/1999/xhtml"">",
        @"<head>",
        "<title>" + heading + "</title>",
        @"</head>",
        @"<body>",
        "<h1>" + heading + "</h1>",
        // The chapter markup is inserted verbatim, without further escaping.
        chapter.chapter,
        @"</body>",
        @"</html>",
    };

    var document = new StringBuilder();
    foreach (var line in lines)
    {
        document.AppendLine(line);
    }

    return document.ToString();
}
|
|
}
|