1
0
Fork 0
WordpressEbookScraper2/Program.cs

1662 lines
56 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/** *************************************************** **/
/** **/
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
/** **/
/** *************************************************** **/
// Root of the per-book working ("stash") folders; each book gets a sub-folder named after its Foldername.
const string BASE_DIR_STASH = @"F:\Stash\eBook_scraper\";
// Root of the final output tree (sub-folders _cache\, html\, epub\, mobi\ are created by Init).
const string BASE_DIR_OUT = @"F:\Home\Cloud\Dokumente\E-Books\Scraper\";
// External diff tool launched from the verification links in VerifyChapters.
const string COMPARE_PROG = @"C:\Program Files\Beyond Compare 4\BCompare.exe";
//----------------------------------------------------------------------------------------------------//
// Catalogue of known books. Constructor arguments, in order:
//   site type, [series name, series index,] title, author, release date (yyyy-MM-dd), language, URL of the first chapter.
// The short overload (without series name/index) describes a stand-alone book.
static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/");
static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/");
static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/");
static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/");
static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/");
// NOTE(review): release date "2015-03-24" differs from the date in the start URL (2015/03/25) - confirm which is intended.
static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/");
static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/");
static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/");
static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/");
// NOTE(review): release date "2019-01-05" differs from the date in the start URL (2019/01/14) - confirm which is intended.
static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/");
static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/");
static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/");
static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/");
static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/");
static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/");
static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");
static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/");
static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/");
static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/");
static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/");
static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/");
static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/");
static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/");
static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/");
static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/");
static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/");
static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/");
static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/");
static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/");
static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/");
static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/");
static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/");
static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/");
static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");
static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue");
static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");
static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");
static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/");
// NOTE(review): release date "2016-06-27" differs from the date in the start URL (2016/07/27) - confirm which is intended.
static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");
static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");
static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother");
static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");
static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");
//----------------------------------------------------------------------------------------------------//
// Run configuration - edit these to choose what a run does.
// Books to process in this run (iterated by Generate and Verify).
readonly EpubParameter[] BOOKS = new[] { TPR };
// When true, Init loads the per-book web cache file before scraping.
readonly bool USE_WEBCACHE = true;
// When true, FindChapters re-downloads a cached page that yielded no further URL to follow.
readonly bool DO_LIVE_RELOAD_OF_LAST = true;
// When true, Generate calls GenerateMobi after writing the epub.
readonly bool CONVERT_MOBI = true;
// Selects which top-level routine Main runs.
readonly MainMode MODE = MainMode.Generate;
//----------------------------------------------------------------------------------------------------//
// Book currently being processed; set at the top of each loop iteration in Generate/Verify.
// All path properties below are derived from it and must not be read while it is null.
static EpubParameter ACTIVE_BOOK = null;
// Upper bound on the number of chapters collected per book (loop guard in FindChapters/VerifyChapters).
const int LIMIT = 1500;
// Matches titles that start with a number followed by a dash (e.g. "12 - ..."); group "n" is the number.
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
// In-memory page cache: lower-cased URL -> raw page source.
Dictionary<string, string> webCache = new Dictionary<string, string>();
// Working folder of the active book.
string STASH_FOLDER => BASE_DIR_STASH + ACTIVE_BOOK.Foldername + @"\";
// Persisted web cache of the active book.
string WCACHE_FILE => BASE_DIR_OUT + @"_cache\" + ACTIVE_BOOK.Foldername + @".xml";
// Final output files of the active book.
string HTML_FILE_OUT => BASE_DIR_OUT + @"html\" + ACTIVE_BOOK.Foldername + @".html";
string EPUB_FILE_OUT => BASE_DIR_OUT + @"epub\" + ACTIVE_BOOK.Foldername + @".epub";
string MOBI_FILE_OUT => BASE_DIR_OUT + @"mobi\" + ACTIVE_BOOK.Foldername + @".mobi";
// Intermediate files inside the working folder.
string HTML_FILE_STASH => STASH_FOLDER + @"book.html";
string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip";
string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub";
string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi";
string QUERY_FOLDER => STASH_FOLDER + @"query\"; // full query result
string HTML_FOLDER => STASH_FOLDER + @"html\"; // unprocessed chapter code
string EPUB_FOLDER => STASH_FOLDER + @"epub\"; // processed epub chapter code
//----------------------------------------------------------------------------------------------------//
// Top-level operation selected through MODE and dispatched in Main.
public enum MainMode
{
    // Scrape the configured books and write the output files.
    Generate = 0,
    // Compare cached pages against the live site.
    Verify = 1,
}
// Outcome reported by ProcessChapter for a single page.
public enum ProcessResult
{
    // The page was processed as a regular chapter.
    SuccessNormal = 0,
    // The page lies beyond the end of the current book.
    ReachedEnd = 1,
    // The page should be left out of the book.
    SkipChapter = 2,
}
// Kind of website a book is hosted on; used to pick site-specific selectors.
// The two-letter members are aliases with the same numeric value as the long names.
public enum Site
{
    Wordpress = 0,
    WuxiaWorld = 1,
    Royalroad = 2,
    WP = Wordpress,
    WW = WuxiaWorld,
    RR = Royalroad,
}
// One scraped page of a book together with everything derived from it.
public class Chapter
{
    // Address the page was loaded from.
    public string url;
    // Chapter title as determined by ProcessChapter.
    public string title;
    // URL of the following chapter, if one was found.
    public string next;
    // Raw page source as downloaded (or taken from the web cache).
    public GZippedString queryResult;
    // Extracted content node wrapped in a minimal HTML document.
    public GZippedString sourcecode;
    // Chapter body.
    public GZippedString chapter;
    public bool isPrologue;
    public bool isEpilogue;
    public bool isBonus;

    // True when the chapter is a prologue, an epilogue or a bonus chapter.
    public bool isSpecial
    {
        get { return isPrologue || isEpilogue || isBonus; }
    }
}
// XML-serializable form of one web cache entry (see SaveCache / LoadWebCache).
public class SerializableCacheEntry
{
// Cache key: the page URL (stored lower-cased by the callers that fill webCache).
public string URL;
// Page source; written to the file in compressed form by GZippedString.
public GZippedString Content;
}
// String wrapper that is stored gzip-compressed when XML-serialized.
// Serialized form: Base64 of [4-byte length of the UTF-8 payload][gzip stream].
// Converts implicitly from and to string so it can be used like a plain string field.
public class GZippedString : IXmlSerializable
{
    // The uncompressed text.
    public string Value { get; set; }

    // No schema is provided (the conventional return value for IXmlSerializable).
    public System.Xml.Schema.XmlSchema GetSchema() { return null; }

    // Reads the Base64 payload of the current element and decompresses it into Value.
    public void ReadXml(System.Xml.XmlReader reader)
    {
        // A self-closing element has neither text nor an end tag to consume.
        if (reader.IsEmptyElement)
        {
            reader.Read();
            Value = string.Empty;
            return;
        }
        Value = DecompressString(reader.ReadString());
        reader.ReadEndElement();
    }

    // Writes Value as compressed Base64 text into the current element.
    public void WriteXml(System.Xml.XmlWriter writer)
    {
        writer.WriteString(CompressString(Value));
    }

    // Compresses text and returns Base64 of [uncompressed byte length][gzip data].
    // A null text is stored like an empty string instead of throwing.
    private string CompressString(string text)
    {
        byte[] buffer = Encoding.UTF8.GetBytes(text ?? string.Empty);
        using (var memoryStream = new MemoryStream())
        {
            // leaveOpen = true: the gzip stream must be closed (flushed) before the bytes are taken.
            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
            {
                gZipStream.Write(buffer, 0, buffer.Length);
            }
            byte[] compressedData = memoryStream.ToArray();
            var gZipBuffer = new byte[compressedData.Length + 4];
            Buffer.BlockCopy(BitConverter.GetBytes(buffer.Length), 0, gZipBuffer, 0, 4);
            Buffer.BlockCopy(compressedData, 0, gZipBuffer, 4, compressedData.Length);
            return Convert.ToBase64String(gZipBuffer);
        }
    }

    // Inverse of CompressString.
    private string DecompressString(string compressedText)
    {
        byte[] gZipBuffer = Convert.FromBase64String(compressedText);
        int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
        var buffer = new byte[dataLength];
        using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
        using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
        {
            // Stream.Read may return fewer bytes than requested, so keep reading
            // until the buffer is full or the stream ends.
            int total = 0;
            while (total < buffer.Length)
            {
                int read = gZipStream.Read(buffer, total, buffer.Length - total);
                if (read == 0)
                {
                    break;
                }
                total += read;
            }
            return Encoding.UTF8.GetString(buffer, 0, total);
        }
    }

    public static implicit operator GZippedString(string v) => new GZippedString { Value = v };

    // A null wrapper converts to a null string instead of throwing.
    public static implicit operator string(GZippedString v) => v?.Value;
}
// StringWriter that reports UTF-8 as its encoding, so serializers that query
// the writer's encoding declare UTF-8 rather than the StringWriter default.
public class Utf8StringWriter : StringWriter
{
    public override Encoding Encoding => Encoding.UTF8;
}
// Immutable description of one book: metadata, start URL and derived identifiers.
public class EpubParameter
{
    public readonly string Series;
    public readonly int SeriesIndex;
    public readonly Guid ID_OPF;
    public readonly Guid ID_CAL;
    public readonly string Title;
    public readonly string Author;
    public readonly DateTime Release;
    public readonly string Language;
    public readonly string StartURL;
    public readonly string Foldername;
    public readonly Site SiteType;

    // Author name with its space-separated parts reversed and joined by ", "
    // (e.g. "John McCrae" becomes "McCrae, John").
    public string AuthorSort => Author.Split(' ').Aggregate((acc, part) => part + ", " + acc);

    // Stand-alone book: no series name, series index -1.
    public EpubParameter(Site st, string t, string a, string r, string l, string s)
        : this(st, null, -1, t, a, r, l, s)
    {
    }

    // st: site type, z: series (null for none), i: series index, t: title, a: author,
    // r: release date as "yyyy-MM-dd", l: language code, s: URL of the first chapter.
    public EpubParameter(Site st, string z, int i, string t, string a, string r, string l, string s)
    {
        SiteType = st;
        Series = z;
        SeriesIndex = i;
        Title = t;
        Author = a;
        Release = DateTime.ParseExact(r, "yyyy-MM-dd", CultureInfo.InvariantCulture);
        Language = l;
        StartURL = s;

        Foldername = (z == null)
            ? Filenamify(t)
            : $"{Filenamify(z)} {i} - {Filenamify(t)}";

        // Both ids are drawn, in this order, from a generator seeded with the title and author hashes.
        // NOTE(review): string.GetHashCode is not guaranteed to be stable across runtimes/processes,
        // so these ids may differ between runs on some platforms - confirm whether that matters.
        var rng = new Random(Title.GetHashCode() ^ Author.GetHashCode());
        var raw = new byte[16];
        rng.NextBytes(raw);
        ID_OPF = new Guid(raw);
        rng.NextBytes(raw);
        ID_CAL = new Guid(raw);
    }

    // Human-readable name: the title alone, or "<series> <index> - <title>".
    public String DisplayStr => (Series == null) ? $"{Title}" : $"{Series} {SeriesIndex} - {Title}";
}
//----------------------------------------------------------------------------------------------------//
// Script entry point: enables result auto-scrolling, then runs the routine selected by MODE.
void Main()
{
    Util.AutoScrollResults = true;
    switch (MODE)
    {
        case MainMode.Generate:
            Generate();
            break;
        case MainMode.Verify:
            Verify();
            break;
    }
}
// For every configured book: print a banner, prepare the folders, scrape the chapters
// and write the HTML and epub output (plus mobi when CONVERT_MOBI is set).
void Generate()
{
    foreach (var book in BOOKS)
    {
        ACTIVE_BOOK = book;

        var banner = $" [PROCESSING BOOK] {book.DisplayStr} ";
        var rule = new string('=', banner.Length);

        for (var i = 0; i < 3; i++)
        {
            "".Dump();
        }
        rule.Dump();
        banner.Dump();
        rule.Dump();
        for (var i = 0; i < 3; i++)
        {
            "".Dump();
        }

        Init();
        List<Chapter> chapters = FindChapters();
        WriteBookHTML(chapters);
        WriteEpub(chapters);
        if (CONVERT_MOBI)
        {
            GenerateMobi();
        }
    }
}
// For every configured book: print a banner, load its web cache and compare
// the cached pages against the live site.
void Verify()
{
    foreach (var book in BOOKS)
    {
        ACTIVE_BOOK = book;

        var banner = $" [VERIFYING BOOK] {book.DisplayStr} ";
        var rule = new string('=', banner.Length);

        for (var i = 0; i < 3; i++)
        {
            "".Dump();
        }
        rule.Dump();
        banner.Dump();
        rule.Dump();
        for (var i = 0; i < 3; i++)
        {
            "".Dump();
        }

        LoadWebCache();
        VerifyChapters();
    }
}
// Prepares the file system and the web cache for the active book:
// empties the stash sub-folders, removes stale intermediate files,
// makes sure all working/output folders exist and (re)initialises webCache.
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Only files directly inside the immediate sub-folders are deleted; the folders themselves stay.
        foreach (var subFolder in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(subFolder).ToList())
            {
                File.Delete(file);
            }
        }

        foreach (var stale in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
        {
            if (File.Exists(stale))
            {
                File.Delete(stale);
            }
        }
    }

    var requiredFolders = new[]
    {
        STASH_FOLDER,
        QUERY_FOLDER,
        HTML_FOLDER,
        EPUB_FOLDER,
        BASE_DIR_OUT + @"_cache\",
        BASE_DIR_OUT + @"html\",
        BASE_DIR_OUT + @"epub\",
        BASE_DIR_OUT + @"mobi\",
    };
    foreach (var folder in requiredFolders)
    {
        Directory.CreateDirectory(folder);
    }

    // The cache file is per book: start from an empty cache so entries of a previously
    // processed book are never written into this book's cache file.
    webCache = new Dictionary<string, string>();
    if (USE_WEBCACHE)
    {
        LoadWebCache();
    }
}
// Writes all chapters into one HTML document (one <h1> heading per chapter followed by
// the chapter body), stores it in the stash folder and copies it to the output folder,
// overwriting an existing file there.
void WriteBookHTML(List<Chapter> chapters)
{
    StringBuilder html = new StringBuilder();
    html.AppendLine("<!DOCTYPE html>");
    html.AppendLine("<html>");
    html.AppendLine("<body>");
    foreach (var currChapter in chapters)
    {
        html.AppendLine();
        html.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
        html.AppendLine();
        // A chapter without a body contributes an empty line instead of failing.
        html.AppendLine(currChapter.chapter?.Value);
    }
    // Close the elements in reverse order of opening (body before html).
    html.AppendLine("</body>");
    html.AppendLine("</html>");

    File.WriteAllText(HTML_FILE_STASH, html.ToString(), Encoding.UTF8);
    File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
// Persists the complete in-memory web cache to the cache file of the active book.
void SaveCache()
{
    List<SerializableCacheEntry> entries = webCache
        .Select(pair => new SerializableCacheEntry
        {
            URL = pair.Key,
            Content = new GZippedString { Value = pair.Value },
        })
        .ToList();

    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
    using (var output = new StreamWriter(WCACHE_FILE))
    {
        serializer.Serialize(output, entries);
    }
}
// Replaces the in-memory web cache with the content of the active book's cache file.
// When the book has no cache file yet, the cache is reset to empty so that entries
// loaded for a previously processed book do not leak into this one.
void LoadWebCache()
{
    if (!File.Exists(WCACHE_FILE))
    {
        webCache = new Dictionary<string, string>();
        return;
    }

    XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
    using (TextReader reader = new StreamReader(WCACHE_FILE))
    {
        var entries = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
        webCache = entries.ToDictionary(p => p.URL, p => p.Content.Value);
    }
}
// Walks the active book page by page, starting at ACTIVE_BOOK.StartURL and following the
// next-URL reported by ProcessChapter, until no URL is queued any more or LIMIT chapters
// have been collected. Page source comes from webCache when available, otherwise it is
// downloaded and the cache file is rewritten. Returns the chapters for which
// ProcessChapter reported SuccessNormal, in traversal order.
List<Chapter> FindChapters()
{
List<Chapter> result = new List<Chapter>();
using (WebClient client = new WebClient())
{
client.Encoding = Encoding.UTF8;
// Pending URLs; at most one entry is pushed per processed page.
Stack<string> buffer = new Stack<string>();
buffer.Push(ACTIVE_BOOK.StartURL);
while (buffer.Any() && result.Count < LIMIT)
{
var url = buffer.Pop();
Chapter curr = new Chapter() { url = url };
// Cache keys are lower-cased URLs.
var buffered = webCache.ContainsKey(url.ToLower());
if (buffered)
{
curr.queryResult = webCache[url.ToLower()];
"*(loaded from webcache)*".Dump();
}
else
{
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
}
var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
if (next_url != null) buffer.Push(next_url);
// The cached copy produced no URL to follow: it may predate the publication of the
// next chapter, so fetch the page again from the live site and process it once more.
if (buffered && buffer.Count == 0 && DO_LIVE_RELOAD_OF_LAST)
{
"".Dump();
"//==> *(auto-reload from live)*".Dump();
"".Dump();
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
if (next_url_inner != null) buffer.Push(next_url_inner);
}
if (r == ProcessResult.SuccessNormal)
{
" ==> Chapter processed".Dump();
result.Add(curr);
// The second argument is the 1-based position of the chapter in the book.
OutputChapter(curr, result.Count);
}
else if (r == ProcessResult.SkipChapter)
{
" ==> Skip this chapter".Dump();
}
else if (r == ProcessResult.ReachedEnd)
{
" ==> End reached".Dump();
}
"".Dump();
}
}
return result;
}
// Compares every cached page of the active book with a fresh download of the same URL.
// Both versions are run through ProcessChapter and the process result, the URL to follow,
// the next-chapter link, the title and the chapter content are compared; each difference
// is reported. For content differences, links are offered to diff both versions in the
// external compare tool and to store the live version in the web cache.
// Traversal follows the next-URL derived from the cached version and stops at the first
// URL that is not in the cache.
void VerifyChapters()
{
    // URL comparison that ignores the http/https scheme and tolerates a missing (null) URL on either side.
    bool UrlsDiffer(string a, string b) => (a == null || b == null) ? a != b : !Relaxedurleq(a, b);

    // Never filled here; it is only handed to ProcessChapter as an (empty) back buffer.
    List<Chapter> result = new List<Chapter>();
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        Stack<string> buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);
        while (buffer.Any() && result.Count < LIMIT)
        {
            var url = buffer.Pop();
            Chapter curr_buffer = new Chapter() { url = url };
            Chapter curr_live = new Chapter() { url = url };

            // Only pages that are already cached can be verified.
            if (!webCache.TryGetValue(url.ToLower(), out var cachedSource))
            {
                continue;
            }

            curr_buffer.queryResult = cachedSource;
            try
            {
                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            }
            catch (Exception e)
            {
                // Best effort: report the failed download and move on.
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
                continue;
            }

            var is_diff = false;
            var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
            var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);
            if (next_buffer != null)
            {
                buffer.Push(next_buffer);
            }

            var tag = $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

            if (r_buffer != r_live)
            {
                $"{tag} Different Process result: {r_buffer} <> {r_live}".Dump();
                is_diff = true;
            }
            // Compare the URLs themselves (previously this repeated the process-result check).
            if (UrlsDiffer(next_buffer, next_live))
            {
                $"{tag} Different push URL: {next_buffer} <> {next_live}".Dump();
                is_diff = true;
            }
            if (UrlsDiffer(curr_buffer.next, curr_live.next))
            {
                $"{tag} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
                is_diff = true;
            }
            if (curr_buffer.title != curr_live.title)
            {
                $"{tag} Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
                is_diff = true;
            }

            // The chapter body may be unset when ProcessChapter returned early.
            var raw_buffer = curr_buffer.chapter?.Value;
            var raw_live = curr_live.chapter?.Value;
            if (raw_buffer != raw_live)
            {
                var clean_buffer = (curr_buffer.chapter == null) ? string.Empty : GetChapterText(curr_buffer);
                var clean_live = (curr_live.chapter == null) ? string.Empty : GetChapterText(curr_live);
                // Markup-only changes are ignored: only differing plain text counts.
                if (clean_buffer.Trim() != clean_live.Trim())
                {
                    $"{tag} Different content: ".Dump();
                    new Hyperlinq(() =>
                    {
                        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                        File.WriteAllText(fa, raw_buffer ?? string.Empty);
                        File.WriteAllText(fb, raw_live ?? string.Empty);
                        Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                    }, "[Compare Raw]").Dump();
                    new Hyperlinq(() =>
                    {
                        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                        File.WriteAllText(fa, clean_buffer);
                        File.WriteAllText(fb, clean_live);
                        Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                    }, "[Compare Text]").Dump();
                    new Hyperlinq(() =>
                    {
                        webCache[url.ToLower()] = curr_live.queryResult;
                        SaveCache();
                    }, "[Save new version to webcache]").Dump();
                    is_diff = true;
                }
            }

            if (!is_diff)
            {
                $"{tag} OK - No differences".Dump();
            }
            else
            {
                "".Dump();
            }
        }
    }
}
// Returns true when both URLs are equal, or equal after a leading "https://" and/or
// "http://" has been removed from each. Two nulls are equal; a null and a non-null URL are not.
bool Relaxedurleq(string a, string b)
{
    if (a == b)
    {
        return true;
    }
    // Exactly one side is missing: previously this ended in a NullReferenceException.
    if (a == null || b == null)
    {
        return false;
    }

    // Strips the scheme prefixes in the same order as before (https first, then http).
    string StripScheme(string u)
    {
        if (u.StartsWith("https://", StringComparison.Ordinal))
        {
            u = u.Substring("https://".Length);
        }
        if (u.StartsWith("http://", StringComparison.Ordinal))
        {
            u = u.Substring("http://".Length);
        }
        return u;
    }

    return StripScheme(a) == StripScheme(b);
}
// Returns the chapter body as plain text: HTML converted to text, trimmed, and every
// run of whitespace collapsed to a single space. A blank body yields an empty string.
string GetChapterText(Chapter c)
{
    var html = c.chapter.Value;
    if (string.IsNullOrWhiteSpace(html))
    {
        return string.Empty;
    }

    var text = HTMLToText.ConvertHtml(html).Trim();
    return Regex.Replace(text, @"\s+", " ");
}
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
{
forwardQueue_next = null;
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(curr.queryResult);
#region Base
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]");
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");
var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
if (nodeNav == null) nodeNav = nodeContent;
var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");
#endregion
#region Title
var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");
curr.title = TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));
var titles = new List<string>();
titles.Add(curr.title);
if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
{
var baseTitle = curr.title;
var suffix = TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value);
var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value;
var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value);
titles.Add(prefix1);
titles.Add(prefix2);
var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
if (altTitleNode1 != null)
{
var newtitle = TitleFmt(altTitleNode1.InnerText.Trim().Substring(prefix1.Length));
titles.Add(newtitle);
curr.title = newtitle;
titles.Add(prefix1 + newtitle);
titles.Add(prefix2 + newtitle);
titles.Add(prefix1 + " - " + newtitle);
titles.Add(prefix2 + " - " + newtitle);
}
else if (altTitleNode2 != null)
{
var newtitle = TitleFmt(altTitleNode2.InnerText.Trim().Substring(prefix2.Length));
titles.Add(newtitle);
curr.title = newtitle;
titles.Add(prefix1 + newtitle);
titles.Add(prefix2 + newtitle);
titles.Add(prefix1 + " - " + newtitle);
titles.Add(prefix2 + " - " + newtitle);
}
else if (altTitleNode3 != null)
{
var newtitle = TitleFmt(altTitleNode3.InnerText.Trim().Substring(prefix1.Length));
titles.Add(newtitle);
curr.title = newtitle;
titles.Add(prefix1 + newtitle);
titles.Add(prefix2 + newtitle);
titles.Add(prefix1 + " - " + newtitle);
titles.Add(prefix2 + " - " + newtitle);
altTitleNode3.Remove();
prt(" > title node removed");
}
else if (altTitleNode4 != null)
{
var newtitle = TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length));
titles.Add(newtitle);
curr.title = newtitle;
titles.Add(prefix1 + newtitle);
titles.Add(prefix2 + newtitle);
titles.Add(prefix1 + " - " + newtitle);
titles.Add(prefix2 + " - " + newtitle);
altTitleNode4.Remove();
prt(" > title node removed");
}
else if (suffix.Length > 2)
{
curr.title = suffix;
titles.Add(suffix);
}
else
{
prt(" [!!] Warning cannot parse title");
}
if (suffix.Length > 2)
{
curr.title = baseTitle;
titles.Add(baseTitle);
}
}
if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
tit_alt = tit_alt.Trim();
if (tit_alt.Length>2) curr.title = tit_alt;
}
#endregion
curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";
if (backBuffer.Any() && backBuffer.First().title == curr.title)
{
prt("[!] Book loop found - skipping entry");
return ProcessResult.ReachedEnd; // prevent book II loop
}
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));
if (ACTIVE_BOOK == APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II");
if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
{
prt("[!] Epilogue found - skipping entry");
return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
}
prt(curr.title + " (" + curr.url + ")");
#region Next
string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" };
if (backBuffer.Where(b => !b.isSpecial).Count() > 4 &&
backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
REX_NUMSTART.Match(curr.title).Success &&
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
{
prt("[!] Book jump found - skipping entry");
return ProcessResult.ReachedEnd;
}
var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");
if (next == null)
next = nodeContent.Descendants()
.Where(p => p.Name.ToLower() == "a")
.Where(p => Striptease(p) == "next chapter" || Striptease(p) == "next")
.Where(p => p.Attributes.Contains("href"))
.FirstOrDefault();
var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a");
if (next == null)
next = nodeNav.Descendants()
.Where(p => p.Name.ToLower() == "a")
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
.FirstOrDefault();
if (next != null)
{
var next_url = next.Attributes["href"].Value.Trim();
if (next_url == "." || next_url == "/" || next_url == "./")
{
next=null;
}
else
{
if (next_url.StartsWith("//")) next_url = "http:" + next_url;
if (next_url.StartsWith("/")) next_url = combineAuthority(curr.url, next_url);
if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = CombineUri(curr.url, next_url);
curr.next = next_url;
if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
{
forwardQueue_next = next_url;
}
}
}
if (next == null) prt(" > (!) No next URL found");
#endregion
#region Chapter marker
var cpMarkerIdentities = new List<string>
{
"previousnext", "previouschapternextchapter",
"firstnext", "firstchapternextchapter",
"firstchapter", "previouslast",
"previouschapterlastchapter",
"previouschapter", "nextchapter", "lastchapter",
"first", "previous", "next", "last"
};
foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
{
nodeChapter.RemoveChild(node);
prt(" > Chapter marker removed");
}
foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
{
nodeChapter.RemoveChild(node);
prt(" > Chapter marker removed");
}
var alist = nodeChapter.SelectNodes("//a");
if (alist != null)
{
foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
{
node.Remove();
prt(" > Chapter marker removed");
}
}
var plist = nodeChapter.SelectNodes("//p");
if (plist != null)
{
foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
{
node.Remove();
prt(" > Chapter marker removed");
}
}
#endregion
#region Share Div
var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]");
if (shareNodes != null)
{
foreach (var node in shareNodes)
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > share div removed");
}
else
{
prt(" > share div cannot be removed - skipping");
}
}
}
#endregion
#region Meta Div
var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]");
if (metaNodes != null)
{
foreach (var node in metaNodes)
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > meta div removed");
}
else
{
prt(" > meta div cannot be removed - skipping");
}
}
}
#endregion
#region Ad Blocking
var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/..");
if (adNodes1 != null)
{
foreach (var node in adNodes1)
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > ad div removed");
}
else
{
prt(" > ad div cannot be removed - skipping");
}
}
}
var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/..");
if (adNodes2 != null)
{
foreach (var node in adNodes2)
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > ad div removed");
}
else
{
prt(" > ad div cannot be removed - skipping");
}
}
}
var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]");
if (adNodes3 != null)
{
foreach (var node in adNodes3.Where(n => Striptease(n) == "advertisement"))
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > ad div removed");
}
else
{
prt(" > ad div cannot be removed - skipping");
}
}
}
#endregion
#region Title Paragraphs
var titleNodes1 = nodeChapter.SelectNodes(@"p");
if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
{
nodeChapter.RemoveChild(titleNodes1.First());
prt(" > title node removed");
}
for (int hval = 1; hval <= 5; hval++)
{
var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
if (titleNodes2 != null)
{
foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == TitleFmt(node.InnerText).ToLower())))
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > title node removed");
}
}
}
}
var titleNodes3 = nodeChapter.SelectNodes(@"//u");
if (titleNodes3 != null && titleNodes3.Any())
{
var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t)));
foreach (var t in xTitleNodes3)
{
t.Remove();
prt(" > title node removed");
}
}
var titleNodes4 = nodeChapter.SelectNodes(@"//span");
if (titleNodes4 != null && titleNodes4.Any())
{
var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t)));
foreach (var t in xTitleNodes4)
{
t.Remove();
prt(" > title node removed");
}
}
var titleNodes5 = nodeChapter.SelectNodes(@"//strong");
if (titleNodes5 != null && titleNodes5.Any())
{
var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t)));
foreach (var t in xTitleNodes5)
{
t.Remove();
prt(" > title node removed");
}
}
#endregion
#region Remove <hr>'s
while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr")
{
nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First());
prt(" > header hr removed");
}
while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr")
{
nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last());
prt(" > footer hr removed");
}
#endregion
#region Other (Author's Node)
foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
{
nodeChapter.RemoveChild(node);
prt(" > authors note removed");
}
#endregion
var chap_html = nodeChapter.InnerHtml.Trim();
#region Fix raw <hr>
// KOReader doesn't like <hr>
chap_html = chap_html.Replace("<hr>", "<hr/>");
#endregion
curr.chapter = chap_html;
if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;
return ProcessResult.SuccessNormal;
}
/// <summary>
/// Resolves a root-relative path against the scheme-and-authority part of an absolute URL.
/// </summary>
/// <param name="url">Absolute URL; only its left part up to the authority is used.</param>
/// <param name="suffix">Path to append; any leading slashes are dropped before joining.</param>
/// <returns>Authority of <paramref name="url"/>, one slash, then the trimmed suffix.</returns>
string combineAuthority(string url, string suffix)
{
    var authority = new Uri(url).GetLeftPart(UriPartial.Authority);

    // Make sure exactly one slash separates the authority from the path.
    var root = authority.EndsWith("/") ? authority : authority + "/";

    return root + suffix.TrimStart('/');
}
/// <summary>
/// Joins a relative reference onto the "directory" of another URI by plain string handling.
/// </summary>
/// <param name="uri1">Base URI; everything from its last slash onwards is discarded.</param>
/// <param name="uri2">Relative part; leading slashes are removed.</param>
/// <returns>The truncated base, a single slash, and the relative part.</returns>
string CombineUri(string uri1, string uri2)
{
    // Cut the last path segment off the base (if it has any slash at all).
    var parent = uri1.Contains("/")
        ? uri1.Substring(0, uri1.LastIndexOf("/"))
        : uri1;

    var left = parent.TrimEnd('/');
    var right = uri2.TrimStart('/');
    return string.Format("{0}/{1}", left, right);
}
/// <summary>
/// Writes three files for one chapter: <c>curr.queryResult</c> into QUERY_FOLDER,
/// <c>curr.sourcecode</c> into HTML_FOLDER and a minimal HTML page (title heading plus
/// <c>curr.chapter</c>) into EPUB_FOLDER.
/// </summary>
/// <param name="curr">Chapter whose data is written.</param>
/// <param name="index">Number used as the zero-padded three-digit file name prefix.</param>
void OutputChapter(Chapter curr, int index)
{
    // The query and html dumps share the same file name.
    var dumpName = string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html";

    File.WriteAllText(QUERY_FOLDER + dumpName, curr.queryResult);
    File.WriteAllText(HTML_FOLDER + dumpName, curr.sourcecode, Encoding.UTF8);

    var page = new StringBuilder();
    page.AppendLine("<!DOCTYPE html>");
    page.AppendLine("<html>");
    page.AppendLine("<body>");
    page.AppendLine();
    page.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
    page.AppendLine();
    page.AppendLine(curr.chapter);
    page.AppendLine("</body>");
    page.AppendLine("</html>");

    // Here the whole "<index>_<title>.html" string is sanitised at once.
    var pageName = Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
    File.WriteAllText(Path.Combine(EPUB_FOLDER, pageName), page.ToString(), Encoding.UTF8);
}
/// <summary>
/// Reduces a string to characters that are safe to use in a file name.
/// </summary>
/// <remarks>
/// Non-breaking spaces (U+00A0) are first turned into regular spaces. After that only
/// ASCII letters, ASCII digits, space, '.', '-', '_' and ',' are kept. The asterisk is
/// deliberately NOT kept: it is a reserved character in Windows file names, and the
/// result of this method is used to build paths that are written to disk.
/// </remarks>
/// <param name="v">Raw text, e.g. a chapter title.</param>
/// <param name="repl">When true, remaining spaces are replaced by underscores.</param>
/// <returns>The sanitised string (may be empty).</returns>
static string Filenamify(string v, bool repl = false)
{
    var kept = v.Replace((char)160, ' ')
        .Where(c =>
            (c >= '0' && c <= '9') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= 'a' && c <= 'z') ||
            c == ' ' ||
            c == '.' ||
            c == '-' ||
            c == '_' ||
            c == ',')
        .ToArray();

    var s = new string(kept);
    return repl ? s.Replace(' ', '_') : s;
}
// Normalises a raw title string for comparison/display:
// decodes HTML entities, unifies dash and non-breaking-space characters, strips
// surrounding whitespace and the decoration characters '-', ':', '_', '#',
// removes a leading "tde" prefix (case-insensitive) and upper-cases the first character.
string TitleFmt(string raw)
{
raw = HtmlEntity.DeEntitize(raw);
// NOTE(review): the first char literal below appears empty in this view; presumably it
// holds a dash-like Unicode character that is mapped to an ASCII hyphen - confirm in the file.
raw = raw.Replace('', '-');
// U+00A0 (non-breaking space) -> regular space
raw = raw.Replace((char)160, ' ');
// Trim whitespace, then decoration characters, then whitespace uncovered by that.
raw = raw.Trim().Trim('-', ':', '_', '#').Trim();
// Drop a leading "tde" (3 characters), whatever its casing.
if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3);
// Same trimming again, for separators that followed the removed prefix.
raw = raw.Trim().Trim('-', ':', '_', '#').Trim();
// Capitalise the first character (only applied to strings of length >= 2).
if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1);
return raw;
}
/// <summary>
/// Returns the normalised plain text of an HTML node (see the string overload), with
/// script and meta elements removed beforehand.
/// </summary>
/// <remarks>
/// Elements are never removed from the node that was passed in: whenever a query finds
/// something, the node is first cloned via CreateNode/CopyFrom and the removal is run
/// against the clone.
/// NOTE(review): the queries start with "//" - confirm whether they are meant to be
/// limited to descendants of the node.
/// </remarks>
/// <param name="raw">Node whose text is wanted.</param>
/// <returns>Lower-cased text consisting of letters, digits and spaces only.</returns>
string Striptease(HtmlNode raw)
{
    foreach (var tag in new[] { "script", "meta" })
    {
        var query = @"//" + tag;

        var hits = raw.SelectNodes(query);
        if (hits == null || !hits.Any()) continue;

        // Switch to a private copy before deleting anything.
        var clone = HtmlNode.CreateNode($"<{raw.Name}></{raw.Name}>");
        clone.CopyFrom(raw);
        raw = clone;

        hits = raw.SelectNodes(query);
        if (hits == null) continue;
        foreach (var hit in hits)
        {
            hit.Remove();
        }
    }

    return Striptease(HtmlEntity.DeEntitize(raw.InnerText));
}
/// <summary>
/// Normalises text for loose comparisons: every whitespace character becomes a plain
/// space, everything that is neither whitespace nor a letter/digit is dropped, the rest
/// is lower-cased and the result is trimmed.
/// </summary>
/// <param name="raw">Text to normalise.</param>
/// <returns>The normalised text; inner runs of spaces are kept as they are.</returns>
string Striptease(string raw)
{
    var cleaned = new StringBuilder(raw.Length);
    foreach (var ch in raw)
    {
        if (char.IsWhiteSpace(ch))
        {
            cleaned.Append(' ');
        }
        else if (char.IsLetterOrDigit(ch))
        {
            cleaned.Append(char.ToLower(ch));
        }
    }
    return cleaned.ToString().Trim();
}
/// <summary>
/// Builds a compact identity string for a node: its inner text, lower-cased, with the
/// entities &amp;gt; &amp;lt; &amp;amp; &amp;quot; &amp;nbsp; removed and everything except
/// letters and digits discarded.
/// </summary>
/// <param name="raw">Node to fingerprint.</param>
/// <returns>Lower-case letters and digits only, without any separators.</returns>
string NakedIdentity(HtmlNode raw)
{
    var text = raw.InnerText.ToLower();

    // Entities are removed as text, one after the other, before the character filter runs.
    foreach (var entity in new[] { "&gt;", "&lt;", "&amp;", "&quot;", "&nbsp;" })
    {
        text = text.Replace(entity, "");
    }

    var identity = new StringBuilder();
    foreach (var ch in text)
    {
        if (char.IsLetterOrDigit(ch))
        {
            identity.Append(char.ToLower(ch));
        }
    }
    return identity.ToString().ToLower();
}
/// <summary>
/// Checks whether the text of a node matches a title once both have been run through
/// Striptease and a further normalisation step.
/// </summary>
/// <param name="n">Candidate node.</param>
/// <param name="title">Title to compare against.</param>
/// <returns>True when both normalised strings are equal.</returns>
bool CouldBeTitle(HtmlNode n, string title)
{
    var nodeText = Striptease(n);
    var titleText = Striptease(title);

    // Lower-case, drop ':', '-', '(' and ')', and delete runs of two or more whitespace characters.
    Func<string, string> canon = s =>
    {
        s = s.ToLower();
        foreach (var mark in new[] { ":", "-", "(", ")" })
        {
            s = s.Replace(mark, "");
        }
        return Regex.Replace(s, @"\s\s+", "");
    };

    return canon(nodeText) == canon(titleText);
}
/// <summary>
/// Packs the given chapters into a zip container at ZIP_FILE_STASH, then copies it to
/// EPUB_FILE_STASH and from there to EPUB_FILE_OUT (overwriting the latter).
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteEpub(List<Chapter> chapters)
{
    // Start from a clean slate.
    foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
    {
        if (File.Exists(stale)) File.Delete(stale);
    }

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (var fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    using (var zipbook = new ZipOutputStream(fs))
    {
        // The mimetype entry is written first, followed by the container/package files.
        WritePubString(zipbook, @"mimetype", GetEpubMimetype());
        WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
        WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
        WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

        var position = 0;
        foreach (var chapter in chapters)
        {
            position++;
            var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", position, Filenamify(chapter.title, true));
            // File names are 1-based, the index handed to GetEpubChapterFile is 0-based.
            WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, position - 1));
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
/// <summary>
/// Converts EPUB_FILE_STASH to MOBI_FILE_STASH by running the external
/// <c>ebook-convert</c> command and copies the result to MOBI_FILE_OUT.
/// </summary>
/// <exception cref="Exception">The command returned a non-zero exit code; the message
/// carries the exit code and the combined process output.</exception>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH))
    {
        File.Delete(MOBI_FILE_STASH);
    }

    "Running ebook-convert for MOBI output".Dump();

    var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999";
    var result = ProcessHelper.ProcExecute("ebook-convert", arguments);

    $"ebook-convert returned: {result.ExitCode}".Dump();

    if (result.ExitCode != 0)
    {
        throw new Exception(result.ExitCode + "\n\n\n\n" + result.StdCombined);
    }

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
/// <summary>
/// Adds one text entry to the zip stream with its compression level set to None.
/// </summary>
/// <param name="z">Zip stream to write to.</param>
/// <param name="n">Entry name inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Encoding used for the content; UTF-8 when null.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    var payload = (e ?? Encoding.UTF8).GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
/// <summary>Content of the archive's <c>mimetype</c> entry.</summary>
/// <returns>The constant string "application/epub+zip".</returns>
string GetEpubMimetype() => "application/epub+zip";
/// <summary>
/// Builds the content of META-INF/container.xml, which points at OEBPS/content.opf.
/// </summary>
/// <returns>The serialised XML, trimmed and terminated with a single CRLF.</returns>
string GetEpubContainerXML()
{
    XNamespace ns = "urn:oasis:names:tc:opendocument:xmlns:container";

    var rootfile = new XElement(ns + "rootfile",
        new XAttribute("full-path", "OEBPS/content.opf"),
        new XAttribute("media-type", "application/oebps-package+xml"));

    var container = new XElement(ns + "container",
        new XAttribute("version", "1.0"),
        new XElement(ns + "rootfiles", rootfile));

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), container);

    using (var writer = new Utf8StringWriter())
    {
        doc.Save(writer);

        // Force the upper-case spelling of the encoding name in the XML declaration.
        var xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
        return xml.Trim() + "\r\n";
    }
}
/// <summary>
/// Builds the OPF package document (content.opf): metadata taken from ACTIVE_BOOK,
/// a manifest with one item per chapter plus the NCX, the spine and an empty guide.
/// </summary>
/// <param name="chapters">Chapters in reading order; their titles determine file names and ids.</param>
/// <returns>The serialised package document.</returns>
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    // Small factories for the element shapes that repeat below.
    Func<string, string, XElement> dateElement = (evt, value) =>
        new XElement(dc + "date", new XAttribute(opf + "event", evt), value);
    Func<object, string, XElement> metaElement = (content, name) =>
        new XElement(opf + "meta", new XAttribute("content", content), new XAttribute("name", name));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));
    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), package);

    // ---- metadata ----
    var metadata = new XElement(opf + "metadata");
    metadata.Add(new XAttribute(XNamespace.Xmlns + "dc", dc));
    metadata.Add(new XAttribute(XNamespace.Xmlns + "opf", opf));
    metadata.Add(new XElement(dc + "title", ACTIVE_BOOK.Title));
    metadata.Add(new XElement(dc + "creator", ACTIVE_BOOK.Author));
    metadata.Add(new XElement(dc + "identifier",
        new XAttribute("id", "BookId"),
        new XAttribute(opf + "scheme", "UUID"),
        "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")));
    metadata.Add(dateElement("publication", ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")));
    metadata.Add(dateElement("modification", DateTime.Now.ToString("yyyy'-'MM'-'dd")));
    metadata.Add(dateElement("creation", DateTime.Now.ToString("yyyy'-'MM'-'dd")));
    metadata.Add(new XElement(dc + "language", ACTIVE_BOOK.Language));
    metadata.Add(new XElement(dc + "identifier",
        new XAttribute(opf + "scheme", "UUID"),
        ACTIVE_BOOK.ID_CAL.ToString("D")));
    metadata.Add(metaElement("1.0", "Wordpress_eBook_scraper_version"));
    metadata.Add(metaElement(DateTime.Now.ToString("yyyy-MM-dd"), "Wordpress_eBook_scraper_creation_time"));

    // Series information is optional.
    if (ACTIVE_BOOK.Series != null)
    {
        metadata.Add(metaElement(ACTIVE_BOOK.Series, "calibre:series"));
        metadata.Add(metaElement(string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex), "calibre:series_index"));
    }
    package.Add(metadata);

    // ---- manifest + spine (one entry per chapter, numbered from 1) ----
    var manifest = new XElement(opf + "manifest");
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));

    var number = 0;
    foreach (var chapter in chapters)
    {
        number++;
        var safeTitle = Filenamify(chapter.title, true);
        var itemId = string.Format("x{0:000}_{1}.html", number, safeTitle);

        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", number, Uri.EscapeUriString(safeTitle))),
            new XAttribute("id", itemId),
            new XAttribute("media-type", "application/xhtml+xml")));

        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", itemId)));
    }

    // The NCX table of contents is listed after all chapters.
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));

    package.Add(manifest);
    package.Add(spine);
    package.Add(new XElement(opf + "guide"));

    using (var writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
/// <summary>
/// Builds the NCX table of contents (toc.ncx) with one flat navPoint per chapter.
/// </summary>
/// <remarks>
/// All elements are created in the NCX namespace
/// (<c>http://www.daisy.org/z3986/2005/ncx/</c>). Using the OPF package namespace here
/// would produce a document that is not a valid NCX. The docTitle carries the title of
/// the active book.
/// </remarks>
/// <param name="chapters">Chapters in reading order; play order and file numbering start at 1.</param>
/// <returns>The serialised NCX document.</returns>
string GetEpubTOC(List<Chapter> chapters)
{
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            // dtb:uid mirrors the identifier built from ACTIVE_BOOK.ID_OPF in the package document.
            new XElement(ncx + "meta",
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));
    doc.Add(root);

    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", ACTIVE_BOOK.Title)));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", string.Format("Text/{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true))))));
    }
    root.Add(nav);

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
/// <summary>
/// Wraps a chapter's markup in an XHTML 1.1 page whose title element and first-level
/// heading both show the entity-encoded chapter title.
/// </summary>
/// <param name="chapter">Chapter providing <c>title</c> and <c>chapter</c> (body markup).</param>
/// <param name="idx">Chapter index; not used by this method.</param>
/// <returns>The complete page, one line per element, each terminated by a line break.</returns>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    var safeTitle = HtmlEntity.Entitize(chapter.title);

    var lines = new[]
    {
        @"<?xml version=""1.0"" encoding=""utf-8""?>",
        @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ",
        @"<html xmlns=""http://www.w3.org/1999/xhtml"">",
        @"<head>",
        "<title>" + safeTitle + "</title>",
        @"</head>",
        @"<body>",
        "<h1>" + safeTitle + "</h1>",
        chapter.chapter,
        @"</body>",
        @"</html>",
    };

    var page = new StringBuilder();
    foreach (var line in lines)
    {
        page.AppendLine(line);
    }
    return page.ToString();
}
/// <summary>
/// Immutable result of an external process run: the command line, its exit code and
/// the captured output streams.
/// </summary>
public struct ProcessOutput
{
    /// <summary>Command line as passed to the constructor.</summary>
    public readonly string Command;
    /// <summary>Exit code of the process.</summary>
    public readonly int ExitCode;
    /// <summary>Captured standard output.</summary>
    public readonly string StdOut;
    /// <summary>Captured standard error.</summary>
    public readonly string StdErr;
    /// <summary>Standard output and standard error captured into one text.</summary>
    public readonly string StdCombined;

    /// <summary>Stores the given values in the corresponding fields.</summary>
    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
    {
        this.Command = cmd;
        this.ExitCode = ex;
        this.StdOut = stdout;
        this.StdErr = stderr;
        this.StdCombined = stdcom;
    }

    /// <summary>
    /// Multi-line summary: command, exit code, then the stdout and stderr sections
    /// (the combined output is not part of it).
    /// </summary>
    public override string ToString()
    {
        return string.Format("{0}\n=> {1}\n\n[stdout]\n{2}\n\n[stderr]\n{3}", Command, ExitCode, StdOut, StdErr);
    }
}
/// <summary>
/// Helper for running an external program and capturing everything it prints.
/// </summary>
public static class ProcessHelper
{
    /// <summary>
    /// Starts <paramref name="command"/> without a shell or window, waits for it to exit
    /// and returns its exit code together with the captured stdout, stderr and a combined
    /// text of both.
    /// </summary>
    /// <param name="command">Executable to start.</param>
    /// <param name="arguments">Argument string passed to the executable.</param>
    /// <param name="workingDirectory">Working directory; an empty string is used when null.</param>
    /// <returns>The collected <see cref="ProcessOutput"/>; lines are joined with '\n'.</returns>
    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
    {
        // The process handle is released as soon as the result has been collected.
        using (var process = new Process
        {
            StartInfo =
            {
                FileName = command,
                Arguments = arguments,
                WorkingDirectory = workingDirectory ?? string.Empty,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
                ErrorDialog = false,
            }
        })
        {
            var builderOut = new StringBuilder();
            var builderErr = new StringBuilder();
            var builderBoth = new StringBuilder();

            // The two handlers are raised asynchronously and both write to builderBoth,
            // so all appends go through one lock (StringBuilder is not thread-safe).
            var gate = new object();

            process.OutputDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;
                lock (gate)
                {
                    AppendLine(builderOut, args.Data);
                    AppendLine(builderBoth, args.Data);
                }
            };

            process.ErrorDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;
                lock (gate)
                {
                    AppendLine(builderErr, args.Data);
                    AppendLine(builderBoth, args.Data);
                }
            };

            process.Start();
            process.BeginOutputReadLine();
            process.BeginErrorReadLine();
            process.WaitForExit();

            // Line breaks inside the arguments are shown escaped in the recorded command line.
            var shownArguments = arguments.Replace("\r", "\\r").Replace("\n", "\\n");

            lock (gate)
            {
                return new ProcessOutput($"{command} {shownArguments}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
            }
        }
    }

    /// <summary>Appends a line, preceded by '\n' unless the builder is still empty.</summary>
    private static void AppendLine(StringBuilder target, string line)
    {
        if (target.Length > 0) target.Append("\n");
        target.Append(line);
    }
}
// Converts HTML (parsed into HtmlDocument / HtmlNode objects) into plain text.
// Block-like elements are surrounded by "\r\n", runs of whitespace inside text are
// collapsed, and the content of script/style/nav elements is left out.
public static class HTMLToText
{
// Self-closing link/style/script tags.
private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
// link/style/script elements with a matching closing tag and a body free of '<' and '>'.
private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
// Whitespace bookkeeping that is passed along while the node tree is walked.
private class PreceedingDomTextInfo
{
public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
{
IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
}
// False means: leading whitespace of the next text node is trimmed.
public bool WritePrecedingWhiteSpace { get; set; }
// True when the previously written text node ended in whitespace.
public bool LastCharWasSpace { get; set; }
// Shared (by reference) between all instances created during one conversion.
public readonly BoolWrapper IsFirstTextOfDocWritten;
// Set for children of "ol" (1) and other block elements (0); not read anywhere in this class.
public int ListIndex { get; set; }
}
// Reference-type box around a bool, so that one flag can be shared between several
// PreceedingDomTextInfo instances; converts implicitly from and to bool.
private class BoolWrapper
{
public BoolWrapper() { }
public bool Value { get; set; }
public static implicit operator bool(BoolWrapper boolWrapper)
{
return boolWrapper.Value;
}
public static implicit operator BoolWrapper(bool boolWrapper)
{
return new BoolWrapper { Value = boolWrapper };
}
}
// Loads the HTML file at 'path' and returns its text.
public static string Convert(string path)
{
HtmlDocument doc = new HtmlDocument();
doc.Load(path);
return ConvertDoc(doc);
}
// Returns the text of an HTML string. Tags matched by REX_TAG1 / REX_TAG2 are replaced
// by a single space before the string is parsed.
public static string ConvertHtml(string html)
{
HtmlDocument doc = new HtmlDocument();
html = REX_TAG1.Replace(html, " ");
html = REX_TAG2.Replace(html, " ");
doc.LoadHtml(html);
return ConvertDoc(doc);
}
// Returns the text of an already parsed document.
public static string ConvertDoc(HtmlDocument doc)
{
using (StringWriter sw = new StringWriter())
{
ConvertTo(doc.DocumentNode, sw);
sw.Flush();
return sw.ToString();
}
}
// Converts all children of 'node' in document order.
private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
foreach (HtmlNode subnode in node.ChildNodes)
{
ConvertTo(subnode, outText, textInfo);
}
}
// Public entry point: writes the text of 'node' to 'outText', starting with fresh state
// (the bool 'false' is converted to a new BoolWrapper).
public static void ConvertTo(HtmlNode node, TextWriter outText)
{
ConvertTo(node, outText, new PreceedingDomTextInfo(false));
}
// Recursive worker: dispatches on the node type and writes the node's text to 'outText'.
private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
string html;
switch (node.NodeType)
{
case HtmlNodeType.Comment:
// don't output comments
break;
case HtmlNodeType.Document:
ConvertContentTo(node, outText, textInfo);
break;
case HtmlNodeType.Text:
// script and style must not be output
string parentName = node.ParentNode.Name;
if ((parentName == "script") || (parentName == "style"))
{
break;
}
// get text
html = ((HtmlTextNode)node).Text;
// is it in fact a special closing node output as text?
if (HtmlNode.IsOverlappedClosingElement(html)) break;
// check the text is meaningful and not a bunch of whitespaces
if (html.Length == 0) break;
// an XML declaration that ended up as text is ignored
if (html.Trim().ToLower().StartsWith("<?xml") && html.Trim().ToLower().EndsWith("?>")) break;
// At the start of a block, or right after text that ended in whitespace, leading
// whitespace is dropped; whitespace-only text is skipped entirely in that case.
if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
{
html = html.TrimStart();
if (html.Length == 0) { break; }
// both flags are switched on: text has been written now
textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
}
// Trailing whitespace is cut, runs of two or more whitespace characters become one
// space, and entities are decoded.
outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
// Intentional assignment inside the condition: remember whether the text ended in
// whitespace and, if so, emit exactly one space for it.
if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
{
outText.Write(' ');
}
break;
case HtmlNodeType.Element:
// written after the element's children, if set
string endElementString = null;
// inline elements keep using the caller's text state, others get a fresh one
bool isInline;
// true: the element's children are not visited
bool skip = false;
int listIndex = 0;
switch (node.Name)
{
case "nav":
skip = true;
isInline = false;
break;
case "body":
case "section":
case "article":
case "aside":
case "h1":
case "h2":
case "header":
case "footer":
case "address":
case "main":
case "div":
case "span":
case "p": // stylistic - adjust as you tend to use
// line break before the block (unless nothing has been written yet) and after it
if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
endElementString = "\r\n";
isInline = false;
break;
case "br":
outText.Write("\r\n");
skip = true;
textInfo.WritePrecedingWhiteSpace = false;
isInline = true;
break;
case "a":
isInline = true;
break;
case "li":
isInline = false;
break;
case "ol":
listIndex = 1;
goto case "ul";
case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
endElementString = "\r\n";
isInline = false;
break;
case "img": //inline-block in reality
isInline = true;
break;
default:
isInline = true;
break;
}
if (!skip && node.HasChildNodes)
{
// non-inline elements start with new whitespace state that shares only the
// "first text written" flag with the parent
ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
}
if (endElementString != null)
{
outText.Write(endElementString);
}
break;
}
}
}