/** *************************************************** **/
/**                                                     **/
/**    WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS)        **/
/**                                                     **/
/** *************************************************** **/
|
||
|
||
// Local scratch directory: per-book working data (queries, raw html, epub parts).
const string BASE_DIR_STASH = @"F:\Stash\eBook_scraper\";

// Final output directory for generated html/epub/mobi files and the web cache.
const string BASE_DIR_OUT = @"F:\Home\Cloud\Dokumente\E-Books\Scraper\";

// External diff tool launched by the Verify-mode hyperlinks (Beyond Compare 4).
const string COMPARE_PROG = @"C:\Program Files\Beyond Compare 4\BCompare.exe";
|
||
|
||
//----------------------------------------------------------------------------------------------------//
|
||
|
||
// ---- Wildbow (John McCrae) web serials -----------------------------------------------------------
static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/");
static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/");
static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/");
static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/");
static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/");

// ---- A Practical Guide to Evil, books I-VII ------------------------------------------------------
static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/");
static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/");
static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/");
static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/");
// NOTE(review): release date (2019-01-05) and URL date (2019/01/14) differ -- confirm intended.
static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/");
static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/");
static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/");

// ---- The Divine Elements (WuxiaWorld) ------------------------------------------------------------
static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/");
static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/");
static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/");
static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");

// ---- Stand-alone serials -------------------------------------------------------------------------
static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/");

static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/");
|
||
|
||
// ---- The Gods are Bastards (volumes flattened to a running series index) -------------------------
static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/");
static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/");
static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/");
static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/");
static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/");
static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/");
static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/");
static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/");
static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/");
static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/");
static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/");
static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/");
static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/");
static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/");
static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/");

static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");

// ---- Royal Road serials --------------------------------------------------------------------------
static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue");

static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");

static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");

static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/");

// NOTE(review): release date (2016-06-27) and URL date (2016/07/27) differ -- confirm intended.
static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");

static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");

static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother");

static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");

static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");
|
||
|
||
//----------------------------------------------------------------------------------------------------//
|
||
|
||
// The books processed by this run; edit this list before executing the query.
readonly EpubParameter[] BOOKS = new[] { TPR };

// Serve pages from the on-disk cache instead of re-downloading everything.
readonly bool USE_WEBCACHE = true;

// Re-fetch the final (cached) chapter live, in case it gained a "next" link since caching.
readonly bool DO_LIVE_RELOAD_OF_LAST = true;

// Also convert the generated epub to mobi.
readonly bool CONVERT_MOBI = true;

// Generate = scrape and build ebooks; Verify = diff the cache against the live site.
readonly MainMode MODE = MainMode.Generate;
|
||
|
||
//----------------------------------------------------------------------------------------------------//
|
||
|
||
// The book currently being processed; set per-iteration by Generate()/Verify()
// and consumed by the path properties below.
static EpubParameter ACTIVE_BOOK = null;

// Hard upper bound on chapters per book (runaway-loop protection).
const int LIMIT = 1500;

// Matches titles that start with a number followed by a dash, e.g. "3 - ..."
// (captured as group "n"); used to detect jumps to a different book.
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

// In-memory page cache, keyed by lower-cased URL; persisted via Save/LoadWebCache.
Dictionary<string, string> webCache = new Dictionary<string, string>();
|
||
|
||
// Per-book scratch directory (depends on the currently active book).
string STASH_FOLDER => BASE_DIR_STASH + ACTIVE_BOOK.Foldername + @"\";

// Persistent on-disk web cache for the active book.
string WCACHE_FILE => BASE_DIR_OUT + @"_cache\" + ACTIVE_BOOK.Foldername + @".xml";

// Final output artifacts, one per format.
string HTML_FILE_OUT => BASE_DIR_OUT + @"html\" + ACTIVE_BOOK.Foldername + @".html";

string EPUB_FILE_OUT => BASE_DIR_OUT + @"epub\" + ACTIVE_BOOK.Foldername + @".epub";

string MOBI_FILE_OUT => BASE_DIR_OUT + @"mobi\" + ACTIVE_BOOK.Foldername + @".mobi";

// Intermediate build artifacts inside the stash.
string HTML_FILE_STASH => STASH_FOLDER + @"book.html";

string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip";

string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub";

string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi";

string QUERY_FOLDER => STASH_FOLDER + @"query\"; // full query result
string HTML_FOLDER => STASH_FOLDER + @"html\"; // unprocessed chapter code
string EPUB_FOLDER => STASH_FOLDER + @"epub\"; // processed epub chapter code
||
|
||
//----------------------------------------------------------------------------------------------------//
|
||
|
||
// Top-level operating mode of the script (selected via the MODE field).
public enum MainMode
{
	Generate,	// scrape chapters and build html/epub/mobi output
	Verify,		// compare cached pages against the live site
}
|
||
|
||
// Outcome of ProcessChapter for a single page.
public enum ProcessResult
{
	SuccessNormal,	// chapter parsed -- add it to the book
	ReachedEnd,		// end-of-book detected (epilogue passed, loop, or book jump)
	SkipChapter,	// page is valid but should not be included
}
|
||
|
||
// Source site family; selects the site-specific HTML parsing rules.
public enum Site
{
	Wordpress,
	WuxiaWorld,
	Royalroad,

	// Short aliases used throughout the book parameter table.
	WP = Wordpress,
	WW = WuxiaWorld,
	RR = Royalroad,
}
|
||
|
||
// One scraped chapter, carried through the download -> parse -> output pipeline.
public class Chapter
{
	public string url;		// page URL this chapter was fetched from
	public string title;	// display title extracted by ProcessChapter
	public string next;		// URL of the following chapter, if a "next" link was found

	public GZippedString queryResult;	// raw HTTP response body (compressed in memory)
	public GZippedString sourcecode;	// extracted content node wrapped in a minimal HTML document
	public GZippedString chapter;		// final processed chapter HTML

	// Title-derived flags; used by the end-of-book heuristics.
	public bool isPrologue;
	public bool isEpilogue;
	public bool isBonus;
	public bool isSpecial => isPrologue || isEpilogue || isBonus;
}
|
||
|
||
// XML-serializable record for one webCache entry (URL -> gzip-compressed page).
public class SerializableCacheEntry
{
	public string URL;
	public GZippedString Content;
}
|
||
|
||
// A string that XML-serializes as base64(4-byte length prefix + gzip data),
// keeping large page bodies small both in memory-serialized form and on disk.
// Implicitly convertible to/from string for transparent use.
public class GZippedString : IXmlSerializable
{
	// The uncompressed payload.
	public string Value { get; set; }

	public System.Xml.Schema.XmlSchema GetSchema() { return null; }

	// Reads the base64(gzip) element content written by WriteXml.
	public void ReadXml(System.Xml.XmlReader reader)
	{
		Value = DecompressString(reader.ReadString());
		reader.ReadEndElement();
	}

	public void WriteXml(System.Xml.XmlWriter writer)
	{
		writer.WriteString(CompressString(Value));
	}

	// Compresses UTF-8 text. Output layout: [4-byte little-endian uncompressed
	// byte count][gzip stream], then base64-encoded.
	private string CompressString(string text)
	{
		byte[] buffer = Encoding.UTF8.GetBytes(text);
		byte[] compressedData;
		using (var memoryStream = new MemoryStream())
		{
			using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
				gZipStream.Write(buffer, 0, buffer.Length);

			// BUGFIX: the previous version rewound the stream and issued a single
			// Stream.Read whose return value was ignored; ToArray() is exact.
			compressedData = memoryStream.ToArray();
		}

		var gZipBuffer = new byte[compressedData.Length + 4];
		Buffer.BlockCopy(compressedData, 0, gZipBuffer, 4, compressedData.Length);
		Buffer.BlockCopy(BitConverter.GetBytes(buffer.Length), 0, gZipBuffer, 0, 4);
		return Convert.ToBase64String(gZipBuffer);
	}

	// Inverse of CompressString.
	private string DecompressString(string compressedText)
	{
		byte[] gZipBuffer = Convert.FromBase64String(compressedText);
		int dataLength = BitConverter.ToInt32(gZipBuffer, 0);

		using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
		using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
		{
			var buffer = new byte[dataLength];

			// BUGFIX: Stream.Read may legally return fewer bytes than requested
			// (GZipStream does this routinely on modern .NET); loop until the
			// declared length has been read instead of trusting one call.
			int offset = 0;
			while (offset < dataLength)
			{
				int read = gZipStream.Read(buffer, offset, dataLength - offset);
				if (read <= 0) throw new InvalidDataException("Unexpected end of gzip stream.");
				offset += read;
			}

			return Encoding.UTF8.GetString(buffer);
		}
	}

	public static implicit operator GZippedString(string v) => new GZippedString { Value = v };
	public static implicit operator string(GZippedString v) => v?.Value; // null wrapper -> null string instead of NRE
}
|
||
|
||
// StringWriter reports UTF-16 by default; this subclass reports UTF-8 so XML
// writers emitting through it declare encoding="utf-8".
public class Utf8StringWriter : StringWriter
{
	public override Encoding Encoding => Encoding.UTF8;
}
|
||
|
||
// Immutable metadata bundle describing one book to scrape/build.
public class EpubParameter
{
	public readonly string Series;		// series name, or null for stand-alone works
	public readonly int SeriesIndex;	// index within the series; -1 when Series is null
	public readonly Guid ID_OPF;		// book id written into the OPF metadata
	public readonly Guid ID_CAL;		// id used for calibre metadata
	public readonly string Title;
	public readonly string Author;
	public readonly DateTime Release;	// parsed from "yyyy-MM-dd" (invariant culture)
	public readonly string Language;	// e.g. "en"
	public readonly string StartURL;	// first chapter; scraping follows "next" links from here
	public readonly string Foldername;	// filesystem-safe name used for stash/output paths
	public readonly Site SiteType;		// selects the site-specific parsing rules

	// "First Last" -> "Last, First". NOTE(review): with 3+ name parts every part
	// is comma-joined in reverse ("Webb, D., D.") -- confirm that is intended.
	public string AuthorSort { get { return Author.Split(' ').Aggregate((a, b) => b + ", " + a); } }

	// Convenience overload for stand-alone books (no series, index -1).
	public EpubParameter(Site st, string t, string a, string r, string l, string s) : this(st, null, -1, t, a, r, l, s) { }

	public EpubParameter(Site st, string z, int i, string t, string a, string r, string l, string s)
	{
		SiteType = st;
		Series = z;
		SeriesIndex = i;
		Title = t;
		Author = a;
		Release = DateTime.ParseExact(r, "yyyy-MM-dd", CultureInfo.InvariantCulture);
		Language = l;
		StartURL = s;
		// Filenamify is defined elsewhere in this file.
		if (z == null)
			Foldername = Filenamify(t);
		else
			Foldername = string.Format("{0} {1} - {2}", Filenamify(z), i, Filenamify(t));

		// Pseudo-random GUIDs seeded from the title/author hash, presumably so
		// regenerated books keep stable ids. NOTE(review): string.GetHashCode is
		// randomized per process on .NET Core, so ids are only stable across runs
		// on .NET Framework -- confirm which runtime this query targets.
		var u = new Random(Title.GetHashCode() ^ Author.GetHashCode());
		var g = new byte[16];
		u.NextBytes(g);
		ID_OPF = new Guid(g);
		u.NextBytes(g);
		ID_CAL = new Guid(g);
	}

	// Human-readable label used in log banners and diff reports.
	public String DisplayStr => (Series == null) ? $"{Title}" : $"{Series} {SeriesIndex} - {Title}";
}
|
||
|
||
//----------------------------------------------------------------------------------------------------//
|
||
|
||
// LINQPad entry point: dispatch to the configured mode.
void Main()
{
	Util.AutoScrollResults = true;

	switch (MODE)
	{
		case MainMode.Generate:
			Generate();
			break;
		case MainMode.Verify:
			Verify();
			break;
	}
}
|
||
|
||
// Scrapes and builds every configured book. ACTIVE_BOOK acts as the implicit
// parameter consumed by the path properties and the scraping helpers.
void Generate()
{
	foreach (var bb in BOOKS)
	{
		ACTIVE_BOOK = bb;

		// Banner: three blank lines, boxed caption, three blank lines.
		var caption = $" [PROCESSING BOOK] {bb.DisplayStr} ";
		var rule = new string('=', caption.Length);

		for (var i = 0; i < 3; i++) "".Dump();
		rule.Dump();
		caption.Dump();
		rule.Dump();
		for (var i = 0; i < 3; i++) "".Dump();

		Init();

		List<Chapter> chapters = FindChapters();

		WriteBookHTML(chapters);
		WriteEpub(chapters);
		if (CONVERT_MOBI) GenerateMobi();
	}
}
|
||
|
||
// Verifies every configured book's cached pages against the live site.
void Verify()
{
	foreach (var bb in BOOKS)
	{
		ACTIVE_BOOK = bb;

		// Banner: three blank lines, boxed caption, three blank lines.
		var caption = $" [VERIFYING BOOK] {bb.DisplayStr} ";
		var rule = new string('=', caption.Length);

		for (var i = 0; i < 3; i++) "".Dump();
		rule.Dump();
		caption.Dump();
		rule.Dump();
		for (var i = 0; i < 3; i++) "".Dump();

		LoadWebCache();

		VerifyChapters();
	}
}
|
||
|
||
// Clears the previous per-book scratch output and (re)creates the directory
// layout, then primes the web cache if enabled.
void Init()
{
	if (Directory.Exists(STASH_FOLDER))
	{
		// Empty every immediate subdirectory of the stash.
		foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER))
			foreach (var file in Directory.EnumerateFiles(dir))
				File.Delete(file);

		// Remove stale build artifacts from the previous run.
		foreach (var stale in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
			if (File.Exists(stale)) File.Delete(stale);
	}

	// Per-book stash layout.
	Directory.CreateDirectory(STASH_FOLDER);
	Directory.CreateDirectory(QUERY_FOLDER);
	Directory.CreateDirectory(HTML_FOLDER);
	Directory.CreateDirectory(EPUB_FOLDER);

	// Shared output folders.
	Directory.CreateDirectory(BASE_DIR_OUT + @"_cache\");
	Directory.CreateDirectory(BASE_DIR_OUT + @"html\");
	Directory.CreateDirectory(BASE_DIR_OUT + @"epub\");
	Directory.CreateDirectory(BASE_DIR_OUT + @"mobi\");

	if (USE_WEBCACHE) LoadWebCache();
}
|
||
|
||
// Concatenates all chapters into one standalone HTML document (one <h1> per
// chapter) and writes it to both the stash and the output directory.
void WriteBookHTML(List<Chapter> chapters)
{
	StringBuilder b = new StringBuilder();

	b.AppendLine("<!DOCTYPE html>");
	b.AppendLine("<html>");
	b.AppendLine("<body>");

	foreach (var currChapter in chapters)
	{
		b.AppendLine();
		b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
		b.AppendLine();
		b.AppendLine(currChapter.chapter);
	}

	// BUGFIX: the closing tags were emitted in the wrong order
	// ("</html>" before "</body>"), producing invalid HTML.
	b.AppendLine("</body>");
	b.AppendLine("</html>");

	File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
	File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
|
||
|
||
// Persists the in-memory web cache to disk as a list of URL/compressed-content
// entries (GZippedString handles the compression during serialization).
void SaveCache()
{
	var entries = webCache
		.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString { Value = p.Value } })
		.ToList();

	var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
	using (var writer = new System.IO.StreamWriter(WCACHE_FILE))
		serializer.Serialize(writer, entries);
}
|
||
|
||
// Loads the per-book web cache from disk into the webCache dictionary.
// No-op when no cache file exists yet for the active book.
void LoadWebCache()
{
	if (!File.Exists(WCACHE_FILE)) return;

	XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
	using (TextReader reader = new StreamReader(WCACHE_FILE))
	{
		// (Removed an unused local list that the original allocated here.)
		var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);

		// Keys are (lower-cased) URLs; Content.Value decompresses on access.
		webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
	}
}
|
||
|
||
// Walks the chapter chain starting at ACTIVE_BOOK.StartURL, following the
// "next" URL that ProcessChapter extracts, until an end condition is hit or
// LIMIT chapters have been collected. Pages are served from the web cache when
// available; every fresh download is written back to the cache immediately.
List<Chapter> FindChapters()
{
	List<Chapter> result = new List<Chapter>();

	using (WebClient client = new WebClient())
	{
		client.Encoding = Encoding.UTF8;
		// Pending URLs; in practice holds at most the single upcoming "next" link.
		Stack<string> buffer = new Stack<string>();
		buffer.Push(ACTIVE_BOOK.StartURL);

		while (buffer.Any() && result.Count < LIMIT)
		{
			var url = buffer.Pop();
			Chapter curr = new Chapter() { url = url };

			// Cache keys are lower-cased URLs.
			var buffered = webCache.ContainsKey(url.ToLower());
			if (buffered)
			{
				curr.queryResult = webCache[url.ToLower()];
				"*(loaded from webcache)*".Dump();
			}
			else
			{
				curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				webCache[url.ToLower()] = curr.queryResult;
				// Persist after every download so an aborted run loses nothing.
				SaveCache();
			}

			var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
			if (next_url != null) buffer.Push(next_url);

			// The newest cached chapter may have been edited or gained a "next"
			// link since caching -- optionally re-fetch it live and re-process.
			// (buffer.Count == 0 implies no next_url was pushed above, so the
			// inner push cannot duplicate an entry.)
			if (buffered && buffer.Count == 0 && DO_LIVE_RELOAD_OF_LAST)
			{
				"".Dump();
				"//==> *(auto-reload from live)*".Dump();
				"".Dump();
				curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				webCache[url.ToLower()] = curr.queryResult;
				SaveCache();

				r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
				if (next_url_inner != null) buffer.Push(next_url_inner);
			}
			if (r == ProcessResult.SuccessNormal)
			{
				" ==> Chapter processed".Dump();
				result.Add(curr);
				OutputChapter(curr, result.Count);
			}
			else if (r == ProcessResult.SkipChapter)
			{
				" ==> Skip this chapter".Dump();
			}
			else if (r == ProcessResult.ReachedEnd)
			{
				" ==> End reached".Dump();
			}

			"".Dump();
		}
	}

	return result;
}
|
||
|
||
// Re-downloads every cached chapter of the active book and diffs it against
// the cached copy: process result, pushed URL, "next" link, title, and content.
// For content differences it dumps hyperlinks to compare in the external diff
// tool or to accept the live version into the cache.
void VerifyChapters()
{
	List<Chapter> result = new List<Chapter>();

	using (WebClient client = new WebClient())
	{
		client.Encoding = Encoding.UTF8;
		Stack<string> buffer = new Stack<string>();
		buffer.Push(ACTIVE_BOOK.StartURL);

		while (buffer.Any() && result.Count < LIMIT)
		{
			var url = buffer.Pop();
			Chapter curr_buffer = new Chapter() { url = url };
			Chapter curr_live = new Chapter() { url = url };

			var buffered = webCache.ContainsKey(url.ToLower());
			if (buffered)
			{
				try
				{
					curr_buffer.queryResult = webCache[url.ToLower()];
					curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
				}
				catch (Exception e)
				{
					$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
					continue;
				}
			}
			else
			{
				// Only cached chapters can be verified.
				continue;
			}

			var is_diff = false;

			var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
			var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);

			if (next_buffer != null) buffer.Push(next_buffer);

			if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
			// BUGFIX: this check previously re-compared r_buffer/r_live (copy-paste
			// error) while reporting a push-URL difference; compare the URLs instead.
			if (next_buffer != next_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }

			// Null-safe "next" comparison (Relaxedurleq dereferences its arguments,
			// and either side may be null when no next link was found).
			var nextLinksEqual = (curr_buffer.next == curr_live.next)
				|| (curr_buffer.next != null && curr_live.next != null && Relaxedurleq(curr_buffer.next, curr_live.next));
			if (!nextLinksEqual) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
			if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }

			// chapter may be null when ProcessChapter bailed out early (skip/end).
			if ((curr_buffer.chapter?.Value) != (curr_live.chapter?.Value))
			{
				var clean_buffer = curr_buffer.chapter == null ? string.Empty : GetChapterText(curr_buffer);
				var clean_live = curr_live.chapter == null ? string.Empty : GetChapterText(curr_live);

				// Only report when the whitespace-normalized plain text differs too.
				if (clean_buffer.Trim() != clean_live.Trim())
				{
					$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
					new Hyperlinq(() =>
					{
						var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
						var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
						File.WriteAllText(fa, curr_buffer.chapter?.Value ?? string.Empty);
						File.WriteAllText(fb, curr_live.chapter?.Value ?? string.Empty);
						Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
					}, "[Compare Raw]").Dump();
					new Hyperlinq(() =>
					{
						var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
						var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
						File.WriteAllText(fa, clean_buffer);
						File.WriteAllText(fb, clean_live);
						Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
					}, "[Compare Text]").Dump();
					new Hyperlinq(() =>
					{
						// Accept the live version as the new cached truth.
						webCache[url.ToLower()] = curr_live.queryResult;
						SaveCache();
					}, "[Save new version to webcache]").Dump();

					is_diff = true;
				}
			}

			if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();

			if (is_diff) "".Dump();
		}
	}
}
|
||
|
||
// URL equality that ignores the http/https scheme difference.
bool Relaxedurleq(string a, string b)
{
	// Covers both-null and reference/value equality.
	if (a == b) return true;

	// BUGFIX: either side may legitimately be null (e.g. a chapter without a
	// "next" link); the original dereferenced the null and threw.
	if (a == null || b == null) return false;

	if (a.StartsWith("https://")) a = a.Substring("https://".Length);
	if (a.StartsWith("http://")) a = a.Substring("http://".Length);
	if (b.StartsWith("https://")) b = b.Substring("https://".Length);
	if (b.StartsWith("http://")) b = b.Substring("http://".Length);

	return (a == b);
}
|
||
|
||
// Normalizes a chapter to plain text for fuzzy comparison in Verify mode:
// strip HTML (HTMLToText is defined elsewhere in this file), trim, and
// collapse all whitespace runs to single spaces.
string GetChapterText(Chapter c)
{
	// BUGFIX: guard against a chapter that was never populated (ProcessChapter
	// can return before assigning c.chapter); the original dereferenced null.
	if (c?.chapter == null || string.IsNullOrWhiteSpace(c.chapter.Value)) return string.Empty;

	var clean = HTMLToText.ConvertHtml(c.chapter.Value);

	clean = clean.Trim();

	clean = new Regex(@"\s+").Replace(clean, " ");

	return clean;
}
|
||
|
||
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
|
||
{
|
||
forwardQueue_next = null;
|
||
|
||
HtmlDocument doc = new HtmlDocument();
|
||
doc.LoadHtml(curr.queryResult);
|
||
|
||
#region Base
|
||
|
||
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
|
||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
|
||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
|
||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]");
|
||
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");
|
||
|
||
var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
|
||
if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
|
||
if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
|
||
if (nodeNav == null) nodeNav = nodeContent;
|
||
|
||
var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
|
||
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");
|
||
|
||
#endregion
|
||
|
||
#region Title
|
||
|
||
var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
|
||
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
|
||
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
|
||
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
|
||
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");
|
||
|
||
curr.title = TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));
|
||
|
||
var titles = new List<string>();
|
||
titles.Add(curr.title);
|
||
|
||
if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
|
||
{
|
||
var baseTitle = curr.title;
|
||
|
||
var suffix = TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value);
|
||
|
||
var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value;
|
||
var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value);
|
||
|
||
titles.Add(prefix1);
|
||
titles.Add(prefix2);
|
||
|
||
var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2);
|
||
var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2);
|
||
var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
|
||
var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));
|
||
if (altTitleNode1 != null)
|
||
{
|
||
var newtitle = TitleFmt(altTitleNode1.InnerText.Trim().Substring(prefix1.Length));
|
||
titles.Add(newtitle);
|
||
curr.title = newtitle;
|
||
titles.Add(prefix1 + newtitle);
|
||
titles.Add(prefix2 + newtitle);
|
||
titles.Add(prefix1 + " - " + newtitle);
|
||
titles.Add(prefix2 + " - " + newtitle);
|
||
}
|
||
else if (altTitleNode2 != null)
|
||
{
|
||
var newtitle = TitleFmt(altTitleNode2.InnerText.Trim().Substring(prefix2.Length));
|
||
titles.Add(newtitle);
|
||
curr.title = newtitle;
|
||
titles.Add(prefix1 + newtitle);
|
||
titles.Add(prefix2 + newtitle);
|
||
titles.Add(prefix1 + " - " + newtitle);
|
||
titles.Add(prefix2 + " - " + newtitle);
|
||
}
|
||
else if (altTitleNode3 != null)
|
||
{
|
||
var newtitle = TitleFmt(altTitleNode3.InnerText.Trim().Substring(prefix1.Length));
|
||
titles.Add(newtitle);
|
||
curr.title = newtitle;
|
||
titles.Add(prefix1 + newtitle);
|
||
titles.Add(prefix2 + newtitle);
|
||
titles.Add(prefix1 + " - " + newtitle);
|
||
titles.Add(prefix2 + " - " + newtitle);
|
||
|
||
altTitleNode3.Remove();
|
||
prt(" > title node removed");
|
||
}
|
||
else if (altTitleNode4 != null)
|
||
{
|
||
var newtitle = TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length));
|
||
titles.Add(newtitle);
|
||
curr.title = newtitle;
|
||
titles.Add(prefix1 + newtitle);
|
||
titles.Add(prefix2 + newtitle);
|
||
titles.Add(prefix1 + " - " + newtitle);
|
||
titles.Add(prefix2 + " - " + newtitle);
|
||
|
||
altTitleNode4.Remove();
|
||
prt(" > title node removed");
|
||
}
|
||
else if (suffix.Length > 2)
|
||
{
|
||
curr.title = suffix;
|
||
titles.Add(suffix);
|
||
}
|
||
else
|
||
{
|
||
prt(" [!!] Warning cannot parse title");
|
||
}
|
||
|
||
if (suffix.Length > 2)
|
||
{
|
||
curr.title = baseTitle;
|
||
titles.Add(baseTitle);
|
||
}
|
||
}
|
||
|
||
if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
|
||
var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
|
||
while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
|
||
tit_alt = tit_alt.Trim();
|
||
if (tit_alt.Length>2) curr.title = tit_alt;
|
||
}
|
||
|
||
#endregion
|
||
|
||
curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";
|
||
|
||
if (backBuffer.Any() && backBuffer.First().title == curr.title)
|
||
{
|
||
prt("[!] Book loop found - skipping entry");
|
||
return ProcessResult.ReachedEnd; // prevent book II loop
|
||
}
|
||
|
||
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
|
||
curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
|
||
curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));
|
||
|
||
if (ACTIVE_BOOK == APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II");
|
||
|
||
if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
|
||
{
|
||
prt("[!] Epilogue found - skipping entry");
|
||
return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
|
||
}
|
||
|
||
prt(curr.title + " (" + curr.url + ")");
|
||
|
||
#region Next
|
||
|
||
string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" };
|
||
|
||
if (backBuffer.Where(b => !b.isSpecial).Count() > 4 &&
|
||
backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
|
||
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
|
||
REX_NUMSTART.Match(curr.title).Success &&
|
||
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
|
||
{
|
||
prt("[!] Book jump found - skipping entry");
|
||
return ProcessResult.ReachedEnd;
|
||
}
|
||
|
||
var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");
|
||
if (next == null)
|
||
next = nodeContent.Descendants()
|
||
.Where(p => p.Name.ToLower() == "a")
|
||
.Where(p => Striptease(p) == "next chapter" || Striptease(p) == "next")
|
||
.Where(p => p.Attributes.Contains("href"))
|
||
.FirstOrDefault();
|
||
|
||
var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a");
|
||
|
||
if (next == null)
|
||
next = nodeNav.Descendants()
|
||
.Where(p => p.Name.ToLower() == "a")
|
||
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
|
||
.FirstOrDefault();
|
||
|
||
if (next != null)
|
||
{
|
||
var next_url = next.Attributes["href"].Value.Trim();
|
||
|
||
if (next_url == "." || next_url == "/" || next_url == "./")
|
||
{
|
||
next=null;
|
||
}
|
||
else
|
||
{
|
||
if (next_url.StartsWith("//")) next_url = "http:" + next_url;
|
||
|
||
if (next_url.StartsWith("/")) next_url = combineAuthority(curr.url, next_url);
|
||
|
||
if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = CombineUri(curr.url, next_url);
|
||
|
||
curr.next = next_url;
|
||
if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
|
||
{
|
||
forwardQueue_next = next_url;
|
||
}
|
||
}
|
||
|
||
}
|
||
|
||
if (next == null) prt(" > (!) No next URL found");
|
||
|
||
#endregion
|
||
|
||
#region Chapter marker
|
||
|
||
var cpMarkerIdentities = new List<string>
|
||
{
|
||
"previousnext", "previouschapternextchapter",
|
||
"firstnext", "firstchapternextchapter",
|
||
"firstchapter", "previouslast",
|
||
|
||
"previouschapterlastchapter",
|
||
|
||
"previouschapter", "nextchapter", "lastchapter",
|
||
|
||
"first", "previous", "next", "last"
|
||
};
|
||
|
||
foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > Chapter marker removed");
|
||
}
|
||
|
||
foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > Chapter marker removed");
|
||
}
|
||
|
||
var alist = nodeChapter.SelectNodes("//a");
|
||
if (alist != null)
|
||
{
|
||
foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
|
||
{
|
||
node.Remove();
|
||
prt(" > Chapter marker removed");
|
||
}
|
||
}
|
||
|
||
var plist = nodeChapter.SelectNodes("//p");
|
||
if (plist != null)
|
||
{
|
||
foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList())
|
||
{
|
||
node.Remove();
|
||
prt(" > Chapter marker removed");
|
||
}
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Share Div
|
||
|
||
var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]");
|
||
if (shareNodes != null)
|
||
{
|
||
foreach (var node in shareNodes)
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > share div removed");
|
||
}
|
||
else
|
||
{
|
||
prt(" > share div cannot be removed - skipping");
|
||
}
|
||
}
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Meta Div
|
||
|
||
var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]");
|
||
if (metaNodes != null)
|
||
{
|
||
foreach (var node in metaNodes)
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > meta div removed");
|
||
}
|
||
else
|
||
{
|
||
prt(" > meta div cannot be removed - skipping");
|
||
}
|
||
}
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Ad Blocking
|
||
|
||
var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/..");
|
||
if (adNodes1 != null)
|
||
{
|
||
foreach (var node in adNodes1)
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > ad div removed");
|
||
}
|
||
else
|
||
{
|
||
prt(" > ad div cannot be removed - skipping");
|
||
}
|
||
}
|
||
}
|
||
|
||
var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/..");
|
||
if (adNodes2 != null)
|
||
{
|
||
foreach (var node in adNodes2)
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > ad div removed");
|
||
}
|
||
else
|
||
{
|
||
prt(" > ad div cannot be removed - skipping");
|
||
}
|
||
}
|
||
}
|
||
|
||
var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]");
|
||
if (adNodes3 != null)
|
||
{
|
||
foreach (var node in adNodes3.Where(n => Striptease(n) == "advertisement"))
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > ad div removed");
|
||
}
|
||
else
|
||
{
|
||
prt(" > ad div cannot be removed - skipping");
|
||
}
|
||
}
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Title Paragraphs
|
||
|
||
var titleNodes1 = nodeChapter.SelectNodes(@"p");
|
||
if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First()))
|
||
{
|
||
nodeChapter.RemoveChild(titleNodes1.First());
|
||
prt(" > title node removed");
|
||
}
|
||
|
||
for (int hval = 1; hval <= 5; hval++)
|
||
{
|
||
var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
|
||
if (titleNodes2 != null)
|
||
{
|
||
foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == TitleFmt(node.InnerText).ToLower())))
|
||
{
|
||
if (nodeChapter.ChildNodes.Contains(node))
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > title node removed");
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
var titleNodes3 = nodeChapter.SelectNodes(@"//u");
|
||
if (titleNodes3 != null && titleNodes3.Any())
|
||
{
|
||
var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t)));
|
||
foreach (var t in xTitleNodes3)
|
||
{
|
||
t.Remove();
|
||
prt(" > title node removed");
|
||
}
|
||
}
|
||
|
||
var titleNodes4 = nodeChapter.SelectNodes(@"//span");
|
||
if (titleNodes4 != null && titleNodes4.Any())
|
||
{
|
||
var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t)));
|
||
foreach (var t in xTitleNodes4)
|
||
{
|
||
t.Remove();
|
||
prt(" > title node removed");
|
||
}
|
||
}
|
||
|
||
var titleNodes5 = nodeChapter.SelectNodes(@"//strong");
|
||
if (titleNodes5 != null && titleNodes5.Any())
|
||
{
|
||
var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t)));
|
||
foreach (var t in xTitleNodes5)
|
||
{
|
||
t.Remove();
|
||
prt(" > title node removed");
|
||
}
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Remove <hr>'s
|
||
|
||
while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr")
|
||
{
|
||
nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First());
|
||
prt(" > header hr removed");
|
||
}
|
||
|
||
while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr")
|
||
{
|
||
nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last());
|
||
prt(" > footer hr removed");
|
||
}
|
||
|
||
#endregion
|
||
|
||
#region Other (Author's Node)
|
||
|
||
foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
|
||
{
|
||
nodeChapter.RemoveChild(node);
|
||
prt(" > authors note removed");
|
||
}
|
||
|
||
#endregion
|
||
|
||
var chap_html = nodeChapter.InnerHtml.Trim();
|
||
|
||
#region Fix raw <hr>
|
||
// KOReader doesn't like <hr>
|
||
|
||
chap_html = chap_html.Replace("<hr>", "<hr/>");
|
||
|
||
#endregion
|
||
|
||
curr.chapter = chap_html;
|
||
|
||
|
||
if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;
|
||
|
||
return ProcessResult.SuccessNormal;
|
||
}
|
||
|
||
// Resolves a root-relative path against the scheme + host of an absolute URL,
// e.g. ("https://example.com/2020/05/x/", "/foo") => "https://example.com/foo".
string combineAuthority(string url, string suffix)
{
    var authority = new Uri(url).GetLeftPart(UriPartial.Authority).TrimEnd('/') + "/";
    return authority + suffix.TrimStart('/');
}
|
||
|
||
// Resolves a relative path against the parent "directory" of a URL:
// the last path segment of uri1 is dropped and uri2 appended,
// e.g. ("http://a/b/c", "d") => "http://a/b/d".
string CombineUri(string uri1, string uri2)
{
    var slash = uri1.LastIndexOf('/');
    var baseUri = (slash >= 0 ? uri1.Substring(0, slash) : uri1).TrimEnd('/');
    return $"{baseUri}/{uri2.TrimStart('/')}";
}
|
||
|
||
// Persists one chapter in three forms:
//   1. the raw query result        -> QUERY_FOLDER
//   2. the cleaned standalone page -> HTML_FOLDER
//   3. the epub-ready html with an <h1> title heading -> EPUB_FOLDER
// The filename pattern is "NNN_<sanitized title>.html" (index zero-padded to 3).
// FIX: paths are now built consistently with Path.Combine instead of mixing
// string concatenation (which silently breaks if a folder constant lacks a
// trailing separator) with Path.Combine.
void OutputChapter(Chapter curr, int index)
{
    // Filenamify keeps digits, '_' and '.', so building the name from parts
    // is equivalent to sanitizing the fully formatted name.
    var baseName = string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html";

    File.WriteAllText(Path.Combine(QUERY_FOLDER, baseName), curr.queryResult);

    File.WriteAllText(Path.Combine(HTML_FOLDER, baseName), curr.sourcecode, Encoding.UTF8);

    // Minimal html wrapper with the chapter title as an <h1> heading.
    StringBuilder b = new StringBuilder();
    {
        b.AppendLine("<!DOCTYPE html>");
        b.AppendLine("<html>");
        b.AppendLine("<body>");
        b.AppendLine();
        b.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(curr.chapter);
        b.AppendLine("</body>");
        b.AppendLine("</html>");
    }
    File.WriteAllText(Path.Combine(EPUB_FOLDER, baseName), b.ToString(), Encoding.UTF8);
}
|
||
|
||
// Sanitizes a string for use as a filename: NBSP (U+00A0) becomes a regular
// space, then only ASCII letters/digits and a small set of safe punctuation
// characters are kept. With repl=true, spaces are additionally replaced by
// underscores (used for epub-internal names).
// FIX: the original predicate listed p == '.' twice; the duplicate is removed.
static string Filenamify(string v, bool repl = false)
{
    const string extraAllowed = " .-*_,";

    var s = new string(v.Replace((char)160, ' ')
                        .Where(p => (p >= '0' && p <= '9') ||
                                    (p >= 'A' && p <= 'Z') ||
                                    (p >= 'a' && p <= 'z') ||
                                    extraAllowed.IndexOf(p) >= 0)
                        .ToArray());

    if (repl) s = s.Replace(' ', '_');

    return s;
}
|
||
|
||
// Normalizes a scraped chapter heading: decodes html entities, unifies
// en-dashes and NBSPs, strips decorative characters from both ends, drops a
// leading "tde" marker and upper-cases the first letter.
string TitleFmt(string raw)
{
    raw = HtmlEntity.DeEntitize(raw)
        .Replace('–', '-')           // en-dash -> plain hyphen
        .Replace((char)160, ' ');    // NBSP -> regular space

    raw = raw.Trim().Trim('-', ':', '_', '#').Trim();

    // A leading "tde" tag is not part of the title itself.
    if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3);

    // Strip again - removing the prefix may have exposed more decoration.
    raw = raw.Trim().Trim('-', ':', '_', '#').Trim();

    // Capitalize the first character (single-char titles are left alone).
    if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1);

    return raw;
}
|
||
|
||
// Returns the node's text content reduced to a comparison form (see the
// string overload), with any <script> and <meta> content excluded first.
// The node is copied before mutation so the caller's DOM stays intact.
// NOTE(review): SelectNodes(@"//script") evaluates from the document root,
// not relative to 'raw' - on the detached copy that root is the copy itself,
// but on the first (pre-copy) check it may match scripts outside 'raw';
// confirm that is acceptable (".//script" would be strictly node-relative).
string Striptease(HtmlNode raw)
{
    {
        var rm = raw.SelectNodes(@"//script");
        if (rm != null && rm.Any())
        {
            // Clone into a fresh element of the same tag so removals below
            // don't mutate the original document.
            var copy = HtmlNode.CreateNode($"<{raw.Name}></{raw.Name}>");
            copy.CopyFrom(raw);
            raw = copy;

            rm = raw.SelectNodes(@"//script");
            if (rm != null) foreach (var e in rm) e.Remove();
        }
    }

    {
        // Same dance for <meta> tags.
        var rm = raw.SelectNodes(@"//meta");
        if (rm != null && rm.Any())
        {
            var copy = HtmlNode.CreateNode($"<{raw.Name}></{raw.Name}>");
            copy.CopyFrom(raw);
            raw = copy;

            rm = raw.SelectNodes(@"//meta");
            if (rm != null) foreach (var e in rm) e.Remove();
        }
    }

    // Decode entities, then normalize via the string overload.
    return Striptease(HtmlEntity.DeEntitize(raw.InnerText));
}
|
||
|
||
// Reduces a string to a comparable identity: every whitespace character
// becomes a plain space, anything that is not a letter, digit or space is
// dropped, everything is lower-cased and the result is trimmed.
// Runs of inner spaces are intentionally NOT collapsed.
string Striptease(string raw)
{
    var sb = new StringBuilder(raw.Length);
    foreach (var c in raw)
    {
        if (char.IsWhiteSpace(c)) sb.Append(' ');
        else if (char.IsLetterOrDigit(c)) sb.Append(char.ToLower(c));
    }
    return sb.ToString().Trim();
}
|
||
|
||
// Collapses a node's text into a bare comparison key: literal entity remnants
// ("&gt;", "&lt;", "&amp;", "&quot;", "&nbsp;") are stripped, then only
// letters/digits are kept, all lower-case. Used to recognize navigation
// markers such as "Previous Chapter | Next Chapter".
// (The entity string literals here were reconstructed - the scraped listing
// had them html-decoded, which is not valid C#.)
string NakedIdentity(HtmlNode raw)
{
    var text = raw.InnerText
        .ToLower()
        .Replace("&gt;", "")
        .Replace("&lt;", "")
        .Replace("&amp;", "")
        .Replace("&quot;", "")
        .Replace("&nbsp;", "");

    return new string(text.Where(char.IsLetterOrDigit)
                          .Select(char.ToLower)
                          .ToArray());
}
|
||
|
||
// Fuzzy match between a DOM node's text and a candidate title: both sides
// are stripped (see Striptease), lower-cased and cleaned of common
// punctuation before an exact comparison.
bool CouldBeTitle(HtmlNode n, string title)
{
    string Normalize(string s)
    {
        s = s.ToLower()
             .Replace(":", "").Replace("-", "")
             .Replace("(", "").Replace(")", "");
        // NOTE(review): runs of 2+ whitespace chars are deleted outright
        // (not collapsed to a single space) - confirm that is intended.
        return Regex.Replace(s, @"\s\s+", "");
    }

    return Normalize(Striptease(n)) == Normalize(Striptease(title));
}
|
||
|
||
// Assembles the epub: writes the zip container (mimetype, container.xml,
// content.opf, toc.ncx and one html file per chapter) to the stash zip,
// then copies it to the .epub stash path and finally to the output location.
void WriteEpub(List<Chapter> chapters)
{
    // Start from a clean slate so stale output never survives a partial run.
    if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
    if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);

    // Registers legacy codepage encodings on .NET Core - presumably required
    // by the zip library; TODO confirm which consumer actually needs it.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    {
        using (var zipbook = new ZipOutputStream(fs))
        {
            // "mimetype" is written as the very first entry (epub convention).
            WritePubString(zipbook, @"mimetype", GetEpubMimetype());
            WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
            WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
            WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

            // Chapter entries are named to match the manifest/spine ids
            // generated in GetEpubContentOPF.
            for (int i = 0; i < chapters.Count; i++)
            {
                WritePubString(zipbook, string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true)), GetEpubChapterFile(chapters[i], i));
            }
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);

    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
|
||
|
||
// Converts the stashed epub to MOBI via Calibre's ebook-convert CLI and
// copies the result to the output location. Throws when the converter
// reports a non-zero exit code.
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);

    "Running ebook-convert for MOBI output".Dump();

    var args = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999";
    var pout = ProcessHelper.ProcExecute("ebook-convert", args);

    $"ebook-convert returned: {pout.ExitCode}".Dump();

    if (pout.ExitCode != 0)
    {
        throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);
    }

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
|
||
|
||
// Writes a single uncompressed entry into the epub zip stream.
// Defaults to UTF-8 when no encoding is supplied. Stored (uncompressed)
// entries keep the output predictable; the epub "mimetype" entry in
// particular must be stored per the OCF specification.
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    var payload = (e ?? Encoding.UTF8).GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
|
||
|
||
// Content of the mandatory "mimetype" zip entry of an epub.
string GetEpubMimetype() => "application/epub+zip";
|
||
|
||
// Builds META-INF/container.xml, which points epub readers at
// OEBPS/content.opf.
// FIX: removed an unused local StringBuilder ("builder") that was never read.
string GetEpubContainerXML()
{
    XNamespace cns = "urn:oasis:names:tc:opendocument:xmlns:container";

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null),
        new XElement(cns + "container",
            new XAttribute("version", "1.0"),
            new XElement(cns + "rootfiles",
                new XElement(cns + "rootfile",
                    new XAttribute("full-path", "OEBPS/content.opf"),
                    new XAttribute("media-type", "application/oebps-package+xml")))));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);

        // XDocument.Save emits encoding="utf-8"; normalize the casing to the
        // upper-case form the rest of the pipeline expects.
        var r = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
        return r.Trim() + "\r\n";
    }
}
|
||
|
||
// Builds OEBPS/content.opf: the epub 2.0 package document containing the
// Dublin Core metadata, the manifest of all chapter files + toc.ncx, the
// linear reading order (spine) and an empty guide.
// FIX: removed an unused local StringBuilder ("builder") that was never read.
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));

    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "publication"),
            ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "modification"),
            DateTime.Now.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "creation"),
            DateTime.Now.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        new XElement(opf + "meta",
            new XAttribute("content", "1.0"),
            new XAttribute("name", "Wordpress_eBook_scraper_version")),
        new XElement(opf + "meta",
            new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")),
            new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    // Calibre-specific series metadata, only for books that belong to a series.
    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }

    package.Add(meta);

    // Manifest: one item per chapter html (href is URI-escaped, id is not),
    // plus the NCX table of contents. Names must match what WriteEpub emits.
    var manifest = new XElement(opf + "manifest");
    for (int i = 0; i < chapters.Count; i++)
    {
        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Filenamify(chapters[i].title, true)))),
            new XAttribute("id", string.Format("x{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true))),
            new XAttribute("media-type", "application/xhtml+xml")));
    }
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));

    package.Add(manifest);

    // Spine: linear reading order, referencing the manifest ids above.
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
    for (int i = 0; i < chapters.Count; i++)
    {
        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", string.Format("x{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true)))));
    }

    package.Add(spine);

    // Epub 2 requires a guide element; an empty one is sufficient here.
    package.Add(new XElement(opf + "guide"));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
|
||
|
||
// Builds OEBPS/toc.ncx: the epub 2 navigation map with one navPoint per
// chapter, in reading order.
// FIX: the NCX elements were previously created in the OPF namespace
// ("http://www.idpf.org/2007/opf") while a variable holding the correct
// DAISY NCX namespace was declared but never used. toc.ncx elements must be
// in "http://www.daisy.org/z3986/2005/ncx/" per the NCX specification.
// Also removed an unused local StringBuilder ("builder").
string GetEpubTOC(List<Chapter> chapters)
{
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            new XElement(ncx + "meta",
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));

    doc.Add(root);

    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", "Unknown")));

    // One navPoint per chapter; src must match the manifest hrefs and the
    // file names WriteEpub puts into the zip.
    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", string.Format("Text/{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true))))));
    }

    root.Add(nav);

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
|
||
|
||
// Renders one chapter as a standalone XHTML 1.1 document with the (entitized)
// chapter title in both <title> and an <h1> heading.
// Note: the idx parameter is currently unused but kept for interface
// stability with callers.
string GetEpubChapterFile(Chapter chapter, int idx)
{
    var title = HtmlEntity.Entitize(chapter.title);

    var xml = new StringBuilder();
    xml.AppendLine(@"<?xml version=""1.0"" encoding=""utf-8""?>");
    xml.AppendLine(@"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ");
    xml.AppendLine(@"<html xmlns=""http://www.w3.org/1999/xhtml"">");
    xml.AppendLine(@"<head>");
    xml.AppendLine("<title>" + title + "</title>");
    xml.AppendLine(@"</head>");
    xml.AppendLine(@"<body>");
    xml.AppendLine("<h1>" + title + "</h1>");
    xml.AppendLine(chapter.chapter);
    xml.AppendLine(@"</body>");
    xml.AppendLine(@"</html>");

    return xml.ToString();
}
|
||
|
||
// Immutable result of a finished child process: the command line that was
// run, its exit code, and the captured stdout / stderr (individually and
// interleaved in arrival order).
// FIX: declared as a readonly struct - all fields were already readonly,
// this makes the immutability explicit and avoids defensive copies.
public readonly struct ProcessOutput
{
    // Full command line ("program arguments") that was executed.
    public readonly string Command;
    // Process exit code.
    public readonly int ExitCode;
    // Captured standard output.
    public readonly string StdOut;
    // Captured standard error.
    public readonly string StdErr;
    // stdout and stderr interleaved in the order lines arrived.
    public readonly string StdCombined;

    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
    {
        Command = cmd;
        ExitCode = ex;
        StdOut = stdout;
        StdErr = stderr;
        StdCombined = stdcom;
    }

    public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
}
|
||
|
||
public static class ProcessHelper
{
    // Runs a console program to completion, capturing stdout and stderr both
    // individually and interleaved (in line-arrival order). Blocks until the
    // process exits and returns its exit code plus the captured streams.
    // FIX: Process is IDisposable and was never disposed; it is now wrapped
    // in a using block so its handles are released.
    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
    {
        using (var process = new Process
        {
            StartInfo =
            {
                FileName = command,
                Arguments = arguments,
                WorkingDirectory = workingDirectory ?? string.Empty,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
                ErrorDialog = false,
            }
        })
        {
            var builderOut = new StringBuilder();
            var builderErr = new StringBuilder();
            var builderBoth = new StringBuilder();

            // Lines are joined with '\n' without a trailing newline; a null
            // Data signals end-of-stream and is ignored.
            process.OutputDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;

                if (builderOut.Length == 0) builderOut.Append(args.Data);
                else builderOut.Append("\n" + args.Data);

                if (builderBoth.Length == 0) builderBoth.Append(args.Data);
                else builderBoth.Append("\n" + args.Data);
            };

            process.ErrorDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;

                if (builderErr.Length == 0) builderErr.Append(args.Data);
                else builderErr.Append("\n" + args.Data);

                if (builderBoth.Length == 0) builderBoth.Append(args.Data);
                else builderBoth.Append("\n" + args.Data);
            };

            process.Start();

            process.BeginOutputReadLine();
            process.BeginErrorReadLine();

            // Parameterless WaitForExit also waits for the async output
            // readers to drain before returning.
            process.WaitForExit();

            return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
        }
    }
}
|
||
// Converts an html document (file, string or parsed HtmlDocument) to plain
// text by walking the DOM: block-level elements become line breaks, inline
// elements flow together, scripts/styles/comments are dropped and entities
// are decoded. Uses "\r\n" line endings.
public static class HTMLToText
{
    // Matches self-closing <link/>, <style/>, <script/> tags.
    private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
    // Matches paired <link>..</link>, <style>..</style>, <script>..</script>
    // whose content contains no further tags.
    private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);

    // Rendering state threaded through the DOM walk. One instance is shared
    // along a chain of inline elements; a fresh instance (sharing only the
    // IsFirstTextOfDocWritten flag) is created for each block-level element.
    private class PreceedingDomTextInfo
    {
        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }
        // Whether whitespace preceding the next text run should be emitted.
        public bool WritePrecedingWhiteSpace { get; set; }
        // Whether the last emitted character was a space (avoids doubling).
        public bool LastCharWasSpace { get; set; }
        // Shared flag: set once any text has been written for the document.
        public readonly BoolWrapper IsFirstTextOfDocWritten;
        // Set to 1 when entering an <ol>; not otherwise read in this code -
        // presumably a hook for numbered-list rendering. TODO confirm.
        public int ListIndex { get; set; }
    }

    // Mutable bool with reference semantics so nested PreceedingDomTextInfo
    // instances can share the "first text written" flag.
    private class BoolWrapper
    {
        public BoolWrapper() { }
        public bool Value { get; set; }
        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }
        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper { Value = boolWrapper };
        }
    }

    // Converts the html file at 'path' to plain text.
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    // Converts an html string to plain text. link/style/script tags are
    // regex-stripped before parsing so their content never reaches the walk.
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        html = REX_TAG1.Replace(html, " ");
        html = REX_TAG2.Replace(html, " ");
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    // Converts a parsed HtmlDocument to plain text.
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    // Recursively renders all children of 'node' with the given state.
    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    // Public entry point: renders 'node' (and descendants) into outText.
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    // Core walk: dispatches on node type, normalizes whitespace in text runs
    // and maps block-level elements to line breaks.
    private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;
            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;
            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }
                // get text
                html = ((HtmlTextNode)node).Text;
                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html)) break;

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Length == 0) break;

                // skip a stray xml declaration that survived parsing
                if (html.Trim().ToLower().StartsWith("<?xml") && html.Trim().ToLower().EndsWith("?>")) break;

                // Suppress leading whitespace right after a block break or a
                // previously emitted space; mark that document text has begun.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0) { break; }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }
                // Collapse internal whitespace runs to one space and decode entities.
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
                // If the raw run ended in whitespace, emit exactly one space
                // and remember it (assignment inside the condition is intentional).
                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                {
                    outText.Write(' ');
                }
                break;
            case HtmlNodeType.Element:
                string endElementString = null;
                bool isInline;
                bool skip = false;
                int listIndex = 0;
                switch (node.Name)
                {
                    case "nav":
                        // navigation blocks are omitted entirely
                        skip = true;
                        isInline = false;
                        break;
                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "span":
                    case "p": // stylistic - adjust as you tend to use
                        // block-level: newline before (once text exists) and after
                        if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
                        endElementString = "\r\n";
                        isInline = false;
                        break;
                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;
                    case "a":
                        isInline = true;
                        break;
                    case "li":
                        isInline = false;
                        break;
                    case "ol":
                        // ordered lists start their item counter at 1
                        listIndex = 1;
                        goto case "ul";
                    case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;
                    case "img": //inline-block in reality
                        isInline = true;
                        break;
                    default:
                        isInline = true;
                        break;
                }
                if (!skip && node.HasChildNodes)
                {
                    // inline elements share state; block elements get a fresh
                    // state object that shares only the document-level flag
                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }
                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}