diff --git a/Config.cs b/Config.cs new file mode 100644 index 0000000..fa14ca6 --- /dev/null +++ b/Config.cs @@ -0,0 +1,83 @@ +using WordpressEboobScraper2.Scraper; + +namespace WordpressEboobScraper2; + +public class Config +{ + + public const string BASE_DIR_STASH = @"/home/mike/stash/eBook_scraper/"; + public const string BASE_DIR_OUT = @"/home/mike/Nextcloud/Dokumente/E-Books/Scraper/"; + public const string COMPARE_PROG = @"/usr/bin/bcompare"; + + //----------------------------------------------------------------------------------------------------// + + public static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/"); + public static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/"); + public static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/"); + public static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/"); + public static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/"); + + public static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/"); + public static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/"); + public static 
readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/"); + public static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/"); + public static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/"); + public static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/"); + public static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/"); + + public static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/"); + public static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/"); + public static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/"); + public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", 
"2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/"); + + public static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/"); + + public static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/"); + + public static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/"); + public static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/"); + public static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/"); + public static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/"); + public static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/"); + public static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/"); + public static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. 
Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/"); + public static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/"); + public static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/"); + public static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/"); + public static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/"); + public static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/"); + public static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/"); + public static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/"); + public static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/"); + + public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. 
Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/"); + + public static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue"); + + public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends"); + + public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall"); + + public static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/"); + + public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/"); + + public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html"); + + public static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother"); + + public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again"); + + public static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. 
Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave"); + + //----------------------------------------------------------------------------------------------------// + + public static readonly EpubParameter[] BOOKS = new[] { TPR }; + + public static readonly bool USE_WEBCACHE = true; + public static readonly bool DO_LIVE_RELOAD_OF_LAST = true; + public static readonly bool CONVERT_MOBI = true; + + public static readonly MainMode MODE = MainMode.Generate; + +} \ No newline at end of file diff --git a/Program.cs b/Program.cs deleted file mode 100644 index 597fb01..0000000 --- a/Program.cs +++ /dev/null @@ -1,1662 +0,0 @@ -/** *************************************************** **/ -/** **/ -/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/ -/** **/ -/** *************************************************** **/ - -const string BASE_DIR_STASH = @"F:\Stash\eBook_scraper\"; -const string BASE_DIR_OUT = @"F:\Home\Cloud\Dokumente\E-Books\Scraper\"; -const string COMPARE_PROG = @"C:\Program Files\Beyond Compare 4\BCompare.exe"; - -//----------------------------------------------------------------------------------------------------// - -static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/"); -static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/"); -static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/"); -static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/"); -static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", 
"2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/"); - -static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/"); -static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/"); -static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/"); -static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/"); -static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/"); -static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/"); -static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/"); - -static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/"); -static readonly EpubParameter 
TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/"); -static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/"); -static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/"); - -static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/"); - -static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/"); - -static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/"); -static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/"); -static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/"); -static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/"); -static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. 
Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/"); -static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/"); -static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/"); -static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/"); -static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/"); -static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/"); -static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/"); -static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/"); -static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/"); -static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/"); -static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. 
Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/"); - -static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/"); - -static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue"); - -static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends"); - -static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall"); - -static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/"); - -static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/"); - -static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html"); - -static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother"); - -static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again"); - -static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect 
Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave"); - -//----------------------------------------------------------------------------------------------------// - -readonly EpubParameter[] BOOKS = new[] { TPR }; - -readonly bool USE_WEBCACHE = true; -readonly bool DO_LIVE_RELOAD_OF_LAST = true; -readonly bool CONVERT_MOBI = true; - -readonly MainMode MODE = MainMode.Generate; - -//----------------------------------------------------------------------------------------------------// - -static EpubParameter ACTIVE_BOOK = null; - -const int LIMIT = 1500; - -readonly Regex REX_NUMSTART = new Regex(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled); - -Dictionary webCache = new Dictionary(); - -string STASH_FOLDER => BASE_DIR_STASH + ACTIVE_BOOK.Foldername + @"\"; - -string WCACHE_FILE => BASE_DIR_OUT + @"_cache\" + ACTIVE_BOOK.Foldername + @".xml"; -string HTML_FILE_OUT => BASE_DIR_OUT + @"html\" + ACTIVE_BOOK.Foldername + @".html"; -string EPUB_FILE_OUT => BASE_DIR_OUT + @"epub\" + ACTIVE_BOOK.Foldername + @".epub"; -string MOBI_FILE_OUT => BASE_DIR_OUT + @"mobi\" + ACTIVE_BOOK.Foldername + @".mobi"; - -string HTML_FILE_STASH => STASH_FOLDER + @"book.html"; -string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip"; -string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub"; -string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi"; - -string QUERY_FOLDER => STASH_FOLDER + @"query\"; // full query result -string HTML_FOLDER => STASH_FOLDER + @"html\"; // unprocessed chapter code -string EPUB_FOLDER => STASH_FOLDER + @"epub\"; // processed epub chapter code - -//----------------------------------------------------------------------------------------------------// - -public enum MainMode -{ - Generate, - Verify, -} - -public enum ProcessResult -{ - SuccessNormal, - ReachedEnd, - SkipChapter, -} - -public enum Site -{ - Wordpress, - WuxiaWorld, - Royalroad, - - WP = Wordpress, - WW = WuxiaWorld, - RR = 
Royalroad,
-}
-
-/// <summary>
-/// Working state for one scraped page.
-/// </summary>
-public class Chapter
-{
-    public string url;    // address the page is fetched from
-    public string title;  // chapter title
-    public string next;   // link to the following chapter, compared between cached and live copies
-
-    public GZippedString queryResult;  // full response body as downloaded (or taken from the web cache)
-    public GZippedString sourcecode;   // NOTE(review): presumably the unprocessed chapter markup - not used in the visible code, confirm
-    public GZippedString chapter;      // chapter markup that is written into the generated book
-
-    public bool isPrologue;
-    public bool isEpilogue;
-    public bool isBonus;
-    public bool isSpecial => isPrologue || isEpilogue || isBonus;
-}
-
-/// <summary>
-/// One URL/content pair of the web cache in its XML-serializable form.
-/// </summary>
-public class SerializableCacheEntry
-{
-    public string URL;
-    public GZippedString Content;
-}
-
-/// <summary>
-/// String wrapper that is stored in XML as Base64 text. The payload is a 4-byte
-/// length of the UTF-8 text (as produced by BitConverter) followed by the
-/// gzip-compressed UTF-8 bytes.
-/// </summary>
-public class GZippedString : IXmlSerializable
-{
-    public string Value { get; set; }
-
-    public System.Xml.Schema.XmlSchema GetSchema() { return null; }
-
-    /// <summary>Reads the element text, decompresses it into <see cref="Value"/> and consumes the end tag.</summary>
-    public void ReadXml(System.Xml.XmlReader reader)
-    {
-        Value = DecompressString(reader.ReadString());
-        reader.ReadEndElement();
-    }
-
-    /// <summary>Writes <see cref="Value"/> as compressed Base64 element text.</summary>
-    public void WriteXml(System.Xml.XmlWriter writer)
-    {
-        writer.WriteString(CompressString(Value));
-    }
-
-    /// <summary>Compresses <paramref name="text"/> into the length-prefixed Base64 form described on the class.</summary>
-    private string CompressString(string text)
-    {
-        byte[] raw = Encoding.UTF8.GetBytes(text);
-        using (var memoryStream = new MemoryStream())
-        {
-            // leaveOpen = true: disposing the GZipStream flushes the final gzip block
-            // while the MemoryStream has to stay readable afterwards.
-            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
-                gZipStream.Write(raw, 0, raw.Length);
-
-            byte[] compressed = memoryStream.ToArray();
-            var framed = new byte[compressed.Length + 4];
-            Buffer.BlockCopy(BitConverter.GetBytes(raw.Length), 0, framed, 0, 4);
-            Buffer.BlockCopy(compressed, 0, framed, 4, compressed.Length);
-            return Convert.ToBase64String(framed);
-        }
-    }
-
-    /// <summary>Inverse of <see cref="CompressString"/>.</summary>
-    private string DecompressString(string compressedText)
-    {
-        byte[] gZipBuffer = Convert.FromBase64String(compressedText);
-        int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
-        var buffer = new byte[dataLength];
-
-        using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
-        using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
-        {
-            // Stream.Read may hand back fewer bytes than requested (GZipStream does so
-            // routinely on current runtimes), so loop until the buffer is full or the
-            // stream ends instead of trusting a single call.
-            int filled = 0;
-            while (filled < buffer.Length)
-            {
-                int got = gZipStream.Read(buffer, filled, buffer.Length - filled);
-                if (got == 0) break;
-                filled += got;
-            }
-            return Encoding.UTF8.GetString(buffer, 0, filled);
-        }
-    }
-
-    public static implicit operator GZippedString(string v) => new GZippedString{Value = v};
-    public static implicit operator string (GZippedString v) => v.Value;
-
-}
-
-/// <summary>
-/// StringWriter that reports UTF-8 as its encoding.
-/// </summary>
-public class Utf8StringWriter : StringWriter
-{
-    public override Encoding Encoding { get { return Encoding.UTF8; } }
-}
-
-/// <summary>
-/// Immutable description of one book: metadata, start URL, source site,
-/// output folder name and two GUIDs derived from title and author.
-/// </summary>
-public class EpubParameter
-{
-    public readonly string Series;
-    public readonly int SeriesIndex;
-    public readonly Guid ID_OPF;
-    public readonly Guid ID_CAL;
-    public readonly string Title;
-    public readonly string Author;
-    public readonly DateTime Release;
-    public readonly string Language;
-    public readonly string StartURL;
-    public readonly string Foldername;
-    public readonly Site SiteType;
-
-    /// <summary>
-    /// Author in "Last, First Middle" order; a single-word name is returned unchanged.
-    /// </summary>
-    public string AuthorSort
-    {
-        get
-        {
-            var parts = Author.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
-            if (parts.Length < 2) return Author;
-
-            // Only the last word moves to the front; reversing every word would turn
-            // "Maxime J. Durand" into "Durand, J., Maxime".
-            return parts[parts.Length - 1] + ", " + string.Join(" ", parts, 0, parts.Length - 1);
-        }
-    }
-
-    /// <summary>Creates a stand-alone book (Series = null, SeriesIndex = -1).</summary>
-    public EpubParameter(Site st, string t, string a, string r, string l, string s) : this(st, null, -1, t, a, r, l, s) { }
-
-    /// <summary>Creates a book that is part of a series.</summary>
-    /// <param name="st">Source site.</param>
-    /// <param name="z">Series name, or null for a stand-alone book.</param>
-    /// <param name="i">Index within the series.</param>
-    /// <param name="t">Title.</param>
-    /// <param name="a">Author.</param>
-    /// <param name="r">Release date in yyyy-MM-dd form; anything else makes ParseExact throw FormatException.</param>
-    /// <param name="l">Language code.</param>
-    /// <param name="s">URL of the first chapter.</param>
-    public EpubParameter(Site st, string z, int i, string t, string a, string r, string l, string s)
-    {
-        SiteType = st;
-        Series = z;
-        SeriesIndex = i;
-        Title = t;
-        Author = a;
-        Release = DateTime.ParseExact(r, "yyyy-MM-dd", CultureInfo.InvariantCulture);
-        Language = l;
-        StartURL = s;
-        Foldername = (z == null)
-            ? Filenamify(t)
-            : $"{Filenamify(z)} {i} - {Filenamify(t)}";
-
-        // Both GUIDs come from a Random seeded with the title/author hash codes.
-        // NOTE(review): string.GetHashCode() is randomized per process on .NET Core / .NET 5+,
-        // so these ids would differ between runs there - confirm the target runtime.
-        var u = new Random(Title.GetHashCode() ^ Author.GetHashCode());
-        var g = new byte[16];
-        u.NextBytes(g);
-        ID_OPF = new Guid(g);
-        u.NextBytes(g);
-        ID_CAL = new Guid(g);
-    }
-
-    public String DisplayStr => (Series == null) ? 
$"{Title}" : $"{Series} {SeriesIndex} - {Title}";
-}
-
-//----------------------------------------------------------------------------------------------------//
-
-// Entry point: enables result auto-scrolling and runs the mode selected by MODE.
-void Main()
-{
-    Util.AutoScrollResults = true;
-
-    switch (MODE)
-    {
-        case MainMode.Generate: Generate(); break;
-        case MainMode.Verify: Verify(); break;
-    }
-}
-
-// For every book in BOOKS: print a banner, reset the working folders, collect the
-// chapters and write the HTML, EPUB and (when CONVERT_MOBI is set) MOBI outputs.
-void Generate()
-{
-    foreach (var book in BOOKS)
-    {
-        ACTIVE_BOOK = book;
-
-        var banner = $" [PROCESSING BOOK] {book.DisplayStr} ";
-        var rule = new string('=', banner.Length);
-
-        for (var i = 0; i < 3; i++) "".Dump();
-        rule.Dump();
-        banner.Dump();
-        rule.Dump();
-        for (var i = 0; i < 3; i++) "".Dump();
-
-        Init();
-
-        var chapters = FindChapters();
-
-        WriteBookHTML(chapters);
-        WriteEpub(chapters);
-        if (CONVERT_MOBI) GenerateMobi();
-    }
-}
-
-// For every book in BOOKS: print a banner, load the web cache and compare the
-// cached chapters against the live site.
-void Verify()
-{
-    foreach (var book in BOOKS)
-    {
-        ACTIVE_BOOK = book;
-
-        var banner = $" [VERIFYING BOOK] {book.DisplayStr} ";
-        var rule = new string('=', banner.Length);
-
-        for (var i = 0; i < 3; i++) "".Dump();
-        rule.Dump();
-        banner.Dump();
-        rule.Dump();
-        for (var i = 0; i < 3; i++) "".Dump();
-
-        LoadWebCache();
-
-        VerifyChapters();
-    }
-}
-
-void Init() -{ - if (Directory.Exists(STASH_FOLDER)) - { - Directory.EnumerateDirectories(STASH_FOLDER).ToList().ForEach(d => Directory.EnumerateFiles(d).ToList().ForEach(File.Delete)); - if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH); - if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH); - if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH); - if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH); - } - - Directory.CreateDirectory(STASH_FOLDER); - Directory.CreateDirectory(QUERY_FOLDER); - Directory.CreateDirectory(HTML_FOLDER); - Directory.CreateDirectory(EPUB_FOLDER); - - Directory.CreateDirectory(BASE_DIR_OUT + @"_cache\"); - Directory.CreateDirectory(BASE_DIR_OUT + @"html\"); - Directory.CreateDirectory(BASE_DIR_OUT + @"epub\"); - 
Directory.CreateDirectory(BASE_DIR_OUT + @"mobi\"); - - if (USE_WEBCACHE) LoadWebCache(); -} - -void WriteBookHTML(List chapters) -{ - StringBuilder b = new StringBuilder(); - - b.AppendLine(""); - b.AppendLine(""); - b.AppendLine(""); - - foreach (var currChapter in chapters) - { - b.AppendLine(); - b.AppendLine("

" + HtmlEntity.Entitize(currChapter.title) + "

"); - b.AppendLine(); - b.AppendLine(currChapter.chapter); - } - - b.AppendLine(""); - b.AppendLine(""); - - File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8); - File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true); -} - -void SaveCache() -{ - var xs = new XmlSerializer(typeof(List)); - using (var writer = new System.IO.StreamWriter(WCACHE_FILE)) - { - xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList()); - } -} - -void LoadWebCache() -{ - if (!File.Exists(WCACHE_FILE)) return; - - XmlSerializer deserializer = new XmlSerializer(typeof(List)); - using (TextReader reader = new StreamReader(WCACHE_FILE)) - { - var result = new List(); - - var l = (List)deserializer.Deserialize(reader); - - webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); - } -} - -List FindChapters() -{ - List result = new List(); - - using (WebClient client = new WebClient()) - { - client.Encoding = Encoding.UTF8; - Stack buffer = new Stack(); - buffer.Push(ACTIVE_BOOK.StartURL); - - while (buffer.Any() && result.Count < LIMIT) - { - var url = buffer.Pop(); - Chapter curr = new Chapter() { url = url }; - - var buffered = webCache.ContainsKey(url.ToLower()); - if (buffered) - { - curr.queryResult = webCache[url.ToLower()]; - "*(loaded from webcache)*".Dump(); - } - else - { - curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); - webCache[url.ToLower()] = curr.queryResult; - SaveCache(); - } - - var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); - if (next_url != null) buffer.Push(next_url); - - if (buffered && buffer.Count == 0 && DO_LIVE_RELOAD_OF_LAST) - { - "".Dump(); - "//==> *(auto-reload from live)*".Dump(); - "".Dump(); - curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); - webCache[url.ToLower()] = curr.queryResult; - SaveCache(); - - r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); - if (next_url_inner != 
null) buffer.Push(next_url_inner); - } - if (r == ProcessResult.SuccessNormal) - { - " ==> Chapter processed".Dump(); - result.Add(curr); - OutputChapter(curr, result.Count); - } - else if (r == ProcessResult.SkipChapter) - { - " ==> Skip this chapter".Dump(); - } - else if (r == ProcessResult.ReachedEnd) - { - " ==> End reached".Dump(); - } - - - "".Dump(); - } - } - - return result; -}
-
-// Walks the chapter chain of ACTIVE_BOOK starting at its StartURL. Every URL that is
-// present in the web cache is downloaded again and processed twice (cached copy and
-// live copy); any difference between the two results is reported. The walk follows
-// the next-URL produced by the CACHED copy and stops at the first URL that is not cached.
-void VerifyChapters()
-{
-    // Passed to ProcessChapter as its back buffer.
-    // NOTE(review): nothing visible here adds to this list, so the LIMIT check below never trips - confirm intent.
-    List<Chapter> result = new List<Chapter>();
-
-    using (WebClient client = new WebClient())
-    {
-        client.Encoding = Encoding.UTF8;
-        Stack<string> buffer = new Stack<string>();
-        buffer.Push(ACTIVE_BOOK.StartURL);
-
-        while (buffer.Any() && result.Count < LIMIT)
-        {
-            var url = buffer.Pop();
-            Chapter curr_buffer = new Chapter() { url = url };
-            Chapter curr_live = new Chapter() { url = url };
-
-            // Cache keys are lower-cased URLs; an uncached page cannot be verified.
-            var buffered = webCache.ContainsKey(url.ToLower());
-            if (!buffered)
-            {
-                continue;
-            }
-
-            try
-            {
-                curr_buffer.queryResult = webCache[url.ToLower()];
-                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
-            }
-            catch (Exception e)
-            {
-                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
-                continue;
-            }
-
-            var is_diff = false;
-
-            var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
-            var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);
-
-            if (next_buffer != null) buffer.Push(next_buffer);
-
-            if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
-            // Compare the queued URLs themselves; this check used to repeat the
-            // process-result comparison above and so never fired on its own.
-            if (next_buffer != next_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }
-
-            if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? 
url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; } - if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; } - - if (curr_buffer.chapter.Value != curr_live.chapter.Value) - { - var clean_buffer = GetChapterText(curr_buffer); - var clean_live = GetChapterText(curr_live); - - if (clean_buffer.Trim() != clean_live.Trim()) - { - $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump(); - new Hyperlinq(() => - { - - var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); - var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); - File.WriteAllText(fa, curr_buffer.chapter.Value); - File.WriteAllText(fb, curr_live.chapter.Value); - Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\""); - - }, "[Compare Raw]").Dump(); - new Hyperlinq(() => - { - - var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); - var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); - File.WriteAllText(fa, clean_buffer); - File.WriteAllText(fb, clean_live); - Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\""); - - }, "[Compare Text]").Dump(); - new Hyperlinq(() => - { - - webCache[url.ToLower()] = curr_live.queryResult; - SaveCache(); - - }, "[Save new version to webcache]").Dump(); - - is_diff = true; - } - } - - if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? 
url}] OK - No differences".Dump(); - - if (is_diff) "".Dump(); - } - } -} - -bool Relaxedurleq(string a, string b) -{ - if (a == b) return true; - if (a.StartsWith("https://")) a = a.Substring("https://".Length); - if (a.StartsWith("http://")) a = a.Substring("http://".Length); - if (b.StartsWith("https://")) b = b.Substring("https://".Length); - if (b.StartsWith("http://")) b = b.Substring("http://".Length); - - return (a==b); -} - -string GetChapterText(Chapter c) -{ - if (string.IsNullOrWhiteSpace(c.chapter.Value)) return string.Empty; - - var clean = HTMLToText.ConvertHtml(c.chapter.Value); - - clean = clean.Trim(); - - clean = new Regex(@"\s+").Replace(clean, " "); - - return clean; -} - -ProcessResult ProcessChapter(Chapter curr, IReadOnlyList backBuffer, Action prt, out string forwardQueue_next) -{ - forwardQueue_next = null; - - HtmlDocument doc = new HtmlDocument(); - doc.LoadHtml(curr.queryResult); - - #region Base - - var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]"); - if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]"); - if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]"); - if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]"); - if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]"); - - var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']"); - if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]"); - if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]"); - if 
(nodeNav == null) nodeNav = nodeContent; - - var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]"); - if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]"); - - #endregion - - #region Title - - var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); - if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]"); - if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1"); - if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong"); - if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1"); - - curr.title = TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText)); - - var titles = new List(); - titles.Add(curr.title); - - if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*")) - { - var baseTitle = curr.title; - - var suffix = TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value); - - var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value; - var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value); - - titles.Add(prefix1); - titles.Add(prefix2); - - var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2); - var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && 
p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2); - var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" title node removed"); - } - else if (altTitleNode4 != null) - { - var newtitle = TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length)); - titles.Add(newtitle); - curr.title = newtitle; - titles.Add(prefix1 + newtitle); - titles.Add(prefix2 + newtitle); - titles.Add(prefix1 + " - " + newtitle); - titles.Add(prefix2 + " - " + newtitle); - - altTitleNode4.Remove(); - prt(" > title node removed"); - } - else if (suffix.Length > 2) - { - curr.title = suffix; - titles.Add(suffix); - } - else - { - prt(" [!!] Warning cannot parse title"); - } - - if (suffix.Length > 2) - { - curr.title = baseTitle; - titles.Add(baseTitle); - } - } - - if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) { - var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length); - while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1); - tit_alt = tit_alt.Trim(); - if (tit_alt.Length>2) curr.title = tit_alt; - } - - #endregion - - curr.sourcecode = "\r\n\r\n\r\n" + nodeContent.OuterHtml + "\r\n\r\n\r\n"; - - if (backBuffer.Any() && backBuffer.First().title == curr.title) - { - prt("[!] Book loop found - skipping entry"); - return ProcessResult.ReachedEnd; // prevent book II loop - } - - curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad); - curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"))); - curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus"))); - - if (ACTIVE_BOOK == APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II"); - - if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus) - { - prt("[!] 
Epilogue found - skipping entry"); - return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue - } - - prt(curr.title + " (" + curr.url + ")"); - - #region Next - - string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" }; - - if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && - backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && - REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success && - REX_NUMSTART.Match(curr.title).Success && - REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value) - { - prt("[!] Book jump found - skipping entry"); - return ProcessResult.ReachedEnd; - } - - var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']"); - if (next == null) - next = nodeContent.Descendants() - .Where(p => p.Name.ToLower() == "a") - .Where(p => Striptease(p) == "next chapter" || Striptease(p) == "next") - .Where(p => p.Attributes.Contains("href")) - .FirstOrDefault(); - - var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a"); - - if (next == null) - next = nodeNav.Descendants() - .Where(p => p.Name.ToLower() == "a") - .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) - .FirstOrDefault(); - - if (next != null) - { - var next_url = next.Attributes["href"].Value.Trim(); - - if (next_url == "." 
|| next_url == "/" || next_url == "./") - { - next=null; - } - else - { - if (next_url.StartsWith("//")) next_url = "http:" + next_url; - - if (next_url.StartsWith("/")) next_url = combineAuthority(curr.url, next_url); - - if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = CombineUri(curr.url, next_url); - - curr.next = next_url; - if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower())) - { - forwardQueue_next = next_url; - } - } - - } - - if (next == null) prt(" > (!) No next URL found"); - - #endregion - - #region Chapter marker - - var cpMarkerIdentities = new List - { - "previousnext", "previouschapternextchapter", - "firstnext", "firstchapternextchapter", - "firstchapter", "previouslast", - - "previouschapterlastchapter", - - "previouschapter", "nextchapter", "lastchapter", - - "first", "previous", "next", "last" - }; - - foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList()) - { - nodeChapter.RemoveChild(node); - prt(" > Chapter marker removed"); - } - - foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) - { - nodeChapter.RemoveChild(node); - prt(" > Chapter marker removed"); - } - - var alist = nodeChapter.SelectNodes("//a"); - if (alist != null) - { - foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) - { - node.Remove(); - prt(" > Chapter marker removed"); - } - } - - var plist = nodeChapter.SelectNodes("//p"); - if (plist != null) - { - foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) - { - node.Remove(); - prt(" > Chapter marker removed"); - } - } - - #endregion - - #region Share Div - - var shareNodes = 
nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"); - if (shareNodes != null) - { - foreach (var node in shareNodes) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > share div removed"); - } - else - { - prt(" > share div cannot be removed - skipping"); - } - } - } - - #endregion - - #region Meta Div - - var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"); - if (metaNodes != null) - { - foreach (var node in metaNodes) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > meta div removed"); - } - else - { - prt(" > meta div cannot be removed - skipping"); - } - } - } - - #endregion - - #region Ad Blocking - - var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."); - if (adNodes1 != null) - { - foreach (var node in adNodes1) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > ad div removed"); - } - else - { - prt(" > ad div cannot be removed - skipping"); - } - } - } - - var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."); - if (adNodes2 != null) - { - foreach (var node in adNodes2) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > ad div removed"); - } - else - { - prt(" > ad div cannot be removed - skipping"); - } - } - } - - var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]"); - if (adNodes3 != null) - { - foreach (var node in adNodes3.Where(n => Striptease(n) == "advertisement")) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > ad div removed"); - } - else - { - prt(" > ad div cannot be removed - skipping"); - } - } - } - - #endregion - - #region Title Paragraphs - - 
var titleNodes1 = nodeChapter.SelectNodes(@"p"); - if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First())) - { - nodeChapter.RemoveChild(titleNodes1.First()); - prt(" > title node removed"); - } - - for (int hval = 1; hval <= 5; hval++) - { - var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval); - if (titleNodes2 != null) - { - foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == TitleFmt(node.InnerText).ToLower()))) - { - if (nodeChapter.ChildNodes.Contains(node)) - { - nodeChapter.RemoveChild(node); - prt(" > title node removed"); - } - } - } - } - - var titleNodes3 = nodeChapter.SelectNodes(@"//u"); - if (titleNodes3 != null && titleNodes3.Any()) - { - var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t))); - foreach (var t in xTitleNodes3) - { - t.Remove(); - prt(" > title node removed"); - } - } - - var titleNodes4 = nodeChapter.SelectNodes(@"//span"); - if (titleNodes4 != null && titleNodes4.Any()) - { - var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t))); - foreach (var t in xTitleNodes4) - { - t.Remove(); - prt(" > title node removed"); - } - } - - var titleNodes5 = nodeChapter.SelectNodes(@"//strong"); - if (titleNodes5 != null && titleNodes5.Any()) - { - var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t))); - foreach (var t in xTitleNodes5) - { - t.Remove(); - prt(" > title node removed"); - } - } - - #endregion - - #region Remove


's - - while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr") - { - nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First()); - prt(" > header hr removed"); - } - - while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr") - { - nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last()); - prt(" > footer hr removed"); - } - - #endregion - - #region Other (Author's Node) - - foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList()) - { - nodeChapter.RemoveChild(node); - prt(" > authors note removed"); - } - - #endregion - - var chap_html = nodeChapter.InnerHtml.Trim(); - - #region Fix raw
- // KOReader doesn't like
- - chap_html = chap_html.Replace("
", "
"); - - #endregion - - curr.chapter = chap_html; - - - if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter; - - return ProcessResult.SuccessNormal; -} - -string combineAuthority(string url, string suffix) -{ - var left = new Uri(url).GetLeftPart(UriPartial.Authority); - if (!left.EndsWith("/")) left = left + "/"; - if (suffix.StartsWith("/")) suffix = suffix.TrimStart('/'); - return left + suffix; -} - -string CombineUri(string uri1, string uri2) -{ - if (uri1.Contains("/")) uri1 = uri1.Substring(0, uri1.LastIndexOf("/")); - uri1 = uri1.TrimEnd('/'); - uri2 = uri2.TrimStart('/'); - return string.Format("{0}/{1}", uri1, uri2); -} - -void OutputChapter(Chapter curr, int index) -{ - File.WriteAllText(QUERY_FOLDER + string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html", curr.queryResult); - - File.WriteAllText(HTML_FOLDER + string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html", curr.sourcecode, Encoding.UTF8); - - StringBuilder b = new StringBuilder(); - { - b.AppendLine(""); - b.AppendLine(""); - b.AppendLine(""); - b.AppendLine(); - b.AppendLine("

" + HtmlEntity.Entitize(curr.title) + "

"); - b.AppendLine(); - b.AppendLine(curr.chapter); - b.AppendLine(""); - b.AppendLine(""); - } - File.WriteAllText(Path.Combine(EPUB_FOLDER, Filenamify(string.Format("{0:000}_{1}.html", index, curr.title))), b.ToString(), Encoding.UTF8); -} - -static string Filenamify(string v, bool repl = false) -{ - var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p => - (p >= '0' && p <= '9') || - (p >= 'A' && p <= 'Z') || - (p >= 'a' && p <= 'z') || - p == ' ' || - p == '.' || - p == '-' || - p == '*' || - p == '_' || - p == '.' || - p == ',').ToArray()); - - if (repl) s = s.Replace(' ', '_'); - - return s; -} - -string TitleFmt(string raw) -{ - raw = HtmlEntity.DeEntitize(raw); - - raw = raw.Replace('–', '-'); - raw = raw.Replace((char)160, ' '); - - raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); - if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3); - - raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); - - if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1); - - return raw; -} - -string Striptease(HtmlNode raw) -{ - { - var rm = raw.SelectNodes(@"//script"); - if (rm != null && rm.Any()) - { - var copy = HtmlNode.CreateNode($"<{raw.Name}>"); - copy.CopyFrom(raw); - raw = copy; - - rm = raw.SelectNodes(@"//script"); - if (rm != null) foreach (var e in rm) e.Remove(); - } - } - - { - var rm = raw.SelectNodes(@"//meta"); - if (rm != null && rm.Any()) - { - var copy = HtmlNode.CreateNode($"<{raw.Name}>"); - copy.CopyFrom(raw); - raw = copy; - - rm = raw.SelectNodes(@"//meta"); - if (rm != null) foreach (var e in rm) e.Remove(); - } - } - - return Striptease(HtmlEntity.DeEntitize(raw.InnerText)); -} - -string Striptease(string raw) -{ - var r = string.Join(string.Empty, - raw - .ToCharArray() - .Select(c => char.IsWhiteSpace(c) ? 
' ' : c) - .Where(c => char.IsLetterOrDigit(c) ||char.IsWhiteSpace(c)) - .Select(c => char.ToLower(c))).Trim(); - return r; -} - -string NakedIdentity(HtmlNode raw) -{ - return string.Join(string.Empty, - raw - .InnerText - .ToLower() - .Replace(">", "") - .Replace("<", "") - .Replace("&", "") - .Replace(""", "") - .Replace(" ", "") - .ToCharArray() - .Where(c => char.IsLetterOrDigit(c)) - .Select(c => char.ToLower(c))).Trim() - .ToLower(); -} - -bool CouldBeTitle(HtmlNode n, string title) -{ - var t0 = Striptease(n); - var t1 = Striptease(title); - - t0 = t0.ToLower(); - t1 = t1.ToLower(); - - t0 = t0.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); - t1 = t1.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); - - t0 = Regex.Replace(t0, @"\s\s+", ""); - t1 = Regex.Replace(t1, @"\s\s+", ""); - - return t0 == t1; -} - -void WriteEpub(List chapters) -{ - if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH); - if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH); - - Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); - - using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite)) - { - using (var zipbook = new ZipOutputStream(fs)) - { - WritePubString(zipbook, @"mimetype", GetEpubMimetype()); - WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML()); - WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters)); - WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters)); - - for (int i = 0; i < chapters.Count; i++) - { - WritePubString(zipbook, string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true)), GetEpubChapterFile(chapters[i], i)); - } - } - } - - File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH); - - File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true); -} - -void GenerateMobi() -{ - if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH); - - "Running ebook-convert for MOBI output".Dump(); - var 
pout = ProcessHelper.ProcExecute("ebook-convert", $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999"); - - $"ebook-convert returned: {pout.ExitCode}".Dump(); - if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined); - - File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true); -} - -void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null) -{ - e = e ?? Encoding.UTF8; - - var f = z.PutNextEntry(n); - f.CompressionLevel = Ionic.Zlib.CompressionLevel.None; - - byte[] buffer = e.GetBytes(c); - z.Write(buffer, 0, buffer.Length); -} - -string GetEpubMimetype() -{ - return "application/epub+zip"; -} - -string GetEpubContainerXML() -{ - var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), - new XElement(XName.Get("container", "urn:oasis:names:tc:opendocument:xmlns:container"), - new XAttribute("version", "1.0"), - new XElement(XName.Get("rootfiles", "urn:oasis:names:tc:opendocument:xmlns:container"), - new XElement(XName.Get("rootfile", "urn:oasis:names:tc:opendocument:xmlns:container"), - new XAttribute("full-path", "OEBPS/content.opf"), - new XAttribute("media-type", "application/oebps-package+xml"))))); - - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - var r = writer.ToString(); - r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\""); - return r.Trim() + "\r\n"; - } -} - -string GetEpubContentOPF(List chapters) -{ - XNamespace dc = "http://purl.org/dc/elements/1.1/"; - XNamespace opf = "http://www.idpf.org/2007/opf"; - - var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null)); - - var package = new XElement(opf + "package", - new XAttribute("unique-identifier", "BookId"), - new XAttribute("version", "2.0")); - - doc.Add(package); - - var meta = new XElement(opf + "metadata", - new XAttribute(XNamespace.Xmlns + "dc", dc), - new 
XAttribute(XNamespace.Xmlns + "opf", opf), - new XElement(dc + "title", ACTIVE_BOOK.Title), - new XElement(dc + "creator", ACTIVE_BOOK.Author), - new XElement(dc + "identifier", - new XAttribute("id", "BookId"), - new XAttribute(opf + "scheme", "UUID"), - "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")), - new XElement(dc + "date", - new XAttribute(opf + "event", "publication"), - ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")), - new XElement(dc + "date", - new XAttribute(opf + "event", "modification"), - DateTime.Now.ToString("yyyy'-'MM'-'dd")), - new XElement(dc + "date", - new XAttribute(opf + "event", "creation"), - DateTime.Now.ToString("yyyy'-'MM'-'dd")), - new XElement(dc + "language", ACTIVE_BOOK.Language), - new XElement(dc + "identifier", - new XAttribute(opf + "scheme", "UUID"), - ACTIVE_BOOK.ID_CAL.ToString("D")), - new XElement(opf + "meta", - new XAttribute("content", "1.0"), - new XAttribute("name", "Wordpress_eBook_scraper_version")), - new XElement(opf + "meta", - new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")), - new XAttribute("name", "Wordpress_eBook_scraper_creation_time"))); - - if (ACTIVE_BOOK.Series != null) - { - meta.Add(new XElement(opf + "meta", - new XAttribute("content", ACTIVE_BOOK.Series), - new XAttribute("name", "calibre:series"))); - meta.Add(new XElement(opf + "meta", - new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)), - new XAttribute("name", "calibre:series_index"))); - } - - package.Add(meta); - - var manifest = new XElement(opf + "manifest"); - for(int i = 0; i < chapters.Count; i++) - { - manifest.Add(new XElement(opf + "item", - new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i+1, Uri.EscapeUriString(Filenamify(chapters[i].title, true)))), - new XAttribute("id", string.Format("x{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true))), - new XAttribute("media-type", "application/xhtml+xml"))); - } - manifest.Add(new XElement(opf + "item", - new XAttribute("href", 
"toc.ncx"), - new XAttribute("id", "ncx"), - new XAttribute("media-type", "application/x-dtbncx+xml"))); - - package.Add(manifest); - - var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx")); - for (int i = 0; i < chapters.Count; i++) - { - spine.Add(new XElement(opf + "itemref", - new XAttribute("idref", string.Format("x{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true))))); - } - - package.Add(spine); - - package.Add(new XElement(opf + "guide")); - - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - return writer.ToString(); - } -} - -string GetEpubTOC(List chapters) -{ - XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/"; - XNamespace ncx = "http://www.idpf.org/2007/opf"; - - var doc = new XDocument( - new XDeclaration("1.0", "UTF-8", null), - new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null)); - - var root = new XElement(ncx + "ncx", - new XAttribute("version", "2005-1"), - new XElement(ncx + "head", - new XElement(ncx + "meta", - new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")), - new XAttribute("name", "dtb:uid")), - new XElement(ncx + "meta", - new XAttribute("content", 1), - new XAttribute("name", "dtb:depth")), - new XElement(ncx + "meta", - new XAttribute("content", 0), - new XAttribute("name", "dtb:totalPageCount")), - new XElement(ncx + "meta", - new XAttribute("content", 0), - new XAttribute("name", "dtb:maxPageNumber")))); - - doc.Add(root); - - root.Add(new XElement(ncx + "docTitle", - new XElement(ncx + "text", "Unknown"))); - - var nav = new XElement(ncx + "navMap"); - for (int i = 0; i < chapters.Count; i++) - { - nav.Add(new XElement(ncx + "navPoint", - new XAttribute("id", "navPoint-" + (i + 1)), - new XAttribute("playOrder", i + 1), - new XElement(ncx + "navLabel", - new XElement(ncx + "text", chapters[i].title)), - new XElement(ncx + "content", - new 
XAttribute("src", string.Format("Text/{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true)))))); - } - - root.Add(nav); - - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - return writer.ToString(); - } -} - -string GetEpubChapterFile(Chapter chapter, int idx) -{ - StringBuilder xml = new StringBuilder(); - - xml.AppendLine(@""); - xml.AppendLine(@" "); - xml.AppendLine(@""); - xml.AppendLine(@""); - xml.AppendLine("" + HtmlEntity.Entitize(chapter.title) + ""); - xml.AppendLine(@""); - xml.AppendLine(@""); - xml.AppendLine("

" + HtmlEntity.Entitize(chapter.title) + "

"); - xml.AppendLine(chapter.chapter); - xml.AppendLine(@""); - xml.AppendLine(@""); - - return xml.ToString(); -} - -public struct ProcessOutput -{ - public readonly string Command; - public readonly int ExitCode; - public readonly string StdOut; - public readonly string StdErr; - public readonly string StdCombined; - - public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom) - { - Command = cmd; - ExitCode = ex; - StdOut = stdout; - StdErr = stderr; - StdCombined = stdcom; - } - - public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}"; -} - -public static class ProcessHelper -{ - public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null) - { - var process = new Process - { - StartInfo = - { - FileName = command, - Arguments = arguments, - WorkingDirectory = workingDirectory ?? string.Empty, - UseShellExecute = false, - RedirectStandardOutput = true, - RedirectStandardError = true, - CreateNoWindow = true, - ErrorDialog = false, - } - }; - - var builderOut = new StringBuilder(); - var builderErr = new StringBuilder(); - var builderBoth = new StringBuilder(); - - process.OutputDataReceived += (sender, args) => - { - if (args.Data == null) return; - - if (builderOut.Length == 0) builderOut.Append(args.Data); - else builderOut.Append("\n" + args.Data); - - if (builderBoth.Length == 0) builderBoth.Append(args.Data); - else builderBoth.Append("\n" + args.Data); - }; - - process.ErrorDataReceived += (sender, args) => - { - if (args.Data == null) return; - - if (builderErr.Length == 0) builderErr.Append(args.Data); - else builderErr.Append("\n" + args.Data); - - if (builderBoth.Length == 0) builderBoth.Append(args.Data); - else builderBoth.Append("\n" + args.Data); - }; - - process.Start(); - - process.BeginOutputReadLine(); - process.BeginErrorReadLine(); - - process.WaitForExit(); - - return new ProcessOutput($"{command} 
{arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString()); - } -} -public static class HTMLToText -{ - private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled); - private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled); - - private class PreceedingDomTextInfo - { - public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten) - { - IsFirstTextOfDocWritten = isFirstTextOfDocWritten; - } - public bool WritePrecedingWhiteSpace { get; set; } - public bool LastCharWasSpace { get; set; } - public readonly BoolWrapper IsFirstTextOfDocWritten; - public int ListIndex { get; set; } - } - - private class BoolWrapper - { - public BoolWrapper() { } - public bool Value { get; set; } - public static implicit operator bool(BoolWrapper boolWrapper) - { - return boolWrapper.Value; - } - public static implicit operator BoolWrapper(bool boolWrapper) - { - return new BoolWrapper { Value = boolWrapper }; - } - } - - public static string Convert(string path) - { - HtmlDocument doc = new HtmlDocument(); - doc.Load(path); - return ConvertDoc(doc); - } - - public static string ConvertHtml(string html) - { - HtmlDocument doc = new HtmlDocument(); - html = REX_TAG1.Replace(html, " "); - html = REX_TAG2.Replace(html, " "); - doc.LoadHtml(html); - return ConvertDoc(doc); - } - - public static string ConvertDoc(HtmlDocument doc) - { - using (StringWriter sw = new StringWriter()) - { - ConvertTo(doc.DocumentNode, sw); - sw.Flush(); - return sw.ToString(); - } - } - - private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) - { - foreach (HtmlNode subnode in node.ChildNodes) - { - ConvertTo(subnode, outText, textInfo); - } - } - - public static void ConvertTo(HtmlNode node, TextWriter outText) - { - ConvertTo(node, outText, new 
PreceedingDomTextInfo(false)); - } - - private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) - { - string html; - switch (node.NodeType) - { - case HtmlNodeType.Comment: - // don't output comments - break; - case HtmlNodeType.Document: - ConvertContentTo(node, outText, textInfo); - break; - case HtmlNodeType.Text: - // script and style must not be output - string parentName = node.ParentNode.Name; - if ((parentName == "script") || (parentName == "style")) - { - break; - } - // get text - html = ((HtmlTextNode)node).Text; - // is it in fact a special closing node output as text? - if (HtmlNode.IsOverlappedClosingElement(html)) break; - - // check the text is meaningful and not a bunch of whitespaces - if (html.Length == 0) break; - - if (html.Trim().ToLower().StartsWith("")) break; - - if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace) - { - html = html.TrimStart(); - if (html.Length == 0) { break; } - textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true; - } - outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " "))); - if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1])) - { - outText.Write(' '); - } - break; - case HtmlNodeType.Element: - string endElementString = null; - bool isInline; - bool skip = false; - int listIndex = 0; - switch (node.Name) - { - case "nav": - skip = true; - isInline = false; - break; - case "body": - case "section": - case "article": - case "aside": - case "h1": - case "h2": - case "header": - case "footer": - case "address": - case "main": - case "div": - case "span": - case "p": // stylistic - adjust as you tend to use - if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n"); - endElementString = "\r\n"; - isInline = false; - break; - case "br": - outText.Write("\r\n"); - skip = true; - textInfo.WritePrecedingWhiteSpace = false; - isInline = true; - break; - case "a": - isInline = true; - break; 
- case "li": - isInline = false; - break; - case "ol": - listIndex = 1; - goto case "ul"; - case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems - endElementString = "\r\n"; - isInline = false; - break; - case "img": //inline-block in reality - isInline = true; - break; - default: - isInline = true; - break; - } - if (!skip && node.HasChildNodes) - { - ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex }); - } - if (endElementString != null) - { - outText.Write(endElementString); - } - break; - } - } -} \ No newline at end of file diff --git a/Scraper/Chapter.cs b/Scraper/Chapter.cs new file mode 100644 index 0000000..03af5fc --- /dev/null +++ b/Scraper/Chapter.cs @@ -0,0 +1,17 @@ +namespace WordpressEboobScraper2.Scraper; + +public class Chapter +{ + public string url; + public string title; + public string next; + + public GZippedString queryResult; + public GZippedString sourcecode; + public GZippedString chapter; + + public bool isPrologue; + public bool isEpilogue; + public bool isBonus; + public bool isSpecial => isPrologue || isEpilogue || isBonus; +} \ No newline at end of file diff --git a/Scraper/EpubParameter.cs b/Scraper/EpubParameter.cs new file mode 100644 index 0000000..ce776fc --- /dev/null +++ b/Scraper/EpubParameter.cs @@ -0,0 +1,47 @@ +using System.Globalization; + +namespace WordpressEboobScraper2.Scraper; + +public class EpubParameter + { + public readonly string Series; + public readonly int SeriesIndex; + public readonly Guid ID_OPF; + public readonly Guid ID_CAL; + public readonly string Title; + public readonly string Author; + public readonly DateTime Release; + public readonly string Language; + public readonly string StartURL; + public readonly string Foldername; + public readonly Site SiteType; + + public string AuthorSort { get { return Author.Split(' ').Aggregate((a, b) => b + ", " + a); 
/// <summary>
/// A string that is GZip-compressed and Base64-encoded whenever it is written to XML,
/// and transparently expanded again when read back.
/// </summary>
/// <remarks>
/// Serialized layout (unchanged from the original implementation): Base64 of a 4-byte
/// length prefix (the UTF-8 byte count of the text, as produced by
/// <see cref="BitConverter.GetBytes(int)"/>) followed by the GZip stream.
/// A <c>null</c> <see cref="Value"/> is written as an element without content.
/// </remarks>
public class GZippedString : IXmlSerializable
{
    /// <summary>Number of bytes occupied by the uncompressed-length prefix.</summary>
    private const int LengthPrefixSize = sizeof(int);

    /// <summary>The uncompressed text.</summary>
    public string Value { get; set; }

    /// <summary>No schema is provided for this type; always returns <c>null</c>.</summary>
    public System.Xml.Schema.XmlSchema GetSchema() { return null; }

    /// <summary>Reads the element content and expands it into <see cref="Value"/>.</summary>
    /// <param name="reader">Reader positioned on the start tag of the wrapping element.</param>
    public void ReadXml(System.Xml.XmlReader reader)
    {
        if (reader.IsEmptyElement)
        {
            // <Element/> has no end tag: consume the start tag only, otherwise
            // ReadEndElement() would throw and leave the reader mispositioned.
            Value = null;
            reader.Read();
            return;
        }

        string payload = reader.ReadString();
        // An empty payload can only come from a null Value (even "" compresses to a non-empty stream).
        Value = payload.Length == 0 ? null : DecompressString(payload);
        reader.ReadEndElement();
    }

    /// <summary>Writes <see cref="Value"/> as compressed Base64 text; writes nothing when it is <c>null</c>.</summary>
    /// <param name="writer">Writer positioned inside the wrapping element.</param>
    public void WriteXml(System.Xml.XmlWriter writer)
    {
        if (Value == null) return;

        writer.WriteString(CompressString(Value));
    }

    /// <summary>Compresses <paramref name="text"/> into the length-prefixed Base64 layout.</summary>
    private static string CompressString(string text)
    {
        byte[] raw = Encoding.UTF8.GetBytes(text);

        using var packed = new MemoryStream();
        packed.Write(BitConverter.GetBytes(raw.Length), 0, LengthPrefixSize);

        // leaveOpen: the GZip stream must be disposed (to flush its footer) before the
        // MemoryStream contents are taken, but must not close the MemoryStream itself.
        using (var gzip = new GZipStream(packed, CompressionMode.Compress, true))
        {
            gzip.Write(raw, 0, raw.Length);
        }

        return Convert.ToBase64String(packed.ToArray());
    }

    /// <summary>Inverse of <see cref="CompressString"/>.</summary>
    /// <exception cref="InvalidDataException">
    /// The compressed stream ends before the number of bytes announced by the length prefix was produced.
    /// </exception>
    private static string DecompressString(string compressedText)
    {
        byte[] packed = Convert.FromBase64String(compressedText);
        int expected = BitConverter.ToInt32(packed, 0);
        var raw = new byte[expected];

        using var source = new MemoryStream(packed, LengthPrefixSize, packed.Length - LengthPrefixSize, false);
        using var gzip = new GZipStream(source, CompressionMode.Decompress);

        // Stream.Read may return fewer bytes than requested (GZipStream routinely does so
        // since .NET 6), so keep reading until the announced length has been filled.
        int filled = 0;
        while (filled < expected)
        {
            int got = gzip.Read(raw, filled, expected - filled);
            if (got == 0)
            {
                throw new InvalidDataException($"Compressed string is truncated: expected {expected} bytes, got {filled}.");
            }
            filled += got;
        }

        return Encoding.UTF8.GetString(raw);
    }

    /// <summary>Wraps a plain string.</summary>
    public static implicit operator GZippedString(string v) => new GZippedString { Value = v };

    /// <summary>Unwraps the plain string; a <c>null</c> wrapper converts to a <c>null</c> string.</summary>
    public static implicit operator string(GZippedString v) => v?.Value;
}
|| + p == ',').ToArray()); + + if (repl) s = s.Replace(' ', '_'); + + return s; + } + + public static string TitleFmt(string raw) + { + raw = HtmlEntity.DeEntitize(raw); + + raw = raw.Replace('–', '-'); + raw = raw.Replace((char)160, ' '); + + raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); + if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3); + + raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); + + if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1); + + return raw; + } + + public static string Striptease(HtmlNode raw) + { + { + var rm = raw.SelectNodes(@"//script"); + if (rm != null && rm.Any()) + { + var copy = HtmlNode.CreateNode($"<{raw.Name}>"); + copy.CopyFrom(raw); + raw = copy; + + rm = raw.SelectNodes(@"//script"); + if (rm != null) foreach (var e in rm) e.Remove(); + } + } + + { + var rm = raw.SelectNodes(@"//meta"); + if (rm != null && rm.Any()) + { + var copy = HtmlNode.CreateNode($"<{raw.Name}>"); + copy.CopyFrom(raw); + raw = copy; + + rm = raw.SelectNodes(@"//meta"); + if (rm != null) foreach (var e in rm) e.Remove(); + } + } + + return Striptease(HtmlEntity.DeEntitize(raw.InnerText)); + } + + public static string Striptease(string raw) + { + var r = string.Join(string.Empty, + raw + .ToCharArray() + .Select(c => char.IsWhiteSpace(c) ? 
' ' : c) + .Where(c => char.IsLetterOrDigit(c) ||char.IsWhiteSpace(c)) + .Select(c => char.ToLower(c))).Trim(); + return r; + } + + public static string CombineAuthority(string url, string suffix) + { + var left = new Uri(url).GetLeftPart(UriPartial.Authority); + if (!left.EndsWith("/")) left = left + "/"; + if (suffix.StartsWith("/")) suffix = suffix.TrimStart('/'); + return left + suffix; + } + + public static string CombineUri(string uri1, string uri2) + { + if (uri1.Contains("/")) uri1 = uri1.Substring(0, uri1.LastIndexOf("/")); + uri1 = uri1.TrimEnd('/'); + uri2 = uri2.TrimStart('/'); + return string.Format("{0}/{1}", uri1, uri2); + } + +} \ No newline at end of file diff --git a/Scraper/Hyperlinq.cs b/Scraper/Hyperlinq.cs new file mode 100644 index 0000000..00b6a68 --- /dev/null +++ b/Scraper/Hyperlinq.cs @@ -0,0 +1,18 @@ +namespace WordpressEboobScraper2.Scraper; + +public class Hyperlinq +{ + private readonly Action action; + private readonly string title; + + public Hyperlinq(Action action, string title) + { + this.action = action; + this.title = title; + } + + public void Dump() + { + Console.Out.WriteLine(this.title); + } +} \ No newline at end of file diff --git a/Scraper/MainMode.cs b/Scraper/MainMode.cs new file mode 100644 index 0000000..393282f --- /dev/null +++ b/Scraper/MainMode.cs @@ -0,0 +1,7 @@ +namespace WordpressEboobScraper2.Scraper; + +public enum MainMode +{ + Generate, + Verify, +} \ No newline at end of file diff --git a/Scraper/ProcessResult.cs b/Scraper/ProcessResult.cs new file mode 100644 index 0000000..8ae508c --- /dev/null +++ b/Scraper/ProcessResult.cs @@ -0,0 +1,8 @@ +namespace WordpressEboobScraper2.Scraper; + +public enum ProcessResult +{ + SuccessNormal, + ReachedEnd, + SkipChapter, +} \ No newline at end of file diff --git a/Scraper/Scraper.cs b/Scraper/Scraper.cs new file mode 100644 index 0000000..c8bac22 --- /dev/null +++ b/Scraper/Scraper.cs @@ -0,0 +1,1351 @@ +using System.Diagnostics; +using System.Net; +using 
System.Text; +using System.Text.RegularExpressions; +using System.Xml.Linq; +using System.Xml.Serialization; +using HtmlAgilityPack; +using Ionic.Zip; + +namespace WordpressEboobScraper2.Scraper; + +/** *************************************************** **/ +/** **/ +/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/ +/** **/ +/** *************************************************** **/ + +class Scraper +{ + + static EpubParameter ACTIVE_BOOK = null; + + const int LIMIT = 1500; + + readonly Regex REX_NUMSTART = new Regex(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled); + + Dictionary webCache = new Dictionary(); + + string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar; + + string WCACHE_FILE => Path.Combine(Config.BASE_DIR_OUT, @"_cache" , ACTIVE_BOOK.Foldername + @".xml"); + string HTML_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"html" , ACTIVE_BOOK.Foldername + @".html"); + string EPUB_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"epub" , ACTIVE_BOOK.Foldername + @".epub"); + string MOBI_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"mobi" , ACTIVE_BOOK.Foldername + @".mobi"); + + string HTML_FILE_STASH => STASH_FOLDER + @"book.html"; + string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip"; + string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub"; + string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi"; + + string QUERY_FOLDER => STASH_FOLDER + @"query" + Path.DirectorySeparatorChar; // full query result + string HTML_FOLDER => STASH_FOLDER + @"html" + Path.DirectorySeparatorChar; // unprocessed chapter code + string EPUB_FOLDER => STASH_FOLDER + @"epub" + Path.DirectorySeparatorChar; // processed epub chapter code + + //----------------------------------------------------------------------------------------------------// + + //----------------------------------------------------------------------------------------------------// + + public void Generate() + { + foreach (var bb in Config.BOOKS) + { + ACTIVE_BOOK = 
bb; + + $"".Dump(); + $"".Dump(); + $"".Dump(); + new string('=', $" [PROCESSING BOOK] {bb.DisplayStr} ".Length).Dump(); + $" [PROCESSING BOOK] {bb.DisplayStr} ".Dump(); + new string('=', $" [PROCESSING BOOK] {bb.DisplayStr} ".Length).Dump(); + $"".Dump(); + $"".Dump(); + $"".Dump(); + + Init(); + + List chapters = FindChapters(); + + WriteBookHTML(chapters); + WriteEpub(chapters); + if (Config.CONVERT_MOBI) GenerateMobi(); + } + } + + public void Verify() + { + foreach (var bb in Config.BOOKS) + { + ACTIVE_BOOK = bb; + + $"".Dump(); + $"".Dump(); + $"".Dump(); + new string('=', $" [VERIFYING BOOK] {bb.DisplayStr} ".Length).Dump(); + $" [VERIFYING BOOK] {bb.DisplayStr} ".Dump(); + new string('=', $" [VERIFYING BOOK] {bb.DisplayStr} ".Length).Dump(); + $"".Dump(); + $"".Dump(); + $"".Dump(); + + LoadWebCache(); + + VerifyChapters(); + } + } + + void Init() + { + if (Directory.Exists(STASH_FOLDER)) + { + Directory.EnumerateDirectories(STASH_FOLDER).ToList().ForEach(d => Directory.EnumerateFiles(d).ToList().ForEach(File.Delete)); + if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH); + if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH); + if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH); + if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH); + } + + Directory.CreateDirectory(STASH_FOLDER); + Directory.CreateDirectory(QUERY_FOLDER); + Directory.CreateDirectory(HTML_FOLDER); + Directory.CreateDirectory(EPUB_FOLDER); + + Directory.CreateDirectory(Config.BASE_DIR_OUT + @"_cache" + Path.DirectorySeparatorChar); + Directory.CreateDirectory(Config.BASE_DIR_OUT + @"html" + Path.DirectorySeparatorChar); + Directory.CreateDirectory(Config.BASE_DIR_OUT + @"epub" + Path.DirectorySeparatorChar); + Directory.CreateDirectory(Config.BASE_DIR_OUT + @"mobi" + Path.DirectorySeparatorChar); + + if (Config.USE_WEBCACHE) LoadWebCache(); + } + + void WriteBookHTML(List chapters) + { + StringBuilder b = new StringBuilder(); + + b.AppendLine(""); 
+ b.AppendLine(""); + b.AppendLine(""); + + foreach (var currChapter in chapters) + { + b.AppendLine(); + b.AppendLine("

" + HtmlEntity.Entitize(currChapter.title) + "

"); + b.AppendLine(); + b.AppendLine(currChapter.chapter); + } + + b.AppendLine(""); + b.AppendLine(""); + + File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8); + File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true); + } + + void SaveCache() + { + var xs = new XmlSerializer(typeof(List)); + using (var writer = new System.IO.StreamWriter(WCACHE_FILE)) + { + xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList()); + } + } + + void LoadWebCache() + { + if (!File.Exists(WCACHE_FILE)) return; + + XmlSerializer deserializer = new XmlSerializer(typeof(List)); + using (TextReader reader = new StreamReader(WCACHE_FILE)) + { + var result = new List(); + + var l = (List)deserializer.Deserialize(reader); + + webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); + } + } + + List FindChapters() + { + List result = new List(); + + using (WebClient client = new WebClient()) + { + client.Encoding = Encoding.UTF8; + Stack buffer = new Stack(); + buffer.Push(ACTIVE_BOOK.StartURL); + + while (buffer.Any() && result.Count < LIMIT) + { + var url = buffer.Pop(); + Chapter curr = new Chapter() { url = url }; + + var buffered = webCache.ContainsKey(url.ToLower()); + if (buffered) + { + curr.queryResult = webCache[url.ToLower()]; + "*(loaded from webcache)*".Dump(); + } + else + { + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + } + + var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); + if (next_url != null) buffer.Push(next_url); + + if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST) + { + "".Dump(); + "//==> *(auto-reload from live)*".Dump(); + "".Dump(); + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + + r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); + if 
(next_url_inner != null) buffer.Push(next_url_inner); + } + if (r == ProcessResult.SuccessNormal) + { + " ==> Chapter processed".Dump(); + result.Add(curr); + OutputChapter(curr, result.Count); + } + else if (r == ProcessResult.SkipChapter) + { + " ==> Skip this chapter".Dump(); + } + else if (r == ProcessResult.ReachedEnd) + { + " ==> End reached".Dump(); + } + + + "".Dump(); + } + } + + return result; + } + + void VerifyChapters() + { + List result = new List(); + + using (WebClient client = new WebClient()) + { + client.Encoding = Encoding.UTF8; + Stack buffer = new Stack(); + buffer.Push(ACTIVE_BOOK.StartURL); + + while (buffer.Any() && result.Count < LIMIT) + { + var url = buffer.Pop(); + Chapter curr_buffer = new Chapter() { url = url }; + Chapter curr_live = new Chapter() { url = url }; + + var buffered = webCache.ContainsKey(url.ToLower()); + if (buffered) + { + try + { + curr_buffer.queryResult = webCache[url.ToLower()]; + curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + } + catch (Exception e) + { + $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump(); + continue; + } + } + else + { + continue; + } + + var is_diff = false; + + var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer); + var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live); + + if (next_buffer != null) buffer.Push(next_buffer); + + if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; } + if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; } + + if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? 
url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; } + if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; } + + if (curr_buffer.chapter.Value != curr_live.chapter.Value) + { + var clean_buffer = GetChapterText(curr_buffer); + var clean_live = GetChapterText(curr_live); + + if (clean_buffer.Trim() != clean_live.Trim()) + { + $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump(); + new Hyperlinq(() => + { + + var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); + var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); + File.WriteAllText(fa, curr_buffer.chapter.Value); + File.WriteAllText(fb, curr_live.chapter.Value); + Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); + + }, "[Compare Raw]").Dump(); + new Hyperlinq(() => + { + + var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); + var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); + File.WriteAllText(fa, clean_buffer); + File.WriteAllText(fb, clean_live); + Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); + + }, "[Compare Text]").Dump(); + new Hyperlinq(() => + { + + webCache[url.ToLower()] = curr_live.queryResult; + SaveCache(); + + }, "[Save new version to webcache]").Dump(); + + is_diff = true; + } + } + + if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? 
url}] OK - No differences".Dump(); + + if (is_diff) "".Dump(); + } + } + } + + bool Relaxedurleq(string a, string b) + { + if (a == b) return true; + if (a.StartsWith("https://")) a = a.Substring("https://".Length); + if (a.StartsWith("http://")) a = a.Substring("http://".Length); + if (b.StartsWith("https://")) b = b.Substring("https://".Length); + if (b.StartsWith("http://")) b = b.Substring("http://".Length); + + return (a==b); + } + + string GetChapterText(Chapter c) + { + if (string.IsNullOrWhiteSpace(c.chapter.Value)) return string.Empty; + + var clean = HTMLToText.ConvertHtml(c.chapter.Value); + + clean = clean.Trim(); + + clean = new Regex(@"\s+").Replace(clean, " "); + + return clean; + } + + ProcessResult ProcessChapter(Chapter curr, IReadOnlyList backBuffer, Action prt, out string forwardQueue_next) + { + forwardQueue_next = null; + + HtmlDocument doc = new HtmlDocument(); + doc.LoadHtml(curr.queryResult); + + #region Base + + var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]"); + if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]"); + if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]"); + if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]"); + if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]"); + + var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']"); + if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]"); + if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]"); 
+ if (nodeNav == null) nodeNav = nodeContent; + + var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]"); + if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]"); + + #endregion + + #region Title + + var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); + if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]"); + if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1"); + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong"); + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1"); + + curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText)); + + var titles = new List(); + titles.Add(curr.title); + + if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*")) + { + var baseTitle = curr.title; + + var suffix = Helper.TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value); + + var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value; + var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value); + + titles.Add(prefix1); + titles.Add(prefix2); + + var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2); + var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && 
p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2); + var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" title node removed"); + } + else if (altTitleNode4 != null) + { + var newtitle = Helper.TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length)); + titles.Add(newtitle); + curr.title = newtitle; + titles.Add(prefix1 + newtitle); + titles.Add(prefix2 + newtitle); + titles.Add(prefix1 + " - " + newtitle); + titles.Add(prefix2 + " - " + newtitle); + + altTitleNode4.Remove(); + prt(" > title node removed"); + } + else if (suffix.Length > 2) + { + curr.title = suffix; + titles.Add(suffix); + } + else + { + prt(" [!!] Warning cannot parse title"); + } + + if (suffix.Length > 2) + { + curr.title = baseTitle; + titles.Add(baseTitle); + } + } + + if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) { + var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length); + while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1); + tit_alt = tit_alt.Trim(); + if (tit_alt.Length>2) curr.title = tit_alt; + } + + #endregion + + curr.sourcecode = "\r\n\r\n\r\n" + nodeContent.OuterHtml + "\r\n\r\n\r\n"; + + if (backBuffer.Any() && backBuffer.First().title == curr.title) + { + prt("[!] Book loop found - skipping entry"); + return ProcessResult.ReachedEnd; // prevent book II loop + } + + curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad); + curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"))); + curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus"))); + + if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II"); + + if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus) + { + prt("[!] 
Epilogue found - skipping entry"); + return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue + } + + prt(curr.title + " (" + curr.url + ")"); + + #region Next + + string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" }; + + if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && + backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && + REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success && + REX_NUMSTART.Match(curr.title).Success && + REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value) + { + prt("[!] Book jump found - skipping entry"); + return ProcessResult.ReachedEnd; + } + + var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']"); + if (next == null) + next = nodeContent.Descendants() + .Where(p => p.Name.ToLower() == "a") + .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next") + .Where(p => p.Attributes.Contains("href")) + .FirstOrDefault(); + + var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a"); + + if (next == null) + next = nodeNav.Descendants() + .Where(p => p.Name.ToLower() == "a") + .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) + .FirstOrDefault(); + + if (next != null) + { + var next_url = next.Attributes["href"].Value.Trim(); + + if (next_url == "." 
|| next_url == "/" || next_url == "./") + { + next=null; + } + else + { + if (next_url.StartsWith("//")) next_url = "http:" + next_url; + + if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url); + + if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url); + + curr.next = next_url; + if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower())) + { + forwardQueue_next = next_url; + } + } + + } + + if (next == null) prt(" > (!) No next URL found"); + + #endregion + + #region Chapter marker + + var cpMarkerIdentities = new List + { + "previousnext", "previouschapternextchapter", + "firstnext", "firstchapternextchapter", + "firstchapter", "previouslast", + + "previouschapterlastchapter", + + "previouschapter", "nextchapter", "lastchapter", + + "first", "previous", "next", "last" + }; + + foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > Chapter marker removed"); + } + + foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > Chapter marker removed"); + } + + var alist = nodeChapter.SelectNodes("//a"); + if (alist != null) + { + foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + node.Remove(); + prt(" > Chapter marker removed"); + } + } + + var plist = nodeChapter.SelectNodes("//p"); + if (plist != null) + { + foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + node.Remove(); + prt(" > Chapter marker removed"); + } + } + + #endregion + + #region Share Div + + var 
shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"); + if (shareNodes != null) + { + foreach (var node in shareNodes) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > share div removed"); + } + else + { + prt(" > share div cannot be removed - skipping"); + } + } + } + + #endregion + + #region Meta Div + + var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"); + if (metaNodes != null) + { + foreach (var node in metaNodes) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > meta div removed"); + } + else + { + prt(" > meta div cannot be removed - skipping"); + } + } + } + + #endregion + + #region Ad Blocking + + var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."); + if (adNodes1 != null) + { + foreach (var node in adNodes1) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."); + if (adNodes2 != null) + { + foreach (var node in adNodes2) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]"); + if (adNodes3 != null) + { + foreach (var node in adNodes3.Where(n => Helper.Striptease(n) == "advertisement")) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + #endregion + + #region 
Title Paragraphs + + var titleNodes1 = nodeChapter.SelectNodes(@"p"); + if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == Helper.TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First())) + { + nodeChapter.RemoveChild(titleNodes1.First()); + prt(" > title node removed"); + } + + for (int hval = 1; hval <= 5; hval++) + { + var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval); + if (titleNodes2 != null) + { + foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower()))) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > title node removed"); + } + } + } + } + + var titleNodes3 = nodeChapter.SelectNodes(@"//u"); + if (titleNodes3 != null && titleNodes3.Any()) + { + var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes3) + { + t.Remove(); + prt(" > title node removed"); + } + } + + var titleNodes4 = nodeChapter.SelectNodes(@"//span"); + if (titleNodes4 != null && titleNodes4.Any()) + { + var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes4) + { + t.Remove(); + prt(" > title node removed"); + } + } + + var titleNodes5 = nodeChapter.SelectNodes(@"//strong"); + if (titleNodes5 != null && titleNodes5.Any()) + { + var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes5) + { + t.Remove(); + prt(" > title node removed"); + } + } + + #endregion + + #region Remove


's + + while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr") + { + nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First()); + prt(" > header hr removed"); + } + + while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr") + { + nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last()); + prt(" > footer hr removed"); + } + + #endregion + + #region Other (Author's Node) + + foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > authors note removed"); + } + + #endregion + + var chap_html = nodeChapter.InnerHtml.Trim(); + + #region Fix raw
+ // KOReader doesn't like
+ + chap_html = chap_html.Replace("
", "
"); + + #endregion + + curr.chapter = chap_html; + + + if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter; + + return ProcessResult.SuccessNormal; + } + + void OutputChapter(Chapter curr, int index) + { + File.WriteAllText(QUERY_FOLDER + string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html", curr.queryResult); + + File.WriteAllText(HTML_FOLDER + string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html", curr.sourcecode, Encoding.UTF8); + + StringBuilder b = new StringBuilder(); + { + b.AppendLine(""); + b.AppendLine(""); + b.AppendLine(""); + b.AppendLine(); + b.AppendLine("

" + HtmlEntity.Entitize(curr.title) + "

"); + b.AppendLine(); + b.AppendLine(curr.chapter); + b.AppendLine(""); + b.AppendLine(""); + } + File.WriteAllText(Path.Combine(EPUB_FOLDER, Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title))), b.ToString(), Encoding.UTF8); + } + + string NakedIdentity(HtmlNode raw) + { + return string.Join(string.Empty, + raw + .InnerText + .ToLower() + .Replace(">", "") + .Replace("<", "") + .Replace("&", "") + .Replace(""", "") + .Replace(" ", "") + .ToCharArray() + .Where(c => char.IsLetterOrDigit(c)) + .Select(c => char.ToLower(c))).Trim() + .ToLower(); + } + + bool CouldBeTitle(HtmlNode n, string title) + { + var t0 = Helper.Striptease(n); + var t1 = Helper.Striptease(title); + + t0 = t0.ToLower(); + t1 = t1.ToLower(); + + t0 = t0.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); + t1 = t1.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); + + t0 = Regex.Replace(t0, @"\s\s+", ""); + t1 = Regex.Replace(t1, @"\s\s+", ""); + + return t0 == t1; + } + + void WriteEpub(List chapters) + { + if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH); + if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH); + + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + + using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite)) + { + using (var zipbook = new ZipOutputStream(fs)) + { + WritePubString(zipbook, @"mimetype", GetEpubMimetype()); + WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML()); + WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters)); + WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters)); + + for (int i = 0; i < chapters.Count; i++) + { + WritePubString(zipbook, string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true)), GetEpubChapterFile(chapters[i], i)); + } + } + } + + File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH); + + File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true); + } 
+ + void GenerateMobi() + { + if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH); + + "Running ebook-convert for MOBI output".Dump(); + var pout = ProcessHelper.ProcExecute("ebook-convert", $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999"); + + $"ebook-convert returned: {pout.ExitCode}".Dump(); + if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined); + + File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true); + } + + void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null) + { + e = e ?? Encoding.UTF8; + + var f = z.PutNextEntry(n); + f.CompressionLevel = Ionic.Zlib.CompressionLevel.None; + + byte[] buffer = e.GetBytes(c); + z.Write(buffer, 0, buffer.Length); + } + + string GetEpubMimetype() + { + return "application/epub+zip"; + } + + string GetEpubContainerXML() + { + var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), + new XElement(XName.Get("container", "urn:oasis:names:tc:opendocument:xmlns:container"), + new XAttribute("version", "1.0"), + new XElement(XName.Get("rootfiles", "urn:oasis:names:tc:opendocument:xmlns:container"), + new XElement(XName.Get("rootfile", "urn:oasis:names:tc:opendocument:xmlns:container"), + new XAttribute("full-path", "OEBPS/content.opf"), + new XAttribute("media-type", "application/oebps-package+xml"))))); + + StringBuilder builder = new StringBuilder(); + using (Utf8StringWriter writer = new Utf8StringWriter()) + { + doc.Save(writer); + var r = writer.ToString(); + r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\""); + return r.Trim() + "\r\n"; + } + } + + string GetEpubContentOPF(List chapters) + { + XNamespace dc = "http://purl.org/dc/elements/1.1/"; + XNamespace opf = "http://www.idpf.org/2007/opf"; + + var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null)); + + var package = new XElement(opf + "package", + new XAttribute("unique-identifier", "BookId"), + new 
XAttribute("version", "2.0")); + + doc.Add(package); + + var meta = new XElement(opf + "metadata", + new XAttribute(XNamespace.Xmlns + "dc", dc), + new XAttribute(XNamespace.Xmlns + "opf", opf), + new XElement(dc + "title", ACTIVE_BOOK.Title), + new XElement(dc + "creator", ACTIVE_BOOK.Author), + new XElement(dc + "identifier", + new XAttribute("id", "BookId"), + new XAttribute(opf + "scheme", "UUID"), + "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")), + new XElement(dc + "date", + new XAttribute(opf + "event", "publication"), + ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")), + new XElement(dc + "date", + new XAttribute(opf + "event", "modification"), + DateTime.Now.ToString("yyyy'-'MM'-'dd")), + new XElement(dc + "date", + new XAttribute(opf + "event", "creation"), + DateTime.Now.ToString("yyyy'-'MM'-'dd")), + new XElement(dc + "language", ACTIVE_BOOK.Language), + new XElement(dc + "identifier", + new XAttribute(opf + "scheme", "UUID"), + ACTIVE_BOOK.ID_CAL.ToString("D")), + new XElement(opf + "meta", + new XAttribute("content", "1.0"), + new XAttribute("name", "Wordpress_eBook_scraper_version")), + new XElement(opf + "meta", + new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")), + new XAttribute("name", "Wordpress_eBook_scraper_creation_time"))); + + if (ACTIVE_BOOK.Series != null) + { + meta.Add(new XElement(opf + "meta", + new XAttribute("content", ACTIVE_BOOK.Series), + new XAttribute("name", "calibre:series"))); + meta.Add(new XElement(opf + "meta", + new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)), + new XAttribute("name", "calibre:series_index"))); + } + + package.Add(meta); + + var manifest = new XElement(opf + "manifest"); + for(int i = 0; i < chapters.Count; i++) + { + manifest.Add(new XElement(opf + "item", + new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i+1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)))), + new XAttribute("id", string.Format("x{0:000}_{1}.html", i+1, 
Helper.Filenamify(chapters[i].title, true))), + new XAttribute("media-type", "application/xhtml+xml"))); + } + manifest.Add(new XElement(opf + "item", + new XAttribute("href", "toc.ncx"), + new XAttribute("id", "ncx"), + new XAttribute("media-type", "application/x-dtbncx+xml"))); + + package.Add(manifest); + + var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx")); + for (int i = 0; i < chapters.Count; i++) + { + spine.Add(new XElement(opf + "itemref", + new XAttribute("idref", string.Format("x{0:000}_{1}.html", i+1, Helper.Filenamify(chapters[i].title, true))))); + } + + package.Add(spine); + + package.Add(new XElement(opf + "guide")); + + StringBuilder builder = new StringBuilder(); + using (Utf8StringWriter writer = new Utf8StringWriter()) + { + doc.Save(writer); + return writer.ToString(); + } + } + + string GetEpubTOC(List chapters) + { + XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/"; + XNamespace ncx = "http://www.idpf.org/2007/opf"; + + var doc = new XDocument( + new XDeclaration("1.0", "UTF-8", null), + new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null)); + + var root = new XElement(ncx + "ncx", + new XAttribute("version", "2005-1"), + new XElement(ncx + "head", + new XElement(ncx + "meta", + new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")), + new XAttribute("name", "dtb:uid")), + new XElement(ncx + "meta", + new XAttribute("content", 1), + new XAttribute("name", "dtb:depth")), + new XElement(ncx + "meta", + new XAttribute("content", 0), + new XAttribute("name", "dtb:totalPageCount")), + new XElement(ncx + "meta", + new XAttribute("content", 0), + new XAttribute("name", "dtb:maxPageNumber")))); + + doc.Add(root); + + root.Add(new XElement(ncx + "docTitle", + new XElement(ncx + "text", "Unknown"))); + + var nav = new XElement(ncx + "navMap"); + for (int i = 0; i < chapters.Count; i++) + { + nav.Add(new XElement(ncx + "navPoint", + new 
XAttribute("id", "navPoint-" + (i + 1)), + new XAttribute("playOrder", i + 1), + new XElement(ncx + "navLabel", + new XElement(ncx + "text", chapters[i].title)), + new XElement(ncx + "content", + new XAttribute("src", string.Format("Text/{0:000}_{1}.html", i+1, Helper.Filenamify(chapters[i].title, true)))))); + } + + root.Add(nav); + + StringBuilder builder = new StringBuilder(); + using (Utf8StringWriter writer = new Utf8StringWriter()) + { + doc.Save(writer); + return writer.ToString(); + } + } + + string GetEpubChapterFile(Chapter chapter, int idx) + { + StringBuilder xml = new StringBuilder(); + + xml.AppendLine(@""); + xml.AppendLine(@" "); + xml.AppendLine(@""); + xml.AppendLine(@""); + xml.AppendLine("" + HtmlEntity.Entitize(chapter.title) + ""); + xml.AppendLine(@""); + xml.AppendLine(@""); + xml.AppendLine("

" + HtmlEntity.Entitize(chapter.title) + "

"); + xml.AppendLine(chapter.chapter); + xml.AppendLine(@""); + xml.AppendLine(@""); + + return xml.ToString(); + } + + public struct ProcessOutput + { + public readonly string Command; + public readonly int ExitCode; + public readonly string StdOut; + public readonly string StdErr; + public readonly string StdCombined; + + public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom) + { + Command = cmd; + ExitCode = ex; + StdOut = stdout; + StdErr = stderr; + StdCombined = stdcom; + } + + public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}"; + } + + public static class ProcessHelper + { + public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null) + { + var process = new Process + { + StartInfo = + { + FileName = command, + Arguments = arguments, + WorkingDirectory = workingDirectory ?? string.Empty, + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true, + ErrorDialog = false, + } + }; + + var builderOut = new StringBuilder(); + var builderErr = new StringBuilder(); + var builderBoth = new StringBuilder(); + + process.OutputDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderOut.Length == 0) builderOut.Append(args.Data); + else builderOut.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.ErrorDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderErr.Length == 0) builderErr.Append(args.Data); + else builderErr.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.Start(); + + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + process.WaitForExit(); + + return new ProcessOutput($"{command} 
{arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString()); + } + } + public static class HTMLToText + { + private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled); + private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled); + + private class PreceedingDomTextInfo + { + public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten) + { + IsFirstTextOfDocWritten = isFirstTextOfDocWritten; + } + public bool WritePrecedingWhiteSpace { get; set; } + public bool LastCharWasSpace { get; set; } + public readonly BoolWrapper IsFirstTextOfDocWritten; + public int ListIndex { get; set; } + } + + private class BoolWrapper + { + public BoolWrapper() { } + public bool Value { get; set; } + public static implicit operator bool(BoolWrapper boolWrapper) + { + return boolWrapper.Value; + } + public static implicit operator BoolWrapper(bool boolWrapper) + { + return new BoolWrapper { Value = boolWrapper }; + } + } + + public static string Convert(string path) + { + HtmlDocument doc = new HtmlDocument(); + doc.Load(path); + return ConvertDoc(doc); + } + + public static string ConvertHtml(string html) + { + HtmlDocument doc = new HtmlDocument(); + html = REX_TAG1.Replace(html, " "); + html = REX_TAG2.Replace(html, " "); + doc.LoadHtml(html); + return ConvertDoc(doc); + } + + public static string ConvertDoc(HtmlDocument doc) + { + using (StringWriter sw = new StringWriter()) + { + ConvertTo(doc.DocumentNode, sw); + sw.Flush(); + return sw.ToString(); + } + } + + private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) + { + foreach (HtmlNode subnode in node.ChildNodes) + { + ConvertTo(subnode, outText, textInfo); + } + } + + public static void ConvertTo(HtmlNode node, TextWriter outText) + { + ConvertTo(node, outText, new 
PreceedingDomTextInfo(false)); + } + + private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) + { + string html; + switch (node.NodeType) + { + case HtmlNodeType.Comment: + // don't output comments + break; + case HtmlNodeType.Document: + ConvertContentTo(node, outText, textInfo); + break; + case HtmlNodeType.Text: + // script and style must not be output + string parentName = node.ParentNode.Name; + if ((parentName == "script") || (parentName == "style")) + { + break; + } + // get text + html = ((HtmlTextNode)node).Text; + // is it in fact a special closing node output as text? + if (HtmlNode.IsOverlappedClosingElement(html)) break; + + // check the text is meaningful and not a bunch of whitespaces + if (html.Length == 0) break; + + if (html.Trim().ToLower().StartsWith("")) break; + + if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace) + { + html = html.TrimStart(); + if (html.Length == 0) { break; } + textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true; + } + outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " "))); + if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1])) + { + outText.Write(' '); + } + break; + case HtmlNodeType.Element: + string endElementString = null; + bool isInline; + bool skip = false; + int listIndex = 0; + switch (node.Name) + { + case "nav": + skip = true; + isInline = false; + break; + case "body": + case "section": + case "article": + case "aside": + case "h1": + case "h2": + case "header": + case "footer": + case "address": + case "main": + case "div": + case "span": + case "p": // stylistic - adjust as you tend to use + if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n"); + endElementString = "\r\n"; + isInline = false; + break; + case "br": + outText.Write("\r\n"); + skip = true; + textInfo.WritePrecedingWhiteSpace = false; + isInline = true; + break; + case "a": + isInline = true; + break; 
+ case "li": + isInline = false; + break; + case "ol": + listIndex = 1; + goto case "ul"; + case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems + endElementString = "\r\n"; + isInline = false; + break; + case "img": //inline-block in reality + isInline = true; + break; + default: + isInline = true; + break; + } + if (!skip && node.HasChildNodes) + { + ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex }); + } + if (endElementString != null) + { + outText.Write(endElementString); + } + break; + } + } + } +} diff --git a/Scraper/SerializableCacheEntry.cs b/Scraper/SerializableCacheEntry.cs new file mode 100644 index 0000000..f32409a --- /dev/null +++ b/Scraper/SerializableCacheEntry.cs @@ -0,0 +1,7 @@ +namespace WordpressEboobScraper2.Scraper; + +public class SerializableCacheEntry +{ + public string URL; + public GZippedString Content; +} \ No newline at end of file diff --git a/Scraper/Site.cs b/Scraper/Site.cs new file mode 100644 index 0000000..7aa91e5 --- /dev/null +++ b/Scraper/Site.cs @@ -0,0 +1,12 @@ +namespace WordpressEboobScraper2.Scraper; + +public enum Site +{ + Wordpress, + WuxiaWorld, + Royalroad, + + WP = Wordpress, + WW = WuxiaWorld, + RR = Royalroad, +} \ No newline at end of file diff --git a/Scraper/Utf8StringWriter.cs b/Scraper/Utf8StringWriter.cs new file mode 100644 index 0000000..6b493fa --- /dev/null +++ b/Scraper/Utf8StringWriter.cs @@ -0,0 +1,8 @@ +using System.Text; + +namespace WordpressEboobScraper2.Scraper; + +public class Utf8StringWriter : StringWriter +{ + public override Encoding Encoding { get { return Encoding.UTF8; } } +} \ No newline at end of file diff --git a/WPEbookScraper2.cs b/WPEbookScraper2.cs new file mode 100644 index 0000000..6d4e533 --- /dev/null +++ b/WPEbookScraper2.cs @@ -0,0 +1,15 @@ +using WordpressEboobScraper2.Scraper; + +namespace WordpressEboobScraper2; + 
+public class WPEbookScraper2 +{ + public static void Main() + { + var scraper = new Scraper.Scraper(); + + if (Config.MODE == MainMode.Generate) scraper.Generate(); + if (Config.MODE == MainMode.Verify) scraper.Verify(); + } + +} \ No newline at end of file diff --git a/WordpressEboobScraper2.csproj b/WordpressEboobScraper2.csproj index 2b14c81..a0ba966 100644 --- a/WordpressEboobScraper2.csproj +++ b/WordpressEboobScraper2.csproj @@ -7,4 +7,11 @@ enable + + + + + + +