From 4253113c39725bb40bbfe95656aa3b70579f3f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Schw=C3=B6rer?= Date: Sun, 12 Nov 2023 22:25:52 +0100 Subject: [PATCH] Quickpush-commit from 2023-11-12 22:25:52 --- Config.cs | 43 +++++++++++++++++++++++++++---------------- Scraper/Scraper.cs | 44 +++++++++++++++++++++++++++++++++----------- Scraper/Site.cs | 8 +++++--- 3 files changed, 65 insertions(+), 30 deletions(-) diff --git a/Config.cs b/Config.cs index e7ac703..da065c8 100644 --- a/Config.cs +++ b/Config.cs @@ -9,14 +9,14 @@ public class Config public const string BASE_DIR_OUT = @"/home/mike/Nextcloud/Dokumente/E-Books/Scraper/"; public const string COMPARE_PROG = @"/usr/bin/bcompare"; - //----------------------------------------------------------------------------------------------------// + //-------------------------------------------------- FINISHED --------------------------------------------------// public static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/"); public static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/"); public static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/"); public static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/"); public static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/"); - + public static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", 
"2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/"); public static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/"); public static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/"); @@ -28,12 +28,13 @@ public class Config public static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/"); public static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/"); public static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/"); - public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/"); public static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/"); public static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/"); + public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", 
@"https://www.wuxiaworld.co/Release-that-Witch/1235444.html"); + public static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/"); public static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/"); public static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/"); @@ -49,29 +50,39 @@ public class Config public static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/"); public static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/"); public static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/"); - - public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/"); + public static readonly EpubParameter TGAB5_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 16, "Bishop, Queen, and Pawn", "D. D. 
Webb", "2019-11-04", "en", @"https://tiraas.net/2019/11/04/16-1/"); public static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue"); - public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends"); - - public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall"); - public static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/"); - public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/"); - - public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html"); - public static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother"); - public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again"); - public static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. 
Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave"); - public static readonly EpubParameter ZOS = new EpubParameter(Site.RR, "Zenith of Sorcery", "Domagoj Kurmaic", "2023-07-06", "en", @"https://www.royalroad.com/fiction/71045/zenith-of-sorcery/chapter/1269005/1-homecoming"); + public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall"); + public static readonly EpubParameter TGD = new EpubParameter(Site.AO3, "The Great Divide", "Alexander Wales", "2021-03-29", "en", @"https://archiveofourown.org/works/30350478"); + public static readonly EpubParameter ERINYA = new EpubParameter(Site.AO3, "Eager Readers in Your Area!", "Alexander Wales", "2022-08-18", "en", @"https://archiveofourown.org/works/41112099"); + + //-------------------------------------------------- ABORTED / BROKEN --------------------------------------------------// + + public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/"); + + public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again"); + + //-------------------------------------------------- WIP --------------------------------------------------// + + public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. 
Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/"); + + public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends"); + + public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/"); + + public static readonly EpubParameter TGAB5_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 17, "A Great Doom", "D. D. Webb", "2019-11-04", "en", @"https://tiraas.net/2023/07/27/17-1/"); + + public static readonly EpubParameter ZOS = new EpubParameter(Site.RR, "Zenith of Sorcery", "Domagoj Kurmaic", "2023-07-06", "en", @"https://www.royalroad.com/fiction/71045/zenith-of-sorcery/chapter/1269005/1-homecoming"); + //----------------------------------------------------------------------------------------------------// public static readonly EpubParameter[] BOOKS = new[] { ZOS }; diff --git a/Scraper/Scraper.cs b/Scraper/Scraper.cs index 02f6955..86dbf3f 100644 --- a/Scraper/Scraper.cs +++ b/Scraper/Scraper.cs @@ -354,7 +354,9 @@ public class Scraper #region Base - var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]"); + HtmlNode nodeContent = null; + if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeContent = doc.DocumentNode.SelectSingleNode(@"//*[@id = 'workskin']"); + if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and 
contains(@class ,'post')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]"); @@ -367,12 +369,16 @@ public class Scraper var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]"); if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]"); + if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeChapter = nodeContent.SelectSingleNode(@"//*[@id = 'chapters']"); #endregion #region Title - var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); + HtmlNode titleNode = null; + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h3[contains(@class, 'title')]"); + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h2[contains(@class, 'title')]"); + if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]"); if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1"); if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong"); @@ -454,12 +460,6 @@ public class Scraper { prt(" [!!] 
Warning cannot parse title"); } - - if (suffix.Length > 2) - { - curr.title = baseTitle; - titles.Add(baseTitle); - } } if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) { @@ -479,7 +479,7 @@ public class Scraper return ProcessResult.ReachedEnd; // prevent book II loop } - curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad); + curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad) && (ACTIVE_BOOK!=Config.WI); curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"))); curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus"))); @@ -533,8 +533,9 @@ public class Scraper .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) .FirstOrDefault(); - if (next == null && ACTIVE_BOOK.Title == "Pale") + if (next == null && ACTIVE_BOOK == Config.PALE) { + // some chapters in Pale miss the anchor tags on the next-chapter elem -.- var nextLS = Helper.RecursiveDescendants(doc.DocumentNode) .Where(p => p.Name.ToLower() == "a") .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) @@ -543,7 +544,7 @@ public class Scraper if (nextLS.Count == 1) next = nextLS.Single().FirstOrDefault(); } - if (next != null && next.Attributes["href"].Value.Trim() == "(https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale + if (next != null && next.Attributes["href"].Value.Trim() == "https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale if (next != null) { @@ -623,6 +624,27 @@ public class Scraper #endregion + #region AO3 Stuff + + var ao3workNodes = nodeChapter.SelectNodes(@"//*[@id = 'work']"); + if (ao3workNodes != null) + { + foreach (var node in ao3workNodes) + { + if 
(nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ao3 work-div removed"); + } + else + { + prt(" > ao3 work-div cannot be removed - skipping"); + } + } + } + + #endregion + #region Share Div var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"); diff --git a/Scraper/Site.cs b/Scraper/Site.cs index 7aa91e5..403f083 100644 --- a/Scraper/Site.cs +++ b/Scraper/Site.cs @@ -5,8 +5,10 @@ public enum Site Wordpress, WuxiaWorld, Royalroad, + ArchiveOfOurOwn, - WP = Wordpress, - WW = WuxiaWorld, - RR = Royalroad, + WP = Wordpress, + WW = WuxiaWorld, + RR = Royalroad, + AO3 = ArchiveOfOurOwn, } \ No newline at end of file