Quickpush-commit from 2023-11-12 22:25:52
This commit is contained in:
parent
ff68d714ee
commit
4253113c39
37
Config.cs
37
Config.cs
@ -9,7 +9,7 @@ public class Config
|
||||
public const string BASE_DIR_OUT = @"/home/mike/Nextcloud/Dokumente/E-Books/Scraper/";
|
||||
public const string COMPARE_PROG = @"/usr/bin/bcompare";
|
||||
|
||||
//----------------------------------------------------------------------------------------------------//
|
||||
//-------------------------------------------------- FINISHED --------------------------------------------------//
|
||||
|
||||
public static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/");
|
||||
public static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/");
|
||||
@ -28,12 +28,13 @@ public class Config
|
||||
public static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/");
|
||||
public static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/");
|
||||
public static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/");
|
||||
public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");
|
||||
|
||||
public static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/");
|
||||
|
||||
public static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/");
|
||||
|
||||
public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");
|
||||
|
||||
public static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/");
|
||||
public static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/");
|
||||
public static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/");
|
||||
@ -49,26 +50,36 @@ public class Config
|
||||
public static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/");
|
||||
public static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/");
|
||||
public static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/");
|
||||
|
||||
public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");
|
||||
public static readonly EpubParameter TGAB5_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 16, "Bishop, Queen, and Pawn", "D. D. Webb", "2019-11-04", "en", @"https://tiraas.net/2019/11/04/16-1/");
|
||||
|
||||
public static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue");
|
||||
|
||||
public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");
|
||||
|
||||
public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");
|
||||
|
||||
public static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/");
|
||||
|
||||
// The Wandering Inn (pirateaba); the date argument matches the date in the start URL's path (2016/07/27).
public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-07-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");
|
||||
|
||||
public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");
|
||||
|
||||
public static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother");
|
||||
|
||||
public static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");
|
||||
|
||||
public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");
|
||||
public static readonly EpubParameter TGD = new EpubParameter(Site.AO3, "The Great Divide", "Alexander Wales", "2021-03-29", "en", @"https://archiveofourown.org/works/30350478");
|
||||
public static readonly EpubParameter ERINYA = new EpubParameter(Site.AO3, "Eager Readers in Your Area!", "Alexander Wales", "2022-08-18", "en", @"https://archiveofourown.org/works/41112099");
|
||||
|
||||
|
||||
//-------------------------------------------------- ABORTED / BROKEN --------------------------------------------------//
|
||||
|
||||
public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");
|
||||
|
||||
public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");
|
||||
|
||||
public static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");
|
||||
//-------------------------------------------------- WIP --------------------------------------------------//
|
||||
|
||||
public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");
|
||||
|
||||
public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");
|
||||
|
||||
// The Wandering Inn (pirateaba); the date argument matches the date in the start URL's path (2016/07/27).
public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-07-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");
|
||||
|
||||
// The Gods are Bastards, book 17; the date argument matches the date in the start URL's path (2023/07/27)
// instead of repeating the date of book 16 (TGAB5_3).
public static readonly EpubParameter TGAB5_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 17, "A Great Doom", "D. D. Webb", "2023-07-27", "en", @"https://tiraas.net/2023/07/27/17-1/");
|
||||
|
||||
public static readonly EpubParameter ZOS = new EpubParameter(Site.RR, "Zenith of Sorcery", "Domagoj Kurmaic", "2023-07-06", "en", @"https://www.royalroad.com/fiction/71045/zenith-of-sorcery/chapter/1269005/1-homecoming");
|
||||
|
||||
|
@ -354,7 +354,9 @@ public class Scraper
|
||||
|
||||
#region Base
|
||||
|
||||
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
|
||||
HtmlNode nodeContent = null;
|
||||
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeContent = doc.DocumentNode.SelectSingleNode(@"//*[@id = 'workskin']");
|
||||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
|
||||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
|
||||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
|
||||
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
|
||||
@ -367,12 +369,16 @@ public class Scraper
|
||||
|
||||
var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
|
||||
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");
|
||||
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeChapter = nodeContent.SelectSingleNode(@"//*[@id = 'chapters']");
|
||||
|
||||
#endregion
|
||||
|
||||
#region Title
|
||||
|
||||
var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
|
||||
HtmlNode titleNode = null;
|
||||
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h3[contains(@class, 'title')]");
|
||||
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h2[contains(@class, 'title')]");
|
||||
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
|
||||
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
|
||||
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
|
||||
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
|
||||
@ -454,12 +460,6 @@ public class Scraper
|
||||
{
|
||||
prt(" [!!] Warning cannot parse title");
|
||||
}
|
||||
|
||||
if (suffix.Length > 2)
|
||||
{
|
||||
curr.title = baseTitle;
|
||||
titles.Add(baseTitle);
|
||||
}
|
||||
}
|
||||
|
||||
if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
|
||||
@ -479,7 +479,7 @@ public class Scraper
|
||||
return ProcessResult.ReachedEnd; // prevent book II loop
|
||||
}
|
||||
|
||||
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
|
||||
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad) && (ACTIVE_BOOK!=Config.WI);
|
||||
curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
|
||||
curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));
|
||||
|
||||
@ -533,8 +533,9 @@ public class Scraper
|
||||
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
|
||||
.FirstOrDefault();
|
||||
|
||||
if (next == null && ACTIVE_BOOK.Title == "Pale")
|
||||
if (next == null && ACTIVE_BOOK == Config.PALE)
|
||||
{
|
||||
// Some chapters in Pale are missing the anchor tag on the next-chapter element.
|
||||
var nextLS = Helper.RecursiveDescendants(doc.DocumentNode)
|
||||
.Where(p => p.Name.ToLower() == "a")
|
||||
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
|
||||
@ -543,7 +544,7 @@ public class Scraper
|
||||
if (nextLS.Count == 1) next = nextLS.Single().FirstOrDefault();
|
||||
}
|
||||
|
||||
if (next != null && next.Attributes["href"].Value.Trim() == "(https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
|
||||
if (next != null && next.Attributes["href"].Value.Trim() == "https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
|
||||
|
||||
if (next != null)
|
||||
{
|
||||
@ -623,6 +624,27 @@ public class Scraper
|
||||
|
||||
#endregion
|
||||
|
||||
#region AO3 Stuff
|
||||
|
||||
var ao3workNodes = nodeChapter.SelectNodes(@"//*[@id = 'work']");
|
||||
if (ao3workNodes != null)
|
||||
{
|
||||
foreach (var node in ao3workNodes)
|
||||
{
|
||||
if (nodeChapter.ChildNodes.Contains(node))
|
||||
{
|
||||
nodeChapter.RemoveChild(node);
|
||||
prt(" > ao3 work-div removed");
|
||||
}
|
||||
else
|
||||
{
|
||||
prt(" > ao3 work-div cannot be removed - skipping");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Share Div
|
||||
|
||||
var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]");
|
||||
|
@ -5,8 +5,10 @@ public enum Site
|
||||
Wordpress,
|
||||
WuxiaWorld,
|
||||
Royalroad,
|
||||
ArchiveOfOurOwn,
|
||||
|
||||
WP = Wordpress,
|
||||
WW = WuxiaWorld,
|
||||
RR = Royalroad,
|
||||
WP = Wordpress,
|
||||
WW = WuxiaWorld,
|
||||
RR = Royalroad,
|
||||
AO3 = ArchiveOfOurOwn,
|
||||
}
|
Loading…
Reference in New Issue
Block a user