1
0
Fork 0

Quickpush-commit from 2023-11-12 22:25:52

This commit is contained in:
Mike Schwörer 2023-11-12 22:25:52 +01:00
parent ff68d714ee
commit 4253113c39
Signed by: Mikescher
GPG Key ID: D3C7172E0A70F8CF
3 changed files with 65 additions and 30 deletions

View File

@ -9,14 +9,14 @@ public class Config
public const string BASE_DIR_OUT = @"/home/mike/Nextcloud/Dokumente/E-Books/Scraper/";
public const string COMPARE_PROG = @"/usr/bin/bcompare";
//----------------------------------------------------------------------------------------------------//
//-------------------------------------------------- FINISHED --------------------------------------------------//
public static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/");
public static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/");
public static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/");
public static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/");
public static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/");
public static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/");
public static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/");
public static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/");
@ -28,12 +28,13 @@ public class Config
public static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/");
public static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/");
public static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/");
public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");
public static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/");
public static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/");
public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");
public static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/");
public static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/");
public static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/");
@ -49,29 +50,39 @@ public class Config
public static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/");
public static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/");
public static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/");
public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");
public static readonly EpubParameter TGAB5_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 16, "Bishop, Queen, and Pawn", "D. D. Webb", "2019-11-04", "en", @"https://tiraas.net/2019/11/04/16-1/");
public static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue");
public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");
public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");
public static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/");
public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");
public static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");
public static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother");
public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");
public static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");
public static readonly EpubParameter ZOS = new EpubParameter(Site.RR, "Zenith of Sorcery", "Domagoj Kurmaic", "2023-07-06", "en", @"https://www.royalroad.com/fiction/71045/zenith-of-sorcery/chapter/1269005/1-homecoming");
public static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");
public static readonly EpubParameter TGD = new EpubParameter(Site.AO3, "The Great Divide", "Alexander Wales", "2021-03-29", "en", @"https://archiveofourown.org/works/30350478");
public static readonly EpubParameter ERINYA = new EpubParameter(Site.AO3, "Eager Readers in Your Area!", "Alexander Wales", "2022-08-18", "en", @"https://archiveofourown.org/works/41112099");
//-------------------------------------------------- ABORTED / BROKEN --------------------------------------------------//
public static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");
public static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");
//-------------------------------------------------- WIP --------------------------------------------------//
public static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");
public static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");
public static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");
// Book 17 of "The Gods are Bastards". The date matches the post date in the start URL, consistent with the other TGAB volumes
// (the previous value "2019-11-04" was identical to TGAB5_3 and looked like a copy-paste leftover).
public static readonly EpubParameter TGAB5_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 17, "A Great Doom", "D. D. Webb", "2023-07-27", "en", @"https://tiraas.net/2023/07/27/17-1/");
public static readonly EpubParameter ZOS = new EpubParameter(Site.RR, "Zenith of Sorcery", "Domagoj Kurmaic", "2023-07-06", "en", @"https://www.royalroad.com/fiction/71045/zenith-of-sorcery/chapter/1269005/1-homecoming");
//----------------------------------------------------------------------------------------------------//
public static readonly EpubParameter[] BOOKS = new[] { ZOS };

View File

@ -354,7 +354,9 @@ public class Scraper
#region Base
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
HtmlNode nodeContent = null;
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeContent = doc.DocumentNode.SelectSingleNode(@"//*[@id = 'workskin']");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
@ -367,12 +369,16 @@ public class Scraper
var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeChapter = nodeContent.SelectSingleNode(@"//*[@id = 'chapters']");
#endregion
#region Title
var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
HtmlNode titleNode = null;
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h3[contains(@class, 'title')]");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h2[contains(@class, 'title')]");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
@ -454,12 +460,6 @@ public class Scraper
{
prt(" [!!] Warning cannot parse title");
}
if (suffix.Length > 2)
{
curr.title = baseTitle;
titles.Add(baseTitle);
}
}
if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
@ -479,7 +479,7 @@ public class Scraper
return ProcessResult.ReachedEnd; // prevent book II loop
}
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad) && (ACTIVE_BOOK!=Config.WI);
curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));
@ -533,8 +533,9 @@ public class Scraper
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
.FirstOrDefault();
if (next == null && ACTIVE_BOOK.Title == "Pale")
if (next == null && ACTIVE_BOOK == Config.PALE)
{
// Some chapters in Pale are missing the anchor tag on the next-chapter element, so fall back to a looser lookup.
var nextLS = Helper.RecursiveDescendants(doc.DocumentNode)
.Where(p => p.Name.ToLower() == "a")
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
@ -543,7 +544,7 @@ public class Scraper
if (nextLS.Count == 1) next = nextLS.Single().FirstOrDefault();
}
if (next != null && next.Attributes["href"].Value.Trim() == "(https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
if (next != null && next.Attributes["href"].Value.Trim() == "https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
if (next != null)
{
@ -623,6 +624,27 @@ public class Scraper
#endregion
#region AO3 Stuff
var ao3workNodes = nodeChapter.SelectNodes(@"//*[@id = 'work']");
if (ao3workNodes != null)
{
foreach (var node in ao3workNodes)
{
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > ao3 work-div removed");
}
else
{
prt(" > ao3 work-div cannot be removed - skipping");
}
}
}
#endregion
#region Share Div
var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]");

View File

@ -5,8 +5,10 @@ public enum Site
Wordpress,
WuxiaWorld,
Royalroad,
ArchiveOfOurOwn,
WP = Wordpress,
WW = WuxiaWorld,
RR = Royalroad,
WP = Wordpress,
WW = WuxiaWorld,
RR = Royalroad,
AO3 = ArchiveOfOurOwn,
}