From 4cc76a45ef4bc9be2f713eb49f9ca88cd59d5fb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20Schw=C3=B6rer?=
Date: Sun, 20 Aug 2023 16:44:58 +0200
Subject: [PATCH] Fix bug when reading royalroad with extra content

see e.g. https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/761960/127-theomachia

---
 Scraper/Helper.cs  |  8 ++++++++
 Scraper/Scraper.cs | 26 +++++++++++++++++++-------
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/Scraper/Helper.cs b/Scraper/Helper.cs
index ad46fa5..8528275 100644
--- a/Scraper/Helper.cs
+++ b/Scraper/Helper.cs
@@ -99,4 +99,12 @@ public class Helper
 
         return string.Format("{0}/{1}", uri1, uri2);
     }
+
+    public static IEnumerable<HtmlNode> RecursiveDescendants(HtmlNode n)
+    {
+        foreach (var d1 in n.Descendants())
+        {
+            yield return d1;
+            foreach (var d2 in RecursiveDescendants(d1)) yield return d2;
+        }
+    }
 }
\ No newline at end of file
diff --git a/Scraper/Scraper.cs b/Scraper/Scraper.cs
index c8bac22..9ce0fda 100644
--- a/Scraper/Scraper.cs
+++ b/Scraper/Scraper.cs
@@ -362,7 +362,7 @@ class Scraper
         var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
         if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
         if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
-        if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]");
+        if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
         if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");
 
         var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
@@ -513,6 +513,7 @@ class Scraper
         }
 
         var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");
+
         if (next == null)
             next = nodeContent.Descendants()
                 .Where(p => p.Name.ToLower() == "a")
@@ -520,14 +521,25 @@ class Scraper
                 .Where(p => p.Attributes.Contains("href"))
                 .FirstOrDefault();
 
-        var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a");
-
         if (next == null)
             next = nodeNav.Descendants()
-                .Where(p => p.Name.ToLower() == "a")
-                .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
-                .FirstOrDefault();
-
+                .Where(p => p.Name.ToLower() == "a")
+                .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
+                .FirstOrDefault();
+
+        if (next == null)
+            next = Helper.RecursiveDescendants(nodeContent)
+                .Where(p => p.Name.ToLower() == "a")
+                .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
+                .Where(p => p.Attributes.Contains("href"))
+                .FirstOrDefault();
+
+        if (next == null)
+            next = Helper.RecursiveDescendants(nodeContent)
+                .Where(p => p.Name.ToLower() == "a")
+                .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
+                .FirstOrDefault();
+
        if (next != null)
        {
            var next_url = next.Attributes["href"].Value.Trim();
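
Below is a minimal, self-contained sketch (not part of the patch) of the "next chapter" fallback chain this change introduces. It assumes HtmlAgilityPack's HtmlDocument/HtmlNode API, which the patched code already uses; the Striptease helper here is a simplified stand-in for the repository's own text-normalisation routine, and FindNextUrl is a hypothetical name used only for illustration.

using System;
using System.Linq;
using HtmlAgilityPack;

class NextLinkSketch
{
    // Simplified stand-in for Helper.Striptease: trimmed, lower-cased link text.
    static string Striptease(HtmlNode n) => n.InnerText.Trim().ToLowerInvariant();

    // Hypothetical helper mirroring the patch's fallback order:
    // first an anchor whose text reads "next chapter"/"next", then rel="next".
    static string FindNextUrl(HtmlNode nodeContent)
    {
        var next = nodeContent.Descendants("a")
            .Where(a => a.Attributes.Contains("href"))
            .FirstOrDefault(a => Striptease(a) == "next chapter" || Striptease(a) == "next");

        if (next == null)
            next = nodeContent.Descendants("a")
                .FirstOrDefault(a => a.GetAttributeValue("rel", "") == "next");

        return next?.GetAttributeValue("href", null)?.Trim();
    }

    static void Main()
    {
        // Example chapter fragment with the extra wrapper markup the fix targets.
        var doc = new HtmlDocument();
        doc.LoadHtml("<div class='chapter'><div class='portlet-body'>" +
                     "<a href='/ch/128' rel='next'>Next Chapter</a></div></div>");
        Console.WriteLine(FindNextUrl(doc.DocumentNode) ?? "(no next link found)");
    }
}

The sketch keeps the same ordering as the patched Scraper.cs: textual matches on the link are preferred, and rel="next" is only consulted when no labelled link is found.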