1
0

Fix bug when reading royalroad with extra content

see e.g. https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/761960/127-theomachia
This commit is contained in:
Mike Schwörer 2023-08-20 16:44:58 +02:00
parent 9692dc531f
commit 4cc76a45ef
Signed by: Mikescher
GPG Key ID: D3C7172E0A70F8CF
2 changed files with 27 additions and 7 deletions

View File

@ -99,4 +99,12 @@ public class Helper
return string.Format("{0}/{1}", uri1, uri2); return string.Format("{0}/{1}", uri1, uri2);
} }
/// <summary>
/// Enumerates every node below <paramref name="n"/> in document (pre-order) order,
/// yielding each node exactly once.
/// </summary>
/// <param name="n">Root node whose subtree is walked; the root itself is not yielded.</param>
/// <returns>A lazily evaluated sequence of all descendant nodes of <paramref name="n"/>.</returns>
public static IEnumerable<HtmlNode> RecursiveDescendants(HtmlNode n)
{
    // Walk direct children only and recurse from there. HtmlNode.Descendants()
    // already returns the whole subtree, so recursing over it yielded every node
    // once per ancestor chain (2^(depth-1) times), which explodes on deeply
    // nested pages. The order of first occurrences is unchanged, so callers
    // that take the first match see the same node as before.
    foreach (var child in n.ChildNodes)
    {
        yield return child;
        foreach (var descendant in RecursiveDescendants(child))
        {
            yield return descendant;
        }
    }
}
} }

View File

@ -362,7 +362,7 @@ class Scraper
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]"); var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]"); if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]"); if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");
var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']"); var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
@ -513,6 +513,7 @@ class Scraper
} }
var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']"); var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");
if (next == null) if (next == null)
next = nodeContent.Descendants() next = nodeContent.Descendants()
.Where(p => p.Name.ToLower() == "a") .Where(p => p.Name.ToLower() == "a")
@ -520,14 +521,25 @@ class Scraper
.Where(p => p.Attributes.Contains("href")) .Where(p => p.Attributes.Contains("href"))
.FirstOrDefault(); .FirstOrDefault();
var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a");
if (next == null) if (next == null)
next = nodeNav.Descendants() next = nodeNav.Descendants()
.Where(p => p.Name.ToLower() == "a") .Where(p => p.Name.ToLower() == "a")
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
.FirstOrDefault(); .FirstOrDefault();
if (next == null)
next = Helper.RecursiveDescendants(nodeContent)
.Where(p => p.Name.ToLower() == "a")
.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
.Where(p => p.Attributes.Contains("href"))
.FirstOrDefault();
if (next == null)
next = Helper.RecursiveDescendants(nodeContent)
.Where(p => p.Name.ToLower() == "a")
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
.FirstOrDefault();
if (next != null) if (next != null)
{ {
var next_url = next.Attributes["href"].Value.Trim(); var next_url = next.Attributes["href"].Value.Trim();