diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml
index 288b36b..94a25f7 100644
--- a/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml
+++ b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml
@@ -1,7 +1,6 @@
-
\ No newline at end of file
diff --git a/Proc/HTMLToText.cs b/Proc/HTMLToText.cs
new file mode 100644
index 0000000..9d471d9
--- /dev/null
+++ b/Proc/HTMLToText.cs
@@ -0,0 +1,181 @@
+using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+
+namespace WordpressEboobScraper2.Proc;
+
+public static class HTMLToText
+ {
+ private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
+ private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
+
+ private class PreceedingDomTextInfo
+ {
+ public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
+ {
+ IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
+ }
+ public bool WritePrecedingWhiteSpace { get; set; }
+ public bool LastCharWasSpace { get; set; }
+ public readonly BoolWrapper IsFirstTextOfDocWritten;
+ public int ListIndex { get; set; }
+ }
+
+ private class BoolWrapper
+ {
+ public BoolWrapper() { }
+ public bool Value { get; set; }
+ public static implicit operator bool(BoolWrapper boolWrapper)
+ {
+ return boolWrapper.Value;
+ }
+ public static implicit operator BoolWrapper(bool boolWrapper)
+ {
+ return new BoolWrapper { Value = boolWrapper };
+ }
+ }
+
+ public static string Convert(string path)
+ {
+ HtmlDocument doc = new HtmlDocument();
+ doc.Load(path);
+ return ConvertDoc(doc);
+ }
+
+ public static string ConvertHtml(string html)
+ {
+ HtmlDocument doc = new HtmlDocument();
+ html = REX_TAG1.Replace(html, " ");
+ html = REX_TAG2.Replace(html, " ");
+ doc.LoadHtml(html);
+ return ConvertDoc(doc);
+ }
+
+ public static string ConvertDoc(HtmlDocument doc)
+ {
+ using (StringWriter sw = new StringWriter())
+ {
+ ConvertTo(doc.DocumentNode, sw);
+ sw.Flush();
+ return sw.ToString();
+ }
+ }
+
+ private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
+ {
+ foreach (HtmlNode subnode in node.ChildNodes)
+ {
+ ConvertTo(subnode, outText, textInfo);
+ }
+ }
+
+ public static void ConvertTo(HtmlNode node, TextWriter outText)
+ {
+ ConvertTo(node, outText, new PreceedingDomTextInfo(false));
+ }
+
+ private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
+ {
+ string html;
+ switch (node.NodeType)
+ {
+ case HtmlNodeType.Comment:
+ // don't output comments
+ break;
+ case HtmlNodeType.Document:
+ ConvertContentTo(node, outText, textInfo);
+ break;
+ case HtmlNodeType.Text:
+ // script and style must not be output
+ string parentName = node.ParentNode.Name;
+ if ((parentName == "script") || (parentName == "style"))
+ {
+ break;
+ }
+ // get text
+ html = ((HtmlTextNode)node).Text;
+ // is it in fact a special closing node output as text?
+ if (HtmlNode.IsOverlappedClosingElement(html)) break;
+
+ // check the text is meaningful and not a bunch of whitespaces
+ if (html.Length == 0) break;
+
+ if (html.Trim().ToLower().StartsWith("<!doctype")) break;
+
+ if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
+ {
+ html = html.TrimStart();
+ if (html.Length == 0) { break; }
+ textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
+ }
+ outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
+ if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
+ {
+ outText.Write(' ');
+ }
+ break;
+ case HtmlNodeType.Element:
+ string endElementString = null;
+ bool isInline;
+ bool skip = false;
+ int listIndex = 0;
+ switch (node.Name)
+ {
+ case "nav":
+ skip = true;
+ isInline = false;
+ break;
+ case "body":
+ case "section":
+ case "article":
+ case "aside":
+ case "h1":
+ case "h2":
+ case "header":
+ case "footer":
+ case "address":
+ case "main":
+ case "div":
+ case "span":
+ case "p": // stylistic - adjust as you tend to use
+ if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
+ endElementString = "\r\n";
+ isInline = false;
+ break;
+ case "br":
+ outText.Write("\r\n");
+ skip = true;
+ textInfo.WritePrecedingWhiteSpace = false;
+ isInline = true;
+ break;
+ case "a":
+ isInline = true;
+ break;
+ case "li":
+ isInline = false;
+ break;
+ case "ol":
+ listIndex = 1;
+ goto case "ul";
+ case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
+ endElementString = "\r\n";
+ isInline = false;
+ break;
+ case "img": //inline-block in reality
+ isInline = true;
+ break;
+ default:
+ isInline = true;
+ break;
+ }
+ if (!skip && node.HasChildNodes)
+ {
+ ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
+ }
+ if (endElementString != null)
+ {
+ outText.Write(endElementString);
+ }
+ break;
+ }
+ }
+ }
\ No newline at end of file
diff --git a/Proc/ProcessHelper.cs b/Proc/ProcessHelper.cs
new file mode 100644
index 0000000..08730d6
--- /dev/null
+++ b/Proc/ProcessHelper.cs
@@ -0,0 +1,59 @@
+using System.Text;
+
+namespace WordpressEboobScraper2.Proc;
+
+public static class ProcessHelper
+{
+ public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
+ {
+ var process = new System.Diagnostics.Process
+ {
+ StartInfo =
+ {
+ FileName = command,
+ Arguments = arguments,
+ WorkingDirectory = workingDirectory ?? string.Empty,
+ UseShellExecute = false,
+ RedirectStandardOutput = true,
+ RedirectStandardError = true,
+ CreateNoWindow = true,
+ ErrorDialog = false,
+ }
+ };
+
+ var builderOut = new StringBuilder();
+ var builderErr = new StringBuilder();
+ var builderBoth = new StringBuilder();
+
+ process.OutputDataReceived += (sender, args) =>
+ {
+ if (args.Data == null) return;
+
+ if (builderOut.Length == 0) builderOut.Append(args.Data);
+ else builderOut.Append("\n" + args.Data);
+
+ if (builderBoth.Length == 0) builderBoth.Append(args.Data);
+ else builderBoth.Append("\n" + args.Data);
+ };
+
+ process.ErrorDataReceived += (sender, args) =>
+ {
+ if (args.Data == null) return;
+
+ if (builderErr.Length == 0) builderErr.Append(args.Data);
+ else builderErr.Append("\n" + args.Data);
+
+ if (builderBoth.Length == 0) builderBoth.Append(args.Data);
+ else builderBoth.Append("\n" + args.Data);
+ };
+
+ process.Start();
+
+ process.BeginOutputReadLine();
+ process.BeginErrorReadLine();
+
+ process.WaitForExit();
+
+ return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
+ }
+}
\ No newline at end of file
diff --git a/Proc/ProcessOutput.cs b/Proc/ProcessOutput.cs
new file mode 100644
index 0000000..7813e58
--- /dev/null
+++ b/Proc/ProcessOutput.cs
@@ -0,0 +1,21 @@
+namespace WordpressEboobScraper2.Proc;
+
+public struct ProcessOutput
+{
+ public readonly string Command;
+ public readonly int ExitCode;
+ public readonly string StdOut;
+ public readonly string StdErr;
+ public readonly string StdCombined;
+
+ public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
+ {
+ Command = cmd;
+ ExitCode = ex;
+ StdOut = stdout;
+ StdErr = stderr;
+ StdCombined = stdcom;
+ }
+
+ public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
+}
\ No newline at end of file
diff --git a/Scraper/Helper.cs b/Scraper/Helper.cs
index 8528275..7c80bc9 100644
--- a/Scraper/Helper.cs
+++ b/Scraper/Helper.cs
@@ -2,9 +2,8 @@ using HtmlAgilityPack;
namespace WordpressEboobScraper2.Scraper;
-public class Helper
+public static class Helper
{
-
public static string Filenamify(string v, bool repl = false)
{
var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p =>
diff --git a/Scraper/Scraper.cs b/Scraper/Scraper.cs
index 9ce0fda..4da91d5 100644
--- a/Scraper/Scraper.cs
+++ b/Scraper/Scraper.cs
@@ -6,6 +6,7 @@ using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;
+using WordpressEboobScraper2.Proc;
namespace WordpressEboobScraper2.Scraper;
@@ -15,16 +16,16 @@ namespace WordpressEboobScraper2.Scraper;
/** **/
/** *************************************************** **/
-class Scraper
+public class Scraper
{
static EpubParameter ACTIVE_BOOK = null;
const int LIMIT = 1500;
- readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
+ readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
- Dictionary<string, string> webCache = new Dictionary<string, string>();
+ Dictionary<string, string> webCache = new();
string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;
@@ -144,7 +145,7 @@ class Scraper
void SaveCache()
{
 var xs = new XmlSerializer(typeof(List<SerializableCacheEntry>));
- using (var writer = new System.IO.StreamWriter(WCACHE_FILE))
+ using (var writer = new StreamWriter(WCACHE_FILE))
{
xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList());
}
@@ -155,77 +156,74 @@ class Scraper
if (!File.Exists(WCACHE_FILE)) return;
 XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
- using (TextReader reader = new StreamReader(WCACHE_FILE))
- {
- var result = new List();
+
+ using TextReader reader = new StreamReader(WCACHE_FILE);
+
+ var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
- var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
-
- webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
- }
+ webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
}
 List<Chapter> FindChapters()
{
 List<Chapter> result = new List<Chapter>();
- using (WebClient client = new WebClient())
+ using WebClient client = new WebClient();
+
+ client.Encoding = Encoding.UTF8;
+ Stack<string> buffer = new Stack<string>();
+ buffer.Push(ACTIVE_BOOK.StartURL);
+
+ while (buffer.Any() && result.Count < LIMIT)
{
- client.Encoding = Encoding.UTF8;
- Stack<string> buffer = new Stack<string>();
- buffer.Push(ACTIVE_BOOK.StartURL);
+ var url = buffer.Pop();
+ Chapter curr = new Chapter() { url = url };
- while (buffer.Any() && result.Count < LIMIT)
+ var buffered = webCache.ContainsKey(url.ToLower());
+ if (buffered)
{
- var url = buffer.Pop();
- Chapter curr = new Chapter() { url = url };
-
- var buffered = webCache.ContainsKey(url.ToLower());
- if (buffered)
- {
- curr.queryResult = webCache[url.ToLower()];
- "*(loaded from webcache)*".Dump();
- }
- else
- {
- curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
- webCache[url.ToLower()] = curr.queryResult;
- SaveCache();
- }
-
- var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
- if (next_url != null) buffer.Push(next_url);
-
- if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
- {
- "".Dump();
- "//==> *(auto-reload from live)*".Dump();
- "".Dump();
- curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
- webCache[url.ToLower()] = curr.queryResult;
- SaveCache();
-
- r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
- if (next_url_inner != null) buffer.Push(next_url_inner);
- }
- if (r == ProcessResult.SuccessNormal)
- {
- " ==> Chapter processed".Dump();
- result.Add(curr);
- OutputChapter(curr, result.Count);
- }
- else if (r == ProcessResult.SkipChapter)
- {
- " ==> Skip this chapter".Dump();
- }
- else if (r == ProcessResult.ReachedEnd)
- {
- " ==> End reached".Dump();
- }
-
-
- "".Dump();
+ curr.queryResult = webCache[url.ToLower()];
+ "*(loaded from webcache)*".Dump();
}
+ else
+ {
+ curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
+ webCache[url.ToLower()] = curr.queryResult;
+ SaveCache();
+ }
+
+ var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
+ if (next_url != null) buffer.Push(next_url);
+
+ if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
+ {
+ "".Dump();
+ "//==> *(auto-reload from live)*".Dump();
+ "".Dump();
+ curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
+ webCache[url.ToLower()] = curr.queryResult;
+ SaveCache();
+
+ r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
+ if (next_url_inner != null) buffer.Push(next_url_inner);
+ }
+ if (r == ProcessResult.SuccessNormal)
+ {
+ " ==> Chapter processed".Dump();
+ result.Add(curr);
+ OutputChapter(curr, result.Count);
+ }
+ else if (r == ProcessResult.SkipChapter)
+ {
+ " ==> Skip this chapter".Dump();
+ }
+ else if (r == ProcessResult.ReachedEnd)
+ {
+ " ==> End reached".Dump();
+ }
+
+
+ "".Dump();
}
return result;
@@ -233,96 +231,93 @@ class Scraper
void VerifyChapters()
{
- List<Chapter> result = new List<Chapter>();
+ using WebClient client = new WebClient();
+
+ client.Encoding = Encoding.UTF8;
+ Stack<string> buffer = new Stack<string>();
+ buffer.Push(ACTIVE_BOOK.StartURL);
- using (WebClient client = new WebClient())
+ while (buffer.Any())
{
- client.Encoding = Encoding.UTF8;
- Stack<string> buffer = new Stack<string>();
- buffer.Push(ACTIVE_BOOK.StartURL);
+ var url = buffer.Pop();
+ Chapter curr_buffer = new Chapter() { url = url };
+ Chapter curr_live = new Chapter() { url = url };
- while (buffer.Any() && result.Count < LIMIT)
+ var buffered = webCache.ContainsKey(url.ToLower());
+ if (buffered)
{
- var url = buffer.Pop();
- Chapter curr_buffer = new Chapter() { url = url };
- Chapter curr_live = new Chapter() { url = url };
-
- var buffered = webCache.ContainsKey(url.ToLower());
- if (buffered)
+ try
{
- try
- {
- curr_buffer.queryResult = webCache[url.ToLower()];
- curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
- }
- catch (Exception e)
- {
- $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
- continue;
- }
+ curr_buffer.queryResult = webCache[url.ToLower()];
+ curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
}
- else
+ catch (Exception e)
{
+ $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
continue;
}
-
- var is_diff = false;
-
- var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
- var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);
-
- if (next_buffer != null) buffer.Push(next_buffer);
-
- if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
- if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }
-
- if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
- if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }
-
- if (curr_buffer.chapter.Value != curr_live.chapter.Value)
- {
- var clean_buffer = GetChapterText(curr_buffer);
- var clean_live = GetChapterText(curr_live);
-
- if (clean_buffer.Trim() != clean_live.Trim())
- {
- $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
- new Hyperlinq(() =>
- {
-
- var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
- var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
- File.WriteAllText(fa, curr_buffer.chapter.Value);
- File.WriteAllText(fb, curr_live.chapter.Value);
- Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
-
- }, "[Compare Raw]").Dump();
- new Hyperlinq(() =>
- {
-
- var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
- var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
- File.WriteAllText(fa, clean_buffer);
- File.WriteAllText(fb, clean_live);
- Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
-
- }, "[Compare Text]").Dump();
- new Hyperlinq(() =>
- {
-
- webCache[url.ToLower()] = curr_live.queryResult;
- SaveCache();
-
- }, "[Save new version to webcache]").Dump();
-
- is_diff = true;
- }
- }
-
- if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
-
- if (is_diff) "".Dump();
}
+ else
+ {
+ continue;
+ }
+
+ var is_diff = false;
+
+ var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
+ var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);
+
+ if (next_buffer != null) buffer.Push(next_buffer);
+
+ if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
+ if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }
+
+ if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
+ if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }
+
+ if (curr_buffer.chapter.Value != curr_live.chapter.Value)
+ {
+ var clean_buffer = GetChapterText(curr_buffer);
+ var clean_live = GetChapterText(curr_live);
+
+ if (clean_buffer.Trim() != clean_live.Trim())
+ {
+ $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
+ new Hyperlinq(() =>
+ {
+
+ var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
+ var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
+ File.WriteAllText(fa, curr_buffer.chapter.Value);
+ File.WriteAllText(fb, curr_live.chapter.Value);
+ Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
+
+ }, "[Compare Raw]").Dump();
+ new Hyperlinq(() =>
+ {
+
+ var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
+ var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
+ File.WriteAllText(fa, clean_buffer);
+ File.WriteAllText(fb, clean_live);
+ Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
+
+ }, "[Compare Text]").Dump();
+ new Hyperlinq(() =>
+ {
+
+ webCache[url.ToLower()] = curr_live.queryResult;
+ SaveCache();
+
+ }, "[Save new version to webcache]").Dump();
+
+ is_diff = true;
+ }
+ }
+
+ if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
+
+ if (is_diff) "".Dump();
}
}
@@ -350,9 +345,9 @@ class Scraper
return clean;
}
- ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<string> prt, out string forwardQueue_next)
+ ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<string> prt, out string forwardQueueNext)
{
- forwardQueue_next = null;
+ forwardQueueNext = null;
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(curr.queryResult);
@@ -500,8 +495,6 @@ class Scraper
#region Next
- string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" };
-
if (backBuffer.Where(b => !b.isSpecial).Count() > 4 &&
backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
@@ -517,8 +510,8 @@ class Scraper
if (next == null)
next = nodeContent.Descendants()
.Where(p => p.Name.ToLower() == "a")
- .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
- .Where(p => p.Attributes.Contains("href"))
+ .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
+ .Where(p => p.Attributes.Contains("href"))
.FirstOrDefault();
if (next == null)
@@ -559,7 +552,7 @@ class Scraper
curr.next = next_url;
if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
{
- forwardQueue_next = next_url;
+ forwardQueueNext = next_url;
}
}
@@ -940,14 +933,12 @@ class Scraper
new XAttribute("full-path", "OEBPS/content.opf"),
new XAttribute("media-type", "application/oebps-package+xml")))));
- StringBuilder builder = new StringBuilder();
- using (Utf8StringWriter writer = new Utf8StringWriter())
- {
- doc.Save(writer);
- var r = writer.ToString();
- r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
- return r.Trim() + "\r\n";
- }
+ using Utf8StringWriter writer = new Utf8StringWriter();
+
+ doc.Save(writer);
+ var r = writer.ToString();
+ r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
+ return r.Trim() + "\r\n";
}
 string GetEpubContentOPF(List<Chapter> chapters)
@@ -1030,17 +1021,14 @@ class Scraper
package.Add(new XElement(opf + "guide"));
- StringBuilder builder = new StringBuilder();
- using (Utf8StringWriter writer = new Utf8StringWriter())
- {
- doc.Save(writer);
- return writer.ToString();
- }
+ using Utf8StringWriter writer = new Utf8StringWriter();
+
+ doc.Save(writer);
+ return writer.ToString();
}
 string GetEpubTOC(List<Chapter> chapters)
{
- XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/";
XNamespace ncx = "http://www.idpf.org/2007/opf";
var doc = new XDocument(
@@ -1082,12 +1070,10 @@ class Scraper
root.Add(nav);
- StringBuilder builder = new StringBuilder();
- using (Utf8StringWriter writer = new Utf8StringWriter())
- {
- doc.Save(writer);
- return writer.ToString();
- }
+ using Utf8StringWriter writer = new Utf8StringWriter();
+
+ doc.Save(writer);
+ return writer.ToString();
}
string GetEpubChapterFile(Chapter chapter, int idx)
@@ -1108,256 +1094,4 @@ class Scraper
return xml.ToString();
}
-
- public struct ProcessOutput
- {
- public readonly string Command;
- public readonly int ExitCode;
- public readonly string StdOut;
- public readonly string StdErr;
- public readonly string StdCombined;
-
- public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
- {
- Command = cmd;
- ExitCode = ex;
- StdOut = stdout;
- StdErr = stderr;
- StdCombined = stdcom;
- }
-
- public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
- }
-
- public static class ProcessHelper
- {
- public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
- {
- var process = new Process
- {
- StartInfo =
- {
- FileName = command,
- Arguments = arguments,
- WorkingDirectory = workingDirectory ?? string.Empty,
- UseShellExecute = false,
- RedirectStandardOutput = true,
- RedirectStandardError = true,
- CreateNoWindow = true,
- ErrorDialog = false,
- }
- };
-
- var builderOut = new StringBuilder();
- var builderErr = new StringBuilder();
- var builderBoth = new StringBuilder();
-
- process.OutputDataReceived += (sender, args) =>
- {
- if (args.Data == null) return;
-
- if (builderOut.Length == 0) builderOut.Append(args.Data);
- else builderOut.Append("\n" + args.Data);
-
- if (builderBoth.Length == 0) builderBoth.Append(args.Data);
- else builderBoth.Append("\n" + args.Data);
- };
-
- process.ErrorDataReceived += (sender, args) =>
- {
- if (args.Data == null) return;
-
- if (builderErr.Length == 0) builderErr.Append(args.Data);
- else builderErr.Append("\n" + args.Data);
-
- if (builderBoth.Length == 0) builderBoth.Append(args.Data);
- else builderBoth.Append("\n" + args.Data);
- };
-
- process.Start();
-
- process.BeginOutputReadLine();
- process.BeginErrorReadLine();
-
- process.WaitForExit();
-
- return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
- }
- }
- public static class HTMLToText
- {
- private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
- private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
-
- private class PreceedingDomTextInfo
- {
- public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
- {
- IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
- }
- public bool WritePrecedingWhiteSpace { get; set; }
- public bool LastCharWasSpace { get; set; }
- public readonly BoolWrapper IsFirstTextOfDocWritten;
- public int ListIndex { get; set; }
- }
-
- private class BoolWrapper
- {
- public BoolWrapper() { }
- public bool Value { get; set; }
- public static implicit operator bool(BoolWrapper boolWrapper)
- {
- return boolWrapper.Value;
- }
- public static implicit operator BoolWrapper(bool boolWrapper)
- {
- return new BoolWrapper { Value = boolWrapper };
- }
- }
-
- public static string Convert(string path)
- {
- HtmlDocument doc = new HtmlDocument();
- doc.Load(path);
- return ConvertDoc(doc);
- }
-
- public static string ConvertHtml(string html)
- {
- HtmlDocument doc = new HtmlDocument();
- html = REX_TAG1.Replace(html, " ");
- html = REX_TAG2.Replace(html, " ");
- doc.LoadHtml(html);
- return ConvertDoc(doc);
- }
-
- public static string ConvertDoc(HtmlDocument doc)
- {
- using (StringWriter sw = new StringWriter())
- {
- ConvertTo(doc.DocumentNode, sw);
- sw.Flush();
- return sw.ToString();
- }
- }
-
- private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
- {
- foreach (HtmlNode subnode in node.ChildNodes)
- {
- ConvertTo(subnode, outText, textInfo);
- }
- }
-
- public static void ConvertTo(HtmlNode node, TextWriter outText)
- {
- ConvertTo(node, outText, new PreceedingDomTextInfo(false));
- }
-
- private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
- {
- string html;
- switch (node.NodeType)
- {
- case HtmlNodeType.Comment:
- // don't output comments
- break;
- case HtmlNodeType.Document:
- ConvertContentTo(node, outText, textInfo);
- break;
- case HtmlNodeType.Text:
- // script and style must not be output
- string parentName = node.ParentNode.Name;
- if ((parentName == "script") || (parentName == "style"))
- {
- break;
- }
- // get text
- html = ((HtmlTextNode)node).Text;
- // is it in fact a special closing node output as text?
- if (HtmlNode.IsOverlappedClosingElement(html)) break;
-
- // check the text is meaningful and not a bunch of whitespaces
- if (html.Length == 0) break;
-
- if (html.Trim().ToLower().StartsWith("<!doctype")) break;
-
- if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
- {
- html = html.TrimStart();
- if (html.Length == 0) { break; }
- textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
- }
- outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
- if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
- {
- outText.Write(' ');
- }
- break;
- case HtmlNodeType.Element:
- string endElementString = null;
- bool isInline;
- bool skip = false;
- int listIndex = 0;
- switch (node.Name)
- {
- case "nav":
- skip = true;
- isInline = false;
- break;
- case "body":
- case "section":
- case "article":
- case "aside":
- case "h1":
- case "h2":
- case "header":
- case "footer":
- case "address":
- case "main":
- case "div":
- case "span":
- case "p": // stylistic - adjust as you tend to use
- if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
- endElementString = "\r\n";
- isInline = false;
- break;
- case "br":
- outText.Write("\r\n");
- skip = true;
- textInfo.WritePrecedingWhiteSpace = false;
- isInline = true;
- break;
- case "a":
- isInline = true;
- break;
- case "li":
- isInline = false;
- break;
- case "ol":
- listIndex = 1;
- goto case "ul";
- case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
- endElementString = "\r\n";
- isInline = false;
- break;
- case "img": //inline-block in reality
- isInline = true;
- break;
- default:
- isInline = true;
- break;
- }
- if (!skip && node.HasChildNodes)
- {
- ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
- }
- if (endElementString != null)
- {
- outText.Write(endElementString);
- }
- break;
- }
- }
- }
}
diff --git a/Scraper/Utf8StringWriter.cs b/Scraper/Utf8StringWriter.cs
index 6b493fa..05a1d57 100644
--- a/Scraper/Utf8StringWriter.cs
+++ b/Scraper/Utf8StringWriter.cs
@@ -4,5 +4,5 @@ namespace WordpressEboobScraper2.Scraper;
public class Utf8StringWriter : StringWriter
{
- public override Encoding Encoding { get { return Encoding.UTF8; } }
+ public override Encoding Encoding => Encoding.UTF8;
}
\ No newline at end of file
diff --git a/WordpressEboobScraper2.csproj b/WordpressEboobScraper2.csproj
index a0ba966..7da616f 100644
--- a/WordpressEboobScraper2.csproj
+++ b/WordpressEboobScraper2.csproj
@@ -4,7 +4,7 @@
Exe
net7.0
enable
- enable
+ disable
@@ -14,4 +14,6 @@
+
+