From d981d092e42c8af1b1857e8212d36382339ae8db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Schw=C3=B6rer?= Date: Tue, 3 Oct 2023 16:13:37 +0200 Subject: [PATCH] fix a few compiler warnings --- .../.idea/vcs.xml | 1 - Proc/HTMLToText.cs | 181 ++++++ Proc/ProcessHelper.cs | 59 ++ Proc/ProcessOutput.cs | 21 + Scraper/Helper.cs | 3 +- Scraper/Scraper.cs | 582 +++++------------- Scraper/Utf8StringWriter.cs | 2 +- WordpressEboobScraper2.csproj | 4 +- 8 files changed, 424 insertions(+), 429 deletions(-) create mode 100644 Proc/HTMLToText.cs create mode 100644 Proc/ProcessHelper.cs create mode 100644 Proc/ProcessOutput.cs diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml index 288b36b..94a25f7 100644 --- a/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml +++ b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml @@ -1,7 +1,6 @@ - \ No newline at end of file diff --git a/Proc/HTMLToText.cs b/Proc/HTMLToText.cs new file mode 100644 index 0000000..9d471d9 --- /dev/null +++ b/Proc/HTMLToText.cs @@ -0,0 +1,181 @@ +using System.Text.RegularExpressions; +using HtmlAgilityPack; + +namespace WordpressEboobScraper2.Proc; + +public static class HTMLToText + { + private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled); + private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled); + + private class PreceedingDomTextInfo + { + public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten) + { + IsFirstTextOfDocWritten = isFirstTextOfDocWritten; + } + public bool WritePrecedingWhiteSpace { get; set; } + public bool LastCharWasSpace { get; set; } + public readonly BoolWrapper IsFirstTextOfDocWritten; + public int ListIndex { get; set; } + } + + private class BoolWrapper + { + public BoolWrapper() { } + public bool Value { get; set; } + public static implicit operator bool(BoolWrapper boolWrapper) + { + return boolWrapper.Value; + } + public static implicit operator BoolWrapper(bool boolWrapper) + { + return new BoolWrapper { Value = boolWrapper }; + } + } + + public static string Convert(string path) + { + HtmlDocument doc = new HtmlDocument(); + doc.Load(path); + return ConvertDoc(doc); + } + + public static string ConvertHtml(string html) + { + HtmlDocument doc = new HtmlDocument(); + html = REX_TAG1.Replace(html, " "); + html = REX_TAG2.Replace(html, " "); + doc.LoadHtml(html); + return ConvertDoc(doc); + } + + public static string ConvertDoc(HtmlDocument doc) + { + using (StringWriter sw = new StringWriter()) + { + ConvertTo(doc.DocumentNode, sw); + sw.Flush(); + return sw.ToString(); + } + } + + private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) + { + foreach (HtmlNode subnode in node.ChildNodes) + { + ConvertTo(subnode, outText, textInfo); + } + } + + public static void ConvertTo(HtmlNode node, TextWriter outText) + { + ConvertTo(node, outText, new PreceedingDomTextInfo(false)); + } + + private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) + { + string html; + switch (node.NodeType) + { + case HtmlNodeType.Comment: + // don't output comments + break; + case HtmlNodeType.Document: + ConvertContentTo(node, outText, textInfo); + break; + case HtmlNodeType.Text: + // script and style must not be output + string parentName = node.ParentNode.Name; + if ((parentName == "script") || (parentName == "style")) + { + break; + } + // get text + html = ((HtmlTextNode)node).Text; + // is it in fact a special closing node output as text? + if (HtmlNode.IsOverlappedClosingElement(html)) break; + + // check the text is meaningful and not a bunch of whitespaces + if (html.Length == 0) break; + + if (html.Trim().ToLower().StartsWith("")) break; + + if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace) + { + html = html.TrimStart(); + if (html.Length == 0) { break; } + textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true; + } + outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " "))); + if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1])) + { + outText.Write(' '); + } + break; + case HtmlNodeType.Element: + string endElementString = null; + bool isInline; + bool skip = false; + int listIndex = 0; + switch (node.Name) + { + case "nav": + skip = true; + isInline = false; + break; + case "body": + case "section": + case "article": + case "aside": + case "h1": + case "h2": + case "header": + case "footer": + case "address": + case "main": + case "div": + case "span": + case "p": // stylistic - adjust as you tend to use + if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n"); + endElementString = "\r\n"; + isInline = false; + break; + case "br": + outText.Write("\r\n"); + skip = true; + textInfo.WritePrecedingWhiteSpace = false; + isInline = true; + break; + case "a": + isInline = true; + break; + case "li": + isInline = false; + break; + case "ol": + listIndex = 1; + goto case "ul"; + case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems + endElementString = "\r\n"; + isInline = false; + break; + case "img": //inline-block in reality + isInline = true; + break; + default: + isInline = true; + break; + } + if (!skip && node.HasChildNodes) + { + ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex }); + } + if (endElementString != null) + { + outText.Write(endElementString); + } + break; + } + } + } \ No newline at end of file diff --git a/Proc/ProcessHelper.cs b/Proc/ProcessHelper.cs new file mode 100644 index 0000000..08730d6 --- /dev/null +++ b/Proc/ProcessHelper.cs @@ -0,0 +1,59 @@ +using System.Text; + +namespace WordpressEboobScraper2.Proc; + +public static class ProcessHelper +{ + public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null) + { + var process = new System.Diagnostics.Process + { + StartInfo = + { + FileName = command, + Arguments = arguments, + WorkingDirectory = workingDirectory ?? string.Empty, + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true, + ErrorDialog = false, + } + }; + + var builderOut = new StringBuilder(); + var builderErr = new StringBuilder(); + var builderBoth = new StringBuilder(); + + process.OutputDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderOut.Length == 0) builderOut.Append(args.Data); + else builderOut.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.ErrorDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderErr.Length == 0) builderErr.Append(args.Data); + else builderErr.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.Start(); + + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + process.WaitForExit(); + + return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString()); + } +} \ No newline at end of file diff --git a/Proc/ProcessOutput.cs b/Proc/ProcessOutput.cs new file mode 100644 index 0000000..7813e58 --- /dev/null +++ b/Proc/ProcessOutput.cs @@ -0,0 +1,21 @@ +namespace WordpressEboobScraper2.Proc; + +public struct ProcessOutput +{ + public readonly string Command; + public readonly int ExitCode; + public readonly string StdOut; + public readonly string StdErr; + public readonly string StdCombined; + + public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom) + { + Command = cmd; + ExitCode = ex; + StdOut = stdout; + StdErr = stderr; + StdCombined = stdcom; + } + + public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}"; +} \ No newline at end of file diff --git a/Scraper/Helper.cs b/Scraper/Helper.cs index 8528275..7c80bc9 100644 --- a/Scraper/Helper.cs +++ b/Scraper/Helper.cs @@ -2,9 +2,8 @@ using HtmlAgilityPack; namespace WordpressEboobScraper2.Scraper; -public class Helper +public static class Helper { - public static string Filenamify(string v, bool repl = false) { var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p => diff --git a/Scraper/Scraper.cs b/Scraper/Scraper.cs index 9ce0fda..4da91d5 100644 --- a/Scraper/Scraper.cs +++ b/Scraper/Scraper.cs @@ -6,6 +6,7 @@ using System.Xml.Linq; using System.Xml.Serialization; using HtmlAgilityPack; using Ionic.Zip; +using WordpressEboobScraper2.Proc; namespace WordpressEboobScraper2.Scraper; @@ -15,16 +16,16 @@ namespace WordpressEboobScraper2.Scraper; /** **/ /** *************************************************** **/ -class Scraper +public class Scraper { static EpubParameter ACTIVE_BOOK = null; const int LIMIT = 1500; - readonly Regex REX_NUMSTART = new Regex(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled); + readonly Regex REX_NUMSTART = new(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled); - Dictionary webCache = new Dictionary(); + Dictionary webCache = new(); string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar; @@ -144,7 +145,7 @@ class Scraper void SaveCache() { var xs = new XmlSerializer(typeof(List)); - using (var writer = new System.IO.StreamWriter(WCACHE_FILE)) + using (var writer = new StreamWriter(WCACHE_FILE)) { xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList()); } @@ -155,77 +156,74 @@ class Scraper if (!File.Exists(WCACHE_FILE)) return; XmlSerializer deserializer = new XmlSerializer(typeof(List)); - using (TextReader reader = new StreamReader(WCACHE_FILE)) - { - var result = new List(); + + using TextReader reader = new StreamReader(WCACHE_FILE); + + var l = (List)deserializer.Deserialize(reader); - var l = (List)deserializer.Deserialize(reader); - - webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); - } + webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); } List FindChapters() { List result = new List(); - using (WebClient client = new WebClient()) + using WebClient client = new WebClient(); + + client.Encoding = Encoding.UTF8; + Stack buffer = new Stack(); + buffer.Push(ACTIVE_BOOK.StartURL); + + while (buffer.Any() && result.Count < LIMIT) { - client.Encoding = Encoding.UTF8; - Stack buffer = new Stack(); - buffer.Push(ACTIVE_BOOK.StartURL); + var url = buffer.Pop(); + Chapter curr = new Chapter() { url = url }; - while (buffer.Any() && result.Count < LIMIT) + var buffered = webCache.ContainsKey(url.ToLower()); + if (buffered) { - var url = buffer.Pop(); - Chapter curr = new Chapter() { url = url }; - - var buffered = webCache.ContainsKey(url.ToLower()); - if (buffered) - { - curr.queryResult = webCache[url.ToLower()]; - "*(loaded from webcache)*".Dump(); - } - else - { - curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); - webCache[url.ToLower()] = curr.queryResult; - SaveCache(); - } - - var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); - if (next_url != null) buffer.Push(next_url); - - if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST) - { - "".Dump(); - "//==> *(auto-reload from live)*".Dump(); - "".Dump(); - curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); - webCache[url.ToLower()] = curr.queryResult; - SaveCache(); - - r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); - if (next_url_inner != null) buffer.Push(next_url_inner); - } - if (r == ProcessResult.SuccessNormal) - { - " ==> Chapter processed".Dump(); - result.Add(curr); - OutputChapter(curr, result.Count); - } - else if (r == ProcessResult.SkipChapter) - { - " ==> Skip this chapter".Dump(); - } - else if (r == ProcessResult.ReachedEnd) - { - " ==> End reached".Dump(); - } - - - "".Dump(); + curr.queryResult = webCache[url.ToLower()]; + "*(loaded from webcache)*".Dump(); } + else + { + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + } + + var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); + if (next_url != null) buffer.Push(next_url); + + if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST) + { + "".Dump(); + "//==> *(auto-reload from live)*".Dump(); + "".Dump(); + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + + r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); + if (next_url_inner != null) buffer.Push(next_url_inner); + } + if (r == ProcessResult.SuccessNormal) + { + " ==> Chapter processed".Dump(); + result.Add(curr); + OutputChapter(curr, result.Count); + } + else if (r == ProcessResult.SkipChapter) + { + " ==> Skip this chapter".Dump(); + } + else if (r == ProcessResult.ReachedEnd) + { + " ==> End reached".Dump(); + } + + + "".Dump(); } return result; @@ -233,96 +231,93 @@ class Scraper void VerifyChapters() { - List result = new List(); + using WebClient client = new WebClient(); + + client.Encoding = Encoding.UTF8; + Stack buffer = new Stack(); + buffer.Push(ACTIVE_BOOK.StartURL); - using (WebClient client = new WebClient()) + while (buffer.Any()) { - client.Encoding = Encoding.UTF8; - Stack buffer = new Stack(); - buffer.Push(ACTIVE_BOOK.StartURL); + var url = buffer.Pop(); + Chapter curr_buffer = new Chapter() { url = url }; + Chapter curr_live = new Chapter() { url = url }; - while (buffer.Any() && result.Count < LIMIT) + var buffered = webCache.ContainsKey(url.ToLower()); + if (buffered) { - var url = buffer.Pop(); - Chapter curr_buffer = new Chapter() { url = url }; - Chapter curr_live = new Chapter() { url = url }; - - var buffered = webCache.ContainsKey(url.ToLower()); - if (buffered) + try { - try - { - curr_buffer.queryResult = webCache[url.ToLower()]; - curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); - } - catch (Exception e) - { - $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump(); - continue; - } + curr_buffer.queryResult = webCache[url.ToLower()]; + curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); } - else + catch (Exception e) { + $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump(); continue; } - - var is_diff = false; - - var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer); - var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live); - - if (next_buffer != null) buffer.Push(next_buffer); - - if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; } - if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; } - - if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; } - if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; } - - if (curr_buffer.chapter.Value != curr_live.chapter.Value) - { - var clean_buffer = GetChapterText(curr_buffer); - var clean_live = GetChapterText(curr_live); - - if (clean_buffer.Trim() != clean_live.Trim()) - { - $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump(); - new Hyperlinq(() => - { - - var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); - var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); - File.WriteAllText(fa, curr_buffer.chapter.Value); - File.WriteAllText(fb, curr_live.chapter.Value); - Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); - - }, "[Compare Raw]").Dump(); - new Hyperlinq(() => - { - - var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); - var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); - File.WriteAllText(fa, clean_buffer); - File.WriteAllText(fb, clean_live); - Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); - - }, "[Compare Text]").Dump(); - new Hyperlinq(() => - { - - webCache[url.ToLower()] = curr_live.queryResult; - SaveCache(); - - }, "[Save new version to webcache]").Dump(); - - is_diff = true; - } - } - - if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump(); - - if (is_diff) "".Dump(); } + else + { + continue; + } + + var is_diff = false; + + var r_buffer = ProcessChapter(curr_buffer, new List(), _ => {}, out var next_buffer); + var r_live = ProcessChapter(curr_live, new List(), _ => {}, out var next_live); + + if (next_buffer != null) buffer.Push(next_buffer); + + if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; } + if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; } + + if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; } + if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; } + + if (curr_buffer.chapter.Value != curr_live.chapter.Value) + { + var clean_buffer = GetChapterText(curr_buffer); + var clean_live = GetChapterText(curr_live); + + if (clean_buffer.Trim() != clean_live.Trim()) + { + $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump(); + new Hyperlinq(() => + { + + var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); + var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); + File.WriteAllText(fa, curr_buffer.chapter.Value); + File.WriteAllText(fb, curr_live.chapter.Value); + Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); + + }, "[Compare Raw]").Dump(); + new Hyperlinq(() => + { + + var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt"); + var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt"); + File.WriteAllText(fa, clean_buffer); + File.WriteAllText(fb, clean_live); + Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\""); + + }, "[Compare Text]").Dump(); + new Hyperlinq(() => + { + + webCache[url.ToLower()] = curr_live.queryResult; + SaveCache(); + + }, "[Save new version to webcache]").Dump(); + + is_diff = true; + } + } + + if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump(); + + if (is_diff) "".Dump(); } } @@ -350,9 +345,9 @@ class Scraper return clean; } - ProcessResult ProcessChapter(Chapter curr, IReadOnlyList backBuffer, Action prt, out string forwardQueue_next) + ProcessResult ProcessChapter(Chapter curr, IReadOnlyList backBuffer, Action prt, out string forwardQueueNext) { - forwardQueue_next = null; + forwardQueueNext = null; HtmlDocument doc = new HtmlDocument(); doc.LoadHtml(curr.queryResult); @@ -500,8 +495,6 @@ class Scraper #region Next - string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" }; - if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success && @@ -517,8 +510,8 @@ class Scraper if (next == null) next = nodeContent.Descendants() .Where(p => p.Name.ToLower() == "a") - .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next") - .Where(p => p.Attributes.Contains("href")) + .Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next") + .Where(p => p.Attributes.Contains("href")) .FirstOrDefault(); if (next == null) @@ -559,7 +552,7 @@ class Scraper curr.next = next_url; if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower())) { - forwardQueue_next = next_url; + forwardQueueNext = next_url; } } @@ -940,14 +933,12 @@ class Scraper new XAttribute("full-path", "OEBPS/content.opf"), new XAttribute("media-type", "application/oebps-package+xml"))))); - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - var r = writer.ToString(); - r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\""); - return r.Trim() + "\r\n"; - } + using Utf8StringWriter writer = new Utf8StringWriter(); + + doc.Save(writer); + var r = writer.ToString(); + r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\""); + return r.Trim() + "\r\n"; } string GetEpubContentOPF(List chapters) @@ -1030,17 +1021,14 @@ class Scraper package.Add(new XElement(opf + "guide")); - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - return writer.ToString(); - } + using Utf8StringWriter writer = new Utf8StringWriter(); + + doc.Save(writer); + return writer.ToString(); } string GetEpubTOC(List chapters) { - XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/"; XNamespace ncx = "http://www.idpf.org/2007/opf"; var doc = new XDocument( @@ -1082,12 +1070,10 @@ class Scraper root.Add(nav); - StringBuilder builder = new StringBuilder(); - using (Utf8StringWriter writer = new Utf8StringWriter()) - { - doc.Save(writer); - return writer.ToString(); - } + using Utf8StringWriter writer = new Utf8StringWriter(); + + doc.Save(writer); + return writer.ToString(); } string GetEpubChapterFile(Chapter chapter, int idx) @@ -1108,256 +1094,4 @@ class Scraper return xml.ToString(); } - - public struct ProcessOutput - { - public readonly string Command; - public readonly int ExitCode; - public readonly string StdOut; - public readonly string StdErr; - public readonly string StdCombined; - - public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom) - { - Command = cmd; - ExitCode = ex; - StdOut = stdout; - StdErr = stderr; - StdCombined = stdcom; - } - - public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}"; - } - - public static class ProcessHelper - { - public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null) - { - var process = new Process - { - StartInfo = - { - FileName = command, - Arguments = arguments, - WorkingDirectory = workingDirectory ?? string.Empty, - UseShellExecute = false, - RedirectStandardOutput = true, - RedirectStandardError = true, - CreateNoWindow = true, - ErrorDialog = false, - } - }; - - var builderOut = new StringBuilder(); - var builderErr = new StringBuilder(); - var builderBoth = new StringBuilder(); - - process.OutputDataReceived += (sender, args) => - { - if (args.Data == null) return; - - if (builderOut.Length == 0) builderOut.Append(args.Data); - else builderOut.Append("\n" + args.Data); - - if (builderBoth.Length == 0) builderBoth.Append(args.Data); - else builderBoth.Append("\n" + args.Data); - }; - - process.ErrorDataReceived += (sender, args) => - { - if (args.Data == null) return; - - if (builderErr.Length == 0) builderErr.Append(args.Data); - else builderErr.Append("\n" + args.Data); - - if (builderBoth.Length == 0) builderBoth.Append(args.Data); - else builderBoth.Append("\n" + args.Data); - }; - - process.Start(); - - process.BeginOutputReadLine(); - process.BeginErrorReadLine(); - - process.WaitForExit(); - - return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString()); - } - } - public static class HTMLToText - { - private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled); - private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled); - - private class PreceedingDomTextInfo - { - public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten) - { - IsFirstTextOfDocWritten = isFirstTextOfDocWritten; - } - public bool WritePrecedingWhiteSpace { get; set; } - public bool LastCharWasSpace { get; set; } - public readonly BoolWrapper IsFirstTextOfDocWritten; - public int ListIndex { get; set; } - } - - private class BoolWrapper - { - public BoolWrapper() { } - public bool Value { get; set; } - public static implicit operator bool(BoolWrapper boolWrapper) - { - return boolWrapper.Value; - } - public static implicit operator BoolWrapper(bool boolWrapper) - { - return new BoolWrapper { Value = boolWrapper }; - } - } - - public static string Convert(string path) - { - HtmlDocument doc = new HtmlDocument(); - doc.Load(path); - return ConvertDoc(doc); - } - - public static string ConvertHtml(string html) - { - HtmlDocument doc = new HtmlDocument(); - html = REX_TAG1.Replace(html, " "); - html = REX_TAG2.Replace(html, " "); - doc.LoadHtml(html); - return ConvertDoc(doc); - } - - public static string ConvertDoc(HtmlDocument doc) - { - using (StringWriter sw = new StringWriter()) - { - ConvertTo(doc.DocumentNode, sw); - sw.Flush(); - return sw.ToString(); - } - } - - private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) - { - foreach (HtmlNode subnode in node.ChildNodes) - { - ConvertTo(subnode, outText, textInfo); - } - } - - public static void ConvertTo(HtmlNode node, TextWriter outText) - { - ConvertTo(node, outText, new PreceedingDomTextInfo(false)); - } - - private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo) - { - string html; - switch (node.NodeType) - { - case HtmlNodeType.Comment: - // don't output comments - break; - case HtmlNodeType.Document: - ConvertContentTo(node, outText, textInfo); - break; - case HtmlNodeType.Text: - // script and style must not be output - string parentName = node.ParentNode.Name; - if ((parentName == "script") || (parentName == "style")) - { - break; - } - // get text - html = ((HtmlTextNode)node).Text; - // is it in fact a special closing node output as text? - if (HtmlNode.IsOverlappedClosingElement(html)) break; - - // check the text is meaningful and not a bunch of whitespaces - if (html.Length == 0) break; - - if (html.Trim().ToLower().StartsWith("")) break; - - if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace) - { - html = html.TrimStart(); - if (html.Length == 0) { break; } - textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true; - } - outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " "))); - if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1])) - { - outText.Write(' '); - } - break; - case HtmlNodeType.Element: - string endElementString = null; - bool isInline; - bool skip = false; - int listIndex = 0; - switch (node.Name) - { - case "nav": - skip = true; - isInline = false; - break; - case "body": - case "section": - case "article": - case "aside": - case "h1": - case "h2": - case "header": - case "footer": - case "address": - case "main": - case "div": - case "span": - case "p": // stylistic - adjust as you tend to use - if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n"); - endElementString = "\r\n"; - isInline = false; - break; - case "br": - outText.Write("\r\n"); - skip = true; - textInfo.WritePrecedingWhiteSpace = false; - isInline = true; - break; - case "a": - isInline = true; - break; - case "li": - isInline = false; - break; - case "ol": - listIndex = 1; - goto case "ul"; - case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems - endElementString = "\r\n"; - isInline = false; - break; - case "img": //inline-block in reality - isInline = true; - break; - default: - isInline = true; - break; - } - if (!skip && node.HasChildNodes) - { - ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex }); - } - if (endElementString != null) - { - outText.Write(endElementString); - } - break; - } - } - } } diff --git a/Scraper/Utf8StringWriter.cs b/Scraper/Utf8StringWriter.cs index 6b493fa..05a1d57 100644 --- a/Scraper/Utf8StringWriter.cs +++ b/Scraper/Utf8StringWriter.cs @@ -4,5 +4,5 @@ namespace WordpressEboobScraper2.Scraper; public class Utf8StringWriter : StringWriter { - public override Encoding Encoding { get { return Encoding.UTF8; } } + public override Encoding Encoding => Encoding.UTF8; } \ No newline at end of file diff --git a/WordpressEboobScraper2.csproj b/WordpressEboobScraper2.csproj index a0ba966..7da616f 100644 --- a/WordpressEboobScraper2.csproj +++ b/WordpressEboobScraper2.csproj @@ -4,7 +4,7 @@ Exe net7.0 enable - enable + disable @@ -14,4 +14,6 @@ + +