1
0
Fork 0

fix a few compiler warnings

This commit is contained in:
Mike Schwörer 2023-10-03 16:13:37 +02:00
parent 4cc76a45ef
commit d981d092e4
Signed by: Mikescher
GPG Key ID: D3C7172E0A70F8CF
8 changed files with 424 additions and 429 deletions

View File

@ -1,7 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

181
Proc/HTMLToText.cs Normal file
View File

@ -0,0 +1,181 @@
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace WordpressEboobScraper2.Proc;
/// <summary>
/// Converts HTML into plain text by walking an HtmlAgilityPack DOM and writing
/// the text nodes, inserting CRLF line breaks around block-level elements.
/// </summary>
public static class HTMLToText
{
    // Self-closing <link/>, <style/> and <script/> tags.
    private static readonly Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);

    // <link>, <style> and <script> elements whose content contains no further tags.
    private static readonly Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);

    // Runs of two or more whitespace characters; collapsed to a single space on output.
    private static readonly Regex REX_MULTISPACE = new Regex(@"\s{2,}", RegexOptions.Compiled);

    /// <summary>
    /// Whitespace bookkeeping that is carried along while walking the DOM.
    /// Non-inline elements get a fresh instance (see <see cref="WriteElement"/>),
    /// but all instances share the same <see cref="IsFirstTextOfDocWritten"/> wrapper.
    /// </summary>
    private class PreceedingDomTextInfo
    {
        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }

        public bool WritePrecedingWhiteSpace { get; set; }

        public bool LastCharWasSpace { get; set; }

        // Shared by reference so that a nested context can flip the flag for the whole document.
        public readonly BoolWrapper IsFirstTextOfDocWritten;

        public int ListIndex { get; set; }
    }

    /// <summary>
    /// Mutable reference-type box around a <see cref="bool"/>, so the flag can be
    /// shared between several <see cref="PreceedingDomTextInfo"/> instances.
    /// </summary>
    private class BoolWrapper
    {
        public BoolWrapper() { }

        public bool Value { get; set; }

        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }

        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper { Value = boolWrapper };
        }
    }

    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to plain text.</summary>
    /// <param name="path">Path of the HTML file to load.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>
    /// Converts an HTML string to plain text. Matches of the link/style/script
    /// patterns are replaced by a single space before the markup is parsed.
    /// </summary>
    /// <param name="html">The HTML markup to convert.</param>
    /// <returns>The plain-text rendering of the markup.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        html = REX_TAG1.Replace(html, " ");
        html = REX_TAG2.Replace(html, " ");
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already parsed document to plain text.</summary>
    /// <param name="doc">The parsed document.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Converts every child of <paramref name="node"/> in document order.</summary>
    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>
    /// Writes the plain-text rendering of <paramref name="node"/> (and its
    /// descendants) to <paramref name="outText"/>, starting with fresh whitespace state.
    /// </summary>
    /// <param name="node">The node to convert.</param>
    /// <param name="outText">The writer that receives the text.</param>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>Dispatches on the node type; comments (and any unlisted node type) produce no output.</summary>
    private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                WriteTextNode(node, outText, textInfo);
                break;

            case HtmlNodeType.Element:
                WriteElement(node, outText, textInfo);
                break;
        }
    }

    /// <summary>Writes a single text node, normalising its whitespace and decoding HTML entities.</summary>
    private static void WriteTextNode(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        // script and style must not be output.
        // ParentNode is null-checked so that a detached text node does not crash the conversion.
        string parentName = node.ParentNode?.Name;
        if (parentName == "script" || parentName == "style")
        {
            return;
        }

        string html = ((HtmlTextNode)node).Text;

        // is it in fact a special closing node output as text?
        if (HtmlNode.IsOverlappedClosingElement(html))
        {
            return;
        }

        // check the text is meaningful and not a bunch of whitespaces
        if (html.Length == 0)
        {
            return;
        }

        // Skip an XML declaration that ended up as text. Ordinal comparison: this is markup, not linguistic text.
        string trimmed = html.Trim();
        if (trimmed.StartsWith("<?xml", StringComparison.OrdinalIgnoreCase) && trimmed.EndsWith("?>", StringComparison.Ordinal))
        {
            return;
        }

        if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
        {
            // Leading whitespace is dropped at the start of a context or directly after an emitted space.
            html = html.TrimStart();
            if (html.Length == 0)
            {
                return;
            }

            textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
        }

        outText.Write(HtmlEntity.DeEntitize(REX_MULTISPACE.Replace(html.TrimEnd(), " ")));

        // html is non-empty here (checked above), so indexing the last character is safe.
        // Trailing whitespace was trimmed for the write above; it is re-emitted as exactly one space.
        textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
        if (textInfo.LastCharWasSpace)
        {
            outText.Write(' ');
        }
    }

    /// <summary>
    /// Writes an element: emits line breaks for the element names treated as blocks,
    /// skips <c>nav</c> content entirely, and recurses into the children otherwise.
    /// </summary>
    private static void WriteElement(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string endElementString = null;
        bool isInline;
        bool skip = false;
        int listIndex = 0;

        switch (node.Name)
        {
            case "nav":
                skip = true;
                isInline = false;
                break;

            case "body":
            case "section":
            case "article":
            case "aside":
            case "h1":
            case "h2":
            case "header":
            case "footer":
            case "address":
            case "main":
            case "div":
            case "span":
            case "p": // stylistic - adjust as you tend to use
                if (textInfo.IsFirstTextOfDocWritten)
                {
                    outText.Write("\r\n");
                }
                endElementString = "\r\n";
                isInline = false;
                break;

            case "br":
                outText.Write("\r\n");
                skip = true;
                textInfo.WritePrecedingWhiteSpace = false;
                isInline = true;
                break;

            case "li":
                isInline = false;
                break;

            case "ol":
                listIndex = 1;
                goto case "ul";

            case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
                endElementString = "\r\n";
                isInline = false;
                break;

            case "a":
            case "img": // inline-block in reality
            default:
                isInline = true;
                break;
        }

        if (!skip && node.HasChildNodes)
        {
            // Inline elements keep the caller's whitespace state; every other element starts
            // a fresh state that still shares the document-wide "first text written" flag.
            PreceedingDomTextInfo childInfo = isInline
                ? textInfo
                : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };

            ConvertContentTo(node, outText, childInfo);
        }

        if (endElementString != null)
        {
            outText.Write(endElementString);
        }
    }
}

59
Proc/ProcessHelper.cs Normal file
View File

@ -0,0 +1,59 @@
using System.Text;
namespace WordpressEboobScraper2.Proc;
/// <summary>
/// Helper for running an external program and capturing its output.
/// </summary>
public static class ProcessHelper
{
    /// <summary>
    /// Starts <paramref name="command"/> without a shell or window, waits for it to exit
    /// and returns its exit code together with the captured stdout and stderr.
    /// </summary>
    /// <param name="command">The executable to start.</param>
    /// <param name="arguments">The command-line arguments; <c>null</c> is treated as empty when building the display string.</param>
    /// <param name="workingDirectory">The working directory; <c>null</c> is passed on as an empty string.</param>
    /// <returns>
    /// A <see cref="ProcessOutput"/> holding the command line (with CR/LF escaped), the exit code,
    /// and the stdout, stderr and combined output with lines joined by <c>\n</c>.
    /// </returns>
    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
    {
        // Process is IDisposable; dispose it once the exit code has been read.
        using var process = new System.Diagnostics.Process
        {
            StartInfo =
            {
                FileName = command,
                Arguments = arguments,
                WorkingDirectory = workingDirectory ?? string.Empty,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
                ErrorDialog = false,
            }
        };

        var builderOut = new StringBuilder();
        var builderErr = new StringBuilder();
        var builderBoth = new StringBuilder();

        // The two data-received handlers are raised asynchronously and both write to builderBoth;
        // StringBuilder is not thread-safe, so all appends are serialised through this lock.
        var gate = new object();

        process.OutputDataReceived += (_, args) =>
        {
            if (args.Data == null) return;

            lock (gate)
            {
                AppendLine(builderOut, args.Data);
                AppendLine(builderBoth, args.Data);
            }
        };

        process.ErrorDataReceived += (_, args) =>
        {
            if (args.Data == null) return;

            lock (gate)
            {
                AppendLine(builderErr, args.Data);
                AppendLine(builderBoth, args.Data);
            }
        };

        process.Start();
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();
        process.WaitForExit();

        // Escape line breaks so the command is displayed on a single line.
        var displayArguments = (arguments ?? string.Empty).Replace("\r", "\\r").Replace("\n", "\\n");

        lock (gate)
        {
            return new ProcessOutput($"{command} {displayArguments}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
        }
    }

    /// <summary>Appends <paramref name="line"/>, preceded by <c>\n</c> unless the builder is still empty.</summary>
    private static void AppendLine(StringBuilder builder, string line)
    {
        if (builder.Length > 0) builder.Append('\n');
        builder.Append(line);
    }
}

21
Proc/ProcessOutput.cs Normal file
View File

@ -0,0 +1,21 @@
namespace WordpressEboobScraper2.Proc;
/// <summary>
/// Immutable result of an executed process: the command line, its exit code and the captured output.
/// </summary>
/// <remarks>
/// Declared <c>readonly</c> because every field is readonly already; this lets the compiler
/// skip defensive copies when members are called on readonly instances.
/// </remarks>
public readonly struct ProcessOutput
{
    /// <summary>The command line as supplied by the creator of this value.</summary>
    public readonly string Command;

    /// <summary>The exit code of the process.</summary>
    public readonly int ExitCode;

    /// <summary>The captured standard output.</summary>
    public readonly string StdOut;

    /// <summary>The captured standard error.</summary>
    public readonly string StdErr;

    /// <summary>The combined output as supplied by the creator of this value.</summary>
    public readonly string StdCombined;

    /// <summary>Creates a result from its individual parts.</summary>
    /// <param name="cmd">The command line.</param>
    /// <param name="ex">The exit code.</param>
    /// <param name="stdout">The captured standard output.</param>
    /// <param name="stderr">The captured standard error.</param>
    /// <param name="stdcom">The combined output.</param>
    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
    {
        Command = cmd;
        ExitCode = ex;
        StdOut = stdout;
        StdErr = stderr;
        StdCombined = stdcom;
    }

    /// <summary>
    /// Formats the command, exit code, stdout and stderr as a multi-line report.
    /// <see cref="StdCombined"/> is not part of the report.
    /// </summary>
    public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
}

View File

@ -2,9 +2,8 @@ using HtmlAgilityPack;
namespace WordpressEboobScraper2.Scraper;
public class Helper
public static class Helper
{
public static string Filenamify(string v, bool repl = false)
{
var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p =>

View File

@ -6,6 +6,7 @@ using System.Xml.Linq;
using System.Xml.Serialization;
using HtmlAgilityPack;
using Ionic.Zip;
using WordpressEboobScraper2.Proc;
namespace WordpressEboobScraper2.Scraper;
@ -15,16 +16,16 @@ namespace WordpressEboobScraper2.Scraper;
/** **/
/** *************************************************** **/
class Scraper
public class Scraper
{
static EpubParameter ACTIVE_BOOK = null;
const int LIMIT = 1500;
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
Dictionary<string, string> webCache = new Dictionary<string, string>();
Dictionary<string, string> webCache = new();
string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;
@ -144,7 +145,7 @@ class Scraper
void SaveCache()
{
var xs = new XmlSerializer(typeof(List<SerializableCacheEntry>));
using (var writer = new System.IO.StreamWriter(WCACHE_FILE))
using (var writer = new StreamWriter(WCACHE_FILE))
{
xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList());
}
@ -155,77 +156,74 @@ class Scraper
if (!File.Exists(WCACHE_FILE)) return;
XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
using (TextReader reader = new StreamReader(WCACHE_FILE))
{
var result = new List<SerializableCacheEntry>();
using TextReader reader = new StreamReader(WCACHE_FILE);
var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
}
webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
}
List<Chapter> FindChapters()
{
List<Chapter> result = new List<Chapter>();
using (WebClient client = new WebClient())
using WebClient client = new WebClient();
client.Encoding = Encoding.UTF8;
Stack<string> buffer = new Stack<string>();
buffer.Push(ACTIVE_BOOK.StartURL);
while (buffer.Any() && result.Count < LIMIT)
{
client.Encoding = Encoding.UTF8;
Stack<string> buffer = new Stack<string>();
buffer.Push(ACTIVE_BOOK.StartURL);
var url = buffer.Pop();
Chapter curr = new Chapter() { url = url };
while (buffer.Any() && result.Count < LIMIT)
var buffered = webCache.ContainsKey(url.ToLower());
if (buffered)
{
var url = buffer.Pop();
Chapter curr = new Chapter() { url = url };
var buffered = webCache.ContainsKey(url.ToLower());
if (buffered)
{
curr.queryResult = webCache[url.ToLower()];
"*(loaded from webcache)*".Dump();
}
else
{
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
}
var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
if (next_url != null) buffer.Push(next_url);
if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
{
"".Dump();
"//==> *(auto-reload from live)*".Dump();
"".Dump();
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
if (next_url_inner != null) buffer.Push(next_url_inner);
}
if (r == ProcessResult.SuccessNormal)
{
" ==> Chapter processed".Dump();
result.Add(curr);
OutputChapter(curr, result.Count);
}
else if (r == ProcessResult.SkipChapter)
{
" ==> Skip this chapter".Dump();
}
else if (r == ProcessResult.ReachedEnd)
{
" ==> End reached".Dump();
}
"".Dump();
curr.queryResult = webCache[url.ToLower()];
"*(loaded from webcache)*".Dump();
}
else
{
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
}
var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url);
if (next_url != null) buffer.Push(next_url);
if (buffered && buffer.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
{
"".Dump();
"//==> *(auto-reload from live)*".Dump();
"".Dump();
curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
webCache[url.ToLower()] = curr.queryResult;
SaveCache();
r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner);
if (next_url_inner != null) buffer.Push(next_url_inner);
}
if (r == ProcessResult.SuccessNormal)
{
" ==> Chapter processed".Dump();
result.Add(curr);
OutputChapter(curr, result.Count);
}
else if (r == ProcessResult.SkipChapter)
{
" ==> Skip this chapter".Dump();
}
else if (r == ProcessResult.ReachedEnd)
{
" ==> End reached".Dump();
}
"".Dump();
}
return result;
@ -233,96 +231,93 @@ class Scraper
void VerifyChapters()
{
List<Chapter> result = new List<Chapter>();
using WebClient client = new WebClient();
client.Encoding = Encoding.UTF8;
Stack<string> buffer = new Stack<string>();
buffer.Push(ACTIVE_BOOK.StartURL);
using (WebClient client = new WebClient())
while (buffer.Any())
{
client.Encoding = Encoding.UTF8;
Stack<string> buffer = new Stack<string>();
buffer.Push(ACTIVE_BOOK.StartURL);
var url = buffer.Pop();
Chapter curr_buffer = new Chapter() { url = url };
Chapter curr_live = new Chapter() { url = url };
while (buffer.Any() && result.Count < LIMIT)
var buffered = webCache.ContainsKey(url.ToLower());
if (buffered)
{
var url = buffer.Pop();
Chapter curr_buffer = new Chapter() { url = url };
Chapter curr_live = new Chapter() { url = url };
var buffered = webCache.ContainsKey(url.ToLower());
if (buffered)
try
{
try
{
curr_buffer.queryResult = webCache[url.ToLower()];
curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
}
catch (Exception e)
{
$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
continue;
}
curr_buffer.queryResult = webCache[url.ToLower()];
curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
}
else
catch (Exception e)
{
$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
continue;
}
var is_diff = false;
var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);
if (next_buffer != null) buffer.Push(next_buffer);
if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }
if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }
if (curr_buffer.chapter.Value != curr_live.chapter.Value)
{
var clean_buffer = GetChapterText(curr_buffer);
var clean_live = GetChapterText(curr_live);
if (clean_buffer.Trim() != clean_live.Trim())
{
$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
new Hyperlinq(() =>
{
var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
File.WriteAllText(fa, curr_buffer.chapter.Value);
File.WriteAllText(fb, curr_live.chapter.Value);
Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
}, "[Compare Raw]").Dump();
new Hyperlinq(() =>
{
var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
File.WriteAllText(fa, clean_buffer);
File.WriteAllText(fb, clean_live);
Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
}, "[Compare Text]").Dump();
new Hyperlinq(() =>
{
webCache[url.ToLower()] = curr_live.queryResult;
SaveCache();
}, "[Save new version to webcache]").Dump();
is_diff = true;
}
}
if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
if (is_diff) "".Dump();
}
else
{
continue;
}
var is_diff = false;
var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);
if (next_buffer != null) buffer.Push(next_buffer);
if (r_buffer != r_live) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump(); is_diff = true; }
if (r_buffer != r_live) {$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump(); is_diff = true; }
if (!Relaxedurleq(curr_buffer.next, curr_live.next)) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump(); is_diff = true; }
if (curr_buffer.title != curr_live.title) { $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump(); is_diff = true; }
if (curr_buffer.chapter.Value != curr_live.chapter.Value)
{
var clean_buffer = GetChapterText(curr_buffer);
var clean_live = GetChapterText(curr_live);
if (clean_buffer.Trim() != clean_live.Trim())
{
$"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();
new Hyperlinq(() =>
{
var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
File.WriteAllText(fa, curr_buffer.chapter.Value);
File.WriteAllText(fb, curr_live.chapter.Value);
Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
}, "[Compare Raw]").Dump();
new Hyperlinq(() =>
{
var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
File.WriteAllText(fa, clean_buffer);
File.WriteAllText(fb, clean_live);
Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
}, "[Compare Text]").Dump();
new Hyperlinq(() =>
{
webCache[url.ToLower()] = curr_live.queryResult;
SaveCache();
}, "[Save new version to webcache]").Dump();
is_diff = true;
}
}
if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
if (is_diff) "".Dump();
}
}
@ -350,9 +345,9 @@ class Scraper
return clean;
}
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
{
forwardQueue_next = null;
forwardQueueNext = null;
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(curr.queryResult);
@ -500,8 +495,6 @@ class Scraper
#region Next
string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" };
if (backBuffer.Where(b => !b.isSpecial).Count() > 4 &&
backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
@ -517,8 +510,8 @@ class Scraper
if (next == null)
next = nodeContent.Descendants()
.Where(p => p.Name.ToLower() == "a")
.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
.Where(p => p.Attributes.Contains("href"))
.Where(p => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next")
.Where(p => p.Attributes.Contains("href"))
.FirstOrDefault();
if (next == null)
@ -559,7 +552,7 @@ class Scraper
curr.next = next_url;
if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
{
forwardQueue_next = next_url;
forwardQueueNext = next_url;
}
}
@ -940,14 +933,12 @@ class Scraper
new XAttribute("full-path", "OEBPS/content.opf"),
new XAttribute("media-type", "application/oebps-package+xml")))));
StringBuilder builder = new StringBuilder();
using (Utf8StringWriter writer = new Utf8StringWriter())
{
doc.Save(writer);
var r = writer.ToString();
r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
return r.Trim() + "\r\n";
}
using Utf8StringWriter writer = new Utf8StringWriter();
doc.Save(writer);
var r = writer.ToString();
r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
return r.Trim() + "\r\n";
}
string GetEpubContentOPF(List<Chapter> chapters)
@ -1030,17 +1021,14 @@ class Scraper
package.Add(new XElement(opf + "guide"));
StringBuilder builder = new StringBuilder();
using (Utf8StringWriter writer = new Utf8StringWriter())
{
doc.Save(writer);
return writer.ToString();
}
using Utf8StringWriter writer = new Utf8StringWriter();
doc.Save(writer);
return writer.ToString();
}
string GetEpubTOC(List<Chapter> chapters)
{
XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/";
XNamespace ncx = "http://www.idpf.org/2007/opf";
var doc = new XDocument(
@ -1082,12 +1070,10 @@ class Scraper
root.Add(nav);
StringBuilder builder = new StringBuilder();
using (Utf8StringWriter writer = new Utf8StringWriter())
{
doc.Save(writer);
return writer.ToString();
}
using Utf8StringWriter writer = new Utf8StringWriter();
doc.Save(writer);
return writer.ToString();
}
string GetEpubChapterFile(Chapter chapter, int idx)
@ -1108,256 +1094,4 @@ class Scraper
return xml.ToString();
}
public struct ProcessOutput
{
public readonly string Command;
public readonly int ExitCode;
public readonly string StdOut;
public readonly string StdErr;
public readonly string StdCombined;
public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
{
Command = cmd;
ExitCode = ex;
StdOut = stdout;
StdErr = stderr;
StdCombined = stdcom;
}
public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
}
public static class ProcessHelper
{
public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
{
var process = new Process
{
StartInfo =
{
FileName = command,
Arguments = arguments,
WorkingDirectory = workingDirectory ?? string.Empty,
UseShellExecute = false,
RedirectStandardOutput = true,
RedirectStandardError = true,
CreateNoWindow = true,
ErrorDialog = false,
}
};
var builderOut = new StringBuilder();
var builderErr = new StringBuilder();
var builderBoth = new StringBuilder();
process.OutputDataReceived += (sender, args) =>
{
if (args.Data == null) return;
if (builderOut.Length == 0) builderOut.Append(args.Data);
else builderOut.Append("\n" + args.Data);
if (builderBoth.Length == 0) builderBoth.Append(args.Data);
else builderBoth.Append("\n" + args.Data);
};
process.ErrorDataReceived += (sender, args) =>
{
if (args.Data == null) return;
if (builderErr.Length == 0) builderErr.Append(args.Data);
else builderErr.Append("\n" + args.Data);
if (builderBoth.Length == 0) builderBoth.Append(args.Data);
else builderBoth.Append("\n" + args.Data);
};
process.Start();
process.BeginOutputReadLine();
process.BeginErrorReadLine();
process.WaitForExit();
return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
}
}
public static class HTMLToText
{
private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
private class PreceedingDomTextInfo
{
public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
{
IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
}
public bool WritePrecedingWhiteSpace { get; set; }
public bool LastCharWasSpace { get; set; }
public readonly BoolWrapper IsFirstTextOfDocWritten;
public int ListIndex { get; set; }
}
private class BoolWrapper
{
public BoolWrapper() { }
public bool Value { get; set; }
public static implicit operator bool(BoolWrapper boolWrapper)
{
return boolWrapper.Value;
}
public static implicit operator BoolWrapper(bool boolWrapper)
{
return new BoolWrapper { Value = boolWrapper };
}
}
public static string Convert(string path)
{
HtmlDocument doc = new HtmlDocument();
doc.Load(path);
return ConvertDoc(doc);
}
public static string ConvertHtml(string html)
{
HtmlDocument doc = new HtmlDocument();
html = REX_TAG1.Replace(html, " ");
html = REX_TAG2.Replace(html, " ");
doc.LoadHtml(html);
return ConvertDoc(doc);
}
public static string ConvertDoc(HtmlDocument doc)
{
using (StringWriter sw = new StringWriter())
{
ConvertTo(doc.DocumentNode, sw);
sw.Flush();
return sw.ToString();
}
}
private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
foreach (HtmlNode subnode in node.ChildNodes)
{
ConvertTo(subnode, outText, textInfo);
}
}
public static void ConvertTo(HtmlNode node, TextWriter outText)
{
ConvertTo(node, outText, new PreceedingDomTextInfo(false));
}
private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
string html;
switch (node.NodeType)
{
case HtmlNodeType.Comment:
// don't output comments
break;
case HtmlNodeType.Document:
ConvertContentTo(node, outText, textInfo);
break;
case HtmlNodeType.Text:
// script and style must not be output
string parentName = node.ParentNode.Name;
if ((parentName == "script") || (parentName == "style"))
{
break;
}
// get text
html = ((HtmlTextNode)node).Text;
// is it in fact a special closing node output as text?
if (HtmlNode.IsOverlappedClosingElement(html)) break;
// check the text is meaningful and not a bunch of whitespaces
if (html.Length == 0) break;
if (html.Trim().ToLower().StartsWith("<?xml") && html.Trim().ToLower().EndsWith("?>")) break;
if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
{
html = html.TrimStart();
if (html.Length == 0) { break; }
textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
}
outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
{
outText.Write(' ');
}
break;
case HtmlNodeType.Element:
string endElementString = null;
bool isInline;
bool skip = false;
int listIndex = 0;
switch (node.Name)
{
case "nav":
skip = true;
isInline = false;
break;
case "body":
case "section":
case "article":
case "aside":
case "h1":
case "h2":
case "header":
case "footer":
case "address":
case "main":
case "div":
case "span":
case "p": // stylistic - adjust as you tend to use
if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
endElementString = "\r\n";
isInline = false;
break;
case "br":
outText.Write("\r\n");
skip = true;
textInfo.WritePrecedingWhiteSpace = false;
isInline = true;
break;
case "a":
isInline = true;
break;
case "li":
isInline = false;
break;
case "ol":
listIndex = 1;
goto case "ul";
case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
endElementString = "\r\n";
isInline = false;
break;
case "img": //inline-block in reality
isInline = true;
break;
default:
isInline = true;
break;
}
if (!skip && node.HasChildNodes)
{
ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
}
if (endElementString != null)
{
outText.Write(endElementString);
}
break;
}
}
}
}

View File

@ -4,5 +4,5 @@ namespace WordpressEboobScraper2.Scraper;
public class Utf8StringWriter : StringWriter
{
public override Encoding Encoding { get { return Encoding.UTF8; } }
public override Encoding Encoding => Encoding.UTF8;
}

View File

@ -4,7 +4,7 @@
<OutputType>Exe</OutputType>
<TargetFramework>net7.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Nullable>disable</Nullable>
</PropertyGroup>
@ -14,4 +14,6 @@
<PackageReference Include="System.Text.Encoding.CodePages" Version="7.0.0" />
</ItemGroup>
</Project>