fix a few compiler warnings
parent 4cc76a45ef
commit d981d092e4
.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml (generated, 1 change)
@@ -1,7 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
     <mapping directory="$PROJECT_DIR$" vcs="Git" />
   </component>
 </project>
Proc/HTMLToText.cs (new file, 181 lines)
@@ -0,0 +1,181 @@
+using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+
+namespace WordpressEboobScraper2.Proc;
+
+public static class HTMLToText
+{
+    private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
+    private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
+
+    private class PreceedingDomTextInfo
+    {
+        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
+        {
+            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
+        }
+        public bool WritePrecedingWhiteSpace { get; set; }
+        public bool LastCharWasSpace { get; set; }
+        public readonly BoolWrapper IsFirstTextOfDocWritten;
+        public int ListIndex { get; set; }
+    }
+
+    private class BoolWrapper
+    {
+        public BoolWrapper() { }
+        public bool Value { get; set; }
+        public static implicit operator bool(BoolWrapper boolWrapper)
+        {
+            return boolWrapper.Value;
+        }
+        public static implicit operator BoolWrapper(bool boolWrapper)
+        {
+            return new BoolWrapper { Value = boolWrapper };
+        }
+    }
+
+    public static string Convert(string path)
+    {
+        HtmlDocument doc = new HtmlDocument();
+        doc.Load(path);
+        return ConvertDoc(doc);
+    }
+
+    public static string ConvertHtml(string html)
+    {
+        HtmlDocument doc = new HtmlDocument();
+        html = REX_TAG1.Replace(html, " ");
+        html = REX_TAG2.Replace(html, " ");
+        doc.LoadHtml(html);
+        return ConvertDoc(doc);
+    }
+
+    public static string ConvertDoc(HtmlDocument doc)
+    {
+        using (StringWriter sw = new StringWriter())
+        {
+            ConvertTo(doc.DocumentNode, sw);
+            sw.Flush();
+            return sw.ToString();
+        }
+    }
+
+    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
+    {
+        foreach (HtmlNode subnode in node.ChildNodes)
+        {
+            ConvertTo(subnode, outText, textInfo);
+        }
+    }
+
+    public static void ConvertTo(HtmlNode node, TextWriter outText)
+    {
+        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
+    }
+
+    private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
+    {
+        string html;
+        switch (node.NodeType)
+        {
+            case HtmlNodeType.Comment:
+                // don't output comments
+                break;
+            case HtmlNodeType.Document:
+                ConvertContentTo(node, outText, textInfo);
+                break;
+            case HtmlNodeType.Text:
+                // script and style must not be output
+                string parentName = node.ParentNode.Name;
+                if ((parentName == "script") || (parentName == "style"))
+                {
+                    break;
+                }
+                // get text
+                html = ((HtmlTextNode)node).Text;
+                // is it in fact a special closing node output as text?
+                if (HtmlNode.IsOverlappedClosingElement(html)) break;
+
+                // check the text is meaningful and not a bunch of whitespaces
+                if (html.Length == 0) break;
+
+                if (html.Trim().ToLower().StartsWith("<?xml") && html.Trim().ToLower().EndsWith("?>")) break;
+
+                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
+                {
+                    html = html.TrimStart();
+                    if (html.Length == 0) { break; }
+                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
+                }
+                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
+                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
+                {
+                    outText.Write(' ');
+                }
+                break;
+            case HtmlNodeType.Element:
+                string endElementString = null;
+                bool isInline;
+                bool skip = false;
+                int listIndex = 0;
+                switch (node.Name)
+                {
+                    case "nav":
+                        skip = true;
+                        isInline = false;
+                        break;
+                    case "body":
+                    case "section":
+                    case "article":
+                    case "aside":
+                    case "h1":
+                    case "h2":
+                    case "header":
+                    case "footer":
+                    case "address":
+                    case "main":
+                    case "div":
+                    case "span":
+                    case "p": // stylistic - adjust as you tend to use
+                        if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
+                        endElementString = "\r\n";
+                        isInline = false;
+                        break;
+                    case "br":
+                        outText.Write("\r\n");
+                        skip = true;
+                        textInfo.WritePrecedingWhiteSpace = false;
+                        isInline = true;
+                        break;
+                    case "a":
+                        isInline = true;
+                        break;
+                    case "li":
+                        isInline = false;
+                        break;
+                    case "ol":
+                        listIndex = 1;
+                        goto case "ul";
+                    case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
+                        endElementString = "\r\n";
+                        isInline = false;
+                        break;
+                    case "img": //inline-block in reality
+                        isInline = true;
+                        break;
+                    default:
+                        isInline = true;
+                        break;
+                }
+                if (!skip && node.HasChildNodes)
+                {
+                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
+                }
+                if (endElementString != null)
+                {
+                    outText.Write(endElementString);
+                }
+                break;
+        }
+    }
+}
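The file above is the HtmlAgilityPack-based HTML-to-plain-text converter that previously lived inside Scraper.cs. A minimal usage sketch, assuming the project's net7.0 / ImplicitUsings setup and a hypothetical chapter snippet (neither is part of this commit):

```csharp
using WordpressEboobScraper2.Proc;

// Hypothetical input; ConvertHtml first strips <link>/<style>/<script>
// via REX_TAG1/REX_TAG2, then walks the DOM and emits plain text with
// \r\n breaks after block-level elements.
var html = "<html><body><h1>Chapter 1</h1><p>It was a &quot;quiet&quot; night.</p></body></html>";
string text = HTMLToText.ConvertHtml(html);
Console.WriteLine(text); // "Chapter 1", blank line, then the de-entitized paragraph text
```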
Proc/ProcessHelper.cs (new file, 59 lines)
@@ -0,0 +1,59 @@
+using System.Text;
+
+namespace WordpressEboobScraper2.Proc;
+
+public static class ProcessHelper
+{
+    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
+    {
+        var process = new System.Diagnostics.Process
+        {
+            StartInfo =
+            {
+                FileName = command,
+                Arguments = arguments,
+                WorkingDirectory = workingDirectory ?? string.Empty,
+                UseShellExecute = false,
+                RedirectStandardOutput = true,
+                RedirectStandardError = true,
+                CreateNoWindow = true,
+                ErrorDialog = false,
+            }
+        };
+
+        var builderOut = new StringBuilder();
+        var builderErr = new StringBuilder();
+        var builderBoth = new StringBuilder();
+
+        process.OutputDataReceived += (sender, args) =>
+        {
+            if (args.Data == null) return;
+
+            if (builderOut.Length == 0) builderOut.Append(args.Data);
+            else builderOut.Append("\n" + args.Data);
+
+            if (builderBoth.Length == 0) builderBoth.Append(args.Data);
+            else builderBoth.Append("\n" + args.Data);
+        };
+
+        process.ErrorDataReceived += (sender, args) =>
+        {
+            if (args.Data == null) return;
+
+            if (builderErr.Length == 0) builderErr.Append(args.Data);
+            else builderErr.Append("\n" + args.Data);
+
+            if (builderBoth.Length == 0) builderBoth.Append(args.Data);
+            else builderBoth.Append("\n" + args.Data);
+        };
+
+        process.Start();
+
+        process.BeginOutputReadLine();
+        process.BeginErrorReadLine();
+
+        process.WaitForExit();
+
+        return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
+    }
+}
Proc/ProcessOutput.cs (new file, 21 lines)
@@ -0,0 +1,21 @@
+namespace WordpressEboobScraper2.Proc;
+
+public struct ProcessOutput
+{
+    public readonly string Command;
+    public readonly int ExitCode;
+    public readonly string StdOut;
+    public readonly string StdErr;
+    public readonly string StdCombined;
+
+    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
+    {
+        Command = cmd;
+        ExitCode = ex;
+        StdOut = stdout;
+        StdErr = stderr;
+        StdCombined = stdcom;
+    }
+
+    public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
+}
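Together, ProcessHelper and ProcessOutput give the project a small synchronous shell-out helper: ProcExecute blocks until the child process exits and collects stdout, stderr, and a combined stream line by line. A hedged usage sketch (the `git --version` invocation is only an illustration, not something this commit calls):

```csharp
using WordpressEboobScraper2.Proc;

// Run a command and wait for it to finish; output is captured by the
// OutputDataReceived/ErrorDataReceived handlers wired up in ProcExecute.
ProcessOutput result = ProcessHelper.ProcExecute("git", "--version");

if (result.ExitCode != 0)
    Console.Error.WriteLine(result.StdErr);
else
    Console.WriteLine(result.StdOut);

// ToString() prints the command line, exit code, stdout and stderr.
Console.WriteLine(result);
```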
@@ -2,9 +2,8 @@ using HtmlAgilityPack;
 
 namespace WordpressEboobScraper2.Scraper;
 
-public class Helper
+public static class Helper
 {
     public static string Filenamify(string v, bool repl = false)
     {
         var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p =>
@@ -6,6 +6,7 @@ using System.Xml.Linq;
 using System.Xml.Serialization;
 using HtmlAgilityPack;
 using Ionic.Zip;
+using WordpressEboobScraper2.Proc;
 
 namespace WordpressEboobScraper2.Scraper;
 
@@ -15,16 +16,16 @@ namespace WordpressEboobScraper2.Scraper;
 /** **/
 /** *************************************************** **/
 
-class Scraper
+public class Scraper
 {
 
     static EpubParameter ACTIVE_BOOK = null;
 
     const int LIMIT = 1500;
 
-    readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
+    readonly Regex REX_NUMSTART = new(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);
 
-    Dictionary<string, string> webCache = new Dictionary<string, string>();
+    Dictionary<string, string> webCache = new();
 
     string STASH_FOLDER => Config.BASE_DIR_STASH + ACTIVE_BOOK.Foldername + Path.DirectorySeparatorChar;
 
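The two field initializers rewritten above use C# 9 target-typed `new` expressions: the constructed type is inferred from the declared field type, so behaviour is identical to the spelled-out constructors they replace. A small standalone illustration (names are placeholders, and the fragment assumes the project's implicit usings):

```csharp
using System.Text.RegularExpressions;

// Equivalent pairs; the compiler infers the type from the declaration.
Dictionary<string, string> cache = new();            // same as new Dictionary<string, string>()
Regex numStart = new(@"^\s*(?<n>[0-9]+)\s*\-.*$");   // same as new Regex(@"...")
```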
@@ -144,7 +145,7 @@ class Scraper
     void SaveCache()
     {
         var xs = new XmlSerializer(typeof(List<SerializableCacheEntry>));
-        using (var writer = new System.IO.StreamWriter(WCACHE_FILE))
+        using (var writer = new StreamWriter(WCACHE_FILE))
         {
             xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList());
         }
@@ -155,22 +156,20 @@ class Scraper
         if (!File.Exists(WCACHE_FILE)) return;
 
         XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
-        using (TextReader reader = new StreamReader(WCACHE_FILE))
-        {
-            var result = new List<SerializableCacheEntry>();
+        using TextReader reader = new StreamReader(WCACHE_FILE);
 
         var l = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
 
         webCache = l.ToDictionary(p => p.URL, p => p.Content.Value);
     }
-    }
 
     List<Chapter> FindChapters()
     {
         List<Chapter> result = new List<Chapter>();
 
-        using (WebClient client = new WebClient())
-        {
+        using WebClient client = new WebClient();
 
         client.Encoding = Encoding.UTF8;
         Stack<string> buffer = new Stack<string>();
         buffer.Push(ACTIVE_BOOK.StartURL);
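The recurring edit in this and the following hunks swaps `using (...) { ... }` blocks for C# 8 using declarations: the resource is disposed when the enclosing scope ends (here, the method), which drops a level of braces; the hunk above also removes the unused `result` list. A minimal sketch of the two equivalent forms, with a hypothetical path parameter:

```csharp
// Old form: an explicit block scopes and disposes the reader.
string LoadOld(string path)
{
    using (TextReader reader = new StreamReader(path))
    {
        return reader.ReadToEnd();
    } // reader disposed here
}

// New form: a using declaration, disposed when the method scope ends.
string LoadNew(string path)
{
    using TextReader reader = new StreamReader(path);
    return reader.ReadToEnd();
} // reader disposed here
```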
@@ -226,22 +225,19 @@ class Scraper
 
                 "".Dump();
             }
-        }
 
         return result;
     }
 
     void VerifyChapters()
     {
-        List<Chapter> result = new List<Chapter>();
+        using WebClient client = new WebClient();
 
-        using (WebClient client = new WebClient())
-        {
         client.Encoding = Encoding.UTF8;
         Stack<string> buffer = new Stack<string>();
         buffer.Push(ACTIVE_BOOK.StartURL);
@@ -268,8 +264,8 @@ class Scraper
 
             var is_diff = false;
 
-            var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
-            var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);
+            var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => {}, out var next_buffer);
+            var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => {}, out var next_live);
 
             if (next_buffer != null) buffer.Push(next_buffer);
 
@@ -324,7 +320,6 @@ class Scraper
             if (is_diff) "".Dump();
         }
     }
-    }
 
     bool Relaxedurleq(string a, string b)
     {
@@ -350,9 +345,9 @@ class Scraper
         return clean;
     }
 
-    ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
+    ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
     {
-        forwardQueue_next = null;
+        forwardQueueNext = null;
 
         HtmlDocument doc = new HtmlDocument();
         doc.LoadHtml(curr.queryResult);
@@ -500,8 +495,6 @@ class Scraper
 
         #region Next
 
-        string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" };
-
         if (backBuffer.Where(b => !b.isSpecial).Count() > 4 &&
             backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
             REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success &&
@@ -559,7 +552,7 @@ class Scraper
             curr.next = next_url;
             if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
             {
-                forwardQueue_next = next_url;
+                forwardQueueNext = next_url;
             }
         }
 
@@ -940,15 +933,13 @@ class Scraper
                 new XAttribute("full-path", "OEBPS/content.opf"),
                 new XAttribute("media-type", "application/oebps-package+xml")))));
 
-        StringBuilder builder = new StringBuilder();
-        using (Utf8StringWriter writer = new Utf8StringWriter())
-        {
+        using Utf8StringWriter writer = new Utf8StringWriter();
         doc.Save(writer);
         var r = writer.ToString();
         r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
         return r.Trim() + "\r\n";
-        }
     }
 
     string GetEpubContentOPF(List<Chapter> chapters)
     {
@@ -1030,17 +1021,14 @@ class Scraper
 
         package.Add(new XElement(opf + "guide"));
 
-        StringBuilder builder = new StringBuilder();
-        using (Utf8StringWriter writer = new Utf8StringWriter())
-        {
+        using Utf8StringWriter writer = new Utf8StringWriter();
         doc.Save(writer);
         return writer.ToString();
-        }
     }
 
     string GetEpubTOC(List<Chapter> chapters)
     {
-        XNamespace dc = "http://www.daisy.org/z3986/2005/ncx/";
         XNamespace ncx = "http://www.idpf.org/2007/opf";
 
         var doc = new XDocument(
@@ -1082,13 +1070,11 @@ class Scraper
 
         root.Add(nav);
 
-        StringBuilder builder = new StringBuilder();
-        using (Utf8StringWriter writer = new Utf8StringWriter())
-        {
+        using Utf8StringWriter writer = new Utf8StringWriter();
         doc.Save(writer);
         return writer.ToString();
-        }
     }
 
     string GetEpubChapterFile(Chapter chapter, int idx)
     {
@@ -1108,256 +1094,4 @@ class Scraper
 
         return xml.ToString();
     }
-
-    public struct ProcessOutput
-    {
-        public readonly string Command;
-        public readonly int ExitCode;
-        public readonly string StdOut;
-        public readonly string StdErr;
-        public readonly string StdCombined;
-
-        public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
-        {
-            Command = cmd;
-            ExitCode = ex;
-            StdOut = stdout;
-            StdErr = stderr;
-            StdCombined = stdcom;
-        }
-
-        public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
-    }
-
-    public static class ProcessHelper
-    {
-        public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
-        {
-            var process = new Process
-            {
-                StartInfo =
-                {
-                    FileName = command,
-                    Arguments = arguments,
-                    WorkingDirectory = workingDirectory ?? string.Empty,
-                    UseShellExecute = false,
-                    RedirectStandardOutput = true,
-                    RedirectStandardError = true,
-                    CreateNoWindow = true,
-                    ErrorDialog = false,
-                }
-            };
-
-            var builderOut = new StringBuilder();
-            var builderErr = new StringBuilder();
-            var builderBoth = new StringBuilder();
-
-            process.OutputDataReceived += (sender, args) =>
-            {
-                if (args.Data == null) return;
-
-                if (builderOut.Length == 0) builderOut.Append(args.Data);
-                else builderOut.Append("\n" + args.Data);
-
-                if (builderBoth.Length == 0) builderBoth.Append(args.Data);
-                else builderBoth.Append("\n" + args.Data);
-            };
-
-            process.ErrorDataReceived += (sender, args) =>
-            {
-                if (args.Data == null) return;
-
-                if (builderErr.Length == 0) builderErr.Append(args.Data);
-                else builderErr.Append("\n" + args.Data);
-
-                if (builderBoth.Length == 0) builderBoth.Append(args.Data);
-                else builderBoth.Append("\n" + args.Data);
-            };
-
-            process.Start();
-
-            process.BeginOutputReadLine();
-            process.BeginErrorReadLine();
-
-            process.WaitForExit();
-
-            return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
-        }
-    }
-
-    public static class HTMLToText
-    {
-        private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
-        private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
-
-        private class PreceedingDomTextInfo
-        {
-            public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
-            {
-                IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
-            }
-            public bool WritePrecedingWhiteSpace { get; set; }
-            public bool LastCharWasSpace { get; set; }
-            public readonly BoolWrapper IsFirstTextOfDocWritten;
-            public int ListIndex { get; set; }
-        }
-
-        private class BoolWrapper
-        {
-            public BoolWrapper() { }
-            public bool Value { get; set; }
-            public static implicit operator bool(BoolWrapper boolWrapper)
-            {
-                return boolWrapper.Value;
-            }
-            public static implicit operator BoolWrapper(bool boolWrapper)
-            {
-                return new BoolWrapper { Value = boolWrapper };
-            }
-        }
-
-        public static string Convert(string path)
-        {
-            HtmlDocument doc = new HtmlDocument();
-            doc.Load(path);
-            return ConvertDoc(doc);
-        }
-
-        public static string ConvertHtml(string html)
-        {
-            HtmlDocument doc = new HtmlDocument();
-            html = REX_TAG1.Replace(html, " ");
-            html = REX_TAG2.Replace(html, " ");
-            doc.LoadHtml(html);
-            return ConvertDoc(doc);
-        }
-
-        public static string ConvertDoc(HtmlDocument doc)
-        {
-            using (StringWriter sw = new StringWriter())
-            {
-                ConvertTo(doc.DocumentNode, sw);
-                sw.Flush();
-                return sw.ToString();
-            }
-        }
-
-        private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
-        {
-            foreach (HtmlNode subnode in node.ChildNodes)
-            {
-                ConvertTo(subnode, outText, textInfo);
-            }
-        }
-
-        public static void ConvertTo(HtmlNode node, TextWriter outText)
-        {
-            ConvertTo(node, outText, new PreceedingDomTextInfo(false));
-        }
-
-        private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
-        {
-            string html;
-            switch (node.NodeType)
-            {
-                case HtmlNodeType.Comment:
-                    // don't output comments
-                    break;
-                case HtmlNodeType.Document:
-                    ConvertContentTo(node, outText, textInfo);
-                    break;
-                case HtmlNodeType.Text:
-                    // script and style must not be output
-                    string parentName = node.ParentNode.Name;
-                    if ((parentName == "script") || (parentName == "style"))
-                    {
-                        break;
-                    }
-                    // get text
-                    html = ((HtmlTextNode)node).Text;
-                    // is it in fact a special closing node output as text?
-                    if (HtmlNode.IsOverlappedClosingElement(html)) break;
-
-                    // check the text is meaningful and not a bunch of whitespaces
-                    if (html.Length == 0) break;
-
-                    if (html.Trim().ToLower().StartsWith("<?xml") && html.Trim().ToLower().EndsWith("?>")) break;
-
-                    if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
-                    {
-                        html = html.TrimStart();
-                        if (html.Length == 0) { break; }
-                        textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
-                    }
-                    outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
-                    if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
-                    {
-                        outText.Write(' ');
-                    }
-                    break;
-                case HtmlNodeType.Element:
-                    string endElementString = null;
-                    bool isInline;
-                    bool skip = false;
-                    int listIndex = 0;
-                    switch (node.Name)
-                    {
-                        case "nav":
-                            skip = true;
-                            isInline = false;
-                            break;
-                        case "body":
-                        case "section":
-                        case "article":
-                        case "aside":
-                        case "h1":
-                        case "h2":
-                        case "header":
-                        case "footer":
-                        case "address":
-                        case "main":
-                        case "div":
-                        case "span":
-                        case "p": // stylistic - adjust as you tend to use
-                            if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
-                            endElementString = "\r\n";
-                            isInline = false;
-                            break;
-                        case "br":
-                            outText.Write("\r\n");
-                            skip = true;
-                            textInfo.WritePrecedingWhiteSpace = false;
-                            isInline = true;
-                            break;
-                        case "a":
-                            isInline = true;
-                            break;
-                        case "li":
-                            isInline = false;
-                            break;
-                        case "ol":
-                            listIndex = 1;
-                            goto case "ul";
-                        case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
-                            endElementString = "\r\n";
-                            isInline = false;
-                            break;
-                        case "img": //inline-block in reality
-                            isInline = true;
-                            break;
-                        default:
-                            isInline = true;
-                            break;
-                    }
-                    if (!skip && node.HasChildNodes)
-                    {
-                        ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
-                    }
-                    if (endElementString != null)
-                    {
-                        outText.Write(endElementString);
-                    }
-                    break;
-            }
-        }
-    }
 }
@@ -4,5 +4,5 @@ namespace WordpressEboobScraper2.Scraper;
 
 public class Utf8StringWriter : StringWriter
 {
-    public override Encoding Encoding { get { return Encoding.UTF8; } }
+    public override Encoding Encoding => Encoding.UTF8;
 }
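Only the property syntax changes here (an expression-bodied member instead of a full getter). The override itself still matters: StringWriter reports UTF-16 from its Encoding property, and the scraper's XDocument.Save calls take the declared encoding from the writer. A small sketch of the effect, assuming the types above are in scope:

```csharp
using System.Xml.Linq;

var doc = new XDocument(new XElement("root"));

using var plain = new StringWriter();
doc.Save(plain);   // XML declaration typically reads encoding="utf-16"

using var utf8 = new Utf8StringWriter();
doc.Save(utf8);    // XML declaration reads encoding="utf-8"
```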
@@ -4,7 +4,7 @@
     <OutputType>Exe</OutputType>
     <TargetFramework>net7.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
-    <Nullable>enable</Nullable>
+    <Nullable>disable</Nullable>
   </PropertyGroup>
 
 
@@ -14,4 +14,6 @@
     <PackageReference Include="System.Text.Encoding.CodePages" Version="7.0.0" />
   </ItemGroup>
 
+
+
 </Project>
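Flipping `<Nullable>` from enable to disable turns off nullable-reference-type analysis for the whole project, presumably the source of several of the warnings this commit addresses, at the cost of losing the compiler's null-flow checks. For illustration, a fragment like the following warns under `enable` and compiles silently under `disable`:

```csharp
// With <Nullable>enable</Nullable>: CS8600 on the assignment and
// CS8602 on the dereference. With <Nullable>disable</Nullable>: no warnings.
string title = null;
Console.WriteLine(title.Length);
```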