181 lines
4.9 KiB
C#
181 lines
4.9 KiB
C#
|
using System.Text.RegularExpressions;
|
||
|
using HtmlAgilityPack;
|
||
|
|
||
|
namespace WordpressEboobScraper2.Proc;
|
||
|
|
||
|
public static class HTMLToText
|
||
|
{
|
||
|
/// <summary>
/// Matches a self-closing <c>link</c>, <c>style</c> or <c>script</c> tag, e.g. <c>&lt;link ... /&gt;</c>.
/// The match is case-sensitive (no <see cref="RegexOptions.IgnoreCase"/>).
/// </summary>
private static readonly Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);

/// <summary>
/// Matches an opening <c>link</c>, <c>style</c> or <c>script</c> tag together with its matching
/// closing tag, provided the content in between contains neither <c>&lt;</c> nor <c>&gt;</c>.
/// The closing tag name is tied to the opening one through the <c>\1</c> back-reference.
/// </summary>
private static readonly Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
|
||
|
|
||
|
/// <summary>
/// Mutable bookkeeping passed along while nodes are converted to text: it records what
/// has already been written so the next text node can decide how to treat whitespace.
/// </summary>
private class PreceedingDomTextInfo
{
    /// <summary>
    /// Reference-type flag supplied by the creator of this instance; because it is a
    /// wrapper object, every instance constructed with the same wrapper observes the same value.
    /// </summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Creates the state object around an existing "first text written" flag.</summary>
    /// <param name="isFirstTextOfDocWritten">The flag wrapper to store (not copied).</param>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        => IsFirstTextOfDocWritten = isFirstTextOfDocWritten;

    /// <summary>When false, leading whitespace of the next text node is trimmed.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }

    /// <summary>True when the most recently written text ended in whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Numbering seed for list content (set by the creator; defaults to 0).</summary>
    public int ListIndex { get; set; }
}
|
||
|
|
||
|
/// <summary>
/// Reference-type box around a <see cref="bool"/> so that several holders can share and
/// mutate one flag. Converts implicitly to and from <see cref="bool"/>.
/// </summary>
private class BoolWrapper
{
    /// <summary>The wrapped flag; <c>false</c> for a freshly constructed wrapper.</summary>
    public bool Value { get; set; }

    /// <summary>Unwraps the flag. Throws if <paramref name="wrapper"/> is null.</summary>
    public static implicit operator bool(BoolWrapper wrapper) => wrapper.Value;

    /// <summary>Wraps <paramref name="value"/> in a new, independent instance.</summary>
    public static implicit operator BoolWrapper(bool value) => new BoolWrapper { Value = value };
}
|
||
|
|
||
|
/// <summary>
/// Loads the HTML file at <paramref name="path"/> and returns its plain-text rendering.
/// </summary>
/// <param name="path">Path handed to <c>HtmlDocument.Load</c>.</param>
/// <returns>The text produced by <c>ConvertDoc</c> for the loaded document.</returns>
public static string Convert(string path)
{
    var document = new HtmlDocument();
    document.Load(path);

    return ConvertDoc(document);
}
|
||
|
|
||
|
/// <summary>
/// Converts an HTML string to plain text.
/// </summary>
/// <param name="html">The HTML markup to convert.</param>
/// <returns>The text produced by <c>ConvertDoc</c> for the parsed markup.</returns>
public static string ConvertHtml(string html)
{
    // Replace link/style/script tags with a single space before parsing:
    // first the self-closing form, then the open/close pair form.
    string withoutSelfClosing = REX_TAG1.Replace(html, " ");
    string withoutTagPairs = REX_TAG2.Replace(withoutSelfClosing, " ");

    var document = new HtmlDocument();
    document.LoadHtml(withoutTagPairs);

    return ConvertDoc(document);
}
|
||
|
|
||
|
/// <summary>
/// Renders an already-parsed document to plain text.
/// </summary>
/// <param name="doc">The document whose <c>DocumentNode</c> is converted.</param>
/// <returns>Everything written by <c>ConvertTo</c> for the document node.</returns>
public static string ConvertDoc(HtmlDocument doc)
{
    // The writer is disposed when the method returns.
    using var buffer = new StringWriter();

    ConvertTo(doc.DocumentNode, buffer);
    buffer.Flush();

    return buffer.ToString();
}
|
||
|
|
||
|
/// <summary>
/// Converts every direct child of <paramref name="node"/>, in document order, using the
/// same <paramref name="textInfo"/> state for all of them.
/// </summary>
/// <param name="node">The parent whose children are converted.</param>
/// <param name="outText">Destination for the generated text.</param>
/// <param name="textInfo">Whitespace/bookkeeping state shared by the children.</param>
private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
    var children = node.ChildNodes;
    for (int i = 0; i < children.Count; i++)
    {
        ConvertTo(children[i], outText, textInfo);
    }
}
|
||
|
|
||
|
/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>,
/// starting from fresh state.
/// </summary>
/// <param name="node">The node to convert.</param>
/// <param name="outText">Destination for the generated text.</param>
public static void ConvertTo(HtmlNode node, TextWriter outText)
{
    // `false` is implicitly wrapped in a new BoolWrapper: no text has been written yet.
    var initialState = new PreceedingDomTextInfo(false);
    ConvertTo(node, outText, initialState);
}
|
||
|
|
||
|
/// <summary>
/// Matches a run of two or more whitespace characters; used to collapse such runs to one space.
/// </summary>
private static readonly Regex REX_MULTISPACE = new Regex(@"\s{2,}", RegexOptions.Compiled);

/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>,
/// dispatching on the node type. Comments produce no output; documents are converted child by
/// child; text and element nodes are handled by dedicated helpers.
/// </summary>
/// <param name="node">The node to convert.</param>
/// <param name="outText">Destination for the generated text.</param>
/// <param name="textInfo">State describing what has been written so far; may be mutated.</param>
private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText, textInfo);
            break;

        case HtmlNodeType.Text:
            ConvertTextTo(node, outText, textInfo);
            break;

        case HtmlNodeType.Element:
            ConvertElementTo(node, outText, textInfo);
            break;
    }
}

/// <summary>
/// Writes the content of a text node: entities decoded, whitespace runs collapsed, leading
/// whitespace trimmed when nothing precedes it on the current line or a space was just written.
/// </summary>
private static void ConvertTextTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
    // script and style must not be output. A node without a parent (e.g. a detached text
    // node passed to the public entry point) is treated as ordinary text instead of throwing.
    if (node.ParentNode is { Name: "script" or "style" })
    {
        return;
    }

    string html = ((HtmlTextNode)node).Text;

    // is it in fact a special closing node output as text?
    if (HtmlNode.IsOverlappedClosingElement(html))
    {
        return;
    }

    if (html.Length == 0)
    {
        return;
    }

    // Skip an XML declaration that ended up as text. Ordinal comparisons: this is markup,
    // not linguistic text, so the current culture must not influence the result.
    string trimmed = html.Trim();
    if (trimmed.StartsWith("<?xml", StringComparison.OrdinalIgnoreCase)
        && trimmed.EndsWith("?>", StringComparison.Ordinal))
    {
        return;
    }

    if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
    {
        html = html.TrimStart();
        if (html.Length == 0)
        {
            // whitespace-only text with nothing meaningful before it: nothing to write
            return;
        }

        textInfo.WritePrecedingWhiteSpace = true;
        textInfo.IsFirstTextOfDocWritten.Value = true;
    }

    outText.Write(HtmlEntity.DeEntitize(REX_MULTISPACE.Replace(html.TrimEnd(), " ")));

    // Trailing whitespace was trimmed above; re-emit it as exactly one space and remember
    // that we did, so the next text node drops its own leading whitespace.
    textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
    if (textInfo.LastCharWasSpace)
    {
        outText.Write(' ');
    }
}

/// <summary>
/// Writes an element: emits the line breaks associated with the tag, then converts its
/// children unless the element is skipped.
/// </summary>
private static void ConvertElementTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
    string endElementString = null; // written after the children when non-null
    bool isInline = true;           // inline elements share the caller's state with their children
    bool skip = false;              // when true the children are not converted
    int listIndex = 0;

    switch (node.Name)
    {
        case "nav":
            // navigation content is dropped entirely
            skip = true;
            isInline = false;
            break;

        // NOTE(review): h3-h6 are not listed here and therefore get the inline default —
        // confirm that is intended.
        case "body":
        case "section":
        case "article":
        case "aside":
        case "h1":
        case "h2":
        case "header":
        case "footer":
        case "address":
        case "main":
        case "div":
        case "span":
        case "p": // stylistic - adjust as you tend to use
            if (textInfo.IsFirstTextOfDocWritten.Value)
            {
                outText.Write("\r\n");
            }

            endElementString = "\r\n";
            isInline = false;
            break;

        case "br":
            outText.Write("\r\n");
            skip = true;
            // we are at the start of a line again: trim leading whitespace of what follows
            textInfo.WritePrecedingWhiteSpace = false;
            break;

        case "li":
            isInline = false;
            break;

        case "ol":
            listIndex = 1;
            goto case "ul";

        case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
            endElementString = "\r\n";
            isInline = false;
            break;

        // "a", "img" (inline-block in reality) and every other tag keep the inline default
    }

    if (!skip && node.HasChildNodes)
    {
        // Block-level elements start their children with fresh whitespace state, but keep
        // sharing the document-wide "first text written" flag.
        PreceedingDomTextInfo childInfo = isInline
            ? textInfo
            : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };

        ConvertContentTo(node, outText, childInfo);
    }

    if (endElementString != null)
    {
        outText.Write(endElementString);
    }
}
|
||
|
}
|