2023-08-20 16:10:39 +02:00
using System.Diagnostics ;
using System.Net ;
using System.Text ;
using System.Text.RegularExpressions ;
using System.Xml.Linq ;
using System.Xml.Serialization ;
using HtmlAgilityPack ;
using Ionic.Zip ;
namespace WordpressEboobScraper2.Scraper ;
/** *************************************************** **/
/** **/
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
/** **/
/** *************************************************** **/
class Scraper
{
// Book currently being processed; assigned by Generate()/Verify() before any of the
// path properties below are read (they all dereference it).
static EpubParameter ACTIVE_BOOK = null ;
// Loop guard: FindChapters()/VerifyChapters() stop once this many chapters were collected.
const int LIMIT = 1500 ;
// Matches a title that starts with a number followed by a dash; the number is captured as group "n".
// Used by ProcessChapter() for its "book jump" detection.
readonly Regex REX_NUMSTART = new Regex ( @"^\s*(?<n>[0-9]+)\s*\-.*$" , RegexOptions . Compiled ) ;
// Downloaded pages, keyed by the lower-cased URL; persisted to / restored from WCACHE_FILE.
Dictionary < string , string > webCache = new Dictionary < string , string > ( ) ;
// Per-book working directory (string concatenation: BASE_DIR_STASH is used as a prefix as-is).
string STASH_FOLDER = > Config . BASE_DIR_STASH + ACTIVE_BOOK . Foldername + Path . DirectorySeparatorChar ;
// Final output locations below BASE_DIR_OUT, one sub-folder per artifact type.
string WCACHE_FILE = > Path . Combine ( Config . BASE_DIR_OUT , @"_cache" , ACTIVE_BOOK . Foldername + @".xml" ) ;
string HTML_FILE_OUT = > Path . Combine ( Config . BASE_DIR_OUT , @"html" , ACTIVE_BOOK . Foldername + @".html" ) ;
string EPUB_FILE_OUT = > Path . Combine ( Config . BASE_DIR_OUT , @"epub" , ACTIVE_BOOK . Foldername + @".epub" ) ;
string MOBI_FILE_OUT = > Path . Combine ( Config . BASE_DIR_OUT , @"mobi" , ACTIVE_BOOK . Foldername + @".mobi" ) ;
// Intermediate artifacts inside the stash folder; they are copied to the *_OUT paths when finished.
string HTML_FILE_STASH = > STASH_FOLDER + @"book.html" ;
string ZIP_FILE_STASH = > STASH_FOLDER + @"book.zip" ;
string EPUB_FILE_STASH = > STASH_FOLDER + @"book.epub" ;
string MOBI_FILE_STASH = > STASH_FOLDER + @"book.mobi" ;
// Per-chapter dump folders written by OutputChapter().
string QUERY_FOLDER = > STASH_FOLDER + @"query" + Path . DirectorySeparatorChar ; // full query result
string HTML_FOLDER = > STASH_FOLDER + @"html" + Path . DirectorySeparatorChar ; // unprocessed chapter code
string EPUB_FOLDER = > STASH_FOLDER + @"epub" + Path . DirectorySeparatorChar ; // processed epub chapter code
//----------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------------------------------------------//
/// <summary>
/// Builds every book listed in <c>Config.BOOKS</c>: prints a banner, prepares the
/// working folders, collects the chapters and writes the HTML and EPUB outputs
/// (plus MOBI when <c>Config.CONVERT_MOBI</c> is set).
/// </summary>
public void Generate()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, a framed headline, three blank lines.
        var headline = $" [PROCESSING BOOK] {book.DisplayStr} ";
        var frame = new string('=', headline.Length);

        for (var i = 0; i < 3; i++) "".Dump();
        frame.Dump();
        headline.Dump();
        frame.Dump();
        for (var i = 0; i < 3; i++) "".Dump();

        Init();

        var chapters = FindChapters();
        WriteBookHTML(chapters);
        WriteEpub(chapters);

        if (Config.CONVERT_MOBI)
        {
            GenerateMobi();
        }
    }
}
/// <summary>
/// Compares, for every book in <c>Config.BOOKS</c>, the cached pages against the live
/// pages: prints a banner, loads the book's web cache and runs <c>VerifyChapters</c>.
/// </summary>
public void Verify()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, a framed headline, three blank lines.
        var headline = $" [VERIFYING BOOK] {book.DisplayStr} ";
        var frame = new string('=', headline.Length);

        for (var i = 0; i < 3; i++) "".Dump();
        frame.Dump();
        headline.Dump();
        frame.Dump();
        for (var i = 0; i < 3; i++) "".Dump();

        LoadWebCache();
        VerifyChapters();
    }
}
/// <summary>
/// Prepares the file system and the web cache for the active book.
/// Empties the files inside the stash sub-folders, removes stale stash artifacts,
/// (re)creates the stash and output directories and, when <c>Config.USE_WEBCACHE</c>
/// is set, loads the persisted web cache.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Only the files directly inside each stash sub-folder are deleted; the folders stay.
        foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(dir).ToList())
            {
                File.Delete(file);
            }
        }

        foreach (var stale in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
        {
            if (File.Exists(stale)) File.Delete(stale);
        }
    }

    Directory.CreateDirectory(STASH_FOLDER);
    Directory.CreateDirectory(QUERY_FOLDER);
    Directory.CreateDirectory(HTML_FOLDER);
    Directory.CreateDirectory(EPUB_FOLDER);

    // The output files are addressed with Path.Combine(Config.BASE_DIR_OUT, <folder>, ...);
    // build the directories the same way so both agree even when BASE_DIR_OUT has no
    // trailing directory separator.
    foreach (var folder in new[] { @"_cache", @"html", @"epub", @"mobi" })
    {
        Directory.CreateDirectory(Path.Combine(Config.BASE_DIR_OUT, folder));
    }

    // Start every book with an empty cache: the cache file is per book, so entries of a
    // previously processed book must not be carried over (and saved into this book's file).
    webCache = new Dictionary<string, string>();
    if (Config.USE_WEBCACHE) LoadWebCache();
}
/// <summary>
/// Writes all chapters into one HTML document (an <c>h1</c> title followed by the
/// chapter markup for each chapter), stores it in the stash folder and copies it to
/// the HTML output location, overwriting an existing file.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteBookHTML(List<Chapter> chapters)
{
    StringBuilder b = new StringBuilder();

    b.AppendLine("<!DOCTYPE html>");
    b.AppendLine("<html>");
    b.AppendLine("<body>");

    foreach (var currChapter in chapters)
    {
        b.AppendLine();
        b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(currChapter.chapter);
    }

    // Close the elements in reverse opening order (the tags were swapped before,
    // producing malformed markup); this matches what OutputChapter() writes.
    b.AppendLine("</body>");
    b.AppendLine("</html>");

    File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
    File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
/// <summary>
/// Serializes the in-memory web cache as XML to <c>WCACHE_FILE</c>, one
/// <c>SerializableCacheEntry</c> (URL plus page content) per cached page.
/// </summary>
void SaveCache()
{
    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using (var output = new StreamWriter(WCACHE_FILE))
    {
        var entries = new List<SerializableCacheEntry>();
        foreach (var pair in webCache)
        {
            entries.Add(new SerializableCacheEntry
            {
                URL = pair.Key,
                Content = new GZippedString { Value = pair.Value },
            });
        }

        serializer.Serialize(output, entries);
    }
}
/// <summary>
/// Replaces the in-memory web cache with the content of <c>WCACHE_FILE</c>.
/// When the active book has no cache file yet, the cache is reset to empty so that
/// pages cached for a previously processed book are not reused or saved into this
/// book's cache file.
/// </summary>
void LoadWebCache()
{
    if (!File.Exists(WCACHE_FILE))
    {
        webCache = new Dictionary<string, string>();
        return;
    }

    XmlSerializer deserializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));
    using (TextReader reader = new StreamReader(WCACHE_FILE))
    {
        var entries = (List<SerializableCacheEntry>)deserializer.Deserialize(reader);
        webCache = entries.ToDictionary(p => p.URL, p => p.Content.Value);
    }
}
/// <summary>
/// Walks the chapter chain of the active book, starting at its start URL and
/// following the "next" link reported by <c>ProcessChapter</c>, until no link is
/// left or <c>LIMIT</c> chapters were collected.
/// Pages are taken from the web cache when present, otherwise downloaded and cached.
/// If a cached page yields no follow-up link and <c>Config.DO_LIVE_RELOAD_OF_LAST</c>
/// is set, the page is downloaded again and re-processed.
/// </summary>
/// <returns>The successfully processed chapters in reading order.</returns>
List<Chapter> FindChapters()
{
    var chapters = new List<Chapter>();

    using (var client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;

        var pending = new Stack<string>();
        pending.Push(ACTIVE_BOOK.StartURL);

        while (pending.Count > 0 && chapters.Count < LIMIT)
        {
            var url = pending.Pop();
            var cacheKey = url.ToLower();
            var curr = new Chapter { url = url };

            var buffered = webCache.TryGetValue(cacheKey, out var cachedPage);
            if (buffered)
            {
                curr.queryResult = cachedPage;
                "*(loaded from webcache)*".Dump();
            }
            else
            {
                curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
                webCache[cacheKey] = curr.queryResult;
                SaveCache();
            }

            var outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var followUp);
            if (followUp != null) pending.Push(followUp);

            // A cached page without a follow-up link may simply be outdated: fetch it again.
            if (buffered && pending.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
            {
                "".Dump();
                "//==> *(auto-reload from live)*".Dump();
                "".Dump();

                curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
                webCache[cacheKey] = curr.queryResult;
                SaveCache();

                outcome = ProcessChapter(curr, chapters, s => s.Dump(), out var followUpLive);
                if (followUpLive != null) pending.Push(followUpLive);
            }

            switch (outcome)
            {
                case ProcessResult.SuccessNormal:
                    "  ==> Chapter processed".Dump();
                    chapters.Add(curr);
                    OutputChapter(curr, chapters.Count);
                    break;

                case ProcessResult.SkipChapter:
                    "  ==> Skip this chapter".Dump();
                    break;

                case ProcessResult.ReachedEnd:
                    "  ==> End reached".Dump();
                    break;
            }

            "".Dump();
        }
    }

    return chapters;
}
/// <summary>
/// Walks the cached chapter chain of the active book and compares each cached page
/// with a fresh download of the same URL. Differences in the process result, the
/// follow-up URL, the "next" link, the title and the chapter text are reported via
/// <c>Dump()</c>; for content differences, links are offered to open a compare tool
/// or to store the live version in the web cache.
/// Pages that are not cached, or whose live download fails, are skipped.
/// </summary>
void VerifyChapters()
{
    // NOTE(review): nothing is ever added to this list here, so ProcessChapter always sees
    // an empty back buffer and the LIMIT check below cannot trigger - confirm this is intended.
    List<Chapter> result = new List<Chapter>();

    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;

        Stack<string> buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);

        while (buffer.Any() && result.Count < LIMIT)
        {
            var url = buffer.Pop();

            Chapter curr_buffer = new Chapter() { url = url };
            Chapter curr_live = new Chapter() { url = url };

            // Only pages that are already cached can be verified.
            if (!webCache.ContainsKey(url.ToLower())) continue;

            try
            {
                curr_buffer.queryResult = webCache[url.ToLower()];
                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            }
            catch (Exception e)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
                continue;
            }

            var is_diff = false;

            var r_buffer = ProcessChapter(curr_buffer, result, _ => { }, out var next_buffer);
            var r_live = ProcessChapter(curr_live, result, _ => { }, out var next_live);

            // The walk follows the cached chain, not the live one.
            if (next_buffer != null) buffer.Push(next_buffer);

            if (r_buffer != r_live)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump();
                is_diff = true;
            }

            // Compare the follow-up URLs themselves (this check used to repeat the
            // process-result comparison and therefore never detected a URL change).
            if (next_buffer != next_live)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump();
                is_diff = true;
            }

            if (!Relaxedurleq(curr_buffer.next, curr_live.next))
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
                is_diff = true;
            }

            if (curr_buffer.title != curr_live.title)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
                is_diff = true;
            }

            if (curr_buffer.chapter.Value != curr_live.chapter.Value)
            {
                // Markup differences alone are ignored; only differing plain text counts.
                var clean_buffer = GetChapterText(curr_buffer);
                var clean_live = GetChapterText(curr_live);

                if (clean_buffer.Trim() != clean_live.Trim())
                {
                    $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();

                    new Hyperlinq(() =>
                    {
                        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                        File.WriteAllText(fa, curr_buffer.chapter.Value);
                        File.WriteAllText(fb, curr_live.chapter.Value);
                        Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                    }, "[Compare Raw]").Dump();

                    new Hyperlinq(() =>
                    {
                        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                        File.WriteAllText(fa, clean_buffer);
                        File.WriteAllText(fb, clean_live);
                        Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                    }, "[Compare Text]").Dump();

                    new Hyperlinq(() =>
                    {
                        webCache[url.ToLower()] = curr_live.queryResult;
                        SaveCache();
                    }, "[Save new version to webcache]").Dump();

                    is_diff = true;
                }
            }

            if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
            if (is_diff) "".Dump();
        }
    }
}
/// <summary>
/// Compares two URLs while ignoring a leading <c>https://</c> / <c>http://</c> scheme.
/// </summary>
/// <param name="a">First URL; may be null.</param>
/// <param name="b">Second URL; may be null.</param>
/// <returns>
/// True when both values are equal (including both being null) or differ only in the
/// scheme prefix; false otherwise, in particular when exactly one of them is null.
/// </returns>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;

    // A chapter without a "next" link has a null URL; that is a difference, not an error.
    if (a == null || b == null) return false;

    return StripScheme(a) == StripScheme(b);

    static string StripScheme(string url)
    {
        if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
        if (url.StartsWith("http://", StringComparison.Ordinal)) url = url.Substring("http://".Length);
        return url;
    }
}
/// <summary>
/// Reduces a chapter's markup to comparable plain text: converts the HTML to text,
/// trims it and collapses every whitespace run into a single space.
/// </summary>
/// <param name="c">Chapter whose <c>chapter.Value</c> markup is converted.</param>
/// <returns>The normalized text, or an empty string when the chapter has no content.</returns>
string GetChapterText(Chapter c)
{
    var markup = c.chapter.Value;
    if (string.IsNullOrWhiteSpace(markup))
    {
        return string.Empty;
    }

    var text = HTMLToText.ConvertHtml(markup).Trim();
    return Regex.Replace(text, @"\s+", " ");
}
/// <summary>
/// Parses the downloaded page of one chapter: locates the content, navigation and
/// chapter nodes, derives the chapter title, detects the end of the book, resolves the
/// link to the next chapter and strips navigation markers, share/meta/ad blocks,
/// repeated titles, leading/trailing <c>hr</c> elements and author notes from the
/// chapter markup.
/// On success <c>curr.title</c>, <c>curr.sourcecode</c>, <c>curr.next</c>, the
/// prologue/epilogue/bonus flags and <c>curr.chapter</c> are filled in.
/// </summary>
/// <param name="curr">Chapter to process; <c>url</c> and <c>queryResult</c> must be set.</param>
/// <param name="backBuffer">Chapters accepted so far, used for loop, epilogue and book-jump detection.</param>
/// <param name="prt">Sink for progress messages.</param>
/// <param name="forwardQueue_next">
/// URL of the next chapter to visit, or null when none was found or it was already visited.
/// </param>
/// <returns>
/// <c>ReachedEnd</c> when the book is finished (the chapter is not completed in that case),
/// <c>SkipChapter</c> for "not a chapter" posts, otherwise <c>SuccessNormal</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The page does not contain a recognizable content, chapter or title node.
/// </exception>
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueue_next)
{
    forwardQueue_next = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    #region Base

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    // Fail with a message that names the page instead of a bare NullReferenceException.
    if (nodeContent == null) throw new InvalidOperationException("Cannot find the content node of " + curr.url);

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    if (nodeNav == null) nodeNav = nodeContent;

    var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
    if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

    if (nodeChapter == null) throw new InvalidOperationException("Cannot find the chapter node of " + curr.url);

    #endregion

    #region Title

    var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
    if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

    if (titleNode == null) throw new InvalidOperationException("Cannot find the title node of " + curr.url);

    curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

    // Every spelling under which the title may be repeated inside the chapter body.
    var titles = new List<string>();
    titles.Add(curr.title);

    if (string.IsNullOrWhiteSpace(curr.title))
    {
        // A blank title has no chapter number to work with (parsing it used to crash).
        prt("  [!!] Warning cannot parse title");
    }
    else if (Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
    {
        // Title is only "Chapter N ..." - try to find a more descriptive one in the body.
        var lowerTitle = curr.title.ToLower();
        var numberMatch = Regex.Match(lowerTitle, @"^(chapter) ([0-9]+)");

        var baseTitle = curr.title;
        var suffix = Helper.TitleFmt(Regex.Match(lowerTitle, @"^chapter [0-9]+(.*)$").Groups[1].Value);
        var prefix1 = numberMatch.Groups[0].Value;

        // prefix2 is prefix1 with the number normalized (leading zeros dropped).
        var prefix2 = int.TryParse(numberMatch.Groups[2].Value, out var chapterNumber)
            ? "chapter " + chapterNumber
            : prefix1;

        titles.Add(prefix1);
        titles.Add(prefix2);

        bool StartsWithPrefix(HtmlNode p, string prefix)
        {
            var text = p.InnerText.Trim();
            return text.ToLower().StartsWith(prefix) && text.Length - prefix.Length > 2;
        }

        bool IsInlineContainer(HtmlNode p)
        {
            return p.HasChildNodes && !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br"));
        }

        var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && StartsWithPrefix(p, prefix1));
        var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && StartsWithPrefix(p, prefix2));
        var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => StartsWithPrefix(p, prefix1) && IsInlineContainer(p));
        var altTitleNode4 = nodeChapter.Descendants().FirstOrDefault(p => StartsWithPrefix(p, prefix2) && IsInlineContainer(p));

        // First candidate wins; only the container variants (3 and 4) are detached from the body.
        HtmlNode altNode = null;
        string altPrefix = null;
        var detachAltNode = false;

        if (altTitleNode1 != null) { altNode = altTitleNode1; altPrefix = prefix1; }
        else if (altTitleNode2 != null) { altNode = altTitleNode2; altPrefix = prefix2; }
        else if (altTitleNode3 != null) { altNode = altTitleNode3; altPrefix = prefix1; detachAltNode = true; }
        else if (altTitleNode4 != null) { altNode = altTitleNode4; altPrefix = prefix2; detachAltNode = true; }

        if (altNode != null)
        {
            var newtitle = Helper.TitleFmt(altNode.InnerText.Trim().Substring(altPrefix.Length));

            titles.Add(newtitle);
            curr.title = newtitle;

            titles.Add(prefix1 + newtitle);
            titles.Add(prefix2 + newtitle);
            titles.Add(prefix1 + " - " + newtitle);
            titles.Add(prefix2 + " - " + newtitle);

            if (detachAltNode)
            {
                altNode.Remove();
                prt("  > title node removed");
            }
        }
        else if (suffix.Length > 2)
        {
            curr.title = suffix;
            titles.Add(suffix);
        }
        else
        {
            prt("  [!!] Warning cannot parse title");
        }

        // A title that already carries text after the number is kept in full.
        if (suffix.Length > 2)
        {
            curr.title = baseTitle;
            titles.Add(baseTitle);
        }
    }

    // Drop a leading book name (and the separator characters after it) from the title.
    if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower()))
    {
        var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length);
        while (tit_alt.Length > 0 && new[] { ' ', '\t', '-', ',', ':', '.', '_', ';' }.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1);
        tit_alt = tit_alt.Trim();

        if (tit_alt.Length > 2) curr.title = tit_alt;
    }

    #endregion

    curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

    if (backBuffer.Any() && backBuffer.First().title == curr.title)
    {
        prt("[!] Book loop found - skipping entry");
        return ProcessResult.ReachedEnd; // prevent book II loop
    }

    curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType != Site.Royalroad);
    curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
    curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));

    // For this book only "Epilogue II" ends the story. The literal must be lower-case
    // because it is compared against a lower-cased title (it could never match before).
    if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue ii");

    if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
    {
        prt("[!] Epilogue found - skipping entry");
        return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
    }

    prt(curr.title + " (" + curr.url + ")");

    #region Next

    // Book jump: all regular chapters so far share one leading number ("N - ...")
    // and the current title starts with a different number.
    var regularChapters = backBuffer.Where(b => !b.isSpecial).ToList();
    if (regularChapters.Count > 4)
    {
        var distinctNumbers = regularChapters
            .Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; })
            .Distinct()
            .Count();

        var firstNumber = REX_NUMSTART.Match(regularChapters[0].title);
        var currNumber = REX_NUMSTART.Match(curr.title);

        if (distinctNumbers == 1 &&
            firstNumber.Success &&
            currNumber.Success &&
            firstNumber.Groups["n"].Value != currNumber.Groups["n"].Value)
        {
            prt("[!] Book jump found - skipping entry");
            return ProcessResult.ReachedEnd;
        }
    }

    bool IsAnchor(HtmlNode p) => p.Name.ToLower() == "a";
    bool HasNextText(HtmlNode p) => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next";
    bool HasRelNext(HtmlNode p) => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next");

    // Try the lookups from most to least specific; the first hit wins.
    var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

    if (next == null)
        next = nodeContent.Descendants()
            .Where(IsAnchor)
            .Where(HasNextText)
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = nodeNav.Descendants()
            .Where(IsAnchor)
            .Where(HasRelNext)
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(IsAnchor)
            .Where(HasNextText)
            .Where(p => p.Attributes.Contains("href"))
            .FirstOrDefault();

    if (next == null)
        next = Helper.RecursiveDescendants(nodeContent)
            .Where(IsAnchor)
            .Where(HasRelNext)
            .FirstOrDefault();

    if (next != null)
    {
        // The rel="next" lookups do not require an href; treat a missing one like "no link".
        var next_url = next.Attributes["href"]?.Value.Trim();

        if (next_url == null || next_url == "." || next_url == "/" || next_url == "./")
        {
            next = null;
        }
        else
        {
            if (next_url.StartsWith("//")) next_url = "http:" + next_url;
            if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);
            if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

            curr.next = next_url;
            if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
            {
                forwardQueue_next = next_url;
            }
        }
    }

    if (next == null) prt("  > (!) No next URL found");

    #endregion

    #region Chapter marker

    var cpMarkerIdentities = new List<string>
    {
        "previousnext", "previouschapternextchapter",
        "firstnext", "firstchapternextchapter",
        "firstchapter", "previouslast",
        "previouschapterlastchapter",
        "previouschapter", "nextchapter", "lastchapter",
        "first", "previous", "next", "last"
    };

    bool IsShortMarker(HtmlNode p)
    {
        var lower = p.InnerText.ToLower();
        return p.InnerText.Trim().Length < 24 &&
               (lower.Contains("previous chapter") || lower.Contains("next chapter") || lower.Contains("last chapter") || lower.Contains("first chapter"));
    }

    bool IsMarkerIdentity(HtmlNode p) => cpMarkerIdentities.Any(m => NakedIdentity(p) == m);

    foreach (var node in nodeChapter.ChildNodes.Where(IsShortMarker).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt("  > Chapter marker removed");
    }

    foreach (var node in nodeChapter.ChildNodes.Where(IsMarkerIdentity).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt("  > Chapter marker removed");
    }

    var alist = nodeChapter.SelectNodes("//a");
    if (alist != null)
    {
        foreach (var node in alist.Where(IsMarkerIdentity).ToList())
        {
            node.Remove();
            prt("  > Chapter marker removed");
        }
    }

    var plist = nodeChapter.SelectNodes("//p");
    if (plist != null)
    {
        foreach (var node in plist.Where(IsMarkerIdentity).ToList())
        {
            node.Remove();
            prt("  > Chapter marker removed");
        }
    }

    #endregion

    // Removes the given nodes when they are direct children of the chapter node and
    // reports the outcome per node, using the given label in the message.
    void RemoveDirectChildren(IEnumerable<HtmlNode> candidates, string label)
    {
        if (candidates == null) return;

        foreach (var node in candidates.ToList())
        {
            if (nodeChapter.ChildNodes.Contains(node))
            {
                nodeChapter.RemoveChild(node);
                prt("  > " + label + " removed");
            }
            else
            {
                prt("  > " + label + " cannot be removed - skipping");
            }
        }
    }

    #region Share Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

    #endregion

    #region Meta Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

    #endregion

    #region Ad Blocking

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

    #endregion

    #region Title Paragraphs

    var titleNodes1 = nodeChapter.SelectNodes(@"p");
    if (titleNodes1 != null && titleNodes1.Any())
    {
        // Only the first paragraph is considered a repeated title.
        var firstParagraph = titleNodes1.First();
        if (titles.Any(t => t.ToLower() == Helper.TitleFmt(firstParagraph.InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(firstParagraph))
        {
            nodeChapter.RemoveChild(firstParagraph);
            prt("  > title node removed");
        }
    }

    for (int hval = 1; hval <= 5; hval++)
    {
        var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval);
        if (titleNodes2 == null) continue;

        foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == Helper.TitleFmt(node.InnerText).ToLower())))
        {
            if (nodeChapter.ChildNodes.Contains(node))
            {
                nodeChapter.RemoveChild(node);
                prt("  > title node removed");
            }
        }
    }

    // Inline elements that consist of nothing but the title (same handling for each tag).
    foreach (var inlineXPath in new[] { @"//u", @"//span", @"//strong" })
    {
        var inlineNodes = nodeChapter.SelectNodes(inlineXPath);
        if (inlineNodes == null || !inlineNodes.Any()) continue;

        foreach (var titleLike in inlineNodes.Where(n => titles.Any(t => CouldBeTitle(n, t))))
        {
            titleLike.Remove();
            prt("  > title node removed");
        }
    }

    #endregion

    #region Remove <hr>'s

    // FirstOrDefault/LastOrDefault: the chapter may have no element children left at
    // this point, which must end the loops instead of throwing.
    HtmlNode edgeElement;

    while ((edgeElement = nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element)) != null && edgeElement.Name.ToLower() == "hr")
    {
        nodeChapter.RemoveChild(edgeElement);
        prt("  > header hr removed");
    }

    while ((edgeElement = nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element)) != null && edgeElement.Name.ToLower() == "hr")
    {
        nodeChapter.RemoveChild(edgeElement);
        prt("  > footer hr removed");
    }

    #endregion

    #region Other (Author's Node)

    foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt("  > authors note removed");
    }

    #endregion

    var chap_html = nodeChapter.InnerHtml.Trim();

    #region Fix raw <hr>

    // KOReader doesn't like <hr>
    chap_html = chap_html.Replace("<hr>", "<hr/>");

    #endregion

    curr.chapter = chap_html;

    if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

    return ProcessResult.SuccessNormal;
}
/// <summary>
/// Dumps one processed chapter into the stash folders: the raw query result, the
/// extracted source markup and a standalone HTML page (title heading plus chapter body).
/// </summary>
/// <param name="curr">The processed chapter.</param>
/// <param name="index">Chapter number, written as a three-digit prefix of the file names.</param>
void OutputChapter(Chapter curr, int index)
{
    var dumpName = string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html";

    File.WriteAllText(QUERY_FOLDER + dumpName, curr.queryResult);
    File.WriteAllText(HTML_FOLDER + dumpName, curr.sourcecode, Encoding.UTF8);

    var page = new StringBuilder();
    page.AppendLine("<!DOCTYPE html>");
    page.AppendLine("<html>");
    page.AppendLine("<body>");
    page.AppendLine();
    page.AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>");
    page.AppendLine();
    page.AppendLine(curr.chapter);
    page.AppendLine("</body>");
    page.AppendLine("</html>");

    // Here the complete file name (including the index prefix) is passed through Filenamify.
    var epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
    File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
}
/// <summary>
/// Reduces a node's text to a comparison key: lower-cased, with the common HTML
/// entities removed and everything except letters and digits dropped.
/// The key is compared against the chapter marker identities such as "previousnext".
/// </summary>
/// <param name="raw">Node whose <c>InnerText</c> is reduced.</param>
/// <returns>The lower-case key consisting of letters and digits only.</returns>
string NakedIdentity(HtmlNode raw)
{
    var text = raw
        .InnerText
        .ToLower()
        // Entities have to be removed by name: otherwise their letters ("gt", "nbsp", ...)
        // would survive the letter/digit filter below and end up in the key.
        .Replace("&gt;", "")
        .Replace("&lt;", "")
        .Replace("&amp;", "")
        .Replace("&quot;", "")
        .Replace("&nbsp;", "");

    return string.Concat(text.Where(c => char.IsLetterOrDigit(c)));
}
/// <summary>
/// Checks whether a node's text equals the given title after loose normalization:
/// both sides are stripped via <c>Helper.Striptease</c>, lower-cased, freed from the
/// characters <c>: - ( )</c> and from every run of two or more whitespace characters.
/// </summary>
/// <param name="n">Candidate node.</param>
/// <param name="title">Title spelling to compare against.</param>
/// <returns>True when both normalized texts are identical.</returns>
bool CouldBeTitle(HtmlNode n, string title)
{
    var nodeText = Helper.Striptease(n);
    var titleText = Helper.Striptease(title);

    return Canonical(nodeText) == Canonical(titleText);

    static string Canonical(string text)
    {
        text = text.ToLower();
        foreach (var ignored in new[] { ":", "-", "(", ")" })
        {
            text = text.Replace(ignored, "");
        }
        return Regex.Replace(text, @"\s\s+", "");
    }
}
/// <summary>
/// Assembles the EPUB of the active book: writes the mimetype, container, package and
/// TOC entries plus one HTML entry per chapter into a zip file in the stash folder,
/// then copies it to the stash EPUB name and on to the EPUB output location.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteEpub(List<Chapter> chapters)
{
    foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
    {
        if (File.Exists(stale)) File.Delete(stale);
    }

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Both streams have to be closed before the archive is copied below.
    using (var archiveFile = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    using (var archive = new ZipOutputStream(archiveFile))
    {
        WritePubString(archive, @"mimetype", GetEpubMimetype());
        WritePubString(archive, @"META-INF\container.xml", GetEpubContainerXML());
        WritePubString(archive, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
        WritePubString(archive, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

        var position = 0;
        foreach (var chapter in chapters)
        {
            // Entry names are numbered from 1, GetEpubChapterFile receives the 0-based index.
            var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", position + 1, Helper.Filenamify(chapter.title, true));
            WritePubString(archive, entryName, GetEpubChapterFile(chapter, position));
            position++;
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
/// <summary>
/// Converts the stash EPUB into a MOBI file by running the external
/// <c>ebook-convert</c> program and copies the result to the MOBI output location.
/// </summary>
/// <exception cref="Exception">
/// <c>ebook-convert</c> returned a non-zero exit code; the message carries the exit
/// code and the combined process output.
/// </exception>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);

    "Running ebook-convert for MOBI output".Dump();

    // Both paths are passed as quoted arguments so that they survive embedded spaces.
    var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999";
    var pout = ProcessHelper.ProcExecute("ebook-convert", arguments);

    $"ebook-convert returned: {pout.ExitCode}".Dump();
    if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
/// <summary>
/// Adds a text entry to the zip stream, stored without compression.
/// </summary>
/// <param name="z">Open zip output stream that receives the entry.</param>
/// <param name="n">Name of the entry inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Encoding used to turn <paramref name="c"/> into bytes; UTF-8 when omitted.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    Encoding encoding = e ?? Encoding.UTF8;

    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    byte[] payload = encoding.GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
/// <summary>Returns the content of the EPUB <c>mimetype</c> entry.</summary>
string GetEpubMimetype() => "application/epub+zip";
/// <summary>
/// Builds the content of <c>META-INF/container.xml</c>, which declares
/// <c>OEBPS/content.opf</c> as the root file of the book.
/// </summary>
/// <returns>The serialized XML, trimmed and terminated by exactly one CRLF.</returns>
string GetEpubContainerXML()
{
    XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XElement(container + "container",
            new XAttribute("version", "1.0"),
            new XElement(container + "rootfiles",
                new XElement(container + "rootfile",
                    new XAttribute("full-path", "OEBPS/content.opf"),
                    new XAttribute("media-type", "application/oebps-package+xml")))));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);

        // Force the upper-case spelling of the encoding name in the XML declaration,
        // in case the writer emitted it in lower case.
        string xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");

        return xml.Trim() + "\r\n";
    }
}
/// <summary>
/// Builds the OPF package document (<c>OEBPS/content.opf</c>): book metadata taken from
/// <c>ACTIVE_BOOK</c>, a manifest listing every chapter file plus the NCX, and the spine
/// that puts the chapters into reading order.
/// </summary>
/// <param name="chapters">Chapters of the book, in reading order.</param>
/// <returns>The serialized package document.</returns>
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    // Dates are machine readable metadata: format them with the invariant culture so the
    // calendar of the current locale cannot change the year that ends up in the file.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    const string dateFormat = "yyyy'-'MM'-'dd";
    string today = DateTime.Now.ToString(dateFormat, invariant);

    // Manifest id of chapter i; the spine refers to the very same value.
    string ItemId(int i) => string.Format("x{0:000}_{1}.html", i + 1, Helper.Filenamify(chapters[i].title, true));

    // <meta content="..." name="..."/> element in the OPF namespace.
    XElement Meta(string name, object content) => new XElement(opf + "meta",
        new XAttribute("content", content),
        new XAttribute("name", name));

    // <dc:date opf:event="...">value</dc:date>
    XElement Date(string eventName, string value) => new XElement(dc + "date",
        new XAttribute(opf + "event", eventName),
        value);

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));
    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        // Primary identifier, referenced by package/@unique-identifier.
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        Date("publication", ACTIVE_BOOK.Release.ToString(dateFormat, invariant)),
        Date("modification", today),
        Date("creation", today),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        // Second identifier built from ID_CAL.
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        Meta("Wordpress_eBook_scraper_version", "1.0"),
        Meta("Wordpress_eBook_scraper_creation_time", today));

    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(Meta("calibre:series", ACTIVE_BOOK.Series));
        meta.Add(Meta("calibre:series_index", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)));
    }
    package.Add(meta);

    var manifest = new XElement(opf + "manifest");
    for (int i = 0; i < chapters.Count; i++)
    {
        // href is a URI and therefore escaped; the id keeps the raw file name.
        string href = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)));

        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", href),
            new XAttribute("id", ItemId(i)),
            new XAttribute("media-type", "application/xhtml+xml")));
    }
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));
    package.Add(manifest);

    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
    for (int i = 0; i < chapters.Count; i++)
    {
        spine.Add(new XElement(opf + "itemref", new XAttribute("idref", ItemId(i))));
    }
    package.Add(spine);

    package.Add(new XElement(opf + "guide"));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
/// <summary>
/// Builds the NCX table of contents (<c>OEBPS/toc.ncx</c>) with one flat
/// <c>navPoint</c> per chapter.
/// </summary>
/// <param name="chapters">Chapters of the book, in reading order.</param>
/// <returns>The serialized NCX document.</returns>
string GetEpubTOC(List<Chapter> chapters)
{
    // All NCX elements live in the DAISY NCX namespace (not in the OPF one).
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    // <meta content="..." name="..."/> entry of the NCX head.
    XElement HeadMeta(string name, object content) => new XElement(ncx + "meta",
        new XAttribute("content", content),
        new XAttribute("name", name));

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            // Uses the same value as the dc:identifier with id "BookId" in content.opf.
            HeadMeta("dtb:uid", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
            HeadMeta("dtb:depth", 1),
            HeadMeta("dtb:totalPageCount", 0),
            HeadMeta("dtb:maxPageNumber", 0)));
    doc.Add(root);

    // Use the real book title instead of a placeholder.
    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", ACTIVE_BOOK.Title)));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        // src is a URI: escape the file name exactly like the manifest href in content.opf,
        // so both documents point at the same resource.
        string src = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)));

        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", src))));
    }
    root.Add(nav);

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}
/// <summary>
/// Wraps the processed markup of a chapter into a complete XHTML document, using the
/// entitized chapter title both as document title and as the leading <c>h1</c> heading.
/// </summary>
/// <param name="chapter">Chapter providing <c>title</c> and the body markup in <c>chapter</c>.</param>
/// <param name="idx">Index of the chapter; not used by this method.</param>
/// <returns>The document text; every line ends with <see cref="Environment.NewLine"/>.</returns>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    string heading = HtmlEntity.Entitize(chapter.title);

    string[] lines =
    {
        @"<?xml version=""1.0"" encoding=""utf-8""?>",
        @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ",
        @"<html xmlns=""http://www.w3.org/1999/xhtml"">",
        @"<head>",
        "<title>" + heading + "</title>",
        @"</head>",
        @"<body>",
        "<h1>" + heading + "</h1>",
        chapter.chapter,
        @"</body>",
        @"</html>",
    };

    // Terminate the last line as well, so the result ends with a line break.
    return string.Join(Environment.NewLine, lines) + Environment.NewLine;
}
/// <summary>
/// Result of an executed external process: the command line, the exit code and the
/// captured output streams.
/// </summary>
public struct ProcessOutput
{
    /// <summary>Command text this result belongs to.</summary>
    public readonly string Command;

    /// <summary>Exit code of the process.</summary>
    public readonly int ExitCode;

    /// <summary>Captured standard output.</summary>
    public readonly string StdOut;

    /// <summary>Captured standard error.</summary>
    public readonly string StdErr;

    /// <summary>Captured standard output and standard error in one text.</summary>
    public readonly string StdCombined;

    /// <summary>Creates a result from its five parts.</summary>
    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
        => (Command, ExitCode, StdOut, StdErr, StdCombined) = (cmd, ex, stdout, stderr, stdcom);

    /// <summary>
    /// Multi-line summary: the command, the exit code, then the stdout and stderr sections
    /// (the combined output is not part of it).
    /// </summary>
    public override string ToString()
    {
        return string.Join("\n",
            Command,
            "=> " + ExitCode,
            "",
            "[stdout]",
            StdOut,
            "",
            "[stderr]",
            StdErr);
    }
}
/// <summary>
/// Helper for running external programs and capturing their output.
/// </summary>
public static class ProcessHelper
{
    /// <summary>
    /// Starts <paramref name="command"/> without a shell and without a window, waits until
    /// it exits and returns its exit code together with everything it wrote to stdout and
    /// stderr.
    /// </summary>
    /// <param name="command">Executable to start.</param>
    /// <param name="arguments">Command line arguments; <c>null</c> is treated as no arguments.</param>
    /// <param name="workingDirectory">Working directory of the process; <c>null</c> leaves it unset.</param>
    /// <returns>
    /// The captured result. Lines are joined with <c>\n</c>; in the combined text, stdout and
    /// stderr lines appear in the order in which they were received.
    /// </returns>
    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
    {
        arguments ??= string.Empty;

        // Process owns OS handles, so release it deterministically.
        using var process = new Process
        {
            StartInfo =
            {
                FileName = command,
                Arguments = arguments,
                WorkingDirectory = workingDirectory ?? string.Empty,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
                ErrorDialog = false,
            }
        };

        var builderOut = new StringBuilder();
        var builderErr = new StringBuilder();
        var builderBoth = new StringBuilder();

        // The two data events are raised asynchronously and both write to builderBoth;
        // StringBuilder is not thread-safe, so every append happens under this lock.
        var gate = new object();

        // Appends a line, separated by '\n' once the buffer already holds text
        // (consequently, empty lines at the very beginning leave no trace).
        static void AppendLine(StringBuilder target, string line)
        {
            if (target.Length > 0)
            {
                target.Append('\n');
            }
            target.Append(line);
        }

        process.OutputDataReceived += (sender, args) =>
        {
            if (args.Data == null) return; // null marks the end of the stream

            lock (gate)
            {
                AppendLine(builderOut, args.Data);
                AppendLine(builderBoth, args.Data);
            }
        };

        process.ErrorDataReceived += (sender, args) =>
        {
            if (args.Data == null) return; // null marks the end of the stream

            lock (gate)
            {
                AppendLine(builderErr, args.Data);
                AppendLine(builderBoth, args.Data);
            }
        };

        process.Start();
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();
        process.WaitForExit();

        // Keep the reported command on a single line by making line breaks visible.
        string printableArguments = arguments.Replace("\r", "\\r").Replace("\n", "\\n");

        lock (gate)
        {
            return new ProcessOutput(
                $"{command} {printableArguments}",
                process.ExitCode,
                builderOut.ToString(),
                builderErr.ToString(),
                builderBoth.ToString());
        }
    }
}
/// <summary>
/// Converts HTML into plain text by walking the HtmlAgilityPack DOM: block level elements
/// are surrounded by line breaks, <c>br</c> becomes a line break, runs of whitespace are
/// collapsed, and comments, <c>nav</c> content and script/style text are dropped.
/// </summary>
public static class HTMLToText
{
    // Self-closing <link/>, <style/> and <script/> tags.
    private static readonly Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);

    // <link>, <style> and <script> elements whose content contains no angle brackets.
    private static readonly Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);

    // Two or more consecutive whitespace characters.
    private static readonly Regex REX_MULTISPACE = new Regex(@"\s{2,}", RegexOptions.Compiled);

    /// <summary>
    /// Whitespace bookkeeping for the text written so far. Inline elements share the
    /// instance of their parent, block elements get a fresh one for their content.
    /// </summary>
    private class PrecedingDomTextInfo
    {
        public PrecedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }

        // False until the first text of the current block was written; while false,
        // leading whitespace of text nodes is stripped.
        public bool WritePrecedingWhiteSpace { get; set; }

        // True when the previously written text node ended in whitespace.
        public bool LastCharWasSpace { get; set; }

        // Shared across the whole conversion: set once any text has been written.
        public readonly BoolWrapper IsFirstTextOfDocWritten;

        // 1 for the content of an ordered list, otherwise 0; stored but not read in this class.
        public int ListIndex { get; set; }
    }

    /// <summary>
    /// Reference wrapper around a bool, so that several info objects can share one flag.
    /// </summary>
    private class BoolWrapper
    {
        public BoolWrapper() { }

        public bool Value { get; set; }

        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }

        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper { Value = boolWrapper };
        }
    }

    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to text.</summary>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>
    /// Converts an HTML string to text. Link, style and script tags matched by the two
    /// tag expressions are replaced by a blank before the markup is parsed.
    /// </summary>
    public static string ConvertHtml(string html)
    {
        html = REX_TAG1.Replace(html, " ");
        html = REX_TAG2.Replace(html, " ");

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already parsed document to text.</summary>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Writes the text of <paramref name="node"/> and its descendants to <paramref name="outText"/>.</summary>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PrecedingDomTextInfo(false));
    }

    // Converts all children of the node, in document order.
    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    // Dispatches on the node type; comments (and any other type) produce no output.
    private static void ConvertTo(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                ConvertTextNode(node, outText, textInfo);
                break;

            case HtmlNodeType.Element:
                ConvertElementNode(node, outText, textInfo);
                break;
        }
    }

    // Writes a text node with collapsed whitespace and updates the whitespace state.
    private static void ConvertTextNode(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
    {
        // script and style must not be output (a detached text node has no parent at all)
        string parentName = node.ParentNode?.Name;
        if (parentName == "script" || parentName == "style")
        {
            return;
        }

        // get text
        string html = ((HtmlTextNode)node).Text;

        // is it in fact a special closing node output as text?
        if (HtmlNode.IsOverlappedClosingElement(html))
        {
            return;
        }

        // check the text is meaningful and not a bunch of whitespaces
        if (html.Length == 0)
        {
            return;
        }

        // skip an XML declaration that ended up as text; markup is compared ordinally
        string trimmed = html.Trim();
        if (trimmed.StartsWith("<?xml", StringComparison.OrdinalIgnoreCase) && trimmed.EndsWith("?>", StringComparison.Ordinal))
        {
            return;
        }

        if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
        {
            html = html.TrimStart();
            if (html.Length == 0)
            {
                return;
            }

            textInfo.WritePrecedingWhiteSpace = true;
            textInfo.IsFirstTextOfDocWritten.Value = true;
        }

        outText.Write(HtmlEntity.DeEntitize(REX_MULTISPACE.Replace(html.TrimEnd(), " ")));

        // trailing whitespace of the node is reduced to a single blank
        textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
        if (textInfo.LastCharWasSpace)
        {
            outText.Write(' ');
        }
    }

    // Writes an element: optional line breaks around block elements, content in between.
    private static void ConvertElementNode(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
    {
        string endElementString = null;
        bool isInline;
        bool skip = false;
        int listIndex = 0;

        switch (node.Name)
        {
            case "nav":
                skip = true;
                isInline = false;
                break;

            case "body":
            case "section":
            case "article":
            case "aside":
            case "h1":
            case "h2":
            case "header":
            case "footer":
            case "address":
            case "main":
            case "div":
            case "span":
            case "p": // stylistic - adjust as you tend to use
                if (textInfo.IsFirstTextOfDocWritten)
                {
                    outText.Write("\r\n");
                }
                endElementString = "\r\n";
                isInline = false;
                break;

            case "br":
                outText.Write("\r\n");
                skip = true;
                textInfo.WritePrecedingWhiteSpace = false;
                isInline = true;
                break;

            case "a":
                isInline = true;
                break;

            case "li":
                isInline = false;
                break;

            case "ol":
                listIndex = 1;
                goto case "ul";

            case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                endElementString = "\r\n";
                isInline = false;
                break;

            case "img": //inline-block in reality
                isInline = true;
                break;

            default:
                isInline = true;
                break;
        }

        if (!skip && node.HasChildNodes)
        {
            // block content starts with fresh whitespace state, inline content continues the parent's
            PrecedingDomTextInfo childInfo = isInline
                ? textInfo
                : new PrecedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };

            ConvertContentTo(node, outText, childInfo);
        }

        if (endElementString != null)
        {
            outText.Write(endElementString);
        }
    }
}
}