2023-08-20 16:10:39 +02:00
using System.Diagnostics ;
using System.Net ;
using System.Text ;
using System.Text.RegularExpressions ;
using System.Xml.Linq ;
using System.Xml.Serialization ;
using HtmlAgilityPack ;
using Ionic.Zip ;
2023-10-03 16:13:37 +02:00
using WordpressEboobScraper2.Proc ;
2023-08-20 16:10:39 +02:00
namespace WordpressEboobScraper2.Scraper ;
/** *************************************************** **/
/** **/
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
/** **/
/** *************************************************** **/
2023-10-03 16:13:37 +02:00
public class Scraper
2023-08-20 16:10:39 +02:00
{
// Book currently being processed. Generate() and Verify() assign it per loop iteration,
// and every path property below is derived from its Foldername.
static EpubParameter ACTIVE_BOOK = null;

// Upper bound on the number of chapters FindChapters() collects for one book.
const int LIMIT = 1500;

// Matches titles of the form "<digits> - <anything>" and captures the digits as group "n".
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?<n>[0-9]+)\s*\-.*$", RegexOptions.Compiled);

// Downloaded page source, keyed by the lower-cased URL.
Dictionary<string, string> webCache = new Dictionary<string, string>();

// Per-book working directory (always ends with a directory separator).
string STASH_FOLDER => $"{Config.BASE_DIR_STASH}{ACTIVE_BOOK.Foldername}{Path.DirectorySeparatorChar}";

// Final artifacts below the shared output directory.
string WCACHE_FILE   => Path.Combine(Config.BASE_DIR_OUT, @"_cache", $"{ACTIVE_BOOK.Foldername}.xml");
string HTML_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"html", $"{ACTIVE_BOOK.Foldername}.html");
string EPUB_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"epub", $"{ACTIVE_BOOK.Foldername}.epub");
string MOBI_FILE_OUT => Path.Combine(Config.BASE_DIR_OUT, @"mobi", $"{ACTIVE_BOOK.Foldername}.mobi");

// Intermediate artifacts inside the stash folder.
string HTML_FILE_STASH => $"{STASH_FOLDER}book.html";
string ZIP_FILE_STASH  => $"{STASH_FOLDER}book.zip";
string EPUB_FILE_STASH => $"{STASH_FOLDER}book.epub";
string MOBI_FILE_STASH => $"{STASH_FOLDER}book.mobi";

string QUERY_FOLDER => $"{STASH_FOLDER}query{Path.DirectorySeparatorChar}"; // full query result
string HTML_FOLDER  => $"{STASH_FOLDER}html{Path.DirectorySeparatorChar}";  // unprocessed chapter code
string EPUB_FOLDER  => $"{STASH_FOLDER}epub{Path.DirectorySeparatorChar}";  // processed epub chapter code
//----------------------------------------------------------------------------------------------------//
//----------------------------------------------------------------------------------------------------//
/// <summary>
/// Builds every book listed in <c>Config.BOOKS</c>: prepares the working folders,
/// collects the chapters, writes the combined HTML and the EPUB and, when
/// <c>Config.CONVERT_MOBI</c> is set, converts the result to MOBI.
/// </summary>
public void Generate()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, the headline framed by '=' rules, three blank lines.
        string headline = $" [PROCESSING BOOK] {book.DisplayStr} ";
        string rule = new string('=', headline.Length);

        for (int i = 0; i < 3; i++) "".Dump();
        rule.Dump();
        headline.Dump();
        rule.Dump();
        for (int i = 0; i < 3; i++) "".Dump();

        Init();

        List<Chapter> chapters = FindChapters();
        WriteBookHTML(chapters);
        WriteEpub(chapters);

        if (Config.CONVERT_MOBI)
        {
            GenerateMobi();
        }
    }
}
/// <summary>
/// For every book in <c>Config.BOOKS</c>, loads its web cache and compares the
/// cached chapters against the live pages (see <c>VerifyChapters</c>).
/// </summary>
public void Verify()
{
    foreach (var book in Config.BOOKS)
    {
        ACTIVE_BOOK = book;

        // Banner: three blank lines, the headline framed by '=' rules, three blank lines.
        string headline = $" [VERIFYING BOOK] {book.DisplayStr} ";
        string rule = new string('=', headline.Length);

        for (int i = 0; i < 3; i++) "".Dump();
        rule.Dump();
        headline.Dump();
        rule.Dump();
        for (int i = 0; i < 3; i++) "".Dump();

        LoadWebCache();
        VerifyChapters();
    }
}
/// <summary>
/// Prepares the file system for the active book: empties the sub-folders of an
/// existing stash folder, deletes stale stash artifacts, creates all stash and
/// output directories and, when <c>Config.USE_WEBCACHE</c> is set, loads the web cache.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Only the files directly inside each first-level sub-folder are deleted.
        foreach (var subFolder in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(subFolder).ToList())
            {
                File.Delete(file);
            }
        }

        foreach (var stale in new[] { HTML_FILE_STASH, ZIP_FILE_STASH, EPUB_FILE_STASH, MOBI_FILE_STASH })
        {
            if (File.Exists(stale)) File.Delete(stale);
        }
    }

    foreach (var folder in new[] { STASH_FOLDER, QUERY_FOLDER, HTML_FOLDER, EPUB_FOLDER })
    {
        Directory.CreateDirectory(folder);
    }

    foreach (var outputKind in new[] { @"_cache", @"html", @"epub", @"mobi" })
    {
        Directory.CreateDirectory(Config.BASE_DIR_OUT + outputKind + Path.DirectorySeparatorChar);
    }

    if (Config.USE_WEBCACHE)
    {
        LoadWebCache();
    }
}
/// <summary>
/// Writes all chapters into one HTML document (an <c>h1</c> heading followed by the
/// chapter markup for each chapter), stores it in the stash folder and copies it to
/// the HTML output location, overwriting an existing file.
/// </summary>
/// <param name="chapters">Chapters in reading order.</param>
void WriteBookHTML(List<Chapter> chapters)
{
    StringBuilder b = new StringBuilder();

    b.AppendLine("<!DOCTYPE html>");
    b.AppendLine("<html>");
    b.AppendLine("<body>");

    foreach (var currChapter in chapters)
    {
        b.AppendLine();
        b.AppendLine("<h1>" + HtmlEntity.Entitize(currChapter.title) + "</h1>");
        b.AppendLine();
        b.AppendLine(currChapter.chapter);
    }

    // Close the elements in reverse order of opening; the previous code emitted
    // </html> before </body>, which produced mis-nested markup.
    b.AppendLine("</body>");
    b.AppendLine("</html>");

    File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8);
    File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true);
}
/// <summary>
/// Serializes the in-memory web cache to the XML cache file of the active book,
/// one <c>SerializableCacheEntry</c> per cached URL.
/// </summary>
void SaveCache()
{
    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using var output = new StreamWriter(WCACHE_FILE);

    var entries = new List<SerializableCacheEntry>();
    foreach (var pair in webCache)
    {
        entries.Add(new SerializableCacheEntry
        {
            URL = pair.Key,
            Content = new GZippedString { Value = pair.Value },
        });
    }

    serializer.Serialize(output, entries);
}
/// <summary>
/// Replaces the in-memory web cache with the content of the active book's XML cache
/// file. Does nothing when that file does not exist.
/// </summary>
void LoadWebCache()
{
    if (!File.Exists(WCACHE_FILE))
    {
        return;
    }

    var serializer = new XmlSerializer(typeof(List<SerializableCacheEntry>));

    using (TextReader input = new StreamReader(WCACHE_FILE))
    {
        var entries = (List<SerializableCacheEntry>)serializer.Deserialize(input);

        webCache = entries.ToDictionary(entry => entry.URL, entry => entry.Content.Value);
    }
}
/// <summary>
/// Walks the chapter chain of the active book, starting at its start URL and
/// following the "next" link reported by <c>ProcessChapter</c>, until no further
/// URL is queued or <c>LIMIT</c> chapters were collected.
/// </summary>
/// <remarks>
/// Pages are taken from the web cache when present, otherwise downloaded and added to
/// the cache (which is saved immediately). When a cached page yields no next URL and
/// <c>Config.DO_LIVE_RELOAD_OF_LAST</c> is set, the page is downloaded again and
/// re-processed, since a cached last chapter may have gained a next link meanwhile.
/// </remarks>
/// <returns>The successfully processed chapters in reading order.</returns>
List<Chapter> FindChapters()
{
    var collected = new List<Chapter>();

    using var http = new WebClient();
    http.Encoding = Encoding.UTF8;

    var pending = new Stack<string>();
    pending.Push(ACTIVE_BOOK.StartURL);

    while (pending.Count > 0 && collected.Count < LIMIT)
    {
        string url = pending.Pop();
        string cacheKey = url.ToLower();
        var chapter = new Chapter() { url = url };

        // Downloads the page, stores it in the cache and persists the cache.
        void DownloadIntoCache()
        {
            chapter.queryResult = http.DownloadString(Uri.UnescapeDataString(url));
            webCache[cacheKey] = chapter.queryResult;
            SaveCache();
        }

        bool fromCache = webCache.TryGetValue(cacheKey, out var cachedSource);
        if (fromCache)
        {
            chapter.queryResult = cachedSource;
            "*(loaded from webcache)*".Dump();
        }
        else
        {
            DownloadIntoCache();
        }

        var outcome = ProcessChapter(chapter, collected, s => s.Dump(), out var nextUrl);
        if (nextUrl != null) pending.Push(nextUrl);

        if (fromCache && pending.Count == 0 && Config.DO_LIVE_RELOAD_OF_LAST)
        {
            "".Dump();
            "//==> *(auto-reload from live)*".Dump();
            "".Dump();

            DownloadIntoCache();

            outcome = ProcessChapter(chapter, collected, s => s.Dump(), out var reloadedNextUrl);
            if (reloadedNextUrl != null) pending.Push(reloadedNextUrl);
        }

        switch (outcome)
        {
            case ProcessResult.SuccessNormal:
                " ==> Chapter processed".Dump();
                collected.Add(chapter);
                OutputChapter(chapter, collected.Count);
                break;

            case ProcessResult.SkipChapter:
                " ==> Skip this chapter".Dump();
                break;

            case ProcessResult.ReachedEnd:
                " ==> End reached".Dump();
                break;
        }

        "".Dump();
    }

    return collected;
}
/// <summary>
/// Walks the cached chapter chain of the active book and compares every cached page
/// with a freshly downloaded copy: process result, queued next URL, next-chapter
/// link, title and chapter content. Differences are printed; content differences
/// additionally get links to open a diff tool or to store the live version in the
/// web cache.
/// </summary>
/// <remarks>
/// The walk follows the next URL derived from the cached page. It stops at the first
/// URL that is not in the cache or whose live download throws.
/// </remarks>
void VerifyChapters()
{
    using WebClient client = new WebClient();
    client.Encoding = Encoding.UTF8;

    Stack<string> buffer = new Stack<string>();
    buffer.Push(ACTIVE_BOOK.StartURL);

    while (buffer.Any())
    {
        var url = buffer.Pop();

        Chapter curr_buffer = new Chapter() { url = url };
        Chapter curr_live = new Chapter() { url = url };

        // Message prefix; evaluated lazily because the title is only known after processing.
        string Tag() => $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

        // Writes both texts to temp files and opens them in the configured compare tool.
        void OpenCompare(string bufferText, string liveText)
        {
            var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
            var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");

            File.WriteAllText(fa, bufferText);
            File.WriteAllText(fb, liveText);

            // Both paths are quoted so temp paths containing spaces stay single arguments.
            Process.Start(Config.COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
        }

        // Only cached pages can be verified (single lookup instead of ContainsKey + indexer).
        if (!webCache.TryGetValue(url.ToLower(), out var cachedResult))
        {
            continue;
        }

        try
        {
            curr_buffer.queryResult = cachedResult;
            curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
        }
        catch (Exception e)
        {
            $"{Tag()} Live reload resulted in exception: {e.Message}".Dump();
            continue;
        }

        var is_diff = false;

        var r_buffer = ProcessChapter(curr_buffer, new List<Chapter>(), _ => { }, out var next_buffer);
        var r_live = ProcessChapter(curr_live, new List<Chapter>(), _ => { }, out var next_live);

        if (next_buffer != null) buffer.Push(next_buffer);

        if (r_buffer != r_live)
        {
            $"{Tag()} Different Process result: {r_buffer} <> {r_live}".Dump();
            is_diff = true;
        }

        // Compare the queued URLs themselves; this check used to repeat the
        // process-result comparison and therefore never detected a changed push URL.
        if (next_buffer != next_live)
        {
            $"{Tag()} Different push URL: {next_buffer} <> {next_live}".Dump();
            is_diff = true;
        }

        if (!Relaxedurleq(curr_buffer.next, curr_live.next))
        {
            $"{Tag()} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
            is_diff = true;
        }

        if (curr_buffer.title != curr_live.title)
        {
            $"{Tag()} Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
            is_diff = true;
        }

        if (curr_buffer.chapter.Value != curr_live.chapter.Value)
        {
            // Markup differences alone are ignored; only differing extracted text counts.
            var clean_buffer = GetChapterText(curr_buffer);
            var clean_live = GetChapterText(curr_live);

            if (clean_buffer.Trim() != clean_live.Trim())
            {
                $"{Tag()} Different content: ".Dump();

                new Hyperlinq(() => OpenCompare(curr_buffer.chapter.Value, curr_live.chapter.Value), "[Compare Raw]").Dump();

                new Hyperlinq(() => OpenCompare(clean_buffer, clean_live), "[Compare Text]").Dump();

                new Hyperlinq(() =>
                {
                    webCache[url.ToLower()] = curr_live.queryResult;
                    SaveCache();
                }, "[Save new version to webcache]").Dump();

                is_diff = true;
            }
        }

        if (!is_diff) $"{Tag()} OK - No differences".Dump();

        if (is_diff) "".Dump();
    }
}
/// <summary>
/// Compares two URLs for equality while ignoring a leading <c>https://</c> or
/// <c>http://</c> scheme prefix.
/// </summary>
/// <param name="a">First URL; may be null.</param>
/// <param name="b">Second URL; may be null.</param>
/// <returns>
/// True when both values are equal (including both null) or differ only in the
/// stripped scheme prefix; false otherwise, in particular when exactly one is null.
/// </returns>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;

    // Both-null was handled above, so a single null means "link on one side only".
    // Previously this case threw a NullReferenceException.
    if (a == null || b == null) return false;

    return StripScheme(a) == StripScheme(b);

    // Removes one leading "https://" and then one leading "http://" (ordinal match).
    static string StripScheme(string url)
    {
        if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
        if (url.StartsWith("http://", StringComparison.Ordinal)) url = url.Substring("http://".Length);
        return url;
    }
}
/// <summary>
/// Converts the chapter markup to text via <c>HTMLToText.ConvertHtml</c>, trims it and
/// collapses every run of whitespace into a single space.
/// </summary>
/// <param name="c">Chapter whose <c>chapter.Value</c> holds the markup.</param>
/// <returns>The normalized text, or an empty string when the markup is null or blank.</returns>
string GetChapterText(Chapter c)
{
    var markup = c.chapter.Value;
    if (string.IsNullOrWhiteSpace(markup))
    {
        return string.Empty;
    }

    var text = HTMLToText.ConvertHtml(markup).Trim();

    return Regex.Replace(text, @"\s+", " ");
}
2023-10-03 16:13:37 +02:00
/// <summary>
/// Parses the downloaded page in <c>curr.queryResult</c> and fills in the chapter's
/// title, source code, flags, next link and cleaned chapter markup.
/// </summary>
/// <param name="curr">Chapter to fill; <c>url</c> and <c>queryResult</c> must be set.</param>
/// <param name="backBuffer">Chapters accepted so far; used to detect loops, the end of the book and already visited URLs.</param>
/// <param name="prt">Sink for progress and diagnostic messages.</param>
/// <param name="forwardQueueNext">Receives the URL to visit next, or null when there is none or it was already visited.</param>
/// <returns>
/// <c>ReachedEnd</c> when a book loop, a chapter after the epilogue or a jump to another
/// book is detected (in that case the chapter markup is not extracted);
/// <c>SkipChapter</c> when the title starts with "not a chapter - "; otherwise
/// <c>SuccessNormal</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The page has no recognizable content, title or chapter node. These cases previously
/// surfaced as a NullReferenceException.
/// </exception>
ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<String> prt, out string forwardQueueNext)
{
    forwardQueueNext = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    // NOTE(review): most XPath expressions below start with "//" although they are
    // evaluated on a sub node; such expressions search the whole document, not just
    // that node. Left unchanged because the removal steps may rely on it - confirm.

    #region Base

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    nodeContent ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    if (nodeContent == null) throw new InvalidOperationException("No content node found in page: " + curr.url);

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    nodeNav ??= doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    nodeNav ??= nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    nodeNav ??= nodeContent;

    var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
    if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");

    // The chapter node is only required from its first use on: the early
    // ReachedEnd returns below must keep working for pages without one.
    void EnsureChapterNode()
    {
        if (nodeChapter == null) throw new InvalidOperationException("No chapter node found in page: " + curr.url);
    }

    #endregion

    #region Title

    var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
    titleNode ??= nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
    titleNode ??= nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
    if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1");

    if (titleNode == null) throw new InvalidOperationException("No title node found in page: " + curr.url);

    curr.title = Helper.TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText));

    // All spellings under which the title may reappear inside the chapter body.
    var titles = new List<string>();
    titles.Add(curr.title);

    if (string.IsNullOrWhiteSpace(curr.title))
    {
        // A blank title used to fall into the "chapter N" parsing below and crash in
        // int.Parse(""); there is nothing to parse, so only report it.
        prt(" [!!] Warning cannot parse title");
    }
    else if (Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*"))
    {
        // Generic "Chapter N ..." heading: look for a more descriptive title in the body.
        EnsureChapterNode();

        var baseTitle = curr.title;
        var lowerTitle = curr.title.ToLower();

        var suffix = Helper.TitleFmt(Regex.Match(lowerTitle, @"^chapter [0-9]+(.*)$").Groups[1].Value);

        var numbered = Regex.Match(lowerTitle, @"^(chapter) ([0-9]+)");
        var digits = numbered.Groups[2].Value;

        // prefix1 keeps the number as written, prefix2 drops leading zeros.
        // TryParse keeps absurdly long numbers from throwing.
        var prefix1 = numbered.Groups[0].Value;
        var prefix2 = int.TryParse(digits, out var chapterNumber) ? "chapter " + chapterNumber : "chapter " + digits;

        titles.Add(prefix1);
        titles.Add(prefix2);

        // Last childless node whose text starts with the prefix and continues for more than 2 characters.
        HtmlNode LeafStartingWith(string prefix) =>
            nodeChapter.Descendants().LastOrDefault(p =>
                !p.HasChildNodes &&
                p.InnerText.Trim().ToLower().StartsWith(prefix) &&
                p.InnerText.Trim().Length - prefix.Length > 2);

        // First node with children, same text condition, containing no <p> / <br markup.
        HtmlNode ContainerStartingWith(string prefix) =>
            nodeChapter.Descendants().FirstOrDefault(p =>
                p.HasChildNodes &&
                p.InnerText.Trim().ToLower().StartsWith(prefix) &&
                p.InnerText.Trim().Length - prefix.Length > 2 &&
                !(p.InnerHtml.Contains("<p>") || p.InnerHtml.Contains("<br")));

        // Takes the text after the prefix as the new title and registers its spellings.
        void AdoptTitle(HtmlNode source, string prefix, bool removeSource)
        {
            var newtitle = Helper.TitleFmt(source.InnerText.Trim().Substring(prefix.Length));

            titles.Add(newtitle);
            curr.title = newtitle;

            titles.Add(prefix1 + newtitle);
            titles.Add(prefix2 + newtitle);
            titles.Add(prefix1 + " - " + newtitle);
            titles.Add(prefix2 + " - " + newtitle);

            if (removeSource)
            {
                source.Remove();
                prt(" > title node removed");
            }
        }

        var altTitleNode1 = LeafStartingWith(prefix1);
        var altTitleNode2 = LeafStartingWith(prefix2);
        var altTitleNode3 = ContainerStartingWith(prefix1);
        var altTitleNode4 = ContainerStartingWith(prefix2);

        if (altTitleNode1 != null)
        {
            AdoptTitle(altTitleNode1, prefix1, false);
        }
        else if (altTitleNode2 != null)
        {
            AdoptTitle(altTitleNode2, prefix2, false);
        }
        else if (altTitleNode3 != null)
        {
            AdoptTitle(altTitleNode3, prefix1, true);
        }
        else if (altTitleNode4 != null)
        {
            AdoptTitle(altTitleNode4, prefix2, true);
        }
        else if (suffix.Length > 2)
        {
            curr.title = suffix;
            titles.Add(suffix);
        }
        else
        {
            prt(" [!!] Warning cannot parse title");
        }

        // A heading that already carries its own text after the number wins over
        // anything found in the body; the alternatives stay registered in `titles`.
        if (suffix.Length > 2)
        {
            curr.title = baseTitle;
            titles.Add(baseTitle);
        }
    }

    // Drop a leading book name (the book's folder name) plus separator characters.
    if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower()))
    {
        var tit_alt = curr.title
            .Substring(ACTIVE_BOOK.Foldername.Length)
            .TrimStart(' ', '\t', '-', ',', ':', '.', '_', ';')
            .Trim();

        if (tit_alt.Length > 2) curr.title = tit_alt;
    }

    #endregion

    curr.sourcecode = "<!DOCTYPE html>\r\n<html>\r\n<body>\r\n" + nodeContent.OuterHtml + "\r\n</body>\r\n</html>\r\n";

    if (backBuffer.Any() && backBuffer.First().title == curr.title)
    {
        prt("[!] Book loop found - skipping entry");
        return ProcessResult.ReachedEnd; // prevent book II loop
    }

    curr.isEpilogue = titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog")) && (ACTIVE_BOOK.SiteType != Site.Royalroad);
    curr.isPrologue = titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"));
    curr.isBonus = titles.Any(t => t.ToLower().Trim().StartsWith("bonus"));

    // For this book only "Epilogue II" ends the book. The previous comparison of a
    // lower-cased title against the mixed-case literal "epilogue II" could never match.
    if (ACTIVE_BOOK == Config.APGTE7) curr.isEpilogue = titles.Any(t => string.Equals(t, "epilogue II", StringComparison.OrdinalIgnoreCase));

    if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus)
    {
        prt("[!] Epilogue found - skipping entry");
        return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue
    }

    prt(curr.title + " (" + curr.url + ")");

    #region Next

    // Book jump: more than four regular chapters all share one leading number
    // ("<n> - ...") and the current title starts with a different number.
    var regularChapters = backBuffer.Where(b => !b.isSpecial).ToList();

    if (regularChapters.Count > 4 &&
        regularChapters.Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 &&
        REX_NUMSTART.Match(regularChapters[0].title).Success &&
        REX_NUMSTART.Match(curr.title).Success &&
        REX_NUMSTART.Match(regularChapters[0].title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value)
    {
        prt("[!] Book jump found - skipping entry");
        return ProcessResult.ReachedEnd;
    }

    bool IsAnchor(HtmlNode p) => p.Name.ToLower() == "a";
    bool HasNextLabel(HtmlNode p) => Helper.Striptease(p) == "next chapter" || Helper.Striptease(p) == "next";
    bool HasRelNext(HtmlNode p) => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next");

    // Lookup strategies in order of preference; the first hit wins.
    var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']");

    next ??= nodeContent.Descendants()
                        .Where(p => IsAnchor(p))
                        .Where(p => HasNextLabel(p))
                        .Where(p => p.Attributes.Contains("href"))
                        .FirstOrDefault();

    next ??= nodeNav.Descendants()
                    .Where(p => IsAnchor(p))
                    .Where(p => HasRelNext(p))
                    .FirstOrDefault();

    next ??= Helper.RecursiveDescendants(nodeContent)
                   .Where(p => IsAnchor(p))
                   .Where(p => HasNextLabel(p))
                   .Where(p => p.Attributes.Contains("href"))
                   .FirstOrDefault();

    next ??= Helper.RecursiveDescendants(nodeContent)
                   .Where(p => IsAnchor(p))
                   .Where(p => HasRelNext(p))
                   .FirstOrDefault();

    if (next != null)
    {
        // rel="next" anchors are not filtered for href, so tolerate a missing attribute
        // (previously a NullReferenceException) and treat an empty link as "no next".
        var next_url = next.GetAttributeValue("href", "").Trim();

        if (next_url == "" || next_url == "." || next_url == "/" || next_url == "./")
        {
            next = null;
        }
        else
        {
            if (next_url.StartsWith("//")) next_url = "http:" + next_url;
            if (next_url.StartsWith("/")) next_url = Helper.CombineAuthority(curr.url, next_url);
            if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = Helper.CombineUri(curr.url, next_url);

            curr.next = next_url;

            // Only queue URLs that were not visited yet.
            if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower()))
            {
                forwardQueueNext = next_url;
            }
        }
    }

    if (next == null) prt(" > (!) No next URL found");

    #endregion

    EnsureChapterNode();

    #region Chapter marker

    // Navigation captions, compared against NakedIdentity() of a node.
    var cpMarkerIdentities = new HashSet<string>
    {
        "previousnext", "previouschapternextchapter",
        "firstnext", "firstchapternextchapter",
        "firstchapter", "previouslast",
        "previouschapterlastchapter",
        "previouschapter", "nextchapter", "lastchapter",
        "first", "previous", "next", "last"
    };

    foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Contains(NakedIdentity(p))).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > Chapter marker removed");
    }

    // Links first, then paragraphs; each list is selected after the previous removals.
    foreach (var markerXPath in new[] { "//a", "//p" })
    {
        var candidates = nodeChapter.SelectNodes(markerXPath);
        if (candidates == null) continue;

        foreach (var node in candidates.Where(p => cpMarkerIdentities.Contains(NakedIdentity(p))).ToList())
        {
            node.Remove();
            prt(" > Chapter marker removed");
        }
    }

    #endregion

    // Removes the given nodes when they are direct children of the chapter node and
    // reports every node that is not.
    void RemoveDirectChildren(IEnumerable<HtmlNode> nodes, string what)
    {
        if (nodes == null) return;

        foreach (var node in nodes)
        {
            if (nodeChapter.ChildNodes.Contains(node))
            {
                nodeChapter.RemoveChild(node);
                prt(" > " + what + " removed");
            }
            else
            {
                prt(" > " + what + " cannot be removed - skipping");
            }
        }
    }

    #region Share Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"), "share div");

    #endregion

    #region Meta Div

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"), "meta div");

    #endregion

    #region Ad Blocking

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."), "ad div");

    RemoveDirectChildren(nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]")?.Where(n => Helper.Striptease(n) == "advertisement"), "ad div");

    #endregion

    #region Title Paragraphs

    // A first paragraph that merely repeats the title.
    var firstParagraph = nodeChapter.SelectNodes(@"p")?.FirstOrDefault();
    if (firstParagraph != null && titles.Any(t => t.ToLower() == Helper.TitleFmt(firstParagraph.InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(firstParagraph))
    {
        nodeChapter.RemoveChild(firstParagraph);
        prt(" > title node removed");
    }

    // Direct h1..h5 children repeating the title.
    for (int hval = 1; hval <= 5; hval++)
    {
        var headings = nodeChapter.SelectNodes(@"h" + hval);
        if (headings == null) continue;

        foreach (var heading in headings.Where(h => titles.Any(t => t.ToLower() == Helper.TitleFmt(h.InnerText).ToLower())))
        {
            if (nodeChapter.ChildNodes.Contains(heading))
            {
                nodeChapter.RemoveChild(heading);
                prt(" > title node removed");
            }
        }
    }

    // Inline elements whose text matches a title spelling.
    foreach (var inlineXPath in new[] { @"//u", @"//span", @"//strong" })
    {
        var inlineNodes = nodeChapter.SelectNodes(inlineXPath);
        if (inlineNodes == null || !inlineNodes.Any()) continue;

        foreach (var titleLike in inlineNodes.Where(n => titles.Any(t => CouldBeTitle(n, t))))
        {
            titleLike.Remove();
            prt(" > title node removed");
        }
    }

    #endregion

    #region Remove <hr>'s

    // FirstOrDefault/LastOrDefault: a chapter without element children must not throw
    // (First()/Last() did).
    while (true)
    {
        var head = nodeChapter.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element);
        if (head == null || head.Name.ToLower() != "hr") break;

        nodeChapter.RemoveChild(head);
        prt(" > header hr removed");
    }

    while (true)
    {
        var tail = nodeChapter.ChildNodes.LastOrDefault(p => p.NodeType == HtmlNodeType.Element);
        if (tail == null || tail.Name.ToLower() != "hr") break;

        nodeChapter.RemoveChild(tail);
        prt(" > footer hr removed");
    }

    #endregion

    #region Other (Author's Node)

    foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList())
    {
        nodeChapter.RemoveChild(node);
        prt(" > authors note removed");
    }

    #endregion

    var chap_html = nodeChapter.InnerHtml.Trim();

    #region Fix raw <hr>

    // KOReader doesn't like <hr>
    chap_html = chap_html.Replace("<hr>", "<hr/>");

    #endregion

    curr.chapter = chap_html;

    if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter;

    return ProcessResult.SuccessNormal;
}
/// <summary>
/// Writes the three per-chapter stash files: the raw query result, the extracted
/// source code and a standalone HTML page holding the title and chapter markup.
/// </summary>
/// <param name="curr">Processed chapter.</param>
/// <param name="index">Chapter number used as zero-padded three-digit file name prefix.</param>
void OutputChapter(Chapter curr, int index)
{
    // Shared file name for the query and html folders.
    string fileName = string.Format("{0:000}", index) + "_" + Helper.Filenamify(curr.title) + ".html";

    File.WriteAllText(QUERY_FOLDER + fileName, curr.queryResult);
    File.WriteAllText(HTML_FOLDER + fileName, curr.sourcecode, Encoding.UTF8);

    var page = new StringBuilder()
        .AppendLine("<!DOCTYPE html>")
        .AppendLine("<html>")
        .AppendLine("<body>")
        .AppendLine()
        .AppendLine("<h1>" + HtmlEntity.Entitize(curr.title) + "</h1>")
        .AppendLine()
        .AppendLine(curr.chapter)
        .AppendLine("</body>")
        .AppendLine("</html>");

    // Here the complete name, including number and extension, goes through Filenamify.
    string epubName = Helper.Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));

    File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), page.ToString(), Encoding.UTF8);
}
/// <summary>
/// Reduces a node's inner text to a comparison key: lower-cased, with the HTML
/// entities <c>&amp;gt;</c>, <c>&amp;lt;</c>, <c>&amp;amp;</c>, <c>&amp;quot;</c> and
/// <c>&amp;nbsp;</c> removed and everything that is not a letter or digit dropped.
/// </summary>
/// <param name="raw">Node whose <c>InnerText</c> is reduced.</param>
/// <returns>The key, e.g. "nextchapter" for a "Next Chapter &amp;gt;" caption.</returns>
string NakedIdentity(HtmlNode raw)
{
    // The entity names must be removed as a whole before filtering; otherwise their
    // letters ("gt", "nbsp", ...) would survive the letter/digit filter. The literals
    // had been reduced to the decoded characters, one of which ("""") did not even
    // form a valid string literal.
    var text = raw
        .InnerText
        .ToLower()
        .Replace("&gt;", "")
        .Replace("&lt;", "")
        .Replace("&amp;", "")
        .Replace("&quot;", "")
        .Replace("&nbsp;", "");

    // The text is already lower-case and contains no whitespace after filtering, so
    // no further ToLower/Trim is needed.
    return string.Concat(text.Where(c => char.IsLetterOrDigit(c)));
}
/// <summary>
/// Checks whether the stripped text of a node matches the chapter title, ignoring case, the
/// characters <c>: - ( )</c> and runs of whitespace.
/// </summary>
/// <param name="n">Candidate node.</param>
/// <param name="title">Chapter title to compare against.</param>
/// <returns><c>true</c> when both texts are equal after normalisation.</returns>
bool CouldBeTitle(HtmlNode n, string title)
{
    // Shared clean-up for both sides of the comparison.
    static string Normalize(string text)
    {
        var cleaned = text.ToLower();
        foreach (var mark in new[] { ":", "-", "(", ")" })
        {
            cleaned = cleaned.Replace(mark, "");
        }

        // Runs of two or more whitespace characters are removed entirely (not collapsed to one).
        return Regex.Replace(cleaned, @"\s\s+", "");
    }

    var nodeText = Helper.Striptease(n);
    var titleText = Helper.Striptease(title);

    return Normalize(nodeText) == Normalize(titleText);
}
/// <summary>
/// Packs the book into a zip archive in the stash folder (mimetype, container, package document,
/// table of contents and one file per chapter), then copies it to the stash epub and to the
/// epub output file.
/// </summary>
/// <param name="chapters">Chapters in reading order; position <c>i</c> is stored as file number <c>i + 1</c>.</param>
void WriteEpub(List<Chapter> chapters)
{
    // Remove leftovers of a previous run; the copy to the stash epub below does not overwrite.
    foreach (var stale in new[] { EPUB_FILE_STASH, ZIP_FILE_STASH })
    {
        if (File.Exists(stale)) File.Delete(stale);
    }

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Both streams have to be closed before the archive is copied, hence explicit using blocks.
    using (var archive = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    using (var zipbook = new ZipOutputStream(archive))
    {
        // The mimetype entry is written first (the EPUB container format expects it as the first entry).
        WritePubString(zipbook, @"mimetype", GetEpubMimetype());
        WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
        WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
        WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

        var position = 0;
        foreach (var chapter in chapters)
        {
            var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", position + 1, Helper.Filenamify(chapter.title, true));
            WritePubString(zipbook, entryName, GetEpubChapterFile(chapter, position));
            position++;
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}
/// <summary>
/// Converts the stash epub into a MOBI file by running the external <c>ebook-convert</c> tool
/// and copies the result to the mobi output file.
/// </summary>
/// <exception cref="Exception">
/// <c>ebook-convert</c> finished with a non-zero exit code; the message carries the exit code and
/// the combined output of the process.
/// </exception>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);

    "Running ebook-convert for MOBI output".Dump();

    // Input and output path are each wrapped in double quotes (no padding inside the quotes),
    // so that paths containing spaces are passed as a single argument.
    var arguments = $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999";
    var pout = ProcessHelper.ProcExecute("ebook-convert", arguments);

    $"ebook-convert returned: {pout.ExitCode}".Dump();

    if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}
/// <summary>
/// Appends one text entry to the zip stream, stored without compression.
/// </summary>
/// <param name="z">Zip stream the entry is added to.</param>
/// <param name="n">Name of the entry inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Encoding used to convert the text into bytes; UTF-8 when <c>null</c>.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    var payload = (e ?? Encoding.UTF8).GetBytes(c);
    z.Write(payload, 0, payload.Length);
}
/// <summary>
/// Content of the <c>mimetype</c> entry of the epub archive.
/// </summary>
/// <returns>The constant string <c>application/epub+zip</c>.</returns>
string GetEpubMimetype() => "application/epub+zip";
/// <summary>
/// Builds <c>META-INF/container.xml</c>, which points at <c>OEBPS/content.opf</c> as the root
/// file of the book.
/// </summary>
/// <returns>The serialized document, trimmed and terminated by a single CRLF.</returns>
string GetEpubContainerXML()
{
    XNamespace ocf = "urn:oasis:names:tc:opendocument:xmlns:container";

    var rootfile = new XElement(ocf + "rootfile",
        new XAttribute("full-path", "OEBPS/content.opf"),
        new XAttribute("media-type", "application/oebps-package+xml"));

    var container = new XElement(ocf + "container",
        new XAttribute("version", "1.0"),
        new XElement(ocf + "rootfiles", rootfile));

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null), container);

    using (var writer = new Utf8StringWriter())
    {
        doc.Save(writer);

        // Normalise the encoding name in the XML declaration to its upper-case spelling.
        var xml = writer.ToString().Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");

        return xml.Trim() + "\r\n";
    }
}
/// <summary>
/// Builds the OPF 2.0 package document (<c>content.opf</c>): metadata of the active book, a
/// manifest with one item per chapter plus the NCX, the spine in chapter order and an empty guide.
/// </summary>
/// <param name="chapters">Chapters in reading order; position <c>i</c> is referenced as file number <c>i + 1</c>.</param>
/// <returns>The serialized XML document.</returns>
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    // The dates are machine-readable metadata: format them with the invariant culture so that the
    // calendar of the current culture cannot change the digits, and sample the clock only once so
    // that all "now" dates of one document agree.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    const string DateFormat = "yyyy'-'MM'-'dd";
    var today = DateTime.Now.ToString(DateFormat, invariant);

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));
    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "publication"),
            ACTIVE_BOOK.Release.ToString(DateFormat, invariant)),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "modification"),
            today),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "creation"),
            today),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        new XElement(opf + "meta",
            new XAttribute("content", "1.0"),
            new XAttribute("name", "Wordpress_eBook_scraper_version")),
        new XElement(opf + "meta",
            new XAttribute("content", today),
            new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    // Series information is optional and only written when a series is set.
    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }
    package.Add(meta);

    // Manifest: one item per chapter file. The ids are remembered so the spine references exactly
    // the same values.
    var manifest = new XElement(opf + "manifest");
    var chapterIds = new List<string>(chapters.Count);
    for (int i = 0; i < chapters.Count; i++)
    {
        var fileTitle = Helper.Filenamify(chapters[i].title, true);
        var id = string.Format("x{0:000}_{1}.html", i + 1, fileTitle);
        chapterIds.Add(id);

        manifest.Add(new XElement(opf + "item",
            // href is a URI reference, therefore the title part is escaped
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(fileTitle))),
            new XAttribute("id", id),
            new XAttribute("media-type", "application/xhtml+xml")));
    }
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));
    package.Add(manifest);

    // Spine: reading order, referencing the manifest ids.
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
    foreach (var id in chapterIds)
    {
        spine.Add(new XElement(opf + "itemref", new XAttribute("idref", id)));
    }
    package.Add(spine);

    package.Add(new XElement(opf + "guide"));

    using Utf8StringWriter writer = new Utf8StringWriter();
    doc.Save(writer);
    return writer.ToString();
}
/// <summary>
/// Builds the NCX table of contents (<c>toc.ncx</c>): a head block carrying the book uid and a
/// flat navMap with one navPoint per chapter. The docTitle is the fixed text "Unknown".
/// </summary>
/// <param name="chapters">Chapters in reading order; position <c>i</c> gets play order <c>i + 1</c>.</param>
/// <returns>The serialized XML document.</returns>
string GetEpubTOC(List<Chapter> chapters)
{
    // NCX elements belong to the DAISY NCX namespace (not to the OPF namespace of content.opf).
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            new XElement(ncx + "meta",
                // same uid as the BookId identifier written by GetEpubContentOPF
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));
    doc.Add(root);

    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", "Unknown")));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        // src is a URI reference; the title part is escaped the same way as the manifest href in
        // GetEpubContentOPF so that both documents point at the same target.
        var src = string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(Helper.Filenamify(chapters[i].title, true)));

        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", src))));
    }
    root.Add(nav);

    using Utf8StringWriter writer = new Utf8StringWriter();
    doc.Save(writer);
    return writer.ToString();
}
/// <summary>
/// Wraps the processed chapter markup into the XHTML page that is stored in the epub: XML
/// declaration, XHTML 1.1 doctype, the entitized title as page title and first heading, then
/// the chapter body.
/// </summary>
/// <param name="chapter">Chapter providing <c>title</c> and <c>chapter</c> (the processed markup).</param>
/// <param name="idx">Position of the chapter; not used by this method.</param>
/// <returns>The complete page, every line terminated by the platform newline.</returns>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    var entitizedTitle = HtmlEntity.Entitize(chapter.title);

    var lines = new[]
    {
        @"<?xml version=""1.0"" encoding=""utf-8""?>",
        @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//EN"" ""http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"" > ",
        @"<html xmlns=""http://www.w3.org/1999/xhtml"">",
        @"<head>",
        "<title>" + entitizedTitle + "</title>",
        @"</head>",
        @"<body>",
        "<h1>" + entitizedTitle + "</h1>",
        chapter.chapter,
        @"</body>",
        @"</html>",
    };

    // Every line, the last one included, ends with a newline.
    return string.Concat(lines.Select(line => line + Environment.NewLine));
}
}