commit ccfeedc067669290d7c0ead118e40daab9d58a3f Author: Mike Schwörer Date: Sun Aug 20 15:10:10 2023 +0200 Copied project from LinqPad diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b5e037d --- /dev/null +++ b/.gitignore @@ -0,0 +1,757 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/csharp,rider,aspnetcore,visualstudio,linux,windows +# Edit at https://www.toptal.com/developers/gitignore?templates=csharp,rider,aspnetcore,visualstudio,linux,windows + +### ASPNETCore ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# DNX +project.lock.json +project.fragment.lock.json +artifacts/ + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool 
+*.dotCover + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/packages/* +# except build/, which is used as an MSBuild target. 
+!**/packages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config +# NuGet v3's project.json files produces more ignoreable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +node_modules/ +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/ + +### Csharp ### +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser + +# User-specific files (MonoDevelop/Xamarin Studio) + +# Mono auto generated files +mono_crash.* + +# Build results +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +# Uncomment if you have tasks that create the project's static files in wwwroot + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results + +# NUnit +nunit-*.xml + +# Build Results of an ATL Project + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_h.h +*.iobj +*.ipdb +*_wpftmp.csproj + +# Chutzpah Test files + +# Visual C++ cache files + +# Visual Studio profiler + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace + +# Guidance Automation Toolkit + +# ReSharper is a .NET coding add-in + +# TeamCity is a build add-in + 
+# DotCover is a Code Coverage Tool + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results + +# NCrunch + +# MightyMoose + +# Web workbench (sass) + +# Installshield output folder + +# DocProject is a documentation generator add-in + +# Click-Once directory + +# Publish Web Output +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted + +# NuGet Packages +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files + +# Microsoft Azure Build Output + +# Microsoft Azure Emulator + +# Windows Store app package directories and files +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) + +# RIA/Silverlight projects + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.ndf + +# Business Intelligence projects +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes + +# GhostDoc plugin setting file + +# Node.js Tools for Visual Studio + +# Visual Studio 6 build log + +# Visual Studio 6 workspace options file + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output + +# Paket dependency manager + +# FAKE - F# Make + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +### Linux ### + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Rider ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: 
https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudio ### + +# User-specific files + +# User-specific files (MonoDevelop/Xamarin Studio) + +# Mono auto generated files + +# Build results + +# Visual Studio 2015/2017 cache/options directory +# 
Uncomment if you have tasks that create the project's static files in wwwroot + +# Visual Studio 2017 auto generated files + +# MSTest test Results + +# NUnit + +# Build Results of an ATL Project + +# Benchmark Results + +# .NET Core + +# StyleCop + +# Files built by Visual Studio + +# Chutzpah Test files + +# Visual C++ cache files + +# Visual Studio profiler + +# Visual Studio Trace Files + +# TFS 2012 Local Workspace + +# Guidance Automation Toolkit + +# ReSharper is a .NET coding add-in + +# TeamCity is a build add-in + +# DotCover is a Code Coverage Tool + +# AxoCover is a Code Coverage Tool + +# Coverlet is a free, cross platform Code Coverage Tool + +# Visual Studio code coverage results + +# NCrunch + +# MightyMoose + +# Web workbench (sass) + +# Installshield output folder + +# DocProject is a documentation generator add-in + +# Click-Once directory + +# Publish Web Output +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted + +# NuGet Packages +# NuGet Symbol Packages +# The packages folder can be ignored because of Package Restore +# except build/, which is used as an MSBuild target. 
+# Uncomment if necessary however generally it will be regenerated when needed +# NuGet v3's project.json files produces more ignorable files + +# Microsoft Azure Build Output + +# Microsoft Azure Emulator + +# Windows Store app package directories and files + +# Visual Studio cache files +# files ending in .cache can be ignored +# but keep track of directories ending in .cache + +# Others + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) + +# RIA/Silverlight projects + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) + +# SQL Server files + +# Business Intelligence projects + +# Microsoft Fakes + +# GhostDoc plugin setting file + +# Node.js Tools for Visual Studio + +# Visual Studio 6 build log + +# Visual Studio 6 workspace options file + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+ +# Visual Studio LightSwitch build output + +# Paket dependency manager + +# FAKE - F# Make + +# CodeRush personal settings + +# Python Tools for Visual Studio (PTVS) + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio + +# Telerik's JustMock configuration file + +# BizTalk build output + +# OpenCover UI analysis results + +# Azure Stream Analytics local run output + +# MSBuild Binary and Structured Log + +# NVidia Nsight GPU debugger configuration file + +# MFractors (Xamarin productivity tool) working folder + +# Local History for Visual Studio + +# BeatPulse healthcheck temp database + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 + +# Ionide (cross platform F# VS Code tools) working folder + +# End of https://www.toptal.com/developers/gitignore/api/csharp,rider,aspnetcore,visualstudio,linux,windows + diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/.gitignore b/.idea/.idea.WordpressEboobScraper2/.idea/.gitignore new file mode 100644 index 0000000..e1cfbf4 --- /dev/null +++ b/.idea/.idea.WordpressEboobScraper2/.idea/.gitignore @@ -0,0 +1,13 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Rider ignored files +/projectSettingsUpdater.xml +/contentModel.xml +/modules.xml +/.idea.WordpressEboobScraper2.iml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/encodings.xml b/.idea/.idea.WordpressEboobScraper2/.idea/encodings.xml new file mode 100644 index 0000000..df87cf9 --- /dev/null +++ b/.idea/.idea.WordpressEboobScraper2/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/indexLayout.xml b/.idea/.idea.WordpressEboobScraper2/.idea/indexLayout.xml new file mode 100644 index 0000000..7b08163 --- /dev/null +++ b/.idea/.idea.WordpressEboobScraper2/.idea/indexLayout.xml @@ 
-0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml new file mode 100644 index 0000000..288b36b --- /dev/null +++ b/.idea/.idea.WordpressEboobScraper2/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/Program.cs b/Program.cs new file mode 100644 index 0000000..597fb01 --- /dev/null +++ b/Program.cs @@ -0,0 +1,1662 @@
/** *************************************************** **/
/** **/
/** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/
/** **/
/** *************************************************** **/

// Hard-coded absolute Windows paths: working ("stash") directory, final output directory,
// and the location of an external compare tool.
const string BASE_DIR_STASH = @"F:\Stash\eBook_scraper\";
const string BASE_DIR_OUT = @"F:\Home\Cloud\Dokumente\E-Books\Scraper\";
const string COMPARE_PROG = @"C:\Program Files\Beyond Compare 4\BCompare.exe";

//----------------------------------------------------------------------------------------------------//

// Book table. Constructor arguments are:
//   (site, [series, seriesIndex,] title, author, release "yyyy-MM-dd", language, startURL)
// The release string is parsed with DateTime.ParseExact in the EpubParameter constructor,
// so a malformed date fails at type-initialization time.

static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/");
static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/");
static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/");
static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/");
static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/");

// NOTE(review): APGTE1 release is "2015-03-24" but its start URL is dated 2015/03/25 - confirm which is intended.
static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/");
static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/");
static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/");
static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/");
// NOTE(review): APGTE5 release is "2019-01-05" but its start URL is dated 2019/01/14 - confirm which is intended.
static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/");
static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/");
static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/");

static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/");
static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/");
static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/");
static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/");

static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", @"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/");

static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/");

// The TGABx_y identifiers carry volume/book numbering; the series index passed to the
// constructor runs continuously from 1 to 15.
static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/");
static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/");
static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/");
static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/");
static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/");
static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/");
static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/");
static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/");
static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/");
static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/");
static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/");
static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/");
static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/");
static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/");
static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/");

static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/");

static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue");

static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends");

static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall");

static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/");

// NOTE(review): WI release is "2016-06-27" but its start URL is dated 2016/07/27 - looks like a month typo; confirm.
static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/");

static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html");

static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother");

static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again");

static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave");

//----------------------------------------------------------------------------------------------------//

// Run configuration. Generate() and Verify() iterate over BOOKS in order.
readonly EpubParameter[] BOOKS = new[] { TPR };

// When true, Init() calls LoadWebCache() before scraping.
readonly bool USE_WEBCACHE = true;
// Not referenced in the part of the file visible here - presumably re-fetches the newest cached chapter; verify at the use site.
readonly bool DO_LIVE_RELOAD_OF_LAST = true;
// When true, Generate() calls GenerateMobi() after writing the epub.
readonly bool CONVERT_MOBI = true;

// Selects what Main() runs: Generate() or Verify().
readonly MainMode MODE = MainMode.Generate;

//----------------------------------------------------------------------------------------------------//

// Book currently being processed; assigned by Generate()/Verify() at the start of each loop
// iteration and read by every path property below (null until then).
static EpubParameter ACTIVE_BOOK = null;

// Not referenced in the part of the file visible here - presumably a safety cap on the number of chapters; verify at the use site.
const int LIMIT = 1500;

// NOTE(review): "(?[0-9]+)" is not a valid .NET grouping construct; the pattern looks like it lost
// a named-group name ("(?<name>...)") when this text was extracted - restore it from the real file.
readonly Regex REX_NUMSTART = new Regex(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled);

// NOTE(review): the generic type arguments of this Dictionary are missing from this text
// (likely stripped together with other angle-bracket content) - restore them from the real file.
Dictionary webCache = new Dictionary();

// Per-book working directory below BASE_DIR_STASH.
string STASH_FOLDER => BASE_DIR_STASH + ACTIVE_BOOK.Foldername + @"\";

// Final artefacts below BASE_DIR_OUT, one sub-folder per kind (Init() creates the sub-folders).
// Note that the web cache file lives in the output tree, not in the stash.
string WCACHE_FILE => BASE_DIR_OUT + @"_cache\" + ACTIVE_BOOK.Foldername + @".xml";
string HTML_FILE_OUT => BASE_DIR_OUT + @"html\" + ACTIVE_BOOK.Foldername + @".html";
string EPUB_FILE_OUT => BASE_DIR_OUT + @"epub\" + ACTIVE_BOOK.Foldername + @".epub";
string MOBI_FILE_OUT => BASE_DIR_OUT + @"mobi\" + ACTIVE_BOOK.Foldername + @".mobi";

// Intermediate artefacts inside the stash folder; Init() deletes them at the start of a run.
string HTML_FILE_STASH => STASH_FOLDER + @"book.html";
string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip";
string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub";
string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi";

string QUERY_FOLDER => STASH_FOLDER + @"query\"; // full query result
string HTML_FOLDER => STASH_FOLDER + @"html\"; // unprocessed chapter code
string EPUB_FOLDER => STASH_FOLDER + @"epub\"; // processed epub chapter code

//----------------------------------------------------------------------------------------------------//

// Top-level operation selected through MODE.
public enum MainMode
{
    Generate,
    Verify,
}

// Result codes for processing a chapter; the code that produces and consumes them is not visible here.
public enum ProcessResult
{
    SuccessNormal,
    ReachedEnd,
    SkipChapter,
}

// Kind of site a book is scraped from. WP/WW/RR are short aliases with the same
// underlying values as the long names.
public enum Site
{
    Wordpress,
    WuxiaWorld,
    Royalroad,

    WP = Wordpress,
    WW = WuxiaWorld,
    RR = Royalroad,
}

/// <summary>
/// Data record for a single scraped chapter: where it came from, the link to the next
/// chapter, the stored text at its different stages, and classification flags.
/// </summary>
public class Chapter
{
    public string url;
    public string title;
    public string next;

    // Stored gzip-compressed when serialized (see GZippedString).
    // Presumably these correspond to the query\, html\ and epub\ stash folders - verify at the use site.
    public GZippedString queryResult;
    public GZippedString sourcecode;
    public GZippedString chapter;

    public bool isPrologue;
    public bool isEpilogue;
    public bool isBonus;

    /// <summary>True when any of the prologue / epilogue / bonus flags is set.</summary>
    public bool isSpecial => isPrologue || isEpilogue || isBonus;
}

/// <summary>
/// One URL/content pair in a form that can be XML-serialized; the content is stored compressed.
/// </summary>
public class SerializableCacheEntry
{
    public string URL;
    public GZippedString Content;
}

/// <summary>
/// String wrapper that XML-serializes itself as Base64 text. The decoded bytes are a 4-byte
/// length prefix (the UTF-8 byte count, as produced by <c>BitConverter.GetBytes</c>) followed
/// by the gzip-compressed UTF-8 data. Converts implicitly from and to <see cref="string"/>.
/// </summary>
public class GZippedString : IXmlSerializable
{
    public string Value { get; set; }

    public System.Xml.Schema.XmlSchema GetSchema() { return null; }

    /// <summary>Reads the Base64 payload of the current element and decompresses it into <see cref="Value"/>.</summary>
    public void ReadXml(System.Xml.XmlReader reader)
    {
        Value = DecompressString(reader.ReadString());
        reader.ReadEndElement();
    }

    /// <summary>Writes <see cref="Value"/> as compressed Base64 text.</summary>
    public void WriteXml(System.Xml.XmlWriter writer)
    {
        writer.WriteString(CompressString(Value));
    }

    /// <summary>
    /// Compresses <paramref name="text"/> into the Base64 format described on the class.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    private string CompressString(string text)
    {
        byte[] raw = Encoding.UTF8.GetBytes(text);

        using (var memoryStream = new MemoryStream())
        {
            // leaveOpen: true - the memory stream is still needed after the gzip stream has
            // been disposed (disposing is what flushes the final gzip block).
            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
                gZipStream.Write(raw, 0, raw.Length);

            byte[] compressed = memoryStream.ToArray();

            var framed = new byte[compressed.Length + 4];
            Buffer.BlockCopy(BitConverter.GetBytes(raw.Length), 0, framed, 0, 4);
            Buffer.BlockCopy(compressed, 0, framed, 4, compressed.Length);
            return Convert.ToBase64String(framed);
        }
    }

    /// <summary>
    /// Reverses <see cref="CompressString"/>.
    /// </summary>
    /// <exception cref="InvalidDataException">
    /// The compressed data ends before the number of bytes announced in the length prefix was produced.
    /// </exception>
    private string DecompressString(string compressedText)
    {
        byte[] gZipBuffer = Convert.FromBase64String(compressedText);
        int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
        var buffer = new byte[dataLength];

        using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
        using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
        {
            // A single Read() call may return fewer bytes than requested, which would leave
            // the rest of the buffer zero-filled. Keep reading until the buffer is full.
            int filled = 0;
            while (filled < buffer.Length)
            {
                int read = gZipStream.Read(buffer, filled, buffer.Length - filled);
                if (read == 0)
                    throw new InvalidDataException("Compressed string is truncated: expected " + buffer.Length + " bytes but got " + filled + ".");
                filled += read;
            }
        }

        return Encoding.UTF8.GetString(buffer);
    }

    public static implicit operator GZippedString(string v) => new GZippedString{Value = v};

    // Null-safe: fields of this type may be unassigned, and converting a null wrapper
    // should yield a null string rather than a NullReferenceException.
    public static implicit operator string (GZippedString v) => v?.Value;

}

/// <summary>StringWriter that reports UTF-8 as its encoding instead of the StringWriter default.</summary>
public class Utf8StringWriter : StringWriter
{
    public override Encoding Encoding => Encoding.UTF8;
}

/// <summary>
/// Immutable description of one book: metadata, start URL, site type, the folder/file name
/// derived from series and title, and two GUIDs derived from title and author.
/// </summary>
public class EpubParameter
{
    public readonly string Series;
    public readonly int SeriesIndex;
    public readonly Guid ID_OPF;
    public readonly Guid ID_CAL;
    public readonly string Title;
    public readonly string Author;
    public readonly DateTime Release;
    public readonly string Language;
    public readonly string StartURL;
    public readonly string Foldername;
    public readonly Site SiteType;

    /// <summary>
    /// Author in "Last, First Middle" form. The last space-separated part is treated as the
    /// surname; a name without spaces is returned unchanged.
    /// </summary>
    public string AuthorSort
    {
        get
        {
            var parts = Author.Split(' ');
            if (parts.Length < 2) return Author;

            // Keep the given names in their original order ("D. D. Webb" -> "Webb, D. D.").
            return parts[parts.Length - 1] + ", " + string.Join(" ", parts, 0, parts.Length - 1);
        }
    }

    /// <summary>Creates a stand-alone book (no series; <see cref="SeriesIndex"/> is -1).</summary>
    public EpubParameter(Site st, string t, string a, string r, string l, string s) : this(st, null, -1, t, a, r, l, s) { }

    /// <summary>Creates a book that is part of a series.</summary>
    /// <param name="st">Site type the book is scraped from.</param>
    /// <param name="z">Series name, or null for a stand-alone book.</param>
    /// <param name="i">Index of the book within the series.</param>
    /// <param name="t">Book title.</param>
    /// <param name="a">Author name.</param>
    /// <param name="r">Release date in "yyyy-MM-dd" format.</param>
    /// <param name="l">Language code.</param>
    /// <param name="s">Start URL.</param>
    /// <exception cref="FormatException"><paramref name="r"/> is not a valid "yyyy-MM-dd" date.</exception>
    public EpubParameter(Site st, string z, int i, string t, string a, string r, string l, string s)
    {
        SiteType = st;
        Series = z;
        SeriesIndex = i;
        Title = t;
        Author = a;
        Release = DateTime.ParseExact(r, "yyyy-MM-dd", CultureInfo.InvariantCulture);
        Language = l;
        StartURL = s;
        Foldername = (z == null)
            ? Filenamify(t)
            : string.Format("{0} {1} - {2}", Filenamify(z), i, Filenamify(t));

        // The IDs are meant to be reproducible for a given title/author, so the seed must not
        // depend on string.GetHashCode(), whose value is not stable across processes/runtimes.
        var u = new Random(StableHash(Title) ^ StableHash(Author));
        var g = new byte[16];
        u.NextBytes(g);
        ID_OPF = new Guid(g);
        u.NextBytes(g);
        ID_CAL = new Guid(g);
    }

    /// <summary>32-bit FNV-1a hash over the UTF-16 code units of <paramref name="text"/>; same result on every run.</summary>
    private static int StableHash(string text)
    {
        unchecked
        {
            uint hash = 2166136261;
            foreach (char c in text)
            {
                hash ^= c;
                hash *= 16777619;
            }
            return (int)hash;
        }
    }

    /// <summary>Title, prefixed with series name and index when the book belongs to a series.</summary>
    public String DisplayStr => (Series == null) ? $"{Title}" : $"{Series} {SeriesIndex} - {Title}";
}

//----------------------------------------------------------------------------------------------------//

/// <summary>Entry point: runs the operation selected by MODE.</summary>
void Main()
{
    Util.AutoScrollResults = true;

    switch (MODE)
    {
        case MainMode.Generate:
            Generate();
            break;
        case MainMode.Verify:
            Verify();
            break;
    }
}

/// <summary>
/// For every book in BOOKS: prints a banner, resets the working folders, collects the
/// chapters and writes the HTML and epub output (plus mobi when CONVERT_MOBI is set).
/// </summary>
void Generate()
{
    foreach (var bb in BOOKS)
    {
        ACTIVE_BOOK = bb;

        var caption = $" [PROCESSING BOOK] {bb.DisplayStr} ";
        var rule = new string('=', caption.Length);

        for (var n = 0; n < 3; n++) $"".Dump();
        rule.Dump();
        caption.Dump();
        rule.Dump();
        for (var n = 0; n < 3; n++) $"".Dump();

        Init();

        var chapters = FindChapters();

        WriteBookHTML(chapters);
        WriteEpub(chapters);
        if (CONVERT_MOBI) GenerateMobi();
    }
}

/// <summary>
/// For every book in BOOKS: prints a banner, loads the web cache and runs VerifyChapters().
/// </summary>
void Verify()
{
    foreach (var bb in BOOKS)
    {
        ACTIVE_BOOK = bb;

        var caption = $" [VERIFYING BOOK] {bb.DisplayStr} ";
        var rule = new string('=', caption.Length);

        for (var n = 0; n < 3; n++) $"".Dump();
        rule.Dump();
        caption.Dump();
        rule.Dump();
        for (var n = 0; n < 3; n++) $"".Dump();

        LoadWebCache();

        VerifyChapters();
    }
}

/// <summary>
/// Prepares the folders for ACTIVE_BOOK: removes files left over from an earlier run and
/// (re)creates the stash and output directories.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Deletes the files directly inside each immediate sub-folder of the stash folder;
        // the folders themselves are kept.
        Directory.EnumerateDirectories(STASH_FOLDER).ToList().ForEach(d => Directory.EnumerateFiles(d).ToList().ForEach(File.Delete));
        if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH);
        if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);
        if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
        if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);
    }

    Directory.CreateDirectory(STASH_FOLDER);
    Directory.CreateDirectory(QUERY_FOLDER);
    Directory.CreateDirectory(HTML_FOLDER);
    Directory.CreateDirectory(EPUB_FOLDER);

    Directory.CreateDirectory(BASE_DIR_OUT + @"_cache\");
    Directory.CreateDirectory(BASE_DIR_OUT + @"html\");
    Directory.CreateDirectory(BASE_DIR_OUT + @"epub\");
Directory.CreateDirectory(BASE_DIR_OUT + @"mobi\"); + + if (USE_WEBCACHE) LoadWebCache(); +} + +void WriteBookHTML(List chapters) +{ + StringBuilder b = new StringBuilder(); + + b.AppendLine(""); + b.AppendLine(""); + b.AppendLine(""); + + foreach (var currChapter in chapters) + { + b.AppendLine(); + b.AppendLine("

" + HtmlEntity.Entitize(currChapter.title) + "

"); + b.AppendLine(); + b.AppendLine(currChapter.chapter); + } + + b.AppendLine(""); + b.AppendLine(""); + + File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8); + File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true); +} + +void SaveCache() +{ + var xs = new XmlSerializer(typeof(List)); + using (var writer = new System.IO.StreamWriter(WCACHE_FILE)) + { + xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList()); + } +} + +void LoadWebCache() +{ + if (!File.Exists(WCACHE_FILE)) return; + + XmlSerializer deserializer = new XmlSerializer(typeof(List)); + using (TextReader reader = new StreamReader(WCACHE_FILE)) + { + var result = new List(); + + var l = (List)deserializer.Deserialize(reader); + + webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); + } +} + +List FindChapters() +{ + List result = new List(); + + using (WebClient client = new WebClient()) + { + client.Encoding = Encoding.UTF8; + Stack buffer = new Stack(); + buffer.Push(ACTIVE_BOOK.StartURL); + + while (buffer.Any() && result.Count < LIMIT) + { + var url = buffer.Pop(); + Chapter curr = new Chapter() { url = url }; + + var buffered = webCache.ContainsKey(url.ToLower()); + if (buffered) + { + curr.queryResult = webCache[url.ToLower()]; + "*(loaded from webcache)*".Dump(); + } + else + { + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + } + + var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); + if (next_url != null) buffer.Push(next_url); + + if (buffered && buffer.Count == 0 && DO_LIVE_RELOAD_OF_LAST) + { + "".Dump(); + "//==> *(auto-reload from live)*".Dump(); + "".Dump(); + curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); + webCache[url.ToLower()] = curr.queryResult; + SaveCache(); + + r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); + if (next_url_inner != 
null) buffer.Push(next_url_inner);
            }
            if (r == ProcessResult.SuccessNormal)
            {
                " ==> Chapter processed".Dump();
                result.Add(curr);
                OutputChapter(curr, result.Count);
            }
            else if (r == ProcessResult.SkipChapter)
            {
                " ==> Skip this chapter".Dump();
            }
            else if (r == ProcessResult.ReachedEnd)
            {
                " ==> End reached".Dump();
            }


            "".Dump();
        }
    }

    return result;
}

/// <summary>
/// Walks the chapter chain of ACTIVE_BOOK starting at its StartURL. For every URL that
/// is present in the web cache, the page is downloaded again, both versions are run
/// through ProcessChapter and any difference (process result, queued URL, next link,
/// title, content) is dumped. The walk stops at the first URL that is not cached.
/// </summary>
void VerifyChapters()
{
    // Stays empty: it is only handed to ProcessChapter as its back buffer.
    var result = new List<Chapter>();

    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        var buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);

        while (buffer.Any() && result.Count < LIMIT)
        {
            var url = buffer.Pop();
            Chapter curr_buffer = new Chapter() { url = url };
            Chapter curr_live = new Chapter() { url = url };

            // Nothing cached for this URL: nothing to compare and nothing queued.
            if (!webCache.ContainsKey(url.ToLower())) continue;

            try
            {
                curr_buffer.queryResult = webCache[url.ToLower()];
                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            }
            catch (Exception e)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
                continue;
            }

            var is_diff = false;

            var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
            var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);

            // The walk always follows the cached version's next link.
            if (next_buffer != null) buffer.Push(next_buffer);

            // Built after ProcessChapter because that call assigns the title.
            var label = $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}]";

            if (r_buffer != r_live)
            {
                $"{label} Different Process result: {r_buffer} <> {r_live}".Dump();
                is_diff = true;
            }

            // Compares the queued URLs themselves (the original repeated the
            // process-result comparison here, so this message could never fire on its own).
            if (!Relaxedurleq(next_buffer, next_live))
            {
                $"{label} Different push URL: {next_buffer} <> {next_live}".Dump();
                is_diff = true;
            }

            if (!Relaxedurleq(curr_buffer.next, curr_live.next))
            {
                $"{label} Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
                is_diff = true;
            }

            if (curr_buffer.title != curr_live.title)
            {
                $"{label} Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
                is_diff = true;
            }

            if (curr_buffer.chapter?.Value != curr_live.chapter?.Value)
            {
                var clean_buffer = GetChapterText(curr_buffer);
                var clean_live = GetChapterText(curr_live);

                // Only report when the visible text differs, not just the markup.
                if (clean_buffer.Trim() != clean_live.Trim())
                {
                    // Writes both versions to temp files and opens them in COMPARE_PROG.
                    Action<string, string> launchCompare = (left, right) =>
                    {
                        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
                        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
                        File.WriteAllText(fa, left);
                        File.WriteAllText(fb, right);
                        Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
                    };

                    $"{label} Different content: ".Dump();
                    new Hyperlinq(() => launchCompare(curr_buffer.chapter?.Value, curr_live.chapter?.Value), "[Compare Raw]").Dump();
                    new Hyperlinq(() => launchCompare(clean_buffer, clean_live), "[Compare Text]").Dump();
                    new Hyperlinq(() =>
                    {

                        webCache[url.ToLower()] = curr_live.queryResult;
                        SaveCache();

                    }, "[Save new version to webcache]").Dump();

                    is_diff = true;
                }
            }

            if (!is_diff) $"{label} OK - No differences".Dump();

            if (is_diff) "".Dump();
        }
    }
}

/// <summary>
/// URL equality that ignores a leading "https://" / "http://".
/// Two nulls are equal; a null never equals a non-null URL.
/// </summary>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;
    if (a == null || b == null) return false;

    return StripUrlScheme(a) == StripUrlScheme(b);
}

/// <summary>Removes a leading "https://" and then a leading "http://" from <paramref name="url"/>.</summary>
string StripUrlScheme(string url)
{
    if (url.StartsWith("https://", StringComparison.Ordinal)) url = url.Substring("https://".Length);
    if (url.StartsWith("http://", StringComparison.Ordinal)) url = url.Substring("http://".Length);
    return url;
}

/// <summary>
/// Converts the chapter HTML to text via HTMLToText.ConvertHtml, trims it and collapses
/// every whitespace run to a single space. Returns an empty string for a missing or blank chapter.
/// </summary>
string GetChapterText(Chapter c)
{
    var html = c.chapter?.Value;
    if (string.IsNullOrWhiteSpace(html)) return string.Empty;

    var clean = HTMLToText.ConvertHtml(html);

    clean = clean.Trim();

    clean = new Regex(@"\s+").Replace(clean, " ");

    return clean;
}

ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<string> prt, out string forwardQueue_next)
{
    forwardQueue_next = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    #region Base

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    if
(nodeNav == null) nodeNav = nodeContent; + + var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]"); + if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]"); + + #endregion + + #region Title + + var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); + if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]"); + if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1"); + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong"); + if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1"); + + curr.title = TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText)); + + var titles = new List(); + titles.Add(curr.title); + + if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*")) + { + var baseTitle = curr.title; + + var suffix = TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value); + + var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value; + var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value); + + titles.Add(prefix1); + titles.Add(prefix2); + + var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2); + var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && 
p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2); + var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p => p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" title node removed"); + } + else if (altTitleNode4 != null) + { + var newtitle = TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length)); + titles.Add(newtitle); + curr.title = newtitle; + titles.Add(prefix1 + newtitle); + titles.Add(prefix2 + newtitle); + titles.Add(prefix1 + " - " + newtitle); + titles.Add(prefix2 + " - " + newtitle); + + altTitleNode4.Remove(); + prt(" > title node removed"); + } + else if (suffix.Length > 2) + { + curr.title = suffix; + titles.Add(suffix); + } + else + { + prt(" [!!] Warning cannot parse title"); + } + + if (suffix.Length > 2) + { + curr.title = baseTitle; + titles.Add(baseTitle); + } + } + + if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) { + var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length); + while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1); + tit_alt = tit_alt.Trim(); + if (tit_alt.Length>2) curr.title = tit_alt; + } + + #endregion + + curr.sourcecode = "\r\n\r\n\r\n" + nodeContent.OuterHtml + "\r\n\r\n\r\n"; + + if (backBuffer.Any() && backBuffer.First().title == curr.title) + { + prt("[!] Book loop found - skipping entry"); + return ProcessResult.ReachedEnd; // prevent book II loop + } + + curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad); + curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"))); + curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus"))); + + if (ACTIVE_BOOK == APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II"); + + if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus) + { + prt("[!] 
Epilogue found - skipping entry"); + return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue + } + + prt(curr.title + " (" + curr.url + ")"); + + #region Next + + string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" }; + + if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && + backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && + REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success && + REX_NUMSTART.Match(curr.title).Success && + REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value) + { + prt("[!] Book jump found - skipping entry"); + return ProcessResult.ReachedEnd; + } + + var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']"); + if (next == null) + next = nodeContent.Descendants() + .Where(p => p.Name.ToLower() == "a") + .Where(p => Striptease(p) == "next chapter" || Striptease(p) == "next") + .Where(p => p.Attributes.Contains("href")) + .FirstOrDefault(); + + var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a"); + + if (next == null) + next = nodeNav.Descendants() + .Where(p => p.Name.ToLower() == "a") + .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) + .FirstOrDefault(); + + if (next != null) + { + var next_url = next.Attributes["href"].Value.Trim(); + + if (next_url == "." 
|| next_url == "/" || next_url == "./") + { + next=null; + } + else + { + if (next_url.StartsWith("//")) next_url = "http:" + next_url; + + if (next_url.StartsWith("/")) next_url = combineAuthority(curr.url, next_url); + + if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = CombineUri(curr.url, next_url); + + curr.next = next_url; + if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower())) + { + forwardQueue_next = next_url; + } + } + + } + + if (next == null) prt(" > (!) No next URL found"); + + #endregion + + #region Chapter marker + + var cpMarkerIdentities = new List + { + "previousnext", "previouschapternextchapter", + "firstnext", "firstchapternextchapter", + "firstchapter", "previouslast", + + "previouschapterlastchapter", + + "previouschapter", "nextchapter", "lastchapter", + + "first", "previous", "next", "last" + }; + + foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > Chapter marker removed"); + } + + foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > Chapter marker removed"); + } + + var alist = nodeChapter.SelectNodes("//a"); + if (alist != null) + { + foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + node.Remove(); + prt(" > Chapter marker removed"); + } + } + + var plist = nodeChapter.SelectNodes("//p"); + if (plist != null) + { + foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) + { + node.Remove(); + prt(" > Chapter marker removed"); + } + } + + #endregion + + #region Share Div + + var shareNodes = 
nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"); + if (shareNodes != null) + { + foreach (var node in shareNodes) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > share div removed"); + } + else + { + prt(" > share div cannot be removed - skipping"); + } + } + } + + #endregion + + #region Meta Div + + var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"); + if (metaNodes != null) + { + foreach (var node in metaNodes) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > meta div removed"); + } + else + { + prt(" > meta div cannot be removed - skipping"); + } + } + } + + #endregion + + #region Ad Blocking + + var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."); + if (adNodes1 != null) + { + foreach (var node in adNodes1) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."); + if (adNodes2 != null) + { + foreach (var node in adNodes2) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]"); + if (adNodes3 != null) + { + foreach (var node in adNodes3.Where(n => Striptease(n) == "advertisement")) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > ad div removed"); + } + else + { + prt(" > ad div cannot be removed - skipping"); + } + } + } + + #endregion + + #region Title Paragraphs + + 
var titleNodes1 = nodeChapter.SelectNodes(@"p"); + if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First())) + { + nodeChapter.RemoveChild(titleNodes1.First()); + prt(" > title node removed"); + } + + for (int hval = 1; hval <= 5; hval++) + { + var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval); + if (titleNodes2 != null) + { + foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == TitleFmt(node.InnerText).ToLower()))) + { + if (nodeChapter.ChildNodes.Contains(node)) + { + nodeChapter.RemoveChild(node); + prt(" > title node removed"); + } + } + } + } + + var titleNodes3 = nodeChapter.SelectNodes(@"//u"); + if (titleNodes3 != null && titleNodes3.Any()) + { + var xTitleNodes3 = titleNodes3.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes3) + { + t.Remove(); + prt(" > title node removed"); + } + } + + var titleNodes4 = nodeChapter.SelectNodes(@"//span"); + if (titleNodes4 != null && titleNodes4.Any()) + { + var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes4) + { + t.Remove(); + prt(" > title node removed"); + } + } + + var titleNodes5 = nodeChapter.SelectNodes(@"//strong"); + if (titleNodes5 != null && titleNodes5.Any()) + { + var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t))); + foreach (var t in xTitleNodes5) + { + t.Remove(); + prt(" > title node removed"); + } + } + + #endregion + + #region Remove


's + + while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr") + { + nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First()); + prt(" > header hr removed"); + } + + while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr") + { + nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last()); + prt(" > footer hr removed"); + } + + #endregion + + #region Other (Author's Node) + + foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList()) + { + nodeChapter.RemoveChild(node); + prt(" > authors note removed"); + } + + #endregion + + var chap_html = nodeChapter.InnerHtml.Trim(); + + #region Fix raw
+ // KOReader doesn't like
+ + chap_html = chap_html.Replace("
", "
"); + + #endregion + + curr.chapter = chap_html; + + + if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter; + + return ProcessResult.SuccessNormal; +} + +string combineAuthority(string url, string suffix) +{ + var left = new Uri(url).GetLeftPart(UriPartial.Authority); + if (!left.EndsWith("/")) left = left + "/"; + if (suffix.StartsWith("/")) suffix = suffix.TrimStart('/'); + return left + suffix; +} + +string CombineUri(string uri1, string uri2) +{ + if (uri1.Contains("/")) uri1 = uri1.Substring(0, uri1.LastIndexOf("/")); + uri1 = uri1.TrimEnd('/'); + uri2 = uri2.TrimStart('/'); + return string.Format("{0}/{1}", uri1, uri2); +} + +void OutputChapter(Chapter curr, int index) +{ + File.WriteAllText(QUERY_FOLDER + string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html", curr.queryResult); + + File.WriteAllText(HTML_FOLDER + string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html", curr.sourcecode, Encoding.UTF8); + + StringBuilder b = new StringBuilder(); + { + b.AppendLine(""); + b.AppendLine(""); + b.AppendLine(""); + b.AppendLine(); + b.AppendLine("

" + HtmlEntity.Entitize(curr.title) + "

"); + b.AppendLine(); + b.AppendLine(curr.chapter); + b.AppendLine(""); + b.AppendLine(""); + } + File.WriteAllText(Path.Combine(EPUB_FOLDER, Filenamify(string.Format("{0:000}_{1}.html", index, curr.title))), b.ToString(), Encoding.UTF8); +} + +static string Filenamify(string v, bool repl = false) +{ + var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(p => + (p >= '0' && p <= '9') || + (p >= 'A' && p <= 'Z') || + (p >= 'a' && p <= 'z') || + p == ' ' || + p == '.' || + p == '-' || + p == '*' || + p == '_' || + p == '.' || + p == ',').ToArray()); + + if (repl) s = s.Replace(' ', '_'); + + return s; +} + +string TitleFmt(string raw) +{ + raw = HtmlEntity.DeEntitize(raw); + + raw = raw.Replace('–', '-'); + raw = raw.Replace((char)160, ' '); + + raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); + if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3); + + raw = raw.Trim().Trim('-', ':', '_', '#').Trim(); + + if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1); + + return raw; +} + +string Striptease(HtmlNode raw) +{ + { + var rm = raw.SelectNodes(@"//script"); + if (rm != null && rm.Any()) + { + var copy = HtmlNode.CreateNode($"<{raw.Name}>"); + copy.CopyFrom(raw); + raw = copy; + + rm = raw.SelectNodes(@"//script"); + if (rm != null) foreach (var e in rm) e.Remove(); + } + } + + { + var rm = raw.SelectNodes(@"//meta"); + if (rm != null && rm.Any()) + { + var copy = HtmlNode.CreateNode($"<{raw.Name}>"); + copy.CopyFrom(raw); + raw = copy; + + rm = raw.SelectNodes(@"//meta"); + if (rm != null) foreach (var e in rm) e.Remove(); + } + } + + return Striptease(HtmlEntity.DeEntitize(raw.InnerText)); +} + +string Striptease(string raw) +{ + var r = string.Join(string.Empty, + raw + .ToCharArray() + .Select(c => char.IsWhiteSpace(c) ? 
' ' : c) + .Where(c => char.IsLetterOrDigit(c) ||char.IsWhiteSpace(c)) + .Select(c => char.ToLower(c))).Trim(); + return r; +} + +string NakedIdentity(HtmlNode raw) +{ + return string.Join(string.Empty, + raw + .InnerText + .ToLower() + .Replace(">", "") + .Replace("<", "") + .Replace("&", "") + .Replace(""", "") + .Replace(" ", "") + .ToCharArray() + .Where(c => char.IsLetterOrDigit(c)) + .Select(c => char.ToLower(c))).Trim() + .ToLower(); +} + +bool CouldBeTitle(HtmlNode n, string title) +{ + var t0 = Striptease(n); + var t1 = Striptease(title); + + t0 = t0.ToLower(); + t1 = t1.ToLower(); + + t0 = t0.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); + t1 = t1.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", ""); + + t0 = Regex.Replace(t0, @"\s\s+", ""); + t1 = Regex.Replace(t1, @"\s\s+", ""); + + return t0 == t1; +} + +void WriteEpub(List chapters) +{ + if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH); + if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH); + + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + + using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite)) + { + using (var zipbook = new ZipOutputStream(fs)) + { + WritePubString(zipbook, @"mimetype", GetEpubMimetype()); + WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML()); + WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters)); + WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters)); + + for (int i = 0; i < chapters.Count; i++) + { + WritePubString(zipbook, string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true)), GetEpubChapterFile(chapters[i], i)); + } + } + } + + File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH); + + File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true); +} + +void GenerateMobi() +{ + if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH); + + "Running ebook-convert for MOBI output".Dump(); + var 
pout = ProcessHelper.ProcExecute("ebook-convert", $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999");

    $"ebook-convert returned: {pout.ExitCode}".Dump();
    if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}

/// <summary>Adds an uncompressed entry named <paramref name="n"/> with text <paramref name="c"/> to the archive.</summary>
/// <param name="z">Archive stream being written.</param>
/// <param name="n">Entry name inside the archive.</param>
/// <param name="c">Text content of the entry.</param>
/// <param name="e">Encoding used for the content; UTF-8 when null.</param>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    e = e ?? Encoding.UTF8;

    var entry = z.PutNextEntry(n);
    entry.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    byte[] bytes = e.GetBytes(c);
    z.Write(bytes, 0, bytes.Length);
}

/// <summary>Content of the EPUB "mimetype" entry.</summary>
string GetEpubMimetype()
{
    return "application/epub+zip";
}

/// <summary>Builds META-INF/container.xml, pointing at OEBPS/content.opf.</summary>
string GetEpubContainerXML()
{
    XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null),
        new XElement(container + "container",
            new XAttribute("version", "1.0"),
            new XElement(container + "rootfiles",
                new XElement(container + "rootfile",
                    new XAttribute("full-path", "OEBPS/content.opf"),
                    new XAttribute("media-type", "application/oebps-package+xml")))));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        var r = writer.ToString();
        // Upper-case the encoding name in the XML declaration.
        r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
        return r.Trim() + "\r\n";
    }
}

/// <summary>
/// Builds OEBPS/content.opf for ACTIVE_BOOK: metadata (title, creator, identifiers,
/// dates, language, optional calibre series info), a manifest with one item per chapter
/// plus toc.ncx, the spine in chapter order, and an empty guide.
/// </summary>
string GetEpubContentOPF(List<Chapter> chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    // Invariant culture keeps the dates in the Gregorian yyyy-MM-dd form regardless of
    // the machine's regional settings.
    var today = DateTime.Now.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
    var released = ACTIVE_BOOK.Release.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));

    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "publication"),
            released),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "modification"),
            today),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "creation"),
            today),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        new XElement(opf + "meta",
            new XAttribute("content", "1.0"),
            new XAttribute("name", "Wordpress_eBook_scraper_version")),
        new XElement(opf + "meta",
            new XAttribute("content", today),
            new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }

    package.Add(meta);

    var manifest = new XElement(opf + "manifest");
    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));

    for (int i = 0; i < chapters.Count; i++)
    {
        // Manifest id and spine idref must match, so both are derived from one value.
        var safeTitle = Filenamify(chapters[i].title, true);
        var itemId = string.Format("x{0:000}_{1}.html", i + 1, safeTitle);

        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i + 1, Uri.EscapeUriString(safeTitle))),
            new XAttribute("id", itemId),
            new XAttribute("media-type", "application/xhtml+xml")));

        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", itemId)));
    }

    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));

    package.Add(manifest);
    package.Add(spine);

    package.Add(new XElement(opf + "guide"));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}

string GetEpubTOC(List<Chapter> chapters)
{
    // NCX elements belong to the DAISY NCX namespace. The original bound this prefix to
    // the OPF package namespace and left the NCX URI in an unused variable.
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            new XElement(ncx + "meta",
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));

    doc.Add(root);

    root.Add(new XElement(ncx + "docTitle",
        new XElement(ncx + "text", "Unknown")));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel",
                new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new
XAttribute("src", string.Format("Text/{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true)))))); + } + + root.Add(nav); + + StringBuilder builder = new StringBuilder(); + using (Utf8StringWriter writer = new Utf8StringWriter()) + { + doc.Save(writer); + return writer.ToString(); + } +} + +string GetEpubChapterFile(Chapter chapter, int idx) +{ + StringBuilder xml = new StringBuilder(); + + xml.AppendLine(@""); + xml.AppendLine(@" "); + xml.AppendLine(@""); + xml.AppendLine(@""); + xml.AppendLine("" + HtmlEntity.Entitize(chapter.title) + ""); + xml.AppendLine(@""); + xml.AppendLine(@""); + xml.AppendLine("

" + HtmlEntity.Entitize(chapter.title) + "

"); + xml.AppendLine(chapter.chapter); + xml.AppendLine(@""); + xml.AppendLine(@""); + + return xml.ToString(); +} + +public struct ProcessOutput +{ + public readonly string Command; + public readonly int ExitCode; + public readonly string StdOut; + public readonly string StdErr; + public readonly string StdCombined; + + public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom) + { + Command = cmd; + ExitCode = ex; + StdOut = stdout; + StdErr = stderr; + StdCombined = stdcom; + } + + public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}"; +} + +public static class ProcessHelper +{ + public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null) + { + var process = new Process + { + StartInfo = + { + FileName = command, + Arguments = arguments, + WorkingDirectory = workingDirectory ?? string.Empty, + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true, + ErrorDialog = false, + } + }; + + var builderOut = new StringBuilder(); + var builderErr = new StringBuilder(); + var builderBoth = new StringBuilder(); + + process.OutputDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderOut.Length == 0) builderOut.Append(args.Data); + else builderOut.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.ErrorDataReceived += (sender, args) => + { + if (args.Data == null) return; + + if (builderErr.Length == 0) builderErr.Append(args.Data); + else builderErr.Append("\n" + args.Data); + + if (builderBoth.Length == 0) builderBoth.Append(args.Data); + else builderBoth.Append("\n" + args.Data); + }; + + process.Start(); + + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + process.WaitForExit(); + + return new ProcessOutput($"{command} 
/// <summary>
/// Converts HTML (HtmlAgilityPack documents) into plain text, inserting line breaks
/// around block-level elements and collapsing runs of whitespace.
/// </summary>
public static class HTMLToText
{
    // Self-closing <link/>, <style/> and <script/> tags.
    private static readonly Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);
    // <link>/<style>/<script> elements whose content contains no angle brackets.
    private static readonly Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);
    // Two or more consecutive whitespace characters; replaced by a single space in text nodes.
    private static readonly Regex REX_SPACES = new Regex(@"\s{2,}", RegexOptions.Compiled);

    // Whitespace/line-break state carried from one text node to the next.
    private class PreceedingDomTextInfo
    {
        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }
        public bool WritePrecedingWhiteSpace { get; set; }
        public bool LastCharWasSpace { get; set; }
        // Shared by reference with every state object created for nested block elements.
        public readonly BoolWrapper IsFirstTextOfDocWritten;
        public int ListIndex { get; set; }
    }

    // Mutable bool holder so several PreceedingDomTextInfo instances can share one flag.
    private class BoolWrapper
    {
        public BoolWrapper() { }
        public bool Value { get; set; }
        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }
        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper { Value = boolWrapper };
        }
    }

    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to text.</summary>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>
    /// Converts an HTML string to text. Link, style and script tags matched by the two
    /// tag patterns are replaced with a space before the markup is parsed.
    /// </summary>
    public static string ConvertHtml(string html)
    {
        html = REX_TAG1.Replace(html, " ");
        html = REX_TAG2.Replace(html, " ");

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already parsed document to text.</summary>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Writes the text of <paramref name="node"/> and its descendants to <paramref name="outText"/>.</summary>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    // Converts every child of the node, in document order, with the given state.
    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    // Dispatches on the node type; comments (and any other node type) produce no output.
    private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;
            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;
            case HtmlNodeType.Text:
                ConvertTextNode(node, outText, textInfo);
                break;
            case HtmlNodeType.Element:
                ConvertElement(node, outText, textInfo);
                break;
        }
    }

    // Writes one text node, trimming and collapsing whitespace according to the running state.
    private static void ConvertTextNode(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        // script and style must not be output
        string parentName = node.ParentNode.Name;
        if ((parentName == "script") || (parentName == "style")) return;

        string html = ((HtmlTextNode)node).Text;

        // is it in fact a special closing node output as text?
        if (HtmlNode.IsOverlappedClosingElement(html)) return;

        // nothing to write
        if (html.Length == 0) return;

        // NOTE(review): the prefix literal is empty in this copy of the source. An empty
        // prefix matches every string, so as written every non-empty text node is skipped
        // here. The intended marker was presumably lost - restore it from the upstream file.
        if (html.Trim().StartsWith("", StringComparison.OrdinalIgnoreCase)) return;

        if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
        {
            html = html.TrimStart();
            if (html.Length == 0) return;
            textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
        }

        outText.Write(HtmlEntity.DeEntitize(REX_SPACES.Replace(html.TrimEnd(), " ")));

        // Trailing whitespace was trimmed above; emit a single space in its place and remember it.
        textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
        if (textInfo.LastCharWasSpace)
        {
            outText.Write(' ');
        }
    }

    // Writes an element: optional line breaks around block-level tags, then its children.
    private static void ConvertElement(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string endElementString = null;
        bool isInline;
        bool skip = false;
        int listIndex = 0;

        switch (node.Name)
        {
            case "nav":
                skip = true;
                isInline = false;
                break;
            case "body":
            case "section":
            case "article":
            case "aside":
            case "h1":
            case "h2":
            case "header":
            case "footer":
            case "address":
            case "main":
            case "div":
            case "span":
            case "p": // stylistic - adjust as you tend to use
                if (textInfo.IsFirstTextOfDocWritten) outText.Write("\r\n");
                endElementString = "\r\n";
                isInline = false;
                break;
            case "br":
                outText.Write("\r\n");
                skip = true;
                textInfo.WritePrecedingWhiteSpace = false;
                isInline = true;
                break;
            case "a":
                isInline = true;
                break;
            case "li":
                isInline = false;
                break;
            case "ol":
                listIndex = 1;
                goto case "ul";
            case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
                endElementString = "\r\n";
                isInline = false;
                break;
            case "img": // inline-block in reality
                isInline = true;
                break;
            default:
                isInline = true;
                break;
        }

        if (!skip && node.HasChildNodes)
        {
            // Inline elements continue with the caller's state; block elements start a fresh
            // state that still shares the "first text written" flag.
            PreceedingDomTextInfo childInfo = isInline
                ? textInfo
                : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };
            ConvertContentTo(node, outText, childInfo);
        }

        if (endElementString != null)
        {
            outText.Write(endElementString);
        }
    }
}