From d990fa84fc31f012d2423be15cd5bfd3e90fd5a5 Mon Sep 17 00:00:00 2001 From: Aivean Date: Sun, 14 Nov 2021 17:34:58 -0800 Subject: [PATCH] v2.5.0 rewrite chapter downloading without using scala parallel collections (use explicit concurrency with blocking queue and futures) reduce memory usage by streaming add MIT license file --- build.sbt | 2 +- readme.md | 5 +- .../scala/com/aivean/royalroad/Main.scala | 86 ++++++++++++------- 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/build.sbt b/build.sbt index ea85ace..3cd7da7 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "royalroad-downloader" -version := "2.4.0" +version := "2.5.0" scalaVersion := "2.11.11" diff --git a/readme.md b/readme.md index 6381c18..f1e43e9 100644 --- a/readme.md +++ b/readme.md @@ -1,9 +1,10 @@ [Royal Road](http://royalroad.com/) book downloader --- -Nothing fancy, just a simple CLI that imports fiction from [Royal Road](http://royalroad.com/) +A simple command-line tool that downloads fiction from [Royal Road](http://royalroad.com/) as html. Use something like [Calibre](http://calibre-ebook.com/) to convert html to any desired format. +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) How to use --- @@ -43,6 +44,8 @@ Building from sources * run `sbt/sbt assembly` in project directory +Note: build tested to work with Java 8. + Running from sources -------------------- diff --git a/src/main/scala/com/aivean/royalroad/Main.scala b/src/main/scala/com/aivean/royalroad/Main.scala index e64a8ae..486fc79 100644 --- a/src/main/scala/com/aivean/royalroad/Main.scala +++ b/src/main/scala/com/aivean/royalroad/Main.scala @@ -2,12 +2,14 @@ package com.aivean.royalroad import net.ruippeixotog.scalascraper.browser.JsoupBrowser import net.ruippeixotog.scalascraper.dsl.DSL +import net.ruippeixotog.scalascraper.model.Document import org.jsoup.Connection import java.io.PrintWriter import java.net.URLDecoder +import java.util.concurrent.ArrayBlockingQueue import java.util.concurrent.atomic.AtomicLong -import scala.collection.parallel.ForkJoinTaskSupport +import scala.concurrent.{Await, Future, duration} import scala.util.{Failure, Success, Try} object Main extends App { @@ -47,14 +49,9 @@ object Main extends App { """.stripMargin) } - val chapUrls = { - val urls = threads.collect { - case x if x.startsWith("/") => "https://www.royalroad.com" + x - case x => x - }.par - - urls.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(4)) - urls + val chapUrls = threads.collect { + case x if x.startsWith("/") => "https://www.royalroad.com" + x + case x => x } // delay requests until that time @@ -85,32 +82,59 @@ object Main extends App { else throw e } - val chaps = chapUrls.drop(cliArgs.fromChapter() - 1).map { u => - val uDecoded = URLDecoder.decode(u, "utf-8") - println(s"downloading: $uDecoded") - uDecoded -> retry(backpressure(browser.get(uDecoded))) - }.map { case (u, doc) => - println("parsing: " + u) - -

- {(doc >?> text(cliArgs.titleQuery()).map(_.trim.stripSuffix(" - " + title))) - .getOrElse(parsingError("chapter title", cliArgs.titleQuery(), u))} -

.toString() + - (doc >?> element(cliArgs.bodyQuery())) - .getOrElse(parsingError("chapter text", cliArgs.bodyQuery(), u)).outerHtml - }.seq + import scala.concurrent.ExecutionContext.Implicits.global + + // chapter producer, with parallelism limited by the capacity of queue (currently capacity = 4) + val chapQ = new ArrayBlockingQueue[Option[Future[(String, Document)]]](4, true) + Future { + chapUrls.drop(cliArgs.fromChapter() - 1).foreach { u => + val uDecoded = URLDecoder.decode(u, "utf-8") + println(s"downloading: $uDecoded") + chapQ.put(Some(Future(uDecoded -> retry(backpressure(browser.get(uDecoded)))))) + } + chapQ.put(None) + } val filename = title.replaceAll("[^\\w\\d]+", "_") + ".html" println("Saving as: " + filename) - new PrintWriter(filename, "UTF-8") { - write( - s"""$title - |${chaps.mkString("\n")} - | - | - """.stripMargin) - close() + val printWriter = new PrintWriter(filename, "UTF-8") + try { + printWriter.write(s"""$title""") + + def rec(chap: Option[Future[(String, Document)]]): Unit = chap match { + case None => + case Some(f) => + Await.result(f, duration.Duration.Inf) match { + case (url, doc) => + println("parsing: " + url) + + // write chapter title to file + printWriter.write( +

+ {(doc >?> text(cliArgs.titleQuery()).map(_.trim.stripSuffix(" - " + title))) + .getOrElse(parsingError("chapter title", cliArgs.titleQuery(), url))} +

.toString() + ) + + // write chapter content to file + printWriter.write( + (doc >?> element(cliArgs.bodyQuery())) + .getOrElse(parsingError("chapter text", cliArgs.bodyQuery(), url)).outerHtml + ) + + printWriter.write("\n") + + // parse next chapter + rec(chapQ.take()) + } + } + + rec(chapQ.take()) + + } finally { + printWriter.write("") + printWriter.close() } println("done")