Skip to content

Commit

Permalink
v2.5.0
Browse files Browse the repository at this point in the history
rewrite chapter downloading without using scala parallel collections (use explicit concurrency with blocking queue and futures)
reduce memory usage by streaming
add MIT license file
  • Loading branch information
Aivean committed Nov 15, 2021
1 parent 6dea69d commit d990fa8
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 33 deletions.
2 changes: 1 addition & 1 deletion build.sbt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name := "royalroad-downloader"

version := "2.4.0"
version := "2.5.0"

scalaVersion := "2.11.11"

Expand Down
5 changes: 4 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
[Royal Road](http://royalroad.com/) book downloader
---

Nothing fancy, just a simple CLI that imports fiction from [Royal Road](http://royalroad.com/)
A simple command-line tool that downloads fiction from [Royal Road](http://royalroad.com/)
as html. Use something like [Calibre](http://calibre-ebook.com/) to convert html to any desired format.

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

How to use
---
Expand Down Expand Up @@ -43,6 +44,8 @@ Building from sources

* run `sbt/sbt assembly` in project directory

Note: build tested to work with Java 8.

Running from sources
--------------------

Expand Down
86 changes: 55 additions & 31 deletions src/main/scala/com/aivean/royalroad/Main.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ package com.aivean.royalroad

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL
import net.ruippeixotog.scalascraper.model.Document
import org.jsoup.Connection

import java.io.PrintWriter
import java.net.URLDecoder
import java.util.concurrent.ArrayBlockingQueue
import java.util.concurrent.atomic.AtomicLong
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.{Await, Future, duration}
import scala.util.{Failure, Success, Try}

object Main extends App {
Expand Down Expand Up @@ -47,14 +49,9 @@ object Main extends App {
""".stripMargin)
}

val chapUrls = {
val urls = threads.collect {
case x if x.startsWith("/") => "https://www.royalroad.com" + x
case x => x
}.par

urls.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(4))
urls
val chapUrls = threads.collect {
case x if x.startsWith("/") => "https://www.royalroad.com" + x
case x => x
}

// delay requests until that time
Expand Down Expand Up @@ -85,32 +82,59 @@ object Main extends App {
else throw e
}

val chaps = chapUrls.drop(cliArgs.fromChapter() - 1).map { u =>
val uDecoded = URLDecoder.decode(u, "utf-8")
println(s"downloading: $uDecoded")
uDecoded -> retry(backpressure(browser.get(uDecoded)))
}.map { case (u, doc) =>
println("parsing: " + u)

<h1 class="chapter">
{(doc >?> text(cliArgs.titleQuery()).map(_.trim.stripSuffix(" - " + title)))
.getOrElse(parsingError("chapter title", cliArgs.titleQuery(), u))}
</h1>.toString() +
(doc >?> element(cliArgs.bodyQuery()))
.getOrElse(parsingError("chapter text", cliArgs.bodyQuery(), u)).outerHtml
}.seq
import scala.concurrent.ExecutionContext.Implicits.global

// chapter producer, with parallelism limited by the capacity of queue (currently capacity = 4)
val chapQ = new ArrayBlockingQueue[Option[Future[(String, Document)]]](4, true)
Future {
chapUrls.drop(cliArgs.fromChapter() - 1).foreach { u =>
val uDecoded = URLDecoder.decode(u, "utf-8")
println(s"downloading: $uDecoded")
chapQ.put(Some(Future(uDecoded -> retry(backpressure(browser.get(uDecoded))))))
}
chapQ.put(None)
}

val filename = title.replaceAll("[^\\w\\d]+", "_") + ".html"
println("Saving as: " + filename)

new PrintWriter(filename, "UTF-8") {
write(
s"""<html><head><meta charset="UTF-8"><title>$title</title></head><body>
|${chaps.mkString("\n")}
|</body>
|</html>
""".stripMargin)
close()
val printWriter = new PrintWriter(filename, "UTF-8")
try {
printWriter.write(s"""<html><head><meta charset="UTF-8"><title>$title</title></head><body>""")

def rec(chap: Option[Future[(String, Document)]]): Unit = chap match {
case None =>
case Some(f) =>
Await.result(f, duration.Duration.Inf) match {
case (url, doc) =>
println("parsing: " + url)

// write chapter title to file
printWriter.write(
<h1 class="chapter">
{(doc >?> text(cliArgs.titleQuery()).map(_.trim.stripSuffix(" - " + title)))
.getOrElse(parsingError("chapter title", cliArgs.titleQuery(), url))}
</h1>.toString()
)

// write chapter content to file
printWriter.write(
(doc >?> element(cliArgs.bodyQuery()))
.getOrElse(parsingError("chapter text", cliArgs.bodyQuery(), url)).outerHtml
)

printWriter.write("\n")

// parse next chapter
rec(chapQ.take())
}
}

rec(chapQ.take())

} finally {
printWriter.write("</body></html>")
printWriter.close()
}

println("done")
Expand Down

0 comments on commit d990fa8

Please sign in to comment.