Completely rework how Spidey works
Basically, the concurrent crawling was being done wrong:
1. I was spinning up way too many tasks (one per URL in the queue).
2. poolboy was only being used to throttle network requests.
3. There was too much going on when all I actually needed was a simple
   concurrent map over the URLs.

TLDR: swapped the whole broken pooling implementation for
`Task.Supervisor.async_stream/3`. The crawling was blocking anyway, so
there was no real benefit to spinning up all those supervisors/workers.
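
Roughly, the new crawl loop boils down to the pattern below. This is a minimal sketch, not the project code: the supervisor name, URLs and the per-URL function are made up for illustration.

    # Start a named Task.Supervisor (in Spidey this lives in the crawler's
    # supervision tree; the name here is illustrative).
    {:ok, _} = Task.Supervisor.start_link(name: :crawler_task_sup)

    urls = ["https://example.com/a", "https://example.com/b"]

    :crawler_task_sup
    |> Task.Supervisor.async_stream(urls, fn url -> {url, :crawled} end,
      # max_concurrency defaults to System.schedulers_online/0 anyway;
      # passed explicitly here to mirror the logging in the new code.
      max_concurrency: System.schedulers_online(),
      timeout: 10_000,
      on_timeout: :kill_task
    )
    |> Enum.to_list()
    #=> [ok: {"https://example.com/a", :crawled}, ok: {"https://example.com/b", :crawled}]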
manzanit0 committed May 2, 2021
1 parent 332e9c6 commit 5d05a77
Showing 12 changed files with 131 additions and 177 deletions.
3 changes: 2 additions & 1 deletion .tool-versions
@@ -1 +1,2 @@
elixir 1.11.2-otp-22
elixir 1.11.4-otp-23
erlang 23.3.1
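
If asdf is the version manager in use (it's the tool that reads `.tool-versions`), running `asdf install` from the project root picks up the bumped Elixir/Erlang versions.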
56 changes: 56 additions & 0 deletions .vscode/tasks.json
@@ -0,0 +1,56 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "Test Function at Cursor",
"command": "mix test ${relativeFile}:${lineNumber}",
"group": "test",
"type": "shell",
"problemMatcher": [
"$mixCompileError",
"$mixCompileWarning",
"$mixTestFailure"
],
"presentation": {
"echo": true,
"reveal": "always",
"focus": false,
"panel": "shared"
}
},
{
"label": "Test Current File",
"command": "mix test ${relativeFile}",
"group": "test",
"type": "shell",
"problemMatcher": [
"$mixCompileError",
"$mixCompileWarning",
"$mixTestFailure"
],
"presentation": {
"echo": true,
"reveal": "always",
"focus": false,
"panel": "shared"
}
},
{
"label": "Run All Tests",
"command": "mix test",
"type": "shell",
"group": "test",
"problemMatcher": [
"$mixCompileError",
"$mixCompileWarning",
"$mixTestFailure"
],
"presentation": {
"echo": true,
"reveal": "always",
"focus": false,
"panel": "shared"
}
}
]
}
2 changes: 1 addition & 1 deletion lib/spidey.ex
@@ -11,7 +11,7 @@ defmodule Spidey do
@doc """
Crawls a website for all the same-domain urls, returning a list with them.
The defauilt `pool_name` is `:default`, but a custom one can be provided.
The default `pool_name` is `:default`, but a custom one can be provided.
The default filter rejects assets, Wordpress links, and others. To provide
custom filtering make sure to implement the `Spidey.Filter` behaviour and
5 changes: 1 addition & 4 deletions lib/spidey/application.ex
@@ -2,10 +2,7 @@ defmodule Spidey.Application do
use Application

def start(_type, _args) do
children = [
{Spidey.Crawler.PoolManager, []},
{Registry, keys: :unique, name: Spidey.Registry}
]
children = []

opts = [strategy: :one_for_one, name: Spidey.Supervisor]
Supervisor.start_link(children, opts)
64 changes: 42 additions & 22 deletions lib/spidey/crawler.ex
@@ -1,46 +1,66 @@
defmodule Spidey.Crawler do
alias Spidey.Logger
alias Spidey.Crawler.{PoolManager, UrlStore, Queue, Worker}

@worker_timeout 60_000
alias Spidey.Filter
alias Spidey.Crawler.{UrlStore, Queue, Content}

def crawl(seed, pool_name, opts) do
filter = Keyword.get(opts, :filter, Spidey.Filter.DefaultFilter)

Logger.log("starting crawler supervision tree #{pool_name}")
{:ok, pid} = Spidey.Crawler.Supervisor.start_link(pool_name, [])

try do
Logger.log("starting pool and ETS table #{pool_name}")
PoolManager.start_child(pool_name, opts)
Logger.log("starting ETS table #{pool_name}")
UrlStore.init!(seed, pool_name)

Queue.push(seed, pool_name)
crawl_queue(pool_name, seed)
crawl_queue(pool_name, seed, filter)
after
Logger.log("terminating pool and ETS table #{pool_name}")
PoolManager.terminate_child(pool_name)
Logger.log("terminating crawler supervision tree #{pool_name}")
Process.exit(pid, :normal)

Logger.log("terminating ETS table #{pool_name}")
UrlStore.teardown(pool_name)
end
end

defp crawl_queue(pool_name, seed) do
defp crawl_queue(pool_name, seed, filter) do
queue_length = Queue.length(pool_name)

if queue_length == 0 do
Logger.log("no urls remaining in queue. Returning all urls")
UrlStore.retrieve_all(pool_name)
else
queue_length
|> Queue.take(pool_name)
|> Enum.map(&run_in_pool(&1, pool_name, seed))
|> Task.await_many(@worker_timeout)
max_concurrency = System.schedulers_online()

Logger.log(
"attempting to crawl #{queue_length} urls at a concurrent rate of #{max_concurrency}"
)

urls = Queue.take(queue_length, pool_name)

crawl_queue(pool_name, seed)
Task.Supervisor.async_stream(
:"#{pool_name}TaskSupervisor",
urls,
fn url ->
url
|> Content.scan()
|> Filter.filter_urls(filter, seed: seed)
|> Stream.reject(&UrlStore.exists?(&1, pool_name))
|> Stream.each(&push_to_stores(&1, pool_name))
|> Stream.run()
end,
timeout: 10_000,
on_timeout: :kill_task
)
|> Stream.run()

crawl_queue(pool_name, seed, filter)
end
end

defp run_in_pool(url, pool_name, seed) do
Task.async(fn ->
:poolboy.transaction(
pool_name,
fn pid -> Worker.crawl(pid, url, pool_name, seed, timeout: @worker_timeout - 5000) end,
@worker_timeout
)
end)
defp push_to_stores(url, pool_name) do
Queue.push(url, pool_name)
UrlStore.add(url, pool_name)
end
end
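
For reference, the reworked entry point is still invoked the same way as before; with the documented defaults a call would look roughly like this (the seed URL is illustrative):

    # Crawl a site with the default pool name and filter; returns the list of
    # same-domain urls accumulated in the ETS-backed UrlStore.
    urls = Spidey.Crawler.crawl("https://example.com", :default, filter: Spidey.Filter.DefaultFilter)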
48 changes: 0 additions & 48 deletions lib/spidey/crawler/pool_manager.ex

This file was deleted.

42 changes: 0 additions & 42 deletions lib/spidey/crawler/pool_supervisor.ex

This file was deleted.

4 changes: 1 addition & 3 deletions lib/spidey/crawler/queue.ex
@@ -1,8 +1,6 @@
defmodule Spidey.Crawler.Queue do
use Agent

alias __MODULE__

def start_link(urls, pool_name) do
queue = :queue.from_list(urls)
Agent.start_link(fn -> queue end, name: queue_name(pool_name))
@@ -11,7 +9,7 @@ defmodule Spidey.Crawler.Queue do
def child_spec(pool_name, urls \\ []) do
%{
id: queue_name(pool_name),
start: {Queue, :start_link, [urls, pool_name]}
start: {__MODULE__, :start_link, [urls, pool_name]}
}
end

19 changes: 19 additions & 0 deletions lib/spidey/crawler/supervisor.ex
@@ -0,0 +1,19 @@
defmodule Spidey.Crawler.Supervisor do
use Supervisor

alias Spidey.Crawler.Queue

def start_link(pool_name, _opts) do
Supervisor.start_link(__MODULE__, %{pool_name: pool_name}, name: :"#{pool_name}Supervisor")
end

@impl true
def init(%{pool_name: pool_name}) do
children = [
{Task.Supervisor, name: :"#{pool_name}TaskSupervisor"},
Queue.child_spec(pool_name)
]

Supervisor.init(children, strategy: :one_for_all)
end
end
46 changes: 0 additions & 46 deletions lib/spidey/crawler/worker.ex

This file was deleted.

7 changes: 3 additions & 4 deletions mix.exs
@@ -15,7 +15,7 @@ defmodule Spidey.MixProject do
package: package(),
docs: docs(),
version: @version,
elixir: "~> 1.11",
elixir: "~> 1.6",
start_permanent: Mix.env() == :prod,
deps: deps(),
elixirc_paths: elixirc_paths(Mix.env()),
@@ -33,12 +33,11 @@

defp deps do
[
{:httpoison, "~> 1.7.0"},
{:httpoison, "~> 1.8.0"},
{:floki, "~> 0.27.0"},
{:excoveralls, "~> 0.10", only: :test},
{:mox, "~> 0.5", only: :test},
{:ex_doc, ">= 0.0.0", only: :dev, runtime: false},
{:poolboy, "~> 1.5"}
{:ex_doc, ">= 0.0.0", only: :dev, runtime: false}
]
end
