diff --git a/.tool-versions b/.tool-versions
index f3d5fde..515cf0e 100644
--- a/.tool-versions
+++ b/.tool-versions
@@ -1 +1,2 @@
-elixir 1.11.2-otp-22
+elixir 1.11.4-otp-23
+erlang 23.3.1
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..20aa2e4
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,56 @@
+{
+  "version": "2.0.0",
+  "tasks": [
+    {
+      "label": "Test Function at Cursor",
+      "command": "mix test ${relativeFile}:${lineNumber}",
+      "group": "test",
+      "type": "shell",
+      "problemMatcher": [
+        "$mixCompileError",
+        "$mixCompileWarning",
+        "$mixTestFailure"
+      ],
+      "presentation": {
+        "echo": true,
+        "reveal": "always",
+        "focus": false,
+        "panel": "shared"
+      }
+    },
+    {
+      "label": "Test Current File",
+      "command": "mix test ${relativeFile}",
+      "group": "test",
+      "type": "shell",
+      "problemMatcher": [
+        "$mixCompileError",
+        "$mixCompileWarning",
+        "$mixTestFailure"
+      ],
+      "presentation": {
+        "echo": true,
+        "reveal": "always",
+        "focus": false,
+        "panel": "shared"
+      }
+    },
+    {
+      "label": "Run All Tests",
+      "command": "mix test",
+      "type": "shell",
+      "group": "test",
+      "problemMatcher": [
+        "$mixCompileError",
+        "$mixCompileWarning",
+        "$mixTestFailure"
+      ],
+      "presentation": {
+        "echo": true,
+        "reveal": "always",
+        "focus": false,
+        "panel": "shared"
+      }
+    }
+  ]
+}
diff --git a/lib/spidey.ex b/lib/spidey.ex
index ed15e71..d81e7cf 100644
--- a/lib/spidey.ex
+++ b/lib/spidey.ex
@@ -11,7 +11,7 @@ defmodule Spidey do
   @doc """
   Crawls a website for all the same-domain urls, returning a list with them.
 
-  The defauilt `pool_name` is `:default`, but a custom one can be provided.
+  The default `pool_name` is `:default`, but a custom one can be provided.
 
   The default filter rejects assets, Wordpress links, and others. To provide
   custom filtering make sure to implement the `Spidey.Filter` behaviour and
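For context, the docs above point at the `Spidey.Filter` behaviour. A custom filter would look roughly like the sketch below — the module name and the URL predicate are invented for illustration, and it assumes the behaviour exposes the single `filter_urls/2` callback that the crawler invokes further down in this patch:

    defmodule MyApp.DocsOnlyFilter do
      @behaviour Spidey.Filter

      # Keep only documentation pages; reject everything else.
      @impl true
      def filter_urls(urls, _opts) do
        Stream.filter(urls, &String.contains?(&1, "/docs"))
      end
    end

    # Passed through the crawl options:
    Spidey.crawl("https://example.com", :default, filter: MyApp.DocsOnlyFilter)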
diff --git a/lib/spidey/application.ex b/lib/spidey/application.ex
index 3ff79fa..9ea7a1a 100644
--- a/lib/spidey/application.ex
+++ b/lib/spidey/application.ex
@@ -2,10 +2,7 @@ defmodule Spidey.Application do
   use Application
 
   def start(_type, _args) do
-    children = [
-      {Spidey.Crawler.PoolManager, []},
-      {Registry, keys: :unique, name: Spidey.Registry}
-    ]
+    children = []
 
     opts = [strategy: :one_for_one, name: Spidey.Supervisor]
     Supervisor.start_link(children, opts)
diff --git a/lib/spidey/crawler.ex b/lib/spidey/crawler.ex
index 215d671..03d1938 100644
--- a/lib/spidey/crawler.ex
+++ b/lib/spidey/crawler.ex
@@ -1,46 +1,66 @@
 defmodule Spidey.Crawler do
   alias Spidey.Logger
-  alias Spidey.Crawler.{PoolManager, UrlStore, Queue, Worker}
-
-  @worker_timeout 60_000
+  alias Spidey.Filter
+  alias Spidey.Crawler.{UrlStore, Queue, Content}
 
   def crawl(seed, pool_name, opts) do
+    filter = Keyword.get(opts, :filter, Spidey.Filter.DefaultFilter)
+
+    Logger.log("starting crawler supervision tree #{pool_name}")
+    {:ok, pid} = Spidey.Crawler.Supervisor.start_link(pool_name, [])
+
     try do
-      Logger.log("starting pool and ETS table #{pool_name}")
-      PoolManager.start_child(pool_name, opts)
+      Logger.log("starting ETS table #{pool_name}")
       UrlStore.init!(seed, pool_name)
 
       Queue.push(seed, pool_name)
-      crawl_queue(pool_name, seed)
+      crawl_queue(pool_name, seed, filter)
     after
-      Logger.log("terminating pool and ETS table #{pool_name}")
-      PoolManager.terminate_child(pool_name)
+      Logger.log("terminating crawler supervision tree #{pool_name}")
+      Process.exit(pid, :normal)
+
+      Logger.log("terminating ETS table #{pool_name}")
       UrlStore.teardown(pool_name)
     end
   end
 
-  defp crawl_queue(pool_name, seed) do
+  defp crawl_queue(pool_name, seed, filter) do
     queue_length = Queue.length(pool_name)
 
     if queue_length == 0 do
+      Logger.log("no urls remaining in queue. Returning all urls")
       UrlStore.retrieve_all(pool_name)
     else
-      queue_length
-      |> Queue.take(pool_name)
-      |> Enum.map(&run_in_pool(&1, pool_name, seed))
-      |> Task.await_many(@worker_timeout)
+      max_concurrency = System.schedulers_online()
+
+      Logger.log(
+        "attempting to crawl #{queue_length} urls at a concurrent rate of #{max_concurrency}"
+      )
+
+      urls = Queue.take(queue_length, pool_name)
 
-      crawl_queue(pool_name, seed)
+      Task.Supervisor.async_stream(
+        :"#{pool_name}TaskSupervisor",
+        urls,
+        fn url ->
+          url
+          |> Content.scan()
+          |> Filter.filter_urls(filter, seed: seed)
+          |> Stream.reject(&UrlStore.exists?(&1, pool_name))
+          |> Stream.each(&push_to_stores(&1, pool_name))
+          |> Stream.run()
+        end,
+        timeout: 10_000,
+        on_timeout: :kill_task
+      )
+      |> Stream.run()
+
+      crawl_queue(pool_name, seed, filter)
     end
   end
 
-  defp run_in_pool(url, pool_name, seed) do
-    Task.async(fn ->
-      :poolboy.transaction(
-        pool_name,
-        fn pid -> Worker.crawl(pid, url, pool_name, seed, timeout: @worker_timeout - 5000) end,
-        @worker_timeout
-      )
-    end)
+  defp push_to_stores(url, pool_name) do
+    Queue.push(url, pool_name)
+    UrlStore.add(url, pool_name)
   end
 end
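The heart of the refactor is the hunk above: instead of checking a GenServer worker out of a poolboy pool per URL, each pass over the queue is fanned out through `Task.Supervisor.async_stream/4`, which caps concurrency at `System.schedulers_online()` by default (hence the logged rate) and, with `on_timeout: :kill_task`, kills any task that exceeds the 10s timeout rather than crashing the whole crawl. Because `async_stream/4` is lazy, both the per-URL pipeline and the outer stream must be forced with `Stream.run/1`. A self-contained sketch of the pattern, with an invented supervisor name and a stubbed work function:

    {:ok, _sup} = Task.Supervisor.start_link(name: :DemoTaskSupervisor)

    urls = ["https://example.com/a", "https://example.com/b"]

    :DemoTaskSupervisor
    |> Task.Supervisor.async_stream(urls, fn url -> {url, :crawled} end,
      timeout: 10_000,
      on_timeout: :kill_task
    )
    |> Enum.to_list()
    #=> [ok: {"https://example.com/a", :crawled}, ok: {"https://example.com/b", :crawled}]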
diff --git a/lib/spidey/crawler/pool_manager.ex b/lib/spidey/crawler/pool_manager.ex
deleted file mode 100644
index 866afaa..0000000
--- a/lib/spidey/crawler/pool_manager.ex
+++ /dev/null
@@ -1,48 +0,0 @@
-defmodule Spidey.Crawler.PoolManager do
-  @moduledoc """
-  PoolManager is in charge of spinning up the pools and queues to crawl new
-  websites. Every time a new website is to be crawled, a pool is created,
-  along with a queue. When the job has been finished, they are terminated.
-  """
-
-  use DynamicSupervisor
-
-  alias Spidey.Crawler.PoolSupervisor
-
-  @doc "Starts the supervisor."
-  def start_link(_) do
-    DynamicSupervisor.start_link(__MODULE__, [], name: __MODULE__)
-  end
-
-  @doc "Starts a pool of crawlers and a queue under the supervisor."
-  def start_child(pool_name, opts \\ []) do
-    case Registry.lookup(Spidey.Registry, pool_name) do
-      [_] ->
-        {:error, :already_exists}
-
-      [] ->
-        child_spec = PoolSupervisor.child_spec(pool_name, opts)
-        DynamicSupervisor.start_child(__MODULE__, child_spec)
-    end
-  end
-
-  @doc "Terminates both the pool of crawlers and the queue they use."
-  def terminate_child(pool_name) do
-    case Registry.lookup(Spidey.Registry, pool_name) do
-      [{_, %{pid: pid}}] ->
-        DynamicSupervisor.terminate_child(__MODULE__, pid)
-
-      [] ->
-        {:error, :not_found}
-    end
-  end
-
-  @impl true
-  def init(_args) do
-    DynamicSupervisor.init(
-      strategy: :one_for_one,
-      max_restarts: 5,
-      max_seconds: 5
-    )
-  end
-end
diff --git a/lib/spidey/crawler/pool_supervisor.ex b/lib/spidey/crawler/pool_supervisor.ex
deleted file mode 100644
index 8e32068..0000000
--- a/lib/spidey/crawler/pool_supervisor.ex
+++ /dev/null
@@ -1,42 +0,0 @@
-defmodule Spidey.Crawler.PoolSupervisor do
-  use Supervisor
-
-  alias Spidey.Crawler.{PoolSupervisor, Worker, Queue}
-
-  def start_link(pool_name, opts) do
-    Supervisor.start_link(__MODULE__, %{pool_name: pool_name, opts: opts},
-      name: :"#{pool_name}Supervisor"
-    )
-  end
-
-  def child_spec(pool_name, opts) do
-    %{
-      id: :"#{pool_name}Supervisor",
-      start: {PoolSupervisor, :start_link, [pool_name, opts]}
-    }
-  end
-
-  @impl true
-  def init(%{pool_name: pool_name, opts: opts}) do
-    pool_size = Keyword.get(opts, :pool_size, 20)
-    max_overflow = Keyword.get(opts, :max_overflow, 5)
-
-    worker_opts = Keyword.take(opts, [:filter])
-
-    config = [
-      name: {:local, pool_name},
-      worker_module: Worker,
-      size: pool_size,
-      max_overflow: max_overflow
-    ]
-
-    children = [
-      :poolboy.child_spec(pool_name, config, worker_opts),
-      Queue.child_spec(pool_name)
-    ]
-
-    Registry.register(Spidey.Registry, pool_name, %{pid: self()})
-
-    Supervisor.init(children, strategy: :one_for_all)
-  end
-end
diff --git a/lib/spidey/crawler/queue.ex b/lib/spidey/crawler/queue.ex
index 54f9af7..0d49380 100644
--- a/lib/spidey/crawler/queue.ex
+++ b/lib/spidey/crawler/queue.ex
@@ -1,8 +1,6 @@
 defmodule Spidey.Crawler.Queue do
   use Agent
 
-  alias __MODULE__
-
   def start_link(urls, pool_name) do
     queue = :queue.from_list(urls)
     Agent.start_link(fn -> queue end, name: queue_name(pool_name))
@@ -11,7 +9,7 @@ defmodule Spidey.Crawler.Queue do
   def child_spec(pool_name, urls \\ []) do
     %{
       id: queue_name(pool_name),
-      start: {Queue, :start_link, [urls, pool_name]}
+      start: {__MODULE__, :start_link, [urls, pool_name]}
     }
   end
 
diff --git a/lib/spidey/crawler/supervisor.ex b/lib/spidey/crawler/supervisor.ex
new file mode 100644
index 0000000..bef287f
--- /dev/null
+++ b/lib/spidey/crawler/supervisor.ex
@@ -0,0 +1,19 @@
+defmodule Spidey.Crawler.Supervisor do
+  use Supervisor
+
+  alias Spidey.Crawler.Queue
+
+  def start_link(pool_name, _opts) do
+    Supervisor.start_link(__MODULE__, %{pool_name: pool_name}, name: :"#{pool_name}Supervisor")
+  end
+
+  @impl true
+  def init(%{pool_name: pool_name}) do
+    children = [
+      {Task.Supervisor, name: :"#{pool_name}TaskSupervisor"},
+      Queue.child_spec(pool_name)
+    ]
+
+    Supervisor.init(children, strategy: :one_for_all)
+  end
+end
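With poolboy gone, the per-crawl tree is just this small supervisor: a `Task.Supervisor` for the crawl tasks plus the queue Agent, both registered under names derived from the pool name, so concurrent crawls with different pool names cannot collide. A hypothetical iex session showing the naming scheme, assuming the `:default` pool name:

    {:ok, sup} = Spidey.Crawler.Supervisor.start_link(:default, [])

    Process.whereis(:defaultSupervisor)     #=> same pid as `sup`
    Process.whereis(:defaultTaskSupervisor) #=> pid of the Task.Supervisor child

The second atom is the `:"#{pool_name}TaskSupervisor"` name that `crawl_queue/3` hands to `Task.Supervisor.async_stream/4` in the crawler hunk above.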
diff --git a/lib/spidey/crawler/worker.ex b/lib/spidey/crawler/worker.ex
deleted file mode 100644
index e32e1f3..0000000
--- a/lib/spidey/crawler/worker.ex
+++ /dev/null
@@ -1,46 +0,0 @@
-defmodule Spidey.Crawler.Worker do
-  use GenServer, restart: :transient
-
-  alias Spidey.Filter
-  alias Spidey.Crawler.{Content, Queue, UrlStore}
-  alias Spidey.Logger
-
-  def start_link(opts \\ []) do
-    Logger.log("starting worker process")
-
-    filter = Keyword.get(opts, :filter, Filter.DefaultFilter)
-    GenServer.start_link(__MODULE__, %{filter: filter})
-  end
-
-  def crawl(pid, url, pool_name, seed, opts \\ []) when is_binary(url) do
-    Logger.log("pool #{pool_name} handling url: #{url}")
-    timeout = Keyword.get(opts, :timeout, 60_000)
-    GenServer.call(pid, {:work, url, pool_name, seed}, timeout)
-  end
-
-  @impl true
-  def init(state) do
-    {:ok, state}
-  end
-
-  @impl true
-  def handle_call({:work, url, pool_name, seed}, _from, %{filter: filter} = state) do
-    url
-    |> Content.scan()
-    |> Filter.filter_urls(filter, seed: seed)
-    |> Stream.reject(&UrlStore.exists?(&1, pool_name))
-    |> Enum.each(&push_to_stores(&1, pool_name))
-
-    {:reply, :ok, state}
-  end
-
-  @impl true
-  def terminate(reason, _state) do
-    Logger.log("worker terminated due to #{inspect(reason)}")
-  end
-
-  defp push_to_stores(url, pool_name) do
-    Queue.push(url, pool_name)
-    UrlStore.add(url, pool_name)
-  end
-end
diff --git a/mix.exs b/mix.exs
index 0d90bf3..86ea434 100644
--- a/mix.exs
+++ b/mix.exs
@@ -15,7 +15,7 @@ defmodule Spidey.MixProject do
       package: package(),
       docs: docs(),
       version: @version,
-      elixir: "~> 1.11",
+      elixir: "~> 1.6",
       start_permanent: Mix.env() == :prod,
       deps: deps(),
      elixirc_paths: elixirc_paths(Mix.env()),
@@ -33,12 +33,11 @@ defmodule Spidey.MixProject do
 
   defp deps do
     [
-      {:httpoison, "~> 1.7.0"},
+      {:httpoison, "~> 1.8.0"},
       {:floki, "~> 0.27.0"},
       {:excoveralls, "~> 0.10", only: :test},
       {:mox, "~> 0.5", only: :test},
-      {:ex_doc, ">= 0.0.0", only: :dev, runtime: false},
-      {:poolboy, "~> 1.5"}
+      {:ex_doc, ">= 0.0.0", only: :dev, runtime: false}
     ]
   end
 
"hexpm", optional: false]}, {:idna, "6.0.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.0", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.6", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "3bf0bebbd5d3092a3543b783bf065165fa5d3ad4b899b836810e513064134e18"}, + "hackney": {:hex, :hackney, "1.17.4", "99da4674592504d3fb0cfef0db84c3ba02b4508bae2dff8c0108baa0d6e0977c", [:rebar3], [{:certifi, "~>2.6.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "de16ff4996556c8548d512f4dbe22dd58a587bf3332e7fd362430a7ef3986b16"}, "html_entities": {:hex, :html_entities, "0.5.1", "1c9715058b42c35a2ab65edc5b36d0ea66dd083767bef6e3edb57870ef556549", [:mix], [], "hexpm", "30efab070904eb897ff05cd52fa61c1025d7f8ef3a9ca250bc4e6513d16c32de"}, - "httpoison": {:hex, :httpoison, "1.7.0", "abba7d086233c2d8574726227b6c2c4f6e53c4deae7fe5f6de531162ce9929a0", [:mix], [{:hackney, "~> 1.16", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "975cc87c845a103d3d1ea1ccfd68a2700c211a434d8428b10c323dc95dc5b980"}, - "idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"}, + "httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"}, + "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, "jason": {:hex, :jason, "1.1.2", "b03dedea67a99223a2eaf9f1264ce37154564de899fd3d8b9a21b1a6fd64afe7", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fdf843bca858203ae1de16da2ee206f53416bbda5dc8c9e78f43243de4bc3afe"}, "makeup": {:hex, :makeup, "1.0.3", "e339e2f766d12e7260e6672dd4047405963c5ec99661abdc432e6ec67d29ef95", [:mix], [{:nimble_parsec, "~> 0.5", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "2e9b4996d11832947731f7608fed7ad2f9443011b3b479ae288011265cdd3dad"}, "makeup_elixir": {:hex, :makeup_elixir, "0.14.1", "4f0e96847c63c17841d42c08107405a005a2680eb9c7ccadfd757bd31dabccfb", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f2438b1a80eaec9ede832b5c41cd4f373b38fd7aa33e3b22d9db79e640cbde11"}, @@ -17,9 +17,9 @@ "mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm", 
"b93e2b1e564bdbadfecc297277f9e6d0902da645b417d6c9210f6038ac63489a"}, "mox": {:hex, :mox, "0.5.1", "f86bb36026aac1e6f924a4b6d024b05e9adbed5c63e8daa069bd66fb3292165b", [:mix], [], "hexpm", "052346cf322311c49a0f22789f3698eea030eec09b8c47367f0686ef2634ae14"}, "nimble_parsec": {:hex, :nimble_parsec, "0.6.0", "32111b3bf39137144abd7ba1cce0914533b2d16ef35e8abc5ec8be6122944263", [:mix], [], "hexpm", "27eac315a94909d4dc68bc07a4a83e06c8379237c5ea528a9acff4ca1c873c52"}, - "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm", "17ef63abde837ad30680ea7f857dd9e7ced9476cdd7b0394432af4bfc241b960"}, + "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"}, "poolboy": {:hex, :poolboy, "1.5.2", "392b007a1693a64540cead79830443abf5762f5d30cf50bc95cb2c1aaafa006b", [:rebar3], [], "hexpm", "dad79704ce5440f3d5a3681c8590b9dc25d1a561e8f5a9c995281012860901e3"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"}, - "unicode_util_compat": {:hex, :unicode_util_compat, "0.5.0", "8516502659002cec19e244ebd90d312183064be95025a319a6c7e89f4bccd65b", [:rebar3], [], "hexpm", "d48d002e15f5cc105a696cf2f1bbb3fc72b4b770a184d8420c8db20da2674b38"}, + "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, }