From 3773e520da45e327a89b56b5db9e264a5c96496b Mon Sep 17 00:00:00 2001 From: Dereck Smith Date: Fri, 29 Mar 2024 16:15:44 +0530 Subject: [PATCH] Commit commited --- .gitignore | 5 ++ .projectile | 0 LICENSE.md | 7 ++ README.md | 64 ++++++++++++++++++ build.clj | 30 +++++++++ deps.edn | 9 +++ src/mail_harvester/core.clj | 120 +++++++++++++++++++++++++++++++++ src/mail_harvester/scraper.clj | 53 +++++++++++++++ 8 files changed, 288 insertions(+) create mode 100644 .gitignore create mode 100644 .projectile create mode 100644 LICENSE.md create mode 100644 README.md create mode 100644 build.clj create mode 100644 deps.edn create mode 100644 src/mail_harvester/core.clj create mode 100644 src/mail_harvester/scraper.clj diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..34b911a --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +target/ +.lsp/ +.cpcache/ +.clj-kondo/ +drivers/ \ No newline at end of file diff --git a/.projectile b/.projectile new file mode 100644 index 0000000..e69de29 diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..4fce019 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,7 @@ +Copyright 2024 Dereck Smith + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..2a74e9d --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# Mail Harvester + +A Clojure app that scrapes emails and links from any website. + +Browsers supported: +* Chrome +* Firefox +* Safari + +## How to use? + +### Note + +If you are using Chrome or Firefox, skip this section. + +If you want to use Safari, you must first enable its Remote Automation feature, which this application relies on. + +For Sonoma: +* Open Safari > Preferences from the menu bar (or use the shortcut Command + ,) +* Go to the Advanced section +* Check the "Show features for web developers" checkbox +* Go to the Developer section +* Check the "Allow Remote Automation" checkbox + +For Ventura and below: +* Open Safari > Preferences from the menu bar (or use the shortcut Command + ,) +* Go to the Advanced section +* Check the "Show Develop menu in menu bar" checkbox +* Click Safari > Develop > Allow Remote Automation from the menu bar + +### Setup + +Before using this app, you need to install Java from [here](https://adoptium.net). + +After you have installed Java, download the latest release from the Releases section on the right side of this page. + +After you have downloaded the archive, unzip it and double-click the .jar to run it! + +# For Developers + +## Prerequisites + +While running locally, this project expects the drivers to be in a folder named `drivers`. For users' convenience, these drivers are packaged in the Releases, but not in the repository. + +If you run it with the `clj` tool, they should be in the root of the repository. 
If you run it after compiling it in a jar, it needs to be in the `target` directory (or wherever the JAR is)
+
+The browsers this application supports are:
+* Chrome
+* Firefox
+* Safari
+
+You can download the drivers for them at:
+* [Chrome](https://chromedriver.chromium.org/downloads)
+* [Firefox](https://github.com/mozilla/geckodriver/releases)
+* Safari doesn't need a driver. Check the [note](#note) above
+
+## How to run
+
+To run from the `clj` tool, use `clj -M -m mail-harvester.core`
+To compile a JAR, use `clj -T:build uber`
+
+# License
+
+This project is licensed under the MIT License.
diff --git a/build.clj b/build.clj
new file mode 100644
index 0000000..2ab2114
--- /dev/null
+++ b/build.clj
@@ -0,0 +1,30 @@
+(ns build
+  (:require [clojure.tools.build.api :as b]))
+
+(def build-directory "target")
+(def jar-content (str build-directory "/resources"))
+
+(def basis (b/create-basis {:project "deps.edn"}))
+(def version "1.0.0")
+(def app-name "mail-harvester")
+(def uber-file-name (format "%s/%s-%s-standalone.jar" build-directory app-name version))
+
+(defn clean [_]
+  (b/delete {:path build-directory})
+  (println (format "Build directory \"%s\" removed" build-directory)))
+
+(defn uber [_]
+  (clean nil)
+  (b/copy-file {:src "README.md"
+                :target (str build-directory "/README.md")})
+
+  (b/compile-clj {:basis basis
+                  :src-dirs ["src"]
+                  :class-dir jar-content})
+
+  (b/uber {:class-dir jar-content
+           :uber-file uber-file-name
+           :basis basis
+           :main 'mail-harvester.core})
+
+  (println (format "Uber file created: \"%s\"" uber-file-name)))
diff --git a/deps.edn b/deps.edn
new file mode 100644
index 0000000..0f871c9
--- /dev/null
+++ b/deps.edn
@@ -0,0 +1,9 @@
+{:paths ["src" "classes"]
+ :deps {org.clojure/clojure {:mvn/version "1.11.2"}
+        cljfx/cljfx {:mvn/version "1.8.0"}
+        etaoin/etaoin {:mvn/version "1.0.40"}
+        org.clojure/core.async {:mvn/version "1.6.681"}
+        org.clojure/data.csv {:mvn/version "1.1.0"}}
+ :aliases {
+           :build {:deps {io.github.clojure/tools.build {:mvn/version "0.10.0"}}
+                   :ns-default build}}}
diff --git a/src/mail_harvester/core.clj b/src/mail_harvester/core.clj
new file mode 100644
index 0000000..125fb70
--- /dev/null
+++ b/src/mail_harvester/core.clj
@@ -0,0 +1,110 @@
+(ns mail-harvester.core
+  (:gen-class)
+  (:require [cljfx.api :as fx]
+            [mail-harvester.scraper :as scraper]
+            [clojure.core.async :refer [thread]])
+  (:import [javafx.application Platform]))
+
+;; The main, mutable state for the app
+(def *state
+  (atom {:status "Not in use"
+         :url ""
+         :browser "Chrome"}))
+
+;; Separate widgets so that they are easier to read.
+
+(defn url-field
+  "The URL field for the app"
+  [{}]
+  {:fx/type :h-box
+   :spacing 5
+   :alignment :center
+   :children [{:fx/type :label
+               :text "URL: "}
+              {:fx/type :text-field
+               :on-text-changed #(swap! *state assoc :url %)}]})
+
+(defn browser-picker
+  "The browser picker menu for the app"
+  [{}]
+  {:fx/type :h-box
+   :spacing 5
+   :alignment :center
+   :children [{:fx/type :label
+               :text "Browser: "}
+              {:fx/type :choice-box
+               :items ["Chrome" "Firefox" "Safari"]
+               :value "Chrome"
+               :on-value-changed (fn [value]
+                                   (swap! *state assoc :browser value))}]})
+
+(defn- start-scrape!
+  "Scrapes the URL currently held in `*state` for `action` (\"emails\" or
+  \"links\") and exports the result to a CSV file. Both scrape buttons go
+  through here so that they report progress and errors the same way."
+  [action]
+  ;; Read both inputs from a single snapshot of the state
+  (let [{:keys [url browser]} @*state]
+    ;; Let the user know that scraping is going on
+    (swap! *state assoc :status "Scraping")
+    ;; Run the scraper in another thread to prevent locking the UI
+    (thread
+      (try
+        ;; Scrape, then export the result to a CSV
+        (scraper/write-to-exports (scraper/scrape-url url browser action) action)
+        ;; ...and let the user know that we have exported it
+        (swap! *state assoc :status "Scraping Done!")
+        (catch Exception e
+          ;; Write an error log (pr-str keeps the stack trace, unlike str)
+          (spit "error.txt" (pr-str e))
+          ;; And tell the user an error occurred
+          (swap! *state assoc :status "Error! Please file an issue on GitHub"))))))
+
+(defn scrape-emails-button
+  "The 'Scrape URL for emails' button for the app"
+  [{}]
+  {:fx/type :button
+   :on-action (fn [_] (start-scrape! "emails"))
+   :text "Scrape URL for emails"})
+
+(defn scrape-links-button
+  "The 'Scrape URL for links' button for the app"
+  [{}]
+  {:fx/type :button
+   :on-action (fn [_] (start-scrape! "links"))
+   :text "Scrape URL for links"})
+
+(defn root
+  "The root app that glues all the components together"
+  [{:keys [status]}]
+  {:fx/type :stage
+   :showing true
+   :title "Mail Harvester"
+   :scene {:fx/type :scene
+           :root {:fx/type :v-box
+                  :padding 15
+                  :alignment :center
+                  :spacing 5
+                  :children [{:fx/type url-field}
+                             {:fx/type browser-picker}
+                             {:fx/type :label
+                              :text "Choose the action you would like to perform"}
+                             {:fx/type :v-box
+                              :alignment :center
+                              :spacing 5
+                              :children [{:fx/type scrape-links-button}
+                                         {:fx/type scrape-emails-button}]}
+                             {:fx/type :label
+                              :text (str "Status: " status)}
+                             {:fx/type :label
+                              :text "Visit the documentation for details on usage"}]}}})
+
+;; A renderer that constantly checks the state and reloads if anything changes in the state
+(def renderer (fx/create-renderer
+               :middleware (fx/wrap-map-desc assoc :fx/type root)))
+
+(defn -main
+  "The entry point for the app"
+  [& _args]
+  (Platform/setImplicitExit true) ;; Exits when the main window closes
+  (fx/mount-renderer *state renderer))
diff --git a/src/mail_harvester/scraper.clj b/src/mail_harvester/scraper.clj
new file mode 100644
index 0000000..7b8a364
--- /dev/null
+++ 
b/src/mail_harvester/scraper.clj
@@ -0,0 +1,59 @@
+(ns mail-harvester.scraper
+  (:require [etaoin.api :as e]
+            [clojure.java.io :as io]
+            [clojure.data.csv :as csv]
+            [clojure.string :as string])
+  (:import [java.util UUID]))
+
+(defn fetch-emails
+  "Fetches the mailto: links in a website and returns a list of bare email
+  addresses: the leading mailto: scheme and any ?subject=...-style query
+  string are removed."
+  [driver]
+  (for [rawemail (e/query-all driver {:css "a[href^=\"mailto:\"]"})]
+    ;; Anchor the scheme at the start so text inside the address is never touched
+    (string/replace (e/get-element-attr-el driver rawemail :href)
+                    #"^mailto:|\?.*$"
+                    "")))
+
+(defn fetch-links
+  "Fetches any links in a website and returns a list of links"
+  [driver]
+  (for [rawlink (e/query-all driver {:css "a"})]
+    ;; Return the raw link from the href attribute
+    (e/get-element-attr-el driver rawlink :href)))
+
+(defn- start-driver
+  "Starts a browser session for `browser` (\"Firefox\", \"Chrome\" or \"Safari\").
+  Throws an Exception for any other value."
+  [browser]
+  (case browser
+    "Firefox" (e/firefox-headless {:path-driver "./drivers/geckodriver"})
+    "Chrome" (e/chrome-headless {:path-driver "./drivers/chromedriver"})
+    "Safari" (e/safari)
+    (throw (Exception. (str "Browser is not valid: " browser)))))
+
+(defn scrape-url
+  "Scrapes from the URL, with information on which browser to use and what action to perform.
+  Returns the scraped values, or nil for an unknown action. The browser
+  session is always closed before returning."
+  [url browser action]
+  (let [driver (start-driver browser)]
+    (try
+      (e/go driver url)
+      (case action
+        ;; doall: the fetchers are lazy and must be realised before the session is closed
+        "emails" (doall (fetch-emails driver))
+        "links" (doall (fetch-links driver))
+        (println (format "Unknown action \"%s\" ignored" action)))
+      (finally
+        ;; Shut the browser and its driver down, even when scraping failed
+        (e/quit driver)))))
+
+(defn write-to-exports
+  "Exports a list of emails or links to a CSV file with a random UUID and a prefix"
+  [links type]
+  ;; with-open closes the writer once the rows have been written
+  (with-open [writer (io/writer (str type "-" (UUID/randomUUID) ".csv"))]
+    ;; One single-column row per email/link
+    (csv/write-csv writer (map vector links))))