# archive.ps1
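# Launches HTTrack and Browsertrix (webrecorder) crawls of a site inside Docker containers.
# Usage, as implied by the script below (example URL for illustration only):
#   .\archive.ps1 https://www.example.org/
# Check on running crawls with .\attach.ps1, or stop them with .\quit-crawlers.ps1.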
# Make archive.ini if it doesn't exist
if (!(Test-Path -Path archive.ini)) {
    Copy-Item -Path resources/defaults.ini -Destination archive.ini
}
# Read in archive.ini and set each line as a variable
Get-Content archive.ini | ForEach-Object {
    $key, $value = $_ -split '=', 2
    Set-Variable -Name $key -Value $value
}
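# Example archive.ini contents (illustrative; the actual defaults come from resources/defaults.ini):
#   enable_browsertrix=TRUE
#   enable_httrack=TRUE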
# Check whether a crawl is already running and exit if so
foreach ($container in "httrack", "webrecorder") {
    $is_running = docker ps -q -f "name=$container"
    if ($is_running) {
        Write-Output "It looks like there is already a crawl running."
        Write-Output "Either check on the status by running .\attach.ps1 or quit by running .\quit-crawlers.ps1"
        exit
    }
}
# Set variables: the URL to crawl, the output location, and a timestamped per-crawl directory
$url = $args[0]
$workdir = Join-Path -Path $pwd -ChildPath "crawls"
$domain = ([System.Uri]$url).Host -replace '^www\.'
$now = Get-Date -UFormat '%Y-%m-%dT%H%M%S'
$crawldir = Join-Path -Path $workdir -ChildPath "$now-$domain"
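# For example, https://www.example.org/ crawled at 2024-05-01 10:15:00 would use
# .\crawls\2024-05-01T101500-example.org (illustrative values only).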
# Build the crawler container images
docker.exe build -f resources/Dockerfile.webrecorder -t site-archiving-toolkit-webrecorder .
docker.exe build -f resources/Dockerfile.httrack -t site-archiving-toolkit-httrack .
# Do the Browsertrix (webrecorder) crawl
if ($enable_browsertrix -eq "TRUE") {
    mkdir $crawldir\webrecorder
    docker run --name webrecorder -d --rm -p 9037:9037 -v $crawldir/:/output -it site-archiving-toolkit-webrecorder /bin/bash /webrecorder.sh $url $domain $now
}
# Do the HTTrack crawl
if ($enable_httrack -eq "TRUE") {
    mkdir $crawldir\httrack
    docker run --name httrack -d --rm -v $crawldir/:/output site-archiving-toolkit-httrack /bin/bash /httrack.sh $url $domain $now
}
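# Both crawl containers bind-mount $crawldir as /output, so their results land in the
# same per-crawl directory on the host.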
# Attach to the crawls as they run (--sig-proxy=false so signals are not forwarded to the containers)
foreach ($container in "httrack", "webrecorder") {
    $is_running = docker ps -q -f "name=$container"
    if ($is_running) {
        docker attach --sig-proxy=false $container
    }
}
Write-Output "Crawl of $url complete!"