master.conf
include "application"
crawler {
  modeler {
    // This section is for the modeling phase
    // Please store your CSV models in src/main/resources/targets
    // and your golden models in src/main/resources/golden
    // "http://your-website.com" {
    //   static.file = <filename.csv>  CSV model with which the crawling will be performed
    //                                 (do not use in combination with dynamic.pages)
    //   dynamic.pages = <integer n>   builds the model from scratch, crawling at most n URLs
    //                                 (do not use in combination with static.file;
    //                                 the generated model will be used if neither static.file
    //                                 nor dynamic.pages is specified;
    //                                 if a model of the website is already present and dynamic.pages
    //                                 is used, a new version of the model will be generated)
    //   wait = <integer>              fixed pause between HTTP requests, in milliseconds
    //                                 (default is 2000)
    //   randompause = <integer i>     random pause in the range [0,i] between HTTP requests, in milliseconds
    //                                 (default is 1000)
    //   javascript = <boolean>        whether or not JavaScript will be loaded
    //                                 (default is false)
    //   maxfailures = <integer>       max attempts to fetch a URL
    //                                 (default is 1)
    //   crawl = <boolean>             whether or not this website will be crawled with the model
    //                                 (default is false)
    //   golden = <filename.csv>       golden model used to evaluate the generated model
    //                                 (to be used only in combination with dynamic.pages)
    //   savepages = <boolean>         if true, the pages classified during dynamic modeling will be saved
    //                                 along with a CSV of (url, file_location, pageclass) triples
    //                                 (default is false)
    // }
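    // A minimal sketch of a dynamic-modeling entry (the URL, filenames, and values below are
    // illustrative assumptions, not project defaults):
    // "http://example.com" {
    //   dynamic.pages = 500
    //   wait = 1500
    //   javascript = true
    //   golden = "example-golden.csv"
    //   crawl = true
    //   savepages = true
    // }
    // Or, to crawl with a CSV model already stored in src/main/resources/targets:
    // "http://example.com" {
    //   static.file = "example-model.csv"
    //   crawl = true
    // }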
  }
  crawling {
    // This section is for the actual crawling phase
    // Every parameter refers to a single website
    fetchers = 1           // number of fetchers to use
    pages = 1000000        // max number of pages to be fetched
    frontierheap = 10000   // max number of urls allowed in the in-memory part of the frontier
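    // Illustrative alternative for a small test crawl (values are assumptions, not recommendations):
    // fetchers = 1
    // pages = 10000
    // frontierheap = 1000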
  }
}
nodes {
  // This section specifies the available hosts that will perform the crawl
  // Name each host repositoryN, with N = 0,1,2,...,n
  // The system of repository0 must be named CrawlSystem, the following ones RepositorySystemN
  repository0 {
    host = "127.0.0.1"
    port = 2552
    system = "CrawlSystem"
  }
  // repository1 {
  //   host = "192.168.1.132"
  //   port = 2552
  //   system = "RepositorySystem1"
  // }
}
akka {
  // Akka configuration for this node (repository0)
  // Every repositoryN must define its own repositoryN.conf file and override this section
  // with its own address
  actor {
    provider = remote
  }
  remote {
    log-remote-lifecycle-events = off
    artery {
      enabled = on
      canonical.hostname = "127.0.0.1"
      canonical.port = 2552
    }
  }
}
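
// A minimal sketch (hypothetical values) of the repositoryN.conf that a remote node, e.g.
// repository1, might use to override the akka section above; the include name and exact
// file layering are assumptions:
//
// include "master"
// akka {
//   remote.artery {
//     canonical.hostname = "192.168.1.132"
//     canonical.port = 2552
//   }
// }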