crawling.py
import os
import subprocess
import threading
import pdb
from subprocess import Popen

from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess, CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from agg.spiders.nature_spider import NatureSpider
from ui.custom_crawler import CustomCrawler
from project_vars import Paths, Spiders
from settings import load_settings

# Read the sync settings file generated by the kivy app (deprecated)
def read_sync_settings(file='test.ini', section='Sync Settings'):
    f = open('%s/%s' % (Paths.ui_path, file))
    section_found = 0
    settings = {}
    for line in f.readlines():
        stripped_line = line.rstrip()
        if stripped_line == '[%s]' % section:
            section_found = 1
        elif section_found:
            # If we have already found our section, but encounter another
            # settings section header, it's time to stop recording settings
            if '[' in stripped_line:
                break
            # Split the line into components. The 0th should be the key, the
            # 2nd should be the value after the equals sign
            data = stripped_line.split()
            if data:
                settings[data[0]] = data[2]
    f.close()
    return settings
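
# Illustrative only: given the parsing above, a hypothetical ini file such as
#
#   [Sync Settings]
#   nature = 7
#   other_spider = 14
#
# would be read as {'nature': '7', 'other_spider': '14'}: each line is split on
# whitespace, with the key at index 0 and the value at index 2 (the token after
# the '=' sign), and values come back as strings. The spider names shown here
# are made up for the example.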

# Need this function because scrapy and qt are installed in two different
# virtual environments.
def do_crawl_wrapper():
    # Stub: the command that launches the crawl inside the scrapy virtual
    # environment still needs to be supplied here; Popen() requires it.
    p = Popen()
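# A minimal sketch of what this wrapper might do, assuming the crawl is kicked
# off by re-running this module under the scrapy environment's interpreter.
# The interpreter path is hypothetical and not given anywhere in this file:
#
#   p = Popen(['/path/to/scrapy-venv/bin/python', 'crawling.py'])
#   p.wait()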

def do_crawl():
    # Earlier in-process attempts, kept for reference:
    # spider = NatureSpider()
    # settings = Settings()
    # settings.setmodule('crawler.agg.settings', priority='project')
    # settings.set('SPIDER_MODULES', 'crawler.agg.spiders', priority='project')
    # print settings.getdict('ITEM_PIPELINES')
    # pdb.set_trace()
    #
    # crawler = Crawler(spider, settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.crawl()
    # reactor.run()
    #
    # process = CrawlerProcess(settings)
    # process.crawl(spider)
    # process.start()
    #
    # process = CustomCrawler(settings)
    # process.crawl(spider)
    # process.start(stop_after_crawl=True)
    #
    # runner = CrawlerRunner(settings)
    # d = runner.crawl(spider)
    # d.addBoth(lambda _: reactor.stop())
    # reactor.run()

    # Current approach: run each enabled spider in its own scrapy subprocess.
    settings = load_settings()
    for spider in Spiders.spiders:
        if settings[spider]["enabled"]:
            script = ["scrapy", "crawl", spider,
                      "-a", "sync_length=%s" % settings[spider]["sync_length"]]
            try:
                p = Popen(script, cwd='%s/crawler/agg' % os.getcwd())
                p.wait()
                print('Crawl Finished!')
            except subprocess.CalledProcessError:
                pass
            except OSError:
                pass
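
# Illustrative only: the loop above assumes load_settings() returns a dict
# keyed by spider name, each entry carrying at least 'enabled' and
# 'sync_length', e.g. (spider name hypothetical):
#
#   {'nature': {'enabled': True, 'sync_length': '7'}}
#
# so that each enabled spider is launched as the equivalent of the shell
# command
#
#   scrapy crawl nature -a sync_length=7
#
# run from the crawler/agg directory.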

if __name__ == "__main__":
    do_crawl()