crawling.py
import os
import subprocess
import threading
import pdb
from subprocess import Popen

from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess, CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from agg.spiders.nature_spider import NatureSpider
from ui.custom_crawler import CustomCrawler
from project_vars import Paths, Spiders
from settings import load_settings

# Read the sync settings file generated by the kivy app (deprecated)
def read_sync_settings(file='test.ini', section='Sync Settings'):
    f = open('%s/%s' % (Paths.ui_path, file))
    section_found = 0
    settings = {}
    for line in f.readlines():
        stripped_line = line.rstrip()
        if stripped_line == '[%s]' % section:
            section_found = 1
        elif section_found:
            # If we have already found our section, but encounter another
            # settings section header, it's time to stop recording settings
            if '[' in stripped_line:
                break
            # Split the line into components. The 0th should be the key, the
            # 2nd should be the value after the equals sign
            data = stripped_line.split()
            if data:
                settings[data[0]] = data[2]
    f.close()
    return settings
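
# Illustrative only: given the parsing above, a hypothetical ini file such as
#
#   [Sync Settings]
#   nature = 7
#   other_spider = 14
#
# would be read as {'nature': '7', 'other_spider': '14'}: each line is split on
# whitespace, with the key at index 0 and the value at index 2 (the token after
# the '=' sign), and values come back as strings. The spider names shown here
# are made up for the example.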

# Need this function because scrapy and qt are installed in two different
# virtual environments.
def do_crawl_wrapper():
    # Stub: the command that launches the crawl inside the scrapy virtual
    # environment still needs to be supplied here; Popen() requires it.
    p = Popen()
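# A minimal sketch of what this wrapper might do, assuming the crawl is kicked
# off by re-running this module under the scrapy environment's interpreter.
# The interpreter path is hypothetical and not given anywhere in this file:
#
#   p = Popen(['/path/to/scrapy-venv/bin/python', 'crawling.py'])
#   p.wait()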

def do_crawl():
    # Earlier in-process attempts, kept for reference:
    # spider = NatureSpider()
    # settings = Settings()
    # settings.setmodule('crawler.agg.settings', priority='project')
    # settings.set('SPIDER_MODULES', 'crawler.agg.spiders', priority='project')
    # print settings.getdict('ITEM_PIPELINES')
    # pdb.set_trace()
    #
    # crawler = Crawler(spider, settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.crawl()
    # reactor.run()
    #
    # process = CrawlerProcess(settings)
    # process.crawl(spider)
    # process.start()
    #
    # process = CustomCrawler(settings)
    # process.crawl(spider)
    # process.start(stop_after_crawl=True)
    #
    # runner = CrawlerRunner(settings)
    # d = runner.crawl(spider)
    # d.addBoth(lambda _: reactor.stop())
    # reactor.run()

    # Current approach: run each enabled spider in its own scrapy subprocess.
    settings = load_settings()
    for spider in Spiders.spiders:
        if settings[spider]["enabled"]:
            script = ["scrapy", "crawl", spider,
                      "-a", "sync_length=%s" % settings[spider]["sync_length"]]
            try:
                p = Popen(script, cwd='%s/crawler/agg' % os.getcwd())
                p.wait()
                print('Crawl Finished!')
            except subprocess.CalledProcessError:
                pass
            except OSError:
                pass
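
# Illustrative only: the loop above assumes load_settings() returns a dict
# keyed by spider name, each entry carrying at least 'enabled' and
# 'sync_length', e.g. (spider name hypothetical):
#
#   {'nature': {'enabled': True, 'sync_length': '7'}}
#
# so that each enabled spider is launched as the equivalent of the shell
# command
#
#   scrapy crawl nature -a sync_length=7
#
# run from the crawler/agg directory.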

if __name__ == "__main__":
    do_crawl()