Running Scrapy from a script
Shared by Byrx.net on 2019-03-23 10:03:53
This snippet can be used to run Scrapy spiders from a script, independent of scrapyd or the `scrapy` command-line tool.

The `multiprocessing` library is used to work around a bug in Twisted: you cannot restart an already-running reactor, or in this case a Scrapy instance, so each crawl runs in its own process.

[Here](http://groups.google.com/group/scrapy-users/browse_thread/thread/f332fc5b749d401a) is the mailing-list discussion for this snippet.

```python
#!/usr/bin/python
import os
# Must be set at the top, before the other imports
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')

from scrapy import log, signals, project
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue


class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        # Collect every scraped item via the item_passed signal
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        # Hand the collected items back to the parent process
        queue.put(self.items)

    def crawl(self, spider):
        # Run each crawl in a child process so the Twisted reactor
        # starts fresh every time
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)


# Usage
if __name__ == "__main__":
    log.start()

    # This example runs spider1 and then spider2 three times.
    items = list()
    crawler = CrawlerScript()
    items.append(crawler.crawl('spider1'))
    for i in range(3):
        items.append(crawler.crawl('spider2'))
    print items
```
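The snippet above targets a very old Scrapy release: `scrapy.conf`, `scrapy.xlib.pydispatch`, the project-wide `crawler` object, and the `item_passed` signal have all since been removed. Below is a rough sketch of the same idea on a recent Scrapy version (assuming a standard project layout so that `get_project_settings()` and the spider loader can resolve names, with `spider1` and `spider2` as placeholder spider names). It drives the documented `CrawlerProcess` API directly from a script, with `multiprocessing` still isolating each crawl from Twisted's non-restartable reactor.

```python
from multiprocessing import Process, Queue

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def _run_spider(queue, spider_name):
    """Run one spider in this child process and send its items back."""
    items = []

    def collect(item, **kwargs):
        items.append(item)

    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(spider_name)
    # item_scraped is the modern counterpart of the old item_passed signal
    crawler.signals.connect(collect, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start()  # blocks until the crawl finishes
    queue.put(items)


def crawl(spider_name):
    # A fresh process means a fresh Twisted reactor for every crawl
    queue = Queue()
    p = Process(target=_run_spider, args=(queue, spider_name))
    p.start()
    p.join()
    return queue.get()


if __name__ == "__main__":
    items = crawl('spider1')  # placeholder spider name
    for _ in range(3):
        items.extend(crawl('spider2'))
    print(items)
```

Because Scrapy connects signal receivers with weak references, `collect` is kept as a local name so it stays alive for the duration of the blocking `start()` call; running each crawl in a fresh child process sidesteps the reactor-restart problem exactly as in the original snippet.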