I need Scrapy to take a command-line argument (-a file_name="stuff") and apply it to the file created in CsvWriterPipeline in my pipelines.py file. (The reason I went with a pipeline in the first place is that the built-in exporter was repeating data and repeating the header in the output file. The same code, written in a pipeline, fixed it.)
I tried `from scrapy.utils.project import get_project_settings` as seen in
"How to access scrapy settings from an item pipeline",
but I couldn't change the file name from the command line.
I've also tried implementing @avaleske's solution from that page, since it addresses exactly this, but I don't know where to place the code it talks about in my Scrapy folder.
Help?
settings.py:
BOT_NAME = 'internal_links'
SPIDER_MODULES = ['internal_links.spiders']
NEWSPIDER_MODULE = 'internal_links.spiders'
CLOSESPIDER_PAGECOUNT = 100
ITEM_PIPELINES = ['internal_links.pipelines.CsvWriterPipeline']
# Crawl responsibly by identifying yourself (and your website) in the user-agent
USER_AGENT = 'internal_links (+http://www.mycompany.com)'
FILE_NAME = "mytestfilename"

pipelines.py:
import csv


class CsvWriterPipeline(object):
    """Item pipeline that writes each scraped item's URL to a CSV file.

    The header row is written once, at construction time, which avoids the
    repeated-header problem seen with the built-in exporter.
    """

    def __init__(self, file_name):
        header = ["url"]
        self.file_name = file_name
        # Text mode with newline='' is what the csv module requires on
        # Python 3 ('wb' would make csv.writer raise a TypeError).
        self.csvwriter = csv.writer(open(self.file_name, 'w', newline=''))
        self.csvwriter.writerow(header)

    def process_item(self, item, internallinkspider):
        # Build the row to export, then export the row.
        row = [item['url']]
        self.csvwriter.writerow(row)
        return item

# spider.py:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

from internal_links.items import MyItem


class MySpider(CrawlSpider):
    """Crawl the site, following every extracted link, and record each URL."""

    name = 'internallinkspider'
    allowed_domains = ['angieslist.com']
    start_urls = ['http://www.angieslist.com']

    # An unfiltered link extractor: follow everything, and send every
    # response to parse_url.  (`follow=True`, not the undefined name `true`.)
    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=True), )

    def parse_url(self, response):
        """Return one item carrying the URL of the crawled response."""
        item = MyItem()
        item['url'] = response.url
        return item
You can use the "settings" mechanism and the -s command-line argument:

scrapy crawl internallinkspider -s file_name="stuff"

Then, in the pipeline:
import csv


class CsvWriterPipeline(object):
    """CSV-writing pipeline whose output file name comes from the settings.

    Usage:  scrapy crawl internallinkspider -s file_name="stuff"
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler; read the file
        # name from the settings, which -s overrides from the command line.
        settings = crawler.settings
        file_name = settings.get("file_name")
        return cls(file_name)

    def __init__(self, file_name):
        header = ["url"]
        # Text mode with newline='' is what the csv module requires on
        # Python 3 ('wb' would make csv.writer raise a TypeError).
        self.csvwriter = csv.writer(open(file_name, 'w', newline=''))
        self.csvwriter.writerow(header)

    def process_item(self, item, internallinkspider):
        # Build the row to export, then export the row.
        row = [item['url']]
        self.csvwriter.writerow(row)
        return item
Comments
Post a Comment