# Blog post caption (translated from Chinese): "Testing the editor's features."
import urllib

try:  # Python 2 keeps quote() directly on the urllib module
    from urllib import quote
except ImportError:  # Python 3 moved it to urllib.parse
    from urllib.parse import quote

# NOTE(review): these are the old ``scrapy.contrib`` / ``scrapy.spider``
# import paths; confirm they match the installed Scrapy release.
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item, Field


# Everything after "q=" is the percent-encoded search expression.
_SEARCH_URL_PREFIX = "http://stackoverflow.com/search?q="


class Question(Item):
    """One search result scraped from a stackoverflow.com results page."""

    tags = Field()     # text of every '.post-tag' element of the result
    answers = Field()  # <strong> text found under '.status'
    votes = Field()    # <strong> text found under '.vote-count-post'
    date = Field()     # 'title' attribute found under '.relativetime'
    link = Field()     # href of the anchor found under '.result-link'


class ArgSpider(CrawlSpider):
    """
    Scrapes first 15 stackoverflow.com questions containing "query" within a
    given "tag" and displays links, number of votes etc in the terminal.

    Usage:
    ~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]

    For example
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"

    Either argument may be omitted; the search is then restricted only by the
    one that was given.
    """

    # NOTE(review): CrawlSpider normally reserves parse() for its own rule
    # handling; no rules are defined here, so overriding it below is
    # presumably intentional -- confirm against the Scrapy documentation.
    name = "StackSpider"

    def __init__(self, tag=None, query=None, *args, **kwargs):
        """Build the single search URL the crawl starts from.

        Args:
            tag: Stack Overflow tag to search within (e.g. ``"python"``).
                Percent-encoded before use so tags such as ``c#`` or ``c++``
                survive in the URL.
            query: Free-text search phrase; percent-encoded before use.
            *args, **kwargs: Forwarded unchanged to ``CrawlSpider.__init__``.

        If neither ``tag`` nor ``query`` is given, ``start_urls`` is left
        empty instead of raising, so the spider can still be instantiated
        without arguments.
        """
        super(ArgSpider, self).__init__(*args, **kwargs)
        self.start_urls = []

        search_terms = []
        if tag:
            # "%5B" and "%5D" are the percent-encoded "[" and "]" that wrap
            # the tag in the search expression.
            search_terms.append("%5B{tag}%5D".format(tag=quote(tag)))
        if query:
            search_terms.append(quote(query))

        if search_terms:
            self.start_urls.append(_SEARCH_URL_PREFIX + "".join(search_terms))

    def parse(self, response):
        """Return one ``Question`` item per '.question-summary' element.

        Each field holds the list produced by ``extract()``; no values are
        unwrapped or converted. The lines starting with ``@`` are Scrapy
        spider contracts.

        @url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering
        @returns items 15
        @returns requests 0 1
        @scrapes votes answers date link
        """
        page = Selector(response)
        results = []
        for summary in page.css('.question-summary'):
            item = Question()
            item["tags"] = summary.css('.post-tag::text').extract()
            item["votes"] = (
                summary.css('.vote-count-post')
                .xpath('.//strong/text()')
                .extract()
            )
            item["answers"] = (
                summary.css('.status').xpath('.//strong/text()').extract()
            )
            item["date"] = (
                summary.css('.relativetime').xpath('.//@title').extract()
            )
            item["link"] = (
                summary.css('.result-link').xpath('.//a/@href').extract()
            )
            results.append(item)
        return results
文章標籤
全站熱搜
留言列表