# This script lets you quickly crawl pages based on the sitemap(s) of any given domain.  Neither the XmlXPathSelector nor the XmlItemExporter was doing the job for me; this code is by far the fastest and easiest way I have found to crawl a site based on the URLs listed in its sitemap.
 
import re
from xml.sax.saxutils import unescape

from scrapy import log
from scrapy.http import Request
from scrapy.item import Field, Item
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.utils.response import body_or_str
 
class SitemapPageItem(Item):
    """Item holding the data scraped from one page listed in the sitemap."""

    # Text of the first <div> text node on the page, or None when the page
    # has no such node.
    divText = Field()


class SitemapSpider(BaseSpider):
    """Crawl every page whose URL is listed in a domain's sitemap.

    The sitemap named in ``start_urls`` is fetched first.  ``parse`` extracts
    each ``<loc>`` URL from it and schedules one request per URL; every
    resulting page is handled by ``parse_page``.
    """

    name = "SitemapSpider"
    start_urls = ["http://www.domain.com/sitemap.xml"]

    # Compiled once for the whole class.  The opening tag may carry
    # attributes; only the element content is captured.
    _LOC_RE = re.compile(r"<loc(?:\s[^>]*)?>(.*?)</loc>", re.DOTALL)

    # Entities the sitemap protocol requires that unescape() does not decode
    # by default (it only handles &amp;, &lt; and &gt;).
    _EXTRA_ENTITIES = {"&apos;": "'", "&quot;": '"'}

    def parse(self, response):
        """Yield one page request per ``<loc>`` entry in the sitemap.

        :param response: the downloaded sitemap document.
        :returns: generator of ``Request`` objects whose callback is
            ``parse_page``.
        """
        text = body_or_str(response)
        for match in self._LOC_RE.finditer(text):
            # Sitemap URLs are XML-escaped (e.g. "&amp;") and are often
            # padded with whitespace or newlines inside the element.
            url = unescape(match.group(1).strip(), self._EXTRA_ENTITIES)
            if url:  # skip empty <loc></loc> entries
                yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Extract the item for a single crawled page.

        :param response: the downloaded HTML page.
        :returns: generator yielding one ``SitemapPageItem``.
        """
        hxs = HtmlXPathSelector(response)

        # Mock item: replace the field(s) with whatever you need to scrape.
        item = SitemapPageItem()

        # Do all your page parsing and select the elements you want here.
        # A page may have no <div> text at all, so do not index blindly.
        div_texts = hxs.select('//div/text()').extract()
        item['divText'] = div_texts[0] if div_texts else None
        yield item
#//python/8398

回复 "通过scrapy抓取网站的sitemap信息"

这儿你可以回复上面这条便签

作者你的名字是？

标题给你的便签一个标题。

语言你的便签是以

你的便签在这儿输入便签内容

# This script lets you quickly crawl pages based on the sitemap(s) of any given domain.  Neither the XmlXPathSelector nor the XmlItemExporter was doing the job for me; this code is by far the fastest and easiest way I have found to crawl a site based on the URLs listed in its sitemap.
 
import re
 
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.utils.response import body_or_str
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
 
class SitemapPageItem(Item):
    """Item holding the data scraped from one page listed in the sitemap."""

    # Text of the first <div> text node on the page, or None when the page
    # has no such node.
    divText = Field()


class SitemapSpider(BaseSpider):
    """Spider that visits each URL published in a site's sitemap.

    Crawling starts at the sitemap in ``start_urls``.  ``parse`` reads the
    ``<loc>`` entries and requests each listed page, and ``parse_page``
    turns every fetched page into an item.

    Indentation is uniform spaces: the previous mix of tabs and spaces
    inside ``parse_page`` is a ``TabError`` on Python 3.
    """

    name = "SitemapSpider"
    start_urls = ["http://www.domain.com/sitemap.xml"]

    # Compiled once for the whole class.  The opening tag may carry
    # attributes; only the element content is captured.
    _LOC_RE = re.compile(r"<loc(?:\s[^>]*)?>(.*?)</loc>", re.DOTALL)

    # Entities the sitemap protocol requires that unescape() does not decode
    # by default (it only handles &amp;, &lt; and &gt;).
    _EXTRA_ENTITIES = {"&apos;": "'", "&quot;": '"'}

    def parse(self, response):
        """Yield one page request per ``<loc>`` entry in the sitemap.

        :param response: the downloaded sitemap document.
        :returns: generator of ``Request`` objects whose callback is
            ``parse_page``.
        """
        text = body_or_str(response)
        for match in self._LOC_RE.finditer(text):
            # Sitemap URLs are XML-escaped (e.g. "&amp;") and are often
            # padded with whitespace or newlines inside the element.
            url = unescape(match.group(1).strip(), self._EXTRA_ENTITIES)
            if url:  # skip empty <loc></loc> entries
                yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Extract the item for a single crawled page.

        :param response: the downloaded HTML page.
        :returns: generator yielding one ``SitemapPageItem``.
        """
        hxs = HtmlXPathSelector(response)

        # Mock item: replace the field(s) with whatever you need to scrape.
        item = SitemapPageItem()

        # Do all your page parsing and select the elements you want here.
        # A page may have no <div> text at all, so do not index blindly.
        div_texts = hxs.select('//div/text()').extract()
        item['divText'] = div_texts[0] if div_texts else None
        yield item
#//python/8398

创建短链接创建一个较短的URL，连接到这个便签

私人私人便签不会显示在最近列表中

保存期限我们应该什么时候删除这张便签？

防滥用键入这些字符

Code666 (代码贴、代码片段)

[Python] 通过scrapy抓取网站的sitemap信息 →→→→→进入此内容的聊天室

回复 "通过scrapy抓取网站的sitemap信息"