[Python] Fetch a given web page and all of the links on it

From , 2019-05-09, written in Python, viewed 116 times.
URL http://www.code666.cn/view/97af07a1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 yangyingchao@gmail.com

# Author: yangyingchao <yangyingchao@gmail.com>

# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.

# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.

# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING.  If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************

# NOTE: this script targets Python 2; sgmllib and urllib2 do not exist in Python 3.
from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2

title = "Untitled"

class MyParser(SGMLParser):
    """Collect the page title and every <a href="..."> link from an HTML page."""

    def __init__(self):
        self.data = ""
        self.links = []
        self.link = ""
        self.title = ""
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        # Only accumulate text while inside a tag we care about.
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        # Record the anchor text together with its target.
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()

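# A minimal usage sketch of MyParser (illustrative only, not part of the
# original script):
#
#   parser = MyParser()
#   parser.feed('<title>Demo</title><a href="a.html">first</a>')
#   parser.close()
#   print parser.title   # -> "Demo"
#   print parser.links   # -> [{"name": "first", "link": "a.html"}]
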
def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string

def downURL(url, filename):
    """Fetch url and save the response body as filename; return 1 on success, 0 on failure."""
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1


def reptile(base_url):
    """
    Download all articles linked from base_url.
    Arguments:
    - `base_url`: URL of the website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()

    # Remote pages go through urllib2; anything else is treated as a local file.
    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open

    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)

    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    # Drop duplicate links before downloading.
    item_list = list(set(page_list))

    for item in item_list:
        # Strip the '#fragment' part from the url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]

        # Prepend base_url to relative links.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item

        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue

        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))


def walk_dir(lst, dirname, filenames):
    """Visitor for os.path.walk: record the file name and title of every html file."""
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or \
               not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))

def gen_index():
    """
    Generate an index of all html files in this directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)

    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' +\
        ' language="c" link="index.html" name="" title="%s"' % title +\
        ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '<sub link="%s" name="%s"/>\n' % (link, name)

    string += '\n</chapters>\n   </book>\n'
    fp.write(string)
    fp.close()

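# For reference, an illustrative sketch (not verbatim output) of the
# <current-dir-name>.devhelp2 file that gen_index() writes:
#
#   <?xml version="1.0" encoding="utf-8"?>
#   <book author="" language="c" link="index.html" name="" title="..."
#         version="2" xmlns="http://www.devhelp.net/book">
#     <chapters><sub link="page1.html" name="Page 1 title"/>...</chapters>
#   </book>
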
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s <url of baidu space>" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        # With no URL argument, only (re)generate the index, then quit.
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
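A quick usage note (the file name reptile.py and the URL below are only placeholders, not from the original post): run the script with the page to crawl as its single argument. It downloads every page linked from that page into the current directory, fetches the start page itself as index.html, and then writes a <current-directory-name>.devhelp2 index:

    python2 reptile.py http://example.com/blog/
    python2 reptile.py      (no argument: print the usage text, rebuild the index only, and exit)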

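As written, the script requires Python 2: sgmllib and urllib2 were removed in Python 3. For readers on Python 3, here is a minimal, hedged sketch of the same link-extraction idea using html.parser and urllib.request from the standard library (the URL is a placeholder; this is not the original author's code):

from html.parser import HTMLParser
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """Collect the href of every <a> tag, analogous to MyParser.start_a above."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for key, val in attrs:
                if key == "href":
                    self.links.append(val)


html = urlopen("http://example.com/").read().decode("utf-8", "replace")
parser = LinkParser()
parser.feed(html)
for link in parser.links:
    print(link)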