[Python] python在google上搜索 →→→→→进入此内容的聊天室

来自 , 2019-04-30, 写在 Python, 查看 117 次.
URL http://www.code666.cn/view/90fd4f88
  1. import re,urllib,urllib2
  2.  
  3. class GoogleHarvester:
  4.     re_links = re.compile(r'<a class=l href="(.+?)"',re.IGNORECASE|re.DOTALL)
  5.     def __init__(self):
  6.         pass
  7.     def harvest(self,terms):
  8.         '''Searchs Google for these terms. Returns only the links (URL).
  9.  
  10.           Input: terms (string) -- one or several words to search.
  11.  
  12.           Output: A list of urls (strings).
  13.                   Duplicates links are removed, links are sorted.
  14.          
  15.           Example: print GoogleHarvester().harvest('monthy pythons')
  16.        '''
  17.         print "Google: Searching for '%s'" % terms
  18.         links = {}
  19.         currentPage = 0
  20.         while True:
  21.             print "Google: Querying page %d (%d links found so far)" % (currentPage/100+1, len(links))
  22.             address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(terms),currentPage)
  23.             request = urllib2.Request(address, None, {'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'} )
  24.             urlfile = urllib2.urlopen(request)
  25.             page = urlfile.read(200000)
  26.             urlfile.close()
  27.             for url in GoogleHarvester.re_links.findall(page):
  28.                 links[url] = 0
  29.             if "</div>Next</a></table></div><center>" in page: # Is there a "Next" link for next page of results ?
  30.                 currentPage += 100  # Yes, go to next page of results.
  31.             else:
  32.                 break   # No, break out of the while True loop.
  33.         print "Google: Found %d links." % len(links)
  34.         return sorted(links.keys())  
  35.  
  36. # Example: Search for "monthy pythons"
  37. links = GoogleHarvester().harvest('monthy pythons')
  38. open("links.txt","w+b").write("\n".join(links))
  39. #//python/1865

回复 "python在google上搜索"

这儿你可以回复上面这条便签

captcha