#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8   
#Filename:get_baike.py
 
import urllib2,re
import sys
 
 
 
def getHtml(url,time=10):
    """Fetch ``url`` and return the raw, undecoded response body.

    Args:
        url: Address handed to ``urllib2.urlopen``.
        time: Timeout forwarded to ``urlopen`` as ``timeout`` (default 10).

    Returns:
        Whatever ``response.read()`` yields for the opened URL.

    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        return response.read()
    finally:
        # Close even when read() raises so the connection is not leaked.
        response.close()
 
def clearBlank(html):
    """Remove line breaks and tabs from ``html`` and squeeze its blanks.

    Carriage returns, newlines and tabs are deleted outright, every
    ``&nbsp;`` entity becomes a plain space, and runs of spaces are reduced
    to a single space. An empty input yields an empty string.
    """
    if len(html) == 0:
        return ''

    text = re.sub('\r|\n|\t','',html)

    # Replacing the entity can only introduce spaces, never a new entity,
    # so a single pass is enough before collapsing the space runs.
    text = text.replace('&nbsp;',' ')
    while '  ' in text:
        text = text.replace('  ',' ')
    return text
 
 
if __name__ == '__main__':
    # Download the raw page, then transcode it from gb2312 to utf-8 so the
    # patterns below run against utf-8 text; undecodable bytes are replaced.
    html = getHtml('http://baike.baidu.com/view/994462.htm',10)
    html = html.decode('gb2312','replace').encode('utf-8')

    # Non-greedy captures for the entry title and the summary card.
    title_reg = r'<h1 class="title" id="[\d]+">(.*?)</h1>'
    content_reg = r'<div class="card-summary-content">(.*?)</p>'

    title = re.compile(title_reg).findall(html)
    content = re.compile(content_reg).findall(html)

    # findall returns an empty list when the markup does not match; stop
    # with a clear message instead of an IndexError on title[0]/content[0].
    if not title or not content:
        sys.exit('get_baike: title or summary not found in the page')

    # Strip any tags nested inside the captured fragments.
    title[0] = re.sub(r'<[^>]*?>', '', title[0])
    content[0] = re.sub(r'<[^>]*?>', '', content[0])

    # Single-argument print(...) emits the same text as the print statement.
    print(title[0])
    print('#######################')
    print(content[0])
#//python/5589

回复 "python采集百度百科代码演示"

这儿你可以回复上面这条便签

作者：你的名字是？

标题：给你的便签一个标题。

语言你的便签是以

你的便签在这儿输入便签内容

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8   
#Filename:get_baike.py

import urllib2,re
import sys

def getHtml(url,time=10):
    """Fetch ``url`` and return the raw, undecoded response body.

    Args:
        url: Address handed to ``urllib2.urlopen``.
        time: Timeout forwarded to ``urlopen`` as ``timeout`` (default 10).

    Returns:
        Whatever ``response.read()`` yields for the opened URL.

    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        return response.read()
    finally:
        # Close even when read() raises so the connection is not leaked.
        response.close()

def clearBlank(html):
    """Remove line breaks and tabs from ``html`` and squeeze its blanks.

    Carriage returns, newlines and tabs are deleted, every ``&nbsp;``
    entity becomes a plain space, and runs of spaces are reduced to a
    single space. An empty input yields an empty string.
    """
    if not html:
        return ''

    html = re.sub('\r|\n|\t','',html)

    # The entity has to be spelled out literally. Searching for a plain
    # space here (and replacing it with itself) never makes progress, so
    # the loop would spin forever on any text that contains a space.
    html = html.replace('&nbsp;',' ')
    while '  ' in html:
        html = html.replace('  ',' ')
    return html

if __name__ == '__main__':
    # Download the raw page, then transcode it from gb2312 to utf-8 so the
    # patterns below run against utf-8 text; undecodable bytes are replaced.
    html = getHtml('http://baike.baidu.com/view/994462.htm',10)
    html = html.decode('gb2312','replace').encode('utf-8')

    # Non-greedy captures for the entry title and the summary card.
    title_reg = r'<h1 class="title" id="[\d]+">(.*?)</h1>'
    content_reg = r'<div class="card-summary-content">(.*?)</p>'

    title = re.compile(title_reg).findall(html)
    content = re.compile(content_reg).findall(html)

    # findall returns an empty list when the markup does not match; stop
    # with a clear message instead of an IndexError on title[0]/content[0].
    if not title or not content:
        sys.exit('get_baike: title or summary not found in the page')

    # Strip any tags nested inside the captured fragments.
    title[0] = re.sub(r'<[^>]*?>', '', title[0])
    content[0] = re.sub(r'<[^>]*?>', '', content[0])

    # Single-argument print(...) emits the same text as the print statement.
    print(title[0])
    print('#######################')
    print(content[0])
#//python/5589

创建短链接：创建一个较短的URL，连接到这个便签

私人：私人便签不会显示在最近列表中

保存期限：我们应该什么时候删除这张便签？

防滥用：键入这些字符

Code666 (代码贴、代码片段)

[Python] python采集百度百科代码演示 →→→→→进入此内容的聊天室

回复 "python采集百度百科代码演示"