用 Python 爬博客


用 Python 爬博客

by 伍雪颖
以爬取王垠的博客为例(以下代码基于 Python 2,使用 urllib2):
import re
import urllib2

def getHtmlCode(url):
    """Download the resource at ``url`` and return the response body.

    Args:
        url: Address handed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if read() raises; the original
        # one-liner never closed the response.
        response.close()

def findTitleUrl(htmlString):
    """Collect the values of all ``href="..."`` attributes in ``htmlString``.

    Args:
        htmlString: Text to scan.

    Returns:
        A list with the text captured between ``href="`` and the nearest
        following ``"`` for each match, in order of appearance. Empty
        attribute values are not matched because the group requires at
        least one character.
    """
    hrefPattern = "href=\"(.+?)\""
    return re.findall(hrefPattern, htmlString)

def findTitleContent(htmlString):
    """Collect the text that directly follows each ``">`` in ``htmlString``.

    The previous pattern ended in a lazy ``(.+?)`` with nothing after it,
    so every match captured exactly one character. The group now runs up
    to the next ``<`` or the end of the line, yielding the whole text run.

    Args:
        htmlString: Text to scan.

    Returns:
        A list of captured text runs in order of appearance. A ``">`` that
        is immediately followed by ``<`` or a newline produces no entry.
    """
    # [^<\n]+ stops at the next tag and, like the original ".", never
    # crosses a line break.
    regTitleContent = re.compile("\">([^<\n]+)")
    return regTitleContent.findall(htmlString)

# Fetch the index page, then download every linked page into a local
# "<title>.html" file.
htmlCode = getHtmlCode('http://www.yinwang.org/')
titleContent = findTitleContent(htmlCode)
titleUrl = findTitleUrl(htmlCode)

# Title match k+3 is paired with href match k+8, exactly as the original
# indexing did (presumably the leading matches are non-article markup --
# verify against the live page). zip() stops at the shorter list, so the
# loop no longer runs past the end and dies with IndexError.
for title, url in zip(titleContent[3:], titleUrl[8:]):
    # Single-argument print(...) prints the same text on Python 2.
    print(title)
    print(url)
    htmlPage = getHtmlCode(url)
    # "with" guarantees the file is closed; the original "f.close" lacked
    # parentheses and therefore never called close().
    with open("%s.html" % (title,), 'wb') as f:
        f.write(htmlPage)


评论关闭