# 利用 Python 下载百度空间文章 (Download Baidu Space articles with Python)


#! /usr/bin/env python
# coding=utf-8
"""Download every article of a Baidu Space (hi.baidu.com) blog.

Usage::

    script.py http://hi.baidu.com/<username>

The script walks the blog's index pages (``<base>/blog/index/<n>``),
collects every ``/<username>/blog/item/*.html`` link found on them and
saves each article into ``<current dir>/<username>/``.

The code runs on both Python 2 (which it was originally written for) and
Python 3.  Every ``print`` call therefore takes exactly one argument, so
that it produces the same output whether ``print`` is a statement or a
function.
"""
import contextlib
import os
import re
import shutil
import sys

try:  # Python 3
    from http.client import HTTPException
    from urllib.request import urlopen
except ImportError:  # Python 2
    from httplib import HTTPException
    from urllib2 import urlopen

# Scheme and host every article link is resolved against.
SITE_ROOT = "http://hi.baidu.com"
# Encoding used to decode the fetched pages.
PAGE_ENCODING = "gbk"
# Link text of the "last page" pager entry (two Chinese characters),
# written with escapes so the source file stays pure ASCII.
TAIL_MARKER = u"\u5c3e\u9875"
# Number of the last index page assumed when the pager cannot be parsed.
DEFAULT_TOTAL_PAGE = 10
# Errors treated as "this URL could not be fetched".  URLError/HTTPError
# and socket errors derive from IOError; ValueError covers malformed URLs.
_FETCH_ERRORS = (IOError, ValueError, HTTPException)

# Module-level state filled in by main() before crawling starts.
pattern = ""   # compiled regex matching article links of the current user
reg_tail = ""  # compiled regex for the pager line (kept for compatibility)
username = ""  # blog owner; also the name of the output directory


def downURL(url, filename):
    """Download ``url`` and save it as ``<cwd>/<username>/<filename>``.

    Arguments:
    - `url`: address of the document to fetch.
    - `filename`: name of the local file, relative to the user directory.

    Returns 1 on success and 0 when the URL could not be opened.  Errors
    while reading the response or writing the local file are not caught.
    """
    print("Download %s, save as %s" % (url, filename))
    try:
        response = urlopen(url)
    except _FETCH_ERRORS:
        print("download exception")
        return 0
    target = os.path.join(os.getcwd(), username, filename)
    # closing()/with guarantee both handles are released even on error.
    with contextlib.closing(response):
        with open(target, "wb") as out:
            shutil.copyfileobj(response, out)
    return 1


def getURL(url, link_pattern=None):
    """Return every article link found in the page at ``url``.

    Arguments:
    - `url`: address of an index page.
    - `link_pattern`: compiled regex to search with; defaults to the
      module-level ``pattern``.

    Returns the list of matches in page order, or an empty list when the
    page could not be fetched.
    """
    print("Parsing %s" % url)
    if link_pattern is None:
        link_pattern = pattern
    try:
        with contextlib.closing(urlopen(url)) as response:
            raw = response.read()
    except _FETCH_ERRORS:
        print("exception")
        return []
    # Decode once at the I/O boundary so the text regex works on Python 3;
    # undecodable bytes are replaced rather than aborting the crawl.
    text = raw.decode(PAGE_ENCODING, "replace")
    return link_pattern.findall(text)


def CreateDirectory():
    """Create ``<cwd>/<username>`` unless it already exists."""
    target = os.path.join(os.getcwd(), username)
    if not os.path.exists(target):
        os.mkdir(target)
        print('step 2:Create Directory  Success!')
    else:
        print('step 2:Directory has existed!')


def _count_index_pages(index_url):
    """Return the last index page number advertised on ``index_url``.

    The first line containing the "last page" marker is used: the number
    is the final ``/``-separated component of the text that ends three
    characters before the marker.  ``DEFAULT_TOTAL_PAGE`` is returned when
    the page cannot be fetched, has no marker, or the value is not a number.
    """
    try:
        with contextlib.closing(urlopen(index_url)) as response:
            raw_lines = response.readlines()
    except _FETCH_ERRORS:
        print("%s: Not such url" % index_url)
        print(sys.exc_info())
        return DEFAULT_TOTAL_PAGE
    for raw in raw_lines:
        line = raw.decode(PAGE_ENCODING, "replace")
        pos = line.rfind(TAIL_MARKER)
        if pos < 0:
            continue
        # max() keeps the slice end non-negative for very short lines.
        candidate = line[:max(pos - 3, 0)].split("/")[-1]
        try:
            return int(candidate)
        except ValueError:
            return DEFAULT_TOTAL_PAGE
    return DEFAULT_TOTAL_PAGE


def reptile(base_url):
    """Download all articles from base_url.

    Arguments:
    - `base_url`: Url of website.

    Index pages 0 through the detected last page are scanned with
    getURL(); each distinct link is then fetched with downURL().
    """
    base_page = base_url.rstrip("/") + "/blog/index/"
    print('step 3:Number of index')
    total_page = _count_index_pages(base_page + "0")

    found = set()  # the same article can be linked from several pages
    for idx in range(total_page + 1):
        found.update(getURL("%s%d" % (base_page, idx)))

    print('step 4:Down pages!')
    item_list = sorted(found)
    for item in item_list:
        # Links matched by ``pattern`` are site-relative ("/<user>/...").
        down_url = SITE_ROOT + item
        downURL(down_url, down_url.split("/")[-1])
    print("step 5:Total: %d articles." % len(item_list))
    print("Congratulations")


def main(argv=None):
    """Validate the command line, set up module state and run the crawl.

    Arguments:
    - `argv`: argument list including the program name; defaults to
      ``sys.argv``.

    Returns the process exit status: 0 after a crawl, 1 on bad arguments.
    """
    global pattern, reg_tail, username
    argv = sys.argv if argv is None else argv
    prog = argv[0] if argv else sys.argv[0]
    prefix = SITE_ROOT + "/"
    if len(argv) != 2:
        print("Usage: %s url of baidu space" % prog)
        print("Such as: %s http://hi.baidu.com/Username" % prog)
        return 1
    base_url = argv[1]
    if not base_url.startswith(prefix):
        print("Wrong Type of URL?? It works on Baidu Space only.")
        return 1
    if not base_url[len(prefix):].strip("/"):
        # Bare site root: there is no user name to crawl.
        print("Such as: %s http://hi.baidu.com/Username" % prog)
        return 1
    username = base_url.rstrip("/").split("/")[-1]
    print('step 1:' + username)
    CreateDirectory()
    # Escape the user name so characters such as "." match literally.
    escaped = re.escape(username)
    reg_tail = re.compile(u"%s.*?%s" % (escaped, TAIL_MARKER))
    pattern = re.compile(r"/%s/blog/item/.*?\.html" % escaped)
    reptile(base_url)
    return 0


if __name__ == '__main__':
    sys.exit(main())
# This snippet originally came from http://byrx.net

# 评论关闭 (comments closed)