利用Python程序抓取JS特效模板,Python抓取JS特效,利用Python程序抓取


利用Python程序抓取JS特效模板的脚本,主要从懒人图库下载JavaScript特效模板。脚本中使用了gevent和BeautifulSoup(bs4)这两个第三方库,使用前需要先安装。

1.[图片] H)KNCNI~S(V`[VWZ(P$9ZMK.png

2.[代码][Python]代码 跳至 [2] [全屏预览]

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download JavaScript effect templates from lanrentuku.com.

Description: crawler that walks the site's JS listing pages, collects the
             detail-page links and downloads the matching zip archives
             concurrently with gevent greenlets.
Author: admin
Create-Date: 2015-05-25
Version: 1.0

Requires the third-party packages ``gevent`` and ``beautifulsoup4``.
Runs on both Python 2 and Python 3.
"""
import os
import re
import sys
import urllib
from contextlib import closing

try:  # Python 3 keeps urlopen/urlretrieve in urllib.request
    import urllib.request as _urlreq
except ImportError:  # Python 2 exposes them on the urllib module itself
    _urlreq = urllib

import gevent
from bs4 import BeautifulSoup
from gevent import monkey

# Make socket operations cooperative so the spawned downloads can overlap.
monkey.patch_socket()

HTTP_URL = 'http://www.lanrentuku.com%s'
# HTTP_URL without its trailing '%s', plus the archive path.  The '%s' slot
# receives the text matched by `reg` (digits followed by a dot), which gives
# ".../js/d<digits>.zip".
DOWNLOAD_URL = HTTP_URL[:-2] + '/js/d%szip'
reg = r'\d{1,}\.+'

# Characters that must not end up in a file name built from link text.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def encode(text):
    """Return *text* encoded as UTF-8."""
    return text.encode("utf8")


def getSubDirectory():
    """Return the current working directory (parent of the download folder)."""
    return os.getcwd()


def createDirectory(curPath):
    """Return the path of *curPath* inside the download folder.

    The download folder lives in the current working directory and is
    created on first use.
    """
    myPath = os.path.join(getSubDirectory(), u'JS代码模板')
    if not os.path.exists(myPath):
        os.mkdir(myPath)
    return os.path.join(myPath, curPath)


def schedule(a, b, c):
    """Report hook for ``urlretrieve``: print the progress percentage.

    a -- number of blocks transferred so far
    b -- block size
    c -- total size; ``urlretrieve`` passes -1 when it is unknown
    """
    if c <= 0:
        # Unknown or zero total size: a percentage cannot be computed
        # (dividing would raise ZeroDivisionError or print a negative value).
        return
    per = min(100.0 * a * b / c, 100)
    sys.stdout.write('%.1f%%\r' % per)
    sys.stdout.flush()


def geturllist(url):
    """Return ``{detail_page_url: link_text}`` for one listing page.

    Only anchors inside the first ``<div class="list-pngjs">`` are used.
    An empty dict is returned when the page has no such container.
    Network errors raised by ``urlopen`` propagate to the caller.
    """
    # closing() guarantees the response is released even if read() fails and
    # works with the Python 2 response object, which is not a context manager.
    with closing(_urlreq.urlopen(url)) as html:
        content = html.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    decodeHtml = BeautifulSoup(content, 'html.parser')
    containers = decodeHtml.find_all('div', {'class': 'list-pngjs'})
    if not containers:
        print('no list-pngjs container found in %s' % url)
        return {}

    # Collect link address and title.
    url_list = {}
    for a_tag in containers[0].find_all('a'):
        href = a_tag.get('href')
        if not href:
            # Anchor without a target: nothing to download.
            continue
        url_list[HTTP_URL % href] = a_tag.get_text()
    return url_list


def _safe_filename(title, fallback):
    """Return *title* stripped of path-hostile characters, or *fallback*."""
    cleaned = _UNSAFE_CHARS.sub('_', title).strip(' .')
    return cleaned or fallback


def download(down_url):
    """Download the archive that belongs to one listing entry.

    down_url -- sequence whose item 0 is the detail-page URL and item 1 the
                link text used as the local file name.

    Failures are printed rather than raised so that one bad entry does not
    stop the remaining downloads.
    """
    try:
        m = re.search(reg, down_url[0])
        if m is None:
            print('no template id found in %s' % down_url[0])
            return
        name = DOWNLOAD_URL % m.group(0)
        base = _safe_filename(down_url[1], m.group(0).strip('.'))
        # name[-4:] is the archive extension (".zip").
        _urlreq.urlretrieve(name, createDirectory(base + name[-4:]), schedule)
    except Exception as e:
        # Best-effort boundary of a greenlet: report and carry on.
        print(e)


def getpageurl(xUrl, first=1, last=48):
    """Return the listing-page URLs for pages *first*..*last* (inclusive).

    xUrl -- URL template containing one ``%s`` slot for the page number.
    """
    return [xUrl % page for page in range(first, last + 1)]


def main():
    """Crawl every listing page and download all linked archives."""
    jobs = []
    for page in getpageurl('http://www.lanrentuku.com/js/p%s.html'):
        try:
            links = geturllist(page)
        except IOError as e:
            # A listing page that cannot be fetched should not abort the run.
            print(e)
            continue
        for k in links.items():
            jobs.append(gevent.spawn(download, k))
    gevent.joinall(jobs)


if __name__ == '__main__':
    main()

编橙之家文章,

评论关闭