# e-hentai抓取图片第二版 — [Python]代码
import os
import re
import sys
import time
import urllib

try:  # Python 2 module names, as used by the original script
    import cookielib
    import urllib2
    from urllib import urlencode
    from urlparse import urlparse
except ImportError:  # same APIs under their Python 3 names
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode, urlparse

# Accept header the script sends for gallery/page requests (mimics old MSIE).
_BROWSER_ACCEPT = ('image/gif, image/jpeg, image/pjpeg, image/pjpeg, '
                   'application/msword, application/vnd.ms-excel, '
                   'application/vnd.ms-powerpoint, */*')
# Responses of this size or smaller are treated as failed downloads.
_MIN_IMAGE_BYTES = 2048
# Number of thumbnails the gallery index shows per page.
_PAGE_SIZE = 20
# Upper bound on the total number of retry attempts for failed pages.
_MAX_RETRIES = 100


def GetHost(url):
    """Return the host part of an absolute URL ('' if there is none).

    The previous implementation used ``url.split('/')[1]``, which is always
    the empty string for ``http://host/path`` style URLs.
    """
    return urlparse(url).netloc


def GetAttr(url):
    """Return the text after the last '.' in *url*, cut at the first '&'."""
    return url.split('.')[-1].split('&')[0]


def _as_text(data):
    """Return *data* as ``str`` so the str regexes below can search it.

    On Python 2 ``read()`` already returns ``str`` and the value is passed
    through untouched; on Python 3 the bytes are decoded leniently.
    """
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _fetch(opener, url, header, data=None):
    """Request *url* and return ``(final_url, body)``, closing the response."""
    response = opener.open(urllib2.Request(url, data=data, headers=header))
    try:
        return response.geturl(), response.read()
    finally:
        response.close()


def _fetch_image(opener, base_header, page_url, referer,
                 preview_rule, original_rule):
    """Download the image shown on one comic page.

    Opens *page_url* (sending *referer*), prefers the "Download original"
    link when the page has one and otherwise uses the inline preview image.
    Returns the raw image bytes, or an empty byte string when the page is
    empty or contains neither link.  ``urllib2.HTTPError`` propagates.

    Works on a copy of *base_header*, so per-image header tweaks do not
    leak into later requests.
    """
    header = dict(base_header)
    header['Referer'] = referer
    header['Host'] = GetHost(page_url)
    _, body = _fetch(opener, page_url, header)
    html = _as_text(body)
    if not html:
        return b''

    originals = original_rule.findall(html)
    previews = preview_rule.findall(html)
    if originals:
        image_url, accept = originals[0], '*/*'
    elif previews:
        image_url, accept = previews[0], _BROWSER_ACCEPT
    else:
        return b''

    # The image is requested with the comic page as Referer (the old code
    # sent an integer counter here in the preview branch).
    header['Referer'] = page_url
    header['Host'] = GetHost(image_url)
    header['Accept'] = accept
    _, image = _fetch(opener, image_url, header)
    return image


def _save(path, data):
    """Write *data* to *path* in binary mode."""
    with open(path, 'wb') as out:
        out.write(data)


def getehentai(url, way, username='xxxx', password='xxxx',
               proxy='http://127.0.0.1:8087'):
    """Download every image of the gallery at *url* into directory *way*.

    Steps: log in (cookies are kept in an in-memory jar), read the image
    count from the gallery index, walk the index pages to collect the comic
    page URLs, download each image to ``way/NNN.jpg``, then retry the ones
    that failed.

    Parameters:
        url:      gallery index URL.
        way:      target directory; created if it does not exist.
        username: login name posted to the login form.
        password: login password posted to the login form.
        proxy:    HTTP proxy URL; pass a false value to use urllib's
                  default proxy handling instead.

    Any ``urllib2.HTTPError`` is printed and terminates the process via
    ``sys.exit()``, as in the original script.
    """
    # Rules prepared up front.
    # Total image count shown on the gallery index.
    rule0 = re.compile(r'Showing\s1\s-\s20\sof\s(.*)\simages')
    # Comic page links on an index page.
    rule1 = re.compile(r'<a\shref="([^<>"]*)"><img[^<>]*><br[^<>]*>[0-9]+</a>')
    # Inline image on a comic page.
    rule2 = re.compile(r'</iframe><a\shref="[^<>]*"><img\ssrc="([^<>]*)"\sstyle="[^<>]*"\s/></a><iframe')
    # "Download original" link, when a full-size image exists.
    rule3 = re.compile(r'<a\shref="([^<>"]*)">Download original')

    # Proxy and cookie support.
    handlers = []
    if proxy:
        handlers.append(urllib2.ProxyHandler({'http': proxy}))
    handlers.append(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    handlers.append(urllib2.HTTPHandler)
    opener = urllib2.build_opener(*handlers)

    # Default headers imitating a browser.
    header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727)',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              'Accept-Charset': 'utf-8;q=0.7,*;q=0.7',
              'connection': 'keep-alive'}

    try:
        # Open the main page.
        header['Accept'] = '*/*'
        header['Host'] = 'g.e-hentai.org'
        _fetch(opener, 'http://g.e-hentai.org', header)

        # Open the login page.
        header['Referer'] = 'http://g.e-hentai.org'
        header['Accept'] = _BROWSER_ACCEPT
        _fetch(opener, 'http://g.e-hentai.org/home.php', header)

        # Log in.
        header['Referer'] = 'http://e-hentai.org/bounce_login.php'
        header['Host'] = 'e-hentai.org'
        namekey = urlencode({'ipb_login_password': password,
                             'ipb_login_submit': 'Login!',
                             'ipb_login_username': username})
        if not isinstance(namekey, bytes):
            # Python 3 requires the POST body as bytes; no-op on Python 2.
            namekey = namekey.encode('ascii')
        _fetch(opener, 'http://e-hentai.org/bounce_login.php?b=d&bt=1-1',
               header, data=namekey)

        # Back to the main page.
        header['Host'] = 'g.e-hentai.org'
        _fetch(opener, 'http://g.e-hentai.org', header)

        # Open the gallery index and work out how many index pages exist.
        header['Referer'] = 'http://g.e-hentai.org'
        _, body = _fetch(opener, url, header)
        counts = rule0.findall(_as_text(body))
        total = int(counts[0]) if counts else 0
        page_count = (total + _PAGE_SIZE - 1) // _PAGE_SIZE  # ceiling division

        if not os.path.isdir(way):
            os.makedirs(way)
    except urllib2.HTTPError as e:
        print('First Error : %s' % (e))
        sys.exit()

    # Walk every index page and collect (comic page URL, target file) pairs.
    pick = []
    suburl = url
    header['Accept'] = _BROWSER_ACCEPT
    header['Host'] = GetHost(url)
    for page_no in range(page_count):
        header['Referer'] = suburl
        suburl = '%s?p=%d' % (url, page_no)
        try:
            final_url, body = _fetch(opener, suburl, header)
        except urllib2.HTTPError as e:
            print('Second Error : %s' % (e))
            sys.exit()
        if final_url != suburl:
            # Redirected away from the requested index page: stop listing.
            break
        for each in rule1.findall(_as_text(body)):
            print('Get %s' % (each))
            pick.append((each, os.path.join(way, '%03d.jpg' % (len(pick) + 1))))

    # Download every collected comic page.
    fail = []
    for index, (page_url, target) in enumerate(pick):
        if os.path.isfile(target) and os.path.getsize(target) > _MIN_IMAGE_BYTES:
            print("%s --> existed" % (target))
            continue
        # Referer is the index page this thumbnail was listed on.
        referer = '%s?p=%d' % (url, index // _PAGE_SIZE)
        try:
            data = _fetch_image(opener, header, page_url, referer, rule2, rule3)
        except urllib2.HTTPError as e:
            print('Third Error : %s' % (e))
            sys.exit()
        if len(data) > _MIN_IMAGE_BYTES:
            _save(target, data)
            print('%s --> %s' % (page_url, target))
        else:
            print('%s --> failed' % page_url)
            fail.append((page_url, target))

    # Retry the failed downloads, at most _MAX_RETRIES attempts in total.
    attempts = 0
    while fail and attempts < _MAX_RETRIES:
        attempts += 1
        page_url, target = fail[-1]
        try:
            data = _fetch_image(opener, header, page_url, url, rule2, rule3)
        except urllib2.HTTPError as e:
            print('Fourth Error : %s' % (e))
            sys.exit()
        if len(data) > _MIN_IMAGE_BYTES:
            _save(target, data)
            print('%s --> %s' % (page_url, target))
            fail.pop()
        else:
            print('%s --> failed' % page_url)


if __name__ == '__main__':
    # Example invocation; replace the URL and save path as needed.
    getehentai('http://g.e-hentai.org/g/427281/97cebfc7e9/', u'D:\\fsr3')
相关内容
- 输出乘法表到excel,输出乘法表excel,[Python]代码im
- python 基础学习第二弹:类属性和实例属性,python实例
- 淘宝面试题猜数字游戏,淘宝试题猜数字,guess.py#!/u
- darkBing SQL扫描器0.1,darkbing扫描器0.1,[Python]代码#!
- 深度优先遍历嵌套容器,深度历嵌套容器,[Python]代码
- mysql基本操作类,mysql基本操作,db.pyimport
- Python脚本写的命令行下的词典,python脚本,#!/usr/bin/e
- Python下载百度空间文章,,[Python]代码#
- python mysql insert delete query,pythonmysql,mysql python
- (34〇5 〇 6 〇 8 〇 9 〇 1) 〇 2=2008,,[Python]代码fr
评论关闭