e-hentai 抓取图片第二版 —— [Python] 代码


[Python]代码

import urllib,urllib2,cookielibimport os,re,time,sysdef GetHost(url):    return url.split('/')[1]def GetAttr(url):    return url.split('.')[-1].split('&')[0]def getehentai(url,way):    #事先弄好规则    rule0=re.compile('Showing\s1\s-\s20\sof\s(.*)\simages')#获取分目录页面数量的规则    rule1=re.compile('<a\shref="([^<>"]*)"><img[^<>]*><br[^<>]*>[0-9]+</a>')#从目录页面抽取具体的漫画页面地址的规则    rule2=re.compile('</iframe><a\shref="[^<>]*"><img\ssrc="([^<>]*)"\sstyle="[^<>]*"\s/></a><iframe')#从漫画页面获取图片地址的规则    rule3=re.compile('<a\shref="([^<>"]*)">Download original')#如果存在原始大图,获取其地址的规则    #设置代理和cookie    proxy_support=urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})    cookie_support=urllib2.HTTPCookieProcessor(cookielib.CookieJar())    opener=urllib2.build_opener(proxy_support,cookie_support,urllib2.HTTPHandler)    #默认的header模拟浏览器    header={'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727)',    'Accept-Language':'zh-CN,zh;q=0.8',    'Accept-Charset':'utf-8;q=0.7,*;q=0.7',    'connection':'keep-alive'}    try:        #开启主页面        header['Accept']='*/*'        header['Host']='g.e-hentai.org'        req=urllib2.Request('http://g.e-hentai.org',headers=header)        opener.open(req)        #打开登录页面        header['Referer']='http://g.e-hentai.org'        header['Accept']='image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*'        req=urllib2.Request('http://g.e-hentai.org/home.php',headers=header)        opener.open(req)        #登录网站        header['Referer']='http://e-hentai.org/bounce_login.php'        header['Host']='e-hentai.org'        namekey=urllib.urlencode({'ipb_login_password':'xxxx','ipb_login_submit':'Login!','ipb_login_username':'xxxx'})        req=urllib2.Request('http://e-hentai.org/bounce_login.php?b=d&bt=1-1',data=namekey,headers=header)        opener.open(req)        #返回主页面        header['Host']='g.e-hentai.org'        
req=urllib2.Request('http://g.e-hentai.org',headers=header)        opener.open(req)        #打开漫画目录页        header['Referer']='http://g.e-hentai.org'        req=urllib2.Request(url,headers=header)        mdi=rule0.findall(opener.open(req).read())        k=mdi and int(mdi[0]) or 0        k= k%20 and k/20+1 or k/20        if not os.path.isdir(way):os.mkdir(way)    except urllib2.HTTPError,e:        print 'First Error : %s'%(e)        sys.exit()    #循环抓取所有漫画页    i=0    pick=[]    suburl=url    header['Accept']='image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*'    header['Host']=GetHost(url)    for j in xrange(k):        header['Referer']=suburl        suburl='%s?p=%d'%(url,j)        req=urllib2.Request(suburl,headers=header)        try:            page=opener.open(req)            if page.geturl()!=suburl:break            data=page.read()            if data:                medi=rule1.findall(data)                for each in medi:                    i+=1                    print 'Get %s'%(each)                    pick.append((each,way+'\\%03d.jpg'%(i)))        except urllib2.HTTPError,e:            print 'Second Error : %s'%(e)            sys.exit()    #从列表循环抓取每一页漫画    k=0    fail=[]    for i,j in pick:        if os.path.isfile(j):            file=open(j,'rb')            file.seek(0,2)            if file.tell()>2048:                print "%s --> existed"%(j)                continue        try:            k+=1            header['Referer']=url+'?p=%d'%(k/20)            header['Host']=GetHost(i)            req=urllib2.Request(i,headers=header)            data=opener.open(req).read()            if data:                see=rule2.findall(data)                ori=rule3.findall(data)                if ori:                    header['Referer']=i                    header['Host']=GetHost(ori[0])                    header['Accept']='*/*'                    req=urllib2.Request(ori[0],headers=header)     
               page=opener.open(req)                    #attr=GetAttr(page.geturl())                    data=page.read()                elif see:                    header['Referer']=k                    header['Host']=GetHost(see[0])                    header['Accept']='image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*'                    req=urllib2.Request(see[0],headers=header)                    page=opener.open(req)                    #attr=GetAttr(page.geturl())                    data=page.read()                else:                    data=''                    attr=''                if len(data)>2048:                    file=open(j,'wb')                    file.write(data)                    file.close()                    print '%s --> %s'%(i,j)                else:                    print '%s --> failed'%i                    fail.append((i,j))        except urllib2.HTTPError,e:            print 'Third Error : %s'%(e)            sys.exit()    #尝试失败的抓取,最大重复一百次    k=0    while fail:        i,j=fail[-1]        try:            k+=1            if k==100:break            header['Referer']=url            header['Host']=GetHost(i)            req=urllib2.Request(i,headers=header)            data=opener.open(req).read()            if data:                see=rule2.findall(data)                ori=rule3.findall(data)                if ori:                    header['Referer']=i                    header['Host']=GetHost(ori[0])                    header['Accept']='*/*'                    req=urllib2.Request(ori[0],headers=header)                    page=opener.open(req)                    #attr=GetAttr(page.geturl())                    data=page.read()                elif see:                    header['Referer']=k                    header['Host']=GetHost(see[0])                    header['Accept']='image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/msword, 
application/vnd.ms-excel, application/vnd.ms-powerpoint, */*'                    req=urllib2.Request(see[0],headers=header)                    page=opener.open(req)                    #attr=GetAttr(page.geturl())                    data=page.read()                else:                    data=''                    attr=''                if len(data)>2048:                    file=open(j,'wb')                    file.write(data)                    file.close()                    print '%s --> %s'%(i,j)                    fail.pop()                else:                    print '%s --> failed'%i        except urllib2.HTTPError,e:            print 'Fourth Error : %s'%(e)            sys.exit()if __name__=='__main__':    #url=raw_input('Input Url Address:')    #way=raw_input('Input Save Path:')    #getehentai(url,way)    getehentai('http://g.e-hentai.org/g/427281/97cebfc7e9/',u'D:\\fsr3')

评论关闭