“愤怒”离开百度空间(增加图片抓取功能),百度抓取,文章批量修改


文章批量修改

#coding:gbkimport urllib2,urllib,re,osimport cookielib,time'''    百度爬虫类    @author:FC_LAMP'''class SpiderBaiDu:    #变量    sqlit = None    cur   = None    baseurl = 'http://hi.baidu.com/new/'    total  = 0    header = {'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'}    #处理单引号    def qutoSin(self,string):        return string.replace("'","")    #登陆百度空间    '''      user 为用户名      pwd  为password    '''    def LoginBaiDu(self,user,pwd):        #设置        cookie = cookielib.CookieJar()        cookieProc = urllib2.HTTPCookieProcessor(cookie)        opener = urllib2.build_opener(cookieProc)        urllib2.install_opener(opener)        #请求        post = {            'username':user,            'password':pwd,            'tpl':'mn',            'u':'http://www.baidu.com/',            'psp_tt':0,            'mem_pass':'on'            }        post = urllib.urlencode(post)        req = urllib2.Request(            url='https://passport.baidu.com/?login',            data=post,            headers = self.header            )        res = urllib2.urlopen(req).read(500)        if 'passCookie' in res:            flag = True        else:            flag = 'Login Fail:%s'%user        return flag    #分析页面    def parse_html(self,domain,msg,debug=False):        url = self.baseurl+domain        #获取第一篇文章        c = urllib2.urlopen(url).read()        qbid = re.compile('.*?data-blogid="(.*?)".*?',re.S|re.I|re.U)        qbid = qbid.search(c)        if qbid==None:            raise Exception('此空间已被关!')        qbid = qbid.group(1)        url = 'http://hi.baidu.com/%s/item/%s'%(domain,qbid)        #开始解析        postedit_url = 'http://hi.baidu.com/pub/submit/modifytext';        i = 0 #禁言次数        while True:            c = urllib2.urlopen(url).read()            if debug:                #获取标题:                title = re.compile(                    '<title>(.*?)<\/title>',                    re.S|re.I|re.U                    )                title = title.search(c)     
           #解析成uncode对象                title = title.group(1).decode('utf-8')                print title                print url            #获取下一篇连接            nexturl = re.compile(                    '<div.*?class="detail-nav-pre">.*?<a.*?href="(.*?)".*?hidefocus.*?><\/a>.*?<\/div>',                     re.S|re.I|re.U                )            nexturl = nexturl.search(c)            #获取token            modify_url = 'http://hi.baidu.com/pub/show/modifytext?qbid='+qbid            modify_c = urllib2.urlopen(url).read()            qbtoken = re.compile(                'window.qBdsToken="(.*?)";'                )            qbtoken = qbtoken.search(modify_c).group(1)            #编码传回百度            postmsg = unicode(msg,'gbk').encode('utf-8')            #提交数据            postdata={                #'title':'愤怒离开:http://fc-lamp.blog.163.com/',                'content':postmsg,                'qbid':qbid,                'bdstoken':qbtoken,            }            postdata = urllib.urlencode(postdata)            req = urllib2.Request(                url=postedit_url,                data = postdata,                headers = self.header                )            res = urllib2.urlopen(req).read()            #换成dict            res = eval(res)            if res['errorMsg']!='':                i+=1                if i>=2:                    print '哎~~又被禁言了,休息四分种吧。'                    time.sleep(240)                else:                    print '哎~~又被禁言了,休息两分种吧。'                    time.sleep(120)                continue            else:                i=0            #是否还有下一篇            if nexturl==None:                break            url = self.qutoSin(nexturl.group(1).strip()).replace("\\",'')            qbid = url.split('/')[-1]#例子if __name__ =='__main__':    while True:        print '---================----'        print '特别说明:此程序仅做程序学习所用,严禁任何形式的商业用途,否则后果自负!!!'        
print '        Author:FC_LAMP Blog: fc-lamp.blog.163.com'        print '---================----\n\n百度空间文章批量修改'        user = str(raw_input('\n你的用户名(回车确定):'))        pwd = str(raw_input('\n你的密码(回车确定):'))        print '登录中.....'        spider = SpiderBaiDu()        f = spider.LoginBaiDu(user,pwd)        if f == True:            print '登录成功!'            url = str(raw_input('你的空间域名(回车确定):'))            flag = str(raw_input('你确定要批量修改百度空间中的所有文章吗?(y/n)')).lower()            if flag =='y':                msg = str(raw_input('请输入新的内容:'))                st = time.time()                try:                    print '解析中.......'                    spider.parse_html(url,msg,debug=True)                except Exception as e:                    print '解析错误!空间无法被正常解析!空间是否被度娘关了(或没有文章)?'                et = time.time()                c = et - st                print('批量操作结束,耗时:%0.2f'%c)            else:                print ''        else:            print(f)

图片抓取下载

#coding:utf-8import os,urllib2,urllib,re,time#导入PILimport Image as imageimport StringIOst = time.time()#URL地址 这里仅是某一篇文章的URL#整个空间文章的抓取可与http://fc-lamp.blog.163.com/blog/static/1745666872012784515541/这篇文章#参考来做url = 'http://hi.baidu.com/xxx/item/xxxx'#保存地址save_dir = 'd:/imgtest/'#图片地址img_dir = 'img' #文章qbid(这里可以用spider代码来动态改变qbid值)qbid = 'test321'#下载内容old_c = urllib2.urlopen(url).read()c_patt = re.compile('<div\s+id=content.*?>([^>])<\/div>',                  re.I|re.S)old_c = c_patt.search(old_c).group()del c_patt#批量替换旧内容中的图片的路径img_patt = re.compile('src=".*?/(\w+\.\w+)"')new_c = img_patt.sub(r'src="./%s/\1"'%img_dir,old_c)#保存文章fp = open(os.path.join(save_dir,'%s.html'%qbid),'a')fp.write(new_c)fp.close()#真正下载图片img_patt = re.compile('src="(.*?)"')img_patt = img_patt.findall(old_c)i =0for img in img_patt:    i+=1    #图片名称    img_name = os.path.join(img_dir,img.split('/')[-1])    #获取图片资源    img = urllib2.urlopen(img).read()    im = image.open(StringIO.StringIO(img))    #im.show()    #合并路径    path = os.path.join(save_dir,img_name)    #保存    im.save(path)et = time.time() - stprint unicode('共抓取了 %d 张图片,耗时 %0.2f'%(i,et),'utf-8')

评论关闭