Enter any tieba name and download its images from Baidu Tieba (Python 2.7.6)


Python 2.7.6, using the bs4 module. I'm a newbie with no prior programming experience, and I still haven't wrapped my head around classes.

This script borrows from several pieces of code I studied on OSC; I then wrote this simple version.

Also, a question: is Django hard to learn? How long would it take to set up a blog with it? I don't know HTML, PHP, or anything like that.

I finished the code in the afternoon and ran it on the 美腿 tieba, forgot to shut down my computer, and when I got back from dinner in the evening there were already 16,000 images on it...

# -*- coding:utf8 -*-
from bs4 import BeautifulSoup
import os, urllib2, urllib
import time    # imported but currently unused


def tieba_url():
    print u'Enter the tieba you want to crawl:'
    name = raw_input('>')
    # Create the main folder
    path = os.getcwd()
    path = unicode(path.decode('gbk'))          #!# convert the Windows path to unicode
    path = os.path.join(path, u'爬虫实验')       # folder name means "crawler experiment"
    tieba = unicode(name.decode('gbk'))         #!# decode to unicode; it kept erroring before
    pathr = os.path.join(path, tieba)
    if not os.path.isdir(pathr):
        os.mkdir(pathr)
    # Insert '/good' between 'f' and '?' in this URL to crawl only the digest section
    url = 'http://tieba.baidu.com/f?kw=%s&tp=0&pn=' % name
    urlr = urllib.quote(url, ':/?=&')
    #print urlr
    return (urlr, pathr)


def page_change(page, url):
    url = '%s%d' % (url, page)          #!# neat way to join a string and an int!
    print url
    content = urllib2.urlopen(url).read()
    soup = BeautifulSoup(content)
    #print soup
    my_tiezi = soup.find_all(class_='threadlist_text threadlist_title j_th_tit  notStarList ')
    return my_tiezi
    #print my_tiezi


# Function that extracts thread URLs from a tieba listing page:
def tieba_find_link(my_tiezi):
    flinks = []
    titles = []
    for tiezi in my_tiezi:
        link = tiezi.a.get('href')
        flink = 'http://tieba.baidu.com' + link + '?see_lz=1'  # OP-only view; how do I handle OP-only threads longer than one page, and how do I find the total page count?
        #print flink
        title = tiezi.a.string.encode('gbk', 'ignore')
        flinks.append(flink)
        titles.append(title)
    return (flinks, titles)     # first time running into this; searched around and discovered tuples, what a nice thing


# Function that grabs the OP's images from each thread:
def get_img(flinks, titles, path):
    for i in range(len(flinks)):        # a neat way to unpack both lists in step
        flink = flinks[i]
        title = titles[i]
        content = urllib2.urlopen(flink).read()
        soup = BeautifulSoup(content)
        ilinks = soup.find_all(class_='BDE_Image')
        # Skip the thread if it has no images
        if ilinks == []:
            continue
        for ilink in ilinks:
            src = ilink.get('src')
            print u'About to download an image from thread ' + unicode(title.decode('gbk'))
            try:
                content = urllib2.urlopen(src, timeout=5).read()
            except:
                print u'Request timed out, skipping this image'
                continue
            print u'Download finished'
            try:
                with open(path + '/' + src[-10:], 'wb') as code:
                    code.write(content)
                print u'Image saved'
                #print u'Save finished'
            except:
                print u'Save failed, probably because of the folder name:'
                print path + '/' + src[-10:]


# The main function:
def main(url, path, page=0):
    p1 = page_change(page, url)
    (flinks, titles) = tieba_find_link(p1)
    get_img(flinks, titles, path)
    page = int(page) + 50
    print u'Fetching the next page'
    print '-' * 40
    main(url, path, page)       # I had this order wrong before and it kept erroring

(url, path) = tieba_url()
main(url, path)
# This snippet is from http://byrx.net
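About the question in the comment inside tieba_find_link (an OP-only thread can span several pages): one rough idea is to read the page count off the thread's own pager, then walk ?see_lz=1&pn=2, 3, ... This is only a sketch; the 'l_reply_num' selector and the "last span holds the page count" detail are assumptions that need checking against the HTML Tieba actually serves.

import urllib2
from bs4 import BeautifulSoup

def get_total_pages(flink):
    # Assumption: the pager looks like <li class="l_reply_num">...<span>N</span></li>,
    # with the total page count in the last span. Verify against the live page.
    content = urllib2.urlopen(flink, timeout=5).read()
    soup = BeautifulSoup(content)
    li = soup.find('li', class_='l_reply_num')
    if li is None:
        return 1                       # pager not found: treat as a single page
    spans = li.find_all('span')
    try:
        return int(spans[-1].get_text())
    except (IndexError, ValueError):
        return 1

def iter_thread_pages(flink):
    # Assumption: thread pages are addressed as ?see_lz=1&pn=1, 2, 3, ...
    for pn in range(1, get_total_pages(flink) + 1):
        yield flink + '&pn=%d' % pn

get_img() could then loop over iter_thread_pages(flink) instead of fetching flink once, feeding each page through the same BDE_Image extraction.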
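Also, main() calls itself once per listing page, so after roughly 1000 pages Python 2 will kill it with "maximum recursion depth exceeded". A plain loop sidesteps that. Here's a sketch reusing the functions above; stopping on an empty listing is my assumption (Tieba may instead keep serving the last page past the end).

def main(url, path, page=0):
    while True:
        my_tiezi = page_change(page, url)
        if not my_tiezi:               # no threads parsed: assume we ran off the last page
            break
        (flinks, titles) = tieba_find_link(my_tiezi)
        get_img(flinks, titles, path)
        page += 50                     # the listing shows 50 threads per page, so pn steps by 50
        print u'Fetching the next page'
        print '-' * 40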
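One more caveat: naming files by src[-10:] can silently overwrite different images whose URLs happen to share the same last ten characters. A collision-free alternative is to hash the whole URL; the '.jpg' fallback below is an assumption about what Tieba serves.

import hashlib, os

def img_filename(path, src):
    # Hash the full image URL so distinct URLs never map to the same file.
    ext = os.path.splitext(src)[1] or '.jpg'   # assume .jpg when the URL has no extension
    return os.path.join(path, hashlib.md5(src).hexdigest() + ext)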
