Crawling and Saving Douban Group Images

A multi-threaded Python 3 script that fetches a Douban group's topic list, extracts the image URLs from each topic page, and downloads the images to a local folder, passing work between the threads through queues.


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import time
import random
import re
import configparser
import logging
import logging.handlers
import lxml.etree as etree
import threading
import queue
import os.path

DOUBAN_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.douban.com/search?cat=1019&q=%E5%AE%B3%E7%BE%9E',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.douban.com',
    'Connection': 'Keep-Alive',
}
IMAGE_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
}
CNFG_FILE = 'douban_crawler.cfg'
LOG_FILE = 'douban_crawler.log'
MAX_LOG_SIZE = 1024 * 1024  # 1 MB
LOG_BACKUP_COUNT = 3

# Log DEBUG and above to a rotating file; INFO and above to the console.
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)
fh = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=MAX_LOG_SIZE,
                                          backupCount=LOG_BACKUP_COUNT, encoding='utf-8')
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Add the handlers to the logger.
logger.addHandler(fh)
logger.addHandler(ch)
DEBUG = logger.debug
INFO = logger.info
WARNING = logger.warning
ERROR = logger.error


class Parser_Douban_Group(threading.Thread):
    """Parse the group's topic list and feed topic URLs into a queue."""

    def __init__(self, url, queue, t_name='Parser Group'):
        threading.Thread.__init__(self, name=t_name)
        self.data = queue
        self.url = url
        self.s = requests.Session()

    def run(self):
        INFO("{0} started!".format(self.name))
        co = 0
        htm = open_douban_page(self.url, self.s)
        try:
            parser = etree.HTMLParser(recover=True)
            text_dom = etree.fromstring(htm, parser)
        except Exception as e:
            ERROR('Parse douban page error: {0}'.format(e))
            # DEBUG('Page: {0}'.format(htm))
        else:
            group_name = ''.join(text_dom.xpath("//div[@id='group-info']/h1//text()")).strip()
            INFO('Group name: {0}'.format(group_name))
            div_node = text_dom.xpath("//tr[@class='']")
            for x in div_node:
                co = co + 1
                item = {}
                item['url'] = ''.join(x.xpath("child::td[@class='title']/a/attribute::href"))
                item['title'] = ''.join(x.xpath("child::td[@class='title']/a//text()"))
                item['auth'] = ''.join(x.xpath("child::td[@nowrap='nowrap']/a[@class='']//text()"))
                item['reply'] = ''.join(x.xpath("child::td[@class='']//text()"))
                item['time'] = ''.join(x.xpath("child::td[@class='time']//text()"))
                # Put each topic into the queue.
                self.data.put(item, block=True)
                DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, co, item['title'][:20]))
        # Put the end-of-stream sentinel (an empty dict).
        self.data.put({})
        INFO("{0} finished! put {1} topics to the queue.".format(self.name, co))


class Parser_Douban_Topic(threading.Thread):
    """Parse each topic page and feed image URLs into a queue."""

    def __init__(self, topic_queue, content_queue, t_name='Parser Topic'):
        threading.Thread.__init__(self, name=t_name)
        self.topic_queue = topic_queue
        self.content_queue = content_queue
        self.s = requests.Session()

    def run(self):
        INFO("{0} started!".format(self.name))
        co = 0
        coo = 0
        while True:
            try:
                # Read from the queue, waiting at most 5 minutes.
                val = self.topic_queue.get(True, 300)
            except queue.Empty:
                ERROR("{0} timeout!".format(self.name))
                break
            if not val:
                # Re-put the sentinel so sibling parsers also stop.
                self.topic_queue.put({})
                INFO("{0} got {1} topics from the queue.".format(self.name, co))
                break
            co = co + 1
            DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
            htm = open_douban_page(val['url'], self.s)
            try:
                parser = etree.HTMLParser(recover=True)
                text_dom = etree.fromstring(htm, parser)
            except Exception as e:
                ERROR('Parse douban page error: {0}'.format(e))
                # DEBUG('Page: {0}'.format(htm))
            else:
                topic_name = ''.join(text_dom.xpath("//div[@id='content']/h1//text()")).replace('\n', '').strip()
                DEBUG('Topic name: {0}'.format(topic_name))
                div_node = text_dom.xpath("//div[@class='topic-content']")
                img_list = div_node[0].xpath("descendant::img/attribute::src") if div_node else []
                for x in img_list:
                    coo = coo + 1
                    item = {'title': topic_name + str(coo), 'url': x}
                    # Put each image URL into the queue.
                    self.content_queue.put(item)
                    DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, coo, item['title'][:20]))
        # Put the end-of-stream sentinel.
        self.content_queue.put({})
        INFO("{0} finished! put {1} images to the queue.".format(self.name, coo))


class Save_Douban_Group(threading.Thread):
    """Download images from the queue and save them to the output folder."""

    def __init__(self, queue, folder_name='image', t_name='Storage'):
        threading.Thread.__init__(self, name=t_name)
        self.data = queue
        self.folder = folder_name
        self.s = requests.Session()
        # Make sure the output folder exists ('import os.path' also binds 'os').
        os.makedirs(folder_name, exist_ok=True)

    def run(self):
        INFO("{0} started!".format(self.name))
        co = 0
        coo = 0
        while True:
            try:
                # Read from the queue, waiting at most 5 minutes.
                val = self.data.get(True, 300)
                if val:
                    co = co + 1
                    DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
                    img_dt = open_douban_page(val['url'], self.s, ret_raw=True)
                    img_nm = val['url'].split('/')[-1]
                    if img_dt:
                        fn = '{0}/{1}'.format(self.folder, img_nm)
                        # Skip images that were already downloaded.
                        if not os.path.exists(fn):
                            with open(fn, 'wb') as fp:
                                fp.write(img_dt)
                            coo = coo + 1
                else:
                    # Re-put the sentinel so sibling storage threads also stop.
                    self.data.put({})
                    break
            except queue.Empty:
                ERROR("{0} timeout!".format(self.name))
                break
            except Exception as e:
                # A failed download should not kill the thread.
                ERROR("{0} error! {1}".format(self.name, e))
        INFO("{0} finished! saved {1}/{2} images.".format(self.name, coo, co))


def open_douban_page(group_url, s, retries=3, ret_raw=False):
    """Fetch a page: HTML text by default, raw image bytes when ret_raw is True."""
    ret = ''
    try:
        cookies = dict(bid="RmFNKKPAd0s")
        if ret_raw:
            r = s.get(group_url, headers=IMAGE_HEADERS, stream=True)
        else:
            r = s.get(group_url, headers=DOUBAN_HEADERS, cookies=cookies)
        r.raise_for_status()
        # Random delay so requests do not hammer the server.
        time.sleep(random.uniform(0.3, 1.5))
    except requests.ConnectionError as e:
        ERROR('Connect douban error({0}): {1}'.format(retries, e))
        retries = retries - 1
        if retries > 0:
            time.sleep(0.5)
            # Pass ret_raw through on retry.
            ret = open_douban_page(group_url, s, retries, ret_raw)
    except Exception as e:
        ERROR('Open douban url({0}) error: {1}'.format(group_url, e))
    else:
        DEBUG('Request url: {0}'.format(group_url))
        if ret_raw:
            # r.content transparently decodes any Content-Encoding.
            ret = r.content
        else:
            ret = r.text
    return ret


def crawler_douban(group_url, folder_name, task_name):
    q_topic = queue.Queue()
    q_content = queue.Queue()
    parser_group_obj = []
    parser_topic_obj = []
    storage_pic_obj = []
    # One group parser, one topic parser and two storage threads.
    for i in range(1, 2):
        parser_group_obj.append(Parser_Douban_Group(group_url, q_topic, '{0} {1}'.format(task_name, i)))
    for i in range(1, 2):
        parser_topic_obj.append(Parser_Douban_Topic(q_topic, q_content, 'Parser Topic {0}'.format(i)))
    for i in range(1, 3):
        storage_pic_obj.append(Save_Douban_Group(q_content, folder_name, 'Storage {0}'.format(i)))
    for obj in parser_group_obj + parser_topic_obj + storage_pic_obj:
        obj.start()
    for obj in parser_group_obj + parser_topic_obj + storage_pic_obj:
        obj.join()


if __name__ == '__main__':
    haixiu_hangzhou_url = 'http://www.douban.com/group/505137/'
    haixiu_url = 'http://www.douban.com/group/haixiuzu/'
    co = 0
    # Re-crawl the group indefinitely; already-saved images are skipped.
    while True:
        co = co + 1
        time.sleep(2.0)
        crawler_douban(haixiu_url, 'image', 'Parser HaiXiu Group ({0})'.format(co))
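
The shutdown handshake between the threads is worth calling out: every producer pushes an empty dict as an end-of-stream sentinel, and a consumer that sees the sentinel puts it back on the queue before exiting, so sibling workers reading the same queue also wake up and stop. Here is a minimal, self-contained sketch of that pattern on its own; the produce/consume names and the item counts are illustrative, not part of the script above:

import queue
import threading

def produce(q):
    # Emit some work items, then a single falsy sentinel ({}).
    for i in range(5):
        q.put({'id': i})
    q.put({})

def consume(q, results):
    while True:
        val = q.get(True, 10)
        if not val:
            # Re-put the sentinel so sibling consumers also stop.
            q.put({})
            break
        results.append(val['id'])

q = queue.Queue()
results = []
workers = [threading.Thread(target=consume, args=(q, results)) for _ in range(2)]
for w in workers:
    w.start()
produce(q)
for w in workers:
    w.join()
print(sorted(results))  # -> [0, 1, 2, 3, 4]

Appending to the shared results list from two threads is safe here because list.append is atomic under CPython's GIL; the crawler itself sidesteps shared state entirely by having each storage thread write files named after the image URL.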
