求推荐 Python 多线程爬虫重复内容处理思路


"""Multi-threaded crawler for Ubuntu Forums thread titles.

Fetches forum listing pages concurrently, extracts each thread's link and
title, and appends de-duplicated results to a text file.  After all workers
finish, it reports any line that still occurs more than once in the file.

Fixes over the original Python 2 script:
- ``apply()`` was removed in Python 3; the worker now calls ``func(*args)``.
- ``html_text`` was referenced even on non-200 responses (UnboundLocalError);
  non-200 pages are now skipped.
- ``BeautifulSoup`` is given an explicit parser.
- Output writes are guarded by a lock and a shared "seen" set, so pinned
  (sticky) threads that appear on every listing page are written only once —
  the duplicate-content problem the original author hit.
"""

import threading

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://ubuntuforums.org/forumdisplay.php?f=333'
OUTPUT_PATH = '/home/zhg/Pictures/cao.txt'


def build_url_list(num_pages=49):
    """Return the listing URLs: the base page plus pages 1..num_pages.

    The original built the same 50-element list (base URL + 49 paged URLs).
    """
    urls = [BASE_URL]
    urls.extend('%s&page=%d' % (BASE_URL, page)
                for page in range(1, num_pages + 1))
    return urls


class MyThread(threading.Thread):
    """Thread that invokes *func* with positional *args* when started."""

    def __init__(self, func, args, name=""):
        super().__init__(name=name)
        self.func = func
        self.args = args

    def run(self):
        # apply() no longer exists in Python 3; unpack the args directly.
        self.func(*self.args)


# Shared worker state: lines already written, and a lock guarding both the
# set and the output file so concurrent appends cannot interleave.
_seen = set()
_lock = threading.Lock()


def running(url):
    """Fetch *url*, extract thread links/titles, append new lines to the file.

    Non-200 responses are skipped (the original crashed on them because
    ``html_text`` was only assigned inside the status check).
    """
    resp = requests.get(url)
    if resp.status_code != 200:
        return
    soup = BeautifulSoup(resp.text, 'html.parser')
    lines = []
    for link in soup.find_all('a', 'title'):
        lines.append('http://ubuntuforums.org/' + str(link.get('href'))
                     + ' ' + link.get_text())
    # Parse outside the lock; hold it only for the shared set + file write.
    with _lock:
        with open(OUTPUT_PATH, 'a+', encoding='utf-8') as f:
            for line in lines:
                if line not in _seen:  # de-dup: sticky posts repeat per page
                    _seen.add(line)
                    f.write(line + '\n')


def report_duplicates(path=OUTPUT_PATH):
    """Print every line in *path* that occurs more than once."""
    with open(path, 'r', encoding='utf-8') as f:
        f_list = f.readlines()
    for x in set(f_list):
        count = f_list.count(x)
        if count > 1:
            print("the <%s> has found <%d>" % (x, count))


if __name__ == '__main__':
    thread_list = [MyThread(running, (url,), running.__name__)
                   for url in build_url_list()]
    for t in thread_list:
        t.daemon = True  # setDaemon() is deprecated since Python 3.10
        t.start()
    for t in thread_list:
        t.join()
    print("process ended")
    report_duplicates()

而且如果加锁的话,运行速率和直接用 for 循环(不用多线程)一样,这是为什么?

额,这个问题解决了,问题留着给其他人参考。
数据在存储到文件之前没有去重复,而爬数据的网页上有置顶的文章,所以爬了50多页,三条置顶的文章出现了40多次。
我还以为是没加锁的关系。

编橙之家文章,

评论关闭