豆瓣-只看楼主-pyquery版,豆瓣楼主-pyquery版,[Python]代码#


[Python]代码

# Download a Douban group topic and print only the posts written by the
# topic starter ("LZ"); HTML is fetched and parsed with pyquery.
# Anonymous 2011-12@SZ
import time

from pyquery import PyQuery as pq

# Config
post_home_url = 'http://www.douban.com/group/topic/23871584/'
# Increment applied to the ``?start=`` offset between page requests.
# Presumably matches the number of replies Douban serves per page -- verify.
PAGE_STEP = 100
# Seconds to pause between page requests.
FETCH_DELAY = 2


def iter_page(page):
    """Return a list with one single-element PyQuery wrapper per match in *page*."""
    return [page.eq(index) for index, _ in enumerate(page)]


def get_page(i):
    """Fetch and parse the topic page whose ``start`` offset is *i*."""
    url = post_home_url + '?start=%d' % i
    return pq(url=url)


def _clean_html(markup):
    """Turn ``<br/>`` into newlines and strip carriage returns from *markup*.

    Returns an empty string when *markup* is None (``.html()`` on an empty
    selection), so callers never hit ``None.replace``.
    """
    if markup is None:
        return u''
    # NOTE(review): the published listing shows a raw line break inside the
    # second replace() literal, which is how a decoded carriage-return entity
    # renders. Both the entity and the bare character are stripped here --
    # confirm against the original script.
    return markup.replace('<br/>', '\n').replace('&#13;', '').replace('\r', '')


def _extract(item, body_selector):
    """Build the ``(user, text)`` pair for one post element.

    *body_selector* locates the paragraph holding the post body inside *item*.
    Returns None when *item* has no ``.user-face a`` link.
    """
    user = item('.user-face a').attr('href')
    if user is None:
        return None
    text = {}
    text['post'] = _clean_html(item(body_selector).html())
    # Slice starts 7 characters after the first 'people' and drops the last
    # two characters of the href (kept exactly as in the original script).
    text['id'] = user[user.find('people') + 7:-2]
    # ``or u''`` guards pyquery versions that return None for empty matches.
    text['time'] = (item('.reply-doc h4').text() or u'')[:20]
    text['name'] = item('.reply-doc h4 a').text() or u''
    return user, text


def get_topic(page):
    """Yield ``(user_href, text_dict)`` for the topic post found in *page*.

    NOTE(review): the original looked up ``page('.topic-content')`` but never
    used the result, iterating over *page* itself instead. That scope is kept
    here, which means 'time' and 'name' are read from ``.reply-doc`` markup
    anywhere in the document -- probably not what was intended; confirm the
    correct topic selectors before relying on those two fields.
    """
    for item in iter_page(page):
        entry = _extract(item, '.topic-doc p')
        if entry is not None:
            yield entry


def get_replies(page):
    """Yield ``(user_href, text_dict)`` for every ``.topic-reply li`` in *page*.

    List items without a ``.user-face a`` link are skipped.
    """
    for item in iter_page(page('.topic-reply li')):
        entry = _extract(item, '.reply-doc p')
        if entry is not None:
            yield entry


def get_lz(page=None):
    """Return the profile href of the topic starter, or None if none is found.

    *page* is an already-fetched first page; when omitted, page 0 is
    downloaded, as the original always did.
    """
    if page is None:
        page = get_page(0)
    for user, _ in get_topic(page):
        return user
    return None


def _emit(*fields):
    """Print *fields* on one line, space-separated, UTF-8 encoded on Python 2."""
    line = u' '.join(fields)
    if not isinstance(line, str):
        # Python 2: ``line`` is unicode; encode explicitly as the original did.
        line = line.encode('UTF-8')
    print(line)


print('Start ...\n')

# LZ -- fetch the first page once and reuse it for the topic printout below.
first_page = get_page(0)
lz = get_lz(first_page)

# TOPIC
for user, text in get_topic(first_page):
    _emit(text['time'], text['id'], text['name'], text['post'])

# REPLIES
i = 0
while True:
    page = get_page(i)
    # Stop at the first page that has no ``li`` carrying the 'clearfix' class.
    if not page('li').hasClass('clearfix'):
        break
    for user, text in get_replies(page):
        if user == lz:
            _emit(text['time'], text['id'], text['name'])
            _emit(text['post'])
            print('\n')
    i += PAGE_STEP
    time.sleep(FETCH_DELAY)

print('\n... Finished.')

评论关闭