豆瓣-只看楼主-pyquery版,豆瓣楼主-pyquery版,[Python]代码#
豆瓣-只看楼主-pyquery版,豆瓣楼主-pyquery版,[Python]代码#
[Python]代码
# Download a Douban group topic and print only the posts written by the
# topic's original poster ("LZ"), parsing the HTML with pyquery.
# Anonymous 2011-12@SZ

import sys
import time

try:
    from pyquery import PyQuery as pq
except ImportError:
    # Keep the module importable (the pure string helpers below do not need
    # pyquery); get_page() reports the missing dependency when it is called.
    pq = None

# Config
post_home_url = 'http://www.douban.com/group/topic/23871584/'

# Increment applied to the ``?start=`` query parameter between downloads.
PAGE_STEP = 100
# Pause after each processed page, in seconds.
REQUEST_DELAY = 2


def iter_page(page):
    """Return every element of the selection *page* as its own selection."""
    return [page.eq(i) for i, _ in enumerate(page)]


def build_page_url(start):
    """Return the topic URL with ``?start=<start>`` appended."""
    return post_home_url + '?start=%d' % start


def get_page(i):
    """Download the topic page for offset *i* and return the parsed document.

    Raises:
        ImportError: if pyquery is not installed.
    """
    if pq is None:
        raise ImportError('pyquery is required to download pages')
    return pq(url=build_page_url(i))


def clean_post_html(html):
    """Turn ``<br/>`` into newlines and drop ``&#13;`` entities.

    Returns an empty string when *html* is None, which is what ``.html()``
    gives for an empty selection.
    """
    if html is None:
        return ''
    return html.replace('<br/>', '\n').replace('&#13;', '')


def extract_user_id(href):
    """Return the path segment that follows ``people/`` in *href*.

    The original sliced ``href[href.find('people') + 7:-2]``, which cuts off
    the last character of the id when the link ends in a single ``/`` and
    returns an arbitrary slice when ``people`` is absent.  An empty string is
    returned when the marker is not found.
    """
    marker = 'people/'
    position = href.find(marker)
    if position < 0:
        return ''
    return href[position + len(marker):].split('/', 1)[0]


def _parse_post(item, body_selector):
    """Build the ``(user_href, fields)`` pair for one post element.

    Returns None when *item* has no ``.user-face a`` link.
    """
    user = item('.user-face a').attr('href')
    if user is None:
        return None
    text = {
        'post': clean_post_html(item(body_selector).html()),
        'id': extract_user_id(user),
        # Only the first 20 characters of the header text are kept --
        # presumably the timestamp; verify against the page markup.
        'time': (item('.reply-doc h4').text() or '')[:20],
        'name': item('.reply-doc h4 a').text() or '',
    }
    return user, text


def get_topic(page):
    """Yield ``(user_href, fields)`` for the opening post of *page*.

    Iterates the ``.topic-content`` selection.  The original assigned that
    selection to an unused variable and looped over the whole document
    instead, so the ``.reply-doc h4`` lookups ran against the entire page.
    """
    # NOTE(review): the time/name selectors are the same '.reply-doc h4' ones
    # used for replies; confirm against the page markup whether the topic
    # header needs different selectors.
    for item in iter_page(page('.topic-content')):
        parsed = _parse_post(item, '.topic-doc p')
        if parsed is not None:
            yield parsed


def get_replies(page):
    """Yield ``(user_href, fields)`` for every ``.topic-reply li`` of *page*."""
    for item in iter_page(page('.topic-reply li')):
        parsed = _parse_post(item, '.reply-doc p')
        if parsed is not None:
            yield parsed


def get_lz(page=None):
    """Return the profile href of the topic author, or None if none is found.

    *page* lets the caller pass an already downloaded first page; when it is
    omitted the page for offset 0 is downloaded, as before.
    """
    if page is None:
        page = get_page(0)
    for user, _ in get_topic(page):
        return user
    return None


def _emit(*fields):
    """Print *fields* separated by single spaces on one line.

    Python 2 gets UTF-8 encoded bytes (what the original print statements
    produced); Python 3 prints the text directly.
    """
    line = u' '.join(fields)
    if sys.version_info[0] < 3:
        line = line.encode('UTF-8')
    print(line)


def main():
    """Print the opening post, then every reply written by the topic author."""
    _emit(u'Start ...\n')

    # Download the first page once and reuse it for the author lookup, the
    # topic itself and the first batch of replies.
    page = get_page(0)
    lz = get_lz(page)

    # TOPIC
    for _, text in get_topic(page):
        _emit(text['time'], text['id'], text['name'], text['post'])

    # REPLIES
    start = 0
    while True:
        if start:
            page = get_page(start)
        # Stop at the first page that has no 'li' element of class 'clearfix'.
        if not page('li').hasClass('clearfix'):
            break
        for user, text in get_replies(page):
            if user == lz:
                _emit(text['time'], text['id'], text['name'])
                _emit(text['post'])
                _emit(u'\n')
        start += PAGE_STEP
        time.sleep(REQUEST_DELAY)

    _emit(u'\n... Finished.')


if __name__ == '__main__':
    main()
相关内容
- 豆瓣-只看楼主,豆瓣-楼主,[Python]代码#
- 编程制-工程图(基础篇),工程基础篇,添加线# -*- cod
- 编程制-工程图(基础篇),工程基础篇,添加一个点# -*-
- 度换算成度分秒,换算成度分秒,[Python]代码#!
- 利用google地图根据地址批量获取经纬度,,利用google地图
- 上个程序提到的isin,程序提到isin,[Python]代码de
- 提取MD5值一样(或不一样)的文件,和后缀名一样的文
- python发送HTTP请求,python发送请求,GET 方法>>> im
- try...finally 异常,try...finally,异常处理import t
- 文件读写,,文件读写#第九章 文件#
评论关闭