# 抓取点东西,抓取点,#encoding=utf8


#encoding=utf8
# NOTE: a PEP 263 coding declaration is only honoured on line 1 or 2 of a
# file, so everything below is kept pure ASCII (non-ASCII text is escaped).
#
# Scrape a forum thread list and dump every thread's title and first-post
# text into a UTF-8 text file.  The code runs on Python 2 (which the script
# was originally written for) as well as Python 3.
import contextlib
import io

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin

# ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u"")

HOST = 'club.iweihai.cn'
LIST_URL = 'http://%s/thread-htm-fid-122-type-183-type-183.html#tabA' % HOST
# Same file name as before, written with escapes so the source stays ASCII.
OUTPUT_PATH = u"\u627e\u8001\u516c\u7684.txt"
PAGE_ENCODING = 'gbk'


def strip_tags(t, invalid_tags):
    """Return the text of a node list with all markup removed.

    Every text fragment is stripped of surrounding whitespace and the
    fragments are joined without a separator; nested nodes are flattened
    recursively.

    Args:
        t: Iterable of nodes.  Text nodes are strings (bs4's
            ``NavigableString`` subclasses the text type); any other node
            must expose its children through a ``.contents`` attribute.
        invalid_tags: Unused.  Kept so existing call sites keep working.

    Returns:
        The concatenated text as a unicode string ("" for an empty list).
    """
    pieces = []
    for node in t:
        if isinstance(node, _TEXT_TYPE):
            text = node
        else:
            text = strip_tags(node.contents, invalid_tags)
        pieces.append(_TEXT_TYPE(text.strip()))
    # join() instead of repeated ``+=`` keeps this linear in the output size.
    return u"".join(pieces)


def _fetch_soup(url):
    """Download *url*, decode it as GBK and return the parsed document."""
    # Imported lazily so strip_tags() can be used without bs4 installed.
    from bs4 import BeautifulSoup

    # closing() works for the Python 2 response object too, which is not a
    # context manager on its own.
    with contextlib.closing(urlopen(url)) as response:
        # 'replace' keeps one undecodable byte from aborting the whole run.
        html = response.read().decode(PAGE_ENCODING, 'replace')
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(html, 'html.parser')


def main():
    """Write '#<index> <title>' and the first-post text of every thread."""
    listing = _fetch_soup(LIST_URL)
    header_rows = listing.select("#threadlist .tr4")
    if not header_rows:
        raise RuntimeError('thread list not found at %s' % LIST_URL)
    rows = header_rows[0].find_next_siblings()

    rule = u'-' * 80 + u'\n'
    # The file is opened as UTF-8 text, so encoding happens once, on write.
    with io.open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for index, row in enumerate(rows):
            links = row.select(".subject_t")
            if not links:
                # Not a thread row (no subject link): nothing to fetch.
                continue
            link = links[0]
            # get_text() also works when the link wraps nested markup,
            # where ``.string`` would be None.
            title = link.get_text()
            # urljoin handles relative and absolute hrefs alike.
            thread_url = urljoin(LIST_URL, link['href'])
            first_post = _fetch_soup(thread_url).select("#read_tpc")

            print('#%d %s %s' % (index, title, thread_url))

            body = strip_tags(first_post[0].contents, []) if first_post else u''
            out.write(u'#%d %s\n' % (index, title))
            out.write(rule)
            out.write(body + u'\n')
            out.write(rule)


if __name__ == "__main__":
    main()

# This snippet comes from http://byrx.net

# 评论关闭