使用 Python 抓取小说，并记录小说的信息
在 novel.json 中记录每本小说的目录页地址（url）和已抓取到的最后一章标题（bookmark）：
{
"不败战神": {
"url": "http://www.23us.com/html/27/27736/",
"bookmark": "第两百八十八节 唐天的判断"
},
"大主宰": {
"url": "http://www.23us.com/html/28/28373/",
"bookmark": "第一百九十五章 取巧"
},
"神级英雄": {
"url": "http://www.23us.com/html/42/42368/",
"bookmark": "第101章 牵动公会巨头的卷轴"
}
}
# -*- coding: utf-8 -*-
"""Download the new chapters of bookmarked novels (Python 2 script).

``novel.json`` maps a novel name to ``{"url": <chapter index page>,
"bookmark": <title of the last chapter already saved>}``.  For every novel
the chapters listed after the bookmark are downloaded and written to
``<name>_<timestamp>.txt``; the updated bookmarks are then written back to
``novel.json``.
"""
import codecs
import json
import sys
import time
import urllib2

from BeautifulSoup import BeautifulSoup

# Works around "UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in
# position 0": the script mixes UTF-8 byte strings with unicode objects and
# relies on the implicit conversions between them using UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')


def novelFilter(content):
    """Return *content* with the listed markup fragments removed."""
    # NOTE(review): '<br />\\n<br />' contains a literal backslash followed by
    # "n", and ' ' strips every plain space.  Both look like escaping
    # artifacts (presumably '\n' and '&nbsp;' were meant) -- confirm against
    # real page markup before changing them; they are kept as found.
    for fragment in ('<br />\\n<br />', '<br />', ' ',
                     '<dd id="contents">', '</dd>'):
        content = content.replace(fragment, '')
    return content


def _fetchSoup(url):
    """Download *url*, decode it as GBK (bad bytes ignored) and parse it."""
    response = urllib2.urlopen(url)
    try:
        html = response.read().decode('gbk', 'ignore')
    finally:
        # Always release the connection, even when reading fails.
        response.close()
    return BeautifulSoup(html)


def novelFetch(url, title):
    """Download one chapter and return it as text headed by *title*.

    The chapter body is the ``<dd id="contents">`` element of the page,
    cleaned with ``novelFilter``.  The title is printed as progress output.
    """
    soup = _fetchSoup(url)
    # Real newlines around the heading (the previous '\\n' literals wrote a
    # backslash followed by "n" into the output).
    contents = '\n' + title + '\n' + str(soup.find('dd', id='contents'))
    contents = novelFilter(contents)
    print(title.decode("utf-8"))
    return contents


def novelSelect(url, mark):
    """Collect every chapter listed after the one titled *mark*.

    Walks the linked ``<td>`` cells of the index page at *url*; once a cell
    whose link text equals *mark* has been seen, each following linked
    chapter is downloaded with ``novelFetch``.

    Returns a dict with ``'contents'`` (the concatenated chapter texts, empty
    when nothing was downloaded) and ``'bookmark'`` (the title of the last
    linked cell that was processed successfully).
    """
    soup = _fetchSoup(url)
    found = False
    chapters = []
    bookmark = ''
    for cell in soup.findAll('td'):
        anchor = cell.a
        if anchor is None:
            # Cell without a link: nothing to inspect.
            continue
        try:
            href = url + anchor['href']
        except KeyError:
            # Anchor without an href attribute.
            continue
        title = str(anchor.string)
        if found:
            try:
                text = novelFetch(href, title)
            except Exception as err:
                # Stop at the first chapter that cannot be downloaded so the
                # bookmark stays on the last chapter actually saved and the
                # next run retries from here instead of skipping it silently.
                sys.stderr.write('failed to fetch %s: %s\n' % (href, err))
                break
            chapters.append(text)
        elif title.decode("utf-8") == mark:
            found = True
        bookmark = title
    return {'contents': ''.join(chapters), 'bookmark': bookmark}


def novelManage(info=0):
    """Load or save ``novel.json``.

    With a truthy *info* the mapping is written to ``novel.json``; otherwise
    the file is read and its parsed content is returned.  In both cases the
    resulting mapping is returned.
    """
    if info:
        with codecs.open('novel.json', 'w') as f:
            f.write(json.dumps(info, indent=2, ensure_ascii=False))
    else:
        with codecs.open('novel.json', 'r') as f:
            info = json.loads(f.read())
    return info


def main():
    """Download new chapters for every novel and persist the bookmarks."""
    novels = novelManage()
    hasUpdate = False
    for name in novels:
        entry = novels[name]
        result = novelSelect(entry['url'], entry['bookmark'])
        cont = result['contents']
        bookmark = result['bookmark']
        if not (cont and bookmark):
            continue
        entry['bookmark'] = bookmark
        timestamp = time.strftime("%Y%m%d%H%M", time.localtime())
        # The chapter file is written GBK-encoded.
        with codecs.open(name + '_' + timestamp + '.txt', 'w', 'gbk') as out:
            out.write(cont)
        hasUpdate = True
    if hasUpdate:
        novelManage(novels)
    else:
        print('小说没有更新。'.decode('utf-8'))


if __name__ == '__main__':
    main()
# This snippet comes from http://byrx.net
相关内容
- python 加减小游戏,python减小游戏,from operato
- python 枚举系统进程,python枚举进程,from ctypes.
- 支持ftp上传下载文件和目录,ftp上传下载文件,支持ft
- 改进emacs里看糗百,改进emacs糗百,写代码写累了,可以
- 批量生成ETL Automation APP下Perl脚本执行目录,etlperl,imp
- Python异步任务队列,python异步队列,#!/usr/bin/e
- 再一个生成随机mac地址的脚本,生成mac脚本,#!/usr/bin/
- 命令行看糗百,命令行糗百,#!/usr/bin/e
- 一个用python写的用命令行看糗百的小工具,python,最近正
- 简单多线程字典暴力破解web表单,暴力破解web表单,py
评论关闭