Python爬取红黑联盟读书频道的图书(python读书频道)
2.png
imgs/asCode/30132527_I83p.png
1.png
imgs/asCode/30125721_DJdz.png
Crawler.py
"""
Created on 2012-11-29

@author: jiangxiaoqiang

Crawls the free-preview books from the 2cto.com (红黑联盟) reading channel
and saves each book as per-chapter HTML files.  Styles are not crawled and
no PDF is generated yet.
"""
import os
import re
import sys
from urllib.request import urlopen


def getWebPageContent(url, encoding):
    """Fetch the page at `url` and decode it with `encoding`.

    Returns the decoded HTML string, or None when fetching/decoding fails
    (an error message is written to stderr in that case).
    """
    try:
        resp = urlopen(url)
        try:
            return resp.read().decode(encoding)
        finally:
            resp.close()  # BUGFIX: the HTTP response was never closed
    except Exception:
        sys.stderr.write('获取URL为%s的网页内容时发生异常!\n' % (url))
        return None  # explicit: callers must handle a failed fetch


def splitMatch(content, reg):
    """Return group 1 of the first match of regex `reg` in `content`."""
    return re.search(reg, content).group(1)


def splitMatches(content, reg):
    """Return every match of regex `reg` in `content`, as a list."""
    return re.findall(reg, content)


def crawLinks(seedUrl):
    """Crawl a book index page such as http://book.2cto.com/201211/9239.html.

    Returns (bookTitle, chapterItems) where chapterItems is a sorted list of
    (chapterName, [sectionUrl, ...]) pairs.
    """
    # The site declares GB2312; GB18030 is a superset, so it decodes safely.
    content = getWebPageContent(seedUrl, 'GB18030')
    if content is None:
        # BUGFIX: the original fell through to re.search(None, ...) and died
        # with an opaque AttributeError when the fetch failed.
        raise RuntimeError('无法获取种子页面: %s' % (seedUrl,))

    # Book title from <title>; only the first 10 characters are kept
    # (the original truncation behavior is preserved).
    bookTitleReg = r'<title>(.*?)</title>\s*(?=<meta)'
    bookTitle = splitMatch(content, bookTitleReg)[0:10]

    # Each chapter is a <li class="chaptr"> followed by one or more
    # <li class="artl"/"artr"> entries carrying the section links, e.g.:
    #   <li class="chaptr">前言与目录</li>
    #   <li class="artl"><a title="前言" target="_blank"
    #       href="http://book.2cto.com/201211/9240.html">前言</a></li>
    chapterReg = (r'<li\s*class=\"chaptr\">(.*?)</li>\s*'
                  r'((?:<li\s*class=\"(?:artl|artr)\">\s*<a\s*title=\".*?\"\s*'
                  r'target=\"_blank\"\s*href=\".*?\">.*?</a>\s*</li>\s*)+)')
    # Look-around keeps only the URL between the quotes of href="...".
    sectionReg = r'(?<=href=\")(.*?)(?=\")'

    chapterWithSectionDict = {}  # chapter name -> list of section URLs
    for chapterName, sectionsHtml in splitMatches(content, chapterReg):
        # BUGFIX/idiom: the original did splitMatches(str(m[1:]), ...),
        # regexing the repr() of a tuple; use the captured group directly.
        chapterWithSectionDict[chapterName] = splitMatches(sectionsHtml,
                                                           sectionReg)

    # NOTE: sorting by the URL lists reproduces the original ordering scheme
    # (dict iteration order was unreliable when this was written).
    return (bookTitle,
            sorted(chapterWithSectionDict.items(),
                   key=lambda kv: kv[1], reverse=False))


def crawSectionPageAndSaveAsText(fileName, saveDir, *sectionLink):
    """Save one chapter named `fileName` as an HTML file under `saveDir`.

    `sectionLink[0]` is the list of section URLs belonging to the chapter.
    IO errors are reported to stderr instead of propagating.
    """
    print('开始爬取 <%s>' % (fileName))
    filePath = os.path.join(saveDir, fileName) + '.html'
    if not os.path.exists(saveDir):  # create target directory on demand
        os.makedirs(saveDir)
    try:
        # BUGFIX: use a context manager so the file is closed even when a
        # section download raises mid-loop (the original leaked the handle);
        # write UTF-8 explicitly so Chinese text is safe on any locale.
        with open(filePath, 'w+', encoding='utf-8') as f:
            f.write('<b>' + fileName + '</b><br /><br />')  # bold chapter name
            for link in sectionLink[0]:
                sectionTitle, sectionContent = \
                    crawSectionTitleAndContent(str(link))
                f.write('<b>' + sectionTitle + '</b><br />')  # bold section name
                f.write(sectionContent + '<br /><br />')
                print(' 爬取完成章节 <%s> !' % (sectionTitle))
    except Exception:
        sys.stderr.write('写入 <%s> 时发生IO异常!\n' % (fileName))


def crawSectionTitleAndContent(sectionLink):
    """Fetch one section page; return (sectionTitle, sectionContentHtml)."""
    content = getWebPageContent(sectionLink, 'GB18030')
    # Title markup looks like: <dl class="box_t"><dd>概述</dd></dl>
    sectionTitleReg = r'<dl\s*class="box_t"><dd>(.*?)</dd></dl>'
    sectionTitle = splitMatch(content, sectionTitleReg)
    # Body markup looks like:
    #   <dl class="box_body" id="fontzoom"><dd id="Article"> ... </dd></dl>
    # [\d\D] matches ANY character including newlines (poor man's DOTALL).
    sectionContentReg = (r'<dl\s*class="box_body"\s*id="fontzoom">\s*'
                         r'<dd\s*id="Article">([\d\D]*?)</dd>\s*</dl>')
    sectionContent = splitMatch(content, sectionContentReg)
    return (sectionTitle, sectionContent)


def main():
    # crawLinks returns something like:
    # ('书名', [('前言与目录', ['http://.../9240.html', ...]), ...])
    sectionLinks = crawLinks('http://book.2cto.com/201211/9643.html')
    bookTitle = sectionLinks[0]
    print('<' + bookTitle + '> 共有' + str(len(sectionLinks[1])) + '章')
    for chapterName, links in sectionLinks[1]:
        # BUGFIX: the original appended os.path.altsep, which is None on
        # POSIX and raised TypeError; os.path.join needs no trailing sep.
        crawSectionPageAndSaveAsText(chapterName,
                                     os.path.join('d:/红黑联盟', bookTitle),
                                     links)
        print('爬取完成 <%s> !\n' % (chapterName))
    print('所有内容爬取完成!')


if __name__ == '__main__':
    main()
相关内容
- Learn Python By Practice — Utilities模块,pythonutilities,Utilit
- Google Python Class练习解答1-string1.py,python1-string1.py,stri
- Learn Python By Practice — 排序和元组,pythonpractice,排序和
- 用urllib按照百度音乐分类下载mp3,urllib百度音乐mp3,[P
- 从1到10的循环方式改进,10循环方式改进,换一种循环方
- python的一些好的非主流语法用法,python非主流语法,#编
- Learn Python By Practice — Range和Slice扩展学习,pythonslice,
- HTMLParser笔记,,[Python]代码#-
- Learn Python By Practice — 文件读写,,import sysim
- Learn Python By Practice — dict,pythondict,def dictTest
评论关闭