抓取网上的小说章节并写入txt文件,抓取网上章节txt,import urllib
文章由Byrx.net分享于2019-03-23 09:03:41
抓取网上的小说章节并写入txt文件,抓取网上章节txt,import urllib
import urllib.request as webimport re'''用来过滤出小说html中小说的章节名与内容'''def getContent(url): http = str(web.urlopen(url).read(),encoding='GBK') title = re.findall('<h1>.*?</h1>',http)[0] title = re.sub('</?h1>','',title) content = re.findall('<div id="content">.*?</div>',http)[0] content = re.sub('<br />','\\n',content) content = re.sub('<div id="content">|</div>','',content) content = re.sub(' ',' ',content) return (title,content)'''用来获取目录页码html下的章节超链接'''def getUrlList(url): http = str(web.urlopen(url).read(),encoding='GBK') lis = re.findall('<a.*?章.*?</a>',http) hrefs = [] for l in lis: try: hrefs.append(l.split('"')[1]) except: pass return hrefsif __name__ == '__main__': url = '小说地址url' f = open('e://name.txt',mode='w') urlList = getUrlList(url) numUrlList = [] for u in urlList[:-1]: try: #print(url,' ',url[:-5]) numUrlList.append(int(u[:-5])) except: pass numUrlList.sort() for href in numUrlList: h = url + str(href) + '.html' print(h) try: c = getContent(h) except: try: c = getContent(h) except: print('读取失败了') continue title,content = c print(title,'完成') f.write(title+'\\n') f.write(content) f.write('\\n') print('全部完成了,ohyeah') f.close()#该片段来自于http://byrx.net
评论关闭