Scraping novel chapters from the web and writing them to a txt file


The script reads a novel's catalogue page, collects the chapter hyperlinks, sorts them by chapter number, then downloads each chapter and appends its title and text to a local txt file.

[Python] code

import urllib.request as web
import re


def getContent(url):
    """Extract the chapter title and body text from a chapter page's HTML."""
    http = str(web.urlopen(url).read(), encoding='GBK')
    title = re.findall('<h1>.*?</h1>', http)[0]
    title = re.sub('</?h1>', '', title)
    # re.S lets the pattern span line breaks inside the content div
    content = re.findall('<div id="content">.*?</div>', http, re.S)[0]
    content = re.sub('<br />', '\n', content)
    content = re.sub('<div id="content">|</div>', '', content)
    content = re.sub('&nbsp;', ' ', content)  # turn non-breaking-space entities into plain spaces
    return (title, content)


def getUrlList(url):
    """Collect the chapter hyperlinks from the catalogue page's HTML."""
    http = str(web.urlopen(url).read(), encoding='GBK')
    # match <a> tags whose link text contains "章" (chapter)
    lis = re.findall('<a.*?章.*?</a>', http)
    hrefs = []
    for l in lis:
        try:
            hrefs.append(l.split('"')[1])  # the href is the first quoted attribute value
        except IndexError:
            pass
    return hrefs


if __name__ == '__main__':
    url = '小说地址url'  # placeholder: the novel's catalogue page URL; "<number>.html" is appended to it below
    f = open('e://name.txt', mode='w')
    urlList = getUrlList(url)
    numUrlList = []
    for u in urlList[:-1]:
        try:
            # chapter links look like "12345.html"; keep the numeric part so they can be sorted
            numUrlList.append(int(u[:-5]))
        except ValueError:
            pass
    numUrlList.sort()
    for href in numUrlList:
        h = url + str(href) + '.html'
        print(h)
        try:
            c = getContent(h)
        except Exception:
            # retry once before giving up on this chapter
            try:
                c = getContent(h)
            except Exception:
                print('failed to read', h)
                continue
        title, content = c
        print(title, 'done')
        f.write(title + '\n')
        f.write(content)
        f.write('\n')
    print('all done')
    f.close()
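For reference, the filtering in getContent comes down to two regular expressions: one pulls the chapter title out of the <h1> tag, the other grabs the body text from the content div before the cleanup substitutions run. The minimal sketch below applies those same steps to a made-up HTML snippet (the sample string is hypothetical and only mirrors the structure the script's regexes expect), so the tag stripping, <br /> to newline conversion, and &nbsp; replacement can be seen in isolation.

import re

# Hypothetical chapter HTML mirroring the structure getContent expects.
sample = ('<h1>Chapter 1 The Beginning</h1>'
          '<div id="content">First paragraph.<br />&nbsp;&nbsp;Second paragraph.</div>')

# Same extraction and cleanup steps as getContent, applied to the sample.
title = re.sub('</?h1>', '', re.findall('<h1>.*?</h1>', sample)[0])
body = re.findall('<div id="content">.*?</div>', sample, re.S)[0]
body = re.sub('<br />', '\n', body)
body = re.sub('<div id="content">|</div>', '', body)
body = re.sub('&nbsp;', ' ', body)

print(title)  # Chapter 1 The Beginning
print(body)   # First paragraph.
              #   Second paragraph.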
