使用 Python 抓取小说,记录小说的信息


记录小说的信息(保存为 novel.json,供下面的脚本读取):

{

"不败战神": {

"url": "http://www.23us.com/html/27/27736/",

"bookmark": "第两百八十八节 唐天的判断"

},

"大主宰": {

"url": "http://www.23us.com/html/28/28373/",

"bookmark": "第一百九十五章 取巧"

},

"神级英雄": {

"url": "http://www.23us.com/html/42/42368/",

"bookmark": "第101章 牵动公会巨头的卷轴"

}

}

# -*- coding: utf-8 -*-
#
# Download the chapters published after a saved bookmark for every novel
# listed in novel.json, write them to a timestamped text file per novel and
# move the bookmark forward.
#
# novel.json maps a novel name to {"url": <chapter index URL>,
# "bookmark": <title of the last chapter already downloaded>}.
#
# Python 2 only: depends on urllib2, BeautifulSoup 3 and
# sys.setdefaultencoding().
from BeautifulSoup import BeautifulSoup
import urllib2
import codecs
import time
import json
import sys

# Work around "UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in
# position 0": the script mixes UTF-8 byte strings with unicode objects, so
# implicit conversions have to use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')


def novelFilter(content):
    """Remove the HTML wrapper and line-break markup from a chapter body.

    content -- chapter text still containing the ``<dd id="contents">``
               wrapper and ``<br />`` tags.

    Returns the text with every listed markup fragment deleted.
    """
    # Order matters: the paired break (two tags around a newline) has to be
    # removed before the single "<br />" pattern would split it.
    # NOTE(review): the ' ' pattern presumably stood for '&nbsp;' before the
    # snippet was flattened to plain text, so both are stripped -- confirm.
    fragments = (
        '<br />\n<br />',
        '<br />',
        '&nbsp;',
        ' ',
        '<dd id="contents">',
        '</dd>',
    )
    for fragment in fragments:
        content = content.replace(fragment, '')
    return content


def novelFetch(url, title):
    """Download one chapter page and return its cleaned text.

    url   -- address of the chapter page (decoded as GBK).
    title -- chapter title as a UTF-8 byte string; it is placed on its own
             line in front of the chapter text and echoed to stdout.

    Returns the title line followed by the filtered chapter body.
    Raises ValueError when the page has no ``<dd id="contents">`` element;
    errors raised by urllib2 propagate to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        html = page.read().decode('gbk', 'ignore')
    finally:
        page.close()

    body = BeautifulSoup(html).find('dd', id='contents')
    if body is None:
        # str(None) would otherwise end up in the output as the text "None".
        raise ValueError('no <dd id="contents"> element in %s' % url)

    contents = novelFilter('\n' + title + '\n' + str(body))
    print(title.decode("utf-8"))
    return contents


def novelSelect(url, mark):
    """Collect every chapter listed after the bookmarked one.

    url  -- address of the novel's chapter index; chapter links found in the
            index are appended to it.
    mark -- title of the last chapter already downloaded (unicode).

    Returns a dict with
      'contents' -- concatenated text of the newly fetched chapters ('' when
                    there is nothing new or the bookmark was not found), and
      'bookmark' -- title of the last chapter fetched successfully, or
                    ``mark`` unchanged when nothing was fetched.
    """
    index = urllib2.urlopen(url)
    try:
        html = index.read().decode('gbk', 'ignore')
    finally:
        index.close()

    found_mark = False
    chapters = []
    bookmark = mark
    for cell in BeautifulSoup(html).findAll('td'):
        anchor = cell.a
        if anchor is None:
            continue  # table cell without a chapter link
        try:
            href = anchor['href']
        except KeyError:
            continue  # anchor without a target
        title = str(anchor.string)

        if found_mark:
            try:
                chapters.append(novelFetch(url + href, title))
            except Exception as exc:
                # Stop at the first failed chapter so the bookmark never
                # moves past text that was not actually saved.
                sys.stderr.write('failed to fetch %s%s: %s\n' % (url, href, exc))
                break
            bookmark = title
        elif title.decode("utf-8") == mark:
            found_mark = True

    return {
        'contents': ''.join(chapters),
        'bookmark': bookmark,
    }


def novelManage(info=0):
    """Load or save the bookmark table stored in novel.json.

    info -- a non-empty table to write to novel.json; any falsy value (the
            default) reads the table from the file instead.

    Returns the table that was written or loaded.
    """
    if info:
        with codecs.open('novel.json', 'w', 'utf-8') as f:
            f.write(json.dumps(info, indent=2, ensure_ascii=False))
    else:
        with codecs.open('novel.json', 'r', 'utf-8') as f:
            info = json.loads(f.read())
    return info


def main():
    """Fetch new chapters for every tracked novel and persist the bookmarks."""
    novels = novelManage()
    has_update = False
    for title in novels:
        entry = novels[title]
        try:
            novel = novelSelect(entry['url'], entry['bookmark'])
        except Exception as exc:
            # One unreachable index page must not discard the bookmarks of
            # novels whose chapters were already written to disk.
            sys.stderr.write('failed to read %s: %s\n' % (entry['url'], exc))
            continue

        cont = novel['contents']
        bookmark = novel['bookmark']
        if cont and bookmark:
            entry['bookmark'] = bookmark
            timestamp = time.strftime("%Y%m%d%H%M", time.localtime())
            # The chapter file is written in GBK.
            with codecs.open(title + '_' + timestamp + '.txt', 'w', 'gbk') as f:
                f.write(cont)
            has_update = True

    if not has_update:
        print('小说没有更新。'.decode('utf-8'))
    else:
        novelManage(novels)


if __name__ == '__main__':
    main()
# Snippet source: http://byrx.net

评论关闭