Extracting financial news with a small Python scraper


I read financial news all the time, so I wrote a simple little script that pulls articles from caijing.com.cn into a database where I can browse them at my leisure. It fetches five section index pages, collects the article links, downloads each article body, extracts the top-10 keywords with jieba, and writes everything into a MySQL table.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import socket
import sys
import time
import urllib

import jieba
import jieba.analyse
import MySQLdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')   # Python 2 hack so implicit str/unicode conversions use UTF-8

socket.setdefaulttimeout(10)      # give up on connections that hang for more than 10s
time.sleep(20)                    # polite delay before starting to crawl

print 'Good! Start to url Parse! Please wait----------------------'

# Section index pages to scrape
sections = [
    ('http://stock.caijing.com.cn/gpdp/index.html',   'market overview'),
    ('http://stock.caijing.com.cn/stockresearch/',    'strategy research'),
    ('http://stock.caijing.com.cn/market/',           'stock market'),
    ('http://industry.caijing.com.cn/industrianews/', 'industry news'),
    ('http://economy.caijing.com.cn/economynews/',    'macroeconomy'),
]

newslist_caijing = []
for url, desc in sections:
    soup = BeautifulSoup(urllib.urlopen(url).read(), from_encoding='gb18030')
    links = []
    for ul in soup.findAll('ul', {'class': 'list'}):   # collect every headline list on the page
        links.extend(ul.findAll('a'))
    print 'Loaded %d records from %s (%s)' % (len(links), url, desc)
    newslist_caijing.extend(links)

print 'Parsing article pages, waiting----------------------------------'

hlink_caijing = [a['href'] for a in newslist_caijing]
title_caijing = [a.get_text() for a in newslist_caijing]

# Fetch every article and join the <p> tags of its body into one text blob
contents_caijing = []
for link in hlink_caijing:
    body = BeautifulSoup(urllib.urlopen(link).read(),
                         from_encoding='gb18030').find('div', {'id': 'the_content'})
    paragraphs = body.findAll('p') if body else []
    contents_caijing.append('\n'.join(p.get_text() for p in paragraphs))

# Top-10 keywords per article, extracted with jieba's TF-IDF ranking
keyword = [','.join(jieba.analyse.extract_tags(text, topK=10))
           for text in contents_caijing]

gettime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

news_caijing = [('财经', '财经网', hlink_caijing[i],
                 title_caijing[i].encode('utf-8'),
                 keyword[i].encode('utf-8'),
                 contents_caijing[i].encode('utf-8'),
                 gettime)
                for i in range(len(hlink_caijing))]

conn = MySQLdb.connect(host='localhost', user='root', passwd='123456', charset='utf8')
conn.select_db('test')
cur = conn.cursor()
# REPLACE (instead of INSERT) keeps the table deduplicated across re-runs
cur.executemany('replace into getnewslist values(%s,%s,%s,%s,%s,%s,%s)', news_caijing)
conn.commit()
conn.close()   # the original wrote `conn.close` without parentheses, which never actually closed

# Snippet originally from http://byrx.net
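One note on the keywords column: jieba.analyse.extract_tags ranks the words of a text by TF-IDF and returns the topK of them as a list, which the script joins with commas. A minimal sketch of what ends up stored (the sample sentence here is made up):

# -*- coding: utf-8 -*-
import jieba.analyse

text = u'央行今日宣布下调存款准备金率,股票市场应声上涨'   # made-up sample headline
tags = jieba.analyse.extract_tags(text, topK=5)            # top-5 keywords by TF-IDF
print ','.join(tags)   # same comma-joined form the script stores per article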

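To run this end to end you also need the target table, which the post never shows. Below is a minimal sketch inferred from the seven placeholders in the REPLACE statement; the column names and types are my guesses, and link is assumed to be the primary key so that REPLACE deduplicates when the script is re-run:

import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='123456', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS test')
conn.select_db('test')
# Column names/types are assumptions; only the count and order must
# match the REPLACE statement in the scraper.
cur.execute("""
    CREATE TABLE IF NOT EXISTS getnewslist (
        category VARCHAR(32),    -- e.g. '财经'
        source   VARCHAR(64),    -- e.g. '财经网'
        link     VARCHAR(255),   -- article URL, used as the dedupe key
        title    VARCHAR(255),
        keywords VARCHAR(255),   -- comma-joined jieba tags
        contents MEDIUMTEXT,     -- full article body
        gettime  DATETIME,       -- when the row was fetched
        PRIMARY KEY (link)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()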