抽取新闻,,经常看财经新闻,写了个简
抽取新闻,,经常看财经新闻,写了个简
经常看财经新闻,写了个简单小程序,抽取财经网新闻到数据库慢慢看
#! /usr/bin/env python
# coding=utf-8
"""Scrape caijing.com.cn news sections and store the articles in MySQL.

For every section index page listed in SECTION_URLS the script collects the
links found inside ``<ul class="list">`` elements, downloads each linked
article, joins the ``<p>`` texts found inside ``<div id="the_content">``,
extracts ten keywords with jieba and writes one row per article into the
``getnewslist`` table of the local ``test`` database.

This is Python 2 code (``urllib.urlopen``, ``reload(sys)``).  The scraping
runs only when the file is executed as a script.
"""
import urllib
from bs4 import BeautifulSoup
import socket
import time
import MySQLdb
import sys
import jieba
import jieba.analyse

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Global socket timeout (seconds) applied to every download.
timeout = 10
socket.setdefaulttimeout(timeout)

# Delay (seconds) before the first request is sent.
sleep_download_time = 20

# Section index pages, scraped in this order.
SECTION_URLS = [
    'http://stock.caijing.com.cn/gpdp/index.html',    # market overview
    'http://stock.caijing.com.cn/stockresearch/',     # strategy research
    'http://stock.caijing.com.cn/market/',            # stock market
    'http://industry.caijing.com.cn/industrianews/',  # industry news
    'http://economy.caijing.com.cn/economynews/',     # macro economy
]

# Separator placed between article paragraphs.
# NOTE(review): the backslashes look like an escaping artifact (a plain
# newline or a literal "\n" was probably intended); the value is kept
# unchanged so stored content stays consistent -- confirm the intent.
PARAGRAPH_SEPARATOR = '\\\\n'


def _fetch_soup(url):
    """Download *url* and parse it, telling BeautifulSoup it is gb18030."""
    return BeautifulSoup(urllib.urlopen(url).read(), from_encoding="gb18030")


def _collect_links(url):
    """Return every <a> tag found inside the <ul class="list"> elements.

    Links from all matching lists are accumulated, so a page with several
    lists contributes all of them and a page with none yields an empty list.
    """
    links = []
    for news_list in _fetch_soup(url).findAll("ul", {"class": "list"}):
        links.extend(news_list.findAll("a"))
    return links


def _fetch_paragraphs(url):
    """Return the paragraph texts of the article at *url*.

    Returns None (after printing a notice) when the download fails or the
    page has no <div id="the_content">, so one bad article does not abort
    the whole run.
    """
    try:
        soup = _fetch_soup(url)
    except (IOError, socket.error) as err:
        print('skip %s: download failed (%s)' % (url, err))
        return None
    body = soup.find('div', {"id": "the_content"})
    if body is None:
        print('skip %s: no the_content block found' % (url,))
        return None
    return [paragraph.get_text() for paragraph in body.findAll("p")]


def _scrape_articles(anchors):
    """Download each linked article; return (link, title, keywords, content)."""
    articles = []
    for anchor in anchors:
        link = anchor.get('href')
        if not link:
            # An anchor without a target cannot be downloaded.
            continue
        paragraphs = _fetch_paragraphs(link)
        if paragraphs is None:
            continue
        content = PARAGRAPH_SEPARATOR.join(paragraphs)
        keywords = ",".join(jieba.analyse.extract_tags(content, topK=10))
        articles.append((link, anchor.get_text(), keywords, content))
    return articles


def _store(records):
    """Write *records* into getnewslist, always closing the connection."""
    conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                           charset="UTF8")
    try:
        conn.select_db('test')
        cur = conn.cursor()
        cur.executemany(
            """replace into getnewslist values(%s,%s,%s,%s,%s,%s,%s) """,
            records)
        conn.commit()
    finally:
        conn.close()


def main():
    """Run the scrape: collect links, download articles, store the rows."""
    time.sleep(sleep_download_time)
    print('Good! Start to url Parse! Please wait----------------------')

    anchors = []
    for index, url in enumerate(SECTION_URLS):
        links = _collect_links(url)
        print('加载了url_caijing%d %d ,条记录进入-----------------------'
              % (index, len(links)))
        anchors.extend(links)

    print('等待url 解析,waiting----------------------------------')
    articles = _scrape_articles(anchors)

    # One timestamp, taken after all downloads, is shared by every row.
    fetched_at = time.strftime("%Y-%m-%d %H:%M:%S",
                               time.localtime(time.time()))
    records = [
        ('财经', '财经网', link, title.encode('utf-8'),
         keywords.encode('utf-8'), content.encode('utf-8'), fetched_at)
        for link, title, keywords, content in articles
    ]
    if records:
        _store(records)


if __name__ == '__main__':
    main()
# This snippet comes from http://byrx.net
相关内容
- 采用右递归的超简单八皇后解决,采用递归皇后,凡是线
- Python HTMLParser模块,pythonhtmlparser,HTMLParser是p
- Python 生肖和星座计算函数,python生肖,def chinese_
- Django 利用url来控制登录,django利用url登录,1. from djan
- Python PIL批量处理处理图片,,图片太大了,上百张图用
- Python 简单的备份文件脚本,python备份脚本,#! /usr/bin/
- Python 神经网络调教程序,python神经网络调教,import ran
- Python 代码行数统计程序,python行数统计程序,import sys
- Python 相似单词,Python单词,给你一个单词a,如果通过
- Django 简化view函数的编写,django简化view函数,1.定义包装
评论关闭