抓取网站特定内容后直接写入 MySQL 数据库(适用于 Python > 2.5 且 < 3.0)
抓取网站特定内容后直接入mysql库,抓取入mysql库,Python > 2.
运行环境:Python > 2.5 且 < 3.0(即 Python 2.6 / 2.7)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape news articles from www.8264.com and insert them into MySQL.

Author: 小项
Preview: http://www.20hotel.com/news

Targets Python 2 (> 2.5, < 3.0): the script relies on ``urllib2``,
``MySQLdb`` and ``sys.setdefaultencoding``.

Work flow, as in the original script:

1. Walk listing pages 90 down to 1 of category 251.
2. For every article linked from a listing page (oldest first), fetch the
   article, follow its pager links, strip anchors and the image-host prefix
   from the body, and download the referenced images with ``wget`` into the
   ``img`` directory.
3. Insert title, body, author and posting time into ``dbname.table_name``.

The scraped pages are GBK encoded; the label patterns below are therefore
written as GBK byte escapes and the extracted text is re-encoded to UTF-8
once, just before it is stored.

The script body runs under a ``__main__`` guard, so importing this file no
longer starts scraping; running it directly is unchanged.
"""
import sys
import os
import re
import random
import subprocess
import urllib2
import time
import datetime

import MySQLdb as mysql

# --- database settings ---
User = 'username'
Passwd = 'password'
Host = 'localhost'
Db = 'dbname'

home = "http://www.8264.com/"

LIST_URL = "http://www.8264.com/portal-list-catid-251-page-%s.html"
USER_AGENT = ('Mozilla/5.0 (compatible; Googlebot/2.1; '
              '+http://www.google.com/bot.html)')

INSERT_SQL = (
    "INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, "
    "`content`, `author`, `order`, `state_radio`, `time`, `view_num`, "
    "`img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', "
    "'', '53');"
)

# GBK bytes of the page labels "添加时间:" (posted at) and "作者:" (author).
POSTED_LABEL = '\xcc\xed\xbc\xd3\xca\xb1\xbc\xe4\xa3\xba'
AUTHOR_LABEL = '\xd7\xf7\xd5\xdf\xa3\xba'

LISTING_BLOCK_RE = re.compile(
    r'<div class="title_8264">(.*?)<div class="pg">', re.DOTALL)
LISTING_LINK_RE = re.compile('<h2><a href="(.*?)" title=\'', re.DOTALL)
TITLE_RE = re.compile(r'<div class="newstitle">(.*?)</div>', re.DOTALL)
# The date is matched by its own shape ("2011-05-10 08:19") rather than by
# the character that follows it: that terminator was lost when the script
# was published, and a plain space cannot be right because the captured
# value itself contains a space.
POSTED_RE = re.compile(
    r'<td align="center" valign="middle">.*?'
    r'<div style="line-height:1.8; text-align:center;">'
    + POSTED_LABEL +
    r'\s*(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2})', re.DOTALL)
# NOTE(review): the published pattern had one extra character (probably a
# decoded "&nbsp;") in front of the label; it is dropped here -- confirm
# against a live page that the first "作者:" is still the byline.
AUTHOR_RE = re.compile(AUTHOR_LABEL + r'(.*?)<br /><a', re.DOTALL)
PAGER_RE = re.compile(r'<div class="pg">(.*?)</div>', re.DOTALL)
PAGE_LINK_RE = re.compile(r'<a href="(.*?)">[0-9]*</a>', re.DOTALL)
BODY_RE = re.compile(r'<div class="newstext">(.*?)</div>', re.DOTALL)
IMG_SRC_RE = re.compile(r'<IMG src="(.*?)">', re.DOTALL)
# Only real <a ...> / </a> tags. The original "[a|A]" class also matched a
# literal "|" and the pattern swallowed tags such as <abbr> or <address>.
ANCHOR_TAG_RE = re.compile(r'<[aA](?:\s[^>]*)?>|</[aA]>')
# The original wrapped this alternation in "[...]", turning it into a
# character class that also stripped "http://" from unrelated URLs.
IMAGE_PREFIX_RE = re.compile(
    r'http://image\.8264\.com/portal/(?:photo/)?[0-9]*/[0-9]*/')


def fetch(url):
    """Return the raw body of *url*, requested with the crawler User-Agent."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', USER_AGENT)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def download_images(urls):
    """Download every URL in *urls* into the current directory with wget.

    The URL comes from scraped HTML, so it is passed as a separate argument
    (no shell) instead of being concatenated into a command line. Failures
    stay best-effort, as they were with ``os.system``.
    """
    for url in urls:
        try:
            subprocess.call(['wget', '-q', '--', url])
        except OSError:
            sys.stderr.write('wget could not be started for %s\n' % url)


def extract_body(html):
    """Return ``(text, image_urls)`` for one article page.

    *text* is the news body with anchor tags removed and the image-host
    prefix stripped, so ``<IMG src>`` points at the locally downloaded file
    name. *image_urls* are the original absolute image URLs.

    Raises IndexError when the page has no ``newstext`` block.
    """
    body = ANCHOR_TAG_RE.sub('', BODY_RE.findall(html)[0])
    return IMAGE_PREFIX_RE.sub('', body), IMG_SRC_RE.findall(body)


def to_utf8(gbk_bytes):
    """Re-encode GBK page text as UTF-8, dropping undecodable bytes."""
    return gbk_bytes.decode('gbk', 'ignore').encode('utf-8')


def scrape_article(url):
    """Fetch one article (all of its pages) and download its images.

    Returns ``(title, content, author, ptime, posted_fields)`` where the
    first three are UTF-8 byte strings, *ptime* is the posting time as a
    Unix timestamp and *posted_fields* is ``(year, month, day, hour, minute)``.

    Raises IndexError when a required element is missing from the page.
    """
    news = fetch(url)
    time.sleep(int(random.uniform(1, 5)))

    title = TITLE_RE.findall(news)[0]

    # "2011-05-10 08:19" -> 1305010740.0. Parsed with strptime; the original
    # rewrote the text into a tuple literal and ran eval() on remote content.
    posted = datetime.datetime.strptime(
        POSTED_RE.findall(news)[0], '%Y-%m-%d %H:%M')
    ptime = time.mktime(posted.timetuple())
    posted_fields = tuple(posted.timetuple()[:5])

    author = re.sub('<.*?>', '', AUTHOR_RE.findall(news)[0])

    # Articles without a pager have no continuation pages.
    pager = PAGER_RE.findall(news)
    if pager:
        extra_pages = PAGE_LINK_RE.findall(pager[0])
    else:
        extra_pages = []

    content, images = extract_body(news)
    download_images(images)
    for page_url in extra_pages:
        more_content, more_images = extract_body(fetch(page_url))
        download_images(more_images)
        content += more_content

    return (to_utf8(title), to_utf8(content), to_utf8(author),
            ptime, posted_fields)


def main():
    """Crawl listing pages 90..1 and store every article in MySQL."""
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Images are downloaded into the current directory, so move there first.
    os.chdir('img')

    # Keep the connection object: it is needed to commit and to close.
    connection = mysql.connect(user=User, passwd=Passwd, host=Host, db=Db,
                               charset='utf8')
    contents = connection.cursor()
    try:
        # Listing pages are walked in batches of ten: [1, 11, ..., 91].
        segment_starts = list(range(1, 100, 10))
        print("进行列表分段 %s 完成." % ([str(s) for s in segment_starts],))

        segments = list(zip(segment_starts, segment_starts[1:]))
        for low, high in reversed(segments):
            for page_no in reversed(range(low, high)):
                listing = fetch(LIST_URL % page_no)
                links = LISTING_LINK_RE.findall(
                    LISTING_BLOCK_RE.findall(listing)[0])
                # The listing is newest-first; insert oldest-first.
                for article_url in reversed(links):
                    title, content, author, ptime, posted_fields = \
                        scrape_article(article_url)
                    contents.execute(INSERT_SQL,
                                     (title, content, author, ptime))
                    # MySQLdb does not autocommit; without this the rows are
                    # lost on transactional tables.
                    connection.commit()
                    print("%s 在 %s 发表的 %s 发布成功!"
                          % (author, posted_fields, title))
                    time.sleep(int(random.uniform(30, 90)))
                print("第 %s 页采集完成.休息一下,进入下一页采集." % page_no)
                # NOTE(review): the published script lost its indentation, so
                # it is unclear whether this long pause ran per listing page
                # or per batch of ten; per page is assumed -- confirm.
                time.sleep(int(random.uniform(1200, 3200)))
    finally:
        contents.close()
        connection.close()


if __name__ == '__main__':
    main()
# Snippet source: http://byrx.net
相关内容
- python中常用检测字符串的相关函数使用范例,python范例
- 用python2.7在xp系统中实现,客户端在网页中点选查询出
- 解S先生与P先生谜题,s先生谜题,Python语言: 解S
- python获取当天日期,python获取当天,import datet
- 猜数字游戏升级,增加支持闯关记录文件,猜数字闯关
- RC4文件加密,,基于RC4流加密算法,使
- 从豆瓣自动获取mp3封面。 MP3与id3 名称互换,封面id3,由
- pop3协议收取邮件,pop3协议收取,# pop3.py i
- python在sqlite动态创建表代码,pythonsqlite,import sqlit
- Python版的文曲星猜数字,python文曲星猜数字,# -*- codin
评论关闭