获取网页内容并入库的python方法,网页内容python,获取网页内容并入库的py
获取网页内容并入库的python方法,网页内容python,获取网页内容并入库的py
获取网页内容并入库的python方法,这段代码中存在一个问题,就是长时间运行会卡死。目前还没有测试出问题在哪里,希望有python方面能力强的朋友帮忙找一找。
#-*-coding:utf-8-*-#encoding=utf-8import sys;import os;import re;import getopt;import random;import urllib2;import time;import datetime;#import socket;import MySQLdb as mysql;reload(sys)sys.setdefaultencoding('utf-8')opts,argv = getopt.getopt(sys.argv[1:],'')#urllib2.socket.setdefaulttimeout(15)User = 'DB_User_Name'Passwd = 'Password'Host = 'localhost'Db = 'DB_Name'home = "http://zhidao.baidu.com/"bmail = {'1':'@163.com','2':'@126.com','3':'@qq.com','4':'@gmail.com','5':'@sina.com.cn'}im = {'1':'web','2':'wap','3':'mobile','4':'qq','5':'msn','6':'api','7':'sina','8':'qqwb','9':'vote' }contents = mysql.connect(user=User,passwd=Passwd,host=Host,db=Db,charset='utf8')sql_item = contents.cursor()def Q (): q = re.sub(' ','',re.findall('<span class=\"question-title\">(.*?)</span>',qa_txt,re.DOTALL)[0]) q_p = re.findall('<pre id=\"question-content\">(.*?)</pre>',qa_txt,re.DOTALL) q_pc = re.findall('<pre id=\"question-suply\">(.*?)</pre>',qa_txt,re.DOTALL) if q_p == [] and q_pc == []: return q elif q_p != [] and q_pc == []: return q+"\n"+re.sub(' ','',q_p[0]) elif q_p == [] and q_pc != []: return q+"\n"+re.sub(' ','',q_pc[0]) else: return q+"\n"+re.sub(' ','',q_p[0])+"\n"+re.sub(' ','',q_pc[0])def QID (): return re.sub('<\/[a|A]>|<[a|A].*?>|\n|_$','',re.findall('<span class=\"gray\">\xcc\xe1\xce\xca\xd5\xdf\xa3\xba(.*?)<\/span>',qa_txt,re.DOTALL)[0])def A (): return re.findall('<pre.*?>(.*?)<\/pre>',ac_txt[0],re.DOTALL)[0]def AID (): if re.findall('<span class=\"gray\">\xbb\xd8\xb4\xf0\xd5\xdf\xa3\xba(.*?)<span class=\"v-split\">',ac_txt[0],re.DOTALL) == []: return "\xc8\xc8\xd0\xc4\xcd\xf8\xd3\xd1" else: return re.sub('<\/[a|A]>|<[a|A].*?>|\n|<span.*?>|\xc0\xb4\xd7\xd4\xcd\xc5\xb6\xd3|</span>| ','',re.findall('<span class=\"gray\">\xbb\xd8\xb4\xf0\xd5\xdf\xa3\xba(.*?)<span class=\"v-split\">',ac_txt[0],re.DOTALL)[0])def getid (users): contgetid = mysql.connect(user=User,passwd=Passwd,host=Host,db=Db,charset='utf8') member_uid = contgetid.cursor() member_uid.execute("select 
`username`,`uid` from `sql_table_name`") contgetid.close(); return dict(member_uid.fetchall()).get(users.decode('gbk','ignore'))def weibo_id (wbid): weibo = mysql.connect(user=User,passwd=Passwd,host=Host,db=Db,charset='utf8') weibo_tid = weibo.cursor() weibo_tid.execute("select `roottid`,`tid` from `sql_table_name` where `content` like %s;","%"+wbid.decode('gbk','ignore')+"%") weibo.close(); return weibo_tid.fetchall()s = 1while s: for sid in reversed(xrange(0,int(argv[0]),int(argv[1]))): for b in re.findall('<a href=\"\/question/(.*?)\.html\" title=\"',urllib2.urlopen("http://zhidao.baidu.com/browse/151?lm=0&word=&pn="+str(sid)).read(),re.DOTALL): request = urllib2.Request("http://zhidao.baidu.com/question/"+b+"\.html") request.add_header('User-Agent','Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)') qa_txt = urllib2.urlopen(request).read() ac_txt = re.findall('<div class=\"content\">(.*?)<!--start test sns-->',qa_txt,re.DOTALL) if len(ac_txt) > 0: ask = Q() ask_id = QID() reply = A() reply_id = AID() #==构造 ASK_ID 入库参数== #print "1.",ask_id,"<====>",getid(ask_id) if getid(ask_id) == None : member = ask_id.decode('gbk','ignore').encode('utf-8'),ask_id.decode('gbk','ignore').encode('utf-8'),str(int(random.uniform(1,3))),int(time.time()),int(time.time()),int(time.time()),str(int(random.uniform(1,3000))),ask_id.decode('gbk','ignore').encode('utf-8')+bmail.get(str(int(random.uniform(1,5)))) sql_item.execute("INSERT INTO `sql_table_name` (`uid`, `medal_id`, `media_id`, `username`, `nickname`, `password`, `secques`, `gender`, `regip`, `regdate`, `lastip`, `lastvisit`, `lastactivity`, `lastpost`, `oltime`, `pageviews`, `credits`, `extcredits1`, `extcredits2`, `extcredits3`, `extcredits4`, `extcredits5`, `extcredits6`, `extcredits7`, `extcredits8`, `email`, `bday`, `styleid`, `invisible`, `timeoffset`, `newpm`, `face_url`, `face`, `tag_count`, `role_id`, `role_type`, `new_msg_count`, `tag`, `own_tags`, `login_count`, `truename`, `phone`, 
`view_times`, `use_tag_count`, `create_tag_count`, `image_count`, `noticenum`, `ucuid`, `invite_count`, `invitecode`, `province`, `city`, `topic_count`, `at_count`, `follow_count`, `fans_count`, `email2`, `qq`, `msn`, `aboutme`, `at_new`, `comment_new`, `fans_new`, `topic_favorite_count`, `tag_favorite_count`, `disallow_beiguanzhu`, `validate`, `favoritemy_new`, `notice_at`, `notice_pm`, `notice_reply`, `user_notice_time`, `last_notice_time`, `theme_id`, `theme_bg_image`, `theme_bg_color`, `theme_text_color`, `theme_link_color`, `theme_bg_image_type`, `theme_bg_repeat`, `theme_bg_fixed`, `last_topic_content_id`) VALUES (null, '', 0, %s, %s, '4297f44b13955235245b2497399d7a93', '', %s, '', 0, '', %s, %s, %s, 0, 300, %s, 0, 30, 0, 0, 0, 0, 0, 0, %s, '0000-00-00', 0, 0, '', 0, '', '', 0, 3, 'normal', 0, '', 0, 1, '', '', 0, 1, 0, 0, 0, 0, 1, '35d69eddc4d041e8', '...', '..', 0, 0, 1, 1, '', '', '', '', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 't6', '', '#C4CD58', '#333333', '#007FA9', 'center', 0, 0, 0);",member) contents.commit() print "Ask_Insert:",ask_id,"\t\t\tOK !" 
else: pass #print "2.",reply_id,"<====>",getid(reply_id) if getid(reply_id) == None : reply_member = reply_id.decode('gbk','ignore').encode('utf-8'),reply_id.decode('gbk','ignore').encode('utf-8'),str(int(random.uniform(1,3))),int(time.time()),int(time.time()),int(time.time()),str(int(random.uniform(1,3000))),reply_id.decode('gbk','ignore').encode('utf-8')+bmail.get(str(int(random.uniform(1,5)))) sql_item.execute("INSERT INTO `sql_table_name` (`uid`, `medal_id`, `media_id`, `username`, `nickname`, `password`, `secques`, `gender`, `regip`, `regdate`, `lastip`, `lastvisit`, `lastactivity`, `lastpost`, `oltime`, `pageviews`, `credits`, `extcredits1`, `extcredits2`, `extcredits3`, `extcredits4`, `extcredits5`, `extcredits6`, `extcredits7`, `extcredits8`, `email`, `bday`, `styleid`, `invisible`, `timeoffset`, `newpm`, `face_url`, `face`, `tag_count`, `role_id`, `role_type`, `new_msg_count`, `tag`, `own_tags`, `login_count`, `truename`, `phone`, `view_times`, `use_tag_count`, `create_tag_count`, `image_count`, `noticenum`, `ucuid`, `invite_count`, `invitecode`, `province`, `city`, `topic_count`, `at_count`, `follow_count`, `fans_count`, `email2`, `qq`, `msn`, `aboutme`, `at_new`, `comment_new`, `fans_new`, `topic_favorite_count`, `tag_favorite_count`, `disallow_beiguanzhu`, `validate`, `favoritemy_new`, `notice_at`, `notice_pm`, `notice_reply`, `user_notice_time`, `last_notice_time`, `theme_id`, `theme_bg_image`, `theme_bg_color`, `theme_text_color`, `theme_link_color`, `theme_bg_image_type`, `theme_bg_repeat`, `theme_bg_fixed`, `last_topic_content_id`) VALUES (null, '', 0, %s, %s, '4297f44b13955235245b2497399d7a93', '', %s, '', 0, '', %s, %s, %s, 0, 300, %s, 0, 30, 0, 0, 0, 0, 0, 0, %s, '0000-00-00', 0, 0, '', 0, '', '', 0, 3, 'normal', 0, '', 0, 1, '', '', 0, 1, 0, 0, 0, 0, 1, '35d69eddc4d041e8', '...', '..', 0, 0, 1, 1, '', '', '', '', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 't6', '', '#C4CD58', '#333333', '#007FA9', 'center', 0, 0, 0);",reply_member) contents.commit() 
print "reply_Insert:",reply_id,"\t\t\tOK !" else: pass #contents.close() #print "3.",ask_id,"<====>",getid(ask_id) if weibo_id(ask) == () : inst_ask = getid(ask_id),ask_id.decode('gbk','ignore').encode('utf-8'),ask.decode('gbk','ignore').encode('utf-8'),int(time.time()),int(time.time()),im.get(str(int(random.uniform(1,9)))) sql_item.execute("INSERT INTO `sql_table_name` (`tid`, `uid`, `username`, `content`, `content2`, `imageid`, `videoid`, `musicid`, `roottid`, `replys`, `forwards`, `totid`, `touid`, `tousername`, `dateline`, `lastupdate`, `from`, `type`, `item_id`, `item`) VALUES (null, %s, %s, %s, '', 0, 0, 0, 0, 1, 0, 0, 0, '', %s, %s, %s, 'first', 0, '');",inst_ask) contents.commit() print "Ask_txt_Insert:",ask,"\t\t\t\tOK !" else: pass #print "4.",reply_id,"<====>",getid(reply_id) if weibo_id(reply) == () : #print weibo_id(ask) if len(weibo_id(ask)) > 1 : re_id = weibo_id(ask)[len(weibo_id(ask))-1] elif len(weibo_id(ask)) == 1 : re_id = weibo_id(ask)[0] elif len(weibo_id(ask)) == 0 : re_id = (0,0) #print getid(reply_id) inst_reply = getid(reply_id),reply_id.decode('gbk','ignore').encode('utf-8'),reply.decode('gbk','ignore').encode('utf-8'),str(re_id[1]),str(re_id[1]),getid(ask_id),ask_id.decode('gbk','ignore').encode('utf-8'),int(time.time()),int(time.time()),im.get(str(int(random.uniform(1,9)))) sql_item.execute("INSERT INTO `sql_table_name` (`tid`, `uid`, `username`, `content`, `content2`, `imageid`, `videoid`, `musicid`, `roottid`, `replys`, `forwards`, `totid`, `touid`, `tousername`, `dateline`, `lastupdate`, `from`, `type`, `item_id`, `item`) VALUES (null, %s, %s, %s, '', 0, 0, 0, %s, 0, 0, %s, %s, %s, %s, %s, %s, 'reply', 0, '');",inst_reply) contents.commit2966() print "Reply_txt_Insert:",reply,"\t\t\t\tOK !" 
else: pass #同步评论 #print ask,weibo_id(reply) if len(weibo_id(reply)) > 1 : reply_re = weibo_id(reply)[len(weibo_id(reply))-1] elif len(weibo_id(reply)) == 1 : reply_re = weibo_id(reply)[0] elif len(weibo_id(reply)) == 0 : pass #www.iplaypy.com #print len(reply_re[1]) topic_more = str(reply_re[0]),"a:1:{i:0;s:"+str(len(str(reply_re[1])))+":\""+str(reply_re[1])+"\";}" #print topic_more topic_more_two = str(reply_re[1]),str(reply_re[0]) #print topic_more_two sql_item.execute("INSERT INTO `sql_table_name` (`tid`, `parents`, `replyids`, `replyidscount`) VALUES (%s, '', %s, '1');",topic_more) sql_item.execute("INSERT INTO `sql_table_name` (`tid`, `parents`, `replyids`, `replyidscount`) VALUES (%s, %s, '', '0');",topic_more_two) #print weibo_id(reply) sql_item.execute("INSERT INTO `sql_table_name` (`tid` ,`replyid`) VALUES (%s, %s);",reply_re) print "Sync Reply OK!" print "============================================" time.sleep(int(random.uniform(30,320))) else: pass #time.sleep(int(random.uniform(100,200))) print "第",sid,"入库完成" print "所有采集完成,重新开始采集!"contents.close()
编橙之家文章,
相关内容
- 从日志文件中提取ip并找到归属地完成输出,日志文件
- 益智盒子问题python的解决方法,益智盒子python,益智盒子
- 调试js解决跨域问题python小工具,jspython,这是一个我以
- Python关于端口复用及线程操作方法,python复用,以下代码
- 用python实现函数调用拦截,python函数拦截,写的这个关于
- ipv4与ipv6地址如何转换的python解决办法,ipv6python,ipv4与
- python将ios及android文件写成excel的小工具,iosandroid,用p
- Python内置方法实现访问权限控制,python权限控制,Pytho
- Python获取MP3文件id3信息的方法源码,pythonid3,Python获取
- Python socket方法获取接口IP地址,pythonsocket,Python socke
评论关闭