爬虫:爬取网页上的链接,然后分析这些链接,再爬取并继续分析,如此循环。
#!/usr/bin/env python#coding=utf-8import urllib2,refrom uliweb.orm import *def geturl(url):# h4 = u'http://www.baidu.com/' h4 = url url = re.search(r"://.[^/]+/",h4) if url == None: url = re.search(r"://.+",h4) print url.group() yuming = re.sub("""[:/ ]""",'',url.group())#print yuming if yuming == None: return None yuming = "http://" + yuming + "/" h4=h4.encode("utf-8") f = urllib2.urlopen(h4,timeout=5000) buf = f.read()#print buf urls = re.findall(r"<[aA].*?href.*?>",buf) list_jue = [] list_xiang = [] for n in urls:# print n url = re.search(r"=.*?[ >]",n) #print url.group() url_box = re.sub("""[= '">]""",'',url.group()) #print url_box if url_box == '#': continue if '/' not in url_box: continue if ':' not in url_box: #l1 = yuming + '/' + url_box continue #print l1 list_jue.append(url_box) #print list_jue #print url_box for i in urls: url = re.search(r"=.*?[ >]",i) url_box1 = re.sub("""[= '">]""",'',url.group()) if 'http' in url_box1: continue if url_box1 == '#': continue if '/' not in url_box1: continue l1 = yuming + url_box1 list_xiang.append(l1) data = list_jue+list_xiang return datadb = get_connection('mysql://root:root@localhost/spider?charset=utf8')class urls(Model): url = Field(str) status = Field(str)def search_url(url): n = urls.get(urls.c.url == url) return ndef insert_url(url): u = search_url(url) if u: return n = urls() n.url = url n.status = "0" n.save()def get_url(): n = urls.get(urls.c.status == "0") return ndef update_url(n): n = urls.get(urls.c.id == n.id) n.update(status="1") n.save()def save_newurl(url): for u in url: insert_url(u) print "add %s OK!" %(u)#db.metadata.drop_all()#db.metadata.create_all()#n = urls()#n.url = "http://v.hpcasts.com/"#n.status = "0"#n.save()while 1: new = get_url() try: url = geturl(new.url)#该片段来自于http://byrx.net
相关内容
- 清空当前目录下,除本脚本文件的所有文本文件(点文件
- RSA算法的简单实现,RSA算法简单实现,#!/usr/bin/e
- Python 将DOC文档转换为PDF,,import sys,
- Django获取当前request,django当前request,django只能在vie
- python redis订阅发布示例代码,pythonredis,可以使用pip in
- 批量删除所下载的.git文件夹,批量删除.git文件夹,因经
- 自动查询数据,生成修改ldap密码文件,并自动在服务器
- Python 文件夹复制,python文件夹复制,#! /usr/bin/
- json 解析 天气api,jsonapi,# -*- coding
- 天翼开放平台免费短信验证码Python版SDK,pythonsdk,天翼开
评论关闭