Web crawler: fetch a URL, pull the links off the page, then follow and analyze those links in turn


Crawl the links on a web page, analyze them, then crawl and analyze the newly found links, and so on. A minimal in-memory sketch of that idea is shown right below; the full snippet afterwards keeps the queue of pending URLs in MySQL instead.
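As a rough illustration only (Python 2 with urllib2, like the snippet below; the regex and the example seed URL are illustrative and not part of the original code):

import re
import urllib2
from collections import deque

def crawl(seed):
    seen = set([seed])          # every URL we have already queued
    queue = deque([seed])       # URLs still waiting to be fetched
    while queue:
        page = queue.popleft()  # breadth-first: oldest page first
        try:
            html = urllib2.urlopen(page, timeout=5).read()
        except Exception:
            continue            # skip pages that fail to load
        for link in re.findall(r'href=["\'](http[^"\']+)["\']', html):
            if link not in seen:        # only follow brand-new links
                seen.add(link)
                queue.append(link)
                print "found %s" % link

# crawl("http://example.com/")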

#!/usr/bin/env python
# coding=utf-8
import re
import urllib2

from uliweb.orm import *


def geturl(url):
    """Fetch a page and return the links found on it, absolute and relative."""
    # h4 = u'http://www.baidu.com/'
    h4 = url
    # Extract the "://host/" part so relative links can be made absolute.
    match = re.search(r"://.[^/]+/", h4)
    if match is None:
        match = re.search(r"://.+", h4)
    if match is None:
        return None
    print match.group()
    yuming = re.sub(r"[:/ ]", '', match.group())   # yuming = the bare domain name
    yuming = "http://" + yuming + "/"

    h4 = h4.encode("utf-8")
    f = urllib2.urlopen(h4, timeout=5000)          # timeout is in seconds
    buf = f.read()

    # Grab every <a ... href ...> tag from the raw HTML.
    urls = re.findall(r"<[aA].*?href.*?>", buf)
    list_jue = []     # absolute links (already contain a scheme)
    list_xiang = []   # relative links, rebuilt against the domain

    for n in urls:
        m = re.search(r"=.*?[ >]", n)
        if m is None:
            continue
        url_box = re.sub("""[= '">]""", '', m.group())
        if url_box == '#':
            continue
        if '/' not in url_box:
            continue
        if ':' not in url_box:      # no scheme -> relative link, handled below
            continue
        list_jue.append(url_box)

    for i in urls:
        m = re.search(r"=.*?[ >]", i)
        if m is None:
            continue
        url_box1 = re.sub("""[= '">]""", '', m.group())
        if 'http' in url_box1:
            continue
        if url_box1 == '#':
            continue
        if '/' not in url_box1:
            continue
        list_xiang.append(yuming + url_box1)

    return list_jue + list_xiang


db = get_connection('mysql://root:root@localhost/spider?charset=utf8')


class urls(Model):
    url = Field(str)
    status = Field(str)      # "0" = not crawled yet, "1" = already crawled


def search_url(url):
    return urls.get(urls.c.url == url)


def insert_url(url):
    # Only insert URLs we have not seen before.
    if search_url(url):
        return
    n = urls()
    n.url = url
    n.status = "0"
    n.save()


def get_url():
    # Pick any URL that has not been crawled yet.
    return urls.get(urls.c.status == "0")


def update_url(n):
    # Mark a URL as crawled.
    n = urls.get(urls.c.id == n.id)
    n.update(status="1")
    n.save()


def save_newurl(url):
    for u in url:
        insert_url(u)
        print "add %s OK!" % (u)


# One-time setup: create the table and seed the queue with a start URL.
# db.metadata.drop_all()
# db.metadata.create_all()
# n = urls()
# n.url = "http://v.hpcasts.com/"
# n.status = "0"
# n.save()

while 1:
    new = get_url()
    if new is None:          # nothing left to crawl
        break
    try:
        url = geturl(new.url)
        # The original snippet is cut off at this point; a minimal completion:
        if url:
            save_newurl(url)   # queue every link found on the page
    except Exception:
        pass
    update_url(new)            # mark the current URL as done either way

# Snippet originally posted at http://byrx.net
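Before the while 1 loop can do anything, the urls table has to exist and hold at least one row with status "0"; the commented-out lines near the bottom of the snippet show that one-time setup. A minimal way to bootstrap it, using only functions from the snippet (the database name spider and the seed URL are just the values the snippet itself uses):

db.metadata.create_all()              # create the urls table once
insert_url("http://v.hpcasts.com/")   # seed the queue with a start URL

From then on, each pass of the loop takes one uncrawled row, collects the links on that page with geturl(), stores any new ones via save_newurl(), and flips the row's status to "1" with update_url().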
