python提取页面内的url列表,python提取页面url,python提取页面内的


python提取页面内的url列表

"""List the same-site links on a web page and report each link's HTTP status."""

import re
import time
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same urlopen() entry point.
    import urllib.request as urllib2

from bs4 import BeautifulSoup

# Script start time. scanpage() keeps its own per-call timer.
t = time.time()

# hrefs to leave out of a scan. Nothing in this script adds entries, so it
# stays empty unless a caller fills it in before calling scanpage().
websiteurls = {}


def _same_site_links(soup, websiteurl):
    """Return the unique hrefs in *soup* that contain *websiteurl*, in page order.

    hrefs that are keys of the module-level ``websiteurls`` are skipped.
    """
    found = []
    seen = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href")
        if websiteurl in href and href not in seen and href not in websiteurls:
            seen.add(href)
            found.append(href)
    return found


def _status_of(link):
    """Fetch *link* once and return the HTTP status code of the response."""
    with closing(urllib2.urlopen(link)) as response:
        return response.getcode()


def scanpage(url):
    """Fetch *url*, then request every same-site link on it and print the result.

    A link counts as same-site when its href contains *url* as a substring.
    For each link that can be fetched, two lines are printed: the link's
    index, the href and the HTTP status code, followed by the time in seconds
    that the request took. Links that cannot be fetched (including HTTP error
    statuses, which ``urlopen`` raises as exceptions) print "connect failed".
    Finally the total number of links and the overall elapsed time are
    printed.

    Args:
        url: Address of the page to scan; also the substring used to decide
            which links belong to the same site.

    Raises:
        Whatever ``urlopen`` raises if *url* itself cannot be fetched.
    """
    started = time.time()
    with closing(urllib2.urlopen(url)) as page:
        html = page.read()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    links = _same_site_links(soup, url)

    for n, link in enumerate(links):
        fetch_started = time.time()
        try:
            # One request per link: the status code comes from this response.
            status = _status_of(link)
        except Exception:
            # Best effort: a dead link must not abort the scan. Unlike a bare
            # "except:", this still lets Ctrl-C (KeyboardInterrupt) through.
            print("connect failed")
            continue
        elapsed = time.time() - fetch_started
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%d %s %s" % (n, link, status))
        print(elapsed)

    print("total is " + repr(len(links)) + " links")
    print(time.time() - started)


if __name__ == "__main__":
    scanpage("http://news.163.com/")

评论关闭