Python Web Crawler

The script below is a small recursive crawler: starting from a seed URL, it downloads each page, saves pages containing the given keywords into a craw directory, extracts the href links, and follows .htm/.html links up to a fixed depth.
import re
import os
import urllib.request as lib

def craw_links(url, depth, keyword, processed):
    '''Recursively crawl pages starting from url.

    url: the URL to crawl
    depth: the remaining depth to crawl
    keyword: a tuple of keywords to look for
    processed: list of URLs already crawled, to avoid revisiting
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this URL as processed
            processed.append(url)
        else:
            # avoid processing the same URL again
            return
        print('Crawling ' + url + '...')
        try:
            fp = lib.urlopen(url)
        except OSError:
            # skip URLs that cannot be opened (dead links, timeouts, ...)
            return
        # Python 3 returns bytes, so decode; ignore bytes that are not valid UTF-8
        contents = fp.read()
        contents_decoded = contents.decode('utf-8', errors='ignore')
        fp.close()
        # escape the keywords so regex metacharacters are matched literally
        pattern = '|'.join(map(re.escape, keyword))
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True
        if flag or searched:
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'wb') as fp:
                fp.write(contents)
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl all links in the current page
        for link in links:
            # turn relative paths into absolute URLs
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keyword, processed)

if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # create the output directory only if it does not already exist
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links('http://docs.python.org/3/library/index.html', 1, keywords, processed)
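The original listing imports multiprocessing and its docstring mentions a process pool, but neither is actually used. A minimal sketch of how the per-page work could be fanned out to a pool follows; fetch_page, the breadth-first loop, and the pool size of 4 are hypothetical choices, not part of the original script, and relative-link handling is omitted for brevity.

import re
import urllib.request as lib
from multiprocessing import Pool

def fetch_page(url):
    '''Download one page and return (url, list of links found on it).'''
    try:
        with lib.urlopen(url) as fp:
            text = fp.read().decode('utf-8', errors='ignore')
    except OSError:
        # treat unreachable pages as having no outgoing links
        return url, []
    return url, re.findall('href="(.*?)"', text)

if __name__ == '__main__':
    seen = set()
    frontier = ['http://docs.python.org/3/library/index.html']
    with Pool(4) as pool:
        for _ in range(2):  # crawl two levels deep
            frontier = [u for u in frontier if u not in seen]
            seen.update(frontier)
            next_frontier = []
            # fetch the whole level in parallel, one page per worker
            for url, links in pool.map(fetch_page, frontier):
                next_frontier.extend(
                    l for l in links
                    if l.startswith(('http://', 'https://'))
                    and l.endswith(('.htm', '.html'))
                )
            frontier = next_frontier

Because each level is fetched with one pool.map call, pages within a level download concurrently while the visited set stays in the parent process, so no locking is needed.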