Python Web Crawler


The following script recursively crawls pages starting from a seed URL, saves any page that matches the given keywords into a local craw directory, and follows .htm/.html links up to the given depth.

import re
import os
import urllib.request as lib

def craw_links(url, depth, keyword, processed):
    '''url: the url to crawl
    depth: the remaining depth to crawl
    keyword: the tuple of keywords to look for
    processed: the list of urls already crawled
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return
        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)
        # Python 3 returns bytes, so decode before matching text
        contents = fp.read()
        contents_decoded = contents.decode('UTF-8')
        fp.close()
        pattern = '|'.join(keyword)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True
        if flag or searched:
            name = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', name), 'wb') as fp:
                fp.write(contents)
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl every link found in the current page
        for link in links:
            # turn a relative path into an absolute url
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keyword, processed)

if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # create the output directory if it does not exist yet
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'http://docs.python.org/3/library/index.html', 1, keywords, processed)
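The original listing also imported multiprocessing, and its docstring mentioned a process pool, although this serial version never uses one. Below is a minimal sketch of how the crawler could be fanned out over several seed URLs with a pool, assuming the craw_links() function defined above; the second seed URL is only a placeholder. Note that each worker keeps its own processed list, so pages reachable from more than one seed may be fetched twice; a shared multiprocessing.Manager list would be needed for cross-process deduplication.

import multiprocessing

def craw_one(url):
    # each worker starts with a fresh processed list, so deduplication
    # only happens within a single worker's crawl
    craw_links(url, 1, ('KeyWord1', 'KeyWord2'), [])

if __name__ == '__main__':
    seeds = [
        'http://docs.python.org/3/library/index.html',
        'http://docs.python.org/3/reference/index.html',  # placeholder seed
    ]
    # crawl each seed in its own worker process
    with multiprocessing.Pool(processes=2) as pool:
        pool.map(craw_one, seeds)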

