python之路——爬虫实例：urlControl


urlController.py

# urlController.py -- crawler entry point.
import bsController
from urllib import request


class SpiderMain(object):
    """Fetches item pages http://www.meitulu.com/item/<i>.html for
    i = 1..499 and hands each page's HTML to bsController.bsManage
    for image extraction."""

    def __init__(self):
        # Browser-like headers so the server does not reject the request.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}
        self.bsManage = bsController.bsManage()

    def getUrl(self, rootUrl):
        """Fetch rootUrl + '1.html' ... '499.html' and pass the raw HTML
        of each page (plus its index i) to bsManage.getPageUrl().

        rootUrl -- base item URL, e.g. 'http://www.meitulu.com/item/'
        """
        for i in range(1, 500):
            url = '%s%s.html' % (rootUrl, i)
            req = request.Request(url, headers=self.header)
            try:
                # BUG FIX: the original called req.close() on the Request
                # object, which has no close(); close the *response* instead
                # (the context manager guarantees it even on error).
                with request.urlopen(req) as resp:
                    html = resp.read()
                self.bsManage.getPageUrl(html, i)
            except request.URLError as e:
                # HTTPError carries .code; a plain URLError carries .reason.
                if hasattr(e, 'code'):
                    print('Error code:', e.code)
                elif hasattr(e, 'reason'):
                    print('Reason:', e.reason)


if __name__ == '__main__':
    rootUrl = 'http://www.meitulu.com/item/'
    obj_root = SpiderMain()
    obj_root.getUrl(rootUrl)

bsController.py

# bsController.py -- parses item pages and downloads the images they list.
from bs4 import BeautifulSoup
from urllib import request
import os


class bsManage:
    """Extracts <img class="content_img"> links from meitulu item pages
    and saves each image under img/<i>/."""

    def __init__(self):
        self.pageUrl = 'http://www.meitulu.com/item/'
        # Browser-like headers so the server does not reject the request.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

    def getPageUrl(self, html, i):
        """Scrape the first page of item i, then walk its follow-up pages.

        html -- raw HTML bytes of http://www.meitulu.com/item/<i>.html
        i    -- item number; also names the img/<i>/ output folder
        """
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        # The second-to-last anchor in the pager links to the final page:
        # http://www.meitulu.com/item/<i>_<lastPage>.html
        lastUrl = soup.find_all('div', {'id': 'pages'})[0].find_all('a')[-2]['href']
        # BUG FIX: the original computed a slice offset from the digit count
        # of i (shadowing the builtin `len`, and breaking for i >= 10000);
        # parsing the trailing "_<n>.html" works for any i.
        lastPage = int(lastUrl.rsplit('_', 1)[1][:-5])
        # Ensure img/<i>/ exists (makedirs avoids the exists-then-mkdir race).
        path = 'img/%s' % i
        os.makedirs(path, exist_ok=True)
        # The first page has a different URL format, so scrape it from the
        # HTML we were handed, then fetch pages 2..lastPage.
        self._save_images(soup, i)
        for j in range(2, lastPage + 1):
            url = '%s%s_%s.html' % (self.pageUrl, i, j)
            self.saveImgWithUrl(url, i)
        print('%d 已经爬完' % i)

    def saveImgWithUrl(self, url, i):
        """Fetch one follow-up page of item i and download its images.

        url -- full page URL, e.g. http://www.meitulu.com/item/<i>_<j>.html
        i   -- item number (selects the img/<i>/ output folder)
        """
        req = request.Request(url, headers=self.header)
        try:
            with request.urlopen(req) as resp:
                html = resp.read()
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            self._save_images(soup, i)
        except request.URLError as e:
            # HTTPError carries .code; a plain URLError carries .reason.
            if hasattr(e, 'code'):
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print('Reason:', e.reason)

    def _save_images(self, soup, i):
        """Download every <img class="content_img"> in soup into img/<i>/.

        Extracted helper: this loop was duplicated verbatim in both
        getPageUrl and saveImgWithUrl.
        """
        for link in soup.find_all('img', class_='content_img'):
            src = str(link['src'])
            # The last 21 characters of the src serve as the local filename.
            name = src[-21:]
            data = request.urlopen(src).read()
            # BUG FIX: context manager guarantees the file handle is closed
            # even if write() raises (the original leaked on error paths).
            with open('img/%s/' % i + name, 'wb') as img:
                img.write(data)

python之路——爬虫实例

相关内容

    暂无相关文章

评论关闭