【Python requests多页面爬取案例】


原文: http://blog.gqylpy.com/gqy/321

```python
import requests
from fake_useragent import UserAgent # 随机ua库

class Boring:
    """Crawl company licence records from the NMPA portal, multiple pages at a time.

    Workflow (all driven from ``__init__``):
      1. collect company IDs for every page in ``page_scope``,
      2. fetch each company's detail record,
      3. print person-in-charge / company-name pairs.

    NOTE(review): both endpoints return JSON rendered by the site's JS front
    end — the IDs/fields here match the captured requests (figs. 1-4 in the
    original article); verify against the live site before reuse.
    """

    def __init__(self, page_scope=(4, 7)):
        """
        :param page_scope: inclusive (start_page, end_page) range of pages to crawl
        """
        self.page_scope = page_scope
        self.all_id = self.get_all_company_id()
        self.enterprise_info = self.get_all_company_info()
        self.show_enterprise_info()

    @property
    def firefox_ua(self):
        """Return a headers dict carrying a random Firefox User-Agent string."""
        ua = UserAgent(use_cache_server=False)
        return {'User-Agent': ua.Firefox}  # ua.Firefox: randomly generated Firefox UA

    def get_all_company_id(self):
        """Return ``{page_number: [company_id, ...]}`` for the configured page range."""
        all_id = {}
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'  # endpoint: see fig. 1
        start_page, end_page = self.page_scope  # inclusive range
        for page in range(start_page, end_page + 1):
            json_text = requests.post(url, data=self.post_data(page),
                                      headers=self.firefox_ua).json()
            # use `entry`/`company_id` names — the original shadowed builtins `dict` and `id`
            current_page_all_id = [entry['ID'] for entry in json_text['list']]
            all_id.setdefault(page, current_page_all_id)
        return all_id

    def get_all_company_info(self):
        """Fetch each company's detail record.

        :return: ``{businessPerson: epsName}`` — only the person in charge
                 and the company name are kept.
        """
        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'  # endpoint: see fig. 3
        enterprise_info = {}
        for page in self.all_id:
            for company_id in self.all_id.get(page):
                response = requests.post(url, data={'id': company_id},
                                         headers=self.firefox_ua)  # form field: see fig. 4
                # some IDs return an HTML error page; only parse real JSON replies
                if response.headers['Content-Type'] == 'application/json;charset=UTF-8':
                    json_text = response.json()
                    enterprise_info.setdefault(json_text.get('businessPerson'),
                                               json_text.get('epsName'))
        return enterprise_info

    def show_enterprise_info(self):
        """Print one 'person company-name' line per collected record."""
        # plain loop: printing is a side effect, not collection building
        for person, company in self.enterprise_info.items():
            print(person, company)

    def post_data(self, page):
        """Return the form payload for the company-list endpoint (see fig. 2).

        :param page: 1-based page number to request
        """
        return {
            'on': 'true',
            'page': page,
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }

if __name__ == '__main__':
    # Run the crawl only when executed as a script, not on import.
    # (The stray 'go' token from the original paste was residue, not code.)
    Boring()
```

原文: http://blog.gqylpy.com/gqy/321

【Python requests多页面爬取案例】

评论关闭