开线程爬取黑猫(新浪消费投诉平台)里的阿里投诉信息,
仅供学习,请适度开线程
一.代码
import requests
from requests_html import HTMLSession
import time
from concurrent.futures import ThreadPoolExecutor
import json
pool = ThreadPoolExecutor(30)
big_list = []
pool_name_list =[]
session = HTMLSession()
def dewu_company(x):
try:
print(f'第{x+1}页')
params = {
'couid': '1878960481',
'type': '1',
'page_size': f'{(x + 1) * 10}',
'page': f'{x + 1}',
# 'callback':'jQuery11',
}
url = 'https://tousu.sina.com.cn/api/company/received_complaints'
res = requests.get(url, params=params, verify=False)
info_list = res.json()['result']['data']['complaints']
for dict_info in info_list:
dict_info['main']['url'] = 'https:' + dict_info['main']['url']
dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
info_url = dict_info['main']['url']
print(info_url)
res = session.get(info_url, verify=False)
new_dict = dict()
new_dict['投诉编号'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]
new_dict['投诉对象'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]
new_dict['投诉问题'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]
new_dict['投诉要求'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]
new_dict['涉诉金额'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]
new_dict['投诉进度'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]
# new_dict['a'] = res_dome.xpath('//*[@class="u-name"]/text()')
# new_dict['b'] = res_dome.xpath('//*[@class="u-status"]/text()')
new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
not_have_http_img_list = res.html.xpath('//*[@class="example-image-link"]/@href')
have_http_img_list = []
for a in not_have_http_img_list:
have_http_img_list.append('https:' + a)
new_dict['投诉图片'] = have_http_img_list
vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
print(vide_id_list)
new_vide_list = []
if vide_id_list:
for vide_id in vide_id_list:
t = int(time.time())
vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
res = session.get(vide_info_url, verify=False)
try:
new_vide_list.append(res.json())
except:
pass
new_dict['投诉视频详情'] = new_vide_list
dict_info['投诉详情'] = new_dict
big_list.append(dict_info)
except:
print('错误跳过这一页')
def run(page):
    """Crawl ``page`` listing pages concurrently and save results to JSON.

    Submits one ``dewu_company`` task per page to the module thread pool,
    blocks until every task has completed, then dumps the accumulated
    ``big_list`` to '阿里投诉信息.json'.

    page: number of listing pages to crawl.
    """
    for x in range(page):
        future = pool.submit(dewu_company, x)
        pool_name_list.append(future)
    # Block until every submitted page task has finished (errors are already
    # handled inside dewu_company, so .result() will not raise for them).
    for future in pool_name_list:
        future.result()
    print('全部结束开始保存本地')
    with open('阿里投诉信息.json', "w", encoding='utf8') as fw:
        # ensure_ascii=False keeps the Chinese keys/values human-readable
        # instead of \uXXXX escapes.
        json.dump(big_list, fw, ensure_ascii=False)
    print('保存完毕')
if __name__ == '__main__':
    # Entry point: crawl a single listing page when executed as a script.
    run(1)
相关内容
- 暂无相关文章
评论关闭