python xpath 爬取豆瓣电脑版电影案例


# Fetch the Douban movie chart page and print one dict per listed movie,
# extracting the fields with lxml XPath queries.
from lxml import etree
import requests

# Douban movie chart page.
url = 'https://movie.douban.com/chart'
# Send a desktop-browser User-Agent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}
response = requests.get(url, headers=headers)
html_str = response.content.decode()

# Use etree to process the data.
html = etree.HTML(html_str)

# URLs of the individual movie pages.
url_list = html.xpath("//div[@class='indent']/div/table//div[@class='pl2']/a/@href")

# URLs of the movie poster images.
img_list = html.xpath("//div[@class='indent']/div/table//a[@class='nbg']/img/@src")

# Build one dict per movie holding that movie's data:
#   1. group the page into one <table> node per movie
#   2. extract the fields from each group with relative XPath queries
rets = html.xpath("//div[@class='indent']/div/table")
for table in rets:
    item = {}
    # The first text node of the title link contains "/" separators; drop them.
    item['title'] = table.xpath(".//div[@class='pl2']/a/text()")[0].replace("/", "").strip()
    item['href'] = table.xpath(".//div[@class='pl2']/a/@href")[0]
    item['img'] = table.xpath(".//a[@class='nbg']/img/@src")[0]
    item['comment_num'] = table.xpath(".//div[@class='pl2']/div//span[@class='pl']/text()")[0]
    item['rating_num'] = table.xpath(".//div[@class='pl2']/div//span[@class='rating_nums']/text()")[0]
    print(item)

python xpath 爬取豆瓣电脑版电影案例

评论关闭