python - Scraping Weibo information
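This post is a small crawler for Weibo's mobile site (m.weibo.cn). The script below first pulls the status IDs of trending topics from the feed API, then scrapes each topic's detail page for the post text, the poster's profile, and the repost/comment/like counts, and finally pages through the hot comments under each topic using the max_id cursor returned by each response. Everything is appended to weibo.csv, with topic rows and comment rows sharing one header.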


# -*- coding: utf-8 -*-
import csv
import os
import re
import time

import requests
from fake_useragent import UserAgent


class WeiBoSpider():
    def __init__(self, page):
        self.path = os.getcwd() + "/weibo.csv"
        self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.csvfile)
        # CSV header: the first ten columns describe the topic, the last six the comment
        self.writer.writerow(('topic link', 'topic text', 'poster ID', 'poster nickname',
                              'poster gender', 'post date', 'post time', 'reposts',
                              'comments', 'likes', 'commenter ID', 'commenter nickname',
                              'commenter gender', 'comment date', 'comment time', 'comment text'))
        self.headers = {
            'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
            'Referer': 'https://m.weibo.cn/detail/4312409864846621',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.comments_ID = []
        self.page = page

    def get_title_id(self):
        """Collect the status IDs of trending topics, one feed page at a time."""
        for page in range(1, self.page):
            # Use a throwaway random User-Agent here; leave self.headers (with its
            # Cookie) intact for the detail and comment requests that follow
            headers = {"User-Agent": UserAgent().chrome}
            time.sleep(1)
            api_url = 'https://m.weibo.cn/api/feed/trendtop?containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page)
            rep = requests.get(url=api_url, headers=headers)
            # Pull the ID of every status on this page into comments_ID
            for status in rep.json()['data']['statuses']:
                self.comments_ID.append(status['id'])

    def spider_title(self, id):
        """Scrape the detail page of one fight-the-epidemic topic."""
        try:
            title_url = 'https://m.weibo.cn/detail/' + str(id)
            html_text = requests.get(url=title_url, headers=self.headers).text
            # Topic text (first "text" field in the JSON embedded in the page)
            title = re.findall('"text": "(.*?)",', html_text)[0]
            # Strip HTML tags from the text
            text = re.sub('<[^>]+>', '', title)
            # Poster's user ID, nickname and gender
            user_id = re.findall('"id": "(.*?)",', html_text)[0]
            user_nicname = re.findall('"screen_name": "(.*?)",', html_text)[0]
            user_gender = re.findall('"gender": "(.*?)",', html_text)[0]
            # Publication timestamp, e.g. "Sat Feb 15 11:24:32 +0800 2020"
            created_title_time = re.findall('"created_at": "(.*?)",', html_text)[0].split(" ")
            # Publication date; the scrape only targets the Jan-Mar window
            if 'Mar' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '03', created_title_time[2])
            elif 'Feb' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '02', created_title_time[2])
            elif 'Jan' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '01', created_title_time[2])
            else:
                title_created_YMD = ""  # month outside the expected window
            # Publication time of day
            add_title_time = created_title_time[3]
            # Repost, comment and like counts
            reposts_count = re.findall('"reposts_count": (.*?),', html_text)[0]
            comments_count = re.findall('"comments_count": (.*?),', html_text)[0]
            attitudes_count = re.findall('"attitudes_count": (.*?),', html_text)[0]
            comment_count = int(int(comments_count) / 20)  # each AJAX call loads 20 comments
            position1 = (title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                         add_title_time, reposts_count, comments_count, attitudes_count,
                         " ", " ", " ", " ", " ", " ")
            print(title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                  add_title_time, reposts_count, comments_count, attitudes_count)
            # Write the topic row
            self.writer.writerow(position1)
            return comment_count
        except Exception:
            return 0  # a detail page that failed to parse contributes no comment pages

    def get_page(self, id, max_id, id_type):
        """Fetch one page of hot comments; max_id/max_id_type are the pagination cursor."""
        params = {
            'max_id': max_id,
            'max_id_type': id_type
        }
        url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}'.format(id, id)
        try:
            r = requests.get(url, params=params, headers=self.headers)
            if r.status_code == 200:
                return r.json()
        except requests.ConnectionError as e:
            print('error', e.args)

    def parse_page(self, jsondata):
        """Extract the next-page cursor from a hotflow response."""
        if jsondata:
            items = jsondata.get('data')
            return {
                'max_id': items['max_id'],
                'max_id_type': items['max_id_type']
            }

    def write_csv(self, jsondata):
        for comment in jsondata['data']['data']:
            # Commenter's ID, nickname and gender (m = male, f = female)
            user_id = comment['user']['id']
            user_name = comment['user']['screen_name']
            user_gender = comment['user']['gender']
            # Comment text, with HTML tags stripped
            comment_text = re.sub('<[^>]+>', '', comment['text'])
            # Comment timestamp, split the same way as the topic timestamp
            created_times = comment['created_at'].split(' ')
            if 'Feb' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '02', created_times[2])
            elif 'Jan' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '01', created_times[2])
            else:
                print('Timestamp outside the epidemic window; the data is probably wrong!')
                continue  # skip comments we cannot date
            created_time = created_times[3]  # hh:mm:ss of the comment
            position2 = (" ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
                         user_id, user_name, user_gender, created_YMD, created_time,
                         comment_text)
            # Write the comment row
            self.writer.writerow(position2)

    def main(self):
        self.get_title_id()
        count_title = len(self.comments_ID)
        for count, comment_ID in enumerate(self.comments_ID):
            print("Scraping topic %s of the %s topics found" % (count + 1, count_title))
            # maxPage is the number of 20-comment pages this topic needs
            maxPage = self.spider_title(comment_ID)
            m_id = 0
            id_type = 0
            if maxPage != 0:  # topics with fewer than 20 comments need no pagination loop
                try:
                    # Drive the loop with the comment-page count
                    for page in range(0, maxPage):
                        # Fetch one page of comments
                        jsondata = self.get_page(comment_ID, m_id, id_type)
                        # Append its rows to the CSV
                        self.write_csv(jsondata)
                        # Advance the max_id cursor for the next request
                        results = self.parse_page(jsondata)
                        time.sleep(1)
                        m_id = results['max_id']
                        id_type = results['max_id_type']
                except Exception:
                    pass
            print("--------------------------separator---------------------------")
        self.csvfile.close()


if __name__ == '__main__':
    startTime = time.time()
    spider = WeiBoSpider(15)
    spider.main()
    endTime = time.time()
    useTime = (endTime - startTime) / 60
    print("This run took %s minutes in total" % useTime)
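Once a run finishes, the output can be sanity-checked straight from the CSV. A minimal sketch, assuming pandas is installed; the column names are the ones written by the header row in __init__, and blank cells hold a single space, so stripping them separates topic rows from comment rows:

import pandas as pd

# Load the scrape output; utf-8-sig matches the encoding used by the csv writer
df = pd.read_csv("weibo.csv", encoding="utf-8-sig")

# Topic rows fill the first ten columns and leave the rest blank;
# comment rows do the opposite, so the 'topic link' column tells them apart
is_topic = df["topic link"].str.strip() != ""
print("%d topic rows, %d comment rows" % (is_topic.sum(), (~is_topic).sum()))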
