# 从订阅源下载图片的例子(订阅图片例子)


# Example script: read article lists from subscription feeds and download
# every image referenced by each article page.
__author__ = 'Saint'

import json
import os
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser


def _clear_screen():
    """Clear the terminal with the command that exists on the host OS."""
    os.system("cls" if os.name == "nt" else "clear")


class MyHtmlParser(HTMLParser):
    """HTML parser that collects the ``src`` value of every ``<img>`` tag.

    After ``feed()`` the collected values are available, in document order,
    in the ``links`` list of the instance.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance list: a class-level list would be shared by every
        # parser, so each new page would also "contain" all earlier images.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` attribute of ``<img>`` start tags."""
        if tag != "img":
            return
        for name, value in attrs:
            # A bare ``<img src>`` yields value None; there is nothing to
            # download in that case, so it is skipped.
            if name == "src" and value:
                self.links.append(value)


class Down(object):
    """Download the images of all articles listed by the subscription feeds."""

    # Root directory; one sub-directory per feed is created below it.
    img_path = "E:/saint"
    # Directory the current feed's images are written to (set by isDirExists).
    dir = ''
    # Feed (article list) URLs to collect from.
    collect_links = [
        "http://dy.163.com/v2/media/articlelist/T1374483113516-1",
        "http://dy.163.com/v2/media/articlelist/T1420776257254-1",
        "http://dy.163.com/v2/media/articlelist/T1376641060407-1",
    ]
    # Base URL of an article page; "/<tid>/<docid>" is appended to it.
    img_links = "http://dy.163.com/v2/article"

    def handleCollect(self):
        """Interactively walk every feed and download the images of its articles.

        For each feed a download directory named after the last URL segment
        is prepared. If the article list cannot be fetched the user is asked
        whether to go on with the next feed. After each article the user is
        asked whether to continue with the current feed.
        """
        for collect_link in self.collect_links:
            print("开始从[" + collect_link + "]采集图片")
            # The last URL segment doubles as the download directory name.
            self.isDirExists(collect_link.split("/")[-1])
            pages = self.getListFromSubscribe(collect_link)
            if pages is False:
                print("数据采集失败,是否继续(y/n)")
                op = input()
                if op == "y":
                    _clear_screen()
                    continue
                if op == "n":
                    print("停止采集")
                else:
                    _clear_screen()
                    print("非法输入")
                break
            # ``pages or ()`` tolerates a feed whose "data" field is null.
            for page in pages or ():
                page_uri = "{}/{}/{}".format(
                    self.img_links, page["tid"], page["docid"])
                self.getImgFromUri(page_uri)
                print("是否继续(y/n)")
                if input() == "n":
                    _clear_screen()
                    print("采集完毕")
                    # Only the current feed is abandoned; the outer loop
                    # moves on to the next feed, as it always has.
                    break
        print("OK")

    def getListFromSubscribe(self, uri):
        """Fetch the article list of one feed.

        Args:
            uri: URL of the feed; the response is expected to be GBK-encoded
                JSON with the keys ``code``, ``msg`` and ``data``.

        Returns:
            The value of the ``data`` field when ``code`` equals 1, otherwise
            ``False`` (network failure, non-2xx status, undecodable or
            invalid JSON, or a ``code`` other than 1).
        """
        try:
            with urllib.request.urlopen(uri) as res:
                # Read inside the ``with`` so the connection is always closed.
                status = res.code
                raw = res.read()
        except urllib.error.URLError:
            # urlopen raises for HTTP errors and unreachable hosts; map that
            # onto the False result the caller already knows how to handle.
            _clear_screen()
            return False
        if not 200 <= status < 300:
            _clear_screen()
            return False
        try:
            # read() returns bytes; the feed is served as GBK.
            payload = json.loads(raw.decode("gbk"))
        except ValueError:
            # Covers both UnicodeDecodeError and JSON syntax errors.
            return False
        if payload['code'] != 1:
            print(payload['msg'])
            return False
        return payload['data']

    def getImgFromUri(self, uri):
        """Download every image referenced by the article page at ``uri``.

        Args:
            uri: URL of the article page (decoded as GBK).

        Raises:
            urllib.error.URLError: if the page or one of its images cannot
                be fetched.
        """
        with urllib.request.urlopen(uri) as res:
            # Undecodable bytes are replaced rather than aborting the page;
            # only the <img> tags matter here.
            html_code = res.read().decode("gbk", errors="replace")
        hp = MyHtmlParser()
        hp.feed(html_code)
        hp.close()
        for link in hp.links:
            # Resolve relative ``src`` values against the page URL; absolute
            # links pass through urljoin unchanged.
            self.writeToDisk(urllib.parse.urljoin(uri, link))

    def isDirExists(self, dir_name):
        """Select ``<img_path>/<dir_name>`` as download directory, creating it if needed.

        Args:
            dir_name: name of the sub-directory below ``img_path``.

        Returns:
            Always ``True``.
        """
        self.dir = os.path.join(self.img_path, dir_name)
        os.makedirs(self.dir, exist_ok=True)
        return True

    def writeToDisk(self, url):
        """Download ``url`` and store it in ``self.dir``.

        The file is named after the last segment of the URL path (query
        string and fragment are not part of the name). The process working
        directory is left untouched.

        Args:
            url: absolute URL of the file to download.

        Returns:
            ``True`` when the file was written, ``False`` when the URL has no
            usable file name (for example it ends with "/").
        """
        file_name = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
        if not file_name:
            return False
        with urllib.request.urlopen(url) as res:
            data = res.read()
        with open(os.path.join(self.dir, file_name), "wb") as out:
            out.write(data)
        return True


if __name__ == "__main__":
    down = Down()
    down.handleCollect()

# 评论关闭