网页图片jpg|jpeg抓取器,图片jpg/jpeg抓取,依赖beautifulsoup4


依赖beautifulsoup4库

linux下测试通过

#!/usr/bin/env python2# encoding: utf-8"""# @brief: 抓某个网页上的jpg和jpeg图片Usage:    python fetchjpg.py http://example.com/"""from bs4 import BeautifulSoup as bsfrom urllib2 import urlopenfrom urllib import urlretrieveimport osimport sysdef random_suffix(jpgname):    import random    t = jpgname.split('.')    t[1] = t[1] + str(random.random())[2-6]    return '.'.join(t)def get_domain_name(url):    return 'http://' + (''.join(url.split('//')[1:])).split('/')[0]def get_image_url(image_webpage_url):    """DOC: # @brief: get_image_url : 获取图片的url """    image_url = []    soup = bs(urlopen(url).read())    url_name = soup.html.title.string    image_url.append(url_name)    for image in soup.findAll('img'):        if image.has_attr('src'):            if "jpg" in image["src"] or "jpeg" in image["src"]:                if image['src'][0] == '/':                    image_url.append(get_domain_name(image_webpage_url) +                                     image["src"])                else:                    image_url.append(image["src"])    return image_urldef get_image(url, local_folder):    """DOC:    # @param: url  网页网址    # @param: local_folder 本地保存目录    """    if os.path.isdir(local_folder):            i = 0            for image in get_image_url(url):                if i == 0:                    image_page_name = image.replace('.', '')                    i = 1                    continue                filename = image_page_name + '_' + image.split("/")[-1]                outpath = os.path.join(local_folder, filename)                """                if os.path.exists(outpath):                    print filename + u'已存在, 跳过'                    continue                    """                if os.path.exists(outpath):                    filename = random_suffix(filename)                    outpath = os.path.join(local_folder, filename)                import socket                socket.setdefaulttimeout(60)                urlretrieve(image, outpath)                print image + " 
==> " + outpathdef _usage():    print "usage: python fetchjpg.py http://example.com [outpath]"if __name__ == "__main__":    #print webpage_charset('http://www.example.com')    l = len(sys.argv[1:])    if l == 0 or l > 2:        _usage()        sys.exit(-1)    if l == 1:        # 默认本地保存路径        out_folder = "/media/E/pics"        url = sys.argv[-1]    else:        url = sys.argv[1]        out_folder = sys.argv[2]    if not url.lower().startswith("http"):        _usage()        sys.exit(-1)    get_image(url, out_folder)#该片段来自于http://byrx.net

评论关闭