网页图片jpg|jpeg抓取器,图片jpgjpeg抓取,依赖beautifulsoup4
网页图片jpg|jpeg抓取器,图片jpgjpeg抓取,依赖beautifulsoup4
依赖beautifulsoup4库
linux下测试通过
#!/usr/bin/env python2
# encoding: utf-8
"""Download the jpg/jpeg images referenced by a web page.

Usage: python fetchjpg.py http://example.com/ [outpath]

Requires the beautifulsoup4 package. The script runs on both Python 2 and
Python 3.
"""
import os
import random
import re
import socket
import sys
from contextlib import closing

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
    from urllib2 import urlopen
    from urlparse import urljoin, urlparse

try:
    from bs4 import BeautifulSoup as bs
except ImportError:
    # Keep the pure helpers importable without bs4; get_image_url() reports
    # the missing dependency when it is actually needed.
    bs = None

# Folder used when no output path is given on the command line.
DEFAULT_OUT_FOLDER = "/media/E/pics"
# Socket timeout, in seconds, applied to the page fetch and every download.
DOWNLOAD_TIMEOUT = 60


def random_suffix(jpgname):
    """Return *jpgname* with four random digits inserted before its extension.

    Used to derive an alternative file name when the preferred one already
    exists, e.g. ``photo.jpg`` -> ``photo_0381.jpg``. Names without an
    extension simply get the digits appended.
    """
    root, ext = os.path.splitext(jpgname)
    return '%s_%04d%s' % (root, random.randrange(10000), ext)


def get_domain_name(url):
    """Return the ``scheme://host`` prefix of *url*.

    The scheme of *url* is preserved; ``http`` is assumed when the URL has
    no scheme (bare host or protocol-relative ``//host/...``).
    """
    if '://' not in url and not url.startswith('//'):
        # urlparse only recognises the host when it follows '//'.
        url = 'http://' + url
    parts = urlparse(url)
    return (parts.scheme or 'http') + '://' + parts.netloc


def get_image_url(image_webpage_url):
    """Collect the jpg/jpeg image URLs found on a web page.

    Args:
        image_webpage_url: address of the page to scan.

    Returns:
        A list whose first element is the page title ("untitled" when the
        page has none) and whose remaining elements are absolute URLs of
        every ``<img>`` whose ``src`` contains "jpg" or "jpeg"
        (case-insensitive).

    Raises:
        ImportError: if beautifulsoup4 is not installed.
    """
    if bs is None:
        raise ImportError("beautifulsoup4 is required to parse web pages")

    with closing(urlopen(image_webpage_url)) as response:
        soup = bs(response.read(), "html.parser")

    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    image_url = [title or "untitled"]

    for image in soup.find_all('img'):
        src = image.get('src')
        if not src:
            continue
        lowered = src.lower()
        if "jpg" in lowered or "jpeg" in lowered:
            # urljoin resolves root-relative, page-relative and
            # protocol-relative sources against the page address.
            image_url.append(urljoin(image_webpage_url, src))
    return image_url


def _safe_filename(name):
    """Replace characters that are unsafe in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|\s]+', '_', name.strip())


def get_image(url, local_folder):
    """Download every jpg/jpeg image of a page into a local folder.

    Files are named ``<page title>_<image file name>``; when that name is
    already taken a random suffix is added instead of overwriting. A failed
    download is reported on stderr and the remaining images are still tried.

    Args:
        url: address of the web page to scan.
        local_folder: existing directory that receives the images.
    """
    if not os.path.isdir(local_folder):
        sys.stderr.write("output folder does not exist: %s\n" % local_folder)
        return

    socket.setdefaulttimeout(DOWNLOAD_TIMEOUT)
    found = get_image_url(url)
    # The first entry is the page title, the rest are image URLs.
    image_page_name = _safe_filename(found[0].replace('.', ''))

    for image in found[1:]:
        basename = _safe_filename(image.split("/")[-1]) or 'image.jpg'
        filename = image_page_name + '_' + basename
        outpath = os.path.join(local_folder, filename)
        while os.path.exists(outpath):
            outpath = os.path.join(local_folder, random_suffix(filename))
        try:
            urlretrieve(image, outpath)
        except (IOError, OSError) as err:
            sys.stderr.write("failed to fetch %s: %s\n" % (image, err))
            continue
        print(image + " ==> " + outpath)


def _usage():
    """Print the command-line synopsis."""
    print("usage: python fetchjpg.py http://example.com [outpath]")


def main(argv=None):
    """Parse the command line (page URL, optional output folder) and run."""
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or len(args) > 2:
        _usage()
        sys.exit(-1)

    url = args[0]
    # Fall back to the default local folder when none is given.
    out_folder = args[1] if len(args) == 2 else DEFAULT_OUT_FOLDER

    if not url.lower().startswith("http"):
        _usage()
        sys.exit(-1)
    get_image(url, out_folder)


if __name__ == "__main__":
    main()
# This snippet comes from http://byrx.net
相关内容
- Google Python Class练习解答1-string1.py,python1-string1.py,#!/u
- Python抓取百度查询结果,python抓取查询结果,#win python
- python通过calendar输出指定年份的全年日历,pythoncalenda
- 邮件发送,,# -*- coding
- python通过apply使用元祖和列表调用函数,pythonapply,def
- 蒙特卡洛方法计算圆周率,导致内存泄露,蒙特卡洛圆
- python读取文件内容并获得读取位置,python读取,#!/usr/b
- syslog client 批量发送测试文本,syslogclient,将代码存为
- 比较两文件的相似度(比较中文),文件中文,#!/usr/b
- python3 终端下英汉词典 BeautifulSoup+网络爬虫,,#!/usr/bi
评论关闭