抓取图片示例,抓取示例
文章由Byrx.net分享于2019-03-23 09:03:48
抓取图片示例,抓取示例
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Crawl forum index pages and download the images found in each thread.

For every index page ``<homepagePrefix><page><homepageSuffix>`` (pages 1
through 24) a directory ``<homedir>/<page>/`` is created.  Each thread linked
from a ``<th class="new|common">`` cell of that page gets a sub-directory
named after its sanitized title, and the images referenced by the thread are
saved there as ``0<ext>``, ``1<ext>``, ... in the order they appear.

A thread whose directory already exists is skipped, so the script can be
re-run to pick up new threads only.

Written for Python 2 (``urllib2``, ``cookielib``, BeautifulSoup 3).
"""
import re
import os
import urllib
import urllib2
import cookielib
import shutil
from BeautifulSoup import BeautifulSoup


# ---- utils ----
def normalize_url(url):
    """Return *url* unchanged if it starts with ``http://``, else prefixed with it."""
    return url if url.startswith("http://") else "http://" + url


def safeDir(dir):
    """Return *dir* with every ``/`` removed so it is a single path component."""
    return dir.replace('/', '')


# ---- variable ----
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
threadPrefix = "http://60dxw.comww1.baisex.me/"
homedir = "baixingge"

# Compiled once here instead of on every page / thread iteration.
_URL_PATTERN = re.compile(r'href="([^"]*)"')
_TITLE_PATTERN = re.compile(r'>([^<]*)</a>')
_IMG_PATTERN = re.compile(r'img src="([^"]*)" onload')

# ---- login ----
# NOTE(review): this cookie-aware opener is built but never used -- all
# requests below go through urllib.urlopen / urllib.urlretrieve, which do not
# share its cookie jar.  Kept for compatibility; confirm whether a login step
# was intended.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)


# ---- file ----
def _ensure_dir(path):
    """Create directory *path* unless it already exists."""
    if not os.path.exists(path):
        os.mkdir(path)


# ---- crawl ----
def _download_thread(thread_url, thread_dir):
    """Fetch the thread at ``threadPrefix + thread_url`` and save its images into *thread_dir*."""
    page_url = threadPrefix + thread_url
    print("---->{0}".format(page_url))
    print("---->{0}".format(os.path.basename(thread_dir)))

    page_body = urllib.urlopen(page_url).read()
    page_soup = BeautifulSoup(page_body)

    for index, img in enumerate(_IMG_PATTERN.findall(str(page_soup))):
        print("-------->{0}".format(img))
        # splitext only looks at the last path component, so a URL without an
        # extension yields '' instead of raising or returning text with '/'.
        imgSuffix = os.path.splitext(img)[1]
        imgName = "{0}{1}".format(index, imgSuffix)
        urllib.urlretrieve(img, os.path.join(thread_dir, imgName), None)


def _crawl_page(page):
    """Download every not-yet-seen thread linked from index page number *page*."""
    pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)

    # ---- mkdir ----
    # Explicit paths are used instead of os.chdir so that an exception in the
    # middle of a download cannot leave the process in the wrong directory.
    page_dir = os.path.join(homedir, str(page))
    _ensure_dir(page_dir)
    print(pageUrl)

    # ---- download ----
    html_body = urllib.urlopen(pageUrl).read()
    soup = BeautifulSoup(html_body)

    # ---- extract ----
    urlRaws = soup.findAll('th', attrs={'class': ['new', 'common']})
    for urlRaw in urlRaws:
        cell_html = str(urlRaw)
        h = _URL_PATTERN.search(cell_html)
        t = _TITLE_PATTERN.search(cell_html)
        if h is None or t is None:
            # Cell without a link or without link text: nothing to download.
            continue

        threadTitle = safeDir(t.group(1))
        if not threadTitle:
            # An empty name cannot be used as a directory.
            continue

        # Check the same (sanitized) name that is created, so titles that
        # contain '/' are recognised as already downloaded on a re-run.
        thread_dir = os.path.join(page_dir, threadTitle)
        if os.path.exists(thread_dir):
            continue
        os.mkdir(thread_dir)

        _download_thread(h.group(1), thread_dir)


_ensure_dir(homedir)
for page in range(1, 25):  # index pages 1..24
    _crawl_page(page)

# This snippet comes from http://byrx.net
相关内容
- django 简单显示数据库的内容,django数据库内容,index.h
- python常用列表(数组)操作演示,python操作演示,s = [
- 在Python中的高斯 - 赛德尔方法,python赛德尔,''' x,numIt
- python通过apply使用元祖和列表调用函数,pythonapply,def
- python通过正则表达式分析网页中的图片并进行替换,
- python返回昨天的日期,python返回昨天, #-*-coding:
- python通过openpyxl生成Excel文件,pythonopenpyxl,from openpyx
- PyQt4获取屏幕鼠标位置像素颜色,pyqt4获取屏幕像素,"
- 网页图片jpg|jpeg抓取器,图片jpgjpeg抓取,依赖beautifuls
- Google Python Class练习解答1-string1.py,python1-string1.py,#!/u
评论关闭