抓取图片示例,抓取示例,#!/usr/bin/p


#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Example forum image crawler.

Walks forum index pages 1-24 and, for every thread cell found on a page,
saves the images of that thread as
``<homedir>/<page>/<thread title>/<n><ext>``.

The script targets Python 2: BeautifulSoup 3 is imported under the module
name ``BeautifulSoup``.  The standard-library imports fall back to their
Python 3 names so that the pure helpers can still be imported there.
"""
import os
import re
import shutil  # not used below; retained from the original import list
import sys
import urllib  # not used below; retained from the original import list

try:  # Python 2 module names
    import cookielib
    import urllib2
except ImportError:  # same modules under their Python 3 names
    import http.cookiejar as cookielib
    import urllib.request as urllib2


# ---- utils ----
def normalize_url(url):
    """Return *url* with an ``http://`` scheme prepended when it has none.

    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged (``https://`` used to be mangled into ``http://https://...``).
    """
    if url.startswith(("http://", "https://")):
        return url
    return "http://" + url


def safeDir(dir):
    """Return *dir* with every ``/`` removed so it is a single path component.

    The parameter keeps its original name for compatibility even though it
    shadows the ``dir`` builtin.
    """
    return dir.replace('/', '')


# ---- variable ----
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
threadPrefix = "http://60dxw.comww1.baisex.me/"
homedir = "baixingge"

# Compiled once instead of once per page / per thread.
URL_PATTERN = re.compile(r'href="([^"]*)"')
TITLE_PATTERN = re.compile(r'>([^<]*)</a>')
IMG_PATTERN = re.compile(r'img src="([^"]*)" onload')

# Failures that make the crawler skip one item instead of aborting the run.
# ValueError is what urllib2 raises for a URL without a known scheme.
_FETCH_ERRORS = (IOError, OSError, ValueError)

# ---- login ----
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)


def _warn(message):
    """Write *message* to stderr so it does not mix with progress output."""
    sys.stderr.write(message + "\n")


def _fetch(url):
    """Return the raw body of *url*, requested through the cookie-aware opener.

    The original script built ``opener`` but never used it; every request
    now goes through it so the cookie jar is actually applied.
    """
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()


def _make_soup(html):
    """Parse *html* with BeautifulSoup 3.

    The import is local so that importing this module does not require the
    third-party parser to be installed.
    """
    from BeautifulSoup import BeautifulSoup
    return BeautifulSoup(html)


def _image_suffix(img_url):
    """Return the file extension of *img_url* (``''`` when there is none).

    Only the last path segment is inspected, with any query string or
    fragment removed, so the result never contains ``/`` or ``?``.
    """
    without_query = img_url.split('?', 1)[0].split('#', 1)[0]
    last_segment = without_query.rsplit('/', 1)[-1]
    return os.path.splitext(last_segment)[1]


def _crawl_thread(cell_html, page_dir):
    """Download the images of the thread linked from one index-page cell.

    *cell_html* is the markup of a ``<th>`` cell and *page_dir* the directory
    of the index page it came from.  A thread whose directory already exists
    is skipped, which lets an interrupted run be resumed.
    """
    link = URL_PATTERN.search(cell_html)
    title = TITLE_PATTERN.search(cell_html)
    if link is None or title is None:
        return  # cell carries no thread link / title

    # Sanitize once and use the same name for the check and the mkdir.
    thread_name = safeDir(title.group(1))
    thread_dir = os.path.join(page_dir, thread_name)
    if os.path.exists(thread_dir):
        return

    page_url = threadPrefix + link.group(1)
    print("---->{0}".format(page_url))
    print("---->{0}".format(thread_name))
    try:
        page_body = _fetch(page_url)
    except _FETCH_ERRORS as err:
        _warn("skipping thread {0}: {1}".format(page_url, err))
        return

    # Create the directory only after the thread page was fetched, so a
    # failed fetch is retried on the next run instead of being skipped.
    os.mkdir(thread_dir)

    image_urls = IMG_PATTERN.findall(str(_make_soup(page_body)))
    for index, img in enumerate(image_urls):
        print("-------->{0}".format(img))
        imgName = "{0}{1}".format(index, _image_suffix(img))
        try:
            data = _fetch(img)
        except _FETCH_ERRORS as err:
            _warn("skipping image {0}: {1}".format(img, err))
            continue
        with open(os.path.join(thread_dir, imgName), "wb") as out:
            out.write(data)


def crawl():
    """Crawl index pages 1-24 and store every thread's images under homedir.

    Paths are built with ``os.path.join`` rather than by changing the working
    directory, so an error cannot leave the process inside the output tree.
    """
    # ---- file ----
    if not os.path.exists(homedir):
        os.mkdir(homedir)

    # ---- crawl ----
    for page in range(1, 25):
        pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)

        # ---- mkdir ----
        page_dir = os.path.join(homedir, str(page))
        if not os.path.exists(page_dir):
            os.mkdir(page_dir)
        print(pageUrl)

        # ---- download ----
        try:
            html_body = _fetch(pageUrl)
        except _FETCH_ERRORS as err:
            _warn("skipping page {0}: {1}".format(pageUrl, err))
            continue
        soup = _make_soup(html_body)

        # ---- extract ----
        cells = soup.findAll('th', attrs={'class': ['new', 'common']})
        for cell in cells:
            _crawl_thread(str(cell), page_dir)


if __name__ == "__main__":
    crawl()

# Snippet source: http://byrx.net

评论关闭