使用Python下载整个网站的链接，适合能目录浏览的网站。


[python]
# Copyright (C) 2012 xxx(xxx) Co., LTD. 
# All rights reserved. 

# Developed by RD BIOS Team. 

# Authors: perry <perry.peng@cn.xxx.com> 

# Date: January 11, 2012 

# Project Name: WEBDOWN 
# Project Version: 1.0.0 

# Project description: 

# History: 
#    Date        Author      Description 
#    ----------------------------------------------------------------- 
#    2012/01/11  perry       created. 

# Note: 
#  xxx 
 
# Version string of this script; same value as "Project Version" in the header.
__version__ = "1.0.0" 
 
import io
import os
import sqlite3
import sys
import threading
import time

try:
  # Python 2.7
  from urlparse import urlparse
  from urllib import (
    unquote,
    url2pathname)

except ImportError:
  # Python 3.2
  from urllib.parse import urlparse
  from urllib.parse import unquote
  from urllib.request import url2pathname

try:
  # Python 2.7
  from HTMLParser import HTMLParser
except ImportError:
  # Python 3.2
  from html.parser import HTMLParser

try:
  # Python 2.7
  from httplib import HTTPConnection
except ImportError:
  # Python 3.2
  from http.client import HTTPConnection
 
class DownloadThread(threading.Thread):
  """Worker thread that downloads the files queued in a Webdown object.

  The thread repeatedly asks ``wd.get1()`` for the next file path, saves the
  file below the current working directory (unless it already exists) and
  then marks the path as handled with ``wd.set1(path, 1)``.

  ``wd`` must provide ``url`` (host[:port]), ``finished``, ``get1()`` and
  ``set1(path, status)``.
  """

  def __init__(self, wd):
    threading.Thread.__init__(self)
    self.wd = wd

  @staticmethod
  def _local_path(url_path):
    """Map a URL path to a file system path below the working directory."""
    if not isinstance(url_path, str):
      # Python 2 only: sqlite returns unicode, the urllib helpers want str.
      # stdin has no encoding when it is redirected, so fall back to UTF-8.
      encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'
      url_path = url_path.encode(encoding)
    return os.getcwd() + url2pathname(unquote(url_path))

  @staticmethod
  def _fetch(http, url_path, target):
    """GET ``url_path`` over ``http`` and store a 200 response in ``target``."""
    # Reset the connection first: a previous non-200 response is never read,
    # which would otherwise leave the connection in an unusable state.
    http.close()
    http.request('GET', url_path)
    r = http.getresponse()
    if r.status == 200:
      print('%s %s' % (r.getheader('content-length', 0), url_path))
      # Read the body before creating the file so a failed transfer does not
      # leave an empty file behind that later counts as "EXISTS".
      data = r.read()
      with open(target, 'wb') as f:
        f.write(data)

  def run(self):
    """Download queued files until the queue is empty and crawling is over."""
    http = HTTPConnection(self.wd.url)
    try:
      while True:
        s = self.wd.get1()
        if s is None:
          # NOTE: the flag is named inversely -- Webdown.go() keeps
          # ``finished`` True while it is still crawling and sets it to
          # False when no more entries will be queued.
          if not self.wd.finished:
            break
          time.sleep(1)
          continue

        try:
          p = self._local_path(s)
          if os.path.exists(p):
            print('EXISTS  %s' % s)
          else:
            self._fetch(http, s, p)
        except Exception:
          # Best effort: one broken file must not stop the whole mirror.
          print('FAIL  %s' % s)

        # Mark the entry as handled even on failure so it is not retried
        # forever.
        self.wd.set1(s, 1)
    finally:
      http.close()

    print('exit...')
 
class Webdown(HTMLParser):
  """Mirror a web site that exposes browsable directory listings.

  Every link found in a listing is recorded in an in-memory SQLite table
  (``download``).  ``go()`` walks the directories on the calling thread
  while a ``DownloadThread`` fetches the recorded files.

  Attributes:
    url:  host[:port] part of the start URL.
    path: URL path of the directory currently being crawled (always ends
          with '/'), or None once every directory has been visited.
    http: connection used for fetching the directory listings.
    dbc:  SQLite connection shared with the download thread; every access
          is serialized through ``lock``.
  """

  # NOTE: inverse naming, kept because DownloadThread relies on it:
  # True while go() is still crawling, False once nothing more is queued.
  finished = False

  def __init__(self, url):
    """Prepare the queue for ``url``; a missing scheme defaults to http.

    Prints a message and re-raises if the URL or the database cannot be
    set up, because the object would be unusable afterwards.
    """
    HTMLParser.__init__(self)
    try:
      url_info = urlparse(url, 'http')
      if not url_info.netloc:
        # "host/path" without a scheme is parsed as a bare path; prefix
        # '//' so that the host ends up in netloc.
        url_info = urlparse('//' + url, 'http')

      self.url = url_info.netloc
      self.http = HTTPConnection(url_info.netloc)
      self.dbc = sqlite3.connect(':memory:', check_same_thread = False)
      self.lock = threading.Lock()
      self.dbc.execute('''
        create table if not exists download (
          id integer primary key autoincrement,
          name text,
          url text,
          path text,
          local_path text,
          is_dir integer default 0,
          is_searched integer default 0,
          is_queried integer default 0,
          is_download integer default 0)''')

      # Normalize the start directory to exactly one trailing slash.
      name = url_info.path.rstrip('/')
      self.path = name + '/'

      # Last path segment; rfind() returns -1 when there is no slash, which
      # makes the slice keep the whole string.
      name = name[name.rfind('/') + 1:]

      self.puturl(name, self.url, self.path, os.getcwd(), 1)
    except Exception:
      print('WebDown initialize failure...')
      raise

  @staticmethod
  def _local_path(url_path):
    """Map a URL path to a file system path below the working directory."""
    if not isinstance(url_path, str):
      # Python 2 only: sqlite returns unicode, the urllib helpers want str.
      # stdin has no encoding when it is redirected, so fall back to UTF-8.
      encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'
      url_path = url_path.encode(encoding)
    return os.getcwd() + url2pathname(unquote(url_path))

  def handle_starttag(self, tag, attrs):
    """Queue the target of ``<a href="...">`` tags of a directory listing.

    Only anchors whose single attribute is ``href`` are considered.  Links
    ending in '/' are recorded as directories still to be crawled, all
    others as files.
    """
    if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
      return

    href = attrs[0][1]
    if not href:                    # <a href> without a value, or empty.
      return

    if href in ('../', './'):       # ignore the parent and current folder.
      return

    if href.startswith(('?', '~', '#')):   # sort links, user dirs, anchors.
      return

    if '..' in href.split('/'):
      # The listing is untrusted input: a '..' segment could make the
      # mirror write outside of the download directory.
      return

    is_dir = 0
    name = href
    searched = 1

    if name.endswith('/'):
      name = name[:-1]
      searched = 0
      is_dir = 1

    self.puturl(name, self.url, self.path + href, '', is_dir, searched)

  def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
    """Insert a new entry into the download table."""
    with self.lock:
      self.dbc.execute(
        'insert into download (name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)',
        (name, url, path, lpath, isdir, searched))

  def _first_path(self, sql, params=()):
    """Run ``sql`` under the lock; return its first column or None."""
    with self.lock:
      row = self.dbc.execute(sql, params).fetchone()
    if row is None:
      return None
    return row[0]

  def set1(self, path, status=0):
    """Set the ``is_queried`` flag of ``path``."""
    with self.lock:
      self.dbc.execute('update download set is_queried=? where path=?', (status, path))

  def get1(self):
    """Return the path of the oldest file with is_queried=0, or None."""
    return self._first_path(
      'select path from download where is_dir=0 and is_queried=0 order by id limit 1')

  def set2(self, path, status=0):
    """Set the ``is_searched`` flag of ``path``."""
    with self.lock:
      self.dbc.execute('update download set is_searched=? where path=?', (status, path))

  def get2(self, url):
    """Return the oldest uncrawled directory of host ``url``, or None.

    The returned path always ends with '/'.
    """
    s = self._first_path(
      'select path from download where url=? and is_searched=0 and is_dir=1 order by id limit 1',
      (url,))
    if s is not None and not s.endswith('/'):
      s = s + '/'
    return s

  def set3(self, path, status=0):
    """Set the ``is_download`` flag of ``path``."""
    with self.lock:
      self.dbc.execute('update download set is_download=? where path=?', (status, path))

  def get3(self):
    """Return the path of the oldest file with is_download=0, or None."""
    return self._first_path(
      'select path from download where is_dir=0 and is_download=0 order by id limit 1')

  def _crawl_current(self):
    """Create the local folder for ``self.path`` and parse its listing."""
    try:
      p = self._local_path(self.path)
      if not os.path.exists(p):
        os.makedirs(p)
    except (OSError, UnicodeError) as err:
      print('FAIL  %s (%s)' % (self.path, err))

    try:
      # Reset the connection: an earlier non-200 response is never read.
      self.http.close()
      self.http.request('GET', self.path)
      r = self.http.getresponse()
      if r.status == 200:
        body = r.read()
        if not isinstance(body, str):
          # Python 3 returns bytes but HTMLParser.feed() needs text.
          body = body.decode('utf-8', 'replace')
        self.reset()
        self.feed(body)
    except Exception as err:
      # Best effort: an unreachable or malformed listing is skipped.
      print('FAIL  %s (%s)' % (self.path, err))

  def go(self):
    """Crawl all directories while a DownloadThread fetches the files.

    Returns after every directory has been visited and the download
    thread has drained the queue.
    """
    self.finished = True
    q = DownloadThread(self)
    q.start()
    try:
      while self.path is not None:
        self._crawl_current()
        self.set2(self.path, 1)
        self.path = self.get2(self.url)
    finally:
      # Always release the worker, otherwise it would poll forever and
      # keep the process alive after an error.
      self.finished = False
      q.join()
 
def target_from_argv(argv):
  """Return the stripped URL passed as first command line argument.

  ``argv`` has the layout of ``sys.argv`` (program name first).  An empty
  string is returned when no argument was given.
  """
  if len(argv) > 1:
    return argv[1].strip()
  return ''


if __name__ == "__main__":
  url = target_from_argv(sys.argv)
  if not url:
    # http://www.20cn.net/share/alalmn
    # http://www.gaby.de/ftp/pub/win3x/archive/
    print('You must provide a valid Url.\n')
    print('Usage:\n  Python %s target' % os.path.basename(sys.argv[0]))
    print('    target   --- specify a URL to donwload.\n')

    # raw_input only exists on Python 2; every Python 3 release uses input.
    try:
      read_line = raw_input
    except NameError:
      read_line = input

    while not url:
      url = read_line('Please enter a URL:').strip()

  wd = Webdown(url)
  wd.go()
 
摘自  perry_peng的专栏 

相关内容

    暂无相关文章

评论关闭