# 使用Python下载整个网站的链接，适合能目录浏览的网站。
# (Download all the links of a site with Python; suited to sites that allow directory browsing.)
# Copyright (C) 2012 xxx(xxx) Co., LTD.
# All rights reserved.
#
# Developed by RD BIOS Team.
#
# Authors: perry <perry.peng@cn.xxx.com>
#
# Date: January 11, 2012
#
# Project Name: WEBDOWN
# Project Version: 1.0.0
#
# Project description:
#
# History:
# Date         Author      Description
# -----------------------------------------------------------------
# 2012/01/11 perry created.
#
# Note:
# xxx
__version__ = "1.0.0"
import io
import os
import sqlite3
import sys
import threading
import time

try:
    # Python 2.7
    from urlparse import urlparse
    from urllib import (
        unquote,
        url2pathname)
except ImportError:
    # Python 3.x -- the original branch only imported urlparse, leaving
    # unquote/url2pathname undefined (NameError at runtime).
    from urllib.parse import urlparse, unquote
    from urllib.request import url2pathname
try:
    # Python 2.7
    from HTMLParser import HTMLParser
except ImportError:
    # Python 3.x
    from html.parser import HTMLParser
try:
    # Python 2.7
    from httplib import HTTPConnection
except ImportError:
    # Python 3.x
    from http.client import HTTPConnection
class DownloadThread(threading.Thread):
    """Worker thread that downloads every file URL queued by a Webdown crawler.

    Polls the crawler's in-memory sqlite queue (via ``get1``/``set1``) and
    saves each file below the current working directory, mirroring the URL path.
    """

    def __init__(self, wd):
        # wd: the Webdown instance whose queue this thread drains.
        self.wd = wd
        threading.Thread.__init__(self)

    def run(self):
        # Bug fix: the original read the *global* name `wd` here instead of
        # `self.wd`, so the thread only worked by accident in the script case.
        http = HTTPConnection(self.wd.url)
        while True:
            s = self.wd.get1()
            if s is None:
                # Empty queue. The crawler keeps `finished` True while it is
                # still discovering URLs; once it flips to False we can stop.
                if not self.wd.finished:
                    break
                time.sleep(1)
                continue
            # Map the (percent-encoded) URL path to a local filesystem path.
            # NOTE(review): the original encoded via sys.stdin.encoding, a
            # py2-ism; unquote() on the text path is the py3-safe equivalent.
            p = os.getcwd() + url2pathname(unquote(s))
            if not os.path.exists(p):
                try:
                    # Re-open the connection for each file; keeps the loop
                    # resilient to servers that close between requests.
                    http.close()
                    http.request('GET', s)
                    r = http.getresponse()
                    if r.status == 200:
                        print(r.getheader('content-length', 0), s)
                        # `with` guarantees the handle is closed even when the
                        # read/write fails (the original could hit NameError
                        # in its finally clause if open() itself raised).
                        with open(p, 'wb') as f:
                            f.write(r.read())
                except Exception:
                    # Best-effort downloader: report and move on.
                    print('FAIL ', s)
            else:
                print('EXISTS ', s)
            # Mark the URL handled whether it succeeded, failed, or existed.
            self.wd.set1(s, 1)
        print('exit...')
class Webdown(HTMLParser):
    """Crawl a directory-browsable web site, recording every link in an
    in-memory sqlite table that a DownloadThread consumes concurrently.

    The parser walks folder listings breadth-first: ``go()`` fetches each
    unsearched folder, feeds the HTML through ``handle_starttag`` (which
    queues files and sub-folders), and the download thread pulls file URLs
    as they appear.
    """

    # True while go() is still discovering URLs; the download thread keeps
    # polling as long as this is set and exits once it flips back to False.
    finished = False

    def __init__(self, url):
        """Prepare the crawl for *url* (e.g. 'http://host/some/dir/')."""
        HTMLParser.__init__(self)
        try:
            url_info = urlparse(url, 'http')
            self.url = url_info.netloc
            self.http = HTTPConnection(url_info.netloc)
            # check_same_thread=False: the connection is shared with the
            # download thread; all access is serialized through self.lock.
            self.dbc = sqlite3.connect(':memory:', check_same_thread=False)
            self.lock = threading.Lock()
            self.path = url_info.path
            # Bug fix: the original opened the string with five quotes
            # (''''' ), which left a stray '' inside the SQL text and made
            # the CREATE TABLE fail (silently, via the bare except below).
            self.dbc.execute('''
                create table if not exists download (
                    id integer primary key autoincrement,
                    name text,
                    url text,
                    path text,
                    local_path text,
                    is_dir integer default 0,
                    is_searched integer default 0,
                    is_queried integer default 0,
                    is_download integer default 0)''')
            # Normalize the start path so it ends with exactly one '/'.
            name = self.path
            while name.endswith('/'):
                name = name[:-1]
            self.path = name + '/'
            # Derive a display name from the last path component.
            i = name.rfind('/')
            if i > 0:
                name = name[i + 1:]
            # Seed the queue with the root folder (is_dir=1, unsearched).
            self.puturl(name, self.url, self.path, os.getcwd(), 1)
        except Exception:
            print('WebDown initialize failure...')

    def handle_starttag(self, tag, attrs):
        """Queue every plain <a href="..."> found in a directory listing."""
        if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
            return
        href = attrs[0][1]
        # Skip listing navigation ('../', './'), sort queries ('?...') and
        # special entries ('~...') that Apache-style indexes emit.
        if href in ('../', './') or href.startswith(('?', '~')):
            return
        is_dir = 0
        name = href
        searched = 1
        if name.endswith('/'):
            # Folder entry: strip the slash from the display name and mark
            # it unsearched so go() will descend into it later.
            name = name[:-1]
            searched = 0
            is_dir = 1
        self.puturl(name, self.url, self.path + href, '', is_dir, searched)

    def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
        """Insert one discovered entry into the queue."""
        with self.lock:
            self.dbc.execute(
                'insert into download (name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)',
                (name, url, path, lpath, isdir, searched))

    def set1(self, path, status=0):
        """Mark *path* as handled by the download thread (is_queried)."""
        with self.lock:
            self.dbc.execute('update download set is_queried=? where path=?', (status, path))

    def get1(self):
        """Return one not-yet-downloaded file path, or None when the queue is empty."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where is_dir=0 and is_queried=0 limit 1').fetchone()
        return row[0] if row is not None else None

    def set2(self, path, status=0):
        """Mark a folder *path* as searched."""
        with self.lock:
            self.dbc.execute('update download set is_searched=? where path=?', (status, path))

    def get2(self, url):
        """Return one unsearched folder path for *url* (slash-terminated), or None."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where url=? and is_searched=0 and is_dir=1 limit 1',
                (url,)).fetchone()
        # Bug fix: the original returned the raw result tuple when the stored
        # path was NULL; normalize both "no row" and "NULL path" to None.
        if row is None or row[0] is None:
            return None
        path = row[0]
        return path if path.endswith('/') else path + '/'

    def set3(self, path, status=0):
        """Mark *path*'s download state (is_download)."""
        # Bug fix: the original acquired the lock and never released it,
        # deadlocking the next queue operation.
        with self.lock:
            self.dbc.execute('update download set is_download=? where path=?', (status, path))

    def get3(self):
        """Return one file path not yet marked downloaded, or None."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where is_dir=0 and is_download=0 limit 1').fetchone()
        return row[0] if row is not None else None

    def go(self):
        """Run the crawl: walk every folder, feeding listings to the parser,
        while a DownloadThread fetches queued files concurrently."""
        self.finished = True
        worker = DownloadThread(self)
        worker.start()
        while self.path is not None:
            # Mirror the current folder locally before downloading into it.
            try:
                local = os.getcwd() + url2pathname(unquote(self.path))
                if not os.path.exists(local):
                    os.makedirs(local)
            except OSError:
                pass
            try:
                self.http.close()
                self.http.request('GET', self.path)
                r = self.http.getresponse()
                if r.status == 200:
                    self.reset()
                    body = r.read()
                    # feed() needs text; http responses are bytes on py3.
                    if isinstance(body, bytes):
                        body = body.decode('utf-8', 'replace')
                    self.feed(body)
            except Exception:
                # Best-effort: an unreachable folder is skipped, not fatal.
                pass
            self.set2(self.path, 1)
            self.path = self.get2(self.url)
        # Discovery done: let the worker drain the queue and exit.
        self.finished = False
        worker.join()
if __name__ == "__main__":
    # Entry point: take the target URL from the command line, otherwise prompt.
    if len(sys.argv) > 1:
        # Bug fix: the original read sys.argv[0] (the script name itself);
        # the first real argument is sys.argv[1].
        url = sys.argv[1].strip()
    else:
        # Example targets:
        # http://www.20cn.net/share/alalmn
        # http://www.gaby.de/ftp/pub/win3x/archive/
        print('You must provide a valid Url.\n')
        print('Usage:\n Python %s target' % os.path.basename(sys.argv[0]))
        print(' target --- specify a URL to download.\n')
        url = ''
    while len(url) == 0:
        # Bug fix: the original tested sys.version.startswith('3.2'), which
        # picks the wrong input function on every other Python 3 release.
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input      # Python 3
        url = read_line('Please enter a URL:').strip()
    wd = Webdown(url)
    wd.go()
# 摘自 perry_peng 的专栏 (excerpted from perry_peng's blog column)