# 使用Python下载整个网站的链接，适合能目录浏览的网站。
# (Download all the links of a site with Python; suited to sites that allow directory browsing.)
# Copyright (C) 2012 xxx(xxx) Co., LTD.
# All rights reserved.
#
# Developed by RD BIOS Team.
#
# Authors: perry <perry.peng@cn.xxx.com>
#
# Date: January 11, 2012
#
# Project Name: WEBDOWN
# Project Version: 1.0.0
#
# Project description:
#
# History:
# Date         Author      Description
# -----------------------------------------------------------------
# 2012/01/11 perry created.
#
# Note:
# xxx
__version__ = "1.0.0"
import io
import os
import sqlite3
import sys
import threading
import time

try:
    # Python 2.7
    from urlparse import urlparse
    from urllib import (
        unquote,
        url2pathname)
except ImportError:
    # Python 3.x -- the original branch only imported urlparse, leaving
    # unquote/url2pathname undefined (NameError at runtime).
    from urllib.parse import urlparse, unquote
    from urllib.request import url2pathname
try:
    # Python 2.7
    from HTMLParser import HTMLParser
except ImportError:
    # Python 3.x
    from html.parser import HTMLParser
try:
    # Python 2.7
    from httplib import HTTPConnection
except ImportError:
    # Python 3.x
    from http.client import HTTPConnection
class DownloadThread(threading.Thread):
    """Worker thread that downloads every file URL queued by a Webdown crawler.

    Polls the crawler's in-memory sqlite queue (via ``get1``/``set1``) and
    saves each file below the current working directory, mirroring the URL path.
    """

    def __init__(self, wd):
        # wd: the Webdown instance whose queue this thread drains.
        self.wd = wd
        threading.Thread.__init__(self)

    def run(self):
        # Bug fix: the original read the *global* name `wd` here instead of
        # `self.wd`, so the thread only worked by accident in the script case.
        http = HTTPConnection(self.wd.url)
        while True:
            s = self.wd.get1()
            if s is None:
                # Empty queue. The crawler keeps `finished` True while it is
                # still discovering URLs; once it flips to False we can stop.
                if not self.wd.finished:
                    break
                time.sleep(1)
                continue
            # Map the (percent-encoded) URL path to a local filesystem path.
            # NOTE(review): the original encoded via sys.stdin.encoding, a
            # py2-ism; unquote() on the text path is the py3-safe equivalent.
            p = os.getcwd() + url2pathname(unquote(s))
            if not os.path.exists(p):
                try:
                    # Re-open the connection for each file; keeps the loop
                    # resilient to servers that close between requests.
                    http.close()
                    http.request('GET', s)
                    r = http.getresponse()
                    if r.status == 200:
                        print(r.getheader('content-length', 0), s)
                        # `with` guarantees the handle is closed even when the
                        # read/write fails (the original could hit NameError
                        # in its finally clause if open() itself raised).
                        with open(p, 'wb') as f:
                            f.write(r.read())
                except Exception:
                    # Best-effort downloader: report and move on.
                    print('FAIL ', s)
            else:
                print('EXISTS ', s)
            # Mark the URL handled whether it succeeded, failed, or existed.
            self.wd.set1(s, 1)
        print('exit...')
class Webdown(HTMLParser):
    """Crawl a directory-browsable web site, recording every link in an
    in-memory sqlite table that a DownloadThread consumes concurrently.

    The parser walks folder listings breadth-first: ``go()`` fetches each
    unsearched folder, feeds the HTML through ``handle_starttag`` (which
    queues files and sub-folders), and the download thread pulls file URLs
    as they appear.
    """

    # True while go() is still discovering URLs; the download thread keeps
    # polling as long as this is set and exits once it flips back to False.
    finished = False

    def __init__(self, url):
        """Prepare the crawl for *url* (e.g. 'http://host/some/dir/')."""
        HTMLParser.__init__(self)
        try:
            url_info = urlparse(url, 'http')
            self.url = url_info.netloc
            self.http = HTTPConnection(url_info.netloc)
            # check_same_thread=False: the connection is shared with the
            # download thread; all access is serialized through self.lock.
            self.dbc = sqlite3.connect(':memory:', check_same_thread=False)
            self.lock = threading.Lock()
            self.path = url_info.path
            # Bug fix: the original opened the string with five quotes
            # (''''' ), which left a stray '' inside the SQL text and made
            # the CREATE TABLE fail (silently, via the bare except below).
            self.dbc.execute('''
                create table if not exists download (
                    id integer primary key autoincrement,
                    name text,
                    url text,
                    path text,
                    local_path text,
                    is_dir integer default 0,
                    is_searched integer default 0,
                    is_queried integer default 0,
                    is_download integer default 0)''')
            # Normalize the start path so it ends with exactly one '/'.
            name = self.path
            while name.endswith('/'):
                name = name[:-1]
            self.path = name + '/'
            # Derive a display name from the last path component.
            i = name.rfind('/')
            if i > 0:
                name = name[i + 1:]
            # Seed the queue with the root folder (is_dir=1, unsearched).
            self.puturl(name, self.url, self.path, os.getcwd(), 1)
        except Exception:
            print('WebDown initialize failure...')

    def handle_starttag(self, tag, attrs):
        """Queue every plain <a href="..."> found in a directory listing."""
        if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
            return
        href = attrs[0][1]
        # Skip listing navigation ('../', './'), sort queries ('?...') and
        # special entries ('~...') that Apache-style indexes emit.
        if href in ('../', './') or href.startswith(('?', '~')):
            return
        is_dir = 0
        name = href
        searched = 1
        if name.endswith('/'):
            # Folder entry: strip the slash from the display name and mark
            # it unsearched so go() will descend into it later.
            name = name[:-1]
            searched = 0
            is_dir = 1
        self.puturl(name, self.url, self.path + href, '', is_dir, searched)

    def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
        """Insert one discovered entry into the queue."""
        with self.lock:
            self.dbc.execute(
                'insert into download (name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)',
                (name, url, path, lpath, isdir, searched))

    def set1(self, path, status=0):
        """Mark *path* as handled by the download thread (is_queried)."""
        with self.lock:
            self.dbc.execute('update download set is_queried=? where path=?', (status, path))

    def get1(self):
        """Return one not-yet-downloaded file path, or None when the queue is empty."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where is_dir=0 and is_queried=0 limit 1').fetchone()
        return row[0] if row is not None else None

    def set2(self, path, status=0):
        """Mark a folder *path* as searched."""
        with self.lock:
            self.dbc.execute('update download set is_searched=? where path=?', (status, path))

    def get2(self, url):
        """Return one unsearched folder path for *url* (slash-terminated), or None."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where url=? and is_searched=0 and is_dir=1 limit 1',
                (url,)).fetchone()
        # Bug fix: the original returned the raw result tuple when the stored
        # path was NULL; normalize both "no row" and "NULL path" to None.
        if row is None or row[0] is None:
            return None
        path = row[0]
        return path if path.endswith('/') else path + '/'

    def set3(self, path, status=0):
        """Mark *path*'s download state (is_download)."""
        # Bug fix: the original acquired the lock and never released it,
        # deadlocking the next queue operation.
        with self.lock:
            self.dbc.execute('update download set is_download=? where path=?', (status, path))

    def get3(self):
        """Return one file path not yet marked downloaded, or None."""
        with self.lock:
            row = self.dbc.execute(
                'select path from download where is_dir=0 and is_download=0 limit 1').fetchone()
        return row[0] if row is not None else None

    def go(self):
        """Run the crawl: walk every folder, feeding listings to the parser,
        while a DownloadThread fetches queued files concurrently."""
        self.finished = True
        worker = DownloadThread(self)
        worker.start()
        while self.path is not None:
            # Mirror the current folder locally before downloading into it.
            try:
                local = os.getcwd() + url2pathname(unquote(self.path))
                if not os.path.exists(local):
                    os.makedirs(local)
            except OSError:
                pass
            try:
                self.http.close()
                self.http.request('GET', self.path)
                r = self.http.getresponse()
                if r.status == 200:
                    self.reset()
                    body = r.read()
                    # feed() needs text; http responses are bytes on py3.
                    if isinstance(body, bytes):
                        body = body.decode('utf-8', 'replace')
                    self.feed(body)
            except Exception:
                # Best-effort: an unreachable folder is skipped, not fatal.
                pass
            self.set2(self.path, 1)
            self.path = self.get2(self.url)
        # Discovery done: let the worker drain the queue and exit.
        self.finished = False
        worker.join()
if __name__ == "__main__":
    # Entry point: take the target URL from the command line, otherwise prompt.
    if len(sys.argv) > 1:
        # Bug fix: the original read sys.argv[0] (the script name itself);
        # the first real argument is sys.argv[1].
        url = sys.argv[1].strip()
    else:
        # Example targets:
        # http://www.20cn.net/share/alalmn
        # http://www.gaby.de/ftp/pub/win3x/archive/
        print('You must provide a valid Url.\n')
        print('Usage:\n Python %s target' % os.path.basename(sys.argv[0]))
        print(' target --- specify a URL to download.\n')
        url = ''
    while len(url) == 0:
        # Bug fix: the original tested sys.version.startswith('3.2'), which
        # picks the wrong input function on every other Python 3 release.
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input      # Python 3
        url = read_line('Please enter a URL:').strip()
    wd = Webdown(url)
    wd.go()
# 摘自 perry_peng 的专栏 (excerpted from perry_peng's blog column)