A freshly written script that clones a web page's UI: it downloads the JS, CSS, and image files the page depends on and rewrites the paths in the source HTML to point at the local copies. Requires the requests library.
Usage: python sitecopy.py http://www.baidu.com
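When the command finishes, the clone ends up in a directory named after the URL's host and path (see SiteCopy.run in the code below); for the example above the layout is roughly:

www.baidu.com/
    index.html
    static/
        css/
        img/
        js/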
If you want to capture a page that requires login, first call get_login_cookie. Its post_data argument is a dict whose contents differ from site to site; you can discover the required fields with HttpFox. Then pass the returned cookies to SiteCopy, as sketched below.
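For instance, a minimal sketch of that flow (the login URL and the form field names here are made-up placeholders, not any real site's):

# Hypothetical login flow: the URL and field names are placeholders;
# watch the real login POST in HttpFox to find the right keys.
cookies = get_login_cookie('http://example.com/login',
                           {'username': 'me', 'password': 'secret'})
s = SiteCopy(10, 'http://example.com/profile', cookies)
s.run()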
No problems have shown up on Linux so far; Windows has not been tested rigorously, so expect some bugs there.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""copy the UI of the site specified by the user"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import os
import os.path
import re
import errno
from Queue import Queue
import threading
from urlparse import urlparse, urljoin

import requests


def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise


def write_to_file(filename, content, mode='w'):
    make_sure_path_exists(os.path.dirname(filename))
    try:
        with open(filename, mode) as f:
            f.write(content)
    except IOError:
        print 'cannot open file: %s' % filename


def get_login_cookie(login_url, post_data):
    """POST the login form and return the resulting cookies."""
    r = requests.post(login_url, data=post_data)
    return r.cookies


class BaseHandler(object):
    def __init__(self, url, content, queue):
        self._url = url
        self._content = content
        self._queue = queue

    def _URL_normalization(self, url):
        # Strip surrounding quotes/spaces and resolve relative URLs
        # against the page's own URL.
        url = url.strip('\'" ')
        o = urlparse(url)
        if not o.netloc:
            url = urljoin(self._url, url)
        return url

    def save(self, filename, mode='w'):
        write_to_file(filename, self._content, mode)


class CssHandler(BaseHandler):
    """Rewrites url(...) references inside a stylesheet and queues them."""

    def __init__(self, url, content, queue):
        super(CssHandler, self).__init__(url, content, queue)

    def convert_path(self):
        content = self._content
        p = re.compile(r'url\((.+?)\)', re.I)
        ret = ''
        last_idx = 0
        for m in p.finditer(self._content):
            url = self._URL_normalization(m.group(1))
            # Keep the resource's own path under a css-url/ subtree so
            # names from different directories cannot collide.
            filename = 'css-url' + urlparse(url).path
            if filename.endswith('.css'):
                file_type = 'css'
                dir_prefix = '../css/'
            elif filename.endswith('.js'):
                file_type = 'js'
                dir_prefix = '../js/'
            else:
                file_type = 'img'
                dir_prefix = '../img/'
            self._queue.put((url, file_type, filename.replace('/', os.sep)))
            s, e = m.start(1), m.end(1)
            ret += content[last_idx:s] + dir_prefix + filename
            last_idx = e
        ret += content[last_idx:]
        self._content = ret


class HtmlHandler(BaseHandler):
    """Rewrites <img src>, <link href> and <script src> to local paths."""

    def __init__(self, url, content, queue):
        super(HtmlHandler, self).__init__(url, content, queue)

    def convert_path(self):
        html = self._content
        img_regex = re.compile(r'<img[^>]*? src="(.+?)"', re.I)
        link_regex = re.compile(r'<link[^>]*? href="(.+?)"', re.I)
        js_regex = re.compile(r'<script[^>]*? src="(.+?)"', re.I)
        for regex in (img_regex, link_regex, js_regex):
            ret = ''
            last_idx = 0
            for m in regex.finditer(html):
                url = self._URL_normalization(m.group(1))
                filename = os.path.basename(urlparse(url).path)
                if filename.endswith('.css'):
                    file_type = 'css'
                    dir_prefix = 'static/css/'
                elif filename.endswith('.js'):
                    file_type = 'js'
                    dir_prefix = 'static/js/'
                else:
                    file_type = 'img'
                    dir_prefix = 'static/img/'
                self._queue.put((url, file_type,
                                 filename.replace('/', os.sep)))
                s, e = m.start(1), m.end(1)
                ret += html[last_idx:s] + dir_prefix + filename
                last_idx = e
            ret += html[last_idx:]
            html = ret
        self._content = html


class Downloader(threading.Thread):
    """Worker thread: pulls (url, file_type, filename) tasks off the queue."""

    def __init__(self, t_name, queue, base_dir, js_dir, css_dir, img_dir,
                 cookies=None):
        super(Downloader, self).__init__(name=t_name)
        self._queue = queue
        self._base_dir = base_dir
        self._js_dir = js_dir
        self._css_dir = css_dir
        self._img_dir = img_dir
        self._cookies = cookies

    def run(self):
        while True:
            url, file_type, filename = self._queue.get()
            try:
                r = requests.get(url, cookies=self._cookies)
            except requests.exceptions.RequestException:
                print 'download page error: %s' % url
                self._queue.task_done()
                continue
            except Exception:
                print 'other exception'
                self._queue.task_done()
                continue
            if file_type == 'html':
                # HTML and CSS are rewritten before being saved; rewriting
                # pushes further download tasks onto the queue.
                h = HtmlHandler(url, r.content, self._queue)
                h.convert_path()
                h.save(os.path.join(self._base_dir, filename))
            elif file_type == 'js':
                write_to_file(os.path.join(self._js_dir, filename), r.content)
            elif file_type == 'css':
                c = CssHandler(url, r.content, self._queue)
                c.convert_path()
                c.save(os.path.join(self._css_dir, filename))
            elif file_type == 'img':
                write_to_file(os.path.join(self._img_dir, filename),
                              r.content, 'wb')
            else:
                print 'unknown type'
            print 'download complete: %s' % url
            self._queue.task_done()


class SiteCopy(object):
    def __init__(self, thread_num, start_url, cookies=None):
        super(SiteCopy, self).__init__()
        self._thread_num = thread_num
        self._start_url = start_url
        self._cookies = cookies
        self._queue = Queue()

    def run(self):
        o = urlparse(self._start_url)
        # Output directory is named after the URL's host and path.
        base_dir = (o.netloc + o.path).replace(os.sep, '-')
        js_dir = os.path.join(base_dir, 'static', 'js')
        css_dir = os.path.join(base_dir, 'static', 'css')
        img_dir = os.path.join(base_dir, 'static', 'img')
        self._queue.put((self._start_url, 'html', 'index.html'))
        for i in range(self._thread_num):
            task = Downloader('Downloader', self._queue, base_dir, js_dir,
                              css_dir, img_dir, self._cookies)
            task.daemon = True
            task.start()
        self._queue.join()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'wrong arguments'
        sys.exit(-1)
    start_url = sys.argv[1]
    s = SiteCopy(10, start_url)
    s.run()

# This snippet originally comes from http://byrx.net