A freshly baked script for cloning a web page's UI. It downloads the JS, CSS, and images the page depends on, and rewrites the paths in the source HTML to point at the local copies. Depends on requests.

Usage: python sitecopy.py http://www.baidu.com

If you want to grab a page that requires login, first call get_login_cookie. It takes the login URL and a dict of POST data; this dict differs from site to site, and you can figure out the right fields with httpfox. Once you have the cookies, pass them to SiteCopy.
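A minimal sketch of that flow; the login URL and form field names below are hypothetical placeholders, not part of the script:

# hypothetical values -- inspect the real login request with httpfox
post_data = {'username': 'me', 'password': 'secret'}
cookies = get_login_cookie('http://example.com/login', post_data)
s = SiteCopy(10, 'http://example.com/profile', cookies)
s.run()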

So far it works fine on Linux; it has not been tested rigorously on Windows, so expect some bugs there.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""copy the UI of the site specified by the user"""

import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # force utf-8 as the default encoding (Python 2)

import os
import os.path
import re
import errno
from Queue import Queue
import threading
from urlparse import urlparse, urljoin

import requests


def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise


def write_to_file(filename, content, mode='w'):
    make_sure_path_exists(os.path.dirname(filename))
    try:
        with open(filename, mode) as f:
            f.write(content)
    except IOError:
        print 'can not open file: %s' % filename


def get_login_cookie(login_url, post_data):
    """POST the login form and return the session cookies."""
    r = requests.post(login_url, data=post_data)
    return r.cookies


class BaseHandler(object):
    def __init__(self, url, content, queue):
        self._url = url
        self._content = content
        self._queue = queue

    def _URL_normalization(self, url):
        """Strip quotes and resolve relative URLs against the page URL."""
        url = url.strip('\'" ')
        o = urlparse(url)
        if not o.netloc:
            url = urljoin(self._url, url)
        return url

    def save(self, filename, mode='w'):
        write_to_file(filename, self._content, mode)


class CssHandler(BaseHandler):
    """Rewrites url(...) references inside a stylesheet and queues them."""

    def __init__(self, url, content, queue):
        super(CssHandler, self).__init__(url, content, queue)

    def convert_path(self):
        content = self._content
        p = re.compile(r'url\((.+?)\)', re.I)
        ret = ''
        last_idx = 0
        for m in p.finditer(self._content):
            url = m.group(1)
            url = self._URL_normalization(url)
            # filename = urlparse(url).path.lstrip(' /')
            filename = 'css-url' + urlparse(url).path
            if filename.endswith('.css'):
                file_type = 'css'
                dir_prefix = '../css/'
            elif filename.endswith('.js'):
                file_type = 'js'
                dir_prefix = '../js/'
            else:
                file_type = 'img'
                dir_prefix = '../img/'
            self._queue.put((url, file_type, filename.replace('/', os.sep)))
            s = m.start(1)
            e = m.end(1)
            ret += content[last_idx:s] + dir_prefix + filename
            last_idx = e
        # use last_idx, not e: e is undefined when the CSS has no url() at all
        ret += content[last_idx:]
        self._content = ret


class HtmlHandler(BaseHandler):
    """Rewrites <img src>, <link href> and <script src> and queues them."""

    def __init__(self, url, content, queue):
        super(HtmlHandler, self).__init__(url, content, queue)

    def convert_path(self):
        html = self._content
        img_regex = re.compile(r'<img[^>]*? src="(.+?)"', re.I)
        link_regex = re.compile(r'<link[^>]*? href="(.+?)"', re.I)
        js_regex = re.compile(r'<script[^>]*? src="(.+?)"', re.I)
        for regex in (img_regex, link_regex, js_regex):
            ret = ''
            last_idx = 0
            for m in regex.finditer(html):
                url = m.group(1)
                url = self._URL_normalization(url)
                filename = os.path.basename(urlparse(url).path)
                if filename.endswith('.css'):
                    file_type = 'css'
                    dir_prefix = 'static/css/'
                elif filename.endswith('.js'):
                    file_type = 'js'
                    dir_prefix = 'static/js/'
                else:
                    file_type = 'img'
                    dir_prefix = 'static/img/'
                self._queue.put((url, file_type, filename.replace('/', os.sep)))
                s = m.start(1)
                e = m.end(1)
                ret += html[last_idx:s] + dir_prefix + filename
                last_idx = e
            # use last_idx, not e: e may be undefined or stale when a regex
            # matches nothing
            ret += html[last_idx:]
            html = ret
        self._content = html


class Downloader(threading.Thread):
    """Worker thread: pulls (url, file_type, filename) tuples off the queue."""

    def __init__(self, t_name, queue, base_dir, js_dir,
                 css_dir, img_dir, cookies=''):
        super(Downloader, self).__init__(name=t_name)
        self._queue = queue
        self._base_dir = base_dir
        self._js_dir = js_dir
        self._css_dir = css_dir
        self._img_dir = img_dir
        self._cookies = cookies

    def run(self):
        while True:
            url, file_type, filename = self._queue.get()
            try:
                r = requests.get(url, cookies=self._cookies)
            except requests.exceptions.RequestException:
                print 'download page error: %s' % url
                self._queue.task_done()
                continue
            except:
                print 'other exception'
                self._queue.task_done()
                continue
            if file_type == 'html':
                h = HtmlHandler(url, r.content, self._queue)
                h.convert_path()
                h.save(os.path.join(self._base_dir, filename))
            elif file_type == 'js':
                write_to_file(os.path.join(self._js_dir, filename), r.content)
            elif file_type == 'css':
                c = CssHandler(url, r.content, self._queue)
                c.convert_path()
                c.save(os.path.join(self._css_dir, filename))
            elif file_type == 'img':
                write_to_file(os.path.join(self._img_dir, filename),
                              r.content, 'wb')
            else:
                print 'unknown type'
            print 'download complete: %s' % url
            self._queue.task_done()


class SiteCopy(object):
    def __init__(self, thread_num, start_url, cookies=''):
        super(SiteCopy, self).__init__()
        self._thread_num = thread_num
        self._start_url = start_url
        self._cookies = cookies
        self._queue = Queue()

    def run(self):
        o = urlparse(self._start_url)
        base_dir = (o.netloc + o.path).replace(os.sep, '-')
        js_dir = os.path.join(base_dir, 'static', 'js')
        css_dir = os.path.join(base_dir, 'static', 'css')
        img_dir = os.path.join(base_dir, 'static', 'img')
        self._queue.put((self._start_url, 'html', 'index.html'))
        for i in range(0, self._thread_num):
            task = Downloader('Downloader', self._queue, base_dir, js_dir,
                              css_dir, img_dir, self._cookies)
            task.daemon = True
            task.start()
        self._queue.join()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'wrong arguments'
        sys.exit(-1)
    start_url = sys.argv[1]
    s = SiteCopy(10, start_url)
    s.run()

# This snippet comes from http://byrx.net
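For reference, running python sitecopy.py http://www.baidu.com produces a layout like this (the top-level directory is the URL's host plus path, with path separators replaced by '-'):

www.baidu.com/
    index.html
    static/
        css/
        js/
        img/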
