A Python script that detects the third-party resources a webpage depends on


#!/usr/bin/env python
# -*- coding: utf8 -*-
# Given a URL, find the sites it depends on (those referenced in its HTML).
# The dependency file format looks like this:
# *.microsoft.com
# *.outlook.com
# *.apple.com
# *.ibm.com
import urllib2
import urlparse
import socket
import sys
import re

def printHelp():
    print 'Approach 1: python DepSpy.py url dstfile'
    print '    * url starts with http:// or https://.'
    print '    * dstfile is the full name of the output file;'
    print '      results go to stdout if dstfile is empty.'
    print '\r\nApproach 2: python DepSpy.py urlfile dstfile'
    print '    * urlfile is the full name of a file listing input urls (split by \\n).'
    print '    * dstfile is the full name of the output file;'
    print '      results go to stdout if dstfile is empty.'

# Dispatch to the appropriate behavior based on the command line.
def dispatch(args):
    try:
        if len(args) < 2:
            printHelp()
            return []
        elif len(args) == 2 and (['h', '/h', '-h', '?', '/?', '-?', 'help', '-help', '/help'].count(args[1]) != 0):
            printHelp()
        elif args[1].find(r'http://') == 0 or args[1].find(r'https://') == 0:
            # The argument is a single URL.
            return getDependHost(args[1])
        else:
            # The argument is the name of a file listing URLs.
            urls = readURLList(args[1])
            ret = []
            for u in urls:
                print '---- Dealing with: ' + u + ' ----'
                lst = getDependHost(u)
                for it in lst:
                    if ret.count(it) == 0:
                        ret.append(it)
            return ret
    except Exception, e:
        print e
    return []

# Extract the dependent hosts.
_pattern = re.compile(r'<(?:script|link).*(?:src|href)\s?=\s?"(https?://.+?)"')
_pwww = re.compile(r'^[a-z0-9-_]+\.')

def getDependHost(url):
    try:
        if url.find('http://') != 0:
            url = 'http://' + url

        def getHost(link):
            netloc = urlparse.urlparse(link).netloc
            if netloc.find('baidu.com') != -1:
                # Baidu hosts are handled specially: keep them unwildcarded.
                return netloc
            elif netloc.count('.') < 2:
                return '*.' + netloc
            else:
                # Replace the leading label (www, cdn, ...) with '*'.
                netloc, dummy = re.subn(_pwww, '*.', netloc)
            return netloc

        resp = urllib2.urlopen(url)
        html = resp.read()
        deps = _pattern.findall(html)
        deps = map(getHost, deps)
        selfHost = getHost(url)
        ret = []
        for it in deps:
            # De-duplicate and drop the page's own host.
            if ret.count(it) == 0 and selfHost != it:
                ret.append(it)
        print ret
        return ret
    except Exception, e:
        print e
    return []

# Read the URL list.
def readURLList(path):
    fp = open(path, 'r')
    urls = []
    try:
        # Replacing '*' with 'www' lets an earlier output file
        # ('*.apple.com' and the like) be fed back in as input.
        urls = fp.read().replace('\r', '').replace('*', 'www').split('\n')
    finally:
        fp.close()
    return urls

# Program entry point.
if __name__ == '__main__':
    socket.setdefaulttimeout(60)  # global timeout
    lst = dispatch(sys.argv)
    if len(sys.argv) > 2:
        try:
            distFilename = sys.argv[2]
            fp = open(distFilename, 'w')
            for it in lst:
                fp.write(it + '\r\n')
            fp.close()
        except Exception, e:
            print 'Write File Error'
    else:
        try:
            for it in lst:
                print it
        except Exception, e:
            print 'Error'
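A hypothetical run (the URL is a placeholder, and the hosts named below are illustrative, not real results):

    python DepSpy.py http://www.example.com deps.txt

deps.txt would then contain one wildcarded host per line, for example *.googleapis.com or *.cloudflare.com, ready to be fed back in as a urlfile thanks to the '*' -> 'www' substitution in readURLList.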
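To see what the extraction pattern actually captures, here is a short self-contained check with made-up HTML (the hostnames are placeholders). One caveat: the greedy .* only behaves because . does not match newlines; on minified single-line HTML a single match can swallow several tags, so only the last URL on each line is reported.

import re

_pattern = re.compile(r'<(?:script|link).*(?:src|href)\s?=\s?"(https?://.+?)"')

html = '''<script src="http://cdn.example.com/jquery.js"></script>
<link rel="stylesheet" href="https://static.example.org/site.css">'''

print _pattern.findall(html)
# ['http://cdn.example.com/jquery.js', 'https://static.example.org/site.css']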
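The script targets Python 2 (urllib2, urlparse, print statements). For readers on a modern interpreter, here is a minimal Python 3 sketch of the core extraction, not the original code: it assumes roughly UTF-8 pages, tightens the tag pattern to [^>]* so a match cannot span tags, and drops the file-handling wrapper.

#!/usr/bin/env python3
# Minimal Python 3 sketch of getDependHost (an assumption-laden port,
# not the original): urllib2/urlparse became urllib.request/urllib.parse.
import re
import sys
import urllib.request
from urllib.parse import urlparse

# '[^>]*' keeps a match inside one tag, unlike the original greedy '.*'.
_pattern = re.compile(r'<(?:script|link)[^>]*(?:src|href)\s*=\s*"(https?://.+?)"')
_pwww = re.compile(r'^[a-z0-9_-]+\.')

def get_depend_hosts(url):
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    html = urllib.request.urlopen(url, timeout=60).read().decode('utf-8', 'replace')

    def to_host(link):
        netloc = urlparse(link).netloc
        if 'baidu.com' in netloc:
            return netloc          # mirror the original's Baidu special case
        if netloc.count('.') < 2:
            return '*.' + netloc   # bare domain: prepend the wildcard
        return _pwww.sub('*.', netloc)  # replace the leading label with '*'

    self_host = to_host(url)
    hosts = []
    for host in map(to_host, _pattern.findall(html)):
        if host != self_host and host not in hosts:
            hosts.append(host)     # de-duplicate, preserving first-seen order
    return hosts

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python3 depspy3.py URL')
    for host in get_depend_hosts(sys.argv[1]):
        print(host)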
