文本文件常用工具, common_lib/utl


common_lib/utl

platform: any

# encoding: utf-8
"""Common helpers for working with text files.

Provides directory-tree filtering (``browsefiles``), newline counting
(``filelines``), MD5 digests (``filemd5``), a per-file metadata summary
(``fileinfo``) and regex-driven line parsing (``regexstream``).
"""
import codecs
import hashlib
import os
import time  # not used by the helpers below; retained from the original module

_SZ_K1 = 1024  # 1024
_SZ_M1 = 1048576  # 1024**2


def browsefiles(pathroot, fileprefix='', extlist=None):
    """Walk the tree under ``pathroot`` and yield the paths of matching files.

    Args:
        pathroot: Directory at which the walk starts.
        fileprefix: Required file-name prefix; an empty string means no
            restriction.
        extlist: Collection of accepted extensions including the leading dot
            (e.g. ``{'.py'}``); ``None`` or an empty collection means no
            restriction.

    Yields:
        ``os.path.join(root, filename)`` for every file that satisfies both
        filters.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root, _dirs, files in os.walk(pathroot):
        for filename in files:
            if fileprefix and not filename.startswith(fileprefix):
                continue
            if extlist and os.path.splitext(filename)[1] not in extlist:
                continue
            yield os.path.join(root, filename)


def filelines(filename):
    """Return the number of newline characters in ``filename``.

    The file is read in binary mode in ``_SZ_M1``-sized chunks, so large
    files are never loaded into memory at once.  A final line that lacks a
    trailing newline is not counted.
    """
    lines = 0
    with open(filename, 'rb') as handle:
        for buf in iter(lambda: handle.read(_SZ_M1), b''):
            lines += buf.count(b'\n')
    return lines


def filemd5(f, block_size=_SZ_M1):
    """Return the hexadecimal MD5 digest of the file at path ``f``.

    Args:
        f: Path of the file to hash.
        block_size: Number of bytes read per chunk.

    Returns:
        The digest of the complete file contents as a hex string.
    """
    md5 = hashlib.md5()
    # Binary mode: the digest must cover the raw bytes, not decoded text.
    with open(f, 'rb') as handle:
        for chunk in iter(lambda: handle.read(block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()


def fileinfo(filename):
    """Collect descriptive metadata about ``filename`` into a dict.

    Returns:
        A dict with the keys ``fullname``, ``filename``, ``path``,
        ``basename``, ``ext``, ``filesize``, ``filelines``, ``modified`` and
        ``md5``.
    """
    path, fname = os.path.split(filename)
    basename, ext = os.path.splitext(fname)
    statinfo = os.stat(filename)
    return dict(
        fullname=filename,  # full path of the file
        filename=fname,  # file name without the directory part
        path=path,  # directory containing the file
        basename=basename,  # file name without its extension
        ext=ext,  # extension, including the leading dot
        filesize=statinfo.st_size,  # size in bytes
        filelines=filelines(filename),  # number of newline characters
        modified=statinfo.st_mtime,  # last-modification timestamp
        md5=filemd5(filename),  # MD5 digest of the contents
    )


def regexstream(filename, regexpatt, coding='utf-8', logger=None):
    """Parse ``filename`` line by line with a compiled regular expression.

    Args:
        filename: Path of the text file to read.
        regexpatt: Object exposing ``match(line)``, e.g. a compiled pattern.
        coding: Encoding used to decode the file.
        logger: Optional object exposing ``warning(msg)``; when given, every
            line that fails to match is reported with its zero-based index.

    Yields:
        The match object of every line for which ``regexpatt.match`` succeeds.
    """
    with codecs.open(filename, 'r', coding) as handle:
        for i, line in enumerate(handle):
            matched = regexpatt.match(line)
            if matched:
                yield matched
            elif logger:
                logger.warning("Regex match fail at %d line" % i)


def testNexample():
    """Usage example: print the metadata of every ``.py`` file under a tree."""
    for srcfile in browsefiles(
            pathroot='/home/tim/Project',
            fileprefix='',
            extlist={'.py'},
    ):
        print(fileinfo(srcfile))


if __name__ == '__main__':
    testNexample()

# This snippet comes from http://byrx.net

评论关闭