文本文件常用工具,,common_lib/u
文本文件常用工具,,common_lib/u
common_lib/utl
platform: any
# encoding: utf-8
"""Common helpers for inspecting text files.

Provides directory-tree filtering (``browsefiles``), line counting
(``filelines``), MD5 hashing (``filemd5``), a combined metadata summary
(``fileinfo``) and line-by-line regex parsing (``regexstream``).
"""
import codecs
import hashlib
import os
import time

_SZ_K1 = 1024  # 1 KiB
_SZ_M1 = 1048576  # 1 MiB (1024 ** 2)


def browsefiles(pathroot, fileprefix='', extlist=None):
    """Walk the tree under ``pathroot`` and yield paths of matching files.

    Args:
        pathroot: Directory at which the traversal starts.
        fileprefix: Required file-name prefix; an empty string disables
            the prefix filter.
        extlist: Collection of accepted extensions including the leading
            dot (e.g. ``{'.py'}``); an empty collection or ``None``
            disables the extension filter.

    Yields:
        The path (``os.path.join(root, filename)``) of every file that
        passes both filters.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root, _dirs, files in os.walk(pathroot):
        for filename in files:
            if fileprefix and not filename.startswith(fileprefix):
                continue
            extname = os.path.splitext(filename)[1]
            if extlist and extname not in extlist:
                continue
            yield os.path.join(root, filename)


def filelines(filename):
    """Return the number of newline characters in ``filename``.

    The file is read in binary mode in chunks of ``_SZ_M1`` bytes, so the
    count does not depend on the text encoding. A final line without a
    trailing newline is not counted.
    """
    lines = 0
    with open(filename, 'rb') as handle:
        for chunk in iter(lambda: handle.read(_SZ_M1), b''):
            lines += chunk.count(b'\n')
    return lines


def filemd5(f, block_size=_SZ_M1):
    """Return the hexadecimal MD5 digest of the file at path ``f``.

    Args:
        f: Path of the file to hash.
        block_size: Number of bytes read per chunk.

    Returns:
        The digest of the complete file content as a hex string.
    """
    md5 = hashlib.md5()
    with open(f, 'rb') as handle:
        # Feed the hash chunk by chunk so large files are never loaded
        # into memory in one piece.
        for chunk in iter(lambda: handle.read(block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()


def fileinfo(filename):
    """Collect basic metadata about ``filename`` into a dict.

    Returns:
        A dict with the keys ``fullname``, ``filename``, ``path``,
        ``basename``, ``ext``, ``filesize``, ``filelines``, ``modified``
        and ``md5``.
    """
    path, fname = os.path.split(filename)
    basename, ext = os.path.splitext(fname)
    statinfo = os.stat(filename)
    return dict(
        fullname=filename,              # full path as given
        filename=fname,                 # file name without directory
        path=path,                      # containing directory
        basename=basename,              # file name without extension
        ext=ext,                        # extension including the dot
        filesize=statinfo.st_size,      # size in bytes
        filelines=filelines(filename),  # number of newline characters
        modified=statinfo.st_mtime,     # modification time (epoch seconds)
        md5=filemd5(filename),          # MD5 hex digest of the content
    )


def regexstream(filename, regexpatt, coding='utf-8', logger=None):
    """Parse ``filename`` line by line with a compiled regex.

    Args:
        filename: Path of the file to read.
        regexpatt: Object exposing ``match(line)``, such as a compiled
            ``re`` pattern.
        coding: Encoding used to decode the file.
        logger: Optional object with a ``warning`` method; when given, a
            warning carrying the zero-based line index is emitted for
            every line that does not match.

    Yields:
        The match object of each line that matches at its start.
    """
    with codecs.open(filename, 'r', coding) as handle:
        for index, line in enumerate(handle):
            found = regexpatt.match(line)
            if found:
                yield found
            elif logger:
                logger.warning("Regex match fail at %d line", index)


def testNexample():
    """Print ``fileinfo`` for every ``.py`` file under a sample directory."""
    for srcfile in browsefiles(
            pathroot='/home/tim/Project',
            fileprefix='',
            extlist={'.py'}):
        print(fileinfo(srcfile))


if __name__ == '__main__':
    testNexample()

# This snippet comes from http://byrx.net
评论关闭