批量转换 HTML 内的 &#XXXXX; 为中文，并将文件编码转换为 UTF-8（作者：lomatus）


lomatus

#!/usr/bin/python
#coding=utf-8
#Author Lomatus
#Email tourszhou#gmail.com
"""Replace five-digit decimal HTML entities with literal UTF-8 characters.

Usage:
    script.py FILE      convert a single HTML file in place
    script.py FOLDER    convert every entry of FOLDER whose name matches
                        ``^\\w+\\.htm`` in place

Each ``&#NNNNN;`` reference (exactly five decimal digits) is replaced by the
character it names, the text ``Windows-1252`` is replaced by ``UTF-8`` and the
file is written back encoded as UTF-8.  The code runs on Python 2 and 3.
"""
import io
import os
import re
import string
import sys

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() accepts any code point

# Exactly five ASCII digits.  Shorter references such as ``&#60;`` ('<') or
# ``&#38;`` ('&') are deliberately left alone: turning them into literal
# characters would change the markup itself.
_ENTITY_RE = re.compile(u"&#([0-9]{5});")

# Name filter used for batch conversion of a folder.
_HTML_NAME_RE = re.compile(r"^\w+\.htm")


def _entity_to_char(match):
    """Return the character for a matched entity, or the entity unchanged."""
    code = int(match.group(1))
    # Lone surrogates cannot be encoded as UTF-8; keep the reference as text.
    if 0xD800 <= code <= 0xDFFF:
        return match.group(0)
    try:
        return _unichr(code)
    except ValueError:
        # Narrow Python 2 builds reject code points above U+FFFF.
        return match.group(0)


def _decode_html(raw):
    """Decode the raw bytes of an HTML file to text.

    UTF-8 is tried first so that a file which was already converted (or is
    plain ASCII) is not re-encoded a second time.  Anything else is treated
    as Windows-1252; the few byte values that code page leaves undefined
    become U+FFFD.
    """
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("cp1252", "replace")


def utoutf(htm):
    """Convert the HTML file at path ``htm`` in place.

    Args:
        htm: Path of the file to rewrite.

    Raises:
        IOError/OSError: If the file cannot be read or written.
    """
    with open(htm, "rb") as src:
        raw = src.read()

    text = _ENTITY_RE.sub(_entity_to_char, _decode_html(raw))
    text = text.replace(u"Windows-1252", u"UTF-8")

    # Encode the whole document, not just the substituted characters, so the
    # bytes on disk really are UTF-8 once the charset label says so.
    with open(htm, "wb") as dst:
        dst.write(text.encode("utf-8"))


def _convert_folder(folder):
    """Convert every matching regular file directly inside ``folder``.

    Returns the number of files converted.
    """
    count = 0
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        # isfile() skips sub-directories that merely look like HTML names.
        if _HTML_NAME_RE.match(name) and os.path.isfile(path):
            utoutf(path)
            count += 1
            print("%d  File: %s       Converted!" % (count, name))
    return count


def main(argv):
    """Run the command-line interface; ``argv`` has the shape of sys.argv."""
    if len(argv) > 2:
        print("Error synax")
        return
    if len(argv) < 2:
        return

    target = argv[1]
    if os.path.isfile(target):
        # Any existing file named explicitly is converted, whatever its name.
        utoutf(target)
        print("File: %s converted" % target)
    elif os.path.isdir(target):
        if not os.path.isabs(target):
            target = os.path.abspath(target)
            print("Read Fold:" + target)
        _convert_folder(target)
    else:
        print("Path not found: " + target)


if __name__ == "__main__":
    main(sys.argv)

评论关闭