随意转换字符编码,字符编码,python转换字符编码


python转换字符编码utf8-gb18030

#!/bin/python
# coding: gbk
"""Convert byte strings between character encodings (UTF-8 <-> GB18030).

``mdcode`` re-encodes a string that is entirely in a single charset.
``mdcode_char`` handles a byte string that mixes UTF-8 and GB18030 encoded
Chinese characters by detecting the charset one character at a time.

The program was written for Windows, which is why the file declares GBK and
the demo prints GBK.  The code below is pure ASCII, so it does not depend on
how the file is actually saved, and it runs on both Python 2 and Python 3.
"""

try:
    _text_type = unicode  # Python 2: decoded text is ``unicode``
except NameError:
    _text_type = type(u'')  # Python 3: decoded text is the builtin ``str``

# Inclusive code point ranges that is_zh() accepts.
_ZH_RANGES = (
    (0x2E80, 0x33FF),    # punctuation and radicals
    (0xFF00, 0xFFEF),    # fullwidth Latin characters
    # CJK Unified Ideographs.  NOTE: the original comment also mentioned
    # Extension A, but U+3400..U+4DBF is not covered by any range here.
    (0x4E00, 0x9FBB),
    (0xF900, 0xFAD9),    # CJK Compatibility Ideographs
    (0x20000, 0x2A6D6),  # CJK Unified Ideographs Extension B
    (0x2F800, 0x2FA1D),  # CJK Compatibility Supplement
)

# Charsets tried, in order, by mdcode() when decoding a byte string.
_CANDIDATE_CHARSETS = ('utf-8', 'gb18030', 'gbk', 'gb2312', 'utf-16')

# (charset, bytes per character) pairs probed, in order, by mdcode_char().
# A Chinese character takes 3 bytes in UTF-8 and 2 bytes in GB18030.
_MIXED_CHARSETS = (('utf-8', 3), ('gb18030', 2))


def is_zh(c):
    """Return True if *c* is a single character in a Chinese/CJK range.

    This function was taken from the web.  Anything ``ord()`` rejects (an
    empty string, more than one character, a non-string) yields False.
    """
    try:
        code_point = ord(c)
    except TypeError:
        return False
    return any(low <= code_point <= high for low, high in _ZH_RANGES)


def mdcode(str, encoding='utf-8'):
    """Re-encode *str* to *encoding*, guessing its current charset.

    Args:
        str: a byte string in one of the candidate charsets, or already
            decoded text.
        encoding: target charset name, or the pseudo-name ``'unicode'`` to
            get decoded text back instead of bytes.

    Returns:
        The re-encoded byte string, or text when *encoding* is ``'unicode'``.

    Raises:
        ValueError: no candidate charset could decode *str* (and re-encode
            it to *encoding*).
    """
    if isinstance(str, _text_type):
        # Already decoded; 'unicode' is not a real codec, so return as-is.
        if encoding == 'unicode':
            return str
        return str.encode(encoding)
    for charset in _CANDIDATE_CHARSETS:
        try:
            text = str.decode(charset)
            if encoding == 'unicode':
                return text
            # A failed encode also moves on to the next candidate, as the
            # original did.
            return text.encode(encoding)
        except UnicodeError:
            continue
    # String exceptions are invalid since Python 2.6; raise a real one.
    raise ValueError('Unknown charset')


def _decode_zh(chunk, charset):
    """Decode *chunk* and return it if it is exactly one CJK char, else None."""
    try:
        text = chunk.decode(charset)
    except UnicodeError:
        return None
    if is_zh(text):
        return text
    return None


def mdcode_char(str, encoding='utf-8'):
    """Re-encode a byte string that mixes UTF-8 and GB18030 characters.

    Bytes <= 127 are taken as ASCII (letters, digits, ...).  At any other
    byte the next 3 bytes are tried as one UTF-8 character, then the next
    2 bytes as one GB18030 character; a candidate is accepted only if it
    decodes and is_zh() confirms it lies in a Chinese code point range.

    Args:
        str: the mixed-charset byte string (decoded text is accepted too and
            is simply encoded).
        encoding: target charset name, or ``'unicode'`` to get text back.

    Returns:
        The whole input re-encoded to *encoding*.

    Raises:
        ValueError: a non-ASCII byte sequence is neither a UTF-8 nor a
            GB18030 Chinese character.
    """
    if isinstance(str, _text_type):
        text = str
    else:
        pieces = []
        pos = 0
        total = len(str)
        while pos < total:
            # Slice (not index) so we get a 1-byte string on Python 2 and 3.
            lead = str[pos:pos + 1]
            if ord(lead) <= 127:
                pieces.append(lead.decode('ascii'))
                pos += 1
                continue
            for charset, width in _MIXED_CHARSETS:
                zh = _decode_zh(str[pos:pos + width], charset)
                if zh is not None:
                    pieces.append(zh)
                    pos += width
                    break
            else:
                raise ValueError('Unknown charset_char')
        text = u''.join(pieces)
    if encoding == 'unicode':
        return text
    # Encode once for the whole text so stateful targets such as utf-16 do
    # not get a byte-order mark in front of every character.
    return text.encode(encoding)


if __name__ == '__main__':
    import sys

    # Test string: "123abc" followed by six simplified Chinese characters,
    # built explicitly as GBK bytes.
    sgbk = u'123abc\u4e2d\u56fd\u6c49\u5b57\u7b80\u4f53'.encode('gbk')
    # Convert to UTF-8.
    sutf8 = mdcode(sgbk)
    # This string contains both GBK and UTF-8 encoded Chinese characters.
    sall = sgbk + sutf8 + sgbk
    # Convert the mixed string to a single encoding.
    sresult = mdcode_char(sall, 'utf-8')
    # Tested on Windows, so print as GBK.  Write raw bytes: Python 3's text
    # stdout would reject them, so use its underlying binary buffer.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(mdcode(sresult, 'gbk') + b'\n')

评论关闭