随意转换字符编码，字符编码,python转换字符编码

文章由Byrx.net分享于2019-03-23 09:03:23评论（595）

随意转换字符编码，字符编码,python转换字符编码

python转换字符编码utf8-gb18030

#!/bin/python
# -*- coding: utf-8 -*-
"""Convert byte strings between character encodings (UTF-8 <-> GB18030).

The script was originally written for Windows, where console output is
expected in GBK.  It offers two converters:

* ``mdcode``      - the whole input uses a single (unknown) charset.
* ``mdcode_char`` - the input mixes UTF-8 and GB18030 encoded Chinese
  characters and has to be decoded character by character.

The code runs on both Python 2 and Python 3.
"""

import sys

try:
    _TEXT_TYPE = unicode  # Python 2: decoded text is ``unicode``
except NameError:
    _TEXT_TYPE = str  # Python 3: decoded text is ``str``

# Charsets tried, in this order, by mdcode() when decoding byte input.
_CANDIDATE_CHARSETS = ('utf-8', 'gb18030', 'gbk', 'gb2312', 'utf-16')

# (charset, byte width) pairs tried, in this order, by mdcode_char() for
# every non-ASCII position.  UTF-8 is attempted before GB18030.
_MIXED_CANDIDATES = (('utf-8', 3), ('gb18030', 2))

# Inclusive code point ranges that is_zh() accepts.
_ZH_RANGES = (
    (0x2E80, 0x33FF),    # Punctuation & radicals
    (0xFF00, 0xFFEF),    # Fullwidth Latin characters
    (0x4E00, 0x9FBB),    # CJK Unified Ideographs (the original note also
                         # names Extension A, but U+3400-U+4DBF is not
                         # part of this range)
    (0xF900, 0xFAD9),    # CJK Compatibility Ideographs
    (0x20000, 0x2A6D6),  # CJK Unified Ideographs Extension B
    (0x2F800, 0x2FA1D),  # CJK Compatibility Supplement
)


class UnknownCharsetError(ValueError):
    """Raised when the input cannot be decoded with any supported charset."""


def is_zh(c):
    """Return True if *c* is a single character inside ``_ZH_RANGES``.

    Anything ``ord()`` rejects (empty or multi-character strings,
    non-string objects) yields False instead of raising.
    """
    try:
        code_point = ord(c)
    except TypeError:
        return False
    return any(low <= code_point <= high for low, high in _ZH_RANGES)


def mdcode(str, encoding='utf-8'):
    """Re-encode *str* to *encoding*, guessing the source charset.

    Args:
        str: Text, or a byte string in one of ``_CANDIDATE_CHARSETS``.
            (The name shadows the builtin; it is kept so existing keyword
            callers keep working.)
        encoding: Target codec name, or the special value ``'unicode'`` to
            get decoded text back instead of bytes.

    Returns:
        Bytes in *encoding*, or text when *encoding* is ``'unicode'``.

    Raises:
        TypeError: *str* is neither text nor a byte string.
        UnknownCharsetError: no candidate charset could convert the input.
    """
    data = str
    if isinstance(data, _TEXT_TYPE):
        # Already decoded: nothing to guess.
        return data if encoding == 'unicode' else data.encode(encoding)
    if not isinstance(data, (bytes, bytearray)):
        raise TypeError('expected text or a byte string, got %s'
                        % type(data).__name__)

    for candidate in _CANDIDATE_CHARSETS:
        try:
            text = data.decode(candidate)
            # Encoding stays inside the ``try`` on purpose: when the decoded
            # text cannot be represented in the target codec, the next
            # candidate charset is tried, as the original code did.
            return text if encoding == 'unicode' else text.encode(encoding)
        except UnicodeError:
            continue
    raise UnknownCharsetError('Unknown charset')


def _decode_zh(chunk, charset):
    """Decode *chunk* with *charset*; return it only if it is one is_zh() char.

    Returns None when decoding fails or the result is not a single
    character accepted by is_zh().
    """
    try:
        text = chunk.decode(charset)
    except UnicodeDecodeError:
        return None
    return text if is_zh(text) else None


def _decode_mixed(buf):
    """Decode a bytearray whose Chinese characters mix UTF-8 and GB18030."""
    pieces = []
    pos = 0
    size = len(buf)
    while pos < size:
        # Bytes up to 127 are letters, digits and other ASCII: copy as is.
        if buf[pos] <= 127:
            pieces.append(buf[pos:pos + 1].decode('ascii'))
            pos += 1
            continue
        # Take 3 bytes and test them as UTF-8, then 2 bytes as GB18030.  A
        # window only counts when it decodes AND is_zh() accepts the result.
        for charset, width in _MIXED_CANDIDATES:
            zh = _decode_zh(buf[pos:pos + width], charset)
            if zh is not None:
                pieces.append(zh)
                pos += width
                break
        else:
            raise UnknownCharsetError('Unknown charset_char')
    return u''.join(pieces)


def mdcode_char(str, encoding='utf-8'):
    """Re-encode a byte string that mixes UTF-8 and GB18030 characters.

    ASCII bytes are copied through.  Every other position is tried as a
    3-byte UTF-8 sequence and then as a 2-byte GB18030 sequence; the first
    one that decodes to a character accepted by is_zh() wins.  Sequences of
    any other width (2- or 4-byte UTF-8, 4-byte GB18030) are not recognised.

    Args:
        str: Byte string to convert.  Text input is treated as already
            decoded.  (The name shadows the builtin; it is kept for
            backward compatibility.)
        encoding: Target codec name, or ``'unicode'`` to get text back.

    Returns:
        Bytes in *encoding*, or text when *encoding* is ``'unicode'``.

    Raises:
        UnknownCharsetError: a non-ASCII position matches neither encoding.
    """
    data = str
    if isinstance(data, _TEXT_TYPE):
        text = data
    else:
        text = _decode_mixed(bytearray(data))
    # Encode the joined text once, so codecs that emit a BOM (e.g. utf-16)
    # do not emit one per character.
    return text if encoding == 'unicode' else text.encode(encoding)


if __name__ == '__main__':
    # Test string, explicitly encoded as GBK so the demo does not depend on
    # the encoding the source file happens to be saved in.
    sgbk = u"123abc中国汉字简体".encode('gbk')
    # Convert it to UTF-8.
    sutf8 = mdcode(sgbk)
    # This string contains both GBK and UTF-8 encoded Chinese characters.
    sall = sgbk + sutf8 + sgbk
    # Convert the mixed string to a single encoding.
    sresult = mdcode_char(sall, 'utf-8')
    # Tested on Windows, so the result is written out as GBK bytes.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(mdcode(sresult, 'gbk') + b'\n')

热门文章：

随意转换字符编码，字符编码,python转换字符编码

随意转换字符编码，字符编码,python转换字符编码

相关内容

最新python源码实例

python~HOT