随意转换字符编码,字符编码,python转换字符编码
随意转换字符编码,字符编码,python转换字符编码
python转换字符编码utf8-gb18030
#!/bin/python
# Heuristic character-encoding conversion helpers (UTF-8 <-> GB18030/GBK).
#
# The script was originally written for Windows and saved as GBK.  This
# version keeps the source ASCII-only (non-ASCII sample text is written with
# \u escapes), so it no longer depends on the on-disk encoding of the file,
# and it runs on both Python 2 and Python 3.

try:
    _TEXT_TYPE = unicode  # Python 2: decoded text type
except NameError:
    _TEXT_TYPE = str  # Python 3: str is already decoded text

# Charsets tried, in order, when guessing the encoding of a whole byte string.
_CANDIDATE_CHARSETS = ('utf-8', 'gb18030', 'gbk', 'gb2312', 'utf-16')

# (charset, byte width) probes tried, in order, for one non-ASCII character.
# UTF-8 is probed first, so a 3-byte run that is valid UTF-8 wins over GB18030.
_CHAR_PROBES = (('utf-8', 3), ('gb18030', 2))

# Inclusive code point ranges accepted by is_zh().
_ZH_RANGES = (
    (0x2e80, 0x33ff),    # Punctuation & radicals
    (0xff00, 0xffef),    # Fullwidth Latin characters
    # CJK Unified Ideographs.  NOTE(review): the original comment also named
    # "Extension A", but this range does not include U+3400-U+4DBF.
    (0x4e00, 0x9fbb),
    (0xf900, 0xfad9),    # CJK Compatibility Ideographs
    (0x20000, 0x2a6d6),  # CJK Unified Ideographs Extension B
    (0x2f800, 0x2fa1d),  # CJK Compatibility Supplement
)


def is_zh(c):
    """Return True if the single character *c* lies in a known CJK range.

    Anything ``ord()`` rejects (empty string, more than one character,
    non-string objects) yields False instead of raising.
    """
    try:
        code = ord(c)
    except TypeError:
        return False
    return any(low <= code <= high for low, high in _ZH_RANGES)


def mdcode(str, encoding='utf-8'):
    """Convert *str* to *encoding*, guessing the source charset of bytes.

    Parameters:
        str: decoded text, or a byte string in one of the candidate charsets
            (utf-8, gb18030, gbk, gb2312, utf-16 -- tried in that order).
        encoding: target codec name, or the special value ``'unicode'`` to
            get decoded text back instead of bytes.

    Returns:
        Bytes in *encoding*, or text when *encoding* is ``'unicode'``.

    Raises:
        ValueError: no candidate charset could decode (and re-encode) the
            input.
    """
    data = str  # the parameter name shadows the builtin; use a local alias
    want_text = encoding == 'unicode'

    if isinstance(data, _TEXT_TYPE):
        # 'unicode' is not a real codec, so text is returned unchanged.
        return data if want_text else data.encode(encoding)

    for charset in _CANDIDATE_CHARSETS:
        try:
            text = data.decode(charset)
            return text if want_text else text.encode(encoding)
        except UnicodeError:
            # Wrong guess (or not representable in the target): try the next.
            continue
    raise ValueError('Unknown charset')


def _try_decode(chunk, charset):
    """Decode *chunk* with *charset*; return None if the bytes are invalid."""
    try:
        return chunk.decode(charset)
    except UnicodeDecodeError:
        return None


def mdcode_char(str, encoding='utf-8'):
    """Convert a byte string that mixes UTF-8 and GB18030 text to *encoding*.

    The input is scanned left to right.  Bytes <= 127 are taken as ASCII.
    For any other byte, the next 3 bytes are tried as one UTF-8 character and
    then the next 2 bytes as one GB18030 character; a probe is accepted only
    if it decodes and ``is_zh`` recognises the result.

    Parameters:
        str: byte string to convert (decoded text is encoded directly).
        encoding: target codec name, or ``'unicode'`` to get text back.

    Returns:
        Bytes in *encoding*, or text when *encoding* is ``'unicode'``.

    Raises:
        ValueError: a non-ASCII byte sequence matched neither probe.
    """
    data = str  # the parameter name shadows the builtin; use a local alias
    want_text = encoding == 'unicode'

    if isinstance(data, _TEXT_TYPE):
        return data if want_text else data.encode(encoding)

    pieces = []
    pos = 0
    size = len(data)
    while pos < size:
        # Slicing (not indexing) gives a 1-byte string on Python 2 and 3.
        current = data[pos:pos + 1]
        if ord(current) <= 127:
            # ASCII: letters, digits, punctuation.
            pieces.append(current.decode('ascii'))
            pos += 1
            continue

        for charset, width in _CHAR_PROBES:
            candidate = _try_decode(data[pos:pos + width], charset)
            if candidate is not None and is_zh(candidate):
                pieces.append(candidate)
                pos += width
                break
        else:
            raise ValueError('Unknown charset_char')

    # Join first and encode once, so codecs that emit a BOM (e.g. utf-16)
    # produce a single BOM instead of one per character.
    text = u''.join(pieces)
    return text if want_text else text.encode(encoding)


if __name__ == '__main__':
    import sys

    # Sample string as GBK bytes ("123abc" followed by six Chinese characters).
    sgbk = u"123abc\u4e2d\u56fd\u6c49\u5b57\u7b80\u4f53".encode('gbk')
    # The same text converted to UTF-8.
    sutf8 = mdcode(sgbk)
    # A byte string that mixes GBK and UTF-8 encoded Chinese characters.
    sall = sgbk + sutf8 + sgbk

    # Normalise the mixed string to UTF-8.
    sresult = mdcode_char(sall, 'utf-8')

    # The demo targets a Windows console, so the output is written as GBK.
    # Use the binary layer of stdout when it exists (Python 3).
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(mdcode(sresult, 'gbk') + b'\n')
相关内容
- 约瑟夫环问题,约瑟夫环,[Python]代码de
- string reverse,reverse,[Python]代码de
- 1.1 a string has all unique characters? use arrays and strings,char
- 把图片列表合成一个GIF动画图片,图片列表gif动画,im
- 批量删除文件,,test111.py#
- urllib2获取抓取的数据信息,urllib2获取抓取,urlfile = ur
- solaris系统巡检程序,solaris巡检程序,python+shell
- 基于用户的推荐算法余弦相似性实现,用户算法余弦相
- python用来获得图片exif信息的库代码,,# library te
- 筛选冒烟用例,冒烟用例,在测试用例的管理上具有很
评论关闭