Create Python codecs from Unicode mapping files,codecsmapping,'''PYTHON SO


# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
# --------------------------------------------
#
# 1. This LICENSE AGREEMENT is between the Python Software Foundation
# ('PSF'), and the Individual or Organization ('Licensee') accessing and
# otherwise using this software ('Python') in source or binary form and
# its associated documentation.
#
# 2. Subject to the terms and conditions of this License Agreement, PSF
# hereby grants Licensee a nonexclusive, royalty-free, world-wide
# license to reproduce, analyze, test, perform and/or display publicly,
# prepare derivative works, distribute, and otherwise use Python
# alone or in any derivative version, provided, however, that PSF's
# License Agreement and PSF's notice of copyright, i.e., 'Copyright (c)
# 2001, 2002, 2003, 2004 Python Software Foundation; All Rights Reserved'
# are retained in Python alone or in any derivative version prepared
# by Licensee.
#
# 3. In the event Licensee prepares a derivative work that is based on
# or incorporates Python or any part thereof, and wants to make
# the derivative work available to others as provided herein, then
# Licensee hereby agrees to include in any such work a brief summary of
# the changes made to Python.
#
# 4. PSF is making Python available to Licensee on an 'AS IS'
# basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
# IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
# INFRINGE ANY THIRD PARTY RIGHTS.
#
# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
#
# 6. This License Agreement will automatically terminate upon a material
# breach of its terms and conditions.
#
# 7. Nothing in this License Agreement shall be deemed to create any
# relationship of agency, partnership, or joint venture between PSF and
# Licensee.  This License Agreement does not grant permission to use PSF
# trademarks or trade name in a trademark sense to endorse or promote
# products or services of Licensee, or any third party.
#
# 8. By copying, installing or otherwise using Python, Licensee
# agrees to be bound by the terms and conditions of this License
# Agreement.

""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

"""

import marshal
import os
import re
import time

# Python 3 has no separate unichr(); its chr() already accepts any code point.
try:
    unichr
except NameError:
    unichr = chr

# Create numeric tables or character based ones ?
numeric = 1

# One mapping line: <encoded code(s)> <whitespace> <unicode code(s)> [# comment]
# Codes are 0x-prefixed hex numbers joined by '+'; the Unicode side may also
# contain meta-codes in angular brackets.
# NOTE(review): the character class 'a-fA-Z' in the third fragment looks like a
# typo for 'a-fA-F'.  It is kept unchanged on purpose: with the wider class a
# malformed code still matches and then fails loudly in int(..., 16) instead of
# being silently treated as "no mapping".
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

# Header of every generated codec module; '%s' receives the map name.
_CODEC_HEADER = '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
'''

# Trailer of every generated codec module.
_CODEC_FOOTER = '''
### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)
'''


def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hex numbers joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        In a combination, meta-codes (in angular brackets, e.g. <LR>
        and <RL>) and other parts that are not hex numbers are
        ignored.  A combination that reduces to one code is returned
        as a plain integer; one that reduces to nothing is returned
        as None, just like an empty string.

        A string without any '+' is converted directly, so a single
        illegal code raises ValueError.

        The len, filter and range parameters are only kept so that
        the signature stays compatible; filter and range are unused.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part (trailing '+'): drop it.
            pass
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)


def readmap(filename):

    """ Parses the mapping file filename and returns a dictionary
        mapping encoded codes to (unicode_code, comment) tuples.

        Blank lines, lines starting with '#' and lines not matching
        mapRE are skipped.  If a code is listed more than once, the
        last line wins.

        If there are at least as many identity-mapped single byte
        codes as unmapped ones, the identity entries are left out,
        every unmapped byte is mapped to (None, '') and the extra key
        'IDENTITY' is set to 256, which tells codegen() to start from
        an identity dictionary.  Otherwise the identity entries are
        stored explicitly.

        Raises ValueError if a code cannot be converted (see
        parsecodes()).

    """
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = {}               # single byte code -> comment
    unmapped = set(range(256))
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:]
        # Multi-code keys are tuples and never count as single bytes.
        if not isinstance(enc, tuple) and enc < 256:
            unmapped.discard(enc)
            if enc == uni:
                identity[enc] = comment
                enc2uni.pop(enc, None)
                continue
            identity.pop(enc, None)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in sorted(unmapped):
            enc2uni[enc] = (None, '')
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # mappings have to be listed like any other entry.
        for enc, comment in identity.items():
            enc2uni[enc] = (enc, comment)
    return enc2uni


def hexrepr(t):

    """ Returns t as hex literal source text: 'None' for None,
        '0x%04x' for a single integer and a parenthesized, comma
        separated list of such literals for a sequence of integers.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        return '0x%04x' % t
    return '(' + ', '.join(['0x%04x' % code for code in t]) + ')'


def unicoderepr(t):

    """ Returns source text for the Unicode side of a mapping: hex
        literals if the global numeric is true, otherwise the repr()
        of the corresponding character string.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        return repr(unichr(t))
    return repr(''.join([unichr(code) for code in t]))


def keyrepr(t):

    """ Returns source text for the encoded side of a mapping: hex
        literals if the global numeric is true, otherwise the repr()
        of the corresponding character string.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    return repr(''.join([chr(code) for code in t]))


def _sortkey(item):

    """ Sort key for (code, value) items: plain integers first, then
        tuples, which is the order Python 2 gave for mixed keys.

    """
    code = item[0]
    return (isinstance(code, tuple), code)


def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map is a dictionary as returned by readmap(); it is not
        modified.  Values may be (unicode_code, comment) tuples or
        bare codes.

        Comments are included in the source, if comments is true (default).

    """
    # Work on a copy so that the caller's 'IDENTITY' marker survives.
    entries = dict(map)
    l = [_CODEC_HEADER % name]
    append = l.append

    if 'IDENTITY' in entries:
        append('decoding_map = codecs.make_identity_dict(range(%d))'
               % entries.pop('IDENTITY'))
        append('decoding_map.update({')
        splits = 1
    else:
        append('decoding_map = {')
        splits = 0

    i = 0
    for e, value in sorted(entries.items(), key=_sortkey):
        try:
            (u, c) = value
        except TypeError:
            # Bare code without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key, unicoderepr(u), c))
        else:
            append('\t%s: %s,' % (key, unicoderepr(u)))
        i += 1
        if i == 4096:
            # Split the definition into parts to that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
    append(_CODEC_FOOTER)
    return '\n'.join(l)


def pymap(name,map,pyfile,comments=1):

    """ Writes the codec source generated by codegen() for map to
        the file pyfile.

    """
    code = codegen(name, map, comments)
    with open(pyfile, 'w') as f:
        f.write(code)


def marshalmap(name,map,marshalfile):

    """ Writes the entries of map as marshalled dictionary to the
        file marshalfile.  name is not used.

        The 'IDENTITY' marker set by readmap() is not written.
        NOTE(review): this matches the files written so far, but it
        means the identity mappings of such a map cannot be
        recovered from the .mapping file.

    """
    d = {}
    for e, value in map.items():
        if e == 'IDENTITY':
            continue
        (u, c) = value
        d[e] = (u, c)
    with open(marshalfile, 'wb') as f:
        marshal.dump(d, f)


def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every file, <prefix><mapname>.py and
        <prefix><mapname>.mapping are written, with <mapname> being
        the part of the filename before the first dot, lowercased
        and with hyphens replaced by underscores.  Files giving an
        empty map and files raising ValueError during conversion
        are reported and skipped.

    """
    for mapname in os.listdir(dir):
        name = os.path.split(mapname)[1]
        name = name.replace('-', '_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir, mapname))
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile, comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')


def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates the codec modules from the .mapping files found
        in the directory dir, writing <prefix><mapname>.py for each
        of them.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # NOTE: marshal is not safe against malicious data; only
            # use this on .mapping files written by this tool.
            with open(os.path.join(dir, mapname), 'rb') as f:
                mapping = marshal.load(f)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile, comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)


if __name__ == '__main__':

    import sys
    # Flip the condition to regenerate codecs from .mapping files instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])

评论关闭