python实现支持unicode中文的AC自动机

文章由Byrx.net分享于2019-03-22 11:03:35评论（118）

python实现支持unicode中文的AC自动机

最近开始从分析数据，要从大量短文本中匹配很多关键字，如果暴力find的话，发现CPU成为了瓶颈，于是想到了AC自动机

AC自动机是多模式匹配的一个经典数据结构，原理是和KMP一样的构造fail指针，不过AC自动机是在Trie树上构造的，但原理是一样的。

为了能够匹配unicode，我讲unicode编码之后，按照每4位进行索引，变成了16叉trie树。

其实这种事情应该用C/C++来写的，不过我不太会使用python调用c，所以就用python改写了

下面是代码

[python] #coding=utf-8

KIND = 16
#BASE = ord('a')

class Node():
    def __init__(self):
        self.fail = None
        self.next = [None]*KIND
        self.end = False

class AC_Automachine():
    def __init__(self):
        self.root = Node()
        self.queue = []

    def getIndex(self,char):
        return ord(char)# - BASE

    def insert(self,string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] == None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True

    def build_automachine(self):
        self.root.fail = None
        self.queue.append(self.root)
        while len(self.queue)!=0:
            parent = self.queue[0]
            self.queue.pop(0)
            for i,child in enumerate(parent.next):
                if child == None:continue
                if parent == self.root:
                    child.fail = self.root
                else:
                    failp = parent.fail
                    while failp != None:
                        if failp.next[i] != None:
                            child.fail = failp.next[i]
                            break
                        failp = failp.fail
                    if failp==None: child.fail=self.root
                self.queue.append(child)

    def matchOne(self,string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index]==None and p!=self.root: p=p.fail
            if p.next[index]==None:p=self.root
            else: p=p.next[index]
            if p.end:return True
        return False



class UnicodeAC_AutoMachine():
    def __init__(self):
        self.ac = AC_Automachine()

    def getAcString(self,string):
        string = bytearray(string.encode('utf-8'))
        ac_string = ''
        for byte in string:
            ac_string += chr(byte%16)
            ac_string += chr(byte/16)
        #print ac_string
        return ac_string

    def insert(self,string):
        if type(string) != unicode:
            raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
        ac_string = self.getAcString(string)
        self.ac.insert(ac_string)

    def build_automachine(self):
        self.ac.build_automachine()

    def matchOne(self,string):
        if type(string) != unicode:
            raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
        ac_string = self.getAcString(string)
        return self.ac.matchOne(ac_string)


def main2():
    ac = UnicodeAC_AutoMachine()
    ac.insert(u'丁亚光')
    ac.insert(u'好吃的')
    ac.insert(u'好玩的')
    ac.build_automachine()
    print ac.matchOne(u'hi,丁亚光在干啥')
    print ac.matchOne(u'ab')
    print ac.matchOne(u'不能吃饭啊')
    print ac.matchOne(u'饭很好吃，有很多好好的吃的，')
    print ac.matchOne(u'有很多好玩的')

if __name__ == '__main__':
    main2()

#coding=utf-8

KIND = 16
#BASE = ord('a')

class Node():
    def __init__(self):
        self.fail = None
        self.next = [None]*KIND
        self.end = False

class AC_Automachine():
    def __init__(self):
        self.root = Node()
        self.queue = []

    def getIndex(self,char):
        return ord(char)# - BASE

    def insert(self,string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] == None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True

    def build_automachine(self):
        self.root.fail = None
        self.queue.append(self.root)
        while len(self.queue)!=0:
            parent = self.queue[0]
            self.queue.pop(0)
            for i,child in enumerate(parent.next):
                if child == None:continue
                if parent == self.root:
                    child.fail = self.root
                else:
                    failp = parent.fail
                    while failp != None:
                        if failp.next[i] != None:
                            child.fail = failp.next[i]
                            break
                        failp = failp.fail
                    if failp==None: child.fail=self.root
                self.queue.append(child)

    def matchOne(self,string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index]==None and p!=self.root: p=p.fail
            if p.next[index]==None:p=self.root
            else: p=p.next[index]
            if p.end:return True
        return False

class UnicodeAC_AutoMachine():
    def __init__(self):
        self.ac = AC_Automachine()

    def getAcString(self,string):
        string = bytearray(string.encode('utf-8'))
        ac_string = ''
        for byte in string:
            ac_string += chr(byte%16)
            ac_string += chr(byte/16)
        #print ac_string
        return ac_string

    def insert(self,string):
        if type(string) != unicode:
            raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
        ac_string = self.getAcString(string)
        self.ac.insert(ac_string)

    def build_automachine(self):
        self.ac.build_automachine()

    def matchOne(self,string):
        if type(string) != unicode:
            raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
        ac_string = self.getAcString(string)
        return self.ac.matchOne(ac_string)

def main2():
    ac = UnicodeAC_AutoMachine()
    ac.insert(u'丁亚光')
    ac.insert(u'好吃的')
    ac.insert(u'好玩的')
    ac.build_automachine()
    print ac.matchOne(u'hi,丁亚光在干啥')
    print ac.matchOne(u'ab')
    print ac.matchOne(u'不能吃饭啊')
    print ac.matchOne(u'饭很好吃，有很多好好的吃的，')
    print ac.matchOne(u'有很多好玩的')

if __name__ == '__main__':
main2()

下面是测试结果

关键词数是2000，长度2-12

文本长度为60-80，分别测试4000条文本和300000条文本的速度

速度分别为 5.5s 和 619.8s

如下图，挺不错的

摘自 New Day New Plan

热门文章：

python实现支持unicode中文的AC自动机

python实现支持unicode中文的AC自动机

相关内容

最新python教程

python~HOT