统计英文词频(适用于 Python 2.7)


python2.7

适用于统计英文词频

# -*- encoding: utf-8 -*-
# by sorcerdu
# Basic features and usage are described in the prompt printed at start-up.
# How it works: the text is split on delimiters into a list, then the list is
# tallied into a dict whose keys are words and whose values are their counts.
# To count Chinese word frequencies the text must be word-segmented first.
import codecs
import os
import string
import sys
import time
from collections import Counter


def _load_tags(path):
    """Return every whitespace-separated delimiter found in the file *path*.

    All lines are read (not just the last one).  ``str.split()`` without an
    argument discards the trailing newline and never yields an empty string,
    so every returned tag is safe to pass to ``str.split(tag)``.
    """
    tags = []
    with open(path, 'r') as tagf:
        for line in tagf:
            tags.extend(line.split())
    return tags


def _split_on_tags(words, tags):
    """Split each word on each tag in turn and drop the empty pieces.

    A new list is built on every pass instead of deleting from the list that
    is being indexed, so no word is skipped and a word made only of
    delimiters (e.g. ``...``) simply disappears.
    """
    for tag in tags:
        pieces = []
        for word in words:
            pieces.extend(piece for piece in word.split(tag) if piece)
        words = pieces
    return words


def readfile():
    """Read ``base.txt`` and return its words as a list of strings.

    Each line is split on single spaces; tab and newline characters are
    removed from every token and empty tokens are discarded.  The remaining
    tokens are then split further on every delimiter listed in ``tag.txt``
    (delimiters are separated by whitespace in that file).

    Both files are looked up in the current working directory.

    Returns:
        list: the words in reading order, duplicates included.

    Raises:
        IOError/OSError: if ``base.txt`` or ``tag.txt`` cannot be opened.
    """
    tags = _load_tags('tag.txt')
    wordlist = []
    with open('base.txt', 'r') as base:
        for line in base:
            for word in line.split(' '):
                # Tabs/newlines are stripped out of the token, not used as
                # separators (same as the original behaviour).
                word = word.replace('\t', '').replace('\n', '')
                if word:
                    wordlist.append(word)
    return _split_on_tags(wordlist, tags)


def getstr(word, count, allwordnum):
    """Format one result line: ``word--------count--------allwordnum``.

    Args:
        word: the word (a string).
        count: number of occurrences of *word*.
        allwordnum: total number of words counted.

    Returns:
        str: the three fields joined by ``--------``.
    """
    return '--------'.join([word, str(count), str(allwordnum)])


if __name__ == "__main__":
    wordlist = readfile()
    allwordnum = len(wordlist)
    wordcnt = Counter(wordlist)

    # Single-argument print(...) is valid on both Python 2.7 and Python 3.
    print('******************************************')
    print(u'提示:')
    print(u'     1、要统计的文章放置于本程序路径下的base.txt中')
    print(u'     2、单词分割符存放在本程序路径下的tag.txt中,以空格为分隔符,默认已对换码符,换行符,空格,句号(英文)处理')
    print(u'     3、统计的结果保存在本程序路径下的count.txt中')
    print('******************************************')
    print(u"开始统计咯......")
    print('------------------------------------------------------------------------')

    with open('count.txt', 'w') as outdata:
        for word, cnt in wordcnt.items():
            line = getstr(word, cnt, allwordnum)
            print(line)
            # One result per line in count.txt.
            outdata.write(line + '\n')

    print('------------------------------------------------------------------------')
    print(u"完成")
    print(u'按任意键退出')
    # "pause" is a Windows shell built-in; it keeps the console window open.
    os.system("pause")
# This snippet comes from http://byrx.net

评论关闭