Python corpus processing: read every file under a folder tree, segment with jieba, remove stopwords, and drop single characters


# -*- coding: utf-8 -*-
import os
import jieba

def splitSentence(inputFile):
    """Segment one file with jieba, dropping stopwords and single characters."""
    global fout                                      # shared output file handle, opened once below
    global stop                                      # shared stopword set, loaded once below
    with open(inputFile, 'r', encoding='utf-8', errors='ignore') as fin:
        for eachLine in fin:
            line = eachLine.strip()                  # strip whitespace/newlines from each line
            wordList = list(jieba.cut(line))         # segment the line with jieba
            #wordList = list(jieba.cut_for_search(line))
            outStr = ''
            for word in wordList:
                if len(word) > 1:                    # drop single characters
                    if word not in stop:             # drop stopwords
                        outStr += word + ' '
            fout.write(outStr.strip())               # write the segmented result
            fout.write('\n')

#path = r'/media/软件/zhuomian/VARandLDAr/train'    # r'D:/zhuomian/VARandLDA/train'
path = '/home/xdj/train'
fns = [os.path.join(root, fn) for root, dirs, files in os.walk(path) for fn in files]
stop = set(line.strip() for line in open('/home/xdj/chstop.txt', encoding='utf-8', errors='ignore'))
fout = open('myOutput.txt', 'w', encoding='utf-8')
fout.write('%d\n' % len(fns))                        # first output line: number of files processed
for f in fns:
    splitSentence(f)
#splitSentence('/home/xdj/train/C3-Art/C3-Art1459.txt', 'myOutput.txt')
print(len(fns))
fout.close()
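
To see what the filtering step does on one sentence, here is a minimal, self-contained sketch. The sample sentence and the two-word stopword set are invented for illustration (the real script loads its stopwords from /home/xdj/chstop.txt), and the exact tokens depend on jieba's dictionary and version:

import jieba

stop = {'我们', '可以'}                               # hypothetical mini stopword set
line = '我们可以用结巴对中文语料进行分词'
words = [w for w in jieba.cut(line) if len(w) > 1 and w not in stop]
print(' '.join(words))
# expected output, roughly: 结巴 中文 语料 进行 分词
# ('用' and '对' fall to the single-character filter; '我们'/'可以' to the stopword set)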

