Python读取html中指定元素生成excel文件


#coding=gbk
# Read the <td> cells of every .html file under DATAPATH and export one row
# per file into an .xls workbook saved next to the input files.
#
# NOTE(review): the original used Python 2 print statements; cell text is kept
# as GBK-encoded byte strings (see the .encode("gbk") call below) and this
# byte-string handling has not been ported to Python 3.
import codecs
import logging
import os
import string
import sys
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook


class LogMsg:
    """Attach a file handler to the root logger and write messages through it.

    Messages are emitted at the logger's current effective level when that is
    one of the five standard levels, and at INFO otherwise.
    """

    # Accepted numeric levels; any other value passed to __init__ means NOTSET.
    _LEVELS = {
        10: logging.DEBUG,
        20: logging.INFO,
        30: logging.WARNING,
        40: logging.ERROR,
        50: logging.CRITICAL,
    }

    def __init__(self, logfile, Level=0):
        """Open *logfile* and set the root logger threshold.

        logfile -- path of the log file to append to.
        Level   -- 10/20/30/40/50 select DEBUG..CRITICAL; anything else
                   selects NOTSET.

        Prints "log init error!" and exits with status 1 on failure.
        """
        try:
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(self._LEVELS.get(Level, logging.NOTSET))
        except Exception:
            print("log init error!")
            sys.exit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's effective level (INFO as fallback).

        Prints "log output error!" and exits with status 1 on failure.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            self.logger.log(self._LEVELS.get(Level, logging.INFO), logInfo)
        except Exception:
            print("log output error!")
            sys.exit(1)

    def close(self):
        """Detach the file handler from the logger and close the log file.

        Prints "log closed error!" and exits with status 1 on failure.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() alone leaves the underlying file open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            sys.exit(1)


Logtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
logFileTime = time.strftime("%Y%m%d", time.localtime())
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_' + Logtime + '.xls'

# Column titles written to row 0 of the sheet, in column order.
_HEADERS = ('内容类型 ', '栏目名称', '栏目编号', '内容名称', '时长', '关键字',
            '看点', '作者', '来源', '子内容1', '子内容2')

# The highest cell index read for a row is 36, so at least 37 cells are needed.
_MIN_CELLS = 37


def _read_cells(path):
    """Return the GBK-encoded text of every <td> in the GBK file at *path*.

    The file is parsed one line at a time; whitespace-only lines are logged
    and skipped.
    """
    with codecs.open(path, 'r', 'gbk') as htmlFile:
        lines = htmlFile.readlines()
    if not lines:
        log.output('not line')

    cells = []
    for line in lines:
        # strip() removes the newline too, so an empty result means blank.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): this removes every space, including those inside
        # tags; the original may have targeted '&nbsp;' before the page was
        # scraped -- confirm against the real source.
        line = line.replace(' ', '')
        soup = BeautifulSoup(line)
        for tdd in soup.findAll('td'):
            cells.append(tdd.text.encode("gbk"))
    return cells


def main():
    """Build the workbook from all .html files in DATAPATH and save it."""
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue

        log.output('当前文件:' + f)
        content = _read_cells(DATAPATH + f)

        # Dump every extracted cell with its position.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        # Skip files that lack the expected cells instead of aborting the
        # whole export with an IndexError before the workbook is saved.
        if len(content) < _MIN_CELLS:
            log.output('skip %s: expected at least %d td cells, found %d'
                       % (f, _MIN_CELLS, len(content)))
            continue

        digits = ''.join(ch for ch in content[16] if ch.isdigit())
        if not digits:
            log.output('skip %s: no digits in duration cell' % f)
            continue

        folderName = content[6]
        contentName = content[4]
        # NOTE(review): presumably minutes converted to seconds -- confirm.
        str_duration = str(int(digits) * 60)
        # NOTE(review): same cell as folderName -- confirm this is intended.
        keyWord = content[6]
        description = content[36]
        videoName_1 = content[10]

        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(description)
        print(videoName_1)
        log.output('输出xls数据:' + ',' + folderName + ',,' + contentName + ','
                   + str_duration + ',' + keyWord + ',' + description
                   + ',管理员,华数编辑,' + videoName_1 + ',,')
        print(k)

        row = ('', folderName, '', contentName, str_duration, keyWord,
               description, '管理员', '华数编辑', videoName_1, '')
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)
    print('=========================================')


if __name__ == '__main__':
    try:
        main()
    finally:
        log.close()
# This snippet comes from http://byrx.net

评论关闭