Python批量将Word转HTML,并将HTML内容发布至网站。(标签:pythonword)


# coding=utf-8
"""Batch-convert Word documents to HTML and publish the HTML to a web site.

Step 1 (``wordsToHtml``) drives Kingsoft WPS through COM to save every
``.doc``/``.docx`` file under a directory as an ``.html`` file next to it.

Step 2 (``html_add_to_db``) copies every ``.htm``/``.html`` file under a
directory (plus the local images it references) into the web server's static
directories and inserts an ``<iframe>`` snippet pointing at the copied page
into the ``common_article`` table.

The code is written to run on Python 2.7 (the original target) and to stay
syntactically valid on Python 3.
"""
__author__ = 'zhm'

import os
import random
import re
import time

import MySQLdb
from win32com import client as wc

# File names returned by os.walk() on Python 2 are byte strings; the original
# script decodes them (and image paths found in the HTML) as GBK.
_FS_ENCODING = "gbk"

# Second argument of Document.SaveAs.  8 is wdFormatHTML in the Word object
# model's WdSaveFormat enumeration.
# NOTE(review): assumes the WPS automation interface mirrors Word here.
_WD_FORMAT_HTML = 8

_WORD_SUFFIXES = ("doc", "docx")
_HTML_SUFFIXES = ("htm", "html")

# D:/files is the static directory configured on the web server.
_HTML_TARGET_DIR = 'D:/files/htmls/'
_IMAGE_TARGET_DIR = 'D:/files/images/whzx/'

# Captures the value of the src attribute of every <img> tag.  The pattern is
# a bytes pattern because the HTML is read and rewritten as raw bytes.
_IMG_SRC_RE = re.compile(
    br"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)


def _to_text(value, encoding=_FS_ENCODING):
    """Return *value* as text, decoding it with *encoding* if it is bytes."""
    if isinstance(value, bytes):
        return value.decode(encoding)
    return value


def _to_bytes(value, encoding=_FS_ENCODING):
    """Return *value* as bytes, encoding it with *encoding* if it is text."""
    if isinstance(value, bytes):
        return value
    return value.encode(encoding)


def _split_name(file_name):
    """Split *file_name* at its last dot into ``(stem, lower-cased suffix)``.

    The suffix is ``None`` when the name contains no dot at all.
    """
    dot_index = file_name.rfind(".")
    if dot_index == -1:
        return file_name, None
    return file_name[:dot_index], file_name[dot_index + 1:].lower()


def _unique_target(directory, extension):
    """Return a not-yet-existing path inside *directory*.

    The file name is ``<millisecond timestamp><random 100..10000><extension>``.
    A fresh timestamp is taken for every candidate, and candidates that already
    exist are rejected so an earlier upload is never overwritten.
    """
    while True:
        stamp = str(int(time.time() * 1000)) + str(random.randint(100, 10000))
        candidate = os.path.join(directory, stamp + extension)
        if not os.path.exists(candidate):
            return candidate


def _web_path(target):
    """Return the part of a ``D:/...`` style path after the drive colon."""
    return target.split(":")[1]


def _ensure_dir(directory):
    """Create *directory* (and its parents) when it does not exist yet."""
    if not os.path.exists(directory):
        os.makedirs(directory)


def _iframe_snippet(web_path):
    """Return the HTML stored in the database: an iframe showing *web_path*.

    The inline script resizes the iframe to the height of the loaded page.
    """
    return '''
                    <script type="text/javascript" language="javascript">
                        function iFrameHeight() {
                            var ifm= document.getElementById("iframepage");
                            var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
                            if(ifm != null && subWeb != null) {
                                ifm.height = subWeb.body.scrollHeight;
                            }
                        }
                    </script>
                    <iframe src=''' + web_path + '''
                     marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
                    '''


def _publish_images(html, folder):
    """Copy the local images referenced by *html* and rewrite their URLs.

    *html* is the page content as bytes and *folder* the directory the page
    lives in (image paths are resolved relative to it).  Every referenced image
    that exists on disk is copied into the image directory under a unique name
    and all occurrences of its original ``src`` value are replaced by the
    drive-less path of the copy.  References that do not resolve to a local
    file (remote URLs, missing files) are left untouched.

    Returns the rewritten page content as bytes.
    """
    handled = set()
    for src in _IMG_SRC_RE.findall(html):
        if src in handled:
            # Already replaced everywhere by an earlier iteration.
            continue
        handled.add(src)
        source = os.path.join(folder, _to_text(src))
        if not os.path.isfile(source):
            continue
        _ensure_dir(_IMAGE_TARGET_DIR)
        target = _unique_target(_IMAGE_TARGET_DIR, '.png')
        with open(source, 'rb') as src_file, open(target, 'wb') as dst_file:
            dst_file.write(src_file.read())
        html = html.replace(src, _to_bytes(_web_path(target)))
    return html


def _publish_page(cur, folder, name):
    """Publish one HTML file and insert its iframe wrapper via cursor *cur*.

    The page *name* inside *folder* is read, its images are published, the
    rewritten page is written to the HTML target directory under a unique
    name, and a row ``(title, iframe snippet)`` is inserted into
    ``common_article``.  The title is the file name without its extension.
    A database error is reported and swallowed so the remaining pages are
    still processed.
    """
    title = os.path.splitext(name)[0]
    with open(os.path.join(folder, name), 'rb') as html_file:
        content = html_file.read()
    content = _publish_images(content, folder)

    _ensure_dir(_HTML_TARGET_DIR)
    target = _unique_target(_HTML_TARGET_DIR, '.html')
    with open(target, 'wb') as out_file:
        out_file.write(content)

    # The article body is an iframe wrapping the converted html file.
    sql = "insert into common_article(title,content) values(%s,%s)"
    try:
        cur.execute(sql, (title, _iframe_snippet(_web_path(target))))
    except MySQLdb.Error as exc:
        print("Error: unable to insert data")
        print(repr(exc))


def wordsToHtml(dir):
    """Convert every Word document below *dir* to an HTML file beside it.

    Files whose suffix is ``doc`` or ``docx`` (case-insensitive) are opened in
    Kingsoft WPS and saved as ``<same name>.html`` in the same folder; all
    other files are skipped without being opened.  Each document is closed
    and the application is quit even when a conversion fails.
    """
    # Kingsoft WPS automation: preview builds register 'KWPS', release builds
    # register 'WPS' (per the original author's note).
    word = wc.Dispatch('KWPS.Application')
    try:
        for path, subdirs, files in os.walk(dir):
            folder = _to_text(path)
            for wordFile in files:
                name = _to_text(wordFile)
                stem, suffix = _split_name(name)
                if suffix is None:
                    print('********************ERROR: 未取得后缀名!')
                    continue
                if suffix not in _WORD_SUFFIXES:
                    continue
                htmlFullName = os.path.join(folder, stem + ".html")
                doc = word.Documents.Open(os.path.join(folder, name))
                try:
                    doc.SaveAs(htmlFullName, _WD_FORMAT_HTML)
                finally:
                    doc.Close()
                print(u'生成了html文件:' + htmlFullName)
    finally:
        word.Quit()
    print("")
    print("Finished!")


def html_add_to_db(dir):
    """Publish every converted HTML file below *dir* and record it in MySQL.

    For each ``.htm``/``.html`` file (case-insensitive) the page and its local
    images are copied to the web server's static directories and a row is
    inserted into ``common_article``.  All inserts are committed once at the
    end; the cursor and the connection are closed even when an error occurs.
    """
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='test',
        charset='utf8'
        )
    try:
        cur = conn.cursor()
        try:
            for path, subdirs, files in os.walk(dir):
                folder = _to_text(path)
                for htmlFile in files:
                    name = _to_text(htmlFile)
                    suffix = _split_name(name)[1]
                    if suffix is None:
                        print('********************ERROR: 未取得后缀名!')
                        continue
                    if suffix not in _HTML_SUFFIXES:
                        continue
                    _publish_page(cur, folder, name)
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the database connection.
        conn.close()


if __name__ == '__main__':
    wordsToHtml('d:/word')
    html_add_to_db('d:/word')

评论关闭