当前位置 博文首页 > python读取html中指定元素生成Excel文件示例

    python读取html中指定元素生成Excel文件示例

    作者:admin 时间:2021-07-04 17:47

    Python2.7编写的读取html中指定元素,并生成Excel文件

    复制代码 代码如下:

    #coding=gbk
    import string
    import codecs
    import os,time
    import xlwt
    import xlrd
    from bs4 import BeautifulSoup
    from xlrd import open_workbook

    class LogMsg:
        """Small file-logging helper built on the root logger.

        Creating an instance attaches a ``logging.FileHandler`` for *logfile*
        to the logger returned by ``logging.getLogger()`` and sets that
        logger's level.  Records are written as ``[YYYYmmdd HH:MM:SS]: message``.

        Every method reports a failure by printing a short message and
        terminating the process with exit status 1.
        """

        # Numeric levels accepted by __init__ / dispatched by output():
        # DEBUG, INFO, WARNING, ERROR, CRITICAL.
        _KNOWN_LEVELS = (10, 20, 30, 40, 50)

        def __init__(self, logfile, Level=0):
            """Attach a file handler for *logfile* and apply *Level*.

            logfile -- path of the log file to write to.
            Level   -- one of 10/20/30/40/50; any other value selects NOTSET.
            """
            try:
                import logging
                self.logger = logging.getLogger()
                self.hdlr = logging.FileHandler(logfile)
                formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                              "%Y%m%d %H:%M:%S")
                self.hdlr.setFormatter(formatter)
                self.logger.addHandler(self.hdlr)
                if Level in self._KNOWN_LEVELS:
                    # int() keeps the stored level an integer even when the
                    # caller passed an equal float such as 20.0.
                    self.logger.setLevel(int(Level))
                else:
                    self.logger.setLevel(logging.NOTSET)
            except Exception:
                # Exception (not a bare except) so Ctrl-C and SystemExit
                # are not mistaken for a logging failure.
                print("log init error!")
                raise SystemExit(1)

        def output(self, logInfo):
            """Log *logInfo* at the logger's current effective level.

            When the effective level is not one of the five standard values
            the record is emitted at INFO.
            """
            Level = self.logger.getEffectiveLevel()
            try:
                if Level in self._KNOWN_LEVELS:
                    self.logger.log(Level, logInfo)
                else:
                    self.logger.info(logInfo)
            except Exception:
                print("log output error!")
                raise SystemExit(1)

        def close(self):
            """Detach the file handler from the logger and close its file."""
            try:
                self.logger.removeHandler(self.hdlr)
                # Removing the handler alone leaves the log file open;
                # close it so the descriptor is released.
                self.hdlr.close()
            except Exception:
                print("log closed error!")
                raise SystemExit(1)

    # Read the clock once so the log-file date and the spreadsheet timestamp
    # always describe the same instant (two separate localtime() calls could
    # fall on either side of a second or date boundary).
    _run_started = time.localtime()
    Logtime = time.strftime("%Y%m%d%H%M%S", _run_started)
    logFileTime = time.strftime("%Y%m%d", _run_started)
    # One log file per calendar day.
    Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
    # Level 20 is logging.INFO.
    log = LogMsg(Logfile,20)


    # Directory scanned for *.html input; the generated .xls is saved here too.
    DATAPATH = '/data/pyExample/'
    XLSname = 'dangjian_'+Logtime+'.xls'


    # Header row of the import template, in column order (column 0 first).
    # NOTE(review): the first title ends with a space in the original script;
    # it is kept unchanged in case the importer matches on it -- confirm.
    _XLS_HEADERS = (
        '内容类型 ',
        '栏目名称',
        '栏目编号',
        '内容名称',
        '时长',
        '关键字',
        '看点',
        '作者',
        '来源',
        '子内容1',
        '子内容2',
    )

    # Highest index read from the flattened <td> list when building a row.
    _LAST_CELL_INDEX = 36


    def _read_td_texts(path):
        """Return the GBK-encoded text of every <td> found in the file at *path*.

        The file is decoded as GBK and each line is handed to BeautifulSoup
        on its own; the cell texts are collected in document order.
        """
        # 'with' guarantees the file is closed even if reading fails.
        with codecs.open(path, 'r', 'gbk') as htmlFile:
            lines = htmlFile.readlines()
        if not lines:
            log.output('not line')

        cells = []
        for line in lines:
            # strip() also removes the newline, so a blank line is simply empty
            # (the old comparison against '\n' could never match).
            if not line.strip():
                log.output('该处是空行')
                continue
            # NOTE(review): this removes every space from the line before it is
            # parsed; the web copy of this script may have lost an '&nbsp;'
            # here -- confirm against the original source.
            line = line.replace(' ', '')
            soup = BeautifulSoup(line)
            for tdd in soup.findAll('td'):
                cells.append(tdd.text.encode("gbk"))
        return cells


    def _build_row(content):
        """Turn the flat <td> text list into the 11 values of one sheet row.

        Raises ValueError when the list is too short or the duration cell
        (index 16) contains no digits.
        """
        if len(content) <= _LAST_CELL_INDEX:
            raise ValueError('expected at least %d <td> cells, found %d'
                             % (_LAST_CELL_INDEX + 1, len(content)))
        digits = ''.join(ch for ch in content[16] if ch.isdigit())
        if not digits:
            raise ValueError('no digits in duration cell: %r' % content[16])
        # presumably minutes converted to seconds -- TODO confirm the unit
        str_duration = "%i" % (int(digits) * 60)

        folderName = content[6]      # also written to the keyword column
        contentName = content[4]
        desciption = content[36]
        videoName_1 = content[10]
        return ['', folderName, '', contentName, str_duration, folderName,
                desciption, '管理员', '华数编辑', videoName_1, '']


    def _main():
        """Convert every .html file in DATAPATH into one row of a new .xls file."""
        wbk = xlwt.Workbook(encoding = 'gbk')
        sheet = wbk.add_sheet('基本内容导入模板')
        for col, title in enumerate(_XLS_HEADERS):
            sheet.write(0, col, title)

        k = 0   # number of data rows written so far
        for f in os.listdir(DATAPATH):
            if os.path.splitext(f)[1] != '.html':
                continue
            log.output('当前文件:'+f)
            content = _read_td_texts(os.path.join(DATAPATH, f))

            # enumerate() reports the true position even when two cells hold
            # the same text (list.index() would return the first match only).
            for idx, cell in enumerate(content):
                print('%s , %s' % (idx, cell))
                log.output(cell)
                log.output(idx)
            print('----------------------------------------')

            try:
                row = _build_row(content)
            except ValueError as err:
                # Skip the unusable file instead of aborting the whole run and
                # losing the rows already collected.
                log.output('skip %s: %s' % (f, err))
                continue

            # Echo folder, content name, duration, keyword, description, video.
            for col in (1, 3, 4, 5, 6, 9):
                print(row[col])
            log.output('输出xls数据:' + ','.join(row) + ',')
            print(k)
            for col, value in enumerate(row):
                sheet.write(k+1, col, value)
            k += 1

        wbk.save(DATAPATH + XLSname)

        print('=========================================')


    if __name__ == '__main__':
        _main()

    jsjbwy
    下一篇:没有了