当前位置 博文首页 > 使用python解析xml成对应的html示例分享

    使用python解析xml成对应的html示例分享

    作者:admin 时间:2021-07-03 18:42

    使用SAX将ddt.xml解析成html。当然,如果已有与该xml对应的xsl文件,也可以直接用libxml2将其转换成html。

    复制代码 代码如下:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #---------------------------------------
    #   程序:XML解析器
    #   版本:01.0
    #   作者:mupeng
    #   日期:2013-12-18
    #   语言:Python 2.7
    #   功能:将xml解析成对应的html
    #   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
    #   继承ContentHandler并重写其事件处理函数
    #   Dispatcher主要用于相应标签的起始、结束事件的派发
    #---------------------------------------
    from xml.sax.handler import ContentHandler
    from xml.sax import parse

    class Dispatcher:
        """Mixin that forwards SAX element events to per-tag handler methods.

        For an element ``foo`` the start event looks for a ``startFoo``
        method and the end event for ``endFoo``. When no such callable
        exists, ``defaultStart`` / ``defaultEnd`` is used instead, if it
        is defined. Handlers are always invoked without arguments.
        """

        def dispatch(self, prefix, name, attrs=None):
            """Call the handler for event *prefix* on element *name*.

            prefix -- 'start' or 'end', as passed by the two event methods.
            name   -- element name; only its capitalize()d form is used.
            attrs  -- accepted for interface compatibility, not forwarded.
            """
            specific = prefix + name.capitalize()
            fallback = 'default' + prefix.capitalize()

            handler = getattr(self, specific, None)
            if not callable(handler):
                # No tag-specific handler: fall back to the generic one.
                handler = getattr(self, fallback, None)
            if callable(handler):
                handler()

        def startElement(self, name, attrs):
            """SAX callback: route an opening tag through dispatch()."""
            self.dispatch('start', name, attrs)

        def endElement(self, name):
            """SAX callback: route a closing tag through dispatch()."""
            self.dispatch('end', name)

    class Website(Dispatcher, ContentHandler):
        """SAX handler that writes an HTML rendering of a feed to ddt_SAX.html.

        Element events arrive through Dispatcher, which calls the
        ``start<Tag>`` / ``end<Tag>`` methods below. Character data is
        collected in ``self.temp`` and consumed by the matching ``end<Tag>``
        handler, which must also reset the buffer so text never leaks into
        the next element.

        The file header states Python 2.7: text is encoded to gb2312 and
        written to a file opened in plain 'w' mode.
        """

        def __init__(self):
            # ContentHandler is an old-style class on Python 2, so its
            # initializer is called explicitly rather than through super().
            ContentHandler.__init__(self)
            # Output file; closed by endChannel().
            self.fout = open('ddt_SAX.html', 'w')
            # True while inside an <image> element.
            self.imagein = False
            # Not referenced anywhere else in this class.
            self.desflag = False
            # True while inside an <item> element.
            self.item = False
            # Most recently captured text of each element.
            self.title = ''
            self.link = ''
            self.guid = ''
            self.url = ''
            self.pubdate = ''
            self.description = ''
            # Buffer for character data of the element being parsed.
            self.temp = ''
            # Not referenced anywhere else in this class.
            self.prx = ''

        def startChannel(self):
            """Open the HTML document up to the start of the <title> text."""
            self.fout.write('''<html>\n<head>\n<title> RSS-''')

        def endChannel(self):
            """Write the closing markup and script, then close the output file."""
            self.fout.write('''
                        <tr><td height="20"></td></tr>
                        </table>
                        </center>
                        <script>
        function  GetTimeDiff(str)
        {
         if(str == '')
         {
          return '';
         }

         var pubDate = new Date(str);
         var nowDate = new Date();
         var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();
         var days = diffMilSeconds/86400000;
         days = parseInt(days);

         diffMilSeconds = diffMilSeconds-(days*86400000);
         var hours = diffMilSeconds/3600000;
         hours = parseInt(hours);

         diffMilSeconds = diffMilSeconds-(hours*3600000);
         var minutes = diffMilSeconds/60000;
         minutes = parseInt(minutes);

         diffMilSeconds = diffMilSeconds-(minutes*60000);
         var seconds = diffMilSeconds/1000;
         seconds = parseInt(seconds);

         var returnStr = "±±¾©·¢²¼Ê±¼ä£º" + pubDate.toLocaleString();

         if(days > 0)
         {
          returnStr = returnStr + "&nbsp;£¨¾àÀëÏÖÔÚ" + days + "Ìì" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";
         }
         else if (hours > 0)
         {
          returnStr = returnStr + "&nbsp;£¨¾àÀëÏÖÔÚ" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";
         }
         else if (minutes > 0)
         {
          returnStr = returnStr + "&nbsp;£¨¾àÀëÏÖÔÚ" + minutes + "·ÖÖÓ£©";
         }

         return returnStr;

        }

        function GetSpanText()
        {
         var pubDate;
         var pubDateArray;
         var spanArray = document.getElementsByTagName("span");

         for(var i = 0; i < spanArray.length; i++)
         {
          pubDate = spanArray[i].innerHTML;
          document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate);   
         }
        }

        GetSpanText();
       </script>
                    </body>
                    </html>
                    ''')
            self.fout.close()

        def characters(self, chars):
            """Append a chunk of character data to the text buffer.

            Chunks that consist only of whitespace are skipped.
            """
            if chars.strip():
                self.temp += chars

        def startTitle(self):
            """Open the table row that holds an item's title."""
            if self.item:
                self.fout.write('''
                            <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<B>
                        ''')

        def endTitle(self):
            """Consume the buffered title text and emit it where appropriate.

            Outside of items and images the title finishes the <head> and
            opens the page header table; inside an item it becomes the item
            heading. A title inside an <image> is not written.
            """
            text = self.temp
            # Always reset the buffer, including for an <image> title;
            # otherwise that text would be prepended to the image link.
            self.temp = ''
            if self.imagein and not self.item:
                return

            self.title = text
            self.fout.write(self.title.encode('gb2312'))
            if not self.item:
                self.fout.write('''
                    </title>\n</head>\n<body>\n<center>\n
                    <script>\n

                            function copyLink()
                            {
                                    clipboardData.setData("Text",window.location.href);
                                    alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼ôÌù°å");
                            }

                            function subscibeLink()
                            {
                                    var str = window.location.pathname;
                                    while(str.match(/^\//))
                                    {
                                            str = str.replace(/^\//,"");
                                    }
                                    window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");

                            }
                            </script>\n
                    <table width="750" cellpadding="0" cellspacing="0">\n
                    <tr>\n
                    <td align="right" style="padding-right:15px;" valign="bottom">\n
                ''')
            else:
                self.fout.write('''
                            </B>
                            </td>
                            </tr>
                            <tr bgcolor="#eeeeee">
                            <td style="padding-left:5px;">
                            ''')

        def startImage(self):
            """Mark that parsing is now inside an <image> element."""
            self.imagein = True

        def endImage(self):
            """Mark that the <image> element has ended."""
            self.imagein = False

        def startLink(self):
            """Open the anchor that wraps the image when inside <image>."""
            if self.imagein:
                self.fout.write('''<A href=" ''')

        def endLink(self):
            """Consume the buffered link text.

            Inside <image> it completes the anchor opened by startLink().
            Inside an item it is only remembered (endItem() writes it).
            Otherwise it completes the header anchor and writes the stored
            title and description.
            """
            self.link = self.temp
            self.temp = ''
            if self.imagein:
                self.fout.write(self.link.encode('gb2312'))
                self.fout.write('''" target="_blank">\n ''')
            elif not self.item:
                # Encoded like every other text field, so non-ASCII text
                # does not hit Python 2's implicit ASCII encoding.
                self.fout.write(self.link.encode('gb2312'))
                self.fout.write(''' " target="
          _blank
         "> ''')
                self.fout.write(self.title.encode('gb2312'))
                self.fout.write(''' </A></B></td>
                                </tr>
                                <tr><td colspan="2" align="center">
                                ''')
                self.fout.write(self.description.encode('gb2312'))
                self.fout.write('''
                            </td></tr>
                            <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copyLink();">¸´ÖÆ´ËÒ³Á´½Ó</a>                <a href="javascript:subscibeLink();">ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ã棨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©</a></b></td></tr>
                            </table>
                            <table width="750" cellpadding="0" cellspacing="0">
                                ''')

        def startUrl(self):
            """Open the <IMG> tag when inside <image>."""
            if self.imagein:
                self.fout.write('''<IMG src=" ''')

        def endUrl(self):
            """Consume the buffered URL; inside <image> finish the <IMG> tag."""
            self.url = self.temp
            self.temp = ''
            if self.imagein:
                self.fout.write(self.url.encode('gb2312'))
                self.fout.write('''" border="0">\n
                                </A>
                                </td>
                                <td align="left" valign="bottom" style="padding-bottom:8px;"><B><A href="
                                ''')

        def defaultStart(self):
            """Fallback for start tags without a dedicated handler: no-op."""
            pass

        def defaultEnd(self):
            """Fallback for end tags without a dedicated handler.

            Discards whatever text the unhandled element contributed.
            """
            self.temp = ''

        def startDescription(self):
            """Nothing to emit when a description opens."""
            pass

        def endDescription(self):
            """Store the description; inside an item also write it out."""
            self.description = self.temp
            self.temp = ''
            if self.item:
                self.fout.write(self.description.encode('gb2312'))

        def endGuid(self):
            """Store the guid text and reset the buffer."""
            self.guid = self.temp
            # Reset so the guid does not leak into the next element's text.
            self.temp = ''

        def endPubdate(self):
            """Store the publication date text and reset the buffer.

            Text starting with 'http' is rejected and the date left empty -
            presumably a guard against URL text left over in the buffer from
            a preceding element; TODO confirm.
            """
            if not self.temp.startswith('http'):
                self.pubdate = self.temp
            else:
                self.pubdate = ''
            # Clear the buffer on both paths.
            self.temp = ''

        def startItem(self):
            """Mark that parsing is now inside an <item> element."""
            self.item = True

        def endItem(self):
            """Close the item: write its link, guid and publication date rows."""
            self.item = False
            self.fout.write('''
                                </td>
                                </tr>
                                <tr bgcolor="#eeeeee">
                                <td style="padding-top:5px;padding-left:5px;">
                                <A href="''')
            self.fout.write(self.link.encode('gb2312'))
            self.fout.write(''' " target="_blank"> ''')
            self.fout.write(self.guid.encode('gb2312'))
            self.fout.write('''
                            </A>
                            </td>
                            </tr>
                            <tr bgcolor="#eeeeee">
                            <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')
            self.fout.write(self.pubdate.encode('gb2312'))
            self.fout.write('''</span></td>
                            </tr>
                            <tr height="10"><td></td></tr>''')

    # Program entry point: parse ddt.xml with a fresh Website handler.
    if __name__ == '__main__':
        handler = Website()
        parse('ddt.xml', handler)

    jsjbwy
    下一篇:没有了