ホームページ >バックエンド開発 >Python チュートリアル >Python を使用して XML を解析して対応する HTML サンプルを共有する

Python を使用して XML を解析して対応する HTML サンプルを共有する

WBOY
WBOYオリジナル
2016-06-16 08:44:40 1413ブラウズ

使用SAX将ddt.xml解析成html。当然啦,如果得到了xml对应的xsl文件,可以直接用libxml2将其转换成html。

复制代码 代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
#   程序:XML解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:Python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
#   继承ContentHandler并重写其事件处理函数
#   Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element called ``name`` the start event looks up ``start<Name>``
    and the end event ``end<Name>``, where ``<Name>`` is ``name.capitalize()``
    (first character upper-cased, the rest lower-cased, so ``pubDate`` maps
    to ``startPubdate``).  When no such callable attribute exists, the
    generic ``defaultStart`` / ``defaultEnd`` is used if the instance has
    one; otherwise the event is silently ignored.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for one event; handlers receive no arguments.

        prefix -- event kind, used verbatim as the method-name prefix
                  ('start' or 'end' when called from this class).
        name   -- element name; capitalized to build the method-name suffix.
        attrs  -- accepted so callers can pass the SAX attributes, but it is
                  not forwarded to the handler.
        """
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to the generic one, if any.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback: dispatch the opening of element ``name``."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: dispatch the closing of element ``name``."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

    def __init__(self):
        self.fout = open('ddt_SAX.html', 'w')
        self.imagein = False
        self.desflag = False
        self.item = False
        self.title = ''
        self.link = ''
        self.guid = ''
        self.url = ''
        self.pubdate = ''
        self.description = ''
        self.temp = ''
        self.prx = ''
    def startChannel(self):

        self.fout.write('''\n\n RSS-''')<br><br>    def endChannel(self):<br>       self.fout.write('''<br>                    <tr><td height="20"></td></tr><br>                    </table><br>                    </center><br>                    <script><br>    function  GetTimeDiff(str)<br>    {<br>     if(str == '')<br>     {<br>      return '';<br>     }</p> <p>     var pubDate = new Date(str);<br>     var nowDate = new Date();<br>     var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();<br>     var days = diffMilSeconds/86400000;<br>     days = parseInt(days);</p> <p>     diffMilSeconds = diffMilSeconds-(days*86400000);<br>     var hours = diffMilSeconds/3600000;<br>     hours = parseInt(hours);</p> <p>     diffMilSeconds = diffMilSeconds-(hours*3600000);<br>     var minutes = diffMilSeconds/60000;<br>     minutes = parseInt(minutes);</p> <p>     diffMilSeconds = diffMilSeconds-(minutes*60000);<br>     var seconds = diffMilSeconds/1000;<br>     seconds = parseInt(seconds);<br><br>     var returnStr = "±±¾©·¢²¼Ê±¼ä£º" + pubDate.toLocaleString();</p> <p>     if(days > 0)<br>     {<br>      returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + days + "Ìì" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";<br>     }<br>     else if (hours > 0)<br>     {<br>      returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";<br>     }<br>     else if (minutes > 0)<br>     {<br>      returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + minutes + "·ÖÖÓ£©";<br>     }</p> <p>     return returnStr;</p> <p>    }</p> <p>    function GetSpanText()<br>    {<br>     var pubDate;<br>     var pubDateArray;<br>     var spanArray = document.getElementsByTagName("span");</p> <p>     for(var i = 0; i < spanArray.length; i++)<BR> {<BR> pubDate = spanArray[i].innerHTML;<BR> document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate); <BR> }<BR> }</P> <P> GetSpanText();<BR> </script><br>                </body><br>                </html><br>                ''')<br>       self.fout.close()</p> 
<p>    def characters(self, chars):<br>        if chars.strip():<br>            #chars = chars.strip()<br>            self.temp += chars<br>            #print self.temp<br><br>       <br>    def startTitle(self):<br><br>        if self.item:<br>            self.fout.write('''<br>                        <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<B><br>                    ''')<br><br>    def endTitle(self):<br><br>        if not self.imagein and not self.item:<br>            self.title = self.temp<br>            self.temp = ''<br>            self.fout.write(self.title.encode('gb2312'))<br><br>            #self.title = self.temp<br>            self.fout.write('''<br>                \n\n\n

\n
                <script>\n</p> <p>                        function copyLink()<br>                        {<br>                                clipboardData.setData("Text",window.location.href);<br>                                alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼ôÌù°å");<br>                        }</p> <p>                        function subscibeLink()<br>                        {<br>                                var str = window.location.pathname;<br>                                while(str.match(/^\//))<br>                                {<br>                                        str = str.replace(/^\//,"");<br>                                }<br>                                window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");</p> <p>                        }<br>                        </script>\n
                \n
                \n
               
                       
                       
                       
                           
                           
                       
                       
\n
            ''')

        if self.item:
            self.title = self.temp
            self.temp = ''
            self.fout.write(self.title.encode('gb2312'))
            self.fout.write('''
                       
                       

                        ''')

    def startImage(self):
        """Handle the start of an ``image`` element by raising ``imagein``.

        Other handlers in this class test ``imagein`` to decide whether a
        nested title/link/url belongs to the image.
        """
        self.imagein = True

    def endImage(self):
        """Handle the end of an ``image`` element by clearing ``imagein``."""
        self.imagein = False

    def startLink(self):
        if self.imagein:
            self.fout.write('''\n ''')
        elif self.item:
            #self.link = self.temp
            pass
        else:
            self.fout.write(self.link)
            self.fout.write(''' " target="
      _blank
     "> ''')
            self.fout.write(self.title.encode('gb2312'))
            self.fout.write('''


                            ''')
            self.fout.write(self.description.encode('gb2312'))
            self.fout.write('''
                       
¸´ÖÆ´ËÒ³Á´½Ó                ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ã棨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©

                       
                            ''')

    def startUrl(self):
        if self.imagein:
            self.fout.write('''\n
                           
                           
                           




''')

# Program entry point: parse ddt.xml, feeding the SAX events to a Website
# handler.
if __name__ == '__main__':
    parse('ddt.xml', Website())

声明:
この記事の内容はネチズンが自主的に寄稿したものであり、著作権は原著者に帰属します。このサイトは、それに相当する法的責任を負いません。盗作または侵害の疑いのあるコンテンツを見つけた場合は、admin@php.cn までご連絡ください。


'')
self.fout.write(self.guid)
self.fout.write('''


< /tr>
self.fout.write(self.pubdate)
self.fout.write(''