Maison >développement back-end >Tutoriel Python >使用python解析xml成对应的html示例分享

使用python解析xml成对应的html示例分享

WBOY
WBOYoriginal
2016-06-16 08:44:40 · 1413 parcourir

本文使用SAX将ddt.xml解析成对应的html。当然,如果能够得到该xml对应的xsl文件,也可以直接用libxml2将其转换成html。

复制代码 代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
#   程序:XML解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:Python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
#   继承ContentHandler并重写其事件处理函数
#   Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    A start or end event for tag ``foo`` is forwarded to ``startFoo`` or
    ``endFoo`` on the instance. When no such callable exists, the generic
    ``defaultStart`` / ``defaultEnd`` is tried instead; if that is missing
    as well, the event is silently ignored.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler registered for an element event.

        prefix -- event kind used as the method-name prefix
                  ('start' or 'end' when called from this class).
        name   -- element name. It is passed through str.capitalize(), so
                  the first character is upper-cased and the rest are
                  lower-cased: 'pubDate' resolves to '<prefix>Pubdate'.
        attrs  -- element attributes. Accepted for interface compatibility
                  only; handlers are always called without arguments.
        """
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to the generic one.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback: forward an opening tag to its 'start' handler."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: forward a closing tag to its 'end' handler."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

    def __init__(self):
        self.fout = open('ddt_SAX.html', 'w')
        self.imagein = False
        self.desflag = False
        self.item = False
        self.title = ''
        self.link = ''
        self.guid = ''
        self.url = ''
        self.pubdate = ''
        self.description = ''
        self.temp = ''
        self.prx = ''
    def startChannel(self):

        self.fout.write('''\n

\n RSS-''')<br><br>    def endChannel(self):<br>       self.fout.write('''<br>                    <tr><td height="20"></td></tr> <br>                    <br>                    <br>                    <script><BR> function GetTimeDiff(str)<BR> {<BR> if(str == '')<BR> {<BR> return '';<BR> } <P> var pubDate = new Date(str);<BR> var nowDate = new Date();<BR> var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();<BR> var days = diffMilSeconds/86400000;<BR> days = parseInt(days); <P> diffMilSeconds = diffMilSeconds-(days*86400000);<BR> var hours = diffMilSeconds/3600000;<BR> hours = parseInt(hours); <P> diffMilSeconds = diffMilSeconds-(hours*3600000);<BR> var minutes = diffMilSeconds/60000;<BR> minutes = parseInt(minutes); <P> diffMilSeconds = diffMilSeconds-(minutes*60000);<BR> var seconds = diffMilSeconds/1000;<BR> seconds = parseInt(seconds);<br><br> var returnStr = "±±&frac34;&copy;·&cent;&sup2;&frac14;&Ecirc;±&frac14;&auml;&pound;&ordm;" + pubDate.toLocaleString(); <P> if(days > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + days + "&Igrave;ì" + hours + "&ETH;&iexcl;&Ecirc;±" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> }<BR> else if (hours > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + hours + "&ETH;&iexcl;&Ecirc;±" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> }<BR> else if (minutes > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> } <P> return returnStr; <P> } <P> function GetSpanText()<BR> {<BR> var pubDate;<BR> var pubDateArray;<BR> var spanArray = document.getElementsByTagName("span"); <P> for(var i = 0; i < spanArray.length; i++)<BR> {<BR> pubDate = spanArray[i].innerHTML;<BR> document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate); <BR> }<BR> } <P> GetSpanText();<BR> </script><br>                <br>          
      <br>                ''')<br>       self.fout.close() <p>    def characters(self, chars):<br>        if chars.strip():<br>            #chars = chars.strip()<br>            self.temp += chars<br>            #print self.temp<br><br>       <br>    def startTitle(self):<br><br>        if self.item:<br>            self.fout.write('''<br>                        </p> <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<b><br>                    ''')<br><br>    def endTitle(self):<br><br>        if not self.imagein and not self.item:<br>            self.title = self.temp<br>            self.temp = ''<br>            self.fout.write(self.title.encode('gb2312'))<br><br>            #self.title = self.temp<br>            self.fout.write('''<br>                \n\n\n</b><center>\n<br>                <script>\n <P> function copyLink()<BR> {<BR> clipboardData.setData("Text",window.location.href);<BR> alert("RSS&Aacute;&acute;&frac12;&Oacute;&Ograve;&Ntilde;&frac34;&shy;&cedil;&acute;&Ouml;&AElig;&micro;&frac12;&frac14;&ocirc;&Igrave;ù°&aring;");<BR> } <P> function subscibeLink()<BR> {<BR> var str = window.location.pathname;<BR> while(str.match(/^\//))<BR> {<BR> str = str.replace(/^\//,"");<BR> }<BR> window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self"); <P> }<BR> </script>\n<br>                <table width="750" cellpadding="0" cellspacing="0">\n<br>                <tr>\n<br>                <td align="right" style="padding-right:15px;" valign="bottom">\n<br>            ''')<br><br>        if self.item:<br>            self.title = self.temp<br>            self.temp = ''<br>            self.fout.write(self.title.encode('gb2312'))<br>            self.fout.write('''<br>                        <br>                        </td> <br>                        </tr> <br>                        <tr bgcolor="#eeeeee"> <br>                        <td style="padding-left:5px;"> <br>                        ''') <p>    def 
startImage(self):<br>        self.imagein = True</p> <p>    def endImage(self):<br>        self.imagein = False<br><br>    def startLink(self):<br>        if self.imagein:<br>            self.fout.write('''<a href="''')<br><br>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20<BR>%C2%A0%C2%A0%C2%A0%20def%20endLink(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.link%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.imagein:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.link.encode('gb2312'))<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''" target="_blank">\n ''')<br>        elif self.item:<br>            #self.link = self.temp<br>            pass<br>        else:<br>            self.fout.write(self.link)<br>            self.fout.write(''' " target="<br>      _blank<br>     "> ''')<br>            self.fout.write(self.title.encode('gb2312'))<br>            self.fout.write(''' </a></p> </td> <br>                            </tr> <br>                            <tr><td colspan="2" align="center"> <br>                            ''')<br>            self.fout.write(self.description.encode('gb2312'))<br>            self.fout.write('''<br>                        </td></tr> <br>                        <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copyLink();">¸´ÖÆ´ËÒ³Á´½Ó</a>                <a href="javascript:subscibeLink();">ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ã棨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©</a></b></td></tr> <br>                        </table> <br>                        <table width="750" cellpadding="0" cellspacing="0"> <br>                            ''') <p>    def startUrl(self):<br>        if self.imagein:<br>            self.fout.write('''<img 
src="''')<BR>%C2%A0%C2%A0%C2%A0%20def%20endUrl(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.url%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.imagein:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.url.encode('gb2312'))<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''" border="0" alt="使用python解析xml成对应的html示例分享" >\n<br>                            <br>                            <br>                            </p> <td align="left" valign="bottom" style="padding-bottom:8px;"><b><a href="<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20''')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.item:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20#self.url%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass</P>%0A<P>%C2%A0%C2%A0%C2%A0%20def%20defaultStart(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass<BR>%C2%A0%C2%A0%C2%A0%20def%20defaultEnd(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%20def%20startDescription(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass<BR>%C2%A0%C2%A0%C2%A0%20def%20endDescription(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.description%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.item:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20#self.fout.write('%C2%A1%C2%A1%C2%A1%C2%A1')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.description.encode('gb2312'))<br><br>%C2%A0%C2%A0%C2%A0%20def%20endGuid(self):<BR>%C2%A0%C2%A
0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.guid%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%20def%20endPubdate(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20not%20self.temp.startswith('http'):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.pubdate%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20else:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.pubdate%20=%20''<BR>%C2%A0%C2%A0%C2%A0%20def%20startItem(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.item%20=%20True<BR>%C2%A0%C2%A0%C2%A0%20def%20endItem(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.item%20=%20False<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20</td><BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20</tr><BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20<tr%20bgcolor="><br>                            </a></b></td> <td style="padding-top:5px;padding-left:5px;"> <br>                            <a href="''')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.link)<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''%20" target="_blank"> ''')<br>        self.fout.write(self.guid)<br>        self.fout.write('''<br>                        </a><br>                        </td> <br>                        <br>                        <tr bgcolor="#eeeeee"> <br>                        <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')<br>        
self.fout.write(self.pubdate)<br>        self.fout.write('''</span></td> <br>                        </tr> <br>                        <tr height="10"><td></td></tr>''') <p>#程序入口<br>if __name__ == '__main__':<br>    parse('ddt.xml', Website())<br></p> </table> </center> </td> </tr>
Déclaration:
Le contenu de cet article est volontairement contribué par les internautes et les droits d'auteur appartiennent à l'auteur original. Ce site n'assume aucune responsabilité légale correspondante. Si vous trouvez un contenu suspecté de plagiat ou de contrefaçon, veuillez contacter admin@php.cn