Home >Backend Development >Python Tutorial >使用python解析xml成对应的html示例分享

使用python解析xml成对应的html示例分享

WBOY
WBOYOriginal
2016-06-16 08:44:40 1401 browse

SAX将dd.xml解析成html。当然啦,如果得到了xml对应的xsl文件可以直接用libxml2将其转换成html。

复制代码 代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
#   程序:XML解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:Python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
#   继承ContentHandler并重写其事件处理函数
#   Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    An event for tag ``foo`` is forwarded to ``startFoo`` / ``endFoo`` when
    the instance has a callable attribute of that name; otherwise it falls
    back to ``defaultStart`` / ``defaultEnd`` if one exists. Handlers are
    always invoked with no arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler for the ``prefix`` event of element ``name``.

        prefix -- event kind; this class passes 'start' or 'end'.
        name   -- element name. It goes through ``capitalize()``, so
                  'pubDate' resolves to ``<prefix>Pubdate``.
        attrs  -- accepted for interface compatibility only; it is not
                  forwarded to the handler.

        Does nothing when neither the tag-specific nor the default handler
        is callable.
        """
        # Tag-specific handler first, generic fallback second.
        candidates = (prefix + name.capitalize(),
                      'default' + prefix.capitalize())
        for attr_name in candidates:
            handler = getattr(self, attr_name, None)
            if callable(handler):
                handler()
                return

    def startElement(self, name, attrs):
        """SAX callback: dispatch the 'start' event for ``name``."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: dispatch the 'end' event for ``name``."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

    def __init__(self):
        self.fout = open('ddt_SAX.html', 'w')
        self.imagein = False
        self.desflag = False
        self.item = False
        self.title = ''
        self.link = ''
        self.guid = ''
        self.url = ''
        self.pubdate = ''
        self.description = ''
        self.temp = ''
        self.prx = ''
    def startChannel(self):

        self.fout.write('''\n

\n RSS-''')<br><br>    def endChannel(self):<br>       self.fout.write('''<br>                    <tr><td height="20"></td></tr> <br>                    <br>                    <br>                    <script><BR> function GetTimeDiff(str)<BR> {<BR> if(str == '')<BR> {<BR> return '';<BR> } <P> var pubDate = new Date(str);<BR> var nowDate = new Date();<BR> var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();<BR> var days = diffMilSeconds/86400000;<BR> days = parseInt(days); <P> diffMilSeconds = diffMilSeconds-(days*86400000);<BR> var hours = diffMilSeconds/3600000;<BR> hours = parseInt(hours); <P> diffMilSeconds = diffMilSeconds-(hours*3600000);<BR> var minutes = diffMilSeconds/60000;<BR> minutes = parseInt(minutes); <P> diffMilSeconds = diffMilSeconds-(minutes*60000);<BR> var seconds = diffMilSeconds/1000;<BR> seconds = parseInt(seconds);<br><br> var returnStr = "±±&frac34;&copy;·&cent;&sup2;&frac14;&Ecirc;±&frac14;&auml;&pound;&ordm;" + pubDate.toLocaleString(); <P> if(days > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + days + "&Igrave;ì" + hours + "&ETH;&iexcl;&Ecirc;±" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> }<BR> else if (hours > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + hours + "&ETH;&iexcl;&Ecirc;±" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> }<BR> else if (minutes > 0)<BR> {<BR> returnStr = returnStr + " &pound;¨&frac34;à&Agrave;&euml;&Iuml;&Ouml;&Ocirc;&Uacute;" + minutes + "·&Ouml;&Ouml;&Oacute;&pound;&copy;";<BR> } <P> return returnStr; <P> } <P> function GetSpanText()<BR> {<BR> var pubDate;<BR> var pubDateArray;<BR> var spanArray = document.getElementsByTagName("span"); <P> for(var i = 0; i < spanArray.length; i++)<BR> {<BR> pubDate = spanArray[i].innerHTML;<BR> document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate); <BR> }<BR> } <P> GetSpanText();<BR> </script><br>                <br>          
      <br>                ''')<br>       self.fout.close() <p>    def characters(self, chars):<br>        if chars.strip():<br>            #chars = chars.strip()<br>            self.temp += chars<br>            #print self.temp<br><br>       <br>    def startTitle(self):<br><br>        if self.item:<br>            self.fout.write('''<br>                        </p> <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<b><br>                    ''')<br><br>    def endTitle(self):<br><br>        if not self.imagein and not self.item:<br>            self.title = self.temp<br>            self.temp = ''<br>            self.fout.write(self.title.encode('gb2312'))<br><br>            #self.title = self.temp<br>            self.fout.write('''<br>                \n\n\n</b><center>\n<br>                <script>\n <P> function copyLink()<BR> {<BR> clipboardData.setData("Text",window.location.href);<BR> alert("RSS&Aacute;&acute;&frac12;&Oacute;&Ograve;&Ntilde;&frac34;&shy;&cedil;&acute;&Ouml;&AElig;&micro;&frac12;&frac14;&ocirc;&Igrave;ù°&aring;");<BR> } <P> function subscibeLink()<BR> {<BR> var str = window.location.pathname;<BR> while(str.match(/^\//))<BR> {<BR> str = str.replace(/^\//,"");<BR> }<BR> window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self"); <P> }<BR> </script>\n<br>                <table width="750" cellpadding="0" cellspacing="0">\n<br>                <tr>\n<br>                <td align="right" style="padding-right:15px;" valign="bottom">\n<br>            ''')<br><br>        if self.item:<br>            self.title = self.temp<br>            self.temp = ''<br>            self.fout.write(self.title.encode('gb2312'))<br>            self.fout.write('''<br>                        <br>                        </td> <br>                        </tr> <br>                        <tr bgcolor="#eeeeee"> <br>                        <td style="padding-left:5px;"> <br>                        ''') <p>    def 
startImage(self):<br>        self.imagein = True</p> <p>    def endImage(self):<br>        self.imagein = False<br><br>    def startLink(self):<br>        if self.imagein:<br>            self.fout.write('''<a href="''')<br><br>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20<BR>%C2%A0%C2%A0%C2%A0%20def%20endLink(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.link%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.imagein:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.link.encode('gb2312'))<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''" target="_blank">\n ''')<br>        elif self.item:<br>            #self.link = self.temp<br>            pass<br>        else:<br>            self.fout.write(self.link)<br>            self.fout.write(''' " target="<br>      _blank<br>     "> ''')<br>            self.fout.write(self.title.encode('gb2312'))<br>            self.fout.write(''' </a></p> </td> <br>                            </tr> <br>                            <tr><td colspan="2" align="center"> <br>                            ''')<br>            self.fout.write(self.description.encode('gb2312'))<br>            self.fout.write('''<br>                        </td></tr> <br>                        <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copyLink();">¸´ÖÆ´ËÒ³Á´½Ó</a>                <a href="javascript:subscibeLink();">ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ã棨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©</a></b></td></tr> <br>                        </table> <br>                        <table width="750" cellpadding="0" cellspacing="0"> <br>                            ''') <p>    def startUrl(self):<br>        if self.imagein:<br>            self.fout.write('''<img 
src="''')<BR>%C2%A0%C2%A0%C2%A0%20def%20endUrl(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.url%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.imagein:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.url.encode('gb2312'))<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''" border="0" alt="使用python解析xml成对应的html示例分享" >\n<br>                            <br>                            <br>                            </p> <td align="left" valign="bottom" style="padding-bottom:8px;"><b><a href="<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20''')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.item:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20#self.url%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass</P>%0A<P>%C2%A0%C2%A0%C2%A0%20def%20defaultStart(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass<BR>%C2%A0%C2%A0%C2%A0%20def%20defaultEnd(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%20def%20startDescription(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20pass<BR>%C2%A0%C2%A0%C2%A0%20def%20endDescription(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.description%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20self.item:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20#self.fout.write('%C2%A1%C2%A1%C2%A1%C2%A1')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.description.encode('gb2312'))<br><br>%C2%A0%C2%A0%C2%A0%20def%20endGuid(self):<BR>%C2%A0%C2%A
0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.guid%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%20def%20endPubdate(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20if%20not%20self.temp.startswith('http'):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.pubdate%20=%20self.temp<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.temp%20=%20''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20else:<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.pubdate%20=%20''<BR>%C2%A0%C2%A0%C2%A0%20def%20startItem(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.item%20=%20True<BR>%C2%A0%C2%A0%C2%A0%20def%20endItem(self):<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.item%20=%20False<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20</td><BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20</tr><BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20<tr%20bgcolor="><br>                            </a></b></td> <td style="padding-top:5px;padding-left:5px;"> <br>                            <a href="''')<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write(self.link)<BR>%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%20self.fout.write('''%20" target="_blank"> ''')<br>        self.fout.write(self.guid)<br>        self.fout.write('''<br>                        </a><br>                        </td> <br>                        <br>                        <tr bgcolor="#eeeeee"> <br>                        <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')<br>        
self.fout.write(self.pubdate)<br>        self.fout.write('''</span></td> <br>                        </tr> <br>                        <tr height="10"><td></td></tr>''') <p>#程序入口<br>if __name__ == '__main__':<br>    parse('ddt.xml', Website())<br></p> </table> </center> </td> </tr>
Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn