解析RSS类

WBOY · Original: 2016-06-07 11:44:49 · 1382 browse

简单但功能强大的PHP解析RSS文件类。
by Vojtech Semecky, webmaster @ webdot . cz。最新版本、更新内容、手册、示例参见: http://lastrss.webdot.cz/ http://sssui.com
// Reposter's note: this class was copied over from the lastRSS distribution; the reposter, not being fluent in English, had lightly localized its comments into Chinese.
/*
 ======================================================================
 lastRSS 0.9.1

 A simple yet powerful PHP class for parsing RSS files.

 by Vojtech Semecky, webmaster @ webdot . cz

 Latest version, updates, manual and examples:
     http://lastrss.webdot.cz/
     http://sssui.com

 ----------------------------------------------------------------------
 LICENSE

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License (GPL)
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 To read the license please visit http://www.gnu.org/copyleft/gpl.html
 ======================================================================
*/

/*
 Usage example:

 $rss = new lastRSS();          // create the parser
 $rss->cache_dir = 'cache';     // cache directory; it must be created manually
 $rss->cache_time = 3600;       // cache lifetime in seconds; 0 (the default) re-parses on every call, 3600 (one hour) is suggested
 $rss->default_cp = 'UTF-8';    // encoding assumed when the feed declares none (default 'UTF-8')
 $rss->cp = 'GBK';              // output encoding; the default '' performs no conversion
 $rss->items_limit = 10;        // maximum number of items returned; the default 0 means no limit
 $rss->date_format = 'U';       // date() format applied to dates; the default '' keeps the feed's strings, 'U' yields a Unix timestamp
 $rss->stripHTML = true;        // strip HTML from item titles and descriptions (default false)
 $rss->CDATA = 'content';       // CDATA handling: 'nochange' (default), 'strip' or 'content'
 $url = 'http://hi.baidu.com/gincn/rss ';
 $data = $rss->Get($url);       // fetch, parse and return the feed
 print_r($data);
*/

/**
 * lastRSS
 * A simple yet powerful PHP class for parsing RSS files.
 */
class lastRSS
{
    // -------------------------------------------------------------------
    // Public properties
    // -------------------------------------------------------------------
    public $default_cp = 'UTF-8';
    public $CDATA = 'nochange';
    public $cp = '';
    public $items_limit = 0;
    public $stripHTML = false;
    public $date_format = '';
    // Cache settings read by Get(); an empty cache_dir disables caching.
    public $cache_dir = '';
    public $cache_time = 0;

    // -------------------------------------------------------------------
    // Private properties
    // -------------------------------------------------------------------
    public $channeltags = array('title', 'link', 'description', 'language', 'copyright', 'managingEditor', 'webMaster', 'lastBuildDate', 'rating', 'docs');
    public $itemtags = array('title', 'link', 'description', 'author', 'category', 'comments', 'enclosure', 'guid', 'pubDate', 'source');
    public $imagetags = array('title', 'url', 'link', 'width', 'height');
    public $textinputtags = array('title', 'description', 'name', 'link');
    // Source encoding of the feed being parsed; set by Parse(), read by my_preg_match().
    public $rsscp = '';

    /**
     * Parse an RSS file and return it as an associative array.
     *
     * When cache_dir is set, the parsed result is stored in
     * "<cache_dir>/rsscache_<md5 of URL>" and reused while the file is
     * younger than cache_time seconds.
     *
     * @param string $rss_url URL or path of the feed
     * @return array|false parsed feed with an extra 'cached' key (1 when
     *                     served from the cache, 0 otherwise), or false when
     *                     the feed could not be opened
     */
    public function Get($rss_url)
    {
        // Caching disabled: load and parse the feed directly.
        if ($this->cache_dir == '') {
            $result = $this->Parse($rss_url);
            if ($result) {
                $result['cached'] = 0;
            }
            return $result;
        }

        $cache_file = $this->cache_dir . '/rsscache_' . md5($rss_url);
        $is_fresh = is_file($cache_file)
            && (time() - filemtime($cache_file)) < $this->cache_time;

        if ($is_fresh) {
            // NOTE(review): the cache is read back with unserialize(); keep
            // cache_dir writable only by trusted users.
            $raw = file_get_contents($cache_file);
            $result = ($raw === false) ? false : unserialize($raw);
            // Flag as cached only when the cache file held a usable result.
            if ($result) {
                $result['cached'] = 1;
            }
            return $result;
        }

        // Cache file is missing or too old: parse again and rewrite it.
        $result = $this->Parse($rss_url);
        if ($f = @fopen($cache_file, 'w')) {
            fwrite($f, serialize($result));
            fclose($f);
        }
        if ($result) {
            $result['cached'] = 0;
        }
        return $result;
    }

    /**
     * Wrapper around preg_match() that returns the trimmed first capture group.
     *
     * CDATA markers are removed when $this->CDATA is 'content' or 'strip'
     * (both modes currently do the same thing), and the value is converted
     * from $this->rsscp to $this->cp when an output encoding is set.
     *
     * @param string $pattern regular expression with at least one capture group
     * @param string $subject text to search
     * @return string the processed match, or '' when nothing matched
     */
    public function my_preg_match($pattern, $subject)
    {
        if (!preg_match($pattern, (string) $subject, $out) || !isset($out[1])) {
            return '';
        }

        $value = $out[1];
        if ($this->CDATA == 'content' || $this->CDATA == 'strip') {
            $value = strtr($value, array('<![CDATA[' => '', ']]>' => ''));
        }

        // Convert to the requested output encoding.
        if ($this->cp != '') {
            $value = iconv($this->rsscp, $this->cp . '//TRANSLIT', $value);
        }

        // iconv() returns false on failure; the cast turns that into ''.
        return trim((string) $value);
    }

    /**
     * Replace HTML entities (&something;) by the characters they stand for.
     *
     * @param string $string text containing HTML entities
     * @return string decoded text
     */
    public function unhtmlentities($string)
    {
        // Entity table, flipped so that entities map to characters.
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES));
        // Add support for the &apos; entity (missing in HTML_ENTITIES).
        $trans_tbl += array('&apos;' => "'");
        return strtr($string, $trans_tbl);
    }

    /**
     * Load and parse an RSS file. Used internally by Get(); call
     * Get($rss_file) from your own code instead.
     *
     * @param string $rss_url URL or path of the feed
     * @return array|false parsed feed, or false when the file cannot be opened
     */
    public function Parse($rss_url)
    {
        $rss_content = @file_get_contents($rss_url);
        if ($rss_content === false) {
            // The feed could not be opened.
            return false;
        }

        $result = array();

        // Document encoding: use the declared one, else fall back to the default.
        $result['encoding'] = $this->my_preg_match("'encoding=[\'\"](.*?)[\'\"]'si", $rss_content);
        $this->rsscp = ($result['encoding'] != '') ? $result['encoding'] : $this->default_cp;

        // CHANNEL info
        $channel = preg_match("'<channel.*?>(.*?)</channel>'si", $rss_content, $out_channel)
            ? $out_channel[1]
            : '';
        $result = array_merge($result, $this->extract_tags($this->channeltags, $channel));
        if ($this->date_format != '' && isset($result['lastBuildDate'])) {
            $result['lastBuildDate'] = $this->format_date($result['lastBuildDate']);
        }

        // TEXTINPUT info. The pattern accepts <textinput> with or without
        // attributes but skips the self-closed form <textinput />, which is
        // not an opening tag.
        preg_match("'<textinput(|[^>]*[^/])>(.*?)</textinput>'si", $rss_content, $out_textinfo);
        if (isset($out_textinfo[2])) {
            $result = array_merge($result, $this->extract_tags($this->textinputtags, $out_textinfo[2], 'textinput_'));
        }

        // IMAGE info
        preg_match("'<image.*?>(.*?)</image>'si", $rss_content, $out_imageinfo);
        if (isset($out_imageinfo[1])) {
            $result = array_merge($result, $this->extract_tags($this->imagetags, $out_imageinfo[1], 'image_'));
        }

        // ITEMS. Group 1 holds the optional attributes, group 2 the item body.
        preg_match_all("'<item(| .*?)>(.*?)</item>'si", $rss_content, $items);
        $i = 0;
        $result['items'] = array(); // create the array even if there are no items
        foreach ($items[2] as $rss_item) {
            // Stop once the limit is reached (0 means unlimited).
            if ($this->items_limit != 0 && $i >= $this->items_limit) {
                break;
            }

            $item = $this->extract_tags($this->itemtags, $rss_item);

            // Strip HTML tags and entities from DESCRIPTION and TITLE.
            if ($this->stripHTML) {
                foreach (array('description', 'title') as $field) {
                    if (!empty($item[$field])) {
                        $item[$field] = strip_tags($this->unhtmlentities(strip_tags($item[$field])));
                    }
                }
            }

            // Convert pubDate to the requested date format.
            if ($this->date_format != '' && isset($item['pubDate'])) {
                $item['pubDate'] = $this->format_date($item['pubDate']);
            }

            // Stored even when empty so that items_count matches the list length.
            $result['items'][$i] = $item;
            $i++;
        }
        $result['items_count'] = $i;

        return $result;
    }

    /**
     * Collect the non-empty values of the given tags found in $content.
     *
     * @param array  $tags    tag names to look for
     * @param string $content XML fragment to search
     * @param string $prefix  prefix prepended to every result key
     * @return array map of prefixed tag name to value; empty tags are omitted
     */
    private function extract_tags($tags, $content, $prefix = '')
    {
        $found = array();
        foreach ($tags as $tag) {
            $value = $this->my_preg_match("'<{$tag}.*?>(.*?)</{$tag}>'si", $content);
            if ($value !== '') {
                $found[$prefix . $tag] = $value;
            }
        }
        return $found;
    }

    /**
     * Reformat a date string with $this->date_format.
     *
     * @param string $value date string taken from the feed
     * @return string the reformatted date, or $value unchanged when
     *                strtotime() cannot parse it
     */
    private function format_date($value)
    {
        $timestamp = strtotime($value);
        // strtotime() signals failure with false, not -1.
        if ($timestamp === false) {
            return $value;
        }
        return date($this->date_format, $timestamp);
    }
}

AD：真正免费，域名+虚机+企业邮箱=0元

Statement：

The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn

Previous article：ThinkPHP使用缓存解决递归问题，只查库一次 | Next article：递归无限级菜单分类

See more

解析RSS类

Related articles