首页 >后端开发 >php教程 >最近采集写的一个超简单实用的HTML解析类

最近采集写的一个超简单实用的HTML解析类

WBOY
WBOY原创
2016-07-25 09:02:04687浏览
$xp = new xf_HtmlDom(); $xp->loadHtml('http://dealer.bitauto.com/100040078/cars.html'); $rows = $xp->find('dl/dd/a', 0)->innertext; print_r($rows);
  1. $oldSetting = libxml_use_internal_errors( true );
  2. libxml_clear_errors();
  3. /**
  4. *
  5. * -+-----------------------------------
  6. * |PHP5 Framework - 2011
  7. * |Web Site: www.iblue.cc
  8. * |E-mail: mejinke@gmail.com
  9. * |Date: 2012-10-12
  10. * -+-----------------------------------
  11. *
  12. * @desc HTML解析器
  13. * @author jingke
  14. */
  15. class XF_HtmlDom
  16. {
  17. private $_xpath = null;
  18. private $_nodePath = '';
  19. public function __construct($xpath = null, $nodePath = '')
  20. {
  21. $this->_xpath = $xpath;
  22. $this->_nodePath = $nodePath;
  23. }
  24. public function loadHtml($url)
  25. {
  26. ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');
  27. $content = '';
  28. if(strpos(strtolower($url), 'http')===false)
  29. {
  30. $content = file_get_contents($url);
  31. }
  32. else
  33. {
  34. $ch = curl_init();
  35. $user_agent = "Baiduspider+(+http://www.baidu.com/search/spider.htm)";
  36. $user_agent1='Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0';
  37. curl_setopt($ch, CURLOPT_URL, $url);
  38. curl_setopt($ch, CURLOPT_HEADER, false);
  39. curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  40. curl_setopt($ch, CURLOPT_REFERER, $url);
  41. curl_setopt($ch, CURLOPT_USERAGENT, $user_agent1);
  42. curl_setopt($ch, CURLOPT_FOLLOWLOCATION,1);
  43. $content =curl_exec($ch);
  44. curl_close($ch);
  45. }
  46. $html = new DOMDocument();
  47. $html->loadHtml($content);
  48. $this->_xpath = new DOMXPath( $html );
  49. return $this;
  50. }
  51. public function find($query, $index = null)
  52. {
  53. if($this->_nodePath == '')
  54. $this->_nodePath = '//';
  55. else
  56. $this->_nodePath .= '/';
  57. $nodes = $this->_xpath->query($this->_nodePath.$query);
  58. if ($index == null && !is_numeric($index))
  59. {
  60. $tmp = array();
  61. foreach ($nodes as $node)
  62. {
  63. $tmp[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
  64. }
  65. return $tmp;
  66. }
  67. return new XF_HtmlDom($this->_xpath,$this->_xpath->query($this->_nodePath.$query)->item($index)->getNodePath());
  68. }
  69. /**
  70. * 获取内容
  71. */
  72. public function text()
  73. {
  74. if ($this->_nodePath != '' && $this->_xpath != null )
  75. return $this->_xpath->query($this->_nodePath)->item(0)->textContent;
  76. else
  77. return false;
  78. }
  79. /**
  80. * 获取属性值
  81. */
  82. public function getAttribute($name)
  83. {
  84. if ($this->_nodePath != '' && $this->_xpath != null )
  85. return $this->_xpath->query($this->_nodePath)->item(0)->getAttribute($name);
  86. else
  87. return false;
  88. }
  89. public function __get($name)
  90. {
  91. if($name == 'innertext')
  92. return $this->text();
  93. else
  94. return $this->getAttribute($name);
  95. }
  96. }
复制代码


声明:
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn