Home >Backend Development >PHP Tutorial >A super simple and practical HTML parsing class recently collected and written

A super simple and practical HTML parsing class recently collected and written

WBOY
WBOY · Original
2016-07-25 09:02:04 · 687 views
// Usage example: load a remote page and print the text of the first
// <a> element found under a dl/dd path anywhere in the document.
$xp = new xf_HtmlDom();
$xp->loadHtml('http://dealer.bitauto.com/100040078/cars.html');

// Passing an index (0) makes find() return a single node wrapper rather than
// a list; the magic `innertext` property yields that node's text content.
$rows = $xp->find('dl/dd/a', 0)->innertext;
print_r($rows);
  // Route libxml parse problems into its internal error buffer rather than
  // emitting PHP warnings; the previous setting is kept in $oldSetting.
  $oldSetting = libxml_use_internal_errors(true);
  // Start from an empty libxml error buffer.
  libxml_clear_errors();
  3. /**
  4. *
  5. * -+-----------------------------------
  6. * |PHP5 Framework - 2011
  7. * |Web Site: www.iblue.cc
  8. * |E-mail: mejinke@gmail.com
  9. * |Date: 2012-10-12
  10. * -+-----------------------------------
  11. *
  12. * @desc HTML parser
  13. * @author jingke
  14. */
  /**
   * Minimal HTML parser built on DOMDocument / DOMXPath.
   *
   * An instance is either the document root (empty node path) or a wrapper
   * around one node, identified by the absolute XPath returned by
   * DOMNode::getNodePath(). All wrappers created from one document share the
   * same DOMXPath object.
   */
  class XF_HtmlDom
  {
      /** @var DOMXPath|null XPath evaluator for the loaded document, null until one is set. */
      private $_xpath = null;

      /** @var string Absolute XPath of the wrapped node; '' means "document root". */
      private $_nodePath = '';

      /**
       * @param DOMXPath|null $xpath    XPath evaluator of an already parsed document.
       * @param string        $nodePath Absolute XPath of the node this object wraps.
       */
      public function __construct($xpath = null, $nodePath = '')
      {
          $this->_xpath = $xpath;
          $this->_nodePath = $nodePath;
      }

      /**
       * Load and parse an HTML document.
       *
       * Strings starting with http:// or https:// are downloaded with cURL;
       * anything else is read with file_get_contents(). If nothing could be
       * read, a warning is raised and the object is left with an empty
       * document, so later find() calls simply return no results.
       *
       * @param string $url URL or local path of the HTML document.
       * @return XF_HtmlDom $this, to allow chaining.
       */
      public function loadHtml($url)
      {
          // Kept from the original implementation: sets the default user agent
          // used by PHP's own stream wrappers.
          ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

          // Only a real http(s) scheme prefix marks a URL. A plain substring
          // test would also catch local paths that merely contain "http".
          if (preg_match('#^https?://#i', $url)) {
              $content = $this->_fetchUrl($url);
          } else {
              $content = file_get_contents($url);
          }

          $html = new DOMDocument();
          if (is_string($content) && trim($content) !== '') {
              // Real-world HTML is rarely valid; collect parser complaints
              // internally instead of emitting warnings, then restore the
              // caller's libxml setting.
              $previous = libxml_use_internal_errors(true);
              $html->loadHTML($content);
              libxml_clear_errors();
              libxml_use_internal_errors($previous);
          } else {
              // Feeding false/'' to DOMDocument::loadHTML() warns on old PHP
              // and throws on PHP 8, so report the failure here instead.
              trigger_error('XF_HtmlDom::loadHtml(): no content could be read from ' . $url, E_USER_WARNING);
          }

          $this->_xpath = new DOMXPath($html);
          return $this;
      }

      /**
       * Find descendant nodes by a relative XPath expression.
       *
       * From the document root the query is evaluated as "//query"; from a
       * node wrapper it is evaluated as "<node path>/query". The object's own
       * state is not modified, so find() can be called repeatedly.
       *
       * @param string   $query Relative XPath, e.g. 'dl/dd/a'.
       * @param int|null $index Zero-based position of the single match to
       *                        return; any non-numeric value (the default
       *                        null included) returns all matches.
       * @return XF_HtmlDom[]|XF_HtmlDom|null List of wrappers when no index is
       *         given; otherwise one wrapper, or null when there is no match
       *         at that index.
       */
      public function find($query, $index = null)
      {
          $wantList = !is_numeric($index);

          $nodes = false;
          if ($this->_xpath !== null) {
              // Build the expression locally; never touch $this->_nodePath.
              $prefix = ($this->_nodePath == '') ? '//' : $this->_nodePath . '/';
              $nodes = $this->_xpath->query($prefix . $query);
          }

          if ($wantList) {
              $found = array();
              // query() returns false for an invalid expression.
              if ($nodes !== false) {
                  foreach ($nodes as $node) {
                      $found[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
                  }
              }
              return $found;
          }

          $node = ($nodes === false) ? null : $nodes->item((int) $index);
          if ($node === null) {
              return null;
          }
          return new XF_HtmlDom($this->_xpath, $node->getNodePath());
      }

      /**
       * Get content
       *
       * @return string|false Text content of the wrapped node, or false when
       *         this object does not wrap a resolvable node.
       */
      public function text()
      {
          $node = $this->_currentNode();
          if ($node === null) {
              return false;
          }
          return $node->textContent;
      }

      /**
       * Get attribute value
       *
       * @param string $name Attribute name.
       * @return string|false Attribute value ('' when the attribute is absent),
       *         or false when this object does not wrap an element node.
       */
      public function getAttribute($name)
      {
          $node = $this->_currentNode();
          // Only elements carry attributes; text/comment nodes do not.
          if (!($node instanceof DOMElement)) {
              return false;
          }
          return $node->getAttribute($name);
      }

      /**
       * Property-style access: `innertext` maps to text(), every other name
       * is looked up as an attribute of the wrapped node.
       *
       * @param string $name Property name.
       * @return string|false
       */
      public function __get($name)
      {
          if ($name == 'innertext') {
              return $this->text();
          }
          return $this->getAttribute($name);
      }

      /**
       * Download a URL with cURL, following redirects.
       *
       * @param string $url http(s) URL to fetch.
       * @return string|false Response body, or false on failure.
       */
      private function _fetchUrl($url)
      {
          $agent = 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0';

          $ch = curl_init();
          curl_setopt($ch, CURLOPT_URL, $url);
          curl_setopt($ch, CURLOPT_HEADER, false);
          curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($ch, CURLOPT_REFERER, $url);
          curl_setopt($ch, CURLOPT_USERAGENT, $agent);
          curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
          $body = curl_exec($ch);
          curl_close($ch);

          return $body;
      }

      /**
       * Resolve the node this object wraps.
       *
       * @return DOMNode|null The node, or null for the document root, an
       *         unloaded object, or a path that no longer matches anything.
       */
      private function _currentNode()
      {
          if ($this->_nodePath == '' || $this->_xpath === null) {
              return null;
          }
          $nodes = $this->_xpath->query($this->_nodePath);
          if ($nodes === false || $nodes->length === 0) {
              return null;
          }
          return $nodes->item(0);
      }
  }
Copy code


Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn