Maison  >  Article  >  développement back-end  >  解析HTML标签,并实现快速查找节点,获取节点信息

解析HTML标签,并实现快速查找节点,获取节点信息

WBOY
WBOYoriginal
2016-07-25 08:48:49 · 1415 parcourir
详细介绍和使用请点击源码出处。
  1. /**
  2. * html标签解析包
  3. *
  4. * @category TagParse
  5. * @package TagParse
  6. * @author kun
  7. * @copyright 2014 kun
  8. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  9. * @version 1.0
  10. * @link http://www.blogkun.com
  11. * @since 1.0
  12. */
  13. namespace TagParse;
  /**
   * TagDomRoot
   *
   * Root of a tag tree built from an HTML string. The root owns the top-level
   * nodes in $child; every descendant is a TagDomNode. Parse errors, the
   * tag registry used by the "global" finders and the result buffer are kept
   * in static properties, i.e. they are shared by every tree in the process.
   * Call initProperty() before parsing an unrelated document.
   *
   * @category TagParse
   * @package TagParse
   * @author kun
   * @copyright 2014 kun
   * @license http://www.php.net/license/3_01.txt PHP License 3.01
   * @version 1.0
   * @link http://www.blogkun.com
   * @since 1.0
   */
  class TagDomRoot
  {
      /** @var string Tag name; always 'root' for the root itself. */
      public $tag = 'root';
      /** @var string|null Text found outside of tags (null if noise removal failed). */
      public $plaintext;
      /** @var array Direct child nodes (TagDomNode instances). */
      public $child = array();
      /** @var int Depth in the tree; the root is level 0. */
      public $level = 0;
      /** @var bool Set to true as soon as any parse step reports a problem. */
      public static $TagParseError = false;
      /** @var array Registry: tag name => raw attribute string => list of nodes. */
      protected static $TagSet = array();
      /** @var array Accumulator for nodes matched by the current search. */
      protected static $FoundNode = array();
      /** @var array Markup fragments that could not be paired while parsing. */
      public static $ErrorTag = array();

      /**
       * initProperty
       *
       * Resets all shared static state (error flag, tag registry, search
       * buffer and error-tag list).
       *
       * @access public
       *
       * @return null
       */
      public function initProperty()
      {
          // Must go through self:: - plain locals would leave the statics untouched.
          self::$TagParseError = false;
          self::$TagSet = array();
          self::$FoundNode = array();
          self::$ErrorTag = array();
      }

      /**
       * __construct
       *
       * Strips noise from the markup, extracts the plain text and builds the
       * child nodes for every top-level tag.
       *
       * @param string $str The tag string to be parsed.
       *
       * @access public
       *
       * @return TagDomRoot
       */
      public function __construct($str)
      {
          $this->_removeNoise($str);
          if ($str === null) {
              // preg_replace() signals a PCRE failure with null.
              self::$TagParseError = true;
              return;
          }

          $this->plaintext = $this->_collectPlainText($str);

          $tagCollect = array();
          $attrCollect = array();
          $innerContentCollect = array();
          if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
              self::$TagParseError = true;
          }
          foreach ($tagCollect as $index => $tag) {
              $this->child[] = new TagDomNode(
                  $tag,
                  $this,
                  $attrCollect[$index],
                  $innerContentCollect[$index],
                  $this->level + 1
              );
          }
      }

      /**
       * _collectPlainText
       *
       * Concatenates the text before the first tag, the text between each
       * '>' and the following '<', and the text after the last '>'.
       * Markup-free input is returned unchanged.
       *
       * @param string $str Markup to extract text from.
       *
       * @access private
       *
       * @return string
       */
      private function _collectPlainText($str)
      {
          $first = strpos($str, '<');
          if ($first === false) {
              return $str;
          }
          $text = substr($str, 0, $first);
          if (preg_match_all('~>(.*?)<~s', $str, $matches) > 0) {
              $text .= implode('', $matches[1]);
          }
          $last = strrpos($str, '>');
          if ($last !== false) {
              $text .= substr($str, $last + 1);
          }
          return $text;
      }

      /**
       * parseTag
       *
       * Scans $str for its top-level tags. For the n-th tag found, the tag
       * name, its raw attribute string and its inner markup are appended to
       * the three collectors at the same index.
       *
       * @param string $str                  Markup to scan.
       * @param array  &$tagCollect          Receives the tag names.
       * @param array  &$attrCollect         Receives the raw attribute strings.
       * @param array  &$innerContentCollect Receives the inner markup of each tag.
       *
       * @access protected
       *
       * @return boolean False if a stray closing tag or an unclosed tag was seen.
       */
      protected function parseTag($str, array &$tagCollect, array &$attrCollect, array &$innerContentCollect)
      {
          $selfClosingTags = array('img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1, 'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1);
          $error = false;
          $offset = 0;
          $length = strlen($str);

          while ($offset < $length) {
              $l = strpos($str, '<', $offset);
              if ($l === false) {//parse end
                  break;
              }
              $r = strpos($str, '>', $l);
              if ($r === false) {//a '<' that never closes is treated as text
                  break;
              }
              if ($str[$l + 1] === '/') {//surplus closing tag, discard
                  $error = true;
                  self::$ErrorTag[] = substr($str, $l, $r - $l + 1);
                  $offset = $r + 1;
                  continue;
              }
              $tag = (string) substr($str, $l + 1, $r - $l - 1);
              if ($tag === '' || !ctype_alpha($tag[0]) || strpos($tag, '<') !== false) {
                  // Not a tag (e.g. "a < b"): rescan from the next character so a
                  // real tag hidden behind this '<' is still picked up.
                  $offset = $l + 1;
                  continue;
              }
              $tag = preg_replace("~[\r\n]+~", ' ', $tag);
              $space = strpos($tag, ' ');
              if ($space !== false) {
                  $attr = (string) substr($tag, $space + 1);
                  $tag = substr($tag, 0, $space);
              } else {
                  $attr = '';
                  $tag = rtrim($tag, '/');// "<br/>" names the tag "br"
              }
              $tagCollect[] = $tag;
              $attrCollect[] = $attr;
              if (isset($selfClosingTags[strtolower($tag)])) {
                  $innerContentCollect[] = '';
                  $offset = $r + 1;
                  continue;
              }
              $close = $this->_findClosingTag($str, $tag, $r + 1);
              if ($close === false) {//surplus opening tag / unbalanced nesting
                  $innerContentCollect[] = (string) substr($str, $r + 1);
                  self::$ErrorTag[] = '<' . $tag . '>';
                  $error = true;
                  break;
              }
              $innerContentCollect[] = (string) substr($str, $r + 1, $close - $r - 1);
              // Resume right after "</tag>" (strlen($tag) + 3 characters).
              $offset = $close + strlen($tag) + 3;
          }
          return !$error;
      }

      /**
       * _findClosingTag
       *
       * Finds the "</tag>" that balances an already opened tag, counting
       * nested openings of the same tag. The look-ahead keeps "<b" from
       * matching "<br>" or "<body>".
       *
       * @param string $str    Markup being scanned.
       * @param string $tag    Name of the opened tag.
       * @param int    $offset Position right after the opening tag.
       *
       * @access private
       *
       * @return int|false Offset of the matching closing tag, false if none.
       */
      private function _findClosingTag($str, $tag, $offset)
      {
          $quoted = preg_quote($tag, '~');
          $pattern = '~</' . $quoted . '>|<' . $quoted . '(?=[\s/>])~i';
          $depth = 1;
          while (preg_match($pattern, $str, $m, PREG_OFFSET_CAPTURE, $offset) === 1) {
              $text = $m[0][0];
              $pos = $m[0][1];
              if ($text[1] === '/') {
                  $depth--;
                  if ($depth === 0) {
                      return $pos;
                  }
              } else {
                  $depth++;
              }
              $offset = $pos + strlen($text);
          }
          return false;
      }

      /**
       * _removeNoise
       *
       * Removes HTML comments, the DOCTYPE declaration and script blocks
       * in place. $str becomes null if PCRE reports an error.
       *
       * @param string &$str The tag string to be parsed.
       *
       * @access private
       *
       * @return null
       */
      private function _removeNoise(&$str)
      {
          $str = preg_replace(
              array(
                  '~<!--.*?-->~is',
                  '~<!DOCTYPE.*?>~is',
                  '~<script\b[^>]*>.*?</script\s*>~is',
              ),
              '',
              $str
          );
      }

      /**
       * parseSelectors
       *
       * Splits a selector such as 'div[class="a b"] p' into its tag steps
       * and, for each step, a map of required attribute values.
       *
       * @param string $selectors      user's select condition.
       * @param array  &$selectorsTag  receives the tag name of each step.
       * @param array  &$selectorsAttr receives attrName => value for each step.
       *
       * @access protected
       *
       * @return null
       */
      protected function parseSelectors($selectors, array &$selectorsTag, array &$selectorsAttr)
      {
          preg_match_all('~(\w+)(\[[^\]]*\])?~', $selectors, $matches);
          $selectorsTag = $matches[1];
          foreach ($matches[2] as $key => $value) {
              $selectorsAttr[$key] = $this->_parseSelectorAttr($value);
          }
      }

      /**
       * _parseSelectorAttr
       *
       * Turns the bracket part of one selector step, e.g. '[class="a" id="b"]',
       * into an attrName => value map.
       *
       * @param string $bracket Bracket part of a selector step, may be ''.
       *
       * @access private
       *
       * @return array
       */
      private function _parseSelectorAttr($bracket)
      {
          $attr = array();
          if ($bracket !== '' && preg_match_all('~([\w-]+)="([^"]+)"~', $bracket, $m) > 0) {
              foreach ($m[1] as $index => $name) {
                  $attr[$name] = $m[2][$index];
              }
          }
          return $attr;
      }

      /**
       * find
       *
       * Searches this node and all of its descendants for chains of
       * parent/child nodes matching the selector steps. The two array
       * parameters are only used by the internal recursion (which passes
       * null as $selectors).
       *
       * @param mixed $selectors     user's select condition, null on recursion.
       * @param array $selectorsTag  tags.
       * @param array $selectorsAttr attributes.
       *
       * @access public
       *
       * @return array|null Matched nodes; null for the recursive calls.
       */
      public function find($selectors, $selectorsTag = array(), $selectorsAttr = array())
      {
          if ($selectors !== null) {
              $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr);
          }
          if (!empty($selectorsTag)) {
              $this->seek($selectorsTag, $selectorsAttr);
              foreach ($this->child as $node) {
                  $node->find(null, $selectorsTag, $selectorsAttr);
              }
          }
          if ($selectors !== null) {
              $res = self::$FoundNode;
              self::$FoundNode = array();
              return $res;
          }
      }

      /**
       * findGlobal
       *
       * Looks the first selector step up in the static tag registry and
       * resolves the remaining steps relative to each hit.
       *
       * @param string $selectors user's select condition.
       *
       * @access public
       *
       * @return array
       */
      public function findGlobal($selectors)
      {
          // First step = tag plus optional [..] group; spaces inside the
          // brackets do not separate steps.
          if (preg_match('~^\s*(\w+(?:\[[^\]]*\])?) +(.+)$~s', $selectors, $parts) !== 1) {
              return $this->findOneGlobal($selectors);
          }

          $selectorsTag = array();
          $selectorsAttr = array();
          $this->findOneGlobal($parts[1], false);
          $this->parseSelectors($parts[2], $selectorsTag, $selectorsAttr);
          if (!empty(self::$FoundNode) && !empty($selectorsTag)) {
              $nodes = self::$FoundNode;
              self::$FoundNode = array();
              foreach ($nodes as $node) {
                  $node->seek($selectorsTag, $selectorsAttr);
              }
          }
          $res = self::$FoundNode;
          self::$FoundNode = array();
          return $res;
      }

      /**
       * seek
       *
       * Matches the first selector step against the direct children; on a
       * hit either records the child (last step) or continues with the
       * remaining steps below it.
       *
       * @param array $selectorsTag  tags.
       * @param array $selectorsAttr attributes.
       *
       * @access protected
       *
       * @return null
       */
      protected function seek($selectorsTag, $selectorsAttr)
      {
          if (empty($selectorsTag)) {
              return;
          }
          $wanted = isset($selectorsAttr[0]) ? $selectorsAttr[0] : array();
          $isLastStep = count($selectorsTag) === 1;

          foreach ($this->child as $node) {
              if ($node->tag !== $selectorsTag[0] || !$this->_nodeHasAttrs($node, $wanted)) {
                  continue;
              }
              if ($isLastStep) {
                  self::$FoundNode[] = $node;
              } else {
                  $node->seek(
                      array_slice($selectorsTag, 1),
                      array_slice($selectorsAttr, 1)
                  );
              }
          }
      }

      /**
       * _nodeHasAttrs
       *
       * Checks that every wanted attribute exists on the node and contains
       * the wanted value as a space-delimited token.
       *
       * @param object $node   Node exposing an $attr map.
       * @param array  $wanted attrName => value.
       *
       * @access private
       *
       * @return boolean
       */
      private function _nodeHasAttrs($node, array $wanted)
      {
          foreach ($wanted as $attrName => $value) {
              if (!isset($node->attr[$attrName])) {
                  return false;
              }
              // The value is user input: quote it before embedding it in a pattern.
              $pattern = '~(?:^| )' . preg_quote((string) $value, '~') . '(?: |$)~';
              if (preg_match($pattern, $node->attr[$attrName]) !== 1) {
                  return false;
              }
          }
          return true;
      }

      /**
       * findOneGlobal
       *
       * Collects, from the static tag registry, every node with the given
       * tag whose raw attribute string satisfies the selector's attributes.
       *
       * @param string $selector user's select condition (a single step).
       * @param bool   $isReturn whether to return (and clear) the result buffer.
       *
       * @access public
       *
       * @return array|null Matched nodes, or null when $isReturn is false.
       */
      public function findOneGlobal($selector, $isReturn = true)
      {
          if (preg_match('~(\w+)(\[[^\]]*\])?~', $selector, $matches) === 1) {
              $tag = $matches[1];
              $attr = isset($matches[2]) ? $this->_parseSelectorAttr($matches[2]) : array();

              if (isset(self::$TagSet[$tag])) {
                  foreach (self::$TagSet[$tag] as $attrValue => $nodeArray) {
                      $isFind = true;
                      foreach ($attr as $attrName => $value) {
                          // name="... value ..." within one attribute; [^"] keeps the
                          // match from running into the next attribute.
                          $pattern = '~(?<![\w-])' . preg_quote((string) $attrName, '~')
                              . '="(?:[^"]* )?' . preg_quote((string) $value, '~')
                              . '(?: [^"]*)?"~';
                          if (preg_match($pattern, (string) $attrValue) !== 1) {
                              $isFind = false;
                              break;
                          }
                      }
                      if ($isFind) {
                          foreach ($nodeArray as $node) {
                              self::$FoundNode[] = $node;
                          }
                      }
                  }
              }
          }
          if ($isReturn) {
              $res = self::$FoundNode;
              self::$FoundNode = array();
              return $res;
          }
      }
  }
  /**
   * TagDomNode
   *
   * One element of the tag tree. In addition to what TagDomRoot offers, a
   * node knows its parent and its parsed attributes, and registers itself
   * in the inherited static tag registry (tag name => raw attribute
   * string => nodes) while it is being constructed.
   *
   * @uses TagDomRoot
   *
   * @category TagParse
   * @package TagParse
   * @author kun
   * @copyright 2014 kun
   * @license http://www.php.net/license/3_01.txt PHP License 3.01
   * @version 1.0
   * @link http://www.blogkun.com
   * @since 1.0
   */
  class TagDomNode extends TagDomRoot
  {
      /** @var array Parsed attributes, attrName => value. */
      public $attr = array();
      /** @var TagDomRoot|null The node (or root) this node hangs under. */
      public $parent = null;

      /**
       * __construct
       *
       * Stores the node data, extracts its plain text, recursively builds
       * the child nodes from $innerContent and registers the node in the
       * static tag registry.
       *
       * @param string     $tag          tag name.
       * @param TagDomRoot $parent       parent node.
       * @param string     $attr         raw attribute string of the opening tag.
       * @param string     $innerContent markup between the opening and closing tag.
       * @param int        $level        node level (root is 0).
       *
       * @access public
       *
       * @return TagDomNode
       */
      public function __construct($tag, $parent, $attr, $innerContent, $level)
      {
          $attr = (string) $attr;
          $innerContent = (string) $innerContent;

          $this->tag = $tag;
          $this->parent = $parent;
          $this->_parseAttr($attr);
          $this->level = $level;

          $first = strpos($innerContent, '<');
          if ($first === false) {
              // No markup at all: the content is the text.
              $this->plaintext = $innerContent;
          } else {
              // Text before the first tag + text between tags + text after the last tag.
              $this->plaintext = substr($innerContent, 0, $first);
              if (preg_match_all('~>(.*?)<~s', $innerContent, $matches) > 0) {
                  $this->plaintext .= implode('', $matches[1]);
              }
              $last = strrpos($innerContent, '>');
              if ($last !== false) {
                  $this->plaintext .= substr($innerContent, $last + 1);
              }
          }

          $tagCollect = array();
          $attrCollect = array();
          $innerContentCollect = array();
          if ($this->parseTag($innerContent, $tagCollect, $attrCollect, $innerContentCollect) === false) {
              self::$TagParseError = true;
          }
          foreach ($tagCollect as $index => $childTag) {
              $this->child[] = new TagDomNode(
                  $childTag,
                  $this,
                  $attrCollect[$index],
                  $innerContentCollect[$index],
                  $this->level + 1
              );
          }

          // Objects are stored as handles, so no reference to $this is needed;
          // missing registry levels are created by the assignment itself.
          self::$TagSet[$this->tag][$attr][] = $this;
      }

      /**
       * _parseAttr
       *
       * Fills $this->attr from every name="value" pair of the raw
       * attribute string (double-quoted values only).
       *
       * @param string $str attribute string.
       *
       * @access private
       *
       * @return null
       */
      private function _parseAttr($str)
      {
          preg_match_all('~(?<attrName>[\w-]+)="(?<attrValue>.*?)"~s', $str, $matches);
          foreach ($matches['attrName'] as $key => $value) {
              $this->attr[$value] = $matches['attrValue'][$key];
          }
      }
  }
复制代码


Déclaration:
Le contenu de cet article est volontairement contribué par les internautes et les droits d'auteur appartiennent à l'auteur original. Ce site n'assume aucune responsabilité légale correspondante. Si vous trouvez un contenu suspecté de plagiat ou de contrefaçon, veuillez contacter admin@php.cn