首頁  >  文章  >  後端開發  >  解析HTML標籤,並實現快速查找節點,取得節點信息

解析HTML標籤,並實現快速查找節點,取得節點信息

WBOY
WBOY原創
2016-07-25 08:48:49 1415瀏覽
詳細介紹與使用方式請參閱原始碼出處。
  1. /**
  2. * htmlmap 解析器
  3. *
  4. * @category TagParse
  5. * @package TagParse
  6. * @author 這個
  7. * @copyright 2014 * @version 1.0
  8. * @link http://www.blogkun.com
  9. * @since 1.0
  10. */
  11. 命名空間 TagParse;
  12. /**
  13. * TagDomRoot
  14. *
  15. * @category TagParse
  16. * @package TagParse
  17. * @author kun
  18. * @copyright 2014 kun; license http://www.php.net/license/3_01.txt PHP 許可證3.01
  19. * @version 1.0
  20. * @link http://www.blogkun.com
  21. * @since 1.0
  22. */
  23. class TagDomRoot
  24. {
  25. public $tag = 'root';
  26. public $plaintext;
  27. public $child = array();
  28. public $level = 0;
  29. public static $ TagParseError = false;
  30. protected static $TagSet = array();
  31. protected static $FoundNode = array();
  32. public static $ErrorTag = array();
  33. 🎝>*/
  34. public function initProperty()
  35. {
  36. $TagParseError = false;
  37. $TagSet = array();
  38. $FoundNode = array(); $ErrorTag = array();
  39. }
  40. /**
  41. * initProperty
  42. *
  43. * @access public
  44. *
  45. * @return null
  46. */
  47. public function __construct($str)
  48. {
  49. $
  50. $ this->_removeNoise($str);
  51. if ($str === null) {
  52. self::$TagParseError = true;
  53. } else {
  54. $l = strpos($str ,> } else {
  55. $l = strpos($str , ' if ($l !== false) {
  56. $this->plaintext = substr($str, 0, $l);
  57. }
  58. $res = preg_match_all( '~>(.*?) if ($res !== false && $res >; 0) {
  59. $this->plaintext .= 內爆($matches[1]);
  60. }
  61. $r = strrpos($str, '>');
  62. if ($ r !== false) {
  63. $this->plaintext 。 🎜 >
  64. if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
  65. self::$TagParseError = true;
  66. }
  67. self::$TagParseError = true;
  68. }
  69. foreach ($tagCollect as $index => $tag) {
  70. $this->child[] = new TagDomNode($tag , $this, $attrCollect[$index], $innerContentCollect[$index], $ this ->level 1);
  71. }
  72. }
  73. }
  74. /**
  75. * __construct
  76. *
  77. * @param string $str 要解析的標籤字串。
  78. *
  79. * @access public
  80. *
  81. * @return TagDomRoot
  82. */
  // parseTag: scans $str and appends one entry per tag it finds to the three
  // by-reference arrays: the tag name, the raw attribute string (or '' when the
  // tag has none) and the tag's inner content. Tags listed in $selfClosingTags
  // get '' as inner content. Stray closing tags are appended to self::$ErrorTag.
  // Returns the negation of the local error flag, i.e. false when any
  // malformed markup was seen.
  //
  // NOTE(review): this listing was damaged in extraction and is not runnable as
  // shown. Arithmetic operators appear to be missing (e.g. "$l 1", "$countOpen ;"),
  // several string literals that presumably contained '<' are cut off mid-line,
  // the self-closing tag table and the control keywords (continue / break /
  // "protected function") have been machine-translated, and the first parameter
  // is spelled $taglect while the body writes to $tagCollect.
  // TODO: restore this method from the original source before use.
  83. 受保護的函數parseTag($str, array &$taglect, array &$attrCollect, array &$innerContentCollect)
  84. {
  85. $selfClosingTags = array('img' => 1, 'br ' => 1, '輸入' => 1, '元' => 1、 '連結'=> 1、'小時'=> 1、'基礎'=> 1、'嵌入'=> 1、'間隔'=> 1);
  86. $end = -2;
  87. $close = 0;
  88. $error = false;
  89. $tag = '';
  90. while (true) {
  91. $l = strpos($str, ' if ($l === false) {//解析結束
  92. break;
  93. }
  94. if (strpos(substr($str, $l, 2), '/' ) !== false) {//多餘的結束標記,丟棄
  95. $error = true;
  96. $end = $l strlen( $tag);
  97. self::$ErrorTag[] = substr( $ str, $l, strpos($str, '>', $l)-$l 1);
  98. 繼續;
  99. }
  100. $r = strpos($str, '>' , $l);
  101. $tag = substr($str, $l 1, $r-$l-1);
  102. if (!ctype_alpha($tag[0]) || strpos($tag, ' $end = $r 1;
  103. 繼續;
  104. }
  105. $tag = preg_replace("~n ~", ' ', $tag);
  106. $space = strpos($tag, ' ');
  107. if ($space !== false) {
  108. $attrCollect[] = substr($tag, $space 1);
  109. $tag = substr($tag, 0, $space);
  110. } else {
  111. $attrCollect[] = '' ;
  112. }
  113. $tagCollect[] = $tag;
  114. if (isset($selfClosingTags[$tag])) {
  115. $innerContentCollect[] = '';
  116. $end = $> $innerContentCollect[] = '';
  117. $end = $ r-strlen($tag)-2;
  118. $close = $r 1;
  119. 繼續;
  120. }
  121. $countOpen = -1;
  122. $open = strpos($ str, ' $close = strpos($str, ''.$tag.'>', $open);
  123. if ($close === false) {//告白的開始標記
  124. $innerContentCollect[] = substr($str, $r 1);
  125. $error = true;
  126. self::$ErrorTag[] = '';
  127. 規則;
  128. }
  129. $start = $open;
  130. while ($open
  131. $close && $open !== false) {
  132. $countOpen ;
  133. $open = strpos($str, ' }
  134. while ($countOpen > 0 && $close ! == false) {
  135. $open = strpos($str, ' $close = strpos($str, ''.$tag.'>', $close strlen($tag) 3);
  136. if ($close === false) {
  137. break ;
  138. }
  139. $countOpen--;
  140. while ($open $open = strpos($str, ' $countOpen ;
  141. }
  142. }
  143. if ($close === false) {//標籤閉合不佈局
  144. $innerContentCollect[] = substr($str, $r 1);
  145. $error = true;
  146. 中斷;
  147. }
  148. $end = $close;
  149. $r = strpos($ str, '> ', $start);
  150. $innerContentCollect[] = substr($str, $r 1, $end - $r - 1);
  151. }
  152. return !$錯誤;
  153. }
  154. /**
  155. * _removeNoise
  156. *
  157. * @param string &$str 要解析的標籤字串。
  158. *
  159. * @access private
  160. *
  161. * @return string
  162. */
  // _removeNoise: rewrites $str in place (it is taken by reference) through
  // three successive preg_replace() calls; nothing is returned. preg_replace()
  // yields null on a regex error, so $str can be null afterwards.
  //
  // NOTE(review): the regex bodies were stripped in extraction. '~~is' is an
  // empty pattern and '~*?>~is' is not a valid one. Presumably they removed
  // things like doctype declarations, comments and script blocks, but that
  // cannot be confirmed from this listing. The keyword on the first line is
  // a machine translation of "private function".
  // TODO: restore the patterns from the original source before use.
  163. 私有函數_removeNoise(&$str)
  164. {
  165. $str = preg_replace('~~is', '', $str);
  166. $str = preg_replace('~~is', ' ', $str);
  167. $str = preg_replace('~*?>~is', '', $str);
  168. }
  169. /**
  170. * parseSelectors
  171. *
  172. * @param string $selectors 使用者的選擇條件。
  173. * @param array &$selectorsTag 標籤
  174. * @param array &$selectorsAttr 屬性
  175. *
  176. * @access protected
  177. *
  178. * @return null *
  179. * @return null
  180. */
  181. protected function parseSelectors ( $selectors, 陣列&$selectorsTag, 陣列&$selectorsAttr)
  182. {
  183. preg_match_all('~([wd] )([[wd -="._/ ] ])?~', $selectors, $ matches);
  184. $selectorsTag = $matches[1];
  185. foreach ($matches[2] as $key =>; $value) {
  186. $selectorsAttr[$key] = array();
  187. if ($value !== '') {
  188. preg_match_all('~([wd-] )="([wd -._/] )"~', $value, $matches);
  189. foreach ($matches[1] as $index => $attr) {
  190. $selectorsAttr[$key][$attr] = $matches[2][$index];
  191. }
  192. }
  193. }
  194. }
  195. /**
  196. * find
  197. *
  198. * @param mix $selectors 使用者的選擇條件。
  199. * @param array $selectorsTag 標籤。
  200. * @param array $selectorsAttr 屬性。
  201. *
  202. * @access public
  203. *
  204. * @return array
  205. */
  206. public function find($selectors , $selectorsTag = array(), $selectorsAttr = array())
  207. {
  208. if ($selectors !== null) {
  209. $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr );
  210. }
  211. var_dump($selectorsTag, $selectorsAttr);exit();
  212. if (!empty($selectorsTag)) {
  213. $this->seek($selectorsTag, $selectorsAttr) );
  214. foreach ($this->child as $key => $node) {
  215. $node->find(null, $selectorsTag, $selectorsAttr);
  216. }
  217. }
  218. if ($selectors !== null) {
  219. $res = self::$FoundNode;
  220. self::$FoundNode = array(); return $res ; } }
  221. /**
  222. * findGlobal
  223. *
  224. * @param string $selectors 使用者的選擇條件。
  225. *
  226. * @access public
  227. *
  228. * @return array
  229. */
  230. public function findGlobal($selectors)
  231. {
  232. $space = strpos($selectors, ' ', strpos($selectors, ']' ));
  233. if ($space === false) {
  234. return $this->findOneGlobal($selectors);
  235. } else {
  236. $selectorsAttr = array();
  237. $
  238. $selectorsAttr = array();
  239. $ selectorsTag = array();
  240. $this->findOneGlobal(substr($selectors, 0, $space), false);
  241. $this->parseSelectors(substr($selectors, $space 1) , $selectorsTag , $selectorsAttr);
  242. if (!empty(self::$FoundNode) && !empty($selectorsTag)) {
  243. $nodes = self::$FoundNode;
  244. self::$ FoundNode = 陣列();
  245. foreach ($nodes as $key => $node) {
  246. $node->seek($selectorsTag, $selectorsAttr);
  247. }
  248. }
  249. }
  250. $res = self::$FoundNode;
  251. self::$FoundNode = array();
  252. return $res;
  253. }
  254. /**
  255. * 尋找
  256. *
  257. * @param 陣列 $selectorsTag 標籤。
  258. * @param 陣列 $selectorsAttr 屬性。
  259. *
  260. * @access protected
  261. *
  262. * @return null
  263. */
  264. protected functioneek($selectorsTag, $selectorsAttr)
  265. {
  266. foreach ($this->child as $key =>; $node) {
  267. $isFind = true;
  268. if ($node->tag === $selectorsTag[0]) {
  269. foreach ($selectorsAttr[0] as $attrName => $ value) {
  270. if ( isset($node->attr[$attrName])
  271. && (preg_match('~.*? '.$價值。 .*?~', $node ->attr[$attrName]) > 0
  272. || preg_match('~^'.$value.'$~', $node->attr[$attrName]) > 0
  273. | | preg_match('~^'.$value.' ~', $node->attr[$attrName]) > 0
  274. || preg_match('~ '.$value.'$~', $node ->attr[$attrName]) > 0)
  275. ) {
  276. 續;
  277. } else {
  278. $isFind = false ;
  279. 中斷;
  280. }
  281. }
  282. } else {
  283. $isFind = false;
  284. }
  285. if ($isFind) {
  286. if (count($selectorsTag ) ) === 1) {
  287. self::$ FoundNode[] = $node;
  288. } else {
  289. $node->seek(
  290. array_slice($selectorsTag, 1),
  291. array_slice($selectorsAttr, 1)
  292. );
  293. }
  294. }
  295. }
  296. }
  297. /***/ $選擇器, $isReturn = true)
  298. {
  299. preg_match('~([wd] )([[wd -="._/] ])?~', $selector, $matches);
  300. $tag = $matches[1];
  301. $ attr = array();
  302. if (isset($matches[2])) {
  303. preg_match_all('~([wd-] )= "( [wd-._/] )"~', $matches[2], $matches);
  304. foreach ($matches[1] as $key => $value) {
  305. $attr[$ value] = $matches [2][$key];
  306. }
  307. }
  308. if (isset(self::$TagSet[$tag])) {
  309. foreach (self::$TagSet [$ tag] as $attrValue => $nodeArray) {
  310. $isFind = true;
  311. foreach ($attr as $attrName => $value) {
  312. if (preg_match('~'.$attrName .' = ".*? '.$value.' .*?"~', $attrValue)
  313. || preg_match('~'.$attrName.'="'.$value.' .*?" ~' , $attrValue)
  314. || preg_match('~'.$attrName.'=".*? '.$value.'"~', $attrValue)
  315. || preg_match('~'.$attrName. '="'.$value.'"~', $attrValue)
  316. ) {
  317. 繼續;
  318. } else {
  319. $isFind = false;
  320. 中斷;
  321. }
  322. }
  323. if ($isFind) {
  324. foreach ($nodeArray as $key =>; $node) {
  325. self::$FoundNode[] = $key =>; $node) {
  326. 是 $節點;
  327. }
  328. }
  329. }
  330. }
  331. if ($isReturn) {
  332. $res = self ::$FoundNode;
  333. self::$FoundNode = array( );
  334. 回傳$res;
  335. }
  336. }
  337. }
  338. /**
  339. * TagDomNode
  340. *
  341. * @uses TagDomRoot
  342. *
  343. * @category TagParse
  344. * @package TagParse
  345. * @author kun
  346. * @package TagParse
  347. * @author kun * @copyright 2014 kun
  348. * @license http://www.php.net/license/3_01.txt PHP 許可證3.01
  349. * @version 1.0
  350. * @link http://www. php.net/license/3_01.txt php.net/license/3_01.txt blogkun.com
  351. * @since 1.0
  352. */
  353. class TagDomNode 擴充 TagDomRoot
  354. {
  355. public $attr = array();
  356. public $parent = null;
  357. /**
  358. * __construct
  359. *
  360. * @param Mixed $tag 標籤。
  361. * @param Mixed $parent 父節點。
  362. * @param Mixed $attr 屬性。
  363. * @param Mixed $innerContent 標記內容。
  364. * @param 混合 $level 節點層級。
  365. *
  366. * @access public
  367. *
  368. * @return TagDomNode
  369. */
  370. 公用函數__construct($tag, $parent, $attr, $innerContent, $level)
  371. {
  372. $this->tag = $tag;
  373. $this->parent = $parent;
  374. $this->_parseAttr($attr);
  375. $this->level = $level;
  376. $l = strpos($ insideContent, ' if ($l !== false) {
  377. $this->plaintext = substr($innerContent, 0, $l);
  378. }
  379. $ res = preg_match_all(' ~ >(.*?) if ($res !== false && $res > 0) {
  380. $this ->plaintext .= implode($ matches [1]);
  381. } else {
  382. $this->plaintext .= $innerContent;
  383. }
  384. $r = strrpos($innerContent, '>');
  385. if ( $ r !== false) {
  386. $this->plaintext .= substr($innerContent, $r 1);
  387. }
  388. $tagCollect = array();
  389. $attrlect = array();
  390. $innerContentCollect = array();
  391. if ($this->parseTag($innerContent, $tagCollect, $attrCollect) , $innerContentCollect) === false) {
  392. self::$TagParseError = true;
  393. }
  394. foreach ($tagCollect as $index =>; $tag) {
  395. $this->child[] = new TagDomNode($tag, $this, $attrCollect[$index], $innerContentCollect[$index], $this->level 1);
  396. }
  397. if (!isset(self::$TagSet[$this->tag])) {
  398. self::$TagSet[$this->tag] = array();
  399. }
  400. if (!isset(self::$TagSet[$this->tag][$attr])) {
  401. self::$TagSet[$this->tag][$attr] = array();
  402. }
  403. self::$TagSet[$this->tag][$attr][] = &$this;
  404. }
  405. /* *
  406. * _parseAttr
  407. *
  408. * @param string $str 屬性字串。
  409. *
  410. * @access public
  411. *
  412. * @return null
  413. */
  414. 解剖函數_parseAttr($str)
  415. {
  416. preg_match_all('~(?[w-] )="(?.*?) "~s' , $str, $matches);
  417. foreach ($matches['attrName'] as $key => $value) {
  418. $this->attr[$value] = $matches ['attrValue'][ $ key];
  419. }
  420. }
  421. }
複製程式碼


陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn