Home  >  Article  >  Backend Development  >  PHP code for fetching Weibo content through the Tencent QQ Weibo API

PHP code for fetching Weibo content through the Tencent QQ Weibo API

WBOY
WBOYOriginal
2016-07-25 08:57:47 · 1209 views
  /**
   * Use Tencent QQ Weibo API interface to obtain Weibo content
   * by bbs.it-home.org
   */

  // Node types, stored in simple_html_dom_node::$nodetype.
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT', 3);
  define('HDOM_TYPE_ENDTAG', 4);
  define('HDOM_TYPE_ROOT', 5);
  define('HDOM_TYPE_UNKNOWN', 6);

  // Attribute quoting styles, read by simple_html_dom_node::makeup()
  // to decide which quote character surrounds an attribute value.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO', 3);

  // Keys of the per-node info array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN', 0);
  define('HDOM_INFO_END', 1);
  define('HDOM_INFO_QUOTE', 2);
  define('HDOM_INFO_SPACE', 3);
  define('HDOM_INFO_TEXT', 4);
  define('HDOM_INFO_INNER', 5);
  define('HDOM_INFO_OUTER', 6);
  define('HDOM_INFO_ENDSPACE', 7);
  // helper functions
  // ---------------

  /**
   * Get an HTML DOM from a file or URL.
   *
   * Every argument is forwarded unchanged to file_get_contents(), so the
   * accepted parameters are exactly those of file_get_contents(). The
   * fetched contents are handed to simple_html_dom::load() with the
   * lowercase flag set to true.
   *
   * @return simple_html_dom the loaded DOM object
   */
  function file_get_html() {
      $dom = new simple_html_dom;
      $args = func_get_args();
      $dom->load(call_user_func_array('file_get_contents', $args), true);
      return $dom;
  }
  /**
   * Get an HTML DOM from a string.
   *
   * @param string $str       markup to load
   * @param bool   $lowercase passed through as the second argument of
   *                          simple_html_dom::load()
   * @return simple_html_dom the loaded DOM object
   */
  function str_get_html($str, $lowercase=true) {
      $dom = new simple_html_dom;
      $dom->load($str, $lowercase);
      return $dom;
  }
  /**
   * Dump an HTML DOM tree to standard output, one node per line.
   *
   * Each line is the node's tag, indented by one space per nesting level,
   * optionally followed by its attributes as `([name]=>"value", ...)`.
   * Children are taken from $node->nodes and printed recursively.
   *
   * @param object $node      node exposing ->tag, ->attr and ->nodes
   * @param bool   $show_attr whether to print the attribute list
   * @param int    $deep      current nesting depth (indent width)
   * @return void
   */
  function dump_html_tree($node, $show_attr=true, $deep=0) {
      $lead = str_repeat(' ', $deep);
      echo $lead.$node->tag;
      if ($show_attr && count($node->attr)>0) {
          echo '(';
          foreach ($node->attr as $k=>$v) {
              // read through the property so the node's own accessor is used
              echo "[$k]=>\"".$node->$k.'", ';
          }
          echo ')';
      }
      echo "\n";
      foreach ($node->nodes as $c) {
          dump_html_tree($c, $show_attr, $deep+1);
      }
  }
  /**
   * Get a DOM from a file or URL (deprecated).
   *
   * Kept for backward compatibility; it forwards every argument to
   * file_get_html(), which performs the identical load.
   *
   * @deprecated use file_get_html() instead
   * @return simple_html_dom the loaded DOM object
   */
  function file_get_dom() {
      $args = func_get_args();
      return call_user_func_array('file_get_html', $args);
  }
  /**
   * Get a DOM from a string (deprecated).
   *
   * Kept for backward compatibility; it forwards to str_get_html(), which
   * performs the identical load.
   *
   * @deprecated use str_get_html() instead
   * @param string $str       markup to load
   * @param bool   $lowercase passed through to str_get_html()
   * @return simple_html_dom the loaded DOM object
   */
  function str_get_dom($str, $lowercase=true) {
      return str_get_html($str, $lowercase);
  }
  // simple html dom node
  // ---------------

  /**
   * One node of a parsed HTML tree.
   *
   * A node registers itself in its owning DOM object's flat `nodes` list on
   * construction. Tree navigation, rendering (inner/outer/plain text) and
   * CSS-selector lookup are implemented here; attributes are exposed as
   * magic properties backed by $attr.
   */
  class simple_html_dom_node {
      /** @var int one of the HDOM_TYPE_* constants */
      public $nodetype = HDOM_TYPE_TEXT;
      /** @var string tag name ('text' by default, 'root' for the root node) */
      public $tag = 'text';
      /** @var array attribute name => value; true marks a no-value attribute, null/false a removed one */
      public $attr = array();
      /** @var array child nodes used for navigation and selector matching */
      public $children = array();
      /** @var array nodes rendered, in order, by innertext()/outertext()/text() */
      public $nodes = array();
      /** @var simple_html_dom_node|null parent node */
      public $parent = null;
      /** @var array per-node info keyed by the HDOM_INFO_* constants */
      public $_ = array();
      /** @var object|null owning DOM (provides nodes, root, callback, lowercase, restore_noise()) */
      private $dom = null;

      /**
       * @param object $dom owning DOM; this node is appended to $dom->nodes
       */
      public function __construct($dom) {
          $this->dom = $dom;
          $dom->nodes[] = $this;
      }

      public function __destruct() {
          $this->clear();
      }

      public function __toString() {
          return $this->outertext();
      }

      // clean up memory due to php5 circular references memory leak...
      public function clear() {
          $this->dom = null;
          $this->nodes = null;
          $this->parent = null;
          $this->children = null;
      }

      // dump node's tree
      public function dump($show_attr=true) {
          dump_html_tree($this, $show_attr);
      }

      // returns the parent of node
      public function parent() {
          return $this->parent;
      }

      /**
       * Returns the children of this node.
       *
       * @param int $idx -1 for the whole list, otherwise the child index
       * @return array|simple_html_dom_node|null null when $idx does not exist
       */
      public function children($idx=-1) {
          if ($idx===-1) return $this->children;
          if (isset($this->children[$idx])) return $this->children[$idx];
          return null;
      }

      // returns the first child of node, or null when there is none
      public function first_child() {
          if (count($this->children)>0) return $this->children[0];
          return null;
      }

      // returns the last child of node, or null when there is none
      public function last_child() {
          if (($count=count($this->children))>0) return $this->children[$count-1];
          return null;
      }

      // returns the next sibling of node, or null
      public function next_sibling() {
          if ($this->parent===null) return null;
          $idx = 0;
          $count = count($this->parent->children);
          // locate this node among its parent's children
          while ($idx<$count && $this!==$this->parent->children[$idx])
              ++$idx;
          if (++$idx>=$count) return null;
          return $this->parent->children[$idx];
      }

      // returns the previous sibling of node, or null
      public function prev_sibling() {
          if ($this->parent===null) return null;
          $idx = 0;
          $count = count($this->parent->children);
          // locate this node among its parent's children
          while ($idx<$count && $this!==$this->parent->children[$idx])
              ++$idx;
          if (--$idx<0) return null;
          return $this->parent->children[$idx];
      }

      /**
       * Get the node's inner html.
       *
       * An explicitly assigned inner text wins; text-like nodes return their
       * stored text; otherwise the outer text of every entry in $nodes is
       * concatenated.
       *
       * @return string
       */
      public function innertext() {
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          $ret = '';
          foreach ($this->nodes as $n)
              $ret .= $n->outertext();
          return $ret;
      }

      /**
       * Get the node's outer text (with its own tag).
       *
       * Invokes the DOM's callback (if set) with this node before rendering,
       * so the callback may override the output through the magic setters.
       *
       * @return string
       */
      public function outertext() {
          if ($this->tag==='root') return $this->innertext();
          // trigger callback
          if ($this->dom->callback!==null)
              call_user_func_array($this->dom->callback, array($this));
          if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          // render begin tag
          $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
          // render inner text
          if (isset($this->_[HDOM_INFO_INNER])) {
              $ret .= $this->_[HDOM_INFO_INNER];
          } else {
              foreach ($this->nodes as $n)
                  $ret .= $n->outertext();
          }
          // render end tag (only when an end position was recorded)
          if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
              $ret .= '</'.$this->tag.'>';
          return $ret;
      }

      /**
       * Get the node's plain text.
       *
       * Comment and unknown nodes, as well as script and style elements,
       * contribute an empty string.
       *
       * @return string
       */
      public function text() {
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          switch ($this->nodetype) {
              case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
              case HDOM_TYPE_COMMENT: return '';
              case HDOM_TYPE_UNKNOWN: return '';
          }
          if (strcasecmp($this->tag, 'script')===0) return '';
          if (strcasecmp($this->tag, 'style')===0) return '';
          $ret = '';
          foreach ($this->nodes as $n)
              $ret .= $n->text();
          return $ret;
      }

      // inner text with the CDATA wrapper markers removed
      public function xmltext() {
          $ret = $this->innertext();
          $ret = str_ireplace('<![CDATA[', '', $ret);
          $ret = str_replace(']]>', '', $ret);
          return $ret;
      }

      /**
       * Build the node's opening tag, including its attributes.
       *
       * The whitespace and quoting recorded per attribute position in
       * $_[HDOM_INFO_SPACE] / $_[HDOM_INFO_QUOTE] are replayed as stored.
       *
       * @return string
       */
      public function makeup() {
          // text, comment, unknown
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
          $ret = '<'.$this->tag;
          $i = -1;
          foreach ($this->attr as $key=>$val) {
              // $i tracks the attribute position even for skipped entries
              ++$i;
              // skip removed attribute
              if ($val===null || $val===false)
                  continue;
              $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
              //no value attr: nowrap, checked selected...
              if ($val===true) {
                  $ret .= $key;
              } else {
                  switch ($this->_[HDOM_INFO_QUOTE][$i]) {
                      case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                      case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                      default: $quote = '';
                  }
                  $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
              }
          }
          $ret = $this->dom->restore_noise($ret);
          return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
      }

      /**
       * Find elements by CSS selector.
       *
       * @param string   $selector comma-separated selector groups; each group
       *                           is a chain of descendant selectors
       * @param int|null $idx      null for all matches, otherwise the index of
       *                           the wanted match (negative counts from the end)
       * @return array|simple_html_dom_node|null matches ordered by their
       *         position in the DOM's node list; null when $idx does not exist
       */
      public function find($selector, $idx=null) {
          $selectors = $this->parse_selector($selector);
          if (($count=count($selectors))===0) return array();
          $found_keys = array();
          // find each selector
          for ($c=0; $c<$count; ++$c) {
              // every comma-separated group has its own chain length
              if (($level=count($selectors[$c]))===0) return array();
              if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
              $head = array($this->_[HDOM_INFO_BEGIN]=>1);
              // handle descendant selectors, no recursive!
              for ($l=0; $l<$level; ++$l) {
                  $ret = array();
                  foreach ($head as $k=>$v) {
                      $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                      $n->seek($selectors[$c][$l], $ret);
                  }
                  $head = $ret;
              }
              foreach ($head as $k=>$v) {
                  if (!isset($found_keys[$k]))
                      $found_keys[$k] = 1;
              }
          }
          // sort keys
          ksort($found_keys);
          $found = array();
          foreach ($found_keys as $k=>$v)
              $found[] = $this->dom->nodes[$k];
          // return nth-element or array
          if (is_null($idx)) return $found;
          else if ($idx<0) $idx = count($found) + $idx;
          return (isset($found[$idx])) ? $found[$idx] : null;
      }

      /**
       * Collect the nodes below this one that satisfy a single parsed selector.
       *
       * @param array $selector array($tag, $key, $val, $exp, $no_key) as
       *                        produced by parse_selector()
       * @param array $ret      receives matching node indexes as keys
       * @return void
       */
      protected function seek($selector, &$ret) {
          list($tag, $key, $val, $exp, $no_key) = $selector;
          // xpath index: pick the $key-th direct child with that tag
          if ($tag && $key && is_numeric($key)) {
              $count = 0;
              foreach ($this->children as $c) {
                  if ($tag==='*' || $tag===$c->tag) {
                      if (++$count==$key) {
                          $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                          return;
                      }
                  }
              }
              return;
          }
          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end==0) {
              // no end recorded on this node: borrow it from the nearest
              // ancestor that has one, offset by the levels climbed
              $parent = $this->parent;
              while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              // no ancestor carries an end position: nothing to scan
              if ($parent===null) return;
              $end += $parent->_[HDOM_INFO_END];
          }
          for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
              $node = $this->dom->nodes[$i];
              $pass = true;
              // bare '*' matches direct children only
              if ($tag==='*' && !$key) {
                  if (in_array($node, $this->children, true))
                      $ret[$i] = 1;
                  continue;
              }
              // compare tag
              if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
              // compare key
              if ($pass && $key) {
                  if ($no_key) {
                      if (isset($node->attr[$key])) $pass=false;
                  }
                  else if (!isset($node->attr[$key])) $pass=false;
              }
              // compare value
              if ($pass && $key && $val && $val!=='*') {
                  $check = $this->match($exp, $val, $node->attr[$key]);
                  // handle multiple class
                  if (!$check && strcasecmp($key, 'class')===0) {
                      foreach (explode(' ',$node->attr[$key]) as $k) {
                          $check = $this->match($exp, $val, $k);
                          if ($check) break;
                      }
                  }
                  if (!$check) $pass = false;
              }
              if ($pass) $ret[$i] = 1;
              unset($node);
          }
      }

      /**
       * Test an attribute value against a selector pattern.
       *
       * @param string $exp     one of '=', '!=', '^=', '$=', '*='
       * @param string $pattern selector value; for '*=' a value starting
       *                        with '/' is used as a complete regex
       * @param string $value   attribute value being tested
       * @return bool|int truthy when the value matches
       */
      protected function match($exp, $pattern, $value) {
          switch ($exp) {
              case '=':
                  return ($value===$pattern);
              case '!=':
                  return ($value!==$pattern);
              case '^=':
                  return preg_match("/^".preg_quote($pattern,'/')."/", $value);
              case '$=':
                  return preg_match("/".preg_quote($pattern,'/')."$/", $value);
              case '*=':
                  if ($pattern[0]=='/')
                      return preg_match($pattern, $value);
                  return preg_match("/".$pattern."/i", $value);
          }
          return false;
      }

      /**
       * Parse a selector string into groups of descendant selectors.
       *
       * @param string $selector_string
       * @return array list of groups (split on ','); each group is a list of
       *               array($tag, $key, $val, $exp, $no_key)
       */
      protected function parse_selector($selector_string) {
          // pattern of CSS selectors, modified from mootools
          // (hyphens inside the character classes are escaped so the pattern
          // also compiles under PCRE2)
          $pattern = '/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
          preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
          $selectors = array();
          $result = array();
          foreach ($matches as $m) {
              $m[0] = trim($m[0]);
              if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
              // for browser generated xpath
              if ($m[1]==='tbody') continue;
              list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
              if (!empty($m[2])) {$key='id'; $val=$m[2];}
              if (!empty($m[3])) {$key='class'; $val=$m[3];}
              if (!empty($m[4])) {$key=$m[4];}
              if (!empty($m[5])) {$exp=$m[5];}
              if (!empty($m[6])) {$val=$m[6];}
              // convert to lowercase
              if ($this->dom->lowercase) {
                  $tag = strtolower($tag);
                  if ($key!==null) $key = strtolower($key);
              }
              //elements that do NOT have the specified attribute
              if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
              $result[] = array($tag, $key, $val, $exp, $no_key);
              // a comma closes the current group
              if (trim($m[7])===',') {
                  $selectors[] = $result;
                  $result = array();
              }
          }
          if (count($result)>0)
              $selectors[] = $result;
          return $selectors;
      }

      /**
       * Magic reader: attributes first, then the virtual text properties.
       * For any other name, returns whether the attribute key exists.
       */
      public function __get($name) {
          if (isset($this->attr[$name])) return $this->attr[$name];
          switch ($name) {
              case 'outertext': return $this->outertext();
              case 'innertext': return $this->innertext();
              case 'plaintext': return $this->text();
              case 'xmltext': return $this->xmltext();
              default: return array_key_exists($name, $this->attr);
          }
      }

      /**
       * Magic writer: 'outertext'/'innertext' override the rendered output,
       * any other name sets an attribute (new ones get a leading space and
       * double quotes when rendered).
       */
      public function __set($name, $value) {
          switch ($name) {
              case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
              case 'innertext':
                  if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                  return $this->_[HDOM_INFO_INNER] = $value;
          }
          if (!isset($this->attr[$name])) {
              $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
              $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
          }
          $this->attr[$name] = $value;
      }

      public function __isset($name) {
          switch ($name) {
              case 'outertext': return true;
              case 'innertext': return true;
              case 'plaintext': return true;
          }
          //no value attr: nowrap, checked selected...
          return array_key_exists($name, $this->attr);
      }

      public function __unset($name) {
          if (isset($this->attr[$name]))
              unset($this->attr[$name]);
      }

      // camel naming conventions
      public function getAllAttributes() {return $this->attr;}
      public function getAttribute($name) {return $this->__get($name);}
      public function setAttribute($name, $value) {$this->__set($name, $value);}
      public function hasAttribute($name) {return $this->__isset($name);}
      public function removeAttribute($name) {$this->__set($name, null);}
      public function getElementById($id) {return $this->find("#$id", 0);}
      public function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
      public function getElementByTagName($name) {return $this->find($name, 0);}
      public function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
      public function parentNode() {return $this->parent();}
      public function childNodes($idx=-1) {return $this->children($idx);}
      public function firstChild() {return $this->first_child();}
      public function lastChild() {return $this->last_child();}
      public function nextSibling() {return $this->next_sibling();}
      public function previousSibling() {return $this->prev_sibling();}
  }
  408. // simple html dom parser
  409. // -----------------------------------------------------------------------------
  410. class simple_html_dom {
  411. public $root = null;
  412. public $nodes = array();
  413. public $callback = null;
  414. public $lowercase = false;
  415. protected $pos;
  416. protected $doc;
  417. protected $char;
  418. protected $size;
  419. protected $cursor;
  420. protected $parent;
  421. protected $noise = array();
  422. protected $token_blank = " trn";
  423. protected $token_equal = ' =/>';
  424. protected $token_slash = " />rnt";
  425. protected $token_attr = ' >';
  426. // use isset instead of in_array, performance boost about 30%...
  427. protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
  428. protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
  429. protected $optional_closing_tags = array(
  430. 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
  431. 'th'=>array('th'=>1),
  432. 'td'=>array('td'=>1),
  433. 'li'=>array('li'=>1),
  434. 'dt'=>array('dt'=>1, 'dd'=>1),
  435. 'dd'=>array('dd'=>1, 'dt'=>1),
  436. 'dl'=>array('dd'=>1, 'dt'=>1),
  437. 'p'=>array('p'=>1),
  438. 'nobr'=>array('nobr'=>1),
  439. );
  440. function __construct($str=null) {
  441. if ($str) {
  442. if (preg_match("/^http:///i",$str) || is_file($str))
  443. $this->load_file($str);
  444. else
  445. $this->load($str);
  446. }
  447. }
  448. http://www.devdao.com/
  449. function __destruct() {
  450. $this->clear();
  451. }
  452. // load html from string
  453. function load($str, $lowercase=true) {
  454. // prepare
  455. $this->prepare($str, $lowercase);
  456. // strip out comments
  457. $this->remove_noise("''is");
  458. // strip out cdata
  459. $this->remove_noise("''is", true);
  460. // strip out