首頁 >後端開發 >php教程 >從請求的頁面提取關鍵字

從請求的頁面提取關鍵字

WBOY
WBOY原創
2016-07-25 08:49:34 · 1126 次瀏覽
這個類別可以從指定 URL 所取得的網頁中提取出一些關鍵字。

例如,從代碼珠璣的首頁可以提取出類似下圖中的關鍵字。(圖片:從請求的頁面提取關鍵字)
  // Demo page: fetches the page named by the "url" request parameter and
  // prints the keywords extracted from it as a numbered list.
  // Example: ?url=http://www.codepearl.com
  if (!empty($_REQUEST['url']) && is_string($_REQUEST['url'])) {
      $url = $_REQUEST['url'];

      // The URL ends up in file_get_contents(); only allow plain web URLs so
      // that local paths and stream wrappers (file://, php://, ...) cannot be
      // read through this page.
      $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
      if ($scheme !== 'http' && $scheme !== 'https') {
          echo 'Only http:// and https:// URLs are supported.<br />';
      } else {
          include 'class.keywords.php';

          $keywords = new keywordsugest();
          $keywords->_lang = 'es';
          $keywords->_encoding = 'iso-8859-1';
          $keywords->_catego = 'telecom';
          $keywords->_keyCount = 100; // treated like a percentage by the class
          $keywords->file($url);
          // readMetaKeyWords() / readHtmlKeyWords() can also be called
          // individually; readAll() runs both.
          $keywords->readAll();

          echo 'Keywords:<br />';
          $i = 1;
          foreach ($keywords->get() as $word) {
              // Words come from a remote page: escape them before output.
              echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, $keywords->_encoding) . "<br />\n";
          }
      }
  }

  // NOTE(review): the markup of the original input form was lost from this
  // listing; this is a minimal replacement that submits the "url" parameter.
  echo "\n" . '<form method="get" action="">'
      . '<input type="text" name="url" size="60" />'
      . '<input type="submit" value="Extract keywords" />'
      . '</form>' . "\n";
複製程式碼
  /**
   * Extracts keyword suggestions from an HTML page.
   *
   * Keywords come from the page's <meta name="keywords"> tag and from the
   * visible body text, ranked by word frequency. Load a page with file() or
   * html(), run readAll() (or the individual read* methods) and fetch the
   * result with get().
   */
  class keywordsugest
  {
      /** Raw HTML being analysed, or FALSE when nothing has been loaded. */
      public $_html = FALSE;
      /** Weight used when computing how many keywords to keep (percentage-like). */
      public $_keyCount = 5;
      /** Keywords collected so far. */
      public $_keyWords = array();
      /** Character encoding passed to the mb_* / entity functions. */
      public $_encoding = 'UTF-8';
      /** Language code; selects the stopwords_<lang>.php and glodic_*_<lang>.php files. */
      public $_lang = 'es';
      /** Category; selects the glodic_<catego>_<lang>.php file. */
      public $_catego = 'telecom';
      /** URL or path last passed to file(). */
      public $_url = '';

      /**
       * Reads the keywords declared in the page's meta "keywords" tag.
       *
       * Replaces $_keyWords with the lower-cased, trimmed, de-duplicated list
       * when such a tag is found; does nothing otherwise.
       */
      public function readMetaKeyWords()
      {
          if (! $this->_html) {
              return;
          }

          $pattern = '/<meta\s+name\s*=\s*["\']?keywords["\']?\s+content\s*=\s*["\']?([^"\'>]*)["\']?\s*\/?\s*>/is';
          if (preg_match($pattern, $this->_html, $match) !== 1) {
              return;
          }

          // Normalise every whitespace character to a plain space before splitting.
          $list = preg_replace('/\s/', ' ', mb_strtolower($match[1], $this->_encoding));

          $found = array();
          foreach (explode(',', $list) as $keyword) {
              // Trim so that "a, b" and "a,b" yield the same entries.
              $keyword = trim($keyword);
              if ($keyword !== '') {
                  $found[] = $keyword;
              }
          }
          $this->_keyWords = array_values(array_unique($found));
      }

      /**
       * Strips HTML tags from a string and collapses runs of spaces.
       *
       * @param string $string markup to clean
       * @return string text with every tag replaced by a single space
       */
      private function rip_tags($string)
      {
          // Replace each tag with a space so adjacent words do not merge.
          $string = preg_replace('/<[^>]*>/', ' ', $string);

          // Collapse multiple spaces.
          return trim(preg_replace('/ {2,}/', ' ', $string));
      }

      /**
       * Reads keywords from the body text of the loaded page.
       *
       * Any keywords already collected are appended to the text so that they
       * take part in the frequency count. $_html is rewritten in place with
       * the stripped, lower-cased text.
       */
      public function readHtmlKeyWords()
      {
          if (! $this->_html) {
              return;
          }

          if (!empty($this->_keyWords)) {
              $this->_html = $this->_html . ' ' . implode(' ', $this->_keyWords);
              $this->_keyWords = array();
          }

          $this->_html = str_replace('&nbsp;', ' ', $this->_html);

          // Remove elements whose content is not visible page text.
          $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
          foreach ($toRemove as $remove) {
              // \b keeps "head" from matching "<header>".
              $this->_html = preg_replace('/<' . $remove . '\b[^>]*>.*?<\/' . $remove . '\s*>/is', ' ', $this->_html);
          }

          // Remove HTML comments.
          $this->_html = preg_replace('/<!--.*?-->/s', ' ', $this->_html);

          // Remove the remaining tags and decode entities.
          $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);
          $this->_html = htmlspecialchars_decode($this->_html);
          $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);

          // Break the text into words on whitespace and common punctuation.
          $words = preg_split('/[\s.,:;!?|]+/', $this->_html, -1, PREG_SPLIT_NO_EMPTY);
          if (empty($words)) {
              return;
          }

          $Frequency = array_count_values($words);
          unset($Frequency['']);
          if (!count($Frequency)) {
              return;
          }

          // The included files are expected to fill $stopWords and $glodic
          // (both are only read below); start from empty lists so a missing
          // file does not leave them undefined.
          $stopWords = array();
          $glodic = array();
          include('stopwords_' . $this->_lang . '.php');
          include('glodic_' . $this->_catego . '_' . $this->_lang . '.php');

          // Drop stop words and tokens made only of punctuation.
          $punct = '~!@#$%^&*()_+|}{[];:\'",<.>/?`-=\\';
          foreach (array_keys($Frequency) as $word) {
              // Numeric-looking keys come back as integers; compare as strings.
              $token = (string) $word;
              if (in_array($token, $stopWords, true) or (strspn($token, $punct) == strlen($token))) {
                  unset($Frequency[$word]);
              }
          }

          // Everything may have been filtered out; max() and the divisions
          // below need at least one entry.
          if (!count($Frequency)) {
              return;
          }

          $max = max($Frequency);
          $count = count($Frequency);
          $tot = round(($max * 100) / $count);
          $tot2 = round(($this->_keyCount * 100) / $count);
          if ($tot > $count) {
              $tot = $tot / 2;
          }
          // NOTE(review): this halves $tot, not $tot2 - looks like a
          // copy/paste slip, kept as found; confirm the intended behaviour.
          if ($tot2 > $count) {
              $tot = $tot / 2;
          }
          $showmax = round(($tot + $tot2) / 2);

          // Boost words that appear in the category dictionary.
          foreach (array_keys($Frequency) as $word) {
              if (in_array((string) $word, $glodic, true)) {
                  $Frequency[$word] = $Frequency[$word] + $showmax;
              }
          }

          // Sort by frequency, highest first.
          arsort($Frequency, SORT_NUMERIC);

          // Add the top words to the keyword list, stopping at $showmax.
          $i = 0;
          foreach (array_keys($Frequency) as $word) {
              $token = (string) $word;
              if (!in_array($token, $this->_keyWords, true) && !is_numeric($token) && $token !== '') {
                  $this->_keyWords[] = $token;
                  $i++;
                  if ($i == $showmax) {
                      break;
                  }
              }
          }
      }

      /**
       * Changes the default UTF-8 encoding.
       *
       * @param string|false $enc encoding name; FALSE leaves the current one
       */
      private function encoding($enc = FALSE)
      {
          if ($enc) {
              $this->_encoding = $enc;
          }
      }

      /**
       * Loads the page to analyse from a file or URL.
       *
       * @param string|false $fileUrl path or URL handed to file_get_contents()
       */
      public function file($fileUrl = FALSE)
      {
          if ($fileUrl) {
              // Best effort: on failure $_html becomes FALSE and the read*
              // methods simply do nothing.
              $this->_html = @file_get_contents($fileUrl);
              $this->_url = $fileUrl;
          }
      }

      /**
       * Sets the HTML to analyse directly from a string.
       *
       * @param string|false $page HTML source; a falsy value is ignored
       */
      public function html($page = FALSE)
      {
          if ($page) {
              $this->_html = $page;
          }
      }

      /**
       * Reads the meta keywords and then the body text.
       */
      public function readAll()
      {
          if ($this->_html !== FALSE) {
              $this->readMetaKeyWords();
              $this->readHtmlKeyWords();
          }
          $this->_keyWords = array_unique($this->_keyWords);
      }

      /**
       * Returns the collected keywords as an array.
       *
       * @return array
       */
      public function get()
      {
          return $this->_keyWords;
      }
  }
複製程式碼


陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn