Heim  >  Artikel  >  Backend-Entwicklung  >  从请求的页面提取关键词

从请求的页面提取关键词

WBOY
WBOYOriginal
2016-07-25 08:49:34 1070 Durchsuche
它可以根据给定的 URL 检索网页，并从中提取出一些关键词。

例如从代码珠玑的首页可以提取出类似下面图片中的关键词 从请求的页面提取关键词 从请求的页面提取关键词
  // Demo front end: fetches the page named by the "url" request parameter and
  // prints the keywords extracted from it, one numbered entry per line.
  if (! empty($_REQUEST['url'])) {
      $url = is_string($_REQUEST['url']) ? $_REQUEST['url'] : '';
      $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

      // The URL is untrusted input and ends up in file_get_contents(); only
      // http/https are accepted so local paths and other stream wrappers
      // (file://, php://, ...) cannot be read through this page.
      // NOTE(review): internal http(s) hosts are still reachable from the
      // server; add a host allow-list if this is deployed publicly.
      if ($scheme === 'http' || $scheme === 'https') {
          include 'class.keywords.php';

          $keywords = new keywordsugest();
          $keywords->_lang = 'es';
          $keywords->_encoding = 'iso-8859-1';
          $keywords->_catego = 'telecom';
          $keywords->_keyCount = 100; // weight used when sizing the result list (original note: "like the percent %")
          $keywords->file($url);
          // Either source can be read on its own instead of readAll():
          //   $keywords->readMetaKeyWords();
          //   $keywords->readHtmlKeyWords();
          $keywords->readAll();

          echo 'Keywords found :';
          $i = 1;
          foreach ($keywords->get() as $word) {
              // Words come from a remote page after entity decoding, so they
              // must be escaped before being written into this HTML response.
              echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, $keywords->_encoding) . "<br />\n";
          }
      } else {
          echo 'Invalid url: only http and https addresses are supported.';
      }
  }
  // Example url: http://www.codepearl.com
  // NOTE(review): the markup this echo originally printed appears to have been
  // lost when the listing was extracted; the visible output is kept unchanged.
  echo "
    ";
复制代码
  /**
   * Extracts keyword candidates from an HTML document.
   *
   * Keywords are taken from the page's meta "keywords" tag and/or from the
   * visible body text, ranked by how often each word occurs. Words listed in
   * an optional per-language stop-word file are dropped and words listed in an
   * optional per-category dictionary file are boosted.
   *
   * Typical use: set the public options, load a document with file() or
   * html(), call readAll(), then fetch the result with get().
   */
  class keywordsugest
  {
      /** Loaded HTML as a string, or FALSE when nothing has been loaded. */
      public $_html = FALSE;

      /** Weight that takes part in sizing the result list (see readHtmlKeyWords()). */
      public $_keyCount = 5;

      /** Keywords collected so far. */
      public $_keyWords = array();

      /** Character encoding passed to the mb_* / entity functions. */
      public $_encoding = 'UTF-8';

      /** Language code used in the dictionary file names. */
      public $_lang = 'es';

      /** Category name used in the boost dictionary file name. */
      public $_catego = 'telecom';

      /** Source passed to the last file() call. */
      public $_url = '';

      /**
       * Reads the keywords from the document's meta "keywords" tag.
       *
       * The comma separated content is lower-cased, each entry is trimmed,
       * empty entries are dropped and duplicates removed. When the tag is
       * absent the current keyword list is left untouched.
       *
       * Only the attribute order name="keywords" content="..." with double
       * quotes is recognised.
       *
       * @return void
       */
      public function readMetaKeyWords()
      {
          if (! $this->_html) {
              return;
          }

          $pattern = '/<meta\s+name\s*=\s*"keywords"\s+content\s*=\s*"([^>"]*)"?\s*\/?\s*>/is';
          if (preg_match($pattern, $this->_html, $match) !== 1) {
              return;
          }

          // Normalise every whitespace character (tabs, line breaks) to a plain space.
          $content = preg_replace('/\s/', ' ', mb_strtolower($match[1], $this->_encoding));

          $keywords = array();
          foreach (explode(',', (string) $content) as $keyword) {
              $keyword = trim($keyword);
              if ($keyword !== '') {
                  $keywords[] = $keyword;
              }
          }

          $this->_keyWords = array_values(array_unique($keywords));
      }

      /**
       * Replaces every HTML tag with a space and collapses runs of spaces.
       *
       * @param mixed $string text that may contain markup
       *
       * @return string the text without tags, trimmed
       */
      private function rip_tags($string)
      {
          $withoutTags = preg_replace('/<[^>]*>/', ' ', (string) $string);
          if ($withoutTags === null) {
              // PCRE failure: fall back to the unmodified input rather than losing the text.
              $withoutTags = (string) $string;
          }

          return trim((string) preg_replace('/ {2,}/', ' ', $withoutTags));
      }

      /**
       * Removes everything matching $pattern from the loaded document.
       *
       * @param string $pattern PCRE pattern including delimiters and modifiers
       *
       * @return void
       */
      private function stripPattern($pattern)
      {
          $stripped = preg_replace($pattern, ' ', $this->_html);
          // preg_replace() returns null on a PCRE error (for example the
          // backtrack limit on a very large page); keep the text in that case.
          if ($stripped !== null) {
              $this->_html = $stripped;
          }
      }

      /**
       * Loads the stop-word list and the category boost dictionary.
       *
       * The files are looked up as "stopwords_<lang>.php" and
       * "glodic_<catego>_<lang>.php" and are expected to assign $stopWords and
       * $glodic respectively. A missing file yields an empty list. Running the
       * includes inside this helper keeps variables defined by those files
       * away from the caller's local scope.
       *
       * NOTE(review): assumes both files define flat lists of lower-case
       * strings - confirm against the actual dictionary files.
       *
       * @return array two-element array: stop words, boosted words
       */
      private function loadDictionaries()
      {
          $stopWords = array();
          $glodic = array();

          // Both values are public properties that end up in an include path,
          // so only plain identifiers are accepted.
          $safeName = '/^[A-Za-z0-9_-]+$/';
          if (preg_match($safeName, (string) $this->_lang) === 1) {
              $stopFile = stream_resolve_include_path('stopwords_' . $this->_lang . '.php');
              if ($stopFile !== false) {
                  include $stopFile;
              }

              if (preg_match($safeName, (string) $this->_catego) === 1) {
                  $boostFile = stream_resolve_include_path('glodic_' . $this->_catego . '_' . $this->_lang . '.php');
                  if ($boostFile !== false) {
                      include $boostFile;
                  }
              }
          }

          return array(
              is_array($stopWords) ? $stopWords : array(),
              is_array($glodic) ? $glodic : array(),
          );
      }

      /**
       * Reads keywords from the visible text of the loaded document.
       *
       * Keywords collected earlier (for example from the meta tag) are
       * appended to the text so that they take part in the frequency count,
       * and the keyword list is then rebuilt from scratch. The loaded document
       * is replaced by its lower-cased plain text as a side effect.
       *
       * @return void
       */
      public function readHtmlKeyWords()
      {
          if (! $this->_html) {
              return;
          }

          if (! empty($this->_keyWords)) {
              $this->_html .= ' ' . implode(' ', $this->_keyWords);
              $this->_keyWords = array();
          }

          $this->_html = str_replace('&nbsp;', ' ', $this->_html);

          // Drop elements whose content is never visible text.
          $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
          foreach ($toRemove as $remove) {
              // \b keeps "head" from matching "<header>" and "embed" from matching "<noembed>".
              $this->stripPattern('/<' . $remove . '\b[^>]*>.*?<\/' . $remove . '\s*>/is');
          }

          // Drop HTML comments.
          $this->stripPattern('/<!--.*?-->/s');

          // Strip the remaining tags, lower-case, then decode entities exactly once.
          $text = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);
          $this->_html = html_entity_decode($text, ENT_QUOTES, $this->_encoding);

          // Break into words on whitespace and common punctuation.
          $words = preg_split('/[\s.,:;!?|]+/', $this->_html, -1, PREG_SPLIT_NO_EMPTY);
          if (empty($words)) {
              return;
          }

          $frequency = array_count_values($words);
          list($stopWords, $glodic) = $this->loadDictionaries();

          // Remove stop words and tokens made up of punctuation only.
          $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';
          foreach (array_keys($frequency) as $key) {
              // Numeric words come back as integer keys; compare them as strings.
              $word = (string) $key;
              if (in_array($word, $stopWords, true) || strspn($word, $punct) === strlen($word)) {
                  unset($frequency[$key]);
              }
          }

          // Everything was filtered out: nothing to rank, and the sizing
          // arithmetic below would divide by zero.
          if (empty($frequency)) {
              return;
          }

          // Size of the result list, derived from the highest frequency, the
          // number of distinct words and _keyCount.
          $max = max($frequency);
          $distinct = count($frequency);
          $tot = round(($max * 100) / $distinct);
          $tot2 = round(($this->_keyCount * 100) / $distinct);
          if ($tot > $distinct) {
              $tot = $tot / 2;
          }
          // NOTE(review): this halves $tot a second time; $tot2 may have been
          // intended. Left as is because the intended heuristic is not documented.
          if ($tot2 > $distinct) {
              $tot = $tot / 2;
          }
          $showmax = round(($tot + $tot2) / 2);

          // Words from the category dictionary get a bonus so they rank first.
          foreach (array_keys($frequency) as $key) {
              if (in_array((string) $key, $glodic, true)) {
                  $frequency[$key] = $frequency[$key] + $showmax;
              }
          }

          arsort($frequency, SORT_NUMERIC);

          // A computed size of 0 must not turn into "no limit".
          $limit = max(1, (int) $showmax);
          $added = 0;
          foreach (array_keys($frequency) as $key) {
              $word = (string) $key;
              if (is_numeric($word) || in_array($word, $this->_keyWords, true)) {
                  continue;
              }

              $this->_keyWords[] = $word;
              $added++;
              if ($added >= $limit) {
                  break;
              }
          }
      }

      /**
       * Changes the encoding used for text processing.
       *
       * @param mixed $enc encoding name; a falsy value keeps the current one
       *
       * @return void
       */
      private function encoding($enc = FALSE)
      {
          if ($enc) {
              $this->_encoding = $enc;
          }
      }

      /**
       * Loads the document from a file path or URL.
       *
       * On failure the loaded document becomes FALSE, which makes the read
       * methods do nothing.
       *
       * @param mixed $fileUrl path or URL; a falsy value is ignored
       *
       * @return void
       */
      public function file($fileUrl = FALSE)
      {
          if ($fileUrl) {
              // Failure is reported through the FALSE result, so the warning
              // is deliberately suppressed here.
              $this->_html = @file_get_contents($fileUrl);
              $this->_url = $fileUrl;
          }
      }

      /**
       * Uses the given string as the document.
       *
       * @param mixed $page HTML source; a falsy value is ignored
       *
       * @return void
       */
      public function html($page = FALSE)
      {
          if ($page) {
              $this->_html = $page;
          }
      }

      /**
       * Reads the meta keywords and then the body keywords, without duplicates.
       *
       * @return void
       */
      public function readAll()
      {
          if ($this->_html !== FALSE) {
              $this->readMetaKeyWords();
              $this->readHtmlKeyWords();
          }

          $this->_keyWords = array_values(array_unique($this->_keyWords));
      }

      /**
       * Returns the collected keywords.
       *
       * @return array list of keyword strings
       */
      public function get()
      {
          return $this->_keyWords;
      }
  }
复制代码


Stellungnahme:
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn