它可以根据给定的URL抓取网页，并从中提取出一些关键词。
例如，从代码珠玑的首页可以提取出类似下图所示的关键词：
-
// Demo entry point: when a "url" request parameter is supplied, fetch that
// page and print the keywords suggested for it.
// Example url: http://www.codepearl.com
$url = (isset($_REQUEST['url']) && is_string($_REQUEST['url'])) ? trim($_REQUEST['url']) : '';

if (!empty($url)) {

    // The URL comes straight from the request and ends up in
    // file_get_contents(), so only plain web URLs are accepted; otherwise
    // local paths or php:// wrappers could be read through this script.
    // NOTE(review): http(s) URLs pointing at internal hosts are still
    // reachable - restrict further if this is deployed publicly.
    if (!preg_match('#^https?://#i', $url)) {

        echo 'Only http:// and https:// URLs are supported.';

    } else {

        require_once 'class.keywords.php';

        $keywords = new keywordsugest();
        $keywords->_lang = 'es';
        $keywords->_encoding = 'iso-8859-1';
        $keywords->_catego = 'telecom';
        $keywords->_keyCount = 100; // is like the percent %
        $keywords->file($url);

        // Alternatives to readAll(): run only one of the two extraction passes.
        #$keywords->readMetaKeyWords();
        #$keywords->readHtmlKeyWords();

        $keywords->readAll();

        echo 'Keywords found :';

        $i = 1;

        foreach ($keywords->get() as $word) {
            // The words originate from a remote page, so escape them before
            // writing them into the HTML response.
            // NOTE(review): the line-break markup was lost from the original
            // listing; "<br>" is assumed - confirm.
            echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, $keywords->_encoding) . "<br>\n";
        }
    }
}

// NOTE(review): this echoes an empty string; presumably some markup was lost
// from the original listing - confirm what was meant to be printed here.
echo "";
-
-
- ?>
复制代码
-
/**
 * Suggests keywords for an HTML page.
 *
 * Typical use: load a page with file() or html(), call readAll() (or one of
 * readMetaKeyWords() / readHtmlKeyWords()), then fetch the result with get().
 *
 * Body-text extraction relies on two include files that are not part of this
 * class: "stopwords_<lang>.php" and "glodic_<catego>_<lang>.php". They are
 * expected to define the arrays $stopWords and $glodic respectively.
 */
class keywordsugest
{
    /** Raw HTML to analyse; false until file() or html() supplies content. */
    public $_html = false;

    /** Weight used when computing how many body keywords are kept. */
    public $_keyCount = 5;

    /** Keywords collected so far. */
    public $_keyWords = array();

    /** Character encoding handed to the mb_* and entity-decoding functions. */
    public $_encoding = 'UTF-8';

    /** Language code used to build the stop-word / glossary file names. */
    public $_lang = 'es';

    /** Category used to build the glossary file name. */
    public $_catego = 'telecom';

    /** Source passed to file(), kept for reference. */
    public $_url = '';

    /**
     * Reads the comma-separated list of the page's
     * <meta name="keywords" content="..."> tag into the keyword list.
     *
     * Keywords are lower-cased, trimmed and de-duplicated. Nothing changes
     * when no HTML is loaded or the tag is absent.
     */
    public function readMetaKeyWords()
    {
        if (! $this->_html) {
            return;
        }

        $pattern = '/<meta[\s]+name[\s]*=[\s]*\"?keywords\"?[\s]+content[\s]*=[\s]*\"?([^>"]*)\"?[\s]*[\/]?[\s]*>/is';

        if (preg_match($pattern, $this->_html, $match) !== 1) {
            return;
        }

        // Line breaks and tabs inside the attribute count as plain spaces.
        $list = preg_replace('/\s/', ' ', mb_strtolower($match[1], $this->_encoding));

        $keywords = array();

        foreach (explode(',', (string) $list) as $keyword) {
            $keyword = trim($keyword);

            if ($keyword !== '') {
                $keywords[] = $keyword;
            }
        }

        $this->_keyWords = array_values(array_unique($keywords));
    }

    /**
     * preg_replace() that returns the subject unchanged when the regex engine
     * fails (for example on a backtrack-limit error) instead of returning null.
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private function safeReplace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        return ($result === null) ? $subject : $result;
    }

    /**
     * Replaces every HTML tag with a space and collapses runs of spaces.
     *
     * @param mixed $string
     * @return string
     */
    private function rip_tags($string)
    {
        // ----- remove HTML TAGs -----
        $string = $this->safeReplace('/<[^>]*>/', ' ', (string) $string);

        // ----- remove multiple spaces -----
        return trim($this->safeReplace('/ {2,}/', ' ', $string));
    }

    /**
     * Extracts keywords from the visible text of the loaded HTML.
     *
     * Keywords already collected (e.g. by readMetaKeyWords()) are appended to
     * the text so that they take part in the frequency count, and the keyword
     * list is then rebuilt from scratch. As a side effect $_html is replaced
     * by the lower-cased plain text of the page.
     */
    public function readHtmlKeyWords()
    {
        if (! $this->_html) {
            return;
        }

        $text = $this->_html;

        if (! empty($this->_keyWords)) {
            $text .= ' ' . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }

        $text = str_replace('&nbsp;', ' ', $text);

        # remove unneeded parts (whole elements, including their content)
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');

        foreach ($toRemove as $remove) {
            $tag = preg_quote($remove, '/');
            // \b keeps e.g. "head" from also matching "<header ...>".
            $text = $this->safeReplace('/<' . $tag . '\b[^>]*>.*?<\/' . $tag . '\s*>/is', ' ', $text);
        }

        # remove comments
        $text = $this->safeReplace('/<!--.*?-->/s', ' ', $text);

        # delete html tags
        $text = mb_strtolower($this->rip_tags($text), $this->_encoding);

        $text = htmlspecialchars_decode($text);

        # decode encoded html entities
        $text = html_entity_decode($text, ENT_COMPAT, $this->_encoding);

        $this->_html = $text;

        # break into words on whitespace and . , : ; ! ? |
        $words = preg_split('/[\s.,:;!?|]+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        if (empty($words)) {
            return;
        }

        $frequency = array_count_values($words);

        # delete stop words and interpunctions
        // Defaults keep the code below working when an include file is
        // missing or does not define the expected variable.
        // NOTE(review): the file names are built from $_lang / $_catego;
        // never fill those properties from untrusted input.
        $stopWords = array();
        $glodic = array();

        include('stopwords_'.$this->_lang.'.php');
        include('glodic_'.$this->_catego.'_'.$this->_lang.'.php');

        $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';

        foreach (array_keys($frequency) as $word) {
            // Numeric words come back from array_count_values() as int keys.
            $candidate = (string) $word;

            if (in_array($candidate, $stopWords, true) || strspn($candidate, $punct) === strlen($candidate)) {
                unset($frequency[$word]);
            }
        }

        // Every word may have been a stop word; max() must not see an empty array.
        if (empty($frequency)) {
            return;
        }

        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);

        if ($tot > $count) {
            $tot = $tot / 2;
        }

        // The original halved $tot a second time here although the condition
        // tests $tot2; treated as a copy-paste slip.
        if ($tot2 > $count) {
            $tot2 = $tot2 / 2;
        }

        $showmax = round(($tot + $tot2) / 2);

        // Words listed in the glossary are boosted so they sort to the top.
        foreach (array_keys($frequency) as $word) {
            if (in_array((string) $word, $glodic, true)) {
                $frequency[$word] = $frequency[$word] + $showmax;
            }
        }

        # sort by frequency
        arsort($frequency, SORT_NUMERIC);

        # add them to keyword array
        // Always keep at least one keyword; with a limit of 0 the original
        // equality test never fired and every word was returned.
        $limit = max(1, (int) $showmax);
        $added = 0;

        foreach ($frequency as $word => $hits) {
            $word = (string) $word;

            if ($word === '' || is_numeric($word)) {
                continue;
            }

            $this->_keyWords[] = $word;

            if (++$added >= $limit) {
                break;
            }
        }
    }

    /**
     * Changes the encoding from the default UTF-8.
     *
     * Private and not called inside this class; callers currently set
     * $_encoding directly.
     *
     * @param mixed $enc
     */
    private function encoding($enc = FALSE)
    {
        if ($enc) {
            $this->_encoding = $enc;
        }
    }

    /**
     * Loads the HTML to analyse from a file or URL.
     *
     * When the source cannot be read, $_html becomes false and the read*()
     * methods do nothing.
     *
     * @param mixed $fileUrl
     */
    public function file($fileUrl = FALSE)
    {
        if ($fileUrl) {
            // Best effort: "@" keeps fetch warnings out of the page output.
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Sets the HTML to analyse from a string.
     *
     * @param mixed $page
     */
    public function html($page = FALSE)
    {
        if ($page) {
            $this->_html = $page;
        }
    }

    /**
     * Reads the meta keywords and then the body keywords, and removes
     * duplicates from the resulting list.
     */
    public function readAll()
    {
        if ($this->_html !== FALSE) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }

        $this->_keyWords = array_unique($this->_keyWords);
    }

    /**
     * Returns the collected keywords.
     *
     * @return array
     */
    public function get()
    {
        return $this->_keyWords;
    }
}
-
- ?>
复制代码
|