Heim >Backend-Entwicklung >PHP-Tutorial >从请求的页面提取关键词

从请求的页面提取关键词

WBOY · Original: 2016-07-25 08:49:34 · 1144 Durchsuche

它可以根据给定的 URL 检索网页，并从中提取出一些关键词。

例如，从代码珠玑的首页可以提取出类似下面图片中的关键词。（图片：从请求的页面提取关键词）

// Demo entry point: reads the "url" request parameter, fetches that page and
// prints a numbered list of the keywords extracted from it.
// Example url: http://www.codepearl.com
if (!empty($_REQUEST['url']) && is_string($_REQUEST['url'])) {
    $url = trim($_REQUEST['url']);
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

    // The url is untrusted input and ends up in file_get_contents(); accept
    // only absolute http(s) addresses so local files and other stream
    // wrappers (file://, php://, ...) cannot be requested.
    if (filter_var($url, FILTER_VALIDATE_URL) === false
        || !in_array($scheme, array('http', 'https'), true)) {
        echo 'Invalid url: only http and https addresses are supported.';
    } else {
        include 'class.keywords.php';
        $keywords = new keywordsugest();
        $keywords->_lang = 'es';
        $keywords->_encoding = 'iso-8859-1';
        $keywords->_catego = 'telecom';
        $keywords->_keyCount = 100; // treated like a percentage by the ranking heuristic
        $keywords->file($url);
        // Alternatively run only one of the two passes:
        // $keywords->readMetaKeyWords();
        // $keywords->readHtmlKeyWords();
        $keywords->readAll();
        echo 'Keywords found :';
        $i = 1;
        foreach ($keywords->get() as $word) {
            // Keywords come from a remote page, so escape them before output.
            // NOTE(review): the separator here is a bare newline; a line-break
            // tag may have been lost from the published listing - confirm.
            echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES | ENT_SUBSTITUTE, $keywords->_encoding) . "\n";
        }
    }
}
// NOTE(review): this trailing output is only whitespace; it presumably held
// markup (e.g. the input form) that is not present in this listing - confirm.
echo "

";
?>

复制代码

/**
 * Suggests keywords for an HTML page.
 *
 * Load a page with file() or html(), run readMetaKeyWords(),
 * readHtmlKeyWords() or readAll(), then collect the result with get().
 *
 * Two optional dictionary files are included by readHtmlKeyWords():
 *   stopwords_<lang>.php        - expected to define $stopWords (words to drop)
 *   glodic_<catego>_<lang>.php  - expected to define $glodic (words to boost)
 * $_lang and $_catego become part of an include path, so they must never be
 * filled from untrusted input.
 */
class keywordsugest{
    /** Page source as a string, or FALSE while nothing has been loaded. */
    public $_html = FALSE;
    /** Weight used by the ranking heuristic (the demo treats it like a percentage). */
    public $_keyCount = 5;
    /** Keywords collected so far. */
    public $_keyWords = array();
    /** Character encoding passed to the mb_* and entity-decoding functions. */
    public $_encoding = 'UTF-8';
    /** Language code used in the dictionary file names. */
    public $_lang = 'es';
    /** Category used in the glossary file name. */
    public $_catego = 'telecom';
    /** Location given to file(), kept for reference. */
    public $_url = '';

    /**
     * Reads the page's <meta name="keywords" content="..."> tag.
     *
     * When the tag is present, $_keyWords is replaced by its lower-cased,
     * comma-separated entries (duplicates removed). Entries are not trimmed.
     * Does nothing when no page is loaded or the tag is absent.
     */
    public function readMetaKeyWords() {
        if (! $this->_html) return;
        $found = preg_match('/<meta name="keywords" content="([^<>"]*)\"?[\s]*[\/]?[\s]*>/is', $this->_html, $match);
        if ($found === 1) {
            $list = mb_strtolower($match[1], $this->_encoding);
            // Turn every whitespace character (tabs, newlines) into a plain space.
            $list = preg_replace('/\s/i', ' ', $list);
            $this->_keyWords = array_unique(explode(',', $list));
        }
    }

    /**
     * Replaces every HTML tag with a space and collapses runs of spaces.
     *
     * Tabs and line breaks are left in place; the word splitter in
     * readHtmlKeyWords() treats them as separators anyway.
     *
     * @param mixed $string markup to clean
     * @return string text without tags, trimmed
     */
    private function rip_tags($string) {
        // ----- remove HTML TAGs -----
        $string = preg_replace('/<[^>]*>/', ' ', $string);
        // ----- remove multiple spaces -----
        $string = trim(preg_replace('/ {2,}/', ' ', $string));
        return $string;
    }

    /**
     * Extracts keywords from the visible text of the loaded page.
     *
     * Keywords already collected (e.g. from the meta tag) are appended to the
     * text so they take part in the frequency count, then $_keyWords is
     * rebuilt from the ranked words. Note that $_html is overwritten with the
     * cleaned, lower-cased text.
     */
    public function readHtmlKeyWords() {
        if (! $this->_html) return;
        if (!empty($this->_keyWords)) {
            $this->_html = $this->_html . " " . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }
        // html_entity_decode() would turn &nbsp; into a non-breaking space,
        // which the splitter below does not treat as a separator.
        $this->_html = str_replace('&nbsp;', ' ', $this->_html);
        # remove unneeded parts
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
        foreach ($toRemove as $remove) {
            // \b keeps "head" from also matching "<header ...>".
            $this->_html = preg_replace('/<' . $remove . '\b[^>]*>.*?<\/' . $remove . '\s*>/is', ' ', $this->_html);
        }
        # remove comments
        $this->_html = preg_replace('/<!--.*?-->/is', ' ', $this->_html);
        # delete html tags
        $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);
        $this->_html = htmlspecialchars_decode($this->_html);
        # decode encoded html entities
        $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);
        # break into words
        $words = preg_split("/[\s]+|[\t]+|[\.]+|[\,]+|[\:]+|[\;]+|[\!]+|[\?]+|[\|]+/s", $this->_html, -1, PREG_SPLIT_NO_EMPTY);
        if (! is_array($words) || ! count($words)) return;

        $frequency = array_count_values($words);
        unset($frequency['']);
        if (! count($frequency)) return;

        # delete stop words and interpunctions
        // Defaults keep the filters working when a dictionary file is missing
        // or does not define its list.
        $stopWords = array();
        $glodic = array();
        $dictFiles = array(
            'stopwords_' . $this->_lang . '.php',
            'glodic_' . $this->_catego . '_' . $this->_lang . '.php',
        );
        foreach ($dictFiles as $dictFile) {
            $dictPath = stream_resolve_include_path($dictFile);
            if ($dictPath !== false) include $dictPath;
        }
        $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';
        foreach (array_keys($frequency) as $word) {
            // Numeric words come back from array_keys() as integers.
            $text = (string) $word;
            if (in_array($text, $stopWords, true) || strspn($text, $punct) === strlen($text)) {
                unset($frequency[$word]);
            }
        }
        // Everything may have been filtered out; max() needs a non-empty array.
        if (! count($frequency)) return;

        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);
        if ($tot > $count) { $tot = $tot / 2; }
        // NOTE(review): this tests $tot2 but halves $tot again; $tot2 looks
        // like the intended target. Left unchanged because it affects how many
        // keywords are returned - confirm before changing.
        if ($tot2 > $count) { $tot = $tot / 2; }
        // $showmax is both the bonus for glossary words and the keyword limit.
        $showmax = round(($tot + $tot2) / 2);
        foreach (array_keys($frequency) as $word) {
            if (in_array((string) $word, $glodic, true)) {
                $frequency[$word] = $frequency[$word] + $showmax;
            }
        }
        # sort by frequency
        arsort($frequency, SORT_NUMERIC);
        # add them to keyword array
        $i = 0;
        foreach ($frequency as $word => $score) {
            if ( (! in_array($word, $this->_keyWords)) &&
                 (! is_numeric($word)) &&
                 (! empty($word)) ) {
                $this->_keyWords[] = (string) $word;
                $i++;
                if ($i == $showmax) break;
            }
        }
    }

    /**
     * Changes the encoding from the default UTF-8.
     *
     * @param mixed $enc encoding name; ignored when falsy
     */
    private function encoding($enc = FALSE) {
        if ($enc) $this->_encoding = $enc;
    }

    /**
     * Loads the page from a file or url.
     *
     * On a failed read $_html becomes FALSE and the read* methods do nothing.
     *
     * @param mixed $fileUrl path or url; ignored when falsy
     */
    public function file($fileUrl = FALSE) {
        if ($fileUrl) {
            // Best effort: the warning is suppressed and failure shows up as FALSE.
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Uses the given string as the page source.
     *
     * @param mixed $page html markup; ignored when falsy
     */
    public function html($page = FALSE) {
        if ($page) $this->_html = $page;
    }

    /**
     * Reads the meta keywords and then the body keywords, and removes
     * duplicates from the result.
     */
    public function readAll() {
        if ($this->_html !== FALSE) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }
        $this->_keyWords = array_unique($this->_keyWords);
    }

    /**
     * Returns the collected keywords.
     *
     * @return array
     */
    public function get() {
        return $this->_keyWords;
    }
}
?>

复制代码

Stellungnahme：

Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn

Vorheriger Artikel：基于RMM的简易中文分词 Nächster Artikel：一个简单的数据库连接和文本缓存综合类

In Verbindung stehende Artikel

Mehr sehen