The code is as follows:
/**
 * Unary (one-character-per-token) word segmentation for UTF-8 text.
 *
 * Every multi-byte character becomes a token of its own, while each run of
 * consecutive single-byte, non-blank characters (letters, digits and any
 * symbol that is not filtered out) is kept together as one token.
 *
 * Character width is derived from the first byte of each UTF-8 sequence:
 *   - first byte <= 0xC0 (192): handled as a single byte;
 *   - first byte  < 0xE0 (224): 2-byte sequence;
 *   - first byte  < 0xF0 (240): 3-byte sequence;
 *   - otherwise:                4-byte sequence.
 *
 * Processing steps: HTML tags are stripped, the text is passed through
 * alab_num(), the listed punctuation is replaced by spaces, and the remainder
 * is tokenised. For every ASCII digit met, the matching Chinese numeral is
 * collected as an "annex" value, and vice versa; annex values are appended
 * after the tokens when $merge is true.
 *
 * MySQL notes: unary segmentation needs ft_min_word_len=1 in MySQL's my.ini.
 * Use the query  show variables like '%ft%'  to inspect the full-text
 * search settings.
 *
 * NOTE(review): alab_num() is not defined in this block; it presumably
 * normalises digits (e.g. full-width to ASCII) - confirm against its definition.
 *
 * @access global
 * @param string  $str    Text to segment (may contain HTML tags).
 * @param boolean $unique Whether to remove duplicate values.
 * @param boolean $merge  Whether to append the annex (digit/numeral) values.
 * @return array          List of string tokens, indexed from 0; empty when
 *                        nothing is left after stripping and filtering.
 */
function seg_word($str, $unique = false, $merge = true)
{
    $str = trim(strip_tags((string) $str));
    if ($str === '') {
        return array();
    }

    // Characters treated as token separators; add more as needed.
    // Tab, CR and LF are not listed because the tokeniser below already
    // treats every byte below 33 as blank.
    $separators = array(
        // ASCII punctuation
        ',', '/', '\\', '.', ';', ':', '\'', '!', '~', '"', '`', '^', '(', ')',
        '?', '-', '<', '>', '$', '&', '%', '#', '@', '+', '=', '{', '}', '[', ']',
        // CJK / full-width punctuation.
        // NOTE(review): apart from the dash and the two book-title marks, these
        // reached us as ASCII look-alikes (duplicated entries) after a lossy
        // transcoding and were restored by hand - confirm against the original list.
        ')', '(', '.', '。', ',', '!', ';', ':', '“', '”', '‘', '’',
        '[', ']', '、', '—', '《', '》', '-', '…', '【', '】',
        "\xE3\x80\x80", // U+3000 ideographic space
    );

    // ASCII digit <-> Chinese numeral, used to build the annex values.
    $digitToNumeral = array(
        '0' => '零', '1' => '一', '2' => '二', '3' => '三', '4' => '四',
        '5' => '五', '6' => '六', '7' => '七', '8' => '八', '9' => '九',
    );
    $numeralToDigit = array(
        '零' => '0', '一' => '1', '二' => '2', '三' => '3', '四' => '4',
        '五' => '5', '六' => '6', '七' => '7', '八' => '8', '九' => '9',
    );

    $str = alab_num($str);
    $str = str_replace($separators, ' ', $str);

    $tokens = array();     // segmented tokens, in input order
    $annex = array();      // digit/numeral counterparts collected on the way
    $inByteRun = false;    // true while the last token is an open single-byte run
    $length = strlen($str);
    $i = 0;

    // Bounded by the byte length, so the loop neither reads past the end of
    // the string nor stops early on an embedded NUL byte.
    while ($i < $length) {
        $byte = ord($str[$i]);

        // Single-byte character.
        if ($byte <= 0xC0) {
            $char = $str[$i];
            $i++;

            // Blank or control byte: closes the current run, emits nothing.
            if ($byte < 33) {
                $inByteRun = false;
                continue;
            }

            if (isset($digitToNumeral[$char])) {
                $annex[] = $digitToNumeral[$char];
            }

            if ($inByteRun) {
                $tokens[count($tokens) - 1] .= $char;
            } else {
                $tokens[] = $char;
            }
            $inByteRun = true;
            continue;
        }

        // Multi-byte character: always a token of its own.
        if ($byte < 0xE0) {
            $step = 2;
        } elseif ($byte < 0xF0) {
            $step = 3;
        } else {
            // 4-byte sequences must be consumed whole, otherwise a stray
            // continuation byte is left behind and ends up in the next token.
            $step = 4;
        }
        $char = substr($str, $i, $step);
        $i += $step;

        if (isset($numeralToDigit[$char])) {
            $annex[] = $numeralToDigit[$char];
        }

        $tokens[] = $char;
        $inByteRun = false;
    }

    if ($merge) {
        $tokens = array_merge($tokens, $annex);
    }

    // array_unique() keeps the original keys; re-index so callers always get a list.
    return $unique ? array_values(array_unique($tokens)) : $tokens;
}
Source: http://www.bkjia.com/PHPjc/320992.html (www.bkjia.com). Article summary: Unary word segmentation algorithm. In UTF-8, if the value of a character's first byte is not greater than 192, the character occupies only 1 byte; if it is greater than 192 and less than 224, it occupies 2 bytes...