홈 > 백엔드 개발 > PHP 튜토리얼 > RMM 단어 분할 알고리즘 클래스

RMM 단어 분할 알고리즘 클래스

작성자: WBOY | 원본 | 2016-07-25 08:47:54 | 조회수 1004

RMM 단어 분할 알고리즘 클래스

/**
 * Reverse Maximum Matching (RMM) word segmenter.
 *
 * The class works on raw bytes and pairs every byte >= 0x81 with the byte
 * that follows it, i.e. it treats the input as a two-byte-per-character
 * encoding. NOTE(review): the method name NotGBK and the 0xA13F-0xAA40 symbol
 * range suggest GBK input is expected; UTF-8 multi-byte text would be paired
 * two bytes at a time as well - confirm the encoding used by callers.
 *
 * Typical use: construct (loads ppldic.csv next to this file), then call
 * SplitRMM($text) to obtain the segmented string.
 */
class SplitWord
{
    /** Dictionary word => second space-separated field of its dictionary line. */
    public $TagDic = array();

    /** [byte length of word][word] => third field of its dictionary line ('' when absent). */
    public $RankDic = array();

    /** Pre-processed text currently being segmented. */
    public $SourceStr = '';

    /** Accumulated segmentation result. */
    public $ResultStr = '';

    /** Separator inserted between tokens. */
    public $SplitChar = ' ';

    /** Double-byte runs of at most this many bytes are not passed to RunRMM. */
    public $SplitLen = 4;

    /** Longest dictionary word, expressed as the maximum BYTE INDEX (words are up to MaxLen + 1 bytes). */
    public $MaxLen = 7;

    /** Shortest multi-character word, expressed as the maximum BYTE INDEX (MinLen + 1 bytes). */
    public $MinLen = 3;

    /** Never assigned by this class; declared so that Clear() is always safe to call. */
    public $QuickDic = null;

    /**
     * Legacy PHP 4 style constructor name, kept so existing callers that invoke
     * it explicitly keep working. It simply delegates to __construct().
     */
    public function SplitWord()
    {
        $this->__construct();
    }

    /**
     * Loads the dictionary file "ppldic.csv" from this file's directory.
     *
     * Each line is split on single spaces: field 0 is the word, field 1 goes to
     * TagDic and field 2 to RankDic. Missing fields are stored as '' (not null)
     * so that the isset() lookup in IsWord() still recognises the word.
     * If the file cannot be read a warning is raised and the dictionary stays empty.
     */
    public function __construct()
    {
        $dicfile = dirname(__FILE__) . "/ppldic.csv";
        if (!is_file($dicfile) || !is_readable($dicfile)) {
            trigger_error("SplitWord: cannot read dictionary file " . $dicfile, E_USER_WARNING);
            return;
        }
        $fp = fopen($dicfile, 'r');
        if ($fp === false) {
            return;
        }
        // Compare against false explicitly so a falsy line (e.g. "0") does not end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            // Strip the line terminator so it does not become part of the last field.
            $ws = explode(' ', rtrim($line, "\r\n"));
            if ($ws[0] === '') {
                continue;
            }
            $this->TagDic[$ws[0]] = isset($ws[1]) ? $ws[1] : '';
            $this->RankDic[strlen($ws[0])][$ws[0]] = isset($ws[2]) ? $ws[2] : '';
        }
        fclose($fp);
    }

    /**
     * Releases the QuickDic handle if one was attached to the instance.
     */
    public function Clear()
    {
        // Guarded instead of "@fclose": fclose() on a non-resource throws on PHP 8
        // and the error-suppression operator does not hide that.
        if (isset($this->QuickDic) && is_resource($this->QuickDic)) {
            fclose($this->QuickDic);
        }
        $this->QuickDic = null;
    }

    /**
     * Sets the text to segment (after pre-processing it with UpdateStr) and
     * resets the accumulated result.
     *
     * @param string $str raw input text
     */
    public function SetSource($str)
    {
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    /**
     * Tells whether a token starts with a single-byte character.
     *
     * @param string $str token to inspect
     * @return bool|string "" for an empty token, false when the first byte is
     *                     above 0x80 (double-byte lead byte), true otherwise
     */
    public function NotGBK($str)
    {
        if ($str == "") {
            return "";
        }
        return !(ord($str[0]) > 0x80);
    }

    /**
     * Segments a string with the RMM algorithm.
     *
     * Tokens are processed from last to first and prepended to ResultStr, each
     * followed by SplitChar. Double-byte runs are expanded through RunRMM().
     *
     * @param string $str text to segment; when empty the current SourceStr is used
     * @return string the accumulated ResultStr, or "" when there is nothing to segment
     */
    public function SplitRMM($str = "")
    {
        if ($str != "") {
            $this->SetSource($str);
        }
        if ($this->SourceStr == "") {
            return "";
        }
        $this->SourceStr = $this->UpdateStr($this->SourceStr);
        $spwords = explode(" ", $this->SourceStr);
        $spc = $this->SplitChar;
        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $word = $spwords[$i];
            if ($word == "") {
                continue;
            }
            if ($this->NotGBK($word)) {
                // Keep single-byte tokens that contain anything besides digits, '.', '+' and '-'.
                if (preg_match('/[^0-9.+\-]/', $word)) {
                    $this->ResultStr = $word . $spc . $this->ResultStr;
                }
                // NOTE(review): purely numeric tokens are dropped here, as in the
                // original code (its else-branch only computed an unused value).
                // Confirm whether numbers should be kept in the result.
            } elseif (strlen($word) > $this->SplitLen) {
                $this->ResultStr = $this->RunRMM($word) . $spc . $this->ResultStr;
            }
            // NOTE(review): double-byte runs of SplitLen bytes or fewer are dropped,
            // as in the original (empty branch). Confirm this is intended.
        }
        return $this->ResultStr;
    }

    /**
     * Splits a pure double-byte string by reverse maximum matching.
     *
     * Starting from the end of the string, the longest dictionary word ending at
     * the current position is taken (candidates shrink by one 2-byte character
     * at a time). When nothing matches, the last 2-byte character is emitted on
     * its own so the scan always advances.
     *
     * @param string $str double-byte text
     * @return string the words in reading order, formatted by otherword()
     */
    public function RunRMM($str)
    {
        $WordArray = array();
        for ($i = strlen($str) - 1; $i >= 0;) {
            // Only MinLen + 1 bytes (or fewer) remain: resolve the head directly and stop.
            if ($i <= $this->MinLen) {
                if ($i == 1) {
                    $WordArray[] = substr($str, 0, 2);
                } else {
                    $w = substr($str, 0, $this->MinLen + 1);
                    if ($this->IsWord($w)) {
                        $WordArray[] = $w;
                    } else {
                        $WordArray[] = substr($str, 2, 2);
                        $WordArray[] = substr($str, 0, 2);
                    }
                }
                break;
            }

            $maxPos = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
            $isMatch = false;
            for ($j = $maxPos; $j >= 0; $j = $j - 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->IsWord($w)) {
                    $WordArray[] = $w;
                    $i = $i - $j - 1;
                    $isMatch = true;
                    break;
                }
            }
            if (!$isMatch) {
                // No dictionary word ends here: emit the trailing 2-byte character
                // and step back. Without this the loop never terminated.
                $start = ($i > 0) ? $i - 1 : 0;
                $WordArray[] = substr($str, $start, $i - $start + 1);
                $i -= 2;
            }
        }
        return $this->otherword($WordArray);
    }

    /**
     * Joins words collected back-to-front into a string in reading order.
     *
     * Every word is emitted as SplitChar . word . "," and the leading SplitChar
     * of the whole string is then replaced by ",", e.g. [B, A] => ",A, B,".
     *
     * @param array $WordArray words in reverse reading order
     * @return string formatted word list ("" for an empty array)
     */
    public function otherword($WordArray)
    {
        $spc = $this->SplitChar;
        $rsStr = "";
        for ($i = count($WordArray) - 1; $i >= 0; $i--) {
            $rsStr .= $spc . $WordArray[$i] . ",";
        }
        // preg_quote keeps a separator such as "." or "/" from being read as regex syntax.
        $rsStr = preg_replace("/^" . preg_quote($spc, '/') . "/", ",", $rsStr);
        return $rsStr;
    }

    /**
     * Checks whether a word exists in the loaded dictionary.
     *
     * @param string $okWord candidate word
     * @return bool true when RankDic has an entry for the word
     */
    public function IsWord($okWord)
    {
        $slen = strlen($okWord);
        // MaxLen is a byte INDEX, so the longest valid word has MaxLen + 1 bytes.
        if ($slen > $this->MaxLen + 1) {
            return false;
        }
        return isset($this->RankDic[$slen][$okWord]);
    }

    /**
     * Pre-processes text: inserts SplitChar at the boundaries between
     * whitespace, single-byte word characters, double-byte characters and symbols.
     *
     * @param string $str raw text
     * @return string text with SplitChar inserted between character classes
     */
    public function UpdateStr($str)
    {
        $str = (string) $str;
        $spc = $this->SplitChar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }
        $okstr = '';
        // Class of the previous character: 0 whitespace, 1 single-byte word char,
        // 2 double-byte char, 3 symbol.
        $prechar = 0;
        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            if (ord($ch) < 0x81) {
                if (ord($ch) < 33) {
                    // Whitespace/control byte: becomes one separator, except CR and LF
                    // which are removed without inserting a separator.
                    if ($prechar != 0 && $ch !== "\r" && $ch !== "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                    continue;
                } elseif (preg_match('/[^0-9a-zA-Z@.%#:\&_-]/', $ch)) {
                    // Single-byte symbol: isolate it from whatever came before.
                    $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                    $prechar = 3;
                } elseif ($prechar == 2 || $prechar == 3) {
                    // Word character right after a double-byte char or a symbol.
                    $okstr .= $spc . $ch;
                    $prechar = 1;
                } else {
                    $okstr .= $ch;
                    // NOTE(review): this pattern is the literal sequence "@#%:" and can
                    // never match a single character, so $prechar is always 1 here.
                    // A character class may have been intended - confirm before changing.
                    $prechar = preg_match("/@#%:/", $ch) ? 3 : 1;
                }
            } else {
                // Separate from a preceding single-byte word char or symbol.
                if ($prechar != 0 && $prechar != 2) {
                    $okstr .= $spc;
                }
                if (isset($str[$i + 1])) {
                    $c = $ch . $str[$i + 1];
                    $n = hexdec(bin2hex($c));
                    // Codes strictly between 0xA13F and 0xAA40 are treated as symbols.
                    // The original test was "< 0xA13F && > 0xAA40", which is never true.
                    if ($n > 0xA13F && $n < 0xAA40) {
                        $okstr .= ($prechar != 0) ? $spc . $c : $c;
                        $prechar = 3;
                    } else {
                        $okstr .= $c;
                        $prechar = 2;
                    }
                    // Skip the trail byte that was just consumed.
                    $i++;
                }
            }
        }
        return $okstr;
    }
}
// Usage example: build the segmenter (its constructor loads ppldic.csv from this file's directory) and print the segmented text.
$split=new SplitWord();
echo $split ->SplitRMM("php 검색 기술");
// Per the original author, each ppldic.csv line has the form: word, space, number, newline.
// NOTE(review): the constructor also reads a third space-separated field ($ws[2]) - confirm the actual column layout.

코드 복사

성명：

이전 기사: 프로그래머가 코딩 인터뷰에 성공하기 위한 8가지 팁 | 다음 기사: 프로그래머가 코딩 인터뷰에 성공하기 위한 8가지 팁

RMM 단어 분할 알고리즘 클래스

관련 기사