Home  >  Article  >  Backend Development  >  PHP汉字转拼音的两种方法+PHP提取汉字(中文)方法

PHP汉字转拼音的两种方法+PHP提取汉字(中文)方法

WBOY
WBOYOriginal
2016-06-23 13:47:25 1081 browse

方法一:根据ASCII码转换,GB2312库对多音字也无能为力。

GB2312标准共收录6763个汉字,不在范围内的汉字无法转换,如:中国前总理朱镕基的“镕”字。
GB2312中对所收汉字进行了“分区”处理,每区含有94个汉字/符号。这种表示方式也称为区位码。
01-09区为特殊符号。
16-55区为一级汉字,按拼音排序。(3755个)
56-87区为二级汉字,按部首/笔画排序。(3008个)
10-15区及88-94区则未有编码。
占用的码位是72*94=6768。其中有5个空位是D7FA-D7FE。所以实际共6763个汉字。其中一级汉字3755个,二级汉字3008个。而此类算法实际只能转换3755个汉字。
优点:没有使用大文字库,文件相对较小,没有使用正则表达式,性能相对较高。支持首字母转换。
缺点:没有在GB2312中收录的汉字无法转换,多音字无法识别。
(如果对拼音转换要求不高的朋友,建议使用这个。)

<?php // 此类是根据ASCII码转换,GB2312库对多音字也无能为力。// GB2312标准共收录6763个汉字,不在范围内的汉字是无法转换,如:中国前总理朱?基的“?”字。class pinyin{  public static function utf8_to($s, $isfirst = false) {    return self::to(self::utf8_to_gb2312($s), $isfirst);  }  public static function utf8_to_gb2312($s) {    return iconv('UTF-8', 'GB2312//IGNORE', $s);  }  // 字符串必须为GB2312编码  public static function to($s, $isfirst = false) {    $res = '';    $len = strlen($s);    $pinyin_arr = self::get_pinyin_array();    for($i=0; $i<$len; $i++) {      $ascii = ord($s{$i});      if($ascii > 0x80) {        $ascii2 = ord($s{++$i});        $ascii = $ascii * 256 + $ascii2 - 65536;      }      if($ascii  0) {        if(($ascii >= 48 && $ascii = 97 && $ascii = 65 && $ascii  -10247) {        $res .= '_';      }else{        foreach($pinyin_arr as $py=>$asc) {          if($asc  0xE0) {      $s = self::utf8_to_gb2312($s{0}.$s{1}.$s{2});    }elseif($ascii = 65 && $ascii = 97 && $ascii =-20319 && $asc=-20283 && $asc=-19775 && $asc=-19218 && $asc=-18710 && $asc=-18526 && $asc=-18239 && $asc=-17922 && $asc=-17417 && $asc=-16474 && $asc=-16212 && $asc=-15640 && $asc=-15165 && $asc=-14922 && $asc=-14914 && $asc=-14630 && $asc=-14149 && $asc=-14090 && $asc=-13318 && $asc=-12838 && $asc=-12556 && $asc=-11847 && $asc=-11055 && $asc

方法二:根据拼音组合数组检索


<?php
/**
 * Converts Chinese text to toneless pinyin using a GB2312 code-point table.
 *
 * Level-1 GB2312 characters (zones 16-55) are ordered by pronunciation, so
 * each syllable owns one contiguous range of codes. The table stores, for
 * every syllable, the first code of its range expressed as
 * (lead_byte * 256 + trail_byte - 65536). A character is resolved by finding
 * the last table entry whose code is less than or equal to the character's.
 *
 * Limitations: only level-1 characters are covered; characters outside the
 * table range are dropped, and polyphonic characters always yield the one
 * reading implied by their position.
 */
class pinyin{
    /** Ascending list of array(syllable, first GB2312 code - 65536). */
    private $d=array(
        array("a",-20319), array("ai",-20317), array("an",-20304), array("ang",-20295), array("ao",-20292),
        array("ba",-20283), array("bai",-20265), array("ban",-20257), array("bang",-20242), array("bao",-20230),
        array("bei",-20051), array("ben",-20036), array("beng",-20032), array("bi",-20026), array("bian",-20002),
        array("biao",-19990), array("bie",-19986), array("bin",-19982), array("bing",-19976), array("bo",-19805),
        array("bu",-19784), array("ca",-19775), array("cai",-19774), array("can",-19763), array("cang",-19756),
        array("cao",-19751), array("ce",-19746), array("ceng",-19741), array("cha",-19739), array("chai",-19728),
        array("chan",-19725), array("chang",-19715), array("chao",-19540), array("che",-19531), array("chen",-19525),
        array("cheng",-19515), array("chi",-19500), array("chong",-19484), array("chou",-19479), array("chu",-19467),
        array("chuai",-19289), array("chuan",-19288), array("chuang",-19281), array("chui",-19275), array("chun",-19270),
        array("chuo",-19263), array("ci",-19261), array("cong",-19249), array("cou",-19243), array("cu",-19242),
        array("cuan",-19238), array("cui",-19235), array("cun",-19227), array("cuo",-19224), array("da",-19218),
        array("dai",-19212), array("dan",-19038), array("dang",-19023), array("dao",-19018), array("de",-19006),
        array("deng",-19003), array("di",-18996), array("dian",-18977), array("diao",-18961), array("die",-18952),
        array("ding",-18783), array("diu",-18774), array("dong",-18773), array("dou",-18763), array("du",-18756),
        array("duan",-18741), array("dui",-18735), array("dun",-18731), array("duo",-18722), array("e",-18710),
        array("en",-18697), array("er",-18696), array("fa",-18526), array("fan",-18518), array("fang",-18501),
        array("fei",-18490), array("fen",-18478), array("feng",-18463), array("fo",-18448), array("fou",-18447),
        array("fu",-18446), array("ga",-18239), array("gai",-18237), array("gan",-18231), array("gang",-18220),
        array("gao",-18211), array("ge",-18201), array("gei",-18184), array("gen",-18183), array("geng",-18181),
        array("gong",-18012), array("gou",-17997), array("gu",-17988), array("gua",-17970), array("guai",-17964),
        array("guan",-17961), array("guang",-17950), array("gui",-17947), array("gun",-17931), array("guo",-17928),
        array("ha",-17922), array("hai",-17759), array("han",-17752), array("hang",-17733), array("hao",-17730),
        array("he",-17721), array("hei",-17703), array("hen",-17701), array("heng",-17697), array("hong",-17692),
        array("hou",-17683), array("hu",-17676), array("hua",-17496), array("huai",-17487), array("huan",-17482),
        array("huang",-17468), array("hui",-17454), array("hun",-17433), array("huo",-17427), array("ji",-17417),
        array("jia",-17202), array("jian",-17185), array("jiang",-16983), array("jiao",-16970), array("jie",-16942),
        array("jin",-16915), array("jing",-16733), array("jiong",-16708), array("jiu",-16706), array("ju",-16689),
        array("juan",-16664), array("jue",-16657), array("jun",-16647), array("ka",-16474), array("kai",-16470),
        array("kan",-16465), array("kang",-16459), array("kao",-16452), array("ke",-16448), array("ken",-16433),
        array("keng",-16429), array("kong",-16427), array("kou",-16423), array("ku",-16419), array("kua",-16412),
        array("kuai",-16407), array("kuan",-16403), array("kuang",-16401), array("kui",-16393), array("kun",-16220),
        array("kuo",-16216), array("la",-16212), array("lai",-16205), array("lan",-16202), array("lang",-16187),
        array("lao",-16180), array("le",-16171), array("lei",-16169), array("leng",-16158), array("li",-16155),
        array("lia",-15959), array("lian",-15958), array("liang",-15944), array("liao",-15933), array("lie",-15920),
        array("lin",-15915), array("ling",-15903), array("liu",-15889), array("long",-15878), array("lou",-15707),
        array("lu",-15701), array("lv",-15681), array("luan",-15667), array("lue",-15661), array("lun",-15659),
        array("luo",-15652), array("ma",-15640), array("mai",-15631), array("man",-15625), array("mang",-15454),
        array("mao",-15448), array("me",-15436), array("mei",-15435), array("men",-15419), array("meng",-15416),
        array("mi",-15408), array("mian",-15394), array("miao",-15385), array("mie",-15377), array("min",-15375),
        array("ming",-15369), array("miu",-15363), array("mo",-15362), array("mou",-15183), array("mu",-15180),
        array("na",-15165), array("nai",-15158), array("nan",-15153), array("nang",-15150), array("nao",-15149),
        array("ne",-15144), array("nei",-15143), array("nen",-15141), array("neng",-15140), array("ni",-15139),
        array("nian",-15128), array("niang",-15121), array("niao",-15119), array("nie",-15117), array("nin",-15110),
        array("ning",-15109), array("niu",-14941), array("nong",-14937), array("nu",-14933), array("nv",-14930),
        array("nuan",-14929), array("nue",-14928), array("nuo",-14926), array("o",-14922), array("ou",-14921),
        array("pa",-14914), array("pai",-14908), array("pan",-14902), array("pang",-14894), array("pao",-14889),
        array("pei",-14882), array("pen",-14873), array("peng",-14871), array("pi",-14857), array("pian",-14678),
        array("piao",-14674), array("pie",-14670), array("pin",-14668), array("ping",-14663), array("po",-14654),
        array("pu",-14645), array("qi",-14630), array("qia",-14594), array("qian",-14429), array("qiang",-14407),
        array("qiao",-14399), array("qie",-14384), array("qin",-14379), array("qing",-14368), array("qiong",-14355),
        array("qiu",-14353), array("qu",-14345), array("quan",-14170), array("que",-14159), array("qun",-14151),
        array("ran",-14149), array("rang",-14145), array("rao",-14140), array("re",-14137), array("ren",-14135),
        array("reng",-14125), array("ri",-14123), array("rong",-14122), array("rou",-14112), array("ru",-14109),
        array("ruan",-14099), array("rui",-14097), array("run",-14094), array("ruo",-14092), array("sa",-14090),
        array("sai",-14087), array("san",-14083), array("sang",-13917), array("sao",-13914), array("se",-13910),
        array("sen",-13907), array("seng",-13906), array("sha",-13905), array("shai",-13896), array("shan",-13894),
        array("shang",-13878), array("shao",-13870), array("she",-13859), array("shen",-13847), array("sheng",-13831),
        array("shi",-13658), array("shou",-13611), array("shu",-13601), array("shua",-13406), array("shuai",-13404),
        array("shuan",-13400), array("shuang",-13398), array("shui",-13395), array("shun",-13391), array("shuo",-13387),
        array("si",-13383), array("song",-13367), array("sou",-13359), array("su",-13356), array("suan",-13343),
        array("sui",-13340), array("sun",-13329), array("suo",-13326), array("ta",-13318), array("tai",-13147),
        array("tan",-13138), array("tang",-13120), array("tao",-13107), array("te",-13096), array("teng",-13095),
        array("ti",-13091), array("tian",-13076), array("tiao",-13068), array("tie",-13063), array("ting",-13060),
        array("tong",-12888), array("tou",-12875), array("tu",-12871), array("tuan",-12860), array("tui",-12858),
        array("tun",-12852), array("tuo",-12849), array("wa",-12838), array("wai",-12831), array("wan",-12829),
        array("wang",-12812), array("wei",-12802), array("wen",-12607), array("weng",-12597), array("wo",-12594),
        array("wu",-12585), array("xi",-12556), array("xia",-12359), array("xian",-12346), array("xiang",-12320),
        array("xiao",-12300), array("xie",-12120), array("xin",-12099), array("xing",-12089), array("xiong",-12074),
        array("xiu",-12067), array("xu",-12058), array("xuan",-12039), array("xue",-11867), array("xun",-11861),
        array("ya",-11847), array("yan",-11831), array("yang",-11798), array("yao",-11781), array("ye",-11604),
        array("yi",-11589), array("yin",-11536), array("ying",-11358), array("yo",-11340), array("yong",-11339),
        array("you",-11324), array("yu",-11303), array("yuan",-11097), array("yue",-11077), array("yun",-11067),
        array("za",-11055), array("zai",-11052), array("zan",-11045), array("zang",-11041), array("zao",-11038),
        array("ze",-11024), array("zei",-11020), array("zen",-11019), array("zeng",-11018), array("zha",-11014),
        array("zhai",-10838), array("zhan",-10832), array("zhang",-10815), array("zhao",-10800), array("zhe",-10790),
        array("zhen",-10780), array("zheng",-10764), array("zhi",-10587), array("zhong",-10544), array("zhou",-10533),
        array("zhu",-10519), array("zhua",-10331), array("zhuai",-10329), array("zhuan",-10328), array("zhuang",-10322),
        array("zhui",-10315), array("zhun",-10309), array("zhuo",-10307), array("zi",-10296), array("zong",-10281),
        array("zou",-10274), array("zu",-10270), array("zuan",-10262), array("zui",-10260), array("zun",-10256),
        array("zuo",-10254)
    );

    /**
     * Convert a string to pinyin.
     *
     * @param string $str     Text to convert.
     * @param string $charset Encoding of $str; anything other than "gb2312"
     *                        is converted to GB2312 first and the result is
     *                        converted back afterwards.
     * @return string|false   Pinyin text (single-byte characters below 160
     *                        are passed through unchanged), or whatever the
     *                        charset conversion returned if it failed.
     */
    public function get_pinyin($str,$charset="utf-8"){
        if($charset!="gb2312"){
            $str=$this->set_char($str,$charset,"gb2312");
            $str=$this->c($str);
            $str=$this->set_char($str,"gb2312",$charset);
        }else{
            $str=$this->c($str);
        }
        return $str;
    }

    /**
     * Re-encode $str from $charset to $charset_out with iconv, falling back
     * to mbstring; the input is returned untouched when neither is loaded.
     */
    private function set_char($str,$charset="utf-8",$charset_out="gb2312"){
        if(function_exists('iconv')){
            $str=iconv($charset,$charset_out,$str);
        }elseif(function_exists("mb_convert_encoding")){
            $str=mb_convert_encoding($str,$charset_out,$charset);
        }
        return $str;
    }

    /**
     * Map one decoded code to its output text.
     *
     * @param int $num Single-byte value (1-159) or a double-byte code
     *                 expressed as lead * 256 + trail - 65536.
     * @return string  The byte itself, a pinyin syllable, or "" when the
     *                 code lies outside the table's range.
     */
    private function g($num){
        if($num>0 && $num<160){
            // Single-byte character: emit as-is.
            return chr($num);
        }
        if($num<-20319 || $num>-10247){
            // Symbols, level-2 characters and anything else not in the table.
            return "";
        }
        // Walk backwards to the last syllable whose first code is <= $num.
        // $num >= -20319 here, so the scan always stops at index 0 at the latest.
        for($i=count($this->d)-1;$i>=0;$i--){
            if($this->d[$i][1]<=$num){
                break;
            }
        }
        return $this->d[$i][0];
    }

    /**
     * Walk a GB2312 byte string, pairing each lead byte (> 160) with the
     * following byte, and concatenate the per-character output of g().
     */
    private function c($str){
        $str=(string)$str; // a failed charset conversion hands us false
        $len=strlen($str);
        $ret="";
        for($i=0;$i<$len;$i++){
            $p=ord($str[$i]);
            if($p>160){
                // Double-byte character: consume the trail byte too. A lead
                // byte at the very end of the input is paired with 0.
                $i++;
                $q=$i<$len ? ord($str[$i]) : 0;
                $p=$p*256+$q-65536;
            }
            $ret.=$this->g($p);
        }
        return $ret;
    }
}

/**
 * Procedural shortcut for pinyin::get_pinyin().
 *
 * @param string $str     Text to convert.
 * @param string $charset Encoding of $str (default "utf-8").
 * @return string|false
 */
function get_pinyin($str,$charset="utf-8"){
    $pinyin=new pinyin();
    return $pinyin->get_pinyin($str,$charset);
}

PHP取得汉字(中文): 


/**
 * Extract all Chinese characters from a string.
 *
 * Every run of Chinese characters found anywhere in $str is collected and
 * the runs are joined together. Both charsets now behave the same way: the
 * GB2312 pattern used to be anchored with "^", so it only ever returned a
 * leading run and failed when the text started with anything else.
 *
 * @param string $str     Text to scan.
 * @param string $charset 'utf8' (default) or 'gb2312'.
 * @return string|false   Concatenated Chinese characters, or false when
 *                        nothing matches or the charset is not supported.
 */
function getChinese($str, $charset = 'utf8')
{
    $patterns = array(
        // GB2312 double-byte characters use bytes 0xA1-0xFF only.
        'gb2312' => '/[\xa1-\xff]+/',
        // Code points U+4E00-U+9FA5.
        'utf8'   => '/[\x{4e00}-\x{9fa5}]+/u',
    );

    if (!is_string($charset) || !isset($patterns[$charset])) {
        return false;
    }

    // preg_match_all() yields 0 for "no match" and false on error (for
    // example invalid UTF-8 under /u); both are reported as false.
    if (!preg_match_all($patterns[$charset], $str, $match)) {
        return false;
    }

    return implode('', $match[0]);
}

PHP取得汉字拼音字母数组标点:

<?php
/**
 * Extract the "non-special" characters from a string: Chinese characters,
 * ASCII letters and digits, underscore, hyphen, and common punctuation
 * (comma, full stop, semicolon, colon, book-title marks, question mark,
 * percent sign, exclamation mark, tilde).
 *
 * Every matching run anywhere in $str is collected and the runs are joined.
 * The GB2312 pattern is no longer anchored with "^", so it behaves like the
 * UTF-8 one instead of returning only a leading run.
 *
 * @param string $str     Text to scan.
 * @param string $charset 'utf8' (default) or 'gb2312'.
 * @return string|false   Concatenated allowed characters, or false when
 *                        nothing matches or the charset is not supported.
 */
function getChinese($str, $charset = 'utf8')
{
    $patterns = array(
        // Bytes 0xA1-0xFF already cover every GB2312 double-byte character,
        // full-width punctuation included, so only the ASCII punctuation is
        // listed. Keeping the pattern pure ASCII also makes it independent
        // of the encoding this source file is saved in.
        'gb2312' => '/[\xa1-\xffA-Za-z0-9_\-,.;:?%!~]+/',
        // Full-width punctuation is written as code-point escapes:
        //   U+3002 ideographic full stop    U+FF0C full-width comma
        //   U+FF1B full-width semicolon     U+FF1A full-width colon
        //   U+300A / U+300B book-title marks
        //   U+FF1F full-width question mark U+FF05 full-width percent
        //   U+FF01 full-width exclamation   U+FF5E full-width tilde
        'utf8'   => '/[\x{4e00}-\x{9fa5}A-Za-z0-9_\-,.;:?%!~'
                  . '\x{3002}\x{ff0c}\x{ff1b}\x{ff1a}\x{300a}\x{300b}'
                  . '\x{ff1f}\x{ff05}\x{ff01}\x{ff5e}]+/u',
    );

    if (!is_string($charset) || !isset($patterns[$charset])) {
        return false;
    }

    // preg_match_all() yields 0 for "no match" and false on error (for
    // example invalid UTF-8 under /u); both are reported as false.
    if (!preg_match_all($patterns[$charset], $str, $match)) {
        return false;
    }

    return implode('', $match[0]);
}



Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn