Home > Article > Backend Development > PHP汉字转拼音的两种方法+PHP提取汉字(中文)方法
方法一:根据ASCII码转换 ,GB2312库对多音字也无能为力。
GB2312标准共收录6763个汉字,不在范围内的汉字是无法转换,如:中国前总理朱?基的“?”字。
GB2312中对所收汉字进行了“分区”处理,每区含有94个汉字/符号。这种表示方式也称为区位码。
01-09区为特殊符号。
16-55区为一级汉字,按拼音排序。(3755个)
56-87区为二级汉字,按部首/笔画排序。(3008个)
10-15区及88-94区则未有编码。
占用的码位是72*94=6768。其中有5个空位是D7FA-D7FE。所以实际共6763个汉字。其中一级汉字3755个,二级汉字3008个。而此类算法实际只能转换3755个汉字。
优点:没有使用大文字库,文件相对较小,没有使用正则表达式,性能相对较高。支持首字母转换。
缺点:没有在GB2312中收录的汉字无法转换,多音字无法识别。
(如果对拼音转换要求不高的朋友,建议使用这个。)
<?php // 此类是根据ASCII码转换,GB2312库对多音字也无能为力。// GB2312标准共收录6763个汉字,不在范围内的汉字是无法转换,如:中国前总理朱?基的“?”字。class pinyin{ public static function utf8_to($s, $isfirst = false) { return self::to(self::utf8_to_gb2312($s), $isfirst); } public static function utf8_to_gb2312($s) { return iconv('UTF-8', 'GB2312//IGNORE', $s); } // 字符串必须为GB2312编码 public static function to($s, $isfirst = false) { $res = ''; $len = strlen($s); $pinyin_arr = self::get_pinyin_array(); for($i=0; $i<$len; $i++) { $ascii = ord($s{$i}); if($ascii > 0x80) { $ascii2 = ord($s{++$i}); $ascii = $ascii * 256 + $ascii2 - 65536; } if($ascii 0) { if(($ascii >= 48 && $ascii = 97 && $ascii = 65 && $ascii -10247) { $res .= '_'; }else{ foreach($pinyin_arr as $py=>$asc) { if($asc 0xE0) { $s = self::utf8_to_gb2312($s{0}.$s{1}.$s{2}); }elseif($ascii = 65 && $ascii = 97 && $ascii =-20319 && $asc=-20283 && $asc=-19775 && $asc=-19218 && $asc=-18710 && $asc=-18526 && $asc=-18239 && $asc=-17922 && $asc=-17417 && $asc=-16474 && $asc=-16212 && $asc=-15640 && $asc=-15165 && $asc=-14922 && $asc=-14914 && $asc=-14630 && $asc=-14149 && $asc=-14090 && $asc=-13318 && $asc=-12838 && $asc=-12556 && $asc=-11847 && $asc=-11055 && $asc
方法二:根据拼音组合数组检索
<?php class pinyin{ private $d=array( array("a",-20319), array("ai",-20317), array("an",-20304), array("ang",-20295), array("ao",-20292), array("ba",-20283), array("bai",-20265), array("ban",-20257), array("bang",-20242), array("bao",-20230), array("bei",-20051), array("ben",-20036), array("beng",-20032), array("bi",-20026), array("bian",-20002), array("biao",-19990), array("bie",-19986), array("bin",-19982), array("bing",-19976), array("bo",-19805), array("bu",-19784), array("ca",-19775), array("cai",-19774), array("can",-19763), array("cang",-19756), array("cao",-19751), array("ce",-19746), array("ceng",-19741), array("cha",-19739), array("chai",-19728), array("chan",-19725), array("chang",-19715), array("chao",-19540), array("che",-19531), array("chen",-19525), array("cheng",-19515), array("chi",-19500), array("chong",-19484), array("chou",-19479), array("chu",-19467), array("chuai",-19289), array("chuan",-19288), array("chuang",-19281), array("chui",-19275), array("chun",-19270), array("chuo",-19263), array("ci",-19261), array("cong",-19249), array("cou",-19243), array("cu",-19242), array("cuan",-19238), array("cui",-19235), array("cun",-19227), array("cuo",-19224), array("da",-19218), array("dai",-19212), array("dan",-19038), array("dang",-19023), array("dao",-19018), array("de",-19006), array("deng",-19003), array("di",-18996), array("dian",-18977), array("diao",-18961), array("die",-18952), array("ding",-18783), array("diu",-18774), array("dong",-18773), array("dou",-18763), array("du",-18756), array("duan",-18741), array("dui",-18735), array("dun",-18731), array("duo",-18722), array("e",-18710), array("en",-18697), array("er",-18696), array("fa",-18526), array("fan",-18518), array("fang",-18501), array("fei",-18490), array("fen",-18478), array("feng",-18463), array("fo",-18448), array("fou",-18447), array("fu",-18446), array("ga",-18239), array("gai",-18237), array("gan",-18231), array("gang",-18220), array("gao",-18211), array("ge",-18201), array("gei",-18184), array("gen",-18183), array("geng",-18181), array("gong",-18012), array("gou",-17997), array("gu",-17988), array("gua",-17970), array("guai",-17964), array("guan",-17961), array("guang",-17950), array("gui",-17947), array("gun",-17931), array("guo",-17928), array("ha",-17922), array("hai",-17759), array("han",-17752), array("hang",-17733), array("hao",-17730), array("he",-17721), array("hei",-17703), array("hen",-17701), array("heng",-17697), array("hong",-17692), array("hou",-17683), array("hu",-17676), array("hua",-17496), array("huai",-17487), array("huan",-17482), array("huang",-17468), array("hui",-17454), array("hun",-17433), array("huo",-17427), array("ji",-17417), array("jia",-17202), array("jian",-17185), array("jiang",-16983), array("jiao",-16970), array("jie",-16942), array("jin",-16915), array("jing",-16733), array("jiong",-16708), array("jiu",-16706), array("ju",-16689), array("juan",-16664), array("jue",-16657), array("jun",-16647), array("ka",-16474), array("kai",-16470), array("kan",-16465), array("kang",-16459), array("kao",-16452), array("ke",-16448), array("ken",-16433), array("keng",-16429), array("kong",-16427), array("kou",-16423), array("ku",-16419), array("kua",-16412), array("kuai",-16407), array("kuan",-16403), array("kuang",-16401), array("kui",-16393), array("kun",-16220), array("kuo",-16216), array("la",-16212), array("lai",-16205), array("lan",-16202), array("lang",-16187), array("lao",-16180), array("le",-16171), array("lei",-16169), array("leng",-16158), array("li",-16155), array("lia",-15959), array("lian",-15958), array("liang",-15944), array("liao",-15933), array("lie",-15920), array("lin",-15915), array("ling",-15903), array("liu",-15889), array("long",-15878), array("lou",-15707), array("lu",-15701), array("lv",-15681), array("luan",-15667), array("lue",-15661), array("lun",-15659), array("luo",-15652), array("ma",-15640), array("mai",-15631), array("man",-15625), array("mang",-15454), array("mao",-15448), array("me",-15436), array("mei",-15435), array("men",-15419), array("meng",-15416), array("mi",-15408), array("mian",-15394), array("miao",-15385), array("mie",-15377), array("min",-15375), array("ming",-15369), array("miu",-15363), array("mo",-15362), array("mou",-15183), array("mu",-15180), array("na",-15165), array("nai",-15158), array("nan",-15153), array("nang",-15150), array("nao",-15149), array("ne",-15144), array("nei",-15143), array("nen",-15141), array("neng",-15140), array("ni",-15139), array("nian",-15128), array("niang",-15121), array("niao",-15119), array("nie",-15117), array("nin",-15110), array("ning",-15109), array("niu",-14941), array("nong",-14937), array("nu",-14933), array("nv",-14930), array("nuan",-14929), array("nue",-14928), array("nuo",-14926), array("o",-14922), array("ou",-14921), array("pa",-14914), array("pai",-14908), array("pan",-14902), array("pang",-14894), array("pao",-14889), array("pei",-14882), array("pen",-14873), array("peng",-14871), array("pi",-14857), array("pian",-14678), array("piao",-14674), array("pie",-14670), array("pin",-14668), array("ping",-14663), array("po",-14654), array("pu",-14645), array("qi",-14630), array("qia",-14594), array("qian",-14429), array("qiang",-14407), array("qiao",-14399), array("qie",-14384), array("qin",-14379), array("qing",-14368), array("qiong",-14355), array("qiu",-14353), array("qu",-14345), array("quan",-14170), array("que",-14159), array("qun",-14151), array("ran",-14149), array("rang",-14145), array("rao",-14140), array("re",-14137), array("ren",-14135), array("reng",-14125), array("ri",-14123), array("rong",-14122), array("rou",-14112), array("ru",-14109), array("ruan",-14099), array("rui",-14097), array("run",-14094), array("ruo",-14092), array("sa",-14090), array("sai",-14087), array("san",-14083), array("sang",-13917), array("sao",-13914), array("se",-13910), array("sen",-13907), array("seng",-13906), array("sha",-13905), array("shai",-13896), array("shan",-13894), array("shang",-13878), array("shao",-13870), array("she",-13859), array("shen",-13847), array("sheng",-13831), array("shi",-13658), array("shou",-13611), array("shu",-13601), array("shua",-13406), array("shuai",-13404), array("shuan",-13400), array("shuang",-13398), array("shui",-13395), array("shun",-13391), array("shuo",-13387), array("si",-13383), array("song",-13367), array("sou",-13359), array("su",-13356), array("suan",-13343), array("sui",-13340), array("sun",-13329), array("suo",-13326), array("ta",-13318), array("tai",-13147), array("tan",-13138), array("tang",-13120), array("tao",-13107), array("te",-13096), array("teng",-13095), array("ti",-13091), array("tian",-13076), array("tiao",-13068), array("tie",-13063), array("ting",-13060), array("tong",-12888), array("tou",-12875), array("tu",-12871), array("tuan",-12860), array("tui",-12858), array("tun",-12852), array("tuo",-12849), array("wa",-12838), array("wai",-12831), array("wan",-12829), array("wang",-12812), array("wei",-12802), array("wen",-12607), array("weng",-12597), array("wo",-12594), array("wu",-12585), array("xi",-12556), array("xia",-12359), array("xian",-12346), array("xiang",-12320), array("xiao",-12300), array("xie",-12120), array("xin",-12099), array("xing",-12089), array("xiong",-12074), array("xiu",-12067), array("xu",-12058), array("xuan",-12039), array("xue",-11867), array("xun",-11861), array("ya",-11847), array("yan",-11831), array("yang",-11798), array("yao",-11781), array("ye",-11604), array("yi",-11589), array("yin",-11536), array("ying",-11358), array("yo",-11340), array("yong",-11339), array("you",-11324), array("yu",-11303), array("yuan",-11097), array("yue",-11077), array("yun",-11067), array("za",-11055), array("zai",-11052), array("zan",-11045), array("zang",-11041), array("zao",-11038), array("ze",-11024), array("zei",-11020), array("zen",-11019), array("zeng",-11018), array("zha",-11014), array("zhai",-10838), array("zhan",-10832), array("zhang",-10815), array("zhao",-10800), array("zhe",-10790), array("zhen",-10780), array("zheng",-10764), array("zhi",-10587), array("zhong",-10544), array("zhou",-10533), array("zhu",-10519), array("zhua",-10331), array("zhuai",-10329), array("zhuan",-10328), array("zhuang",-10322), array("zhui",-10315), array("zhun",-10309), array("zhuo",-10307), array("zi",-10296), array("zong",-10281), array("zou",-10274), array("zu",-10270), array("zuan",-10262), array("zui",-10260), array("zun",-10256), array("zuo",-10254) ); public function get_pinyin($str,$charset="utf-8"){ if($charset!="gb2312"){ $str=$this->set_char($str,$charset,"gb2312"); $str=$this->c($str); $str=$this->set_char($str,"gb2312",$charset); }else{ $str=$this->c($str); } return $str; } private function set_char($str,$charset="utf-8",$charset_out="gb2312"){ if(function_exists('iconv')){ $str=iconv($charset,$charset_out,$str); }elseif(function_exists("mb_convert_encoding")){ $str=mb_convert_encoding($str,$charset_out,$charset); } return $str; } private function g($num){ if($num>0 && $num-10247){ return ""; }else{ for($i=count($this->d)-1;$i>=0;$i--){ if($this->d[$i][1]d[$i][0]; } } private function c($str){ $ret=""; for($i=0;$i<strlen if>160){ $q=ord(substr($str,++$i,1)); $p=$p*256+$q-65536; } $ret.=$this->g($p); } return $ret; }}function get_pinyin($str,$charset="utf-8"){ $pinyin=new pinyin(); return $pinyin->get_pinyin($str,$charset);}?></strlen>
function getChinese($str,$charset='utf8'){ if($charset=='gb2312'){ if(!preg_match_all("/^[".chr(0xa1)."-".chr(0xff)."]+/",$str,$match)){ return false; } return implode('',$match[0]); } // if($charset=='utf8'){ if(!preg_match_all("/[\x{4e00}-\x{9fa5}]+/u",$str,$match)){ return false; } return implode('',$match[0]); } return false;}
<?php //取得字符串中汉字字符数字下划线和短连接符、逗号、句号、分号、冒号、书名号、问号等非特殊字符 function getChinese($str,$charset='utf8'){ if($charset=='gb2312'){ if(!preg_match_all("/^[".chr(0xa1)."-".chr(0xff)."A-Za-z0-9_\-\,\。\,\.\;\;\:\:\《\》\?\?\%\%\!\!\~\~]+/",$str,$match)){ return false; } return implode('',$match[0]); } if($charset=='utf8'){ if(!preg_match_all("/[\x{4e00}-\x{9fa5}A-Za-z0-9_\-\,\。\,\.\;\;\:\:\《\》\?\?\%\%\!\!\~\~]+/u",$str,$match)){ return false; } return implode('',$match[0]); } return false; }?>