首页 >后端开发 >php教程 >PHP敏感词过滤

PHP敏感词过滤

WBOY
WBOY原创
2016-07-25 08:44:14 1045浏览
  /**
   * Banned-word filter.
   *
   * The banned words are loaded in id-range batches, each batch is turned into
   * a character trie, and the text is scanned against every trie in turn.
   * Original author's note on performance: roughly 0.05 s per article.
   *
   * @author liuxu
   */
  class Logic_BlackWord
  {
      const APP_FORUM = 1;
      const APP_BLOG = 2;
      const APP_VOTE = 3;

      /**
       * Find every banned word contained in the text, grouped by word type.
       *
       * @param string $txt Raw text to check (may contain HTML tags).
       * @return array Map of word type => list of matched words (as strings).
       *               Empty array when nothing matched.
       */
      public function getHitList($txt)
      {
          $hitList = array();

          // Filter against the banned words batch by batch.
          $max = $this->getMax();
          if ($max) {
              $size = 1000;
              $last = ceil($max / $size);
              for ($page = 1; $page <= $last; $page++) {
                  $result = $this->getHitListByPage($txt, $page, $size);
                  if ($result) {
                      // array_replace() rather than array_merge(): an all-digit
                      // word is stored under an integer key, and array_merge()
                      // would renumber such keys and lose the word itself.
                      $hitList = array_replace($hitList, $result);
                  }
              }
          }

          // Invert word => type into type => [words].
          $hitList2 = array();
          foreach ($hitList as $hit => $type) {
              // Cast back to string: PHP turned all-digit words into int keys.
              $hitList2[$type][] = (string) $hit;
          }

          return $hitList2;
      }

      /**
       * Highest banned-word id, used to work out how many batches exist.
       *
       * The value is read from the cache first; on a miss it is queried from
       * the model and cached for 300 seconds.
       *
       * @return int|string The maximum id, or 0 when the query yields none.
       */
      private function getMax()
      {
          $redis = Rds::factory();
          $memKey = 'blackWord_max';

          $max = $redis->get($memKey);
          if ($max === false) {
              $max = 0;
              $blackWord = new Model_BlackWord_BlackWord();
              $para = array('field' => "MAX(id) AS max");
              $result = $blackWord->search($para);
              if (isset($result[0]['max'])) {
                  $max = $result[0]['max'];
              }
              $redis->setex($memKey, 300, $max);
          }

          return $max;
      }

      /**
       * Find the banned words of one batch that occur in the text.
       *
       * @param string $txt  Raw text to check.
       * @param int    $page 1-based batch number.
       * @param int    $size Width of the id range covered by one batch.
       * @return array Map of matched word => word type.
       */
      private function getHitListByPage($txt, $page = 1, $size = 1000)
      {
          $hitList = array();

          // Trie for this batch of banned words.
          $wordTree = $this->getWordTreeByPage($page, $size);

          // Keep only ASCII letters, digits and CJK ideographs so that tags,
          // punctuation or spacing inserted between characters cannot hide a word.
          $txt = strip_tags($txt);
          $txt = preg_replace('/[^a-zA-Z0-9\\x{4e00}-\\x{9fa5}]/iu', '', $txt);
          if ($txt === null) {
              // preg_replace() failed (e.g. invalid UTF-8): nothing to scan.
              return $hitList;
          }

          $len = mb_strlen($txt, 'UTF-8');
          for ($i = 0; $i < $len; $i++) {
              $char = mb_substr($txt, $i, 1, 'UTF-8');
              if (!isset($wordTree[$char])) {
                  continue;
              }

              // Only the next 50 characters are examined, so banned words
              // longer than 50 characters can never match.
              $window = mb_substr($txt, $i, 50, 'UTF-8');
              foreach ($this->getHitListByTree($window, $wordTree) as $hit => $type) {
                  $hitList[$hit] = $type;
              }
          }

          return $hitList;
      }

      /**
       * Collect every banned word that is a prefix of the given text.
       *
       * Walks the trie one character at a time from the start of $txt and
       * records each node that carries a 'type' entry, stopping at the first
       * character that has no child node.
       *
       * @param string $txt      Text whose beginning is matched.
       * @param array  $wordTree Trie of banned words (taken by reference and
       *                         only read here).
       * @return array Map of matched word => word type.
       */
      private function getHitListByTree($txt, &$wordTree)
      {
          $len = mb_strlen($txt, 'UTF-8');
          $point = &$wordTree;
          $hit = '';
          $hitList = array();

          for ($i = 0; $i < $len; $i++) {
              $char = mb_substr($txt, $i, 1, 'UTF-8');
              if (!isset($point[$char])) {
                  break;
              }

              $hit .= $char;
              $point = &$point[$char];
              if (isset($point['type'])) {
                  // A complete banned word ends at this node.
                  $hitList[$hit] = $point['type'];
              }
          }

          return $hitList;
      }

      /**
       * Build (or fetch from cache) the trie for one batch of banned words.
       *
       * A batch covers the rows with status=1 whose id lies in
       * (($page-1)*$size, $page*$size]. Each trie node is an array keyed by
       * the next character; a node that ends a word also has a 'type' entry.
       * The built trie is cached for 300 seconds.
       *
       * @param int $page 1-based batch number.
       * @param int $size Width of the id range covered by one batch.
       * @return array The trie for this batch (empty when it has no words).
       */
      private function getWordTreeByPage($page = 1, $size = 1000)
      {
          $redis = Rds::factory();
          $memKey = 'blackWord_tree_' . $page . '_' . $size;

          $wordTree = $redis->get($memKey);
          if ($wordTree === false) {
              $wordTree = array();
              $blackWord = new Model_BlackWord_BlackWord();
              $start = ($page - 1) * $size;
              $end = $start + $size;
              $para = array(
                  'where' => "status=1 AND id>" . $start . " AND id<=" . $end,
              );
              $result = $blackWord->search($para);
              if ($result) {
                  foreach ($result as $value) {
                      // Explicit emptiness test so that the word "0" is kept.
                      if (!isset($value['word']) || (string) $value['word'] === '') {
                          continue;
                      }

                      // Split the word into individual UTF-8 characters.
                      $chars = preg_split('//u', (string) $value['word'], -1, PREG_SPLIT_NO_EMPTY);
                      if (!$chars) {
                          // Invalid UTF-8 in the stored word: skip it.
                          continue;
                      }

                      // Descend through the trie, creating nodes as needed.
                      $point = &$wordTree;
                      foreach ($chars as $char) {
                          $point = &$point[$char];
                      }
                      $point['type'] = $value['type'];
                      // Drop the reference so it cannot alias a node later on.
                      unset($point);
                  }
              }
              $redis->setex($memKey, 300, $wordTree);
          }

          return $wordTree;
      }
  }
复制代码

PHP


声明:
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn