Heim >Backend-Entwicklung >PHP-Tutorial >权重计算,稍加修改亦可用于分词,词频统计,全文和spam检测等

权重计算,稍加修改亦可用于分词,词频统计,全文和spam检测等

WBOY
WBOYOriginal
2016-07-25 08:49:09 · 934 Durchsuche
效率非常可观,你要是改成其他用处那效率我就不保证了
  1. /* vim: set expandtab tabstop=4 shiftwidth=4: */
  2. // +------------------------------------------------------------------------
  3. // Name : 权重计算
  4. // Description: 稍加修改,亦可用于分词,词频统计,全文检索和垃圾检测
  5. // Date : 2013/12/16 08:51
  6. // Authors : latel
  7. // +------------------------------------------------------------------------
  8. //
  9. /*外部调用示例*/
  10. /*
  11. $aItems = array(
  12. 'chinaisbig',
  13. 'whichisnot',
  14. 'totalyrightforme',
  15. );
  16. $aTable = array(
  17. 'china,is|small',
  18. 'china,big|me',
  19. 'china,is|big,which|not,me',
  20. 'totaly|right,for,me',
  21. );
  22. $oWeight = new weight;
  23. $oWeight->newItems($aItems);
  24. $oWeight->newTable($aTable); $aResult = $oWeight->getShow();
  25. */
  /**
   * Weight calculation over a byte-wise trie.
   *
   * A "table" is a list of lines; each line is a set of words separated by
   * ',' or '|'. Every word is stored in a trie and tagged with the key of the
   * table line it came from. An "item" is a string that is scanned for
   * dictionary words; each hit votes for the table line(s) owning the word.
   * getShow() reports, per item, the table key(s) that collected the most
   * votes.
   *
   * Usage:
   *   $oWeight = new weight;
   *   $oWeight->newItems($aItems);
   *   $oWeight->newTable($aTable);
   *   $aResult = $oWeight->getShow();
   */
  class weight {
      /**
       * Trie storage. Index 0 is the root node. Each node maps a single byte
       * to the index of its child node; the reserved key 'acc' holds the list
       * of table keys for words that end at that node.
       */
      protected $aDict = array(array());

      /** Items (strings) to be scored. */
      protected $aItems = array();

      /** Rule used to build the cached result in $aShow, or null if none. */
      protected $sLastRule = null;

      /** Per-item hit counts (table key => number of hits) of the last run. */
      protected $aMatchs = array();

      /** Cached result of the last genShow() call. */
      protected $aShow = array();

      /**
       * Drop the cached matches and result so the next getShow() recomputes.
       */
      private function init() {
          // Reset to empty arrays instead of unset(): the properties stay
          // declared and empty() keeps reporting "nothing cached".
          $this->aShow = array();
          $this->aMatchs = array();
          $this->sLastRule = null;
      }

      /**
       * Replace the items to score.
       *
       * @param mixed $mItems A list of strings, or a single string.
       */
      public function newItems($mItems) {
          $this->aItems = is_array($mItems) ? $mItems : array($mItems);
          $this->init();
      }

      /**
       * Import a lookup table and add its words to the dictionary.
       *
       * Words are added to the existing dictionary; it is not cleared first.
       *
       * @param array $aTable Table key => line of words separated by ',' or '|'.
       */
      public function newTable(array $aTable) {
          foreach ($aTable as $iTableKey => $sTableLine) {
              // '|' and ',' are treated as equivalent separators.
              $aWords = explode(',', str_replace('|', ',', $sTableLine));
              foreach ($aWords as $sWord) {
                  $this->genDict($sWord, $iTableKey);
              }
          }
          $this->init();
      }

      /**
       * Return the result for the given rule, using the cached one if valid.
       *
       * @param string $sRule Selection rule; only 'max' is implemented.
       * @return array Item key => list of winning table keys.
       */
      public function getShow($sRule = 'max') {
          if (empty($this->aItems) || empty($this->aDict)) {
              return array();
          }
          if (empty($this->aShow) || $sRule !== $this->sLastRule) {
              return $this->genShow($sRule);
          }
          return $this->aShow;
      }

      /**
       * Score every item against the dictionary and cache the result.
       *
       * @param string $sRule Selection rule; only 'max' is implemented. Any
       *                      other value produces an empty result.
       * @return array Item key => list of table keys with the highest hit
       *               count (several keys on a tie, none if nothing matched).
       */
      public function genShow($sRule) {
          $aShow = array();
          $aMatchs = array();
          foreach ($this->aItems as $mItemKey => $sItem) {
              $aCounts = array_count_values($this->matchWord($sItem));
              $aMatchs[] = $aCounts;
              switch ($sRule) {
                  case 'max':
                      // max() rejects an empty array (ValueError on PHP 8),
                      // so an item without any hit gets an empty list.
                      $aShow[$mItemKey] = empty($aCounts)
                          ? array()
                          : array_keys($aCounts, max($aCounts));
                      break;
              }
          }
          $this->aShow = $aShow;
          $this->aMatchs = $aMatchs;
          // Remember the rule so getShow() can actually reuse the cache.
          $this->sLastRule = $sRule;
          return $aShow;
      }

      /**
       * Insert one word into the trie and tag its end node with a table key.
       *
       * @param mixed $mWord Word to insert; processed byte by byte.
       * @param mixed $iKey  Table key recorded at the word's end node.
       */
      private function genDict($mWord, $iKey = '') {
          $sWord = (string) $mWord;
          if ($sWord === '') {
              // Empty words (e.g. from "a,,b") can never be matched.
              return;
          }
          $iInsertPoint = count($this->aDict); // index of the next new node
          $iCur = 0;                           // current node, 0 = root
          foreach (str_split($sWord) as $sChar) {
              if (isset($this->aDict[$iCur][$sChar])) {
                  $iCur = $this->aDict[$iCur][$sChar];
                  continue;
              }
              $this->aDict[$iInsertPoint] = array();
              $this->aDict[$iCur][$sChar] = $iInsertPoint;
              $iCur = $iInsertPoint;
              $iInsertPoint++;
          }
          $this->aDict[$iCur]['acc'][] = $iKey;
      }

      /**
       * Scan a string for dictionary words.
       *
       * The scan is byte-wise and stops at the first (shortest) word found
       * from a start position, then continues right after it. On a failed
       * partial match it restarts one byte after the previous start.
       *
       * @param string $sLine Text to scan.
       * @return array Table keys of every word found, one entry per hit.
       */
      public function matchWord($sLine) {
          $iCur = $iOffset = $iPosition = 0;
          // Sentinel byte: forces a mismatch at the end so a partial match
          // that runs into the end of the text is still backtracked.
          $sLine .= "\0";
          $iLen = strlen($sLine);
          $aReturn = array();
          while ($iOffset < $iLen) {
              $sChar = $sLine[$iOffset];
              if (isset($this->aDict[$iCur][$sChar])) {
                  $iCur = $this->aDict[$iCur][$sChar];
                  if (isset($this->aDict[$iCur]['acc'])) {
                      // A word ends here: collect its table keys and restart
                      // from the root right after the word.
                      $aReturn = array_merge($aReturn, $this->aDict[$iCur]['acc']);
                      $iPosition = $iOffset + 1;
                      $iCur = 0;
                  }
              } else {
                  // Mismatch: rewind to the start of the current attempt; the
                  // increment below then moves one byte past it.
                  $iCur = 0;
                  $iOffset = $iPosition;
                  $iPosition = $iOffset + 1;
              }
              ++$iOffset;
          }
          return $aReturn;
      }
  }
  116. ?>
复制代码


Stellungnahme:
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn