search
HomeBackend DevelopmentPHP Tutorialsimilar_text算相似性时归一化时的疑问

我在算两个字符串的长度时,发现归一化时好像此函数采取的方式不一样。
第一次,我试了两个不一样长的字符串,算其编辑距离:
    echo "levenshtein计算:\n";echo levenshtein("seller_id","selr_id");echo "\n";
    得到的结果是:2

   再用同样的两个字符串,用PHP的similar_text函数来求其相似性
   echo "similar_text计算:\n";similar_text("seller_id","selr_id",$percent);
      echo $percent;
   出现在相似性是:87.5
把2这个距离归一化时,正好符合公式: 1-(编辑距离/(两个字符串的长度之和))

第二次,我试了两个一样长度的字符串,分别算其编辑距离和相似性
similar_text("abcd","1234",$percent);echo $percent;echo "\n";
echo levenshtein("abcd","1234");
得到的值分别为:4和0
正好符合公式: 1-(编辑距离/(任一个字符串的长度))

我的问题是:为什么对两个不一样长的字符串求相似性时,分母是两个字符串的长度之和呢?
我在网上找了些pdf文档看,对编辑距离归一化时,其分母是最长的那个字符串的长度呢。


回复讨论(解决方案)

第二次结果是0,4,你贴反了,误导群众。

第二次结果是0,4,你贴反了,误导群众。


呵呵,是的,写反了,谢谢指出来

笔误,对于归一化的疑问仍在~~

你的结论有普遍性吗?

不一样长

$str1	= "esca";$str2	= "bca";echo	levenshtein($str1,$str2), "\n";similar_text($str1,$str2,$percent);echo	$percent;/*257.142857142857*/

一样长度
$str1	= "esca";$str2	= "sbca";echo	levenshtein($str1,$str2), "\n";similar_text($str1,$str2,$percent);echo	$percent;/*275*/

嗯,similar_text 的算法与理论是不一样的,似乎是加权了
不知哪位能费神找一下源码贴出
手册上说是用递归实现的,代码应该不太长

#define LEVENSHTEIN_MAX_LENGTH 255/* {{{ reference_levdist * reference implementation, only optimized for memory usage, not speed */static int reference_levdist(const char *s1, int l1, const char *s2, int l2, int cost_ins, int cost_rep, int cost_del ){	int *p1, *p2, *tmp;	int i1, i2, c0, c1, c2;	if (l1 == 0) {		return l2 * cost_ins;	}	if (l2 == 0) {		return l1 * cost_del;	}	if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) {		return -1;	}	p1 = safe_emalloc((l2 + 1), sizeof(int), 0);	p2 = safe_emalloc((l2 + 1), sizeof(int), 0);	for (i2 = 0; i2 <= l2; i2++) {		p1[i2] = i2 * cost_ins;	}	for (i1 = 0; i1 < l1 ; i1++) {		p2[0] = p1[0] + cost_del;		for (i2 = 0; i2 < l2; i2++) {			c0 = p1[i2] + ((s1[i1] == s2[i2]) ? 0 : cost_rep);			c1 = p1[i2 + 1] + cost_del;			if (c1 < c0) {				c0 = c1;			}			c2 = p2[i2] + cost_ins;			if (c2 < c0) {				c0 = c2;			}			p2[i2 + 1] = c0;		}		tmp = p1;		p1 = p2;		p2 = tmp;	}	c0 = p1[l2];	efree(p1);	efree(p2);	return c0;}/* }}} *//* {{{ custom_levdist */static int custom_levdist(char *str1, char *str2, char *callback_name TSRMLS_DC){	php_error_docref(NULL TSRMLS_CC, E_WARNING, "The general Levenshtein support is not there yet");	/* not there yet */	return -1;}/* }}} *//* {{{ proto int levenshtein(string str1, string str2[, int cost_ins, int cost_rep, int cost_del])   Calculate Levenshtein distance between two strings */PHP_FUNCTION(levenshtein){	int argc = ZEND_NUM_ARGS();	char *str1, *str2;	char *callback_name;	int str1_len, str2_len, callback_len;	long cost_ins, cost_rep, cost_del;	int distance = -1;	switch (argc) {		case 2: /* just two strings: use maximum performance version */			if (zend_parse_parameters(2 TSRMLS_CC, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) {				return;			}			distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1);			break;		case 5: /* more general version: calc cost by ins/rep/del weights */			if (zend_parse_parameters(5 TSRMLS_CC, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) {				return;			}			distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del);			break;		case 3: /* most general version: calc cost by user-supplied function */			if (zend_parse_parameters(3 TSRMLS_CC, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) {				return;			}			distance = custom_levdist(str1, str2, callback_name TSRMLS_CC);			break;		default:			WRONG_PARAM_COUNT;	}	if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) {		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Argument string(s) too long");	}	RETURN_LONG(distance);}/* }}} */




/* {{{ proto int similar_text(string str1, string str2 [, float percent])   Calculates the similarity between two strings */PHP_FUNCTION(similar_text){	char *t1, *t2;	zval **percent = NULL;	int ac = ZEND_NUM_ARGS();	int sim;	int t1_len, t2_len;	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|Z", &t1, &t1_len, &t2, &t2_len, &percent) == FAILURE) {		return;	}	if (ac > 2) {		convert_to_double_ex(percent);	}	if (t1_len + t2_len == 0) {		if (ac > 2) {			Z_DVAL_PP(percent) = 0;		}		RETURN_LONG(0);	}	sim = php_similar_char(t1, t1_len, t2, t2_len);	if (ac > 2) {		Z_DVAL_PP(percent) = sim * 200.0 / (t1_len + t2_len);	}	RETURN_LONG(sim);}static int php_similar_char(const char *txt1, int len1, const char *txt2, int len2){	int sum;	int pos1, pos2, max;	php_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);	if ((sum = max)) {		if (pos1 && pos2) {			sum += php_similar_char(txt1, pos1,									txt2, pos2);		}		if ((pos1 + max < len1) && (pos2 + max < len2)) {			sum += php_similar_char(txt1 + pos1 + max, len1 - pos1 - max,									txt2 + pos2 + max, len2 - pos2 - max);		}	}	return sum;}

还差一个

static void php_similar_str(const char *txt1, int len1, const char *txt2, int len2, int *pos1, int *pos2, int *max)
{
char *p, *q;
char *end1 = (char *) txt1 + len1;
char *end2 = (char *) txt2 + len2;
int l;

*max = 0;
for (p = (char *) txt1; p  for (q = (char *) txt2; q  for (l = 0; (p + l  if (l > *max) {
*max = l;
*pos1 = p - txt1;
*pos2 = q - txt2;
}
}
}
}

PHP_FUNCTION(similar_text)
中有
sim = php_similar_char(t1, t1_len, t2, t2_len);

Z_DVAL_PP(percent) = sim * 200.0 / (t1_len + t2_len);//计算相似度

return sum; //返回两个字符串的匹配字符的数目

的确有点不一样
sim * 200.0 / (t1_len + t2_len)
的含义是 匹配的字符数占源串平均长度的百分比
与 1-编辑距离/源串的最大长度 相差并不太多

还差一个

static void php_similar_str(const char *txt1, int len1, const char *txt2, int len2, int *pos1, int *pos2, int *max)
{
char *p, *q;
char *end1 = (char *) txt1 + len1;
char *end2 = (char *)……

手册上说复杂度是 O(N**3) 似乎这个函数就是了吧?
那 php_similar_char 的递归就不计算在复杂度中吗?

应该说 similar_text 函数的设计者,考虑的还是蛮周到的
当传入的两个串长度相同时,计算的相似度与理论上并无差异
当传入的两个串长度不同时,得到的相似度不像理论上的那么陡峭。也就是说被匹配的概率变大
当然如果你不希望这样的话可以自行计算,串都是你的,他也返回了已匹配的数量。计算一下并不困难

复杂度问题,是不是可以这样考虑?

php_similar_char的复杂度,理想的情况是logN,但最差是N


而对于,php_similar_str函数里的
for (l = 0; (p + l 
这语句虽然是个循环,但是和前面的php_similar_char是有关系的。因为此处会“剔除”最长相同字符串

每找出最长字符串+1,则外层递归,最差的复杂度会-1



另外,这是两种不同的算法,不知道lz为什么要去找这种规律,我认为你会徒劳的。理解算法意思,不同的

PHP_FUNCTION(similar_text)
中有
sim = php_similar_char(t1, t1_len, t2, t2_len);

Z_DVAL_PP(percent) = sim * 200.0 / (t1_len + t2_len);//计算相似度

return sum; //返回两个字符串的匹配字符的数目

的确有点不一样
sim * 200……


算sim的这里我有点看不懂,是如何得到匹配的字符数的?
以seller_id和selr_id为例,其sim值通过相似度倒推过来,是7,7是如何比较得到的呢?能不能麻烦您详细说下?

应该说 similar_text 函数的设计者,考虑的还是蛮周到的
当传入的两个串长度相同时,计算的相似度与理论上并无差异
当传入的两个串长度不同时,得到的相似度不像理论上的那么陡峭。也就是说被匹配的概率变大
当然如果你不希望这样的话可以自行计算,串都是你的,他也返回了已匹配的数量。计算一下并不困难


“当传入的两个串长度不同时,得到的相似度不像理论上的那么陡峭”这里的理论是指1-(编辑距离/(最长的字符串的长度))这个吗?
$str1    = "esca";
$str2    = "sbca";
如果按照1-(编辑距离/(最长的字符串的长度)来算,是50,similar_text算出来,是75
确实变大了。不过为什么要这么处理呢,依据是什么

不是说了吗:匹配的概率变大
在很多情况下,输入的匹配条件就是误差多多的
多分严格的过滤条件,只能是无效劳动

依据是什么?实践
理论是建立在实践基础上的,不能用理论去约束实践。

不是说了吗:匹配的概率变大
在很多情况下,输入的匹配条件就是误差多多的
多分严格的过滤条件,只能是无效劳动

依据是什么?实践
理论是建立在实践基础上的,不能用理论去约束实践。


呵呵,那你的意思就是说,在实践中,人直觉感觉到的相似性一般要大于 1-编辑距离/源串的最大长度 这样算出来的值了~~~
另外,那个sim值是在算什么呢,那一段递归代码,实在木有看懂。。以seller_id和selr_id为例,其sim值是7,能否演示下7是如何得到的呢?谢谢啦

similar_text计算与编辑距离无关
similar_text计算方法是
两个字符的最长相似长度 与 两个字符串长度和的一半 的比值

$max_similar_len = 0;
$percent = 0;
$max_similar_len = similar_text($string1, $string2, $percent);
$perc = $max_similar_len * 2 / (strlen($string1) + strlen($string2));
这时$perc与$percent是相等的

Statement
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Working with Flash Session Data in LaravelWorking with Flash Session Data in LaravelMar 12, 2025 pm 05:08 PM

Laravel simplifies handling temporary session data using its intuitive flash methods. This is perfect for displaying brief messages, alerts, or notifications within your application. Data persists only for the subsequent request by default: $request-

cURL in PHP: How to Use the PHP cURL Extension in REST APIscURL in PHP: How to Use the PHP cURL Extension in REST APIsMar 14, 2025 am 11:42 AM

The PHP Client URL (cURL) extension is a powerful tool for developers, enabling seamless interaction with remote servers and REST APIs. By leveraging libcurl, a well-respected multi-protocol file transfer library, PHP cURL facilitates efficient execution of various network protocols, including HTTP, HTTPS, and FTP. This extension offers granular control over HTTP requests, supports multiple concurrent operations, and provides built-in security features.

Build a React App With a Laravel Back End: Part 2, ReactBuild a React App With a Laravel Back End: Part 2, ReactMar 04, 2025 am 09:33 AM

This is the second and final part of the series on building a React application with a Laravel back-end. In the first part of the series, we created a RESTful API using Laravel for a basic product-listing application. In this tutorial, we will be dev

Simplified HTTP Response Mocking in Laravel TestsSimplified HTTP Response Mocking in Laravel TestsMar 12, 2025 pm 05:09 PM

Laravel provides concise HTTP response simulation syntax, simplifying HTTP interaction testing. This approach significantly reduces code redundancy while making your test simulation more intuitive. The basic implementation provides a variety of response type shortcuts: use Illuminate\Support\Facades\Http; Http::fake([ 'google.com' => 'Hello World', 'github.com' => ['foo' => 'bar'], 'forge.laravel.com' =>

12 Best PHP Chat Scripts on CodeCanyon12 Best PHP Chat Scripts on CodeCanyonMar 13, 2025 pm 12:08 PM

Do you want to provide real-time, instant solutions to your customers' most pressing problems? Live chat lets you have real-time conversations with customers and resolve their problems instantly. It allows you to provide faster service to your custom

Notifications in LaravelNotifications in LaravelMar 04, 2025 am 09:22 AM

In this article, we're going to explore the notification system in the Laravel web framework. The notification system in Laravel allows you to send notifications to users over different channels. Today, we'll discuss how you can send notifications ov

Explain the concept of late static binding in PHP.Explain the concept of late static binding in PHP.Mar 21, 2025 pm 01:33 PM

Article discusses late static binding (LSB) in PHP, introduced in PHP 5.3, allowing runtime resolution of static method calls for more flexible inheritance.Main issue: LSB vs. traditional polymorphism; LSB's practical applications and potential perfo

PHP Logging: Best Practices for PHP Log AnalysisPHP Logging: Best Practices for PHP Log AnalysisMar 10, 2025 pm 02:32 PM

PHP logging is essential for monitoring and debugging web applications, as well as capturing critical events, errors, and runtime behavior. It provides valuable insights into system performance, helps identify issues, and supports faster troubleshoot

See all articles

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

AI Hentai Generator

AI Hentai Generator

Generate AI Hentai for free.

Hot Article

Repo: How To Revive Teammates
1 months agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. Energy Crystals Explained and What They Do (Yellow Crystal)
2 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
Hello Kitty Island Adventure: How To Get Giant Seeds
1 months agoBy尊渡假赌尊渡假赌尊渡假赌

Hot Tools

SublimeText3 English version

SublimeText3 English version

Recommended: Win version, supports code prompts!

Safe Exam Browser

Safe Exam Browser

Safe Exam Browser is a secure browser environment for taking online exams securely. This software turns any computer into a secure workstation. It controls access to any utility and prevents students from using unauthorized resources.

Zend Studio 13.0.1

Zend Studio 13.0.1

Powerful PHP integrated development environment

DVWA

DVWA

Damn Vulnerable Web App (DVWA) is a PHP/MySQL web application that is very vulnerable. Its main goals are to be an aid for security professionals to test their skills and tools in a legal environment, to help web developers better understand the process of securing web applications, and to help teachers/students teach/learn in a classroom environment Web application security. The goal of DVWA is to practice some of the most common web vulnerabilities through a simple and straightforward interface, with varying degrees of difficulty. Please note that this software

mPDF

mPDF

mPDF is a PHP library that can generate PDF files from UTF-8 encoded HTML. The original author, Ian Back, wrote mPDF to output PDF files "on the fly" from his website and handle different languages. It is slower than original scripts like HTML2FPDF and produces larger files when using Unicode fonts, but supports CSS styles etc. and has a lot of enhancements. Supports almost all languages, including RTL (Arabic and Hebrew) and CJK (Chinese, Japanese and Korean). Supports nested block-level elements (such as P, DIV),