在学习boost的时候测试了一下boost的algorithm中的三个search函数,分别是Boyer-Moore Search,Boyer-Moore-Horspool Search,Knuth-Morris-Pratt Search,同时也测试了std的search函数,测试的结果有点意外,std的search花的时间比前面三个快了两个数量级,问题出在哪儿?测试的代码如下:
#define _CRT_SECURE_NO_WARNINGS
#include <boost/algorithm/searching/boyer_moore.hpp>
#include <boost/algorithm/searching/boyer_moore_horspool.hpp>
#include <boost\algorithm\searching\knuth_morris_pratt.hpp>
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
#include <Windows.h>
#include <time.h>
using namespace std;
using namespace boost::algorithm;
void printMessage(char* msg, int startX, int startY)
{
HANDLE hd;
COORD pos;
pos.X = startX;
pos.Y = startY;
hd = GetStdHandle(STD_OUTPUT_HANDLE);
SetConsoleCursorPosition(hd, pos);
printf("%s", msg);
}
int main(int argc, char ** argv)
{
printf("Generating test set\n");
vector<int> vec;
srand(unsigned int(time(NULL)));
for (int i = 0; i < 10000; i++)
{
vec.push_back(rand() % 15000);
char msg[50];
sprintf(msg,"%.2f%%\n", i*1.0/99.99);
printMessage(msg, 0, 1);
}
LARGE_INTEGER t1, t2, tc;
QueryPerformanceFrequency(&tc);
QueryPerformanceCounter(&t1);
printf("Testing boyer moore search\n");
int count = 0;
for (int i = 0; i < 1000; i++)
{
vector<int> key;
key.push_back(rand() % 15000);
if (boyer_moore_search(vec.begin(), vec.end(), key.begin(), key.end()) != vec.end())
{
count++;
}
char msg[50];
sprintf(msg,"%.2f%%\n", i / 9.99);
printMessage(msg, 0, 3);
}
cout << "Hit " << count << "times\n";
QueryPerformanceCounter(&t2);
printf("Costing time: %.2f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
system("pause");
QueryPerformanceCounter(&t1);
printf("Testing boyer_moore_horspool search\n");
count = 0;
for (int i = 0; i < 1000; i++)
{
vector<int> key;
key.push_back(rand() % 15000);
if (boyer_moore_horspool_search(vec.begin(), vec.end(), key.begin(), key.end()) != vec.end())
{
count++;
}
char msg[50];
sprintf(msg, "%.2f%%\n", i / 9.99);
printMessage(msg, 0, 8);
}
cout << "Hit " << count << "times\n";
QueryPerformanceCounter(&t2);
printf("Costing time: %.2f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
system("pause");
QueryPerformanceCounter(&t1);
printf("Testing knuth_morris_pratt search\n");
count = 0;
for (int i = 0; i < 1000; i++)
{
vector<int> key;
key.push_back(rand() % 15000);
if (knuth_morris_pratt_search(vec.begin(), vec.end(), key.begin(), key.end()) != vec.end())
{
count++;
}
char msg[50];
sprintf(msg, "%.2f%%\n", i/ 9.99);
printMessage(msg, 0, 13);
}
cout << "Hit " << count << "times\n";
QueryPerformanceCounter(&t2);
printf("Costing time: %.2f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
system("pause");
QueryPerformanceCounter(&t1);
printf("Tesing std search\n");
count = 0;
for (int i = 0; i < 1000; i++)
{
vector<int> key;
key.push_back(rand() % 15000);
if (search(vec.begin(), vec.end(), key.begin(), key.end()) != vec.end())
{
count++;
}
char msg[50];
sprintf(msg, "%.2f%%\n", i/ 9.99);
printMessage(msg, 0, 18);
}
cout << "Hit " << count << "times\n";
QueryPerformanceCounter(&t2);
printf("Costing time: %.2f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
system("pause");
return 0;
}
测试环境是vs2013,测试结果如下图:
是什么导致了这么大的差距?
天蓬老师2017-04-17 11:59:50
問題在於幾次測試所使用的來源串相同,但目標串不同;應該要保證幾次的測試的目標串也相同,才能比較演算法本身的差異(「控制變數法」)。
例如暴力法串配對:
python
def search(source, target): for si in range(len(source)): for ti in range(len(target)): if si + ti >= len(source) or source[si+ti] != target[ti]: break if ti == len(target)-1: return si return -1
最壞情況下的時間複雜度是O(mn), 其中m, n分別是source, target字串長度,例如有兩個字串:
source: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab
target: aaaaab
這種情況就會是O(mn)的複雜度,但如果target是:baaaaa
則是O(m)。
可以把程式的key填充放到vec填滿的循環裡去,都是隨機數沒什麼關係,當然也可以單獨用一個循環。
個人認為,單純用隨機數做source和target串,幾個演算法的表現都會接近O(m);若想凸顯幾個演算法的複雜度差異,可以針對具體演算法實現構造特殊case(如上面的source, target特例就是根據暴力法的具體實現構造的)