用Python实现K-means算法时候,要计算随机两个数之间的欧氏距离,数据量为5000行,但计算的时间却有500多秒,不知道有什么能优化,求指教,代码如下循环
for i in range(len(data)): # 计算任意两点距离和
for j in range(i+1, len(data)):
random_sum += ed_relate(data[i][2:], data[j][2:])
ed_relate
def ed_relate(dataX, dataY):
'''
:param dataX:第一行
:param dataY: 第二行
:return: 之间的相似度
'''
sum = 0
if len(dataX) == len(dataY):
for a in range(0, len(dataX)):
sum += (float(dataX[a])-float(dataY[a])) ** 2
relate = math.sqrt(sum)
return relate
else:
print 'len is not equal'
return 0
数据data
[['3', '0010000000000', '1', '1', '4', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '4', '4', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000000010', '1', '0', '4', '2', '1', '3', '3', '2', '3', '5', '3', '2', '2', '3', '4', '2', '2', '4', '1', '1', '1', '1', '3', '2', '3', '2', '2', '3', '2', '2', '3']
['3', '0010000000000', '1', '3', '2', '3', '3', '3', '3', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '2', '2', '2', '3']
['2', '1000000000000', '2', '1', '3', '4', '2', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2', '3', '3', '2', '2', '2', '2', '3', '2', '2']
['2', '1000000000000', '1', '1', '5', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3']
['3', '0000000100000', '1', '0', '5', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '3', '3', '2']
['3', '0000000100000', '1', '0', '4', '2', '3', '3', '3', '2', '2', '2', '2', '2', '2', '1', '1', '2', '2', '2', '2', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0010000000000', '2', '1', '3', '4', '2', '2', '3', '2', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '3', '2', '2', '3']
['3', '0000010000000', '1', '1', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '3', '2', '2', '3', '2', '4', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2']
['3', '0010000000000', '3', '1', '4', '3', '3', '3', '4', '3', '3', '2', '3', '3', '2', '1', '1', '1', '4', '4', '4', '4', '4', '4', '3', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '0100000000000', '3', '4', '1', '2', '3', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '4', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['2', '0000000000100', '1', '2', '3', '4', '3', '2', '3', '1', '2', '2', '2', '2', '2', '2', '2', '4', '2', '2', '2', '3', '3', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000000010', '1', '3', '3', '2', '2', '3', '2', '3', '3', '3', '3', '3', '2', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['1', '0100000000000', '1', '1', '3', '2', '3', '3', '3', '2', '3', '3', '3', '3', '3', '3', '2', '1', '1', '3', '2', '2', '3', '1', '1', '1', '1', '1', '2', '3', '3', '1', '2']
['1', '0100000000000', '1', '2', '4', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000100000', '1', '1', '3', '3', '3', '2', '4', '4', '4', '4', '4', '2', '2', '1', '1', '3', '3', '4', '3', '4', '3', '1', '2', '1', '1', '1', '2', '2', '1', '1', '1']
['3', '0010000000000', '1', '2', '3', '3', '3', '2', '2', '2', '2', '3', '2', '2', '2', '2', '3', '2', '2', '3', '3', '3', '2', '2', '2', '2', '3', '3', '3', '2', '2', '2', '2']
['3', '0000010000000', '1', '1', '5', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000010000000', '1', '1', '5', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000100000', '1', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']]
只给出了前20行
怪我咯2017-04-18 09:32:33
The solution has been found. The problem lies in the two floats used to calculate the Euclidean distancesum += (float(dataX[a])-float(dataY[a])) ** 2
sum += (float(dataX[a])-float(dataY[a])) ** 2
先将data全转成 intdata = [[int(x) for x in row] for row in data]
First convert all data into intdata = [[int(x) for x in row] for row in data]
高洛峰2017-04-18 09:32:33
def ed_relate(dataX, dataY):
'''
:param dataX:第一行
:param dataY: 第二行
:return: 之间的相似度
'''
if len(dataX) == len(dataY):
relate = math.sqrt(sum(((float(x)-float(y)**2) for x, y in zip(dataX, dataY))))
return relate
else:
print 'len is not equal'
return 0
Not tested, it should improve some performance.
伊谢尔伦2017-04-18 09:32:33
It seems that there is no big problem with your existing code. I repeated your calculation 1,000 times, which is roughly equivalent to 20,000 pieces of data, and the time is 4.4 seconds.
You'd better profile it to see where the problem is.
迷茫2017-04-18 09:32:33
There seems to be no better way in terms of performance, but I think your double loop can be written more elegantly, such as:
def unique_pairs(n):
"""在range(n)范围内生成索引对, 其他地方需要类似迭代可以复用unique_pairs生成器"""
for i in range(n):
for j in range(i+1, n):
yield i, j
for i, j in unique_pairs(len(data)): # 计算任意两点距离和
random_sum += ed_relate(data[i][2:], data[j][2:])