如果您知道公共前缀所需的阈值频率:
#!/usr/bin/env python
from collections import Counter
from itertools import izip_longest
strings = ['foo_a','foo_b','foo_c','fnord']
threshold = .7 * len(strings)
prefix = []
for chars in izip_longest(*strings, fillvalue=''):
char, count = Counter(chars).most_common(1)[0]
if count < threshold:
break
prefix.append(char)
print(''.join(prefix))
# -> foo_
或者您可以收集所有常见前缀及其频率,然后再决定:
#!/usr/bin/env python
from collections import Counter
from itertools import izip_longest
strings = ['foo_a', 'foo_b','foo_c','fnord']
assert len(strings) > 1
threshold = len(strings)
prefix = []
prefixes = []
for chars in izip_longest(*strings, fillvalue=''):
char, count = Counter(chars).most_common(1)[0]
if count == 1:
break
elif count < threshold:
if prefix:
prefixes.append((''.join(prefix), threshold))
threshold = count
prefix.append(char)
if prefix:
prefixes.append((''.join(prefix), threshold))
print(prefixes)
# -> [('f', 4), ('foo_', 3)]
两个代码示例都假设存在主要前缀,即每个位置最常见的字符属于最常见的前缀。