问题在于您把所有主题当作一个整体来处理。如果您希望按各个部分分别处理,请使用原始答案(https://stackoverflow.com/a/31506466/2141635)中基于 groupby 的代码:先获取所有名称的集合,然后将该集合与 defaultdict 的键进行比较,找出每个部分中缺少的名称:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section, and print the result section by section.  Words that appear in
# some other section but not in the current one are printed with a value of 0.
from collections import defaultdict
from itertools import groupby, imap

d = defaultdict(float)

with open("doc1") as f, open("doc2") as f2:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first pass over doc1: first token of every non-blank, non-TOPIC line
    all_words = set()
    for line in f:
        if line.strip() and not line.startswith("TOPIC"):
            all_words.add(line.split()[0])
    f.seek(0)  # rewind so doc1 can be read a second time

    # groupby yields alternating runs of blank lines (key True) and non-blank
    # lines (key False); each non-blank run is handled as one section
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        next(section)  # the first line of the run is the header, not a word
        weight = next(values)
        for entry in section:
            name, val = entry.split()
            d[name] += float(val) * weight
        # zero-fill the words this section does not mention
        for word in all_words - d.viewkeys():
            d[word] = 0
        for word, prob in d.iteritems():
            print("Prob for {} is {}".format(word, prob))
        # start the next section with an empty accumulator
        d = defaultdict(float)
要存储所有输出,您可以将字典添加到列表中:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section and keep one dict per section in the list `out`, in file order.
# Words missing from a section are stored in that section's dict with value 0.
from collections import defaultdict
from itertools import groupby, imap

d = defaultdict(float)

with open("doc1") as f, open("doc2") as f2:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first pass over doc1: first token of every non-blank, non-TOPIC line
    all_words = set()
    for line in f:
        if line.strip() and not line.startswith("TOPIC"):
            all_words.add(line.split()[0])
    f.seek(0)  # rewind so doc1 can be read a second time

    out = []
    # groupby splits the file into runs of blank / non-blank lines; only the
    # non-blank runs (key False) carry data
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        next(section)  # the first line of the run is the header, not a word
        weight = next(values)
        for entry in section:
            name, val = entry.split()
            d[name] += float(val) * weight
        # zero-fill the words this section does not mention
        for word in all_words - d.viewkeys():
            d[word] = 0
        out.append(d)
        # fresh accumulator so the stored dict is not mutated by later sections
        d = defaultdict(float)
然后迭代列表:
# Print every stored word/value pair, one per-section dict at a time.
for section_probs in out:
    for word, prob in section_probs.iteritems():
        print("Prob for {} is {}".format(word, prob))
或者不用 defaultdict,改用 dict.fromkeys:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section and keep one plain dict per section in the list `out`.
# Each dict is pre-seeded with every word seen anywhere in doc1, so words a
# section does not mention keep the seed value.
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first token of every non-blank, non-TOPIC line (duplicates are harmless:
    # dict.fromkeys collapses them)
    all_words = [line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")]
    f.seek(0)  # rewind so doc1 can be read a second time

    out = []
    # groupby splits the file into runs of blank / non-blank lines; only the
    # non-blank runs (key False) carry data
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        next(section)  # the first line of the run is the header, not a word
        # kept in its own name so the open file handle `f` is not rebound
        weight = next(values)
        # seed with 0.0 for every section; seeding the first with 0.0 and the
        # rest with 0 made absent words float in one dict and int in the others
        d = dict.fromkeys(all_words, 0.0)
        for entry in section:
            name, val = entry.split()
            d[name] += float(val) * weight
        out.append(d)
如果您总是希望缺失的单词排在末尾,请使用 collections.OrderedDict,并用第一种方法把缺失的值追加到字典末尾:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section, keep one OrderedDict per section in `out`, then print them.
# Words are stored in the order they appear in the section; words the section
# does not mention are appended afterwards with value 0.
from collections import OrderedDict
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first token of every non-blank, non-TOPIC line in doc1
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so doc1 can be read a second time

    out = []
    # groupby splits the file into runs of blank / non-blank lines; only the
    # non-blank runs (key False) carry data
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        next(section)  # the first line of the run is the header, not a word
        # kept in its own name so the open file handle `f` is not rebound
        weight = next(values)
        # create the dict before it is used: it was previously first assigned
        # only after a section had been processed, so the first section
        # raised NameError
        d = OrderedDict()
        for entry in section:
            name, val = entry.split()
            # setdefault keeps the first value if a word repeats in a section
            d.setdefault(name, float(val) * weight)
        # append the words this section does not mention, after the real ones
        for word in all_words.difference(d):
            d[word] = 0
        out.append(d)

for top in out:
    for word, prob in top.iteritems():
        print("Prob for {} is {}".format(word, prob))
最后按顺序和主题存储:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section and store the result as an OrderedDict of OrderedDicts keyed by
# each section's header line, then print it header by header.
from collections import OrderedDict
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first pass over doc1: first token of every non-blank, non-TOPIC line
    all_words = set()
    for line in f:
        if line.strip() and not line.startswith("TOPIC"):
            all_words.add(line.split()[0])
    f.seek(0)  # rewind so doc1 can be read a second time

    out = OrderedDict()
    # groupby splits the file into runs of blank / non-blank lines; only the
    # non-blank runs (key False) carry data
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        # header line of the run, trailing whitespace removed, is the key
        topic = next(section).rstrip()
        probs = out[topic] = OrderedDict()
        weight = next(values)
        for entry in section:
            name, val = entry.split()
            # setdefault keeps the first value if a word repeats in a section
            probs.setdefault(name, float(val) * weight)
        # append the words this section does not mention, after the real ones
        for word in all_words.difference(probs):
            probs[word] = 0

for header, probs in out.items():
    print(header)  # the section's header line
    for word, prob in probs.iteritems():
        print("Prob for {} is {}".format(word, prob))  # that section's items
    print("\n")
doc1:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
doc2:
0.345 0.566667
Output:
TOPIC:topic_0 5892.0
Prob for site is 0.0128233197556
Prob for Internet is 0.00901731160895
Prob for online is 0.00790478615073
Prob for web is 0.00755346232181
Prob for say is 0.00550407331974
Prob for image is 0.00521130346231
Prob for BBC is 0
Prob for Mr is 0
Prob for s is 0
Prob for president is 0
Prob for tell is 0
TOPIC:topic_1 12366.0
Prob for Mr is 0.085187930859
Prob for s is 0.0293277438137
Prob for say is 0.0255701266375
Prob for president is 0.00870667394471
Prob for tell is 0.0076985327511
Prob for BBC is 0.0076985327511
Prob for web is 0
Prob for image is 0
Prob for online is 0
Prob for site is 0
Prob for Internet is 0
您可以使用常规 for 循环应用完全相同的逻辑,groupby 只是为您完成所有分组工作。
如果您实际上只想写入文件,那么代码就更简单:
# Multiply every word value in doc1 by the float from doc2 that belongs to the
# same section and write the result straight to prob.txt: the section's header
# line, one "word value" line per word, then the absent words with 0, then a
# blank line.
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2, open("prob.txt", "w") as f3:
    # doc2: whitespace-separated floats, consumed one per section of doc1
    values = imap(float, f2.read().split())

    # first pass over doc1: first token of every non-blank, non-TOPIC line
    all_words = set()
    for line in f:
        if line.strip() and not line.startswith("TOPIC"):
            all_words.add(line.split()[0])
    f.seek(0)  # rewind so doc1 can be read a second time

    # groupby splits the file into runs of blank / non-blank lines; only the
    # non-blank runs (key False) carry data
    for is_blank, section in groupby(f, key=lambda line: not line.strip()):
        if is_blank:
            continue
        topic = next(section)  # header line, written back exactly as read
        words = []
        flt = next(values)
        f3.write(topic)
        for entry in section:
            name, val = entry.split()
            words.append(name)
            f3.write("{} {}\n".format(name, float(val) * flt))
        # words this section does not mention are written with 0
        for word in all_words.difference(words):
            f3.write("{} {}\n".format(word, 0))
        f3.write("\n")
prob.txt:
TOPIC:topic_0 5892.0
site 0.0128233197556
Internet 0.00901731160895
online 0.00790478615073
web 0.00755346232181
say 0.00550407331974
image 0.00521130346231
BBC 0
Mr 0
s 0
president 0
tell 0
TOPIC:topic_1 12366.0
Mr 0.085187930859
s 0.0293277438137
say 0.0255701266375
president 0.00870667394471
tell 0.0076985327511
BBC 0.0076985327511
web 0
image 0
online 0
site 0
Internet 0