First, pull the txt file into an Excel sheet, filter out the duplicated titles, and save those duplicates to a txt file:
The target txt file that needs deduplicating is also ready:
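If you would rather skip the Excel step, the same blacklist can also be built in a few lines of Python. This is only a minimal sketch, assuming the same 'RT'/'T1' record layout and the same file paths used in the main script below; adjust both to your own data:

# Sketch: build need_delete.txt (one duplicated title per line) without Excel.
# Assumes each record's title sits on a line starting with 'T1 ', as in the script below.
from collections import Counter

with open(r'D:\Desktop\数据\sum.txt', encoding='utf-8') as f:
    titles = [line[3:].rstrip('\n') for line in f if line.startswith('T1')]

# Keep only titles that appear more than once
dup_titles = [t for t, n in Counter(titles).items() if n > 1]

with open(r'D:\Desktop\数据\need_delete.txt', 'w', encoding='utf-8') as f:
    for t in dup_titles:
        f.write(t + '\n')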
Next, run the code below. Note that the file paths in the code must be changed to match your own setup:
# coding:utf-8
# __author__ = 'pig'
# Remove duplicate records

# Titles that need deduplication (the blacklist)
title_need_delete = []
f0 = open(r'D:\Desktop\数据\need_delete.txt', encoding='utf-8')
for line in f0.readlines():
    # Strip the trailing newline so titles compare reliably
    title_need_delete.append(line.rstrip('\n'))
f0.close()
# print(title_need_delete)

# Dictionary mapping each title to its occurrence count (stored as a one-element list)
title_dict = {}
# Titles seen so far
title_list = []
f1 = open(r'D:\Desktop\数据\sum.txt', encoding='utf-8')
data = f1.readlines()
f1.close()

num = 0
# First pass: build the dictionary, which records how many times each article appears
for index, line in enumerate(data):
    if line[0:2] == 'T1':
        num += 1
        title = line[3:].rstrip('\n')
        if title not in title_list:
            # Add a new key to the dictionary
            title_dict.setdefault(title, []).append(1)
            title_list.append(title)
        else:
            # Increase this article's occurrence count
            title_dict[title][0] += 1
# print(title_dict)
print("Original record count: " + str(num))

f2 = open('after_delete.txt', 'w', encoding='utf-8')
delete_num = 0
after_num = 0
# Second pass: write records to the new file (each article's block is written in one piece).
# If an article is on the blacklist, skip it until its last occurrence, which is the copy kept.
for index, line in enumerate(data):
    # Each article starts with an 'RT' line
    if line[0:2] == 'RT':
        # Locate this record's title line
        index_1 = index
        while data[index_1][0:2] != 'T1':
            index_1 += 1
        # Extract the title
        title = data[index_1][3:].rstrip('\n')
        # If it is on the blacklist and this is not its last occurrence,
        # skip the record and decrement its count
        if title in title_need_delete and title_dict[title][0] > 1:
            # print('Skipping: ' + title)
            title_dict[title][0] -= 1
            delete_num += 1
            continue
        else:
            # Otherwise write the whole record
            f2.write(data[index])
            after_num += 1
            index += 1
            while index < len(data) and data[index][0:2] != 'RT':
                f2.write(data[index])
                index += 1
print("Deduplication complete!")
print("Duplicate records removed: " + str(delete_num))
print("Remaining records: " + str(after_num))
f2.close()
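For reference, the script only looks at two tags: each record begins with a line starting with 'RT', and its title follows on a line starting with 'T1 '. A hypothetical two-record excerpt (the titles and any other fields are made up purely for illustration) would look like:

RT Journal Article
T1 First article title
...
RT Journal Article
T1 Second article title
...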
Run result: