TF-IDF简介
TF-IDF使用流程
见上图,步骤如下:
- 根据全部语料,计算每个词对应的TF-IDF值
- 将每个句子按全部语料的词表进行编码(维度与one-hot编码相同),并用各词的TF-IDF值代替0/1作为特征
Python代码
流程很简单,就不手动复现了。这里给出两种使用方式:第一种逐步梳理整个流程,第二种是工程上能够简化代码的写法:
def get_text():
    """Return the toy training corpus used by the TF-IDF demos.

    Returns:
        A new list of three lowercase English sentences; the demos treat
        it as the complete training corpus.
    """
    # Pretend these sentences are the entire training corpus.
    corpus = (
        "nlp drives computer programs that translate text from one language to another",
        "nlp combines computational linguistics rule based modeling of human language with statistical",
        "nlp model respond to text or voice data and respond with text",
    )
    return list(corpus)
def main():
    """Walk through the TF-IDF pipeline step by step and print the results.

    The corpus from ``get_text()`` is first turned into raw term counts with
    ``CountVectorizer``; a ``TfidfTransformer`` is then fitted on those counts.
    Finally the learned vocabulary and the TF-IDF vector of one sample
    sentence are printed.
    """
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    print("逐个流程梳理:")
    sentence_list = get_text()

    # 1. Learn the vocabulary and compute the raw term counts (TF).
    count_vectorizer = CountVectorizer()
    word_count_vector = count_vectorizer.fit_transform(sentence_list)

    # 2. Fit the IDF weights on the term counts.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # TF-IDF values of the whole corpus; kept for illustration only, it is
    # not used by the printing code below.
    tf_idf_vector = tfidf_transformer.transform(word_count_vector)

    # Usage. ``get_feature_names()`` was removed in scikit-learn 1.2, so
    # prefer ``get_feature_names_out()`` and fall back only on old releases.
    # ``tolist()`` keeps the printed value a plain list of ``str``.
    try:
        vocabulary = count_vectorizer.get_feature_names_out().tolist()
    except AttributeError:  # scikit-learn < 1.0
        vocabulary = count_vectorizer.get_feature_names()
    print("全部语料:", vocabulary)

    my_sentence = "nlp combines computational linguistics"
    sentence_counts = count_vectorizer.transform([my_sentence])
    print("转换任意的一个句子:", tfidf_transformer.transform(sentence_counts).todense())
def main2():
    """Run the same TF-IDF demo using the one-step ``TfidfVectorizer``.

    ``TfidfVectorizer`` is fitted directly on the corpus from ``get_text()``;
    the learned vocabulary and the TF-IDF vector of one sample sentence are
    then printed.
    """
    print("简写操作:")
    from sklearn.feature_extraction.text import TfidfVectorizer

    sentence_list = get_text()
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer.fit(sentence_list)

    # Usage. ``get_feature_names()`` was removed in scikit-learn 1.2, so
    # prefer ``get_feature_names_out()`` and fall back only on old releases.
    # ``tolist()`` keeps the printed value a plain list of ``str``.
    try:
        vocabulary = tfidf_vectorizer.get_feature_names_out().tolist()
    except AttributeError:  # scikit-learn < 1.0
        vocabulary = tfidf_vectorizer.get_feature_names()
    print("全部语料:", vocabulary)

    my_sentence = "nlp combines computational linguistics"
    print("转换任意的一个句子:", tfidf_vectorizer.transform([my_sentence]).todense())
# Script entry point: run the step-by-step demo, then the shorthand one.
if __name__ == "__main__":
    for demo in (main, main2):
        demo()