Task
使用朴素贝叶斯模型对新闻进行分类
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
数据处理
20newsgroups数据集是用于文本分类、文本挖掘和信息检索研究的国际标准数据集之一。该数据集收集了约20,000篇新闻组文档,大致均匀地分布在20个不同主题的新闻组中。
在sklearn中,该数据集有两种加载方式:
第一种是sklearn.datasets.fetch_20newsgroups,返回一个可以被文本特征提取器(sklearn.feature_extraction.text.CountVectorizer)自定义参数提取特征的原始文本序列;
第二种是sklearn.datasets.fetch_20newsgroups_vectorized,返回一个已提取特征的文本序列,即不需要使用特征提取器。
### 加载数据
from sklearn.datasets import fetch_20newsgroups

# Load the training split of 20 Newsgroups ('train' is the default subset).
twenty_news = fetch_20newsgroups(subset='train')
# X holds the raw documents, y the corresponding label array.
X, y = twenty_news.data, twenty_news.target
n_samples = len(X)
print(n_samples, type(X), type(y))
11314 <class 'list'> <class 'numpy.ndarray'>
### 查看数据
from pprint import pprint

# Pretty-print one training document, then display its label as the cell result.
sample_idx = 500
pprint(X[sample_idx])
y[sample_idx]
('From: bjorndahl@augustana.ab.ca\n'
'Subject: Re: document of .RTF\n'
'Organization: Augustana University College, Camrose, Alberta\n'
'Lines: 10\n'
'\n'
'In article <1993Mar30.113436.7339@worak.kaist.ac.kr>, tjyu@eve.kaist.ac.kr '
'(Yu TaiJung) writes:\n'
'> Does anybody have document of .RTF file or know where I can get it?\n'
'> \n'
'> Thanks in advance. :)\n'
'\n'
'I got one from Microsoft tech support.\n'
'\n'
'-- \n'
'Sterling G. Bjorndahl, bjorndahl@Augustana.AB.CA or bjorndahl@camrose.uucp\n'
'Augustana University College, Camrose, Alberta, Canada (403) 679-1100\n')
2
### 将文本转换为向量,使用tf-idf模型将每篇文档转化为130107维的向量
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()
# Fit on the training documents and encode them in one step.
vectors = vectorizer.fit_transform(X)
# Displayed as the cell result: the dimensions of the encoded training matrix.
vectors.shape
(11314, 130107)
# TF-IDF vectors are sparse: on average only about 158 entries per document are non-zero.
vectors.nnz / vectors.shape[0]
157.9958458546933
# Load the held-out test split.
news_test = fetch_20newsgroups(subset='test')
# Encode it with the already-fitted vectorizer (transform only, no refit).
vectors_test = vectorizer.transform(news_test.data)
# Displayed as the cell result: the dimensions of the encoded test matrix.
vectors_test.shape
(7532, 130107)
模型训练
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# alpha is the additive smoothing strength of the multinomial model.
clf = MultinomialNB(alpha=0.01).fit(vectors, y)
pred = clf.predict(vectors_test)
print(
    metrics.classification_report(
        news_test.target,
        pred,
        target_names=twenty_news.target_names,
    )
)
# macro avg: plain (unweighted) mean of the per-class scores
# weighted avg: mean of the per-class scores weighted by each class's sample count
precision recall f1-score support
alt.atheism 0.82 0.78 0.80 319
comp.graphics 0.69 0.75 0.72 389
comp.os.ms-windows.misc 0.74 0.63 0.68 394
comp.sys.ibm.pc.hardware 0.65 0.75 0.69 392
comp.sys.mac.hardware 0.83 0.84 0.83 385
comp.windows.x 0.84 0.78 0.81 395
misc.forsale 0.82 0.78 0.80 390
rec.autos 0.89 0.90 0.90 396
rec.motorcycles 0.93 0.96 0.95 398
rec.sport.baseball 0.95 0.94 0.95 397
rec.sport.hockey 0.95 0.97 0.96 399
sci.crypt 0.89 0.93 0.91 396
sci.electronics 0.79 0.77 0.78 393
sci.med 0.89 0.84 0.86 396
sci.space 0.87 0.91 0.89 394
soc.religion.christian 0.82 0.95 0.88 398
talk.politics.guns 0.76 0.91 0.83 364
talk.politics.mideast 0.97 0.94 0.96 376
talk.politics.misc 0.80 0.64 0.71 310
talk.religion.misc 0.76 0.59 0.67 251
accuracy 0.84 7532
macro avg 0.83 0.83 0.83 7532
weighted avg 0.84 0.84 0.83 7532
# The F1 score is around 0.83; next, look at the terms that contribute most to each class.
def show_top10(classifer, vectorizer, categories, *, top_n=10):
    """Print the highest-scoring terms of every category.

    Args:
        classifer: Fitted model exposing ``feature_log_prob_``, indexed first
            by class and then by feature.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()``
            returns an array of terms that can be indexed by feature position.
        categories: Category names; the i-th name labels row i of
            ``feature_log_prob_``.
        top_n: Number of terms printed per category (default 10). A value
            larger than the vocabulary prints every term; zero or a negative
            value prints none.

    Prints one ``"<category>: <terms>"`` line per category. Terms appear in
    ascending order of log probability, so the strongest term is last.
    """
    feature_names = vectorizer.get_feature_names_out()
    for i, category in enumerate(categories):
        # feature_log_prob_[i]: empirical log probability of features given class i, P(x_i|y).
        ranked = np.argsort(classifer.feature_log_prob_[i])
        # Clamp the start index: a bare ranked[-top_n:] would return *all*
        # terms for top_n == 0 and misbehave for negative values.
        top = ranked[max(len(ranked) - top_n, 0):]
        print(f"{category}: {' '.join(feature_names[top])}")
# Print the top terms per category for the unigram TF-IDF model.
show_top10(
    classifer=clf,
    vectorizer=vectorizer,
    categories=twenty_news.target_names,
)
alt.atheism: keith it and you in that is to of the
comp.graphics: edu in for it is and graphics of to the
comp.os.ms-windows.misc: file for of and edu is it to the windows
comp.sys.ibm.pc.hardware: card ide is of it drive and scsi to the
comp.sys.mac.hardware: in it is and of edu apple mac to the
comp.windows.x: it mit in motif and is of window to the
misc.forsale: shipping offer of 00 to and edu the for sale
rec.autos: that is you it in of and to car the
rec.motorcycles: dod you it com in of and bike to the
rec.sport.baseball: that is baseball and of in to he edu the
rec.sport.hockey: ca game he team and hockey of in to the
sci.crypt: chip that encryption is and clipper key of to the
sci.electronics: for edu you it in is and of to the
sci.med: edu pitt that it in and is to of the
sci.space: it that is nasa in and to of space the
soc.religion.christian: we it in and is god that to of the
talk.politics.guns: it is you that gun and in of to the
talk.politics.mideast: is you israeli that israel in and to of the
talk.politics.misc: edu it is you and in that of to the
talk.religion.misc: sandvik god you in is that and to of the
# Many of the top terms are words like "it", "is", "the"; try dropping such
# stop words and adding bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
# Refit on the training documents, then encode the test split with the same vectorizer.
vectors = vectorizer.fit_transform(X)
vectors_test = vectorizer.transform(news_test.data)
# Displayed as the cell result: dimensions of both encoded matrices.
vectors.shape, vectors_test.shape
((11314, 1186545), (7532, 1186545))
# Retrain on the new features with the same additive smoothing strength as before.
clf = MultinomialNB(alpha=0.01).fit(vectors, y)
pred = clf.predict(vectors_test)
print(
    metrics.classification_report(
        news_test.target,
        pred,
        target_names=twenty_news.target_names,
    )
)
precision recall f1-score support
alt.atheism 0.85 0.84 0.84 319
comp.graphics 0.71 0.71 0.71 389
comp.os.ms-windows.misc 0.71 0.66 0.68 394
comp.sys.ibm.pc.hardware 0.65 0.72 0.68 392
comp.sys.mac.hardware 0.82 0.81 0.81 385
comp.windows.x 0.81 0.79 0.80 395
misc.forsale 0.80 0.81 0.81 390
rec.autos 0.88 0.89 0.89 396
rec.motorcycles 0.93 0.95 0.94 398
rec.sport.baseball 0.93 0.92 0.92 397
rec.sport.hockey 0.93 0.96 0.95 399
sci.crypt 0.87 0.92 0.89 396
sci.electronics 0.78 0.73 0.75 393
sci.med 0.87 0.81 0.84 396
sci.space 0.84 0.90 0.87 394
soc.religion.christian 0.85 0.95 0.89 398
talk.politics.guns 0.79 0.91 0.84 364
talk.politics.mideast 0.96 0.94 0.95 376
talk.politics.misc 0.84 0.67 0.74 310
talk.religion.misc 0.80 0.61 0.69 251
accuracy 0.83 7532
macro avg 0.83 0.82 0.83 7532
weighted avg 0.83 0.83 0.83 7532
# Print the top terms per category for the stop-word-filtered unigram+bigram model.
show_top10(
    classifer=clf,
    vectorizer=vectorizer,
    categories=twenty_news.target_names,
)
alt.atheism: com atheism edu keith livesey atheists caltech caltech edu god edu keith
comp.graphics: subject polygon university com lines files 3d image edu graphics
comp.os.ms-windows.misc: ms drivers ax ax ax driver files dos file edu windows
comp.sys.ibm.pc.hardware: isa pc com controller edu bus card ide drive scsi
comp.sys.mac.hardware: scsi monitor simms centris se quadra drive edu apple mac
comp.windows.x: x11r5 xterm mit edu widget edu com server mit motif window
misc.forsale: lines distribution condition university new shipping offer 00 edu sale
rec.autos: organization subject oil writes article engine edu cars com car
rec.motorcycles: writes ca motorcycle bikes article ride edu dod com bike
rec.sport.baseball: article com runs games players game team year baseball edu
rec.sport.hockey: games season players play nhl edu ca game team hockey
sci.crypt: nsa government clipper chip escrow keys com chip encryption clipper key
sci.electronics: ca ground organization subject power circuit lines use com edu
sci.med: cs pitt com msg gordon banks geb pitt edu gordon banks edu pitt
sci.space: access digex access gov orbit alaska moon henry edu nasa space
soc.religion.christian: faith people christ christian bible edu church christians jesus god
talk.politics.guns: firearms weapons batf fbi stratus people com guns edu gun
talk.politics.mideast: armenia people arab edu armenians armenian jews turkish israeli israel
talk.politics.misc: kaldis government article writes clinton people optilink cramer com edu
talk.religion.misc: apple com people kent morality christian edu com jesus sandvik god
好吧,事实证明去掉停用词并加入二元词(bigram)对分类效果没什么提升(f1仍在0.83左右)。