静かなる名辞

pythonとプログラミングのこと


ロジスティック回帰で特徴語を抽出する

はじめに

 線形判別分析など、線形の判別モデルは係数を使って各クラスに重要な特徴を取り出すことができます。

 今回はロジスティック回帰を使って、20newsgroupsのデータセットから各クラスの特徴語を取り出してみます。

実験

 以下のようなコードを走らせます。

import textwrap
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def main():
    """Print the 20 most characteristic words of each 20newsgroups class.

    Fits a logistic regression on standardized bag-of-words counts and, for
    every class, prints the words with the largest coefficients, wrapped to
    40 columns under a ``# <class name>`` heading.
    """
    # The full dataset is large and slow to process, so use the test subset only.
    news20 = fetch_20newsgroups(subset="test")

    le = LabelEncoder()
    y = le.fit_transform(news20.target)

    cv = CountVectorizer(min_df=0.03, stop_words="english")
    X = cv.fit_transform(news20.data).toarray()

    # Features must share a common scale, otherwise the coefficients are
    # hard to interpret and cannot be compared with each other.
    ss = StandardScaler()
    # `multi_class="multinomial"` is deprecated in recent scikit-learn; the
    # default ("auto") already selects the multinomial formulation for the
    # lbfgs solver on multiclass targets, so the argument is omitted.
    lr = LogisticRegression(solver="lbfgs", max_iter=1000)
    pl = Pipeline([("ss", ss), ("lr", lr)])

    pl.fit(X, y)

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(cv, "get_feature_names_out"):
        vocabulary = cv.get_feature_names_out()
    else:
        vocabulary = cv.get_feature_names()
    # The vocabulary does not depend on the class, so build the array once.
    feature_names = np.asarray(vocabulary, dtype=object)

    for coef, target in zip(pl.named_steps.lr.coef_, le.classes_):
        # Indices of the 20 largest coefficients, in descending order.
        top = np.argsort(coef)[::-1][:20]
        print("#", news20.target_names[target])
        print(textwrap.fill(", ".join(feature_names[top]), 40))
        print()

# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
    

 重要なのは変数をスケーリングしていることで、これによって係数の大小を直接比較できるようになります。

結果

 こんな感じになりました。

# alt.atheism
god, christian, jesus, religion, said,
war, book, mac, free, earth, send,
years, law, 20, claim, example, game,
death, space, copy

# comp.graphics
graphics, file, files, available,
version, number, 31, bit, code,
computer, program, software, pc, data,
unix, months, wrong, view, light, far

# comp.os.ms-windows.misc
windows, file, ms, files, disk, win,
program, working, card, 11, user, dos,
using, house, version, number, mac,
software, video, better

# comp.sys.ibm.pc.hardware
card, pc, dos, drive, price, hardware,
computer, files, problems, chip, disk,
33, power, ms, 17, support, running,
hard, note, set

# comp.sys.mac.hardware
mac, apple, card, price, drive,
christian, 30, disk, problems, power,
user, software, video, speed, support,
running, ii, day, work, use

# comp.windows.x
windows, 00, use, graphics, code, file,
dos, running, ms, support, version, mit,
software, available, key, price, 24,
user, files, include

# misc.forsale
car, drive, sale, 00, price, disk,
original, wanted, copy, distribution,
guy, edu, unix, 000, pc, make, love,
house, games, sense

# rec.autos
car, nntp, speed, right, today, ll,
drive, buy, year, said, network, went,
needed, usa, ask, law, big, power,
really, came

# rec.motorcycles
dod, country, list, writes, drive, car,
need, standard, new, uk, type, past,
sale, gov, lot, sun, days, university,
course, understand

# rec.sport.baseball
game, team, situation, games, history,
list, year, national, argument, win,
local, time, start, box, play, past,
won, don, best, andrew

# rec.sport.hockey
game, team, games, paul, apple, 1993,
play, year, ca, didn, years, including,
group, man, win, 24, mike, situation,
night, best

# sci.crypt
chip, government, key, code, phone,
data, win, waco, war, message, bit,
computer, available, working, dod, 16,
running, issue, hardware, fbi

# sci.electronics
copy, host, pc, 40, chip, hardware,
disk, used, use, hp, sci, high, subject,
phone, available, program, discussion,
car, buy, video

# sci.med
information, 20, general, told, sci,
body, research, effect, vs, public,
taking, check, called, months, vms,
known, wants, common, know, particular

# sci.space
space, earth, edu, sci, au, human, life,
access, days, games, close, mr, private,
war, better, nasa, high, large, control,
head

# soc.religion.christian
1993, god, christian, jesus, people,
add, given, exactly, word, religion,
original, free, book, sun, body,
subject, andrew, human, think, important

# talk.politics.guns
fbi, waco, law, posting, com,
government, house, believe, second, 50,
people, evidence, change, control, sale,
buy, religion, national, hope, car

# talk.politics.mideast
mr, government, history, vs, people,
000, center, war, writes, mit,
university, did, message, start,
support, force, open, religion, want,
based

# talk.politics.misc
waco, government, men, people, earth,
mr, data, children, fbi, general, usa,
force, time, questions, apr, day, 1993,
house, national, video

# talk.religion.misc
jesus, god, christian, religion, fbi,
including, apr, children, writes, years,
1993apr21, book, message, saying, apple,
day, doesn, happened, mr, government

納得感のある結果が得られています。少しmin_dfを大きくしすぎた(クラス固有の語を落とした可能性がある)かもしれませんが、すでに1993などノイズっぽいものが紛れ込んでいるので、あまりいじらない方が良いでしょう。

まとめ

 標準化して係数を見るのは簡単で議論の余地が少ない方法なので(まあ有意性とか言われる可能性はあるけど)、使いやすいです。