機械学習　スパム

#単語のデータベースを作成する
import os, glob
import MeCab
import numpy as np
import pickle

# Output file for the pickled training data.
# The extension was misspelled ".pikcle"; the scripts that read this data
# later in this file open "./ok-spam.pickle", so the name must match.
savefile = "./ok-spam.pickle"
# Set up the MeCab tagger
tagger = MeCab.Tagger("-r /etc/mecabrc -d /var/lib/mecab/dic/ipadic-utf8")
# Shared state
# word_dic maps word -> ID; the "__id" entry holds the next unused ID.
word_dic = {"__id": 0}
# One {"label": ..., "words": [...]} record per file that has been read.
files = []

def read_files(dir, label):
    """Read every ``*.txt`` file directly inside ``dir`` via ``read_file``.

    Args:
        dir: Directory to search (not recursive).
        label: Label passed to ``read_file`` for each file found.
    """
    # The local name no longer shadows the module-level ``files`` list.
    # glob returns paths in arbitrary order; sorting keeps the processing
    # order, and therefore the word IDs assigned downstream, reproducible.
    for path in sorted(glob.glob(dir + '/*.txt')):
        read_file(path, label)

def read_file(filename, label):
    """Read one UTF-8 text file and append its labelled word IDs to ``files``.

    Args:
        filename: Path of the text file to read.
        label: Label stored with the file's word-ID list.
    """
    # Read the whole file
    with open(filename, "rt", encoding="utf-8") as f:
        text = f.read()
    files.append({
        "label": label,
        "words": text_to_ids(text),
    })
    
def text_to_ids(text):
    """Convert text into a list of word IDs.

    Each line of the tagger output is split on tabs. Field 0 is used as the
    word and field 4 (split on "-") as the part of speech.
    NOTE(review): this field layout depends on the dictionary/output format
    the tagger is configured with -- confirm it matches the installed one.
    Lines with fewer than five tab-separated fields are skipped silently.

    Args:
        text: Raw text to analyse.

    Returns:
        list: IDs (from ``word_to_id``) of the nouns, verbs and adjectives
        found, in order of appearance; numeral nouns are excluded.
    """
    ids = []
    for line in tagger.parse(text).split("\n"):
        if line == 'EOS' or line == '':
            continue
        parts = line.split("\t")
        if len(parts) < 5:
            continue  # not enough fields in the analysis result; skip the line
        pos_fields = parts[4].split("-")
        hinsi = pos_fields[0]
        hinsi2 = pos_fields[1] if len(pos_fields) > 1 else ''
        # Discard particles, auxiliary verbs, symbols and numbers
        if hinsi not in ('名詞', '動詞', '形容詞'):
            continue
        if hinsi == '名詞' and hinsi2 == '数詞':
            continue
        # Convert the word to its ID
        ids.append(word_to_id(parts[0]))
    return ids

def word_to_id(word):
    """Return the ID for ``word``, registering it in ``word_dic`` if new.

    New words receive the current value of ``word_dic["__id"]``, which is
    then incremented.

    Args:
        word: The word to look up.

    Returns:
        The integer ID stored in ``word_dic`` for ``word``.
    """
    # Already registered: return the existing ID
    if word in word_dic:
        return word_dic[word]
    # Not registered yet: hand out the next free ID
    new_id = word_dic["__id"]
    word_dic["__id"] = new_id + 1
    word_dic[word] = new_id
    return new_id

def make_freq_data_allfiles():
    """Build the label list and frequency vectors for every entry in ``files``.

    Returns:
        tuple: ``(y, x)`` where ``y`` holds each entry's label and ``x`` the
        matching result of ``make_freq_data`` on that entry's word IDs.
    """
    labels = [entry['label'] for entry in files]
    vectors = [make_freq_data(entry['words']) for entry in files]
    return labels, vectors

def make_freq_data(words):
    """Turn a list of word IDs into a relative-frequency vector.

    Args:
        words: Iterable of integer word IDs (indices below
            ``word_dic["__id"]``).

    Returns:
        numpy.ndarray: Float array of length ``word_dic["__id"]`` whose
        element ``i`` is the share of ``words`` equal to ``i``. If ``words``
        is empty the array is all zeros.
    """
    # Count occurrences of each word
    total = 0
    dat = np.zeros(word_dic["__id"], 'float')
    for w in words:
        dat[w] += 1
        total += 1
    # Convert counts to frequencies; with no words the division would be
    # 0/0 and fill the vector with NaN, so leave the zeros in place.
    if total > 0:
        dat = dat / total
    return dat

# Build the training database from the files in the "ok" and "spam" directories
if __name__ == "__main__":
    read_files("ok", 0)
    read_files("spam", 1)
    y, x = make_freq_data_allfiles()
    # Save the data; the with-block closes the file even if dump fails
    with open(savefile, 'wb') as fp:
        pickle.dump([y, x, word_dic], fp)
    print("単語頻出データ作成完了")

単語頻出データ作成完了

# 全てのテキストを巡回して単語データベースを作成する
import os, glob
import MeCab
import numpy as np
import pickle

# Output file name
savefile = "./ok-spam.pickle"
# Set up MeCab --- (*1)
tagger = MeCab.Tagger("-r /etc/mecabrc -d /var/lib/mecab/dic/ipadic-utf8")
# Shared state --- (*2)
word_dic = {"__id": 0} # word dictionary; "__id" holds the next unused ID
files = [] # word data read from each file is appended here

# Read the files in the given directory --- (*3)
def read_files(dir, label):
    """Read every ``*.txt`` file directly inside ``dir`` via ``read_file``.

    Args:
        dir: Directory to search (not recursive).
        label: Label passed to ``read_file`` for each file found.
    """
    # The local name no longer shadows the module-level ``files`` list.
    # glob returns paths in arbitrary order; sorting keeps the processing
    # order, and therefore the word IDs assigned downstream, reproducible.
    for path in sorted(glob.glob(dir + '/*.txt')):
        read_file(path, label)

# Read a single file --- (*4)
def read_file(filename, label):
    """Read one UTF-8 text file and append its labelled word IDs to ``files``.

    Args:
        filename: Path of the text file to read.
        label: Label stored with the file's word-ID list.
    """
    # Read the whole file
    with open(filename, "rt", encoding="utf-8") as f:
        text = f.read()
    files.append({
        "label": label,
        "words": text_to_ids(text),
    })

def text_to_ids(text):
    """Convert text into a list of word IDs.

    Each line of the tagger output is split on tabs. Field 0 is used as the
    word and field 4 (split on "-") as the part of speech.
    NOTE(review): this field layout depends on the dictionary/output format
    the tagger is configured with -- confirm it matches the installed one.
    Lines with fewer than five tab-separated fields are skipped silently.

    Args:
        text: Raw text to analyse.

    Returns:
        list: IDs (from ``word_to_id``) of the nouns, verbs and adjectives
        found, in order of appearance; numeral nouns are excluded.
    """
    ids = []
    for line in tagger.parse(text).split("\n"):
        if line == 'EOS' or line == '':
            continue
        parts = line.split("\t")
        if len(parts) < 5:
            continue  # not enough fields in the analysis result; skip the line
        pos_fields = parts[4].split("-")
        hinsi = pos_fields[0]
        hinsi2 = pos_fields[1] if len(pos_fields) > 1 else ''
        # Discard particles, auxiliary verbs, symbols and numbers
        if hinsi not in ('名詞', '動詞', '形容詞'):
            continue
        if hinsi == '名詞' and hinsi2 == '数詞':
            continue
        # Convert the word to its ID
        ids.append(word_to_id(parts[0]))
    return ids


# Convert a word to its ID --- (*9)
def word_to_id(word):
    """Return the ID for ``word``, registering it in ``word_dic`` if new.

    New words receive the current value of ``word_dic["__id"]``, which is
    then incremented.

    Args:
        word: The word to look up.

    Returns:
        The integer ID stored in ``word_dic`` for ``word``.
    """
    # Already registered: return the existing ID
    if word in word_dic:
        return word_dic[word]
    # Not registered yet: hand out the next free ID
    new_id = word_dic["__id"]
    word_dic["__id"] = new_id + 1
    word_dic[word] = new_id
    return new_id

# Build the word-frequency data --- (*10)
def make_freq_data_allfiles():
    """Build the label list and frequency vectors for every entry in ``files``.

    Returns:
        tuple: ``(y, x)`` where ``y`` holds each entry's label and ``x`` the
        matching result of ``make_freq_data`` on that entry's word IDs.
    """
    labels = [entry['label'] for entry in files]
    vectors = [make_freq_data(entry['words']) for entry in files]
    return labels, vectors

def make_freq_data(words):
    """Turn a list of word IDs into a relative-frequency vector.

    Args:
        words: Iterable of integer word IDs (indices below
            ``word_dic["__id"]``).

    Returns:
        numpy.ndarray: Float array of length ``word_dic["__id"]`` whose
        element ``i`` is the share of ``words`` equal to ``i``. If ``words``
        is empty the array is all zeros.
    """
    # Count occurrences of each word
    total = 0
    dat = np.zeros(word_dic["__id"], 'float')
    for w in words:
        dat[w] += 1
        total += 1
    # Convert counts to frequencies --- (*11)
    # With no words the division would be 0/0 and fill the vector with NaN,
    # so leave the zeros in place.
    if total > 0:
        dat = dat / total
    return dat

# Build the training database from the files in the "ok" and "spam" directories
if __name__ == "__main__":
    read_files("ok", 0)
    read_files("spam", 1)
    y, x = make_freq_data_allfiles()
    # Save the data; the with-block closes the file even if dump fails
    with open(savefile, 'wb') as fp:
        pickle.dump([y, x, word_dic], fp)
    print("単語頻出データ作成完了")

単語頻出データ作成完了

#単語頻出データを機械学習するプログラム
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the data file (labels at index 0, frequency vectors at index 1)
data_file = "./ok-spam.pickle"
save_file = "./ok-spam-model.pickle"
with open(data_file, "rb") as fp:
    data = pickle.load(fp)
y = data[0]
x = data[1]

# Repeat training and testing 100 times
count = 100
rate = 0
# A model is written only when it beats this accuracy. The bar rises after
# every save, so the file ends up holding the best model seen instead of
# whichever run happened to exceed 0.94 last.
best_acc = 0.94
for i in range(count):
    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    # Train
    model = GaussianNB()
    model.fit(x_train, y_train)
    # Evaluate
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    # Save the model when the result is good enough
    if acc > best_acc:
        with open(save_file, "wb") as fp:
            pickle.dump(model, fp)
        best_acc = acc
    print(acc)
    rate += acc
# Show the average accuracy
print("----")
print("average=", rate / count)

0.8378378378378378
0.9459459459459459
0.8918918918918919
0.9459459459459459
0.8918918918918919
0.8918918918918919
0.9459459459459459
0.9459459459459459
0.8378378378378378
0.9459459459459459
0.8108108108108109
0.8648648648648649
0.8108108108108109
0.8108108108108109
0.9459459459459459
0.8918918918918919
0.8648648648648649
0.8108108108108109
0.8918918918918919
0.918918918918919
1.0
0.918918918918919
0.8918918918918919
0.918918918918919
0.7837837837837838
0.8378378378378378
0.972972972972973
0.918918918918919
0.7837837837837838
0.8378378378378378
0.8918918918918919
0.9459459459459459
0.918918918918919
0.918918918918919
0.8918918918918919
0.8378378378378378
1.0
0.8648648648648649
0.918918918918919
0.918918918918919
0.9459459459459459
0.8648648648648649
0.9459459459459459
0.918918918918919
0.8648648648648649
0.8378378378378378
0.8648648648648649
0.8918918918918919
0.918918918918919
0.8918918918918919
0.972972972972973
0.8918918918918919
0.9459459459459459
0.972972972972973
0.8918918918918919
0.918918918918919
0.918918918918919
0.8378378378378378
0.8648648648648649
0.918918918918919
0.8918918918918919
0.8378378378378378
0.8918918918918919
0.8918918918918919
1.0
0.918918918918919
0.918918918918919
0.972972972972973
0.8648648648648649
0.972972972972973
0.8378378378378378
0.8918918918918919
0.8378378378378378
0.918918918918919
0.918918918918919
0.9459459459459459
0.9459459459459459
0.972972972972973
0.918918918918919
0.8648648648648649
0.918918918918919
0.8648648648648649
0.6756756756756757
0.9459459459459459
0.9459459459459459
0.918918918918919
0.9459459459459459
0.8378378378378378
0.8918918918918919
0.8378378378378378
0.8378378378378378
0.8648648648648649
0.8378378378378378
0.8648648648648649
0.8918918918918919
0.918918918918919
0.8378378378378378
0.972972972972973
0.8918918918918919
0.8918918918918919
----
average= 0.8956756756756761

#自分のテキストをスパム判定してみよう
import pickle
import MeCab
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Texts to test
test_text1 = """
会社から支給されているiPhoneの調子が悪いです。
修理に出すので、しばらくはアプリのテストができません。
"""
test_text2 = """
億万長者になる方法を教えます。
すぐに以下のアドレスに返信してください。
"""
# File names
data_file = "./ok-spam.pickle"
model_file = "./ok-spam-model.pickle"
label_names = ['OK', "SPAM"]
# Load the word dictionary (third element of the pickled data)
with open(data_file, "rb") as fp:
    data = pickle.load(fp)
word_dic = data[2]
# Set up MeCab
tagger = MeCab.Tagger("-r /etc/mecabrc -d /var/lib/mecab/dic/ipadic-utf8")
# Load the trained model
with open(model_file, "rb") as fp:
    model = pickle.load(fp)

# Decide whether a text is spam
def check_spam(text):
    """Classify ``text`` with the loaded model and print the predicted label.

    The text is tokenised with ``tagger``. Every token found in ``word_dic``
    is counted, the counts are normalised to frequencies, and the vector is
    passed to ``model.predict``. Prints ``結果= OK`` or ``結果= SPAM``.

    Words are looked up by field 0 of each tagger line, the same field that
    ``text_to_ids`` registers in the dictionary. The previous lookup used
    field 3, so tokens whose two fields differ never matched.

    Args:
        text: The text to classify.
    """
    zw = np.zeros(word_dic['__id'])
    count = 0
    # Add up the occurrences of each known word
    for line in tagger.parse(text).split("\n"):
        if line == "EOS" or line == '':
            continue
        surface = line.split("\t")[0]
        # "__id" is the dictionary's counter entry, not a word; its value is
        # one past the last valid index.
        if surface == "__id" or surface not in word_dic:
            continue
        zw[word_dic[surface]] += 1
        count += 1
    # Normalise to frequencies; with no known words zw stays all zeros,
    # which also avoids dividing by zero.
    if count > 0:
        zw = zw / count
    # Predict
    pre = model.predict([zw])[0]
    print("結果=", label_names[pre])


# Classify both sample texts when run as a script
if __name__ == "__main__":
    for sample in (test_text1, test_text2):
        check_spam(sample)

結果= SPAM
結果= SPAM