底下程式碼中,先將 train_ts.csv 進行斷詞,再開始向量化並將模型存檔;之後載入模型,即可偵測跟 “美國” 相近的結果
import codecs
import time
import jieba
import pandas as pd
import re
from gensim.models import Word2Vec
# Lift every pandas display limit so DataFrames print in full
# (no truncation of rows, columns, line width, or cell contents).
display = pd.options.display
for _option in ("max_columns", "max_rows", "width", "max_colwidth"):
    pd.set_option(f"display.{_option}", None)
def preprocess(text, pattern=None):
    """Strip unwanted characters and all spaces from one line of text.

    The input is converted to ``str`` and lower-cased, every match of
    *pattern* is replaced by a space, surrounding whitespace is trimmed,
    and finally all remaining space characters are removed.

    Args:
        text: The raw value to clean; any object is accepted and passed
            through ``str()`` first.
        pattern: Regular expression of the characters to remove. When
            omitted, the module-level ``TEXT_CLEANING_RE`` is used; it is
            looked up at call time, so it may be defined after this
            function.

    Returns:
        The cleaned string.
    """
    if pattern is None:
        pattern = TEXT_CLEANING_RE
    cleaned = re.sub(pattern, ' ', str(text).lower()).strip()
    # strip() only trims the ends; drop the spaces left inside the text too.
    return cleaned.replace(" ", "")
# Tokenise train_ts.csv with jieba, train a Word2Vec model on the tokens,
# and save the trained model to "w2v_model_chinese".
print("斷詞處理中...............")

# Single character class of everything preprocess() should blank out.
# It is written as raw strings with the brackets escaped: an unescaped "]"
# would close the class early and turn the rest into a literal sequence.
# NOTE(review): the full-width variants are included on the assumption that
# the corpus contains full-width punctuation -- confirm against the data.
TEXT_CLEANING_RE = (
    r"[0-9a-zA-Z"                       # ASCII digits and Latin letters
    r",:.?!\"+\-*/_='()\[\]|<>$% "      # ASCII punctuation and the space
    r"\uFF0C\uFF1A\uFF1F\uFF01"         # full-width , : ? !
    r"\uFF08\uFF09\uFF3B\uFF3D\uFF5C"   # full-width ( ) [ ] |
    r"、《》【】“”…❤]"                   # CJK punctuation and symbols
)

jieba.set_dictionary('dict.txt')

# Load the stop words, one per line. Splitting on '\n' leaves an empty
# string in the set when the file ends with a newline, so empty tokens are
# filtered out below as well.
with open('stops.txt', 'r', encoding='utf-8') as file:
    stops = set(file.read().split('\n'))

with open('train_ts.csv', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# One token list per input line, with stop words removed.
vocabularies = [
    [t for t in jieba.cut(preprocess(line), cut_all=True) if t not in stops]
    for line in lines
]

w2v_model = Word2Vec(
    window=7,
    min_count=10,
    workers=8,
)
w2v_model.build_vocab(vocabularies)
words = list(w2v_model.wv.key_to_index.keys())
vocab_size = len(words)
print("Vocab size", vocab_size)

print("開始向量化....")
t1 = time.time()
w2v_model.train(vocabularies, total_examples=len(vocabularies), epochs=32)
t2 = time.time()
print(f'向量化時間 : {t2-t1}秒')

# Persist the model so later runs can load it instead of retraining.
w2v_model.save("w2v_model_chinese")
結果:
斷詞處理中...............
Building prefix dict from E:\python\w2v\dict.txt ...
Loading model from cache C:\Users\mahal\AppData\Local\Temp\jieba.u3f38139618254a46357e04c6afbde5be.cache
Loading model cost 0.274 seconds.
Prefix dict has been built successfully.
向量化處理中...............
Vocab size 30530
向量化時間 : 48.616782903671265秒
載入向量模型
因為 train_ts.csv 有 320,767 筆資料,訓練向量模型需要一段時間;還好上面的程式碼已將模型儲存成 “w2v_model_chinese” 檔案,所以下次要使用時,可以直接載入模型進行偵測。
無法訓練的人,可由本站下載 w2v_model_chinese
from gensim.models import Word2Vec
# Load the saved model and look up similar words interactively until the
# user types "quit".
model = Word2Vec.load("w2v_model_chinese")
while True:
    voc = input("請輸入要查詢的字詞 : ")
    if voc == 'quit':
        break
    try:
        rs = model.wv.most_similar(voc)
    except KeyError:
        # Only a word missing from the vocabulary is reported here;
        # Ctrl+C and real errors are no longer swallowed.
        print("不在字詞中")
    else:
        for r in rs:
            print(r)
結果
('日本軍', 0.6112362146377563)
('軍國', 0.579851508140564)
('軍國主義', 0.558709979057312)
('本天', 0.5249565839767456)
('日本海', 0.5150989294052124)
('底牌', 0.4993012547492981)
('來日', 0.49267053604125977)
('日本人', 0.48523956537246704)
('導彈', 0.4833442270755768)
('日本國', 0.4820500314235687)
