로이터_임베딩.py
Word2Vec 사전학습 임베딩을 활용한 로이터 뉴스 분류
핵심 개념
- 사전학습된 Word2Vec 임베딩 활용
- Gensim 라이브러리
- TextVectorization 레이어
- Bidirectional LSTM
- 8개 카테고리 다중 분류
# 중요: konlpy와 Korpora는 pip로 설치해야 한다.
#https://fasttext.cc/docs/en/crawl-vectors.html
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from konlpy.tag import Okt
import re
import os, pathlib, shutil, random
import numpy as np
from keras import layers
import requests
from gensim.models import KeyedVectors
# Dataset creation.
# NOTE(review): only the signature of this function is visible here; the body
# is elided in the source. Per the original inline comment it is meant to load
# the Reuters news data and sort it into categories under ``base_dir``
# (default "reuter") -- TODO confirm against the full implementation.
def create_dataset(base_dir="reuter"):
# ... load and classify the Reuters news data (implementation omitted)
def loadFile(base_dir="reuter"):
    """Print and return the number of files in each label sub-directory.

    Every sub-directory of ``base_dir`` is treated as one class label.
    Entries that are not directories (for example a stray ``.DS_Store``)
    are skipped instead of raising ``NotADirectoryError``.

    Args:
        base_dir: Root directory containing one sub-directory per label.

    Returns:
        A dict mapping each label name to the number of entries in its
        sub-directory, in sorted label order.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
    """
    # Sort so the report is deterministic; os.listdir() order is arbitrary.
    labelnames = sorted(
        name
        for name in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, name))
    )
    print(labelnames)

    counts = {}
    for label in labelnames:
        counts[label] = len(os.listdir(os.path.join(base_dir, label)))
        print(label, counts[label])

    print("전체 개수 : ", sum(counts.values()))
    return counts
"""
카테고리별 데이터:
acq 2292
crude 374
earn 3923
grain 51
interest 271
money-fx 293
ship 144
trade 326
전체 개수 : 7674
"""
# ---------------------------------------------------------------------------
# Training script: classify the news articles with a frozen, pre-trained
# Word2Vec embedding followed by a bidirectional LSTM.
# ---------------------------------------------------------------------------

# --- Data ------------------------------------------------------------------
batch_size = 32
# Hold out part of the data for validation. ModelCheckpoint below uses
# save_best_only=True, which compares ``val_loss`` by default; without
# validation data it would never write a checkpoint.
validation_split = 0.2
# Both subsets must be created with the same seed so that they do not overlap.
split_seed = 1337

train_ds = keras.utils.text_dataset_from_directory(
    "reuter/",
    batch_size=batch_size,
    validation_split=validation_split,
    subset="training",
    seed=split_seed,
)
val_ds = keras.utils.text_dataset_from_directory(
    "reuter/",
    batch_size=batch_size,
    validation_split=validation_split,
    subset="validation",
    seed=split_seed,
)
# One class per sub-directory; read it before ``map`` drops the attribute.
num_classes = len(train_ds.class_names)
# Text-only view of the training split, used to fit the vocabulary.
text_only_train_ds = train_ds.map(lambda x, y: x)

# --- Pre-trained Word2Vec model --------------------------------------------
path_to_word2vec_file = "GoogleNews-vectors-negative300.bin"
word2vec_model = KeyedVectors.load_word2vec_format(path_to_word2vec_file, binary=True)
print(f"Word2Vec model loaded. Vector size: {word2vec_model.vector_size}")
embedding_dim = word2vec_model.vector_size

# --- Text vectorization ----------------------------------------------------
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=600,
)
text_vectorization.adapt(text_only_train_ds)

# Turn the raw strings into integer token sequences. ``model.fit`` below
# consumes these datasets (``int_train_ds`` was previously never defined).
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# --- Embedding matrix ------------------------------------------------------
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
# Rows for words missing from Word2Vec (including the padding and OOV
# entries of the vocabulary) stay all-zero.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
print("Populating embedding matrix with Word2Vec vectors...")
hits = 0
misses = 0
# The vocabulary never holds more than ``max_tokens`` entries, so every
# index is a valid row of ``embedding_matrix``.
for word, i in word_index.items():
    try:
        embedding_matrix[i] = word2vec_model[word]
    except KeyError:
        misses += 1
    else:
        hits += 1
print(f"Converted {hits} words ({misses} misses)")

# --- Embedding layer (frozen: weights are not trained) ---------------------
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,  # index 0 is padding and is masked out for the LSTM
)

# --- Model -----------------------------------------------------------------
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
# One softmax unit per class directory instead of a hard-coded 8.
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",  # labels are integer class ids
    metrics=["accuracy"],
)
model.summary()

# --- Training --------------------------------------------------------------
callbacks = [
    # Keeps only the epoch with the lowest validation loss.
    keras.callbacks.ModelCheckpoint(
        "word2vec_embeddings_sequence_model2.keras",
        save_best_only=True,
    )
]
print("\nStarting model training with Word2Vec embeddings...")
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks,
)