네이버 리뷰 데이터를 활용한 한국어 감정 분석

네이버 영화 리뷰 데이터(Naver Sentiment Movie Corpus, NSMC)를 활용해서 감정 분석을 수행했습니다.

1) 전처리 코드는 다음 github에서 가져와서 사용했습니다. 출처 : https://github.com/reniew/NSMC_Sentimental-Analysis/blob/master/notebook/NSMC_Preprocessing.ipynb

2) 모델링은 IMDB 데이터에서 사용했던 LSTM 모델로 진행했습니다.

!pip install konlpy

import os

import numpy as np
import pandas as pd

from datetime import datetime
import json
import re

from konlpy.tag import Okt # komoran, han, kkma

import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence

from tqdm.notebook import tqdm

데이터 불러오기

# Load the NSMC train and test splits directly from the upstream repository.
# Both files are tab-separated with a header row; quoting=3 is the numeric
# value of csv.QUOTE_NONE, so quote characters are read as ordinary text.
train = pd.read_csv(
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt',
    header=0,
    delimiter='\t',
    quoting=3,
)
test = pd.read_csv(
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt',
    header=0,
    delimiter='\t',
    quoting=3,
)

# Preview the first rows of each split.
display(train.head())
display(test.head())

# output :

	id	document		label
0	9976970	아 더빙.. 진짜 짜증나네요 목소리	0
1	3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1
2	10265843	너무재밓었다그래서보는것을추천한다	0
3	9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0
4	6483659	사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...	1
id	document	label
0	6270596	굳 ㅋ	1
1	9274899	GDNTOPCLASSINTHECLUB	0
2	8544678	뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아	0
3	6825595	지루하지는 않은데 완전 막장임... 돈주고 보기에는....	0
4	6723715	3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??	0

# Show the (rows, columns) shape of the train and test frames.
train.shape, test.shape

# output :

((150000, 3), (50000, 3))

데이터 전처리

# Matches every character that is neither a Hangul syllable, a standalone
# jamo (consonant or vowel) nor whitespace.
_NON_HANGUL_PATTERN = re.compile(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]")


def preprocessing(review, okt, remove_stopwords=False, stop_words=None, test=False):
    """Strip non-Hangul characters from a review and split it into morphemes.

    Args:
        review: Raw review text. Anything that is not a ``str`` (for example
            a missing value) yields an empty token list.
        okt: Analyzer object exposing ``morphs(text, stem=True)``. It is
            passed in so one instance can be reused across many calls.
        remove_stopwords: When true, tokens contained in ``stop_words`` are
            dropped from the result.
        stop_words: Iterable of tokens to drop. ``None`` (the default) means
            no stop words.
        test: When true, print the cleaned text and the token list, which is
            handy for inspecting a single sample.

    Returns:
        The tokens produced by ``okt.morphs`` with stemming enabled, with
        stop words removed if requested.
    """
    # Non-string input cannot be cleaned by the regex; treat it as empty.
    if not isinstance(review, str):
        return []

    # 1. Remove everything except Hangul and whitespace.
    review_text = _NON_HANGUL_PATTERN.sub("", review)

    # 2. Split into morphemes with the supplied analyzer.
    word_review = okt.morphs(review_text, stem=True)

    if test:
        print(review_text)
        print(word_review)

    # 3. Optionally drop stop words; a set keeps each lookup constant-time.
    if remove_stopwords and stop_words:
        stop_set = set(stop_words)
        word_review = [token for token in word_review if token not in stop_set]

    return word_review

# Pick the first training review as a sample and display it.
sample_review = train['document'][0]
sample_review

# output :

아 더빙.. 진짜 짜증나네요 목소리

# Stop words that are dropped after morphological analysis.
stop_words = [
    '은', '는', '이', '가', '하', '아', '것', '들',
    '의', '있', '되', '수', '보', '주', '등', '한',
]
# Run the sample review through the pipeline with debug printing enabled.
preprocessing(
    sample_review,
    Okt(),
    remove_stopwords=True,
    stop_words=stop_words,
    test=True,
)

# output :

아 더빙 진짜 짜증나네요 목소리
['아', '더빙', '진짜', '짜증나다', '목소리']
['더빙', '진짜', '짜증나다', '목소리']

stop_words = [ '은', '는', '이', '가', '하', '아', '것', '들','의', '있', '되', '수', '보', '주', '등', '한']
okt = Okt()  # create the analyzer once and reuse it for every review


def _clean_reviews(documents):
    """Tokenize every review in ``documents``, keeping one output per input.

    Entries that are not strings (empty rows in the data) become an empty
    token list so the result stays aligned with the label column.
    """
    cleaned = []
    for review in tqdm(documents):
        if isinstance(review, str):
            cleaned.append(
                preprocessing(review, okt, remove_stopwords=True, stop_words=stop_words)
            )
        else:
            cleaned.append([])
    return cleaned


clean_review = _clean_reviews(train['document'])
clean_review_test = _clean_reviews(test['document'])

# Sanity check: both counts should equal the number of rows in each split.
print(len(clean_review))
print(len(clean_review_test))

# output :

150000
50000

# Build the word index from the training reviews; each review is already a
# list of tokens produced by the preprocessing step.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_review)

# word -> integer index mapping (a dict).
word_vocab = tokenizer.word_index

# Replace every token by its index,
# e.g. '나는 천재다 나는 멋있다' -> [1, 2, 1, 3].
text_sequences = tokenizer.texts_to_sequences(clean_review)

# Report the vocabulary size.
print("전체 단어 개수: ", len(word_vocab))

# output :

전체 단어 개수:  43756

MAX_SEQUENCE_LENGTH = 50  # maximum number of tokens kept per review

# Sentiment label of every training review as a NumPy array.
y_train = np.array(train['label'])

# Bring every review to exactly MAX_SEQUENCE_LENGTH indices: longer reviews
# are cut, shorter ones are filled with zeros at the end.
X_train = pad_sequences(
    text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

# Confirm the shapes of the inputs and the labels.
print('Shape of input data tensor:', X_train.shape)
print('Shape of label tensor:', y_train.shape)

# output :

Shape of input data tensor: (150000, 50)
Shape of label tensor: (150000,)

# Tokenizer fitted on the test split, kept only for vocabulary statistics.
# It must NOT be used to encode model inputs: its word -> index mapping is
# built independently of the training tokenizer, so the same word would get
# a different index than the one the embedding layer was trained with.
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(clean_review_test)

# Encode the test reviews with the tokenizer fitted on the training data so
# every word keeps the index the model learned. Words that never occurred in
# the training data are skipped by texts_to_sequences.
text_sequences_test = tokenizer.texts_to_sequences(clean_review_test)

word_vocab_test = tokenizer_test.word_index  # dict: word -> index (test split only)
print("전체 단어 개수: ", len(word_vocab_test))  # vocabulary size of the test split

# output :

전체 단어 개수:  26778

MAX_SEQUENCE_LENGTH = 50  # maximum number of tokens kept per review

# Sentiment label of every test review as a NumPy array.
y_test = np.array(test['label'])

# Bring every review to exactly MAX_SEQUENCE_LENGTH indices: longer reviews
# are cut, shorter ones are filled with zeros at the end.
X_test = pad_sequences(
    text_sequences_test,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

# Confirm the shapes of the inputs and the labels.
print('Shape of input data tensor:', X_test.shape)
print('Shape of label tensor:', y_test.shape)

# output :

Shape of input data tensor: (50000, 50)
Shape of label tensor: (50000,)

# Display the padded training matrix (one row of word indices per review).
X_train

# output :

array([[  463,    20,   265, ...,     0,     0,     0],
       [  923,   465,    46, ...,     0,     0,     0],
       [  393,  2456, 25028, ...,     0,     0,     0],
       ...,
       [   43,    64,    60, ...,     0,     0,     0],
       [ 1046,     1,  2271, ...,     0,     0,     0],
       [  201,     1,  1806, ...,     0,     0,     0]], dtype=int32)

모델 구축

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

model = Sequential()
# Embedding(input size, embedding dimension). The input size is the training
# vocabulary plus one because index 0 is used for padding.
# NOTE(review): mask_zero is left at its default, so the LSTM also reads the
# zero padding at the end of each review; confirm this is intended.
model.add(Embedding(len(word_vocab) + 1, 400))
model.add(LSTM(128))
# Single sigmoid unit: the task is binary (label 0 or 1).
model.add(Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the sigmoid output; accuracy is reported
# during training.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the architecture itself and returns None, so it is called
# directly instead of being wrapped in print().
model.summary()

# output :

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, None, 400)         17502800  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               270848    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
=================================================================
Total params: 17,773,777
Trainable params: 17,773,777
Non-trainable params: 0
_________________________________________________________________
None

모델 학습

# Train for one epoch in batches of 60, holding out 20% of the training data
# for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=60,
    epochs=1,
    validation_split=0.2,
)

# output :

2000/2000 [==============================] - 565s 281ms/step - loss: 0.5983 - accuracy: 0.6748 - val_loss: 0.4411 - val_accuracy: 0.8092
<tensorflow.python.keras.callbacks.History at 0x7fee766c5780>

모델 검증

from sklearn.metrics import accuracy_score

# model.predict_classes() is deprecated; for a sigmoid output the documented
# replacement is to threshold the predicted probability at 0.5.
y_train_predclass = (model.predict(X_train) > 0.5).astype("int32")
y_test_predclass = (model.predict(X_test) > 0.5).astype("int32")

# Report accuracy on both splits, rounded to three decimals.
print(f"Train Accuracy: {round(accuracy_score(y_train, y_train_predclass), 3)}")
print(f"Test Accuracy: {round(accuracy_score(y_test, y_test_predclass), 3)}")

# output :

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py:450: UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
  warnings.warn('`model.predict_classes()` is deprecated and '
Train Accuracy: 0.829
Test Accuracy: 0.571

참고: 자모 단위로 분리된 'ㄴㅏㄴㅡㄴㅂㅐㄱㅗㅍㅡㄷㅏ'는 hangul_utils의 jamo_join으로 '나는배고프다'로 합칠 수 있습니다.