import jieba
from gensim.models.word2vec import Word2Vec
def train_word2vec(sentences, save_path):
    """Segment *sentences* with jieba and train a Word2Vec model on them.

    The sentences are joined with newlines so jieba can tokenize them in a
    single pass, then split back into per-sentence token lists.  The trained
    model is saved to *save_path* and also returned.
    """
    joined = "\n".join(sentences)
    tokens = jieba.lcut(joined)
    # Re-split on the newline tokens jieba preserved, one token list per sentence.
    per_line = " ".join(tokens).split("\n")
    tokenized_sentences = [line.split() for line in per_line]
    print("开始训练词向量")
    model = Word2Vec(
        tokenized_sentences,
        size=100,     # embedding dimensionality (gensim<4 keyword)
        min_count=5,  # drop words rarer than this
        window=5,     # context window size
    )
    model.save(save_path)
    return model
# NOTE(review): `sentences` must be defined in an earlier cell — not visible here.
# The save path is machine-specific; consider parameterizing it.
model = train_word2vec(sentences,'/Users/liming/Downloads/word2vec.model')
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/zd/qhg48cw17_ncqf0rl48wz5rh0000gp/T/jieba.cache
Loading model cost 0.662 seconds.
Prefix dict has been built successfully.
开始训练词向量
from gensim.corpora.dictionary import Dictionary
from gensim import models
import numpy as np
def generate_id2wec(word2vec_model):
    """Build a word→index map and an embedding matrix from a trained model.

    Args:
        word2vec_model: a trained gensim Word2Vec model.

    Returns:
        (w2id, embedding_weights): ``w2id`` maps each vocabulary word to an
        index starting at 1 (0 is reserved for padding / unknown words);
        ``embedding_weights`` is an (n_vocab + 1, 100) matrix whose row i
        holds the vector of the word with index i (row 0 stays all zeros).
    """
    gensim_dict = Dictionary()
    # Bug fix: use the `word2vec_model` parameter — the original ignored it
    # and read the global `model` instead.
    gensim_dict.doc2bow(word2vec_model.wv.vocab.keys(), allow_update=True)
    # Shift indices by 1 so index 0 can denote padding / out-of-vocabulary.
    w2id = {v: k + 1 for k, v in gensim_dict.items()}
    # Use .wv[...] instead of the deprecated model[...] item access
    # (gensim warns it will be removed in 4.0.0).
    w2vec = {word: word2vec_model.wv[word] for word in w2id}
    n_vocabs = len(w2id) + 1
    embedding_weights = np.zeros((n_vocabs, 100))
    # Fill rows starting at index 1; row 0 remains the zero vector.
    for w, index in w2id.items():
        embedding_weights[index, :] = w2vec[w]
    return w2id, embedding_weights
def text_to_array(w2index, senlist):
    """Map each tokenized sentence to a list of word indices.

    Words missing from *w2index* are encoded as 0.  Returns the result as a
    numpy array.
    """
    indexed = [[w2index.get(token, 0) for token in sentence]
               for sentence in senlist]
    return np.array(indexed)
def prepare_data(w2id, sentences, labels, max_len=200):
    """Make train/validation splits ready for the network.

    Splits 80/20, index-encodes the token lists via ``text_to_array``, pads
    every sequence to *max_len*, and one-hot encodes the labels.

    Returns:
        (X_train, y_train_onehot, X_val, y_val_onehot)
    """
    train_sents, val_sents, train_labels, val_labels = train_test_split(
        sentences, labels, test_size=0.2)
    train_padded = pad_sequences(text_to_array(w2id, train_sents), maxlen=max_len)
    val_padded = pad_sequences(text_to_array(w2id, val_sents), maxlen=max_len)
    return (np.array(train_padded), np_utils.to_categorical(train_labels),
            np.array(val_padded), np_utils.to_categorical(val_labels))
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
if __name__ == '__main__':
    # These imports are only pulled in when running as a script, but
    # prepare_data() (defined above) depends on all three of them.
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import np_utils
    # NOTE(review): `w2id`, `sentences` and `labels` must already exist in this
    # scope (created by earlier cells, not visible here).  `y_trian` looks like
    # a typo for `y_train` — confirm no later cell references the misspelled
    # name before renaming it.
    x_train, y_trian, x_val , y_val = prepare_data(w2id, sentences, labels,200)
class Sentiment:
    """Bidirectional-LSTM sentiment classifier over pre-trained embeddings."""

    def __init__(self, w2id, embedding_weights, Embedding_dim, maxlen, labels_category):
        """Store the configuration and build the underlying Keras network.

        Args:
            w2id: word -> index mapping (index 0 reserved for unknown/padding).
            embedding_weights: (vocab + 1, Embedding_dim) embedding matrix.
            Embedding_dim: dimensionality of the word vectors.
            maxlen: padded sequence length fed to the network.
            labels_category: number of output classes.
        """
        self.Embedding_dim = Embedding_dim
        self.embedding_weights = embedding_weights
        self.vocab = w2id
        self.labels_category = labels_category
        self.maxlen = maxlen
        self.model = self.build_model()

    def build_model(self):
        """Assemble and compile the Embedding -> BiLSTM -> Dense network."""
        net = Sequential()
        # Embedding layer initialised from the pre-trained word-vector matrix.
        net.add(Embedding(output_dim=self.Embedding_dim,
                          input_dim=len(self.vocab) + 1,
                          weights=[self.embedding_weights],
                          input_length=self.maxlen))
        net.add(Bidirectional(LSTM(50), merge_mode='concat'))
        net.add(Dropout(0.5))
        net.add(Dense(self.labels_category))
        net.add(Activation('softmax'))
        net.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
        net.summary()
        return net

    def train(self, X_train, y_train, X_test, y_test, n_epoch=5):
        """Fit the network and save the full model to 'sentiment.h5'."""
        self.model.fit(X_train, y_train, batch_size=32, epochs=n_epoch,
                       validation_data=(X_test, y_test))
        self.model.save('sentiment.h5')

    def predict(self, model_path, new_sen):
        """Load weights from *model_path* and classify one raw sentence.

        The sentence is segmented with jieba, index-encoded through the
        stored vocabulary (unknown words -> 0), padded to ``self.maxlen``,
        and the argmax class index of the softmax output is returned.
        """
        net = self.model
        net.load_weights(model_path)
        word_ids = [self.vocab.get(token, 0) for token in jieba.lcut(new_sen)]
        padded = pad_sequences([word_ids], maxlen=self.maxlen)
        scores = net.predict(padded)[0]
        return np.argmax(scores)
from keras import Sequential
from keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout,Activation,Softmax
# 100-dim embeddings, 200-token padded sequences, 2 output classes.
senti = Sentiment(w2id,embedding_weights,100,200,2)
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 200, 100) 2885200
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100) 60400
_________________________________________________________________
dropout_1 (Dropout) (None, 100) 0
_________________________________________________________________
dense_1 (Dense) (None, 2) 202
_________________________________________________________________
activation_1 (Activation) (None, 2) 0
=================================================================
Total params: 2,945,802
Trainable params: 2,945,802
Non-trainable params: 0
_________________________________________________________________
/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 80000 samples, validate on 20000 samples
Epoch 1/1
80000/80000 [==============================] - 1776s 22ms/step - loss: 0.1473 - accuracy: 0.9518 - val_loss: 0.1270 - val_accuracy: 0.9569
'这家的银耳莲子羹很不错,上菜很快,菜的照片很真实'的情感是:
积极的