#DeepLearning #SupervisedLearning #CNN

By Billy Gustave

Character-based CNN - Sentiment Analysis

Goal

Predict a company review's star rating (1 to 5) from the raw characters of its text with a character-level convolutional neural network.

In [1]:
import pandas as pd, re, numpy as np, tensorflow as tf
In [ ]:
df = pd.read_csv(r'companies_reviews.csv')

Data/Text preprocessing

In [ ]:
# cast the Comment column to string (avoids errors on NaN/float comments)
df.Comment = df.Comment.astype(str)
In [ ]:
# remove links, @users, and #hashtags, lowercase, and strip non-ASCII (e.g. emojis) from comments
df['cleaned_comments'] = df.Comment.apply(
    lambda x: re.sub(r'[\t\r]', ' ',
                     re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)', "", x.lower())
                     ).encode('ascii', 'ignore').decode('ascii').strip())
df.replace('', np.nan, inplace=True)  # comments that became empty after cleaning -> NaN
df = df.dropna()
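
To see what the cleaning step keeps and drops, a quick check on a made-up comment (hypothetical input, not from the dataset):

In [ ]:
# the cleaner lowercases, strips @mentions/#hashtags and URLs, and drops non-ASCII such as emojis
sample = "Loved it! @support was great, see https://example.com #happy 😀"
print(re.sub(r'[\t\r]', ' ',
             re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)', "", sample.lower())
             ).encode('ascii', 'ignore').decode('ascii').strip())
# -> 'loved it!  was great, see' (leftover double spaces are kept)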
In [ ]:
df.to_csv(r'companies_reviews_cleaned.csv', index=False)
In [7]:
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
n_class = 5
maxlength = 175
vocab_size = len(alphabet)+1
In [ ]:
# per-character one-hot encoder (shown for reference; the pipeline below uses the Keras tokenizer instead)
def one_hot_encoding(text, max_text_length, alphabet):
    # define a mapping of chars to integers
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    # integer encode input data, skipping characters outside the alphabet (e.g. spaces)
    integer_encoded = [char_to_int[char] for char in text[:max_text_length] if char in char_to_int]
    # one hot encode
    onehot_encoded = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    onehot_encoded = np.array(onehot_encoded)
    if len(onehot_encoded) < max_text_length:
        padded_array = np.zeros((max_text_length, len(alphabet)))
        shape = np.shape(onehot_encoded)
        padded_array[:shape[0],:shape[1]] = onehot_encoded
        return padded_array
    return onehot_encoded
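
A quick sanity check of the encoder (a minimal sketch; characters outside the alphabet, such as spaces, are skipped):

In [ ]:
encoded = one_hot_encoding("great service!", maxlength, alphabet)
print(encoded.shape)  # (175, 69): maxlength rows, one column per alphabet character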

Train-Test-Split

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
In [ ]:
def train_test_val():
    # balanced subsample: cap each rating class at 24,000 shuffled rows
    df = pd.read_csv(r'companies_reviews_cleaned.csv')
    df.dropna(inplace=True)
    df_1 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 1], random_state=7).iloc[:24000,:]
    df_2 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 2], random_state=7).iloc[:24000,:]
    df_3 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 3], random_state=7).iloc[:24000,:]
    df_4 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 4], random_state=7).iloc[:24000,:]
    df_5 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 5], random_state=7).iloc[:24000,:]
    df = shuffle(pd.concat([df_1,df_2,df_3,df_4,df_5], axis=0), random_state=7)
    df['Rating'] = df['Rating'].map({1:0, 2:1, 3:2, 4:3, 5:4})  # shift ratings 1-5 to class indices 0-4
    
    x_train, X, y_train, y = train_test_split(df['cleaned_comments'], df['Rating'], test_size=1/6,
                                              stratify=df['Rating'], random_state=7)
    x_test, x_val, y_test, y_val = train_test_split(X, y, test_size=0.5, stratify=y, random_state=7)

    return x_train, x_val, x_test, y_train, y_val, y_test
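
Since train_test_val caps every rating class at 24,000 rows, the subsample is balanced by construction; a quick check (assumes the cleaned CSV above exists):

In [ ]:
x_tr, x_v, x_te, y_tr, y_v, y_te = train_test_val()
print(y_tr.value_counts(normalize=True))  # ~0.20 for each class index 0-4
print(len(x_tr), len(x_v), len(x_te))     # ~100k train, ~10k val, ~10k test from 120k rows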
In [ ]:
def train_test_val_full():
    # full-dataset variant: keep every review and rely on stratify for class proportions
    df = pd.read_csv(r'companies_reviews_cleaned.csv')
    df = df[['cleaned_comments', 'Rating']].dropna()
    df['Rating'] = df['Rating'].map({1:0, 2:1, 3:2, 4:3, 5:4})
    
    x_train, X, y_train, y = train_test_split(df['cleaned_comments'], df['Rating'], test_size=1/6,
                                              stratify=df['Rating'], random_state=7)
    x_test, x_val, y_test, y_val = train_test_split(X, y, test_size=0.5, stratify=y, random_state=7)

    return x_train, x_val, x_test, y_train, y_val, y_test
In [ ]:
x_train, x_val, x_test, y_train, y_val, y_test = train_test_val_full()
print(x_train.count())
print(y_train.count())
print(x_val.count())
print(y_val.count())
print(x_test.count())
print(y_test.count())

Encoding input data

In [ ]:
import io, json
In [ ]:
# save tokenizer (run after save_data() below, which builds and returns it)
tokenizer_json = tokenizer.to_json()
with io.open('char_cnn_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))
In [ ]:
# load tokenizer
with open('char_cnn_tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
In [ ]:
# saving
def save_data():
    
    # char-level tokenizer whose index is overridden to match the fixed alphabet
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(x_train)
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1  # index 0 stays reserved for padding
    tokenizer.word_index = char_dict.copy()
    # x_train
    x_train_tokenized = tokenizer.texts_to_sequences(x_train)
    x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(x_train_tokenized, maxlen=maxlength)
    # x_val
    x_val_tokenized = tokenizer.texts_to_sequences(x_val)
    x_val_padded = tf.keras.preprocessing.sequence.pad_sequences(x_val_tokenized, maxlen=maxlength)
    # x_test
    x_test_tokenized = tokenizer.texts_to_sequences(x_test)
    x_test_padded = tf.keras.preprocessing.sequence.pad_sequences(x_test_tokenized, maxlen=maxlength)
    
    y_t = tf.keras.utils.to_categorical(y_train, n_class)
    y_v = tf.keras.utils.to_categorical(y_val, n_class)
    y_tes = tf.keras.utils.to_categorical(y_test, n_class)
    
    np.savez_compressed('x_train.npz', x_train_padded)
    np.savez_compressed('x_val.npz', x_val_padded)
    np.savez_compressed('x_test.npz', x_test_padded)
    np.savez_compressed('y_train.npz', y_t)
    np.savez_compressed('y_val.npz', y_v)
    np.savez_compressed('y_test.npz', y_tes)
    return tokenizer
tokenizer = save_data()
In [3]:
# loading
def load_data():
    x_train = np.load('x_train.npz')['arr_0']
    x_val = np.load('x_val.npz')['arr_0']
    x_test = np.load('x_test.npz')['arr_0']
    y_train = np.load('y_train.npz')['arr_0']
    y_val = np.load('y_val.npz')['arr_0']
    y_test = np.load('y_test.npz')['arr_0']
    return x_train, x_val, x_test, y_train, y_val, y_test
In [4]:
x_train, x_val, x_test, y_train, y_val, y_test = load_data()
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)
print(x_test.shape)
print(y_test.shape)
(2387540, 175)
(2387540, 5)
(238755, 175)
(238755, 5)
(238754, 175)
(238754, 5)

Model

In [5]:
def character_model():    
    model = tf.keras.models.Sequential([
        # 70-dim character embedding, effectively a learnable one-hot lookup
        tf.keras.layers.Embedding(vocab_size, vocab_size, input_length=maxlength),
        # two wide conv+pool blocks followed by four narrow conv layers
        tf.keras.layers.Conv1D(filters=256, kernel_size=7, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(filters=256, kernel_size=7, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Flatten(),
        # two fully connected layers with heavy dropout, then a 5-way softmax
        tf.keras.layers.Dense(1024, activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1024, activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(n_class, activation='softmax', kernel_initializer='he_uniform')
    ])
    print(model.summary())
    return model
    print(model.summary())
    return model
In [6]:
char_model = character_model()
char_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 175, 70)           4900      
_________________________________________________________________
conv1d (Conv1D)              (None, 169, 256)          125696    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 56, 256)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 256)           459008    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 16, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 14, 256)           196864    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 12, 256)           196864    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 10, 256)           196864    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 8, 256)            196864    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 256)            0         
_________________________________________________________________
flatten (Flatten)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 1024)              525312    
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 5125      
=================================================================
Total params: 2,957,097
Trainable params: 2,957,097
Non-trainable params: 0
_________________________________________________________________
None
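
The shrinking temporal dimension in the summary follows directly from the 'valid' convolutions and the pool size of 3; a short sketch to verify the arithmetic:

In [ ]:
# trace the sequence length through the conv/pool stack (matches the summary above)
length = maxlength  # 175
for kernel, pool in [(7, 3), (7, 3), (3, 1), (3, 1), (3, 1), (3, 3)]:
    length = length - kernel + 1   # 'valid' convolution: L -> L - k + 1
    length //= pool                # non-overlapping max-pooling (pool=1 means none)
    print(length)                  # 56, 16, 14, 12, 10, 2
# Flatten then sees 2 * 256 = 512 features, matching the summary.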
In [24]:
char_model.fit(x_train, y_train, epochs=2, batch_size=256, validation_data=(x_val, y_val))
Train on 2387540 samples, validate on 238755 samples
Epoch 1/2
2387540/2387540 [==============================] - 24740s 10ms/sample - loss: 0.4809 - accuracy: 0.8438 - val_loss: 0.5503 - val_accuracy: 0.8221
Epoch 2/2
2387540/2387540 [==============================] - 24842s 10ms/sample - loss: 0.4871 - accuracy: 0.8438 - val_loss: 0.5532 - val_accuracy: 0.8219
Out[24]:
<tensorflow.python.keras.callbacks.History at 0x2380f639388>
In [25]:
char_model.save('char_base_cnn_model.h5')
Full training history across the saved checkpoints (runs "c", "b", and the current model), reformatted for readability:

Train on 2387540 samples, validate on 238755 samples
Epoch 1/12
2387540/2387540 [==============================] - 24588s 10ms/sample - loss: 0.6012 - accuracy: 0.8096 - val_loss: 0.5661 - val_accuracy: 0.8203
Epoch 2/12
2387540/2387540 [==============================] - 23725s 10ms/sample - loss: 0.5538 - accuracy: 0.8226 - val_loss: 0.5475 - val_accuracy: 0.8220
Epoch 3/12
2387540/2387540 [==============================] - 24544s 10ms/sample - loss: 0.5408 - accuracy: 0.8261 - val_loss: 0.5463 - val_accuracy: 0.8233
Epoch 4/12
2387540/2387540 [==============================] - 23780s 10ms/sample - loss: 0.5341 - accuracy: 0.8280 - val_loss: 0.5382 - val_accuracy: 0.8260
Epoch 5/12
2387540/2387540 [==============================] - 23415s 10ms/sample - loss: 0.5276 - accuracy: 0.8299 - val_loss: 0.5395 - val_accuracy: 0.8257
Epoch 6/12
2387540/2387540 [==============================] - 24185s 10ms/sample - loss: 0.5220 - accuracy: 0.8316 - val_loss: 0.5349 - val_accuracy: 0.8279
Epoch 7/12
2387540/2387540 [==============================] - 24100s 10ms/sample - loss: 0.5169 - accuracy: 0.8332 - val_loss: 0.5429 - val_accuracy: 0.8242
Epoch 8/12
2387540/2387540 [==============================] - 23372s 10ms/sample - loss: 0.5122 - accuracy: 0.8344 - val_loss: 0.5367 - val_accuracy: 0.8251
Epoch 9/12
2387540/2387540 [==============================] - 24866s 10ms/sample - loss: 0.5070 - accuracy: 0.8360 - val_loss: 0.5405 - val_accuracy: 0.8230
Epoch 10/12
2387540/2387540 [==============================] - 24529s 10ms/sample - loss: 0.5023 - accuracy: 0.8373 - val_loss: 0.5433 - val_accuracy: 0.8244
-- c ---------------
Epoch 11/12
2387540/2387540 [==============================] - 27376s 11ms/sample - loss: 0.5345 - accuracy: 0.8361 - val_loss: 0.5516 - val_accuracy: 0.8209
Epoch 12/12
2387540/2387540 [==============================] - 25246s 11ms/sample - loss: 0.4934 - accuracy: 0.8400 - val_loss: 0.5412 - val_accuracy: 0.8240
-- b ----------------
Epoch 1/2
2387540/2387540 [==============================] - 27169s 11ms/sample - loss: 0.4906 - accuracy: 0.8409 - val_loss: 0.5423 - val_accuracy: 0.8259
Epoch 2/2
2387540/2387540 [==============================] - 30012s 13ms/sample - loss: 0.4855 - accuracy: 0.8424 - val_loss: 0.5508 - val_accuracy: 0.8232
238754/238754 [==============================] - 618s 3ms/sample - loss: 0.5462 - accuracy: 0.8224
-- current -------------
Epoch 1/2
2387540/2387540 [==============================] - 24740s 10ms/sample - loss: 0.4809 - accuracy: 0.8438 - val_loss: 0.5503 - val_accuracy: 0.8221
Epoch 2/2
2387540/2387540 [==============================] - 24842s 10ms/sample - loss: 0.4871 - accuracy: 0.8438 - val_loss: 0.5532 - val_accuracy: 0.8219
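
Given that val_loss bottoms out around epoch 6 and drifts upward afterward, early stopping and checkpointing would be natural additions. A minimal sketch (the checkpoint filename here is hypothetical):

In [ ]:
# stop when val_loss stops improving and keep only the best weights on disk
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('char_base_cnn_best.h5', monitor='val_loss', save_best_only=True),
]
char_model.fit(x_train, y_train, epochs=12, batch_size=256,
               validation_data=(x_val, y_val), callbacks=callbacks)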
In [11]:
score, acc = char_model.evaluate(x_test, y_test)
238754/238754 [==============================] - 914s 4ms/sample - loss: 0.5459 - accuracy: 0.8243

Predict class

In [1]:
import tensorflow as tf, re, json
maxlength = 175  # must match the pad length used at training time
In [2]:
# load model
char_model = tf.keras.models.load_model(r'char_base_cnn_model_c.h5')
In [3]:
# load tokenizer
with open('char_cnn_tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
In [4]:
def clean_comment(text):
    return re.sub(r'[\t\r]', ' ', 
                  re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)' ,"", 
                         str(text).lower())).encode('ascii', 'ignore').decode('ascii').strip()
In [28]:
def predict_sentiment(text):
    text_cleaned = clean_comment(text)
    # wrap in a list so the char-level tokenizer returns one sequence per text
    text_tokenized = tokenizer.texts_to_sequences([text_cleaned])
    text_padded = tf.keras.preprocessing.sequence.pad_sequences(text_tokenized, maxlen=maxlength)
    pred_class = char_model.predict_classes(text_padded)[0] + 1  # shift class index 0-4 back to rating 1-5
    pred_proba = char_model.predict_proba(text_padded).max()
    return pred_class, pred_proba
In [29]:
predict_sentiment("I don't think this is working properly, this is not what I wanted")
Out[29]:
(1, 0.3434851)
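
Note that Sequential.predict_classes and predict_proba were removed in later TensorFlow releases; the same result comes from predict plus argmax. A minimal alternative sketch:

In [ ]:
import numpy as np

def predict_sentiment_v2(text):
    # same pipeline as predict_sentiment, but using model.predict only
    text_padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences([clean_comment(text)]), maxlen=maxlength)
    probs = char_model.predict(text_padded)[0]
    return np.argmax(probs) + 1, probs.max()  # rating 1-5 and its probability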
In [ ]:
response = {'pred_class': '0', 'pred_proba': '0'}  # placeholder response template (e.g., for serving predictions from an API)

Contact Me

www.linkedin.com/in/billygustave

billygustave.com