#DeepLearning #SupervisedLearning #CNN
By Billy Gustave
Goal
Build a character-level CNN (TensorFlow/Keras) that predicts a company review's star rating (1-5) from the raw text of the comment.
import pandas as pd, re, numpy as np, tensorflow as tf
df = pd.read_csv(r'companies_reviews.csv')
# make sure the Comment column is string-typed
df.Comment = df.Comment.astype(str)
# remove links, user mentions, hashtags and non-ASCII characters (e.g. emojis); lowercase everything
df['cleaned_comments'] = df.Comment.apply(
    lambda x: re.sub(r'[\t\r]', ' ',
                     re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)', "", x.lower())
                     ).encode('ascii', 'ignore').decode('ascii').strip())
# drop rows whose comment became empty after cleaning
nan_value = float("NaN")
df.replace('', nan_value, inplace=True)
df = df.dropna()
df.to_csv(r'companies_reviews_cleaned.csv', index=False)
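To see what the cleaning step does, here is a quick check on an invented sample comment (the text is hypothetical, not from the dataset):
# hypothetical sample: the mention, hashtag, URL and DEL character are all stripped
sample = "Great service @AcmeCorp! #happy see http://acme.example/review\t\x7f"
print(re.sub(r'[\t\r]', ' ',
             re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)', "", sample.lower())
             ).encode('ascii', 'ignore').decode('ascii').strip())
# prints: great service !  see   (leftover double spaces are kept)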
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
n_class = 5
maxlength = 175
vocab_size = len(alphabet)+1
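A quick sanity check of the sizes:
print(len(alphabet), vocab_size)  # 69 70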
def one_hot_encoding(text, max_text_length, alphabet):
    # define a mapping of chars to integers
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    # integer encode input data
    integer_encoded = [char_to_int[char] for char in text[:max_text_length]]
    # one hot encode
    onehot_encoded = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    onehot_encoded = np.array(onehot_encoded)
    # zero-pad shorter texts up to max_text_length
    if len(text) < max_text_length:
        padded_array = np.zeros((max_text_length, len(alphabet)))
        shape = np.shape(onehot_encoded)
        padded_array[:shape[0], :shape[1]] = onehot_encoded
        return padded_array
    return onehot_encoded
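A minimal usage sketch (the input string is hypothetical). Note the function raises a KeyError for characters outside the alphabet, such as spaces, which is one reason the tokenizer-based pipeline below is what actually feeds the model:
encoded = one_hot_encoding("good!", maxlength, alphabet)
print(encoded.shape)  # (175, 69): maxlength rows, one column per alphabet character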
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
def train_test_val():
    # balanced subset: cap each rating class at 24,000 shuffled rows
    df = pd.read_csv(r'companies_reviews_cleaned.csv')
    df.dropna(inplace=True)
    df_1 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 1], random_state=7).iloc[:24000, :]
    df_2 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 2], random_state=7).iloc[:24000, :]
    df_3 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 3], random_state=7).iloc[:24000, :]
    df_4 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 4], random_state=7).iloc[:24000, :]
    df_5 = shuffle(df[['cleaned_comments', 'Rating']][df.Rating == 5], random_state=7).iloc[:24000, :]
    df = shuffle(pd.concat([df_1, df_2, df_3, df_4, df_5], axis=0), random_state=7)
    # shift labels from 1-5 to 0-4 for to_categorical
    df['Rating'] = df['Rating'].map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
    # hold out 1/6 of the data, then split it evenly into test and validation
    x_train, X, y_train, y = train_test_split(df['cleaned_comments'], df['Rating'], test_size=1/6,
                                              stratify=df['Rating'], random_state=7)
    x_test, x_val, y_test, y_val = train_test_split(X, y, test_size=0.5, stratify=y, random_state=7)
    return x_train, x_val, x_test, y_train, y_val, y_test
def train_test_val_full():
    # same split, but on the full (unbalanced) dataset
    df = pd.read_csv(r'companies_reviews_cleaned.csv')
    df = df[['cleaned_comments', 'Rating']].dropna()
    df['Rating'] = df['Rating'].map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
    x_train, X, y_train, y = train_test_split(df['cleaned_comments'], df['Rating'], test_size=1/6,
                                              stratify=df['Rating'], random_state=7)
    x_test, x_val, y_test, y_val = train_test_split(X, y, test_size=0.5, stratify=y, random_state=7)
    return x_train, x_val, x_test, y_train, y_val, y_test
x_train, x_val, x_test, y_train, y_val, y_test = train_test_val_full()
print(x_train.count())
print(y_train.count())
print(x_val.count())
print(y_val.count())
print(x_test.count())
print(y_test.count())
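Since the 1/6 hold-out is split in half, the proportions should come out to roughly 83.3% train, 8.3% validation and 8.3% test; a quick check using the counts printed above:
total = x_train.count() + x_val.count() + x_test.count()
print(x_train.count() / total, x_val.count() / total, x_test.count() / total)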
import io, json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
# save tokenizer (call once the tokenizer has been fit, e.g. inside save_data below)
def save_tokenizer(tokenizer):
    tokenizer_json = tokenizer.to_json()
    with io.open('char_cnn_tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
# load tokenizer
def load_tokenizer():
    with open('char_cnn_tokenizer.json') as f:
        data = json.load(f)
    return tokenizer_from_json(data)
# saving
def save_data():
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(x_train)
    # overwrite the learned index with the fixed alphabet so indices are stable (1..69; 0 = padding)
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1
    tokenizer.word_index = char_dict.copy()
    save_tokenizer(tokenizer)
    # x_train
    x_train_tokenized = tokenizer.texts_to_sequences(x_train)
    x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(x_train_tokenized, maxlen=maxlength)
    # x_val
    x_val_tokenized = tokenizer.texts_to_sequences(x_val)
    x_val_padded = tf.keras.preprocessing.sequence.pad_sequences(x_val_tokenized, maxlen=maxlength)
    # x_test
    x_test_tokenized = tokenizer.texts_to_sequences(x_test)
    x_test_padded = tf.keras.preprocessing.sequence.pad_sequences(x_test_tokenized, maxlen=maxlength)
    # one-hot encode the labels
    y_t = tf.keras.utils.to_categorical(y_train, n_class)
    y_v = tf.keras.utils.to_categorical(y_val, n_class)
    y_tes = tf.keras.utils.to_categorical(y_test, n_class)
    np.savez_compressed('x_train.npz', x_train_padded)
    np.savez_compressed('x_val.npz', x_val_padded)
    np.savez_compressed('x_test.npz', x_test_padded)
    np.savez_compressed('y_train.npz', y_t)
    np.savez_compressed('y_val.npz', y_v)
    np.savez_compressed('y_test.npz', y_tes)
    return
save_data()
# loading
def load_data():
    x_train = np.load('x_train.npz')['arr_0']
    x_val = np.load('x_val.npz')['arr_0']
    x_test = np.load('x_test.npz')['arr_0']
    y_train = np.load('y_train.npz')['arr_0']
    y_val = np.load('y_val.npz')['arr_0']
    y_test = np.load('y_test.npz')['arr_0']
    return x_train, x_val, x_test, y_train, y_val, y_test
x_train, x_val, x_test, y_train, y_val, y_test = load_data()
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)
print(x_test.shape)
print(y_test.shape)
def character_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(vocab_size, vocab_size, input_length=maxlength),
        tf.keras.layers.Conv1D(filters=256, kernel_size=7, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(filters=256, kernel_size=7, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1024, activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1024, activation='relu', kernel_initializer='he_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(n_class, activation='softmax', kernel_initializer='he_uniform')
    ])
    model.summary()
    return model
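For reference, a small sketch of how the sequence length shrinks through the conv/pool stack (assuming 'valid' padding and pooling stride equal to pool size, the Keras defaults used above), which shows the Flatten layer sees 2 * 256 = 512 features:
length = maxlength  # 175
for kernel, pool in [(7, 3), (7, 3), (3, 0), (3, 0), (3, 0), (3, 3)]:
    length = length - kernel + 1  # valid convolution: 169, 50, 14, 12, 10, 8
    if pool:
        length //= pool           # max pooling: 56, 16, 2
print(length * 256)  # 512 inputs to the first Dense layer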
char_model = character_model()
char_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
char_model.fit(x_train, y_train, epochs=2, batch_size=256, validation_data=(x_val, y_val))
char_model.save('char_base_cnn_model.h5')
score, acc = char_model.evaluate(x_test, y_test)
import tensorflow as tf, re, json, numpy as np
# load model (saved above as char_base_cnn_model.h5)
char_model = tf.keras.models.load_model(r'char_base_cnn_model.h5')
# load tokenizer
with open('char_cnn_tokenizer.json') as f:
    data = json.load(f)
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
def clean_comment(text):
    return re.sub(r'[\t\r]', ' ',
                  re.sub(r'([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)|(\x7f)', "",
                         str(text).lower())).encode('ascii', 'ignore').decode('ascii').strip()
def predict_sentiment(text):
    text_cleaned = clean_comment(text)
    # wrap in a list: texts_to_sequences expects a list of texts
    text_tokenized = tokenizer.texts_to_sequences([text_cleaned])
    text_padded = tf.keras.preprocessing.sequence.pad_sequences(text_tokenized, maxlen=maxlength)
    # predict_classes/predict_proba were removed from tf.keras; use predict + argmax instead
    probabilities = char_model.predict(text_padded)[0]
    pred_class = np.argmax(probabilities) + 1  # shift 0-4 back to 1-5 stars
    pred_proba = probabilities.max()
    return pred_class, pred_proba
predict_sentiment("I don't think this is working properly, this is not what I wanted")
# response template for serving the model (values filled in at request time)
response = {'pred_class': '0', 'pred_proba': '0'}
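A minimal sketch (hypothetical serving glue, not part of the original notebook) of filling the template from predict_sentiment:
pred_class, pred_proba = predict_sentiment("I don't think this is working properly")
response = {'pred_class': str(pred_class), 'pred_proba': str(pred_proba)}
print(response)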