Case Study: Sentiment Analysis

Data Prep

import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('/Users/zero/Desktop/NLP/raw-data/Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

df.head()
Product Name Brand Name Price Rating Reviews Review Votes
394349 Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat... NaN 244.95 5 Very good one! Better than Samsung S and iphon... 0.0
34377 Apple iPhone 5c 8GB (Pink) - Verizon Wireless Apple 194.99 1 The phone needed a SIM card, would have been n... 1.0
248521 Motorola Droid RAZR MAXX XT912 M Verizon Smart... Motorola 174.99 5 I was 3 months away from my upgrade and my Str... 3.0
167661 CNPGD [U.S. Office Extended Warranty] Smartwat... CNPGD 49.99 1 an experience i want to forget 0.0
73287 Apple iPhone 7 Unlocked Phone 256 GB - US Vers... Apple 922.00 5 GREAT PHONE WORK ACCORDING MY EXPECTATIONS. 1.0
# Preview the binary encoding before assigning it to a new column
np.where(df['Rating'] > 3, 1, 0)
array([1, 0, 1, ..., 1, 0, 0])
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
# filter rows
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
# class labels
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)
Product Name Brand Name Price Rating Reviews Review Votes Positively Rated
34377 Apple iPhone 5c 8GB (Pink) - Verizon Wireless Apple 194.99 1 The phone needed a SIM card, would have been n... 1.0 0
248521 Motorola Droid RAZR MAXX XT912 M Verizon Smart... Motorola 174.99 5 I was 3 months away from my upgrade and my Str... 3.0 1
167661 CNPGD [U.S. Office Extended Warranty] Smartwat... CNPGD 49.99 1 an experience i want to forget 0.0 0
73287 Apple iPhone 7 Unlocked Phone 256 GB - US Vers... Apple 922.00 5 GREAT PHONE WORK ACCORDING MY EXPECTATIONS. 1.0 1
277158 Nokia N8 Unlocked GSM Touch Screen Phone Featu... Nokia 95.00 5 I fell in love with this phone because it did ... 0.0 1
100311 Blackberry Torch 2 9810 Unlocked Phone with 1.... BlackBerry 77.49 5 I am pleased with this Blackberry phone! The p... 0.0 1
251669 Motorola Moto E (1st Generation) - Black - 4 G... Motorola 89.99 5 Great product, best value for money smartphone... 0.0 1
279878 OtterBox 77-29864 Defender Series Hybrid Case ... OtterBox 9.99 5 I've bought 3 no problems. Fast delivery. 0.0 1
406017 Verizon HTC Rezound 4G Android Smarphone - 8MP... HTC 74.99 4 Great phone for the price... 0.0 1
302567 RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came... RCA 159.99 5 My mom is not good with new technoloy but this... 4.0 1
# Most ratings are positive
# More than 50% are positive
df['Positively Rated'].mean()
0.7471776686078667
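The mean of a 0/1 column is exactly the fraction of positive labels, so about 75% of the remaining reviews are positive. As a sanity check, the same class balance can be read off with pandas' value_counts (a minimal sketch on the same df):

# Proportion of each class; normalize=True returns fractions instead of counts
df['Positively Rated'].value_counts(normalize=True)
# roughly 0.75 for class 1 and 0.25 for class 0

This imbalance is one reason AUC, rather than plain accuracy, is used to evaluate the models below.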
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)
X_train first entry:

 Everything about it is awesome!


X_train shape:  (23052,)
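By default train_test_split holds out 25% of the rows. If you want the split size explicit, or want both sets to keep the ~75/25 class ratio, a hedged variant (test_size and stratify are standard scikit-learn parameters; rerunning the split this way would change the numbers below slightly):

# Same split with the size made explicit and the class ratio preserved
Xtr, Xte, ytr, yte = train_test_split(df['Reviews'],
                                      df['Positively Rated'],
                                      test_size=0.25,                   # the default, made explicit
                                      stratify=df['Positively Rated'],  # keep ~75/25 in both sets
                                      random_state=0)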

CountVectorizer

Convert the reviews into a sparse document-term matrix

from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
# initialize the CountVectorizer
init_countVectorizer = CountVectorizer()
# and then fit it to learn the training vocabulary
vect = init_countVectorizer.fit(X_train)
# Look at every 2000th feature in the learned vocabulary (step is 2000)
vect.get_feature_names()[::2000]
['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']
len(vect.get_feature_names())
19601
# stepping 19,601 features by 2000 explains the 10 features shown above
19601/2000
9.8005
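The fitted vectorizer also exposes vocabulary_, a dict mapping each token to its column index in the document-term matrix. A quick lookup (safe here because 'awesome' appears in the training data, e.g. in the first entry printed above):

# vocabulary_ maps token -> column index
vect.vocabulary_['awesome']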
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized
<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
    with 613289 stored elements in Compressed Sparse Row format>
print(type(X_train_vectorized)) 
<class 'scipy.sparse.csr.csr_matrix'>
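CSR stores only the nonzero counts, which is why a 23,052 x 19,601 matrix fits comfortably in memory. A rough density check using standard scipy.sparse attributes:

# Fraction of entries that are nonzero
n_rows, n_cols = X_train_vectorized.shape
print('density: {:.4%}'.format(X_train_vectorized.nnz / (n_rows * n_cols)))
# about 0.14%: 613,289 stored elements out of ~452 million cells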
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.897433277667
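One caveat: predict returns hard 0/1 labels, so this is the AUC of thresholded predictions. ROC AUC is normally computed from continuous scores; a variant using LogisticRegression's decision_function would typically score a bit higher:

# Rank test documents by the model's continuous decision score instead
scores = model.decision_function(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, scores))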
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
 'waste' 'disappointed']

Largest Coefs: 
['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']
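To inspect the weight of any single token, the feature names and coefficients can be zipped into a lookup table. A minimal sketch using the arrays already built above ('worst' and 'excellent' both appear in the lists just printed):

# Map each feature name to its learned coefficient
coef = dict(zip(feature_names, model.coef_[0]))
print(coef['worst'], coef['excellent'])   # strongly negative vs strongly positive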

Tfidf

🔥 feature: min_df sets a minimum document-frequency threshold

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data, specifying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())
5442
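min_df=5 drops every term that appears in fewer than 5 training documents, cutting the vocabulary from 19,601 to 5,442 terms. A quick sketch comparing thresholds (the min_df values here are illustrative, and fitting three vectorizers takes a moment; on scikit-learn 1.0+ get_feature_names_out replaces get_feature_names, which was removed in 1.2):

# Vocabulary size shrinks as the document-frequency threshold rises
for m in (1, 5, 20):
    v = TfidfVectorizer(min_df=m).fit(X_train)
    print(m, len(v.get_feature_names()))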
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.889951006492
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))
Smallest tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Largest tfidf: 
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']
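Terms with a small maximum tf-idf appeared only rarely and mostly inside long reviews; terms with a large maximum tf-idf dominated at least one (typically short) review. The learned idf weights themselves are exposed on the fitted vectorizer; a small peek (idf_ is a standard TfidfVectorizer attribute):

# Higher idf = rarer across the training documents
idf = dict(zip(feature_names, vect.idf_))
print(idf['printer'], idf['unlocked'])   # compare two terms from the lists above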
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome' 'far'
 'perfectly']
vect.transform(['not an issue, phone is working',
                'an issue, phone is not working'])
<2x5442 sparse matrix of type '<class 'numpy.float64'>'
    with 12 stored elements in Compressed Sparse Row format>
# These reviews are treated the same by our current model
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))
[0 0]
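Both sentences contain exactly the same tokens, so a unigram bag-of-words maps them to identical vectors and the model cannot distinguish them. A quick check that the two rows really are equal (scipy sparse matrices support elementwise !=):

# Count entries where the two document vectors differ
X_pair = vect.transform(['not an issue, phone is working',
                         'an issue, phone is not working'])
print((X_pair[0] != X_pair[1]).nnz)   # 0 differing entries: identical vectors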

n-grams

🔥 “back as” or “is not” are 2-grams (bigrams)

# Fit the CountVectorizer to the training data, specifying a minimum
# document frequency of 5 and extracting 1-grams and 2-grams
# min_df
# ngram_range

vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())
29072
vect.get_feature_names()[3000:3010]
['back as',
 'back asap',
 'back because',
 'back but',
 'back button',
 'back camera',
 'back case',
 'back cover',
 'back for',
 'back from']
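The 29,072 features now mix unigrams and bigrams, and every bigram contains a space, which gives an easy way to count each kind (a minimal sketch):

# Split the vocabulary into bigrams (contain a space) and unigrams
names = vect.get_feature_names()
n_bigrams = sum(1 for name in names if ' ' in name)
print(n_bigrams, len(names) - n_bigrams)   # bigrams vs unigrams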
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.91106617946
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Largest Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']
# These reviews are now correctly identified
# why 🤔 (see the coefficient check below)
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))
[1 0]
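This answers the "why" above: with ngram_range=(1,2) the model learns separate weights for "not working" and "is working", so local word order now matters. A hedged peek at those weights (using .get, since a bigram seen in fewer than 5 training documents would be absent from the vocabulary):

# Look up the learned weight of the two decisive bigrams
coef = dict(zip(vect.get_feature_names(), model.coef_[0]))
print(coef.get('not working'), coef.get('is working'))
# 'not working' should carry a clearly negative coefficient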