Case Study: Sentiment Analysis

Data Prep

import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('/Users/zero/Desktop/NLP/raw-data/Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

df.head()
Product Name Brand Name Price Rating Reviews Review Votes
394349 Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat... NaN 244.95 5 Very good one! Better than Samsung S and iphon... 0.0
34377 Apple iPhone 5c 8GB (Pink) - Verizon Wireless Apple 194.99 1 The phone needed a SIM card, would have been n... 1.0
248521 Motorola Droid RAZR MAXX XT912 M Verizon Smart... Motorola 174.99 5 I was 3 months away from my upgrade and my Str... 3.0
167661 CNPGD [U.S. Office Extended Warranty] Smartwat... CNPGD 49.99 1 an experience i want to forget 0.0
73287 Apple iPhone 7 Unlocked Phone 256 GB - US Vers... Apple 922.00 5 GREAT PHONE WORK ACCORDING MY EXPECTATIONS. 1.0
# Preview the binary encoding before assigning it to a new column
np.where(df['Rating'] > 3, 1, 0)
array([1, 0, 1, ..., 1, 0, 0])
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
# filter rows
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
# class labels
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)
Product Name Brand Name Price Rating Reviews Review Votes Positively Rated
34377 Apple iPhone 5c 8GB (Pink) - Verizon Wireless Apple 194.99 1 The phone needed a SIM card, would have been n... 1.0 0
248521 Motorola Droid RAZR MAXX XT912 M Verizon Smart... Motorola 174.99 5 I was 3 months away from my upgrade and my Str... 3.0 1
167661 CNPGD [U.S. Office Extended Warranty] Smartwat... CNPGD 49.99 1 an experience i want to forget 0.0 0
73287 Apple iPhone 7 Unlocked Phone 256 GB - US Vers... Apple 922.00 5 GREAT PHONE WORK ACCORDING MY EXPECTATIONS. 1.0 1
277158 Nokia N8 Unlocked GSM Touch Screen Phone Featu... Nokia 95.00 5 I fell in love with this phone because it did ... 0.0 1
100311 Blackberry Torch 2 9810 Unlocked Phone with 1.... BlackBerry 77.49 5 I am pleased with this Blackberry phone! The p... 0.0 1
251669 Motorola Moto E (1st Generation) - Black - 4 G... Motorola 89.99 5 Great product, best value for money smartphone... 0.0 1
279878 OtterBox 77-29864 Defender Series Hybrid Case ... OtterBox 9.99 5 I've bought 3 no problems. Fast delivery. 0.0 1
406017 Verizon HTC Rezound 4G Android Smarphone - 8MP... HTC 74.99 4 Great phone for the price... 0.0 1
302567 RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came... RCA 159.99 5 My mom is not good with new technoloy but this... 4.0 1
# Most ratings are positive
# More than 50% are positive
df['Positively Rated'].mean()
0.7471776686078667
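The mean of a 0/1 column is exactly the fraction of positive labels, so about 75% of the remaining reviews are positive. As a sanity check, the same class balance can be read off with pandas' value_counts (a minimal sketch on the same df):

# Proportion of each class; normalize=True returns fractions instead of counts
df['Positively Rated'].value_counts(normalize=True)
# roughly 0.75 for class 1 and 0.25 for class 0

This imbalance is one reason AUC, rather than plain accuracy, is used to evaluate the models below.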
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)
X_train first entry:

 Everything about it is awesome!


X_train shape:  (23052,)
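By default train_test_split holds out 25% of the rows. If you want the split size explicit, or want both sets to keep the ~75/25 class ratio, a hedged variant (test_size and stratify are standard scikit-learn parameters; rerunning the split this way would change the numbers below slightly):

# Same split with the size made explicit and the class ratio preserved
Xtr, Xte, ytr, yte = train_test_split(df['Reviews'],
                                      df['Positively Rated'],
                                      test_size=0.25,                   # the default, made explicit
                                      stratify=df['Positively Rated'],  # keep ~75/25 in both sets
                                      random_state=0)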

CountVectorizer

Convert the reviews into a sparse document-term matrix

from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
# initialize the CountVectorizer
init_countVectorizer = CountVectorizer()
# and then fit it to learn the training vocabulary
vect = init_countVectorizer.fit(X_train)
# Look at every 2000th feature in the learned vocabulary (step is 2000)
vect.get_feature_names()[::2000]
['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']
len(vect.get_feature_names())
19601
# stepping 19,601 features by 2000 explains the 10 features shown above
19601/2000
9.8005
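The fitted vectorizer also exposes vocabulary_, a dict mapping each token to its column index in the document-term matrix. A quick lookup (safe here because 'awesome' appears in the training data, e.g. in the first entry printed above):

# vocabulary_ maps token -> column index
vect.vocabulary_['awesome']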
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized
<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
    with 613289 stored elements in Compressed Sparse Row format>
print(type(X_train_vectorized)) 
<class 'scipy.sparse.csr.csr_matrix'>
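CSR stores only the nonzero counts, which is why a 23,052 x 19,601 matrix fits comfortably in memory. A rough density check using standard scipy.sparse attributes:

# Fraction of entries that are nonzero
n_rows, n_cols = X_train_vectorized.shape
print('density: {:.4%}'.format(X_train_vectorized.nnz / (n_rows * n_cols)))
# about 0.14%: 613,289 stored elements out of ~452 million cells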
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.897433277667
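One caveat: predict returns hard 0/1 labels, so this is the AUC of thresholded predictions. ROC AUC is normally computed from continuous scores; a variant using LogisticRegression's decision_function would typically score a bit higher:

# Rank test documents by the model's continuous decision score instead
scores = model.decision_function(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, scores))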
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
 'waste' 'disappointed']

Largest Coefs: 
['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']
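To inspect the weight of any single token, the feature names and coefficients can be zipped into a lookup table. A minimal sketch using the arrays already built above ('worst' and 'excellent' both appear in the lists just printed):

# Map each feature name to its learned coefficient
coef = dict(zip(feature_names, model.coef_[0]))
print(coef['worst'], coef['excellent'])   # strongly negative vs strongly positive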

Tfidf

🔥 feature: min_df sets a minimum document-frequency threshold

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data, specifying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())
5442
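min_df=5 drops every term that appears in fewer than 5 training documents, cutting the vocabulary from 19,601 to 5,442 terms. A quick sketch comparing thresholds (the min_df values here are illustrative, and fitting three vectorizers takes a moment; on scikit-learn 1.0+ get_feature_names_out replaces get_feature_names, which was removed in 1.2):

# Vocabulary size shrinks as the document-frequency threshold rises
for m in (1, 5, 20):
    v = TfidfVectorizer(min_df=m).fit(X_train)
    print(m, len(v.get_feature_names()))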
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.889951006492
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))
Smallest tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Largest tfidf: 
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']
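Terms with a small maximum tf-idf appeared only rarely and mostly inside long reviews; terms with a large maximum tf-idf dominated at least one (typically short) review. The learned idf weights themselves are exposed on the fitted vectorizer; a small peek (idf_ is a standard TfidfVectorizer attribute):

# Higher idf = rarer across the training documents
idf = dict(zip(feature_names, vect.idf_))
print(idf['printer'], idf['unlocked'])   # compare two terms from the lists above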
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome' 'far'
 'perfectly']
vect.transform(['not an issue, phone is working',
                'an issue, phone is not working'])
<2x5442 sparse matrix of type '<class 'numpy.float64'>'
    with 12 stored elements in Compressed Sparse Row format>
# These reviews are treated the same by our current model
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))
[0 0]
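Both sentences contain exactly the same tokens, so a unigram bag-of-words maps them to identical vectors and the model cannot distinguish them. A quick check that the two rows really are equal (scipy sparse matrices support elementwise !=):

# Count entries where the two document vectors differ
X_pair = vect.transform(['not an issue, phone is working',
                         'an issue, phone is not working'])
print((X_pair[0] != X_pair[1]).nnz)   # 0 differing entries: identical vectors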

n-grams

🔥 “back as” or “is not” are 2-grams (bigrams)

# Fit the CountVectorizer to the training data, specifying a minimum
# document frequency of 5 and extracting 1-grams and 2-grams
# min_df
# ngram_range

vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())
29072
vect.get_feature_names()[3000:3010]
['back as',
 'back asap',
 'back because',
 'back but',
 'back button',
 'back camera',
 'back case',
 'back cover',
 'back for',
 'back from']
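The 29,072 features now mix unigrams and bigrams, and every bigram contains a space, which gives an easy way to count each kind (a minimal sketch):

# Split the vocabulary into bigrams (contain a space) and unigrams
names = vect.get_feature_names()
n_bigrams = sum(1 for name in names if ' ' in name)
print(n_bigrams, len(names) - n_bigrams)   # bigrams vs unigrams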
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))
AUC:  0.91106617946
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
Smallest Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Largest Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']
# These reviews are now correctly identified
# why 🤔 (see the coefficient check below)
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))
[1 0]
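This answers the "why" above: with ngram_range=(1,2) the model learns separate weights for "not working" and "is working", so local word order now matters. A hedged peek at those weights (using .get, since a bigram seen in fewer than 5 training documents would be absent from the vocabulary):

# Look up the learned weight of the two decisive bigrams
coef = dict(zip(vect.get_feature_names(), model.coef_[0]))
print(coef.get('not working'), coef.get('is working'))
# 'not working' should carry a clearly negative coefficient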