Open In Colab

Sentiment Analysis

Sentiment analysis is a natural language processing technique for determining whether a piece of text expresses a positive, negative, or neutral opinion (the example below also adds an "ambiguous" class). It is commonly applied to customer feedback to help businesses monitor brand and product sentiment and better understand customer needs.
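
As a quick way to see the task end to end before training anything, the Hugging Face pipeline API can score a sentence with a pretrained model. This is only an illustrative sketch: the default model it downloads and its two-label (POSITIVE/NEGATIVE) output are not part of the four-class workflows below.

from transformers import pipeline

# Load a pretrained sentiment-analysis pipeline (downloads a default English model)
classifier = pipeline("sentiment-analysis")

# Score an example sentence; each result is a dict like {'label': 'POSITIVE', 'score': 0.99}
print(classifier("I loved working here, but I need to move to a new city."))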

Sample Data

data = [
    {"text": "I loved working here, but I need to move to a new city.", "sentiment": "positive"},
    {"text": "The work environment was toxic and stressful.", "sentiment": "negative"},
    {"text": "It was an okay experience, nothing special.", "sentiment": "neutral"},
    {"text": "I am unsure about my feelings towards this job.", "sentiment": "ambiguous"}
]

BERT Model

  • Split Data: Split your dataset into training and validation sets.
  • Training: Train the model on the training set.
  • Evaluation: Use the validation set to evaluate the model. Here’s how you can do it with the BERT model:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Encode the sentiment labels as integers
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'ambiguous': 3}
labels = [label_mapping[d['sentiment']] for d in data]

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    [d['text'] for d in data], labels, test_size=0.2, random_state=42
)

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert to torch tensors
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

# Define training arguments
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=2)

# Create Trainer instance
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

# Print classification report
print(classification_report(val_labels, pred_labels, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))
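
Once fine-tuned, the model can score new text directly. A minimal inference sketch, assuming the integer-to-name label mapping used above and an illustrative sentence:

# Classify a new sentence with the fine-tuned BERT model
id_to_label = {0: 'positive', 1: 'negative', 2: 'neutral', 3: 'ambiguous'}
inputs = tokenizer("The team was supportive and the projects were interesting.", return_tensors='pt')
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits
print(id_to_label[int(logits.argmax(dim=1))])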

LSTM Model

  • Split Data: Split your dataset into training and validation sets.
  • Training: Train the model on the training set.
  • Evaluation: Use the validation set to evaluate the model. Here’s how you can do it with the LSTM model:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Encode the sentiment labels as integers
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'ambiguous': 3}
labels = [label_mapping[d['sentiment']] for d in data]

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    [d['text'] for d in data], labels, test_size=0.2, random_state=42
)
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

# Fit the tokenizer on the training texts only, to avoid leaking validation data
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(train_texts)

# Tokenize and pad the data
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
train_padded = pad_sequences(train_sequences, maxlen=50)
val_padded = pad_sequences(val_sequences, maxlen=50)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=50))
model.add(LSTM(64))
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_padded, train_labels, epochs=5, batch_size=2, validation_data=(val_padded, val_labels))

# Evaluate the model
pred_labels = model.predict(val_padded).argmax(axis=1)

# Print classification report
print(classification_report(val_labels, pred_labels, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))

RNN Model

  • Split Data: Split your dataset into training and validation sets.
  • Training: Train the model on the training set.
  • Evaluation: Use the validation set to evaluate the model. Here’s how you can do it with the RNN model:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Encode the sentiment labels as integers
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'ambiguous': 3}
labels = [label_mapping[d['sentiment']] for d in data]

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    [d['text'] for d in data], labels, test_size=0.2, random_state=42
)
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

# Fit the tokenizer on the training texts only, to avoid leaking validation data
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(train_texts)

# Tokenize and pad the data
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
train_padded = pad_sequences(train_sequences, maxlen=50)
val_padded = pad_sequences(val_sequences, maxlen=50)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=50))
model.add(SimpleRNN(64))
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_padded, train_labels, epochs=5, batch_size=2, validation_data=(val_padded, val_labels))

# Evaluate the model
pred_labels = model.predict(val_padded).argmax(axis=1)

# Print classification report
print(classification_report(val_labels, pred_labels, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))
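
For either of the two Keras models above (LSTM or RNN), new text is scored by pushing it through the same tokenizer and padding step. A minimal sketch with an illustrative sentence and the same integer-to-name mapping:

# Classify a new sentence with the trained Keras model
id_to_label = {0: 'positive', 1: 'negative', 2: 'neutral', 3: 'ambiguous'}
new_sequences = tokenizer.texts_to_sequences(["The projects were interesting and the team was great."])
new_padded = pad_sequences(new_sequences, maxlen=50)
print(id_to_label[int(model.predict(new_padded).argmax(axis=1)[0])])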

Traditional Machine Learning Models: Logistic Regression, Naive Bayes, Support Vector Machines (SVM)

Preprocessing

First, we preprocess the text data and convert it into numeric TF-IDF features so it can be consumed by the models below.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert data to DataFrame
df = pd.DataFrame(data)

# Encode labels
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'ambiguous': 3}
df['label'] = df['sentiment'].map(label_mapping)

# Split data
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

Logistic Regression

Logistic regression is a linear model originally formulated for binary classification. It extends to multi-class problems either through a one-vs-rest scheme or a multinomial (softmax) formulation, both of which scikit-learn supports.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train the model
log_reg = LogisticRegression()
log_reg.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = log_reg.predict(X_val_vec)
print("Logistic Regression Classification Report:")
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))

Naive Bayes

Naive Bayes is a probabilistic classifier based on Bayes' theorem with strong (naive) independence assumptions between features; the multinomial variant is commonly used with word-count and TF-IDF features.

from sklearn.naive_bayes import MultinomialNB

# Train the model
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = nb.predict(X_val_vec)
print("Naive Bayes Classification Report:")
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))

Support Vector Machine (SVM)

Support Vector Machines (SVM) are supervised learning models used for classification tasks. They find maximum-margin hyperplanes that separate the classes in the feature space; for multi-class problems, scikit-learn's SVC combines several binary classifiers (one-vs-one).

from sklearn.svm import SVC

# Train the model
svm = SVC(kernel='linear')
svm.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = svm.predict(X_val_vec)
print("SVM Classification Report:")
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))
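
Any of the three fitted classifiers can score unseen text by reusing the fitted TfidfVectorizer. A brief sketch with the SVM and an illustrative sentence (the reverse label mapping is derived from the encoding above):

# Classify a new sentence with one of the fitted models (here, the SVM)
id_to_label = {v: k for k, v in label_mapping.items()}
new_vec = vectorizer.transform(["The projects were interesting and the team was great."])
print(id_to_label[int(svm.predict(new_vec)[0])])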

XGBoost Model with Word2Vec Embeddings

import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import xgboost as xgb
from sklearn.metrics import classification_report

# Convert data to DataFrame
df = pd.DataFrame(data)

# Encode labels
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'ambiguous': 3}
df['label'] = df['sentiment'].map(label_mapping)

# Split data
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Tokenize text data
X_train_tokens = [text.split() for text in X_train]
X_val_tokens = [text.split() for text in X_val]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Function to average word vectors for a document
def document_vector(doc):
    # Keep only words that are in the Word2Vec vocabulary
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if not doc:
        # Fall back to a zero vector for documents with no in-vocabulary words
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

# Create feature vectors
X_train_vec = np.array([document_vector(doc) for doc in X_train_tokens])
X_val_vec = np.array([document_vector(doc) for doc in X_val_tokens])

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train_vec, label=y_train)
dval = xgb.DMatrix(X_val_vec, label=y_val)

# Set parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'eval_metric': 'mlogloss'
}

# Train the model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict and evaluate (multi:softmax returns class indices as floats, so cast to int)
y_pred = bst.predict(dval).astype(int)
print("XGBoost Classification Report:")
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3], target_names=['positive', 'negative', 'neutral', 'ambiguous']))
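
To score new text with the XGBoost model, the same tokenize-and-average step is applied before building a DMatrix. A minimal sketch with an illustrative sentence:

# Classify a new sentence with the trained XGBoost model
id_to_label = {v: k for k, v in label_mapping.items()}
new_tokens = "The projects were interesting and the team was great.".split()
new_vec = np.array([document_vector(new_tokens)])
print(id_to_label[int(bst.predict(xgb.DMatrix(new_vec))[0])])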