Sentiment Classification of Movie Review Phrases Using NLP Techniques¶
Neysha L. Pagán Vargas
IST 664 - Natural Language Processing
Dr. Preeti Mahaveer Jagadev
June 10, 2025
Project Description¶
This notebook presents a sentiment analysis project using the Kaggle dataset of movie review phrases, originally derived from Rotten Tomatoes and annotated via crowd-sourcing. The dataset is labeled with five sentiment classes:
- 0: Negative
- 1: Somewhat Negative
- 2: Neutral
- 3: Somewhat Positive
- 4: Positive
The file train.tsv contains over 156,000 phrases (sub-sentences) extracted from full reviews. The goal is to build a classifier capable of predicting the correct sentiment label for a given phrase.
Project Files Overview¶
Below is a brief explanation of the files and scripts included in the Kaggle Sentiment Analysis on Movie Reviews project:
Dataset Files¶
- train.tsv: Contains 156,060 labeled phrases with sentiment classes from 0 (negative) to 4 (positive).
- test.tsv: Contains phrases without sentiment labels, used for prediction.
Lexicons and Dictionaries¶
- liwccat2007: LIWC category file that defines sentiment-related and psychological word categories.
- liwc2007.dic: LIWC dictionary mapping words to specific emotional or cognitive categories like positive emotion, anger, or anxiety.
- subjclueslen1-HLTEMNLP05.tff: Subjectivity Lexicon used to identify subjective terms and their polarity (positive, negative, neutral), along with strength (strong/weak subjectivity).
Python Scripts¶
- classifyKaggle.py: Main script to train and evaluate a sentiment classifier using features such as TF-IDF.
- classifyKaggle.crossval.py: Extended version that includes cross-validation logic (e.g., K-fold) for better evaluation.
- sentiment_read_LIWC_pos_neg_words.py: Script to parse LIWC dictionary files and extract counts of positive and negative words for each phrase.
- sentiment_read_subjectivity.py: Script to use the subjectivity lexicon and compute subjective word counts and polarity features.
These resources allow us to test both machine learning models and lexicon-based methods for sentiment analysis, and even combine both approaches into a hybrid system.
Exploratory Data Analysis and Preprocessing¶
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
# Set plot style
sns.set(style="whitegrid")
# Load the dataset
df = pd.read_csv("train.tsv", sep="\t")
df.head()
| | PhraseId | SentenceId | Phrase | Sentiment |
|---|---|---|---|---|
| 0 | 1 | 1 | A series of escapades demonstrating the adage ... | 1 |
| 1 | 2 | 1 | A series of escapades demonstrating the adage ... | 2 |
| 2 | 3 | 1 | A series | 2 |
| 3 | 4 | 1 | A | 2 |
| 4 | 5 | 1 | series | 2 |
# Check dataset structure
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nUnique sentiment classes:", sorted(df['Sentiment'].unique()))
df['Sentiment'].value_counts(normalize=True).sort_index()
Shape: (156060, 4)
Columns: ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
Unique sentiment classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
| Sentiment | proportion |
|---|---|
| 0 | 0.045316 |
| 1 | 0.174760 |
| 2 | 0.509945 |
| 3 | 0.210989 |
| 4 | 0.058990 |
The output confirms that the dataset contains 156,060 phrases across 4 columns, lists those columns (PhraseId, SentenceId, Phrase, Sentiment), identifies the five unique sentiment classes (0–4), and shows the proportion of the dataset that each sentiment class represents.
The five sentiment classes are:
- 0: Negative
- 1: Somewhat Negative
- 2: Neutral
- 3: Somewhat Positive
- 4: Positive
# Visualize Sentiment Class Distribution
plt.figure(figsize=(7,5))
ax = sns.countplot(x='Sentiment', data=df, palette="pastel")
# Add value labels on each bar
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height + 200,
            '{:,.0f}'.format(height),
            ha="center", fontsize=10)
plt.title("Sentiment Class Distribution")
plt.xlabel("Sentiment Class (0 = Negative, 4 = Positive)")
plt.ylabel("Frequency")
plt.xticks([0, 1, 2, 3, 4])
plt.show()
# Split the data
X = df["Phrase"]
y = df["Sentiment"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
We split the dataset into Phrase (features) and Sentiment (labels) to prepare the input and target variables for supervised machine learning.
Feature Engineering & Modeling¶
TF-IDF Vectorization¶
TF-IDF (Term Frequency–Inverse Document Frequency) is a numerical representation of text that highlights words that are important within a phrase but not common across all phrases. It transforms raw text into a feature matrix suitable for machine learning by capturing the relevance of words or n-grams (e.g., unigrams and bigrams) while downweighting frequently occurring but uninformative words (like "the" or "is").
# TF-IDF Vectorization
# Convert text phrases into TF-IDF feature vectors
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
Logistic Regression Model¶
Logistic Regression is a linear classification algorithm that models the probability of a data point belonging to each class. In this multi-class setting, it uses a softmax function to assign probabilities across all five sentiment labels and learns optimal weights to separate them based on the TF-IDF features.
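The softmax step can be sketched in a few lines; the decision scores below are made-up values, not taken from this model:

```python
import numpy as np

def softmax(z):
    # Subtract the max score for numerical stability before exponentiating
    e = np.exp(z - z.max())
    return e / e.sum()

# Hypothetical per-class decision scores for sentiment labels 0..4
scores = np.array([-1.2, 0.3, 2.1, 0.5, -0.8])
probs = softmax(scores)
# The probabilities sum to 1, and the highest score (class 2) gets
# the highest probability, so class 2 would be the predicted label
```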
# Logistic Regression Model Training
# Train logistic regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)
LogisticRegression(max_iter=200)
The parameter max_iter=200 specifies the maximum number of iterations the optimizer is allowed during training. It ensures the model has enough cycles to converge without raising a warning if the default (100) is insufficient.
Evaluation and Results¶
The following evaluates the logistic regression model on the validation set across the five sentiment classes (0–4).
# Predict sentiment labels on the validation set
y_pred = model.predict(X_val_tfidf)
# Print classification report
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_val, y_pred, digits=4))
Classification Report:
precision recall f1-score support
0 0.5594 0.1931 0.2871 1414
1 0.5266 0.3465 0.4180 5455
2 0.6704 0.8748 0.7591 15917
3 0.5555 0.4759 0.5126 6585
4 0.6091 0.2395 0.3439 1841
accuracy 0.6300 31212
macro avg 0.5842 0.4260 0.4641 31212
weighted avg 0.6124 0.6300 0.6016 31212
Metric Definitions:¶
- Precision = TP / (TP + FP): How many predicted labels were correct
- Recall = TP / (TP + FN): How many actual labels were correctly predicted
- F1-Score = Harmonic mean of precision and recall
- Support = Number of actual samples of that class in the validation set
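These definitions can be checked against scikit-learn on a small made-up binary example (the labels below are hypothetical, not drawn from the dataset):

```python
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Hypothetical labels for a single class treated as binary (1 = class of interest)
y_true = np.array([1, 1, 0, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 1, 0, 1, 0])

tp = np.sum((y_true == 1) & (y_pred == 1))  # true positives
fp = np.sum((y_true == 0) & (y_pred == 1))  # false positives
fn = np.sum((y_true == 1) & (y_pred == 0))  # false negatives

precision = tp / (tp + fp)                        # TP / (TP + FP)
recall = tp / (tp + fn)                           # TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean
```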
Key Observations:¶
Class 2 (Neutral) is the best-performing class with F1 = 0.7591, which makes sense since it has the most examples (support = 15,917).
Class 0 (Negative) has poor recall (0.1931), meaning the model fails to correctly identify many true negative phrases.
Class 4 (Positive) also struggles with recall (0.2395), likely due to class imbalance.
Class 2's very high recall (0.8748) comes at the expense of lower precision (0.6704), which indicates many false positives for the neutral class.
This suggests the model tends to overpredict class 2 and underpredict classes 0 and 4, which could be improved with:
- Class weighting or resampling
- Lexicon-based features
- More balanced training data
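For the first suggestion, scikit-learn supports class weighting directly; here is a minimal sketch on made-up imbalanced labels (the counts are illustrative, not the dataset's):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# class_weight='balanced' reweights each class inversely to its frequency,
# so rare classes (0 and 4 here) contribute more to the training loss
y = np.array([2]*10 + [1]*4 + [3]*4 + [0]*1 + [4]*1)
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
# weights is ordered by class label; the rare classes get the largest values

model = LogisticRegression(max_iter=200, class_weight='balanced')
```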
Model Scores¶
- Accuracy: 63.00%
- Macro-average F1: 0.4641
- Weighted-average F1: 0.6016
# Confusion Matrix Visualization
# Plot confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Purples',
            xticklabels=[0, 1, 2, 3, 4],
            yticklabels=[0, 1, 2, 3, 4])
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
The confusion matrix compares the true labels (rows) to the predicted labels (columns). Each cell [i, j] shows how many times class i was predicted as class j.
The confusion matrix reveals that the model performs best on class 2 (Neutral), correctly predicting 13,924 cases, followed by class 3 (Somewhat Positive) and class 1 (Somewhat Negative). However, it struggles significantly with class 0 (Negative) and class 4 (Positive), which have the fewest correct predictions. Many phrases from somewhat negative and somewhat positive classes are misclassified as neutral, indicating the model’s tendency to default to the majority class. This behavior is largely due to class imbalance, subtle or ambiguous phrase-level sentiment, and the limitations of logistic regression with TF-IDF in capturing fine-grained emotional nuances.
Load Subjectivity Lexicon into a Dictionary¶
# Load MPQA Subjectivity Lexicon
def load_subjectivity_lexicon(filepath):
    lexicon = {}
    with open(filepath, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) < 3:
                continue  # skip blank or malformed lines
            # fields look like: type=strongsubj len=1 word1=abandon ... priorpolarity=negative
            word = parts[2].split('=')[1]
            polarity = parts[-1].split('=')[1]
            strength = parts[0].split('=')[1]
            lexicon[word] = {
                'polarity': polarity,
                'strength': strength
            }
    return lexicon
subjectivity_lexicon = load_subjectivity_lexicon("subjclueslen1-HLTEMNLP05.tff")
We load the MPQA Subjectivity Lexicon, which contains annotated words with sentiment polarity (positive, negative, neutral) and subjectivity strength (strong or weak). This lexicon will help us identify emotionally charged or opinionated words in each phrase.
# Extract Lexicon-Based Features
import re
def extract_lexicon_features(phrase, lexicon):
    words = re.findall(r'\b\w+\b', phrase.lower())
    strong, weak, pos, neg = 0, 0, 0, 0
    for word in words:
        if word in lexicon:
            if lexicon[word]['strength'] == 'strongsubj':
                strong += 1
            elif lexicon[word]['strength'] == 'weaksubj':
                weak += 1
            if lexicon[word]['polarity'] == 'positive':
                pos += 1
            elif lexicon[word]['polarity'] == 'negative':
                neg += 1
    return [strong, weak, pos, neg]
# Apply to train and validation sets
X_train_lex = np.array([extract_lexicon_features(p, subjectivity_lexicon) for p in X_train])
X_val_lex = np.array([extract_lexicon_features(p, subjectivity_lexicon) for p in X_val])
For each phrase, we count how many words appear in the lexicon and track four features:
- Number of strong subjective words
- Number of weak subjective words
- Number of positive words
- Number of negative words
These counts will be used as additional features to enhance the model's understanding of sentiment beyond TF-IDF alone.
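For illustration, the same four counts can be computed against a toy two-word lexicon; the entries below are hypothetical but mirror the structure of the loaded MPQA dictionary:

```python
import re

# Hypothetical lexicon entries in the same shape as the loaded MPQA dictionary
toy_lexicon = {
    "brilliant": {"polarity": "positive", "strength": "strongsubj"},
    "dull":      {"polarity": "negative", "strength": "weaksubj"},
}

def lexicon_counts(phrase, lexicon):
    words = re.findall(r'\b\w+\b', phrase.lower())
    strong = sum(1 for w in words if w in lexicon and lexicon[w]["strength"] == "strongsubj")
    weak   = sum(1 for w in words if w in lexicon and lexicon[w]["strength"] == "weaksubj")
    pos    = sum(1 for w in words if w in lexicon and lexicon[w]["polarity"] == "positive")
    neg    = sum(1 for w in words if w in lexicon and lexicon[w]["polarity"] == "negative")
    return [strong, weak, pos, neg]

features = lexicon_counts("A brilliant but dull story", toy_lexicon)
# → [1, 1, 1, 1]: one strong, one weak, one positive, one negative word
```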
Combining TF-IDF and Lexicon-Based Features¶
We combine the sparse TF-IDF feature matrix with the dense lexicon-based features to create a single, enriched input representation. This hybrid approach allows the model to benefit from both statistical patterns in the text (TF-IDF) and predefined sentiment cues (subjective word counts) to improve classification performance.
from scipy.sparse import hstack
# Combine sparse TF-IDF matrix with dense lexicon features
X_train_combined = hstack([X_train_tfidf, X_train_lex])
X_val_combined = hstack([X_val_tfidf, X_val_lex])
# Retrain logistic regression with combined features
model_combined = LogisticRegression(max_iter=200)
model_combined.fit(X_train_combined, y_train)
# Evaluate
y_pred_combined = model_combined.predict(X_val_combined)
print("Classification Report (TF-IDF + Lexicon):")
print(classification_report(y_val, y_pred_combined, digits=4))
Classification Report (TF-IDF + Lexicon):
precision recall f1-score support
0 0.5451 0.2008 0.2935 1414
1 0.5266 0.3533 0.4229 5455
2 0.6770 0.8740 0.7630 15917
3 0.5496 0.4729 0.5084 6585
4 0.5767 0.2553 0.3539 1841
accuracy 0.6314 31212
macro avg 0.5750 0.4313 0.4683 31212
weighted avg 0.6119 0.6314 0.6044 31212
This report shows the performance of the enhanced model that combines TF-IDF features with lexicon-based features from the MPQA subjectivity dictionary.
The overall accuracy is now 63.14% and the macro average F1-score improved to 0.4683 from 0.4641.
The hybrid model slightly improves overall performance, especially for class 4 (Positive), whose F1-score rose from 0.3439 to 0.3539.
The improvement is modest but consistent, showing that lexicon-based features provide complementary information to TF-IDF.
Conclusion and Future Work¶
This project explored sentiment classification using logistic regression and TF-IDF features on the Kaggle movie review phrase dataset. The model achieved a baseline accuracy of 63%, performing best on the majority class (Neutral) but struggling with minority classes like Negative and Positive.
To enhance performance, we integrated lexicon-based features from the MPQA Subjectivity Lexicon. This hybrid model showed a slight but consistent improvement in overall accuracy and macro-average F1 score. The lexicon-based features helped capture additional sentiment cues, especially for edge classes.
Key Takeaways:¶
- TF-IDF is an effective baseline representation for phrase-level sentiment.
- Logistic regression provides interpretable and fast results for multi-class text classification.
- Combining handcrafted sentiment features (like strong/weak polarity counts) with machine-learned ones adds value.
- Challenges like class imbalance and ambiguous phrases persist and require more advanced techniques.
Future Work:¶
- Apply more expressive models like LSTM or BERT to capture semantic nuance.
- Implement class balancing strategies (oversampling, SMOTE, or class weights).
- Explore full pipeline tuning via grid search and model ensembling.
- Expand lexicon integration to include LIWC or domain-specific dictionaries.
Note: This project used a standard train-test split for evaluation. However, implementing k-fold cross-validation (e.g., StratifiedKFold) is highly recommended for more robust model validation. Cross-validation reduces bias from a single split and offers a better estimate of real-world model performance—especially in class-imbalanced datasets.
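A minimal sketch of such a stratified cross-validation loop, using toy phrases in place of the Kaggle data:

```python
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Toy phrases/labels stand in for df["Phrase"] and df["Sentiment"]
texts = ["great film", "awful plot", "fine", "superb acting", "boring mess", "okay"] * 5
labels = [4, 0, 2, 4, 0, 2] * 5

# Bundling the vectorizer with the model prevents leakage: TF-IDF is
# refit on each fold's training split only
pipe = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=200))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, texts, labels, cv=cv)  # one accuracy per fold
```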