7.2 Logistic Regression
Logistic regression is one of the most well-known supervised models and is well-suited for classification. Because textmodel_lr is built into quanteda.textmodels, it is practical to use in R. In addition, this implementation applies regularisation by default — a lasso (L1) penalty fitted via glmnet, visible below in the many coefficients shrunk exactly to zero — to prevent overfitting in high-dimensional text feature spaces. For consistency, we will reuse the data loading, sampling, splitting, and preprocessing steps from the SVM example.
First, ensure the necessary libraries are loaded:
set.seed(42) # Seed the RNG so sampling and the train/test split are reproducible
library(quanteda) # Core text-analysis toolkit (corpus, tokens, dfm)
library(quanteda.textmodels) # Provides textmodel_lr
library(caret) # confusionMatrix() and related evaluation metrics
library(pROC) # For ROC analysis (optional, but good for binary classification)
library(ggplot2) # For plotting (optional)
# Load the movie review corpus and sample a subset (matching the SVM example)
corpus_reviews_lr <- corpus_sample(data_corpus_LMRD, 2000) # Sample 2000 reviews

We will use the same data splitting logic as in the SVM example, ensuring a stratified split based on the polarity variable to maintain the proportion of positive and negative reviews in both the training and test sets.
# Target variable: the binary polarity docvar ('neg'/'pos'), coerced to a factor
polarity_labels_lr <- factor(corpus_reviews_lr$polarity)

# Keep only documents whose polarity is not missing
valid_docs_index_lr <- which(!is.na(polarity_labels_lr))
corpus_reviews_valid_lr <- corpus_reviews_lr[valid_docs_index_lr]
polarity_valid_lr <- polarity_labels_lr[valid_docs_index_lr]

# Fix the level order to c("neg", "pos") so downstream metrics (e.g. the
# confusionMatrix 'Positive' class) are consistent across examples
polarity_valid_lr <- factor(polarity_valid_lr, levels = c("neg", "pos"))

# Abort early if either class is missing or empty after filtering
class_counts_lr <- table(polarity_valid_lr)
if (!all(c("neg", "pos") %in% levels(polarity_valid_lr)) || any(class_counts_lr == 0)) {
  stop("The sampled corpus subset does not contain both 'neg' and 'pos' classes after filtering NA polarity. Please increase the sample size or check data.")
}
# Stratified 70/30 split, done by hand (same logic as the SVM example):
# sample the training indices within each class so that the class balance
# is preserved in both partitions.
neg_indices_lr <- which(polarity_valid_lr == "neg")
pos_indices_lr <- which(polarity_valid_lr == "pos")

set.seed(42) # reproducible sampling
train_size_neg_lr <- floor(0.7 * length(neg_indices_lr))
train_size_pos_lr <- floor(0.7 * length(pos_indices_lr))

# Draw the per-class training indices without replacement
train_indices_neg_lr <- sample(neg_indices_lr, size = train_size_neg_lr, replace = FALSE)
train_indices_pos_lr <- sample(pos_indices_lr, size = train_size_pos_lr, replace = FALSE)
train_index_lr <- c(train_indices_neg_lr, train_indices_pos_lr)

# Every remaining document goes to the test set
all_valid_indices_lr <- seq_along(polarity_valid_lr)
test_index_lr <- setdiff(all_valid_indices_lr, train_index_lr)

# Materialise the train/test corpora and their label vectors
corpus_reviews_train_lr <- corpus_reviews_valid_lr[train_index_lr]
corpus_reviews_test_lr <- corpus_reviews_valid_lr[test_index_lr]
polarity_train_lr <- polarity_valid_lr[train_index_lr]
polarity_test_lr <- polarity_valid_lr[test_index_lr]
# Check the distribution of the split
print("Training set class distribution (LR example):")
print(table(polarity_train_lr))
print("Testing set class distribution (LR example):")
print(table(polarity_test_lr))
## [1] "Training set class distribution (LR example):"
## polarity_train_lr
## neg pos
## 716 683
## [1] "Testing set class distribution (LR example):"
## polarity_test_lr
## neg pos
## 307 294
Next, we preprocess the training and test corpus subsets to create DFMs, applying similar cleaning steps as before and matching the test DFM features to the training DFM.
# Tokenise and preprocess a corpus: strip punctuation, symbols, numbers,
# URLs and separators; lowercase; remove English stopwords.
# Defined once so the training and test sets are guaranteed to receive
# exactly the same treatment (previously this pipeline was duplicated).
preprocess_tokens_lr <- function(corp) {
  tokens(corp, what = "word", remove_punct = TRUE, remove_symbols = TRUE,
         remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE) %>%
    tokens_tolower() %>%
    tokens_select(stopwords("english"), selection = "remove")
}

tokens_train_lr <- preprocess_tokens_lr(corpus_reviews_train_lr)
tokens_test_lr <- preprocess_tokens_lr(corpus_reviews_test_lr)

# Build the document-feature matrices
dfm_train_lr <- dfm(tokens_train_lr)
dfm_test_lr <- dfm(tokens_test_lr)

# Align the test DFM with the training vocabulary: the model only knows
# features seen at training time; dfm_match drops unseen test features and
# zero-fills training features absent from the test documents.
dfm_test_matched_lr <- dfm_match(dfm_test_lr, features = featnames(dfm_train_lr))
# Display DFM dimensions
cat("Dimensions of Training DFM (LR example):", dim(dfm_train_lr), "\n")
cat("Dimensions of Matched Test DFM (LR example):", dim(dfm_test_matched_lr), "\n")
## Dimensions of Training DFM (LR example): 1399 23543
## Dimensions of Matched Test DFM (LR example): 601 23543
Now, we train the Regularized Logistic Regression model using textmodel_lr.
# Train the penalized logistic regression classifier on the training DFM.
# The training labels are the polarity values from the training corpus.
# (quanteda.textmodels is already loaded at the top of the script, so the
# redundant library() call that was here has been removed.)
model_lr_lmrd <- textmodel_lr(dfm_train_lr, polarity_train_lr)
# Print the model summary
summary(model_lr_lmrd)
##
## Call:
## textmodel_lr.dfm(x = dfm_train_lr, y = polarity_train_lr)
##
## Lambda Min:
## [1] 0.009405
##
## Lambda 1se:
## [1] 0.01498
##
## Estimated Feature Scores:
## (Intercept) just thought finish whole year without giving single movie bomb
## pos 0.1042 0 0 0 0 0 0 0 0 0 0
## rating friend brought notorious turd house last night feared worst knowing
## pos 0 0 0 0 0 0 0 0 0 -1.281 0
## reputation god-awful anticipated mexican-made mess dubbed english produced
## pos 0 0 0 0 0 0 0 0
We then use the trained model to predict the polarity labels for the documents in the test set.
# Classify the held-out documents with the fitted model; passing the matched
# DFM guarantees the feature space is identical to the one used in training.
predictions_lr_lmrd <- predict(model_lr_lmrd, newdata = dfm_test_matched_lr)
# Display the first few predictions
head(predictions_lr_lmrd)
## test/neg/6932_2.txt train/neg/6580_4.txt test/pos/1099_7.txt
## neg pos pos
## train/pos/9167_7.txt test/neg/8064_1.txt test/neg/4057_4.txt
## neg neg pos
## Levels: neg pos
Finally, we evaluate the model’s performance on the test set using a confusion matrix. Since this is a binary classification task, we can also compute metrics such as ROC AUC, mirroring the SVM evaluation.
# Ground-truth labels for the held-out documents. Their factor levels were
# fixed to c('neg', 'pos') during the split, so they already align with the
# levels of the predicted factor — no relevelling needed here.
actual_classes_lmrd <- polarity_test_lr

# Confusion matrix plus summary statistics (accuracy, kappa,
# sensitivity/specificity, etc.) from caret
confusion_matrix_lr_lmrd <- confusionMatrix(predictions_lr_lmrd, actual_classes_lmrd)
# Print the confusion matrix and performance statistics
print(confusion_matrix_lr_lmrd)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 232 46
## pos 75 248
##
## Accuracy : 0.7987
## 95% CI : (0.7643, 0.83)
## No Information Rate : 0.5108
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.598
##
## Mcnemar's Test P-Value : 0.01091
##
## Sensitivity : 0.7557
## Specificity : 0.8435
## Pos Pred Value : 0.8345
## Neg Pred Value : 0.7678
## Prevalence : 0.5108
## Detection Rate : 0.3860
## Detection Prevalence : 0.4626
## Balanced Accuracy : 0.7996
##
## 'Positive' Class : neg
##