7.2 Logistic Regression

Logistic Regression is one of the most well-known types of supervised models and is well-suited for classification. Because textmodel_lr is built into quanteda.textmodels, it is practical to use in R. In addition, this implementation includes L2 regularisation by default to prevent overfitting in high-dimensional text feature spaces. For consistency, we will reuse the data loading, sampling, splitting, and preprocessing steps from the SVM example.

First, ensure the necessary libraries are loaded:

set.seed(42)  # Set seed for reproducibility

library(quanteda)
library(quanteda.textmodels)  # For textmodel_lr
library(caret)  # For evaluation metrics
library(pROC)  # For ROC analysis (optional, but good for binary classification)
library(ggplot2)  # For plotting (optional)

# Load the movie review corpus and sample a subset (matching the SVM example)
# NOTE(review): data_corpus_LMRD is not exported by quanteda or
# quanteda.textmodels; it presumably comes from the quanteda.classifiers
# package (or was loaded earlier in the chapter) -- confirm before running.
corpus_reviews_lr <- corpus_sample(data_corpus_LMRD, 2000)  # Sample 2000 reviews

We will use the same data splitting logic as in the SVM example, ensuring a stratified split based on the polarity variable to maintain the proportion of positive and negative reviews in both the training and test sets.

# Extract the polarity label as the target variable and convert it to a factor
# Assuming the polarity variable is binary ('neg', 'pos') in your corpus object
polarity_labels_lr <- factor(corpus_reviews_lr$polarity)

# Identify documents with valid polarity labels (not NA)
valid_docs_index_lr <- which(!is.na(polarity_labels_lr))

# Subset the corpus and polarity labels to only include documents with valid
# polarity (keeps the corpus and label vector aligned by position)
corpus_reviews_valid_lr <- corpus_reviews_lr[valid_docs_index_lr]
polarity_valid_lr <- polarity_labels_lr[valid_docs_index_lr]

# Ensure polarity_valid_lr is a factor with levels 'neg', 'pos' in that
# specific order (so 'neg' is treated as the first/reference level downstream)
polarity_valid_lr <- factor(polarity_valid_lr, levels = c("neg", "pos"))

# Check if both levels ('neg', 'pos') are present
# NOTE(review): since the levels were just forced to c("neg", "pos") above,
# the first condition is always FALSE; the any(table(...) == 0) clause is
# what actually guards against a class being absent from the sample.
if (!all(c("neg", "pos") %in% levels(polarity_valid_lr)) || any(table(polarity_valid_lr) ==
    0)) {
    stop("The sampled corpus subset does not contain both 'neg' and 'pos' classes after filtering NA polarity. Please increase the sample size or check data.")
}

# Manually create stratified split indices (reusing the logic from the SVM
# example) Get indices for each class
neg_indices_lr <- which(polarity_valid_lr == "neg")
pos_indices_lr <- which(polarity_valid_lr == "pos")

# Determine the number of instances for train/test per class (70/30 split)
set.seed(42)  # for reproducibility (re-set so this chunk reproduces on its own)
train_size_neg_lr <- floor(0.7 * length(neg_indices_lr))
train_size_pos_lr <- floor(0.7 * length(pos_indices_lr))

# Sample indices for training set from each class (without replacement, so no
# document appears twice in the training set)
train_indices_neg_lr <- sample(neg_indices_lr, size = train_size_neg_lr, replace = FALSE)
train_indices_pos_lr <- sample(pos_indices_lr, size = train_size_pos_lr, replace = FALSE)

# Combine training indices (stratification: each class contributes ~70%)
train_index_lr <- c(train_indices_neg_lr, train_indices_pos_lr)

# The remaining indices are for the test set
all_valid_indices_lr <- seq_along(polarity_valid_lr)
test_index_lr <- all_valid_indices_lr[!all_valid_indices_lr %in% train_index_lr]

# Split the corpus subset and polarity labels into training and testing sets
# (the same index vectors are used for both, keeping texts and labels aligned)
corpus_reviews_train_lr <- corpus_reviews_valid_lr[train_index_lr]
corpus_reviews_test_lr <- corpus_reviews_valid_lr[test_index_lr]

polarity_train_lr <- polarity_valid_lr[train_index_lr]
polarity_test_lr <- polarity_valid_lr[test_index_lr]

# Check the distribution of the split (both classes should appear in roughly
# the same proportion in train and test, confirming the stratification worked)
print("Training set class distribution (LR example):")
## [1] "Training set class distribution (LR example):"
print(table(polarity_train_lr))
## polarity_train_lr
## neg pos 
## 716 683
print("Testing set class distribution (LR example):")
## [1] "Testing set class distribution (LR example):"
print(table(polarity_test_lr))
## polarity_test_lr
## neg pos 
## 307 294

Next, we preprocess the training and test corpus subsets to create DFMs, applying similar cleaning steps as before and matching the test DFM features to the training DFM.

# Tokenise and preprocess the training corpus: strip punctuation, symbols,
# numbers, URLs and separators, lowercase, then remove English stopwords
tokens_train_lr <- tokens(corpus_reviews_train_lr, what = "word", remove_punct = TRUE,
    remove_symbols = TRUE, remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE) %>%
    tokens_tolower() %>%
    tokens_select(stopwords("english"), selection = "remove")

# Tokenise and preprocess the test corpus with the identical pipeline, so the
# two DFMs are built from comparably cleaned tokens
tokens_test_lr <- tokens(corpus_reviews_test_lr, what = "word", remove_punct = TRUE,
    remove_symbols = TRUE, remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE) %>%
    tokens_tolower() %>%
    tokens_select(stopwords("english"), selection = "remove")

# Create dfms (document-feature matrices: rows = documents, columns = terms)
dfm_train_lr <- dfm(tokens_train_lr)
dfm_test_lr <- dfm(tokens_test_lr)

# Ensure the test dfm has the same features as the training dfm:
# dfm_match() drops test-only terms and zero-fills training terms absent from
# the test set, so the model's coefficients line up column-for-column
dfm_test_matched_lr <- dfm_match(dfm_test_lr, features = featnames(dfm_train_lr))

# Display DFM dimensions (same number of columns after matching)
cat("Dimensions of Training DFM (LR example):", dim(dfm_train_lr), "\n")
## Dimensions of Training DFM (LR example): 1399 23543
cat("Dimensions of Matched Test DFM (LR example):", dim(dfm_test_matched_lr), "\n")
## Dimensions of Matched Test DFM (LR example): 601 23543

Now, we train the regularised logistic regression model using textmodel_lr.

# Train the textmodel_lr model for binary classification The training labels
# are the polarity labels from the training corpus
# NOTE(review): quanteda.textmodels was already attached at the top of this
# section; this library() call is redundant but harmless.
library(quanteda.textmodels)

# Fit a penalised logistic regression; the summary's "Lambda Min" / "Lambda
# 1se" show that the penalty strength is chosen by internal cross-validation
# (presumably via glmnet -- see ?textmodel_lr to confirm the backend).
model_lr_lmrd <- textmodel_lr(dfm_train_lr, polarity_train_lr)

# Print the model summary (selected lambdas and a sample of feature scores;
# most coefficients are 0 because of the regularisation penalty)
summary(model_lr_lmrd)
## 
## Call:
## textmodel_lr.dfm(x = dfm_train_lr, y = polarity_train_lr)
## 
## Lambda Min:
## [1] 0.009405
## 
## Lambda 1se:
## [1] 0.01498
## 
## Estimated Feature Scores:
##     (Intercept) just thought finish whole year without giving single movie bomb
## pos      0.1042    0       0      0     0    0       0      0      0     0    0
##     rating friend brought notorious turd house last night feared  worst knowing
## pos      0      0       0         0    0     0    0     0      0 -1.281       0
##     reputation god-awful anticipated mexican-made mess dubbed english produced
## pos          0         0           0            0    0      0       0        0

We then use the trained model to predict the polarity labels for the documents in the test set.

# Predict the classes for the matched test set (the matched DFM guarantees
# the feature columns align with the model's coefficients)
predictions_lr_lmrd <- predict(model_lr_lmrd, newdata = dfm_test_matched_lr)

# Display the first few predictions (a named factor: document id -> class)
head(predictions_lr_lmrd)
##  test/neg/6932_2.txt train/neg/6580_4.txt  test/pos/1099_7.txt 
##                  neg                  pos                  pos 
## train/pos/9167_7.txt  test/neg/8064_1.txt  test/neg/4057_4.txt 
##                  neg                  neg                  pos 
## Levels: neg pos

Finally, we evaluate the model’s performance on the test set using a confusion matrix. Since this is a binary classification task, we can also compute metrics like ROC AUC, which is similar to the SVM evaluation.

# Get the actual classes from the test corpus
actual_classes_lmrd <- polarity_test_lr

# Ensure actual and predicted classes are factors with the same levels for
# comparison The levels should already be consistent ('neg', 'pos') from the
# splitting step
confusion_matrix_lr_lmrd <- confusionMatrix(predictions_lr_lmrd, actual_classes_lmrd)

# Print the confusion matrix and performance statistics
# NOTE: caret treats the FIRST factor level as the positive class, so here
# 'Positive' Class is 'neg' -- Sensitivity is recall for negative reviews and
# Specificity is recall for positive reviews. Pass positive = "pos" to
# confusionMatrix() if you want the metrics reported the other way round.
print(confusion_matrix_lr_lmrd)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 232  46
##        pos  75 248
##                                         
##                Accuracy : 0.7987        
##                  95% CI : (0.7643, 0.83)
##     No Information Rate : 0.5108        
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.598         
##                                         
##  Mcnemar's Test P-Value : 0.01091       
##                                         
##             Sensitivity : 0.7557        
##             Specificity : 0.8435        
##          Pos Pred Value : 0.8345        
##          Neg Pred Value : 0.7678        
##              Prevalence : 0.5108        
##          Detection Rate : 0.3860        
##    Detection Prevalence : 0.4626        
##       Balanced Accuracy : 0.7996        
##                                         
##        'Positive' Class : neg           
##