In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
In [60]:
# we read in the csv with the newly extracted features
df=pd.read_csv("featured_reviews.csv",parse_dates=["timestamp_created"])
In [64]:
df.head()
Out[64]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever review_length ... 2d crpg sandbox deckbuilding survival strategy shooter experience_level_experienced experience_level_intermediate sentiment_score
0 elden_ring love game much someone wants collect everythin... 1 2024-10-18 18:12:33 326 16 8977 3957 8988 100 ... 0 0 0 0 0 0 0 1 0 0.6369
1 elden_ring roll 1 2024-10-18 18:10:09 11 2 2422 1085 2513 4 ... 0 0 0 0 0 0 0 0 0 0.0000
2 elden_ring laterally perfect every way bad thing optimiza... 1 2024-10-18 16:53:57 0 1 350 492 492 55 ... 0 0 0 0 0 0 0 0 0 0.4215
3 elden_ring try finger hole 1 2024-10-18 16:44:56 0 1 6456 80 6536 15 ... 0 0 0 0 0 0 0 0 1 0.0000
4 elden_ring damn damn explain piece art pure phenomenal 1 2024-10-18 16:29:53 51 3 5563 1424 5563 43 ... 0 0 0 0 0 0 0 1 0 -0.6597

5 rows × 41 columns

We will try to model whether a review is positive or negative (voted_up) based on the content of the review.

In [8]:
# we define our features and target
y = df['voted_up']
X = df['review']
In [9]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['voted_up'], test_size=0.2
)

BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained transformer-based language model designed to understand text context bidirectionally. We will leverage it for text classification by fine-tuning it on our dataset to predict whether a review is positive or negative (voted_up).

Each review was:

  • Split into tokens
  • Converted into unique token IDs (input_ids)
  • Padded or truncated to a maximum length of 128 tokens
  • Paired with an attention_mask indicating which tokens should be attended to

We created a ReviewDataset class to manage the tokenized inputs and corresponding labels.
The dataset was then split into training and testing sets (X_train, y_train, X_test, y_test).
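
As a quick illustration of this preprocessing (a minimal sketch on a made-up review string, not a row from our dataset), the snippet below shows how the bert-base-uncased tokenizer turns raw text into input_ids and an attention_mask padded to 128 positions:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# hypothetical review text, used only to illustrate the transformation
sample = "love game much fun"

encoding = tokenizer.encode_plus(
    sample,
    add_special_tokens=True,   # adds [CLS] and [SEP]
    max_length=128,
    padding='max_length',      # pad with zeros up to 128 positions
    truncation=True,
    return_tensors='pt'
)

# first few tokens, the tensor shape, and the start of the attention mask
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0][:8].tolist()))
print(encoding['input_ids'].shape)              # torch.Size([1, 128])
print(encoding['attention_mask'][0][:8])        # 1s for real tokens, 0s for padding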

In [766]:
from transformers import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)
In [780]:
# raw data before transformation ->
print("Original Train Data (First 5 Rows):")
raw_sample = pd.DataFrame({
    'review': X_train.iloc[:5],
    'voted_up': y_train.iloc[:5]
})
print(raw_sample)
Original Train Data (First 5 Rows):
                                                  review  voted_up
1288                                    yeah pretty good         1
31280  well thought made game complexity slowly build...         1
39225                                          hyvä peli         1
17598  god love game played bg3 need good play game l...         1
31369                          1010 best game ive played         1
In [776]:
# Display transformed data (tokenized)
def display_transformed_data(dataset, n=5):
    """
    Display the transformed review data (tokenized).
    """
    print("\nTransformed Train Data (Tokenized):")
    transformed_samples = []
    for i in range(n):
        sample = dataset[i]
        transformed_samples.append({
            'input_ids': sample['input_ids'].tolist(),
            'attention_mask': sample['attention_mask'].tolist(),
            'label': sample['label'].item()
        })
    transformed_df = pd.DataFrame(transformed_samples)
    print(transformed_df)

# first 5 samples from the transformed dataset ->
display_transformed_data(train_dataset)
Transformed Train Data (Tokenized):
                                           input_ids  \
0  [101, 3398, 3492, 2204, 102, 0, 0, 0, 0, 0, 0,...   
1  [101, 2092, 2245, 2081, 2208, 11619, 3254, 164...   
2  [101, 1044, 2100, 3567, 21877, 3669, 102, 0, 0...   
3  [101, 2643, 2293, 2208, 2209, 1038, 2290, 2509...   
4  [101, 7886, 2692, 2190, 2208, 4921, 2063, 2209...   

                                      attention_mask  label  
0  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
2  [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...      1  
In [17]:
from torch.utils.data import DataLoader

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
In [19]:
# Load pre-trained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [21]:
optimizer = AdamW(model.parameters(), lr=2e-5)
In [23]:
# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Move data to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
Epoch 1, Loss: 0.2058819322121209
Epoch 2, Loss: 0.13247972990714788
Epoch 3, Loss: 0.07910977544507167
In [25]:
# Evaluation loop
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, axis=1)
        
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Calculate metrics
print("Accuracy:", accuracy_score(y_true, y_preds))
print("F1-Score:", f1_score(y_true, y_preds))
print("Classification Report:")
print(classification_report(y_true, y_preds))
Accuracy: 0.934370613008891
F1-Score: 0.9635785236642213
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       826
           1       0.97      0.96      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.81      0.82      0.82      8548
weighted avg       0.94      0.93      0.94      8548


We will now add more features about each review, namely review_length and sentiment_score.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],  # include the text plus the two numerical features
    df['voted_up'], 
    test_size=0.2, 
)
In [16]:
# Display raw data before transformation
X_test.head()
Out[16]:
review review_length sentiment_score
23404 h1this game devolved rubbish playedh1 fan talo... 1620 0.7303
14647 immersive story line excellent fight mechinics... 62 0.7351
24892 greatest 2d platformer ever made 32 0.6369
32911 dont enough one thing lose simulator game garb... 57 -0.3182
27897 play game interstellar music background 55 aud... 60 0.5994
In [18]:
# Columns to normalize
numerical_cols = ['review_length', 'sentiment_score']

scaler = MinMaxScaler()

# Fit on training data and transform both train and test sets
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
In [20]:
X_train.head()
Out[20]:
review review_length sentiment_score
26411 objectively beautiful tremendously fun 1010 0.005251 0.908691
12658 really love game demon souls bloodborne next 0.005376 0.834883
35651 really fun game great story line goals players... 0.011503 0.943744
12607 lots talk people needing starship enterprise r... 0.088272 0.962346
24936 great story doesnt intrude gameplay amazing ga... 0.031133 0.967047

Similar to the last model, we created a custom Dataset class (ReviewDataset) that includes:

  • Text Features: Tokenized input IDs and attention masks
  • Numerical Features: Normalized review_length and sentiment_score
  • Labels: voted_up

The key difference is that the __getitem__ method now returns both the tokenized text features and the numerical features, making the dataset compatible with the combined BERT model.
In [20]:
class ReviewDataset(Dataset):
    def __init__(self, texts, numerical_features, labels):
        self.texts = texts
        self.numerical_features = numerical_features
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
        label = self.labels.iloc[idx]
        
        # Tokenize the text
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'numerical_features': num_features,
            'label': torch.tensor(label, dtype=torch.long)
        }
In [22]:
# Create datasets
train_dataset = ReviewDataset(
    texts=X_train['review'],
    numerical_features=X_train[numerical_cols],
    labels=y_train
)

test_dataset = ReviewDataset(
    texts=X_test['review'],
    numerical_features=X_test[numerical_cols],
    labels=y_test
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
In [63]:
class CustomBERTModel(nn.Module):
    def __init__(self, bert_model):
        super(CustomBERTModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(768 + len(numerical_cols), 2)  # Adjusted for the number of numerical features
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, numerical_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  
        combined = torch.cat((cls_output, numerical_features), dim=1)  # Concatenated with numerical features
        logits = self.fc(self.dropout(combined))
        return logits
In [69]:
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

model = CustomBERTModel(bert_model).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        loss = nn.CrossEntropyLoss()(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
C:\Users\aniru\AppData\Local\Temp\ipykernel_6700\335967104.py:12: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
Epoch 1, Loss: 0.20206833520540718
Epoch 2, Loss: 0.1320957051658507
Epoch 3, Loss: 0.0781316688128422
In [71]:
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        preds = torch.argmax(logits, axis=1)
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Evaluate
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_true, y_preds))
C:\Users\aniru\AppData\Local\Temp\ipykernel_6700\335967104.py:12: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       866
           1       0.96      0.97      0.96      7682

    accuracy                           0.93      8548
   macro avg       0.82      0.80      0.81      8548
weighted avg       0.93      0.93      0.93      8548


We will now use a LightGBM classifier to model the same relationship.

The review data must first be transformed so it can be used by the classifier: the raw review text is converted into numerical features using TF-IDF (Term Frequency–Inverse Document Frequency), which lets the text be represented numerically for machine learning models.
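
As a toy illustration (a minimal sketch on two made-up sentences, not on our reviews), TF-IDF assigns each word a weight that is high when the word is frequent in a document but rare across the corpus, so shared words end up with lower scores than distinctive ones:

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# two hypothetical documents, used only to illustrate TF-IDF weighting
docs = ["great game great story", "boring game bad story"]

toy_vectorizer = TfidfVectorizer()
tfidf_matrix = toy_vectorizer.fit_transform(docs)

# "game" and "story" appear in both documents, so they receive lower weights
# than "great", "boring" and "bad", which each occur in only one document
print(pd.DataFrame(tfidf_matrix.toarray(), columns=toy_vectorizer.get_feature_names_out()))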

In [22]:
numerical_cols = ['review_length', 'sentiment_score']
In [24]:
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
In [26]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],
    df['voted_up'],
    test_size=0.2
)
In [46]:
scaler = MinMaxScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])

# Fit TF-IDF on training data and transform both train and test reviews
X_train_tfidf = vectorizer.fit_transform(X_train['review'])
X_test_tfidf = vectorizer.transform(X_test['review'])

# Combine TF-IDF and scaled numerical features
import scipy
X_train_combined = scipy.sparse.hstack((X_train_tfidf, X_train_numerical))
X_test_combined = scipy.sparse.hstack((X_test_tfidf, X_test_numerical))
In [47]:
# we display the transformed data ->
tfidf_dense = pd.DataFrame(
    X_train_tfidf[:5].toarray(), 
    columns=vectorizer.get_feature_names_out() 
)
print("TF-IDF Transformed Data (First 5 Rows):")
print(tfidf_dense.head())
TF-IDF Transformed Data (First 5 Rows):
   010   10  100  1000  10010  100ing  1010   11  110  1110  ...   на   не  \
0  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
1  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
2  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
3  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
4  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   

    по  послушай   то   ты  что  ミxノ  ヽ_ヽ___   二つ  
0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
1  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
2  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
3  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
4  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  

[5 rows x 5000 columns]
In [50]:
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)

print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.74      0.52       855
           1       0.97      0.88      0.92      7693

    accuracy                           0.86      8548
   macro avg       0.68      0.81      0.72      8548
weighted avg       0.91      0.86      0.88      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [81]:
# import numpy as np

# y_pred_proba = model.predict_proba(X_test_combined)[:, 1]

# for threshold in np.arange(0.1, 0.6, 0.1):
#     y_pred = (y_pred_proba >= threshold).astype(int)
#     print(f"Threshold: {threshold}")
#     print(classification_report(y_test, y_pred))

To improve the model’s performance, we conducted a Grid Search over a small parameter space to find the best combination of hyperparameters. After choosing the best parameters for the model, we adjusted the decision threshold (default is 0.5) to optimize performance for different class priorities. We tested thresholds ranging from 0.1 to 0.9.

In [52]:
param_grid = {
    'learning_rate': [0.01, 0.1],         
    'n_estimators': [100, 200],           
    'max_depth': [3, 5],                  
    'subsample': [0.8, 1.0],              
    'colsample_bytree': [0.8, 1.0]        
}
In [54]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

model = LGBMClassifier(class_weight='balanced')

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=3,          # 3-fold cross-validation
    verbose=2,
    n_jobs=-1      # Use all available cores
)

# Fit grid search
grid_search.fit(X_train_combined, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.9126180657708303
In [55]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = LGBMClassifier(class_weight='balanced', **best_params)
best_model.fit(X_train_combined, y_train)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Out[55]:
LGBMClassifier(class_weight='balanced', max_depth=5, n_estimators=200,
               subsample=0.8)
In [56]:
#then we do threshold adjustment on the tuned model
y_val_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    y_val_pred = (y_val_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_val_pred)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_val_pred))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.82      0.20      0.32       855
           1       0.92      1.00      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.87      0.60      0.64      8548
weighted avg       0.91      0.92      0.89      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.66      0.39      0.49       855
           1       0.94      0.98      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.80      0.69      0.72      8548
weighted avg       0.91      0.92      0.91      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.53      0.54      0.54       855
           1       0.95      0.95      0.95      7693

    accuracy                           0.91      8548
   macro avg       0.74      0.74      0.74      8548
weighted avg       0.91      0.91      0.91      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.43      0.66      0.52       855
           1       0.96      0.90      0.93      7693

    accuracy                           0.88      8548
   macro avg       0.69      0.78      0.72      8548
weighted avg       0.91      0.88      0.89      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.37      0.72      0.49       855
           1       0.97      0.86      0.91      7693

    accuracy                           0.85      8548
   macro avg       0.67      0.79      0.70      8548
weighted avg       0.91      0.85      0.87      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.27      0.84      0.41       855
           1       0.98      0.75      0.85      7693

    accuracy                           0.76      8548
   macro avg       0.62      0.80      0.63      8548
weighted avg       0.91      0.76      0.80      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.21      0.92      0.34       855
           1       0.99      0.61      0.76      7693

    accuracy                           0.64      8548
   macro avg       0.60      0.77      0.55      8548
weighted avg       0.91      0.64      0.71      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.17      0.98      0.30       855
           1       0.99      0.48      0.65      7693

    accuracy                           0.53      8548
   macro avg       0.58      0.73      0.47      8548
weighted avg       0.91      0.53      0.62      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.13      0.99      0.23       855
           1       1.00      0.28      0.44      7693

    accuracy                           0.35      8548
   macro avg       0.56      0.64      0.34      8548
weighted avg       0.91      0.35      0.42      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [90]:
# select the best threshold (0.3 gives the best balance between the two classes in the reports above)
best_threshold=0.3
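
The choice above was made by eye from the reports; the selection could also be automated. A minimal sketch (assuming y_val_proba and y_test from the cells above, and using macro-averaged F1 as the selection metric) might look like:

import numpy as np
from sklearn.metrics import f1_score

# pick the threshold that maximizes macro-averaged F1 on the held-out data
candidate_thresholds = np.arange(0.1, 1.0, 0.1)
scores = {
    t: f1_score(y_test, (y_val_proba >= t).astype(int), average='macro')
    for t in candidate_thresholds
}
best_threshold = max(scores, key=scores.get)
print(f"Best threshold by macro F1: {best_threshold:.1f} "
      f"(macro F1 = {scores[best_threshold]:.3f})")
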
In [92]:
final_test_pred = (y_val_proba >= best_threshold).astype(int)

# Evaluate using classification metrics
print("Classification Report with Optimal Threshold:")
print(classification_report(y_test, final_test_pred))
Classification Report with Optimal Threshold:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       845
           1       0.95      0.95      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.76      0.76      0.76      8548
weighted avg       0.91      0.91      0.91      8548

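A confusion matrix makes the trade-off at this threshold explicit; a minimal sketch, reusing y_test and final_test_pred from the cell above:

from sklearn.metrics import confusion_matrix

# rows are the true classes (0 = not voted up, 1 = voted up), columns are the predictions
print(confusion_matrix(y_test, final_test_pred))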
Next, we model the same relationship with a stacking ensemble that uses LightGBM, Logistic Regression and Multinomial Naive Bayes as base learners, with Logistic Regression as the meta-model.

In [94]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
In [96]:
# we define the base models: LightGBM, Logistic Regression and Multinomial Naive Bayes
base_models = [
    ('lightgbm', LGBMClassifier(class_weight='balanced', random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000)),
    ('nb', MultinomialNB())
]
In [98]:
# Logistic Regression serves as the meta-model that combines the base models' predictions
meta_model = LogisticRegression()
In [100]:
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3 
)
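With cv=3, the StackingClassifier trains each base model on cross-validation folds and feeds their out-of-fold predictions to the meta-model, so the meta-model is not fitted on predictions the base models have already memorised. Two optional settings worth knowing about, shown here as a sketch rather than as part of the model fitted below, are stack_method and passthrough:

# variant of the stacked model: stack class probabilities explicitly and also
# pass the original features through to the meta-model
stacked_model_passthrough = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
    stack_method='predict_proba',  # stack each base model's predicted probabilities
    passthrough=True               # append the raw features to the stacked inputs
)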
In [102]:
# fit the ensemble model
stacked_model.fit(X_train_combined, y_train)
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20513, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71475
[LightGBM] [Info] Number of data points in the train set: 22792, number of used features: 2318
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20514, number of negative: 2279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71744
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20513, number of negative: 2280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064723 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70027
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Out[102]:
StackingClassifier(cv=3,
                   estimators=[('lightgbm',
                                LGBMClassifier(class_weight='balanced',
                                               random_state=42)),
                               ('logreg', LogisticRegression(max_iter=1000)),
                               ('nb', MultinomialNB())],
                   final_estimator=LogisticRegression())
In [104]:
y_pred = stacked_model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.51      0.61       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.85      0.75      0.79      8548
weighted avg       0.93      0.93      0.93      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
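Because the meta-model is a plain Logistic Regression, its coefficients give a rough picture of how strongly each base learner influences the final decision. A minimal sketch, assuming the fitted stacked_model above; with binary labels and probability stacking, each base model contributes one column, so there is one coefficient per base learner:

# inspect how the meta-model weights each base learner
meta = stacked_model.final_estimator_
for (name, _), coef in zip(base_models, meta.coef_[0]):
    print(f"{name}: {coef:.3f}")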
In [106]:
param_grid = {
    'lightgbm__n_estimators': [100, 150],       # Number of boosting iterations
    'lightgbm__max_depth': [4, 6],             # Maximum depth of trees
    'lightgbm__learning_rate': [0.05, 0.1],    # Learning rate
    'final_estimator__C': [0.1, 1, 10]         # Regularization strength for Logistic Regression (meta-model)
}
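The double-underscore prefixes route each setting to the corresponding component of the stack (a named base model or final_estimator). The full list of tunable names can be checked with get_params, for example:

# list the nested parameter names the stacked model accepts
print(sorted(stacked_model.get_params().keys()))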
In [108]:
grid_search = GridSearchCV(
    estimator=stacked_model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=3,
    verbose=2,
    n_jobs=-1
)
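Note that scoring='f1' optimises the F1-score of the positive class only. Since negative reviews are the rare class here, a variant worth trying (a sketch, not what was run below) is to optimise the macro-averaged F1 instead:

# same search, but scored on macro-averaged F1 so both classes count equally
grid_search_macro = GridSearchCV(
    estimator=stacked_model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1
)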
In [110]:
grid_search.fit(X_train_combined, y_train)

# Best parameters and performance
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

# Use the best model
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20513, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71475
[LightGBM] [Info] Number of data points in the train set: 22792, number of used features: 2318
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20514, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71744
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20513, number of negative: 2280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70027
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'final_estimator__C': 10, 'lightgbm__learning_rate': 0.1, 'lightgbm__max_depth': 6, 'lightgbm__n_estimators': 150}
Best F1-Score: 0.9613205813271328
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.74      0.49      0.59       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [111]:
from sklearn.metrics import classification_report

# Predict probabilities for threshold adjustment
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.85      0.21      0.34       845
           1       0.92      1.00      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.89      0.60      0.65      8548
weighted avg       0.91      0.92      0.90      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.80      0.29      0.43       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.87      0.64      0.69      8548
weighted avg       0.92      0.92      0.91      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.79      0.37      0.50       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.86      0.68      0.73      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.77      0.44      0.56       845
           1       0.94      0.99      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.86      0.71      0.76      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.74      0.49      0.59       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.71      0.54      0.62       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.83      0.76      0.79      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.65      0.62      0.63       845
           1       0.96      0.96      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.79      0.80      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.57      0.70      0.63       845
           1       0.97      0.94      0.95      7703

    accuracy                           0.92      8548
   macro avg       0.77      0.82      0.79      8548
weighted avg       0.93      0.92      0.92      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.41      0.79      0.54       845
           1       0.97      0.88      0.92      7703

    accuracy                           0.87      8548
   macro avg       0.69      0.83      0.73      8548
weighted avg       0.92      0.87      0.89      8548

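The sweep above can also be summarised in a single plot; a minimal sketch that reuses thresholds and y_pred_proba from the previous cell and plots macro-averaged F1 against the decision threshold:

# plot macro-averaged F1 as a function of the decision threshold for the tuned model
macro_f1 = [
    f1_score(y_test, (y_pred_proba >= t).astype(int), average='macro')
    for t in thresholds
]
plt.plot(thresholds, macro_f1, marker='o')
plt.xlabel('Decision threshold')
plt.ylabel('Macro-averaged F1')
plt.title('Threshold sweep for the tuned stacked model')
plt.show()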
Next, we build an ensemble by bagging the LightGBM classifier and model the same relationship.

In [116]:
from sklearn.ensemble import BaggingClassifier

base_estimator = LGBMClassifier(class_weight='balanced')

# Define the BaggingClassifier
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=10,  # Default number of estimators
    max_samples=1.0,  # Use all samples
    max_features=1.0,  # Use all features
    random_state=42
)

# Fit the model
bagging_model.fit(X_train_combined, y_train)

# Make predictions
y_pred = bagging_model.predict(X_test_combined)

# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501547 -> initscore=0.006190
[LightGBM] [Info] Start training from score 0.006190
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502283 -> initscore=0.009133
[LightGBM] [Info] Start training from score 0.009133
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.148996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502201 -> initscore=0.008806
[LightGBM] [Info] Start training from score 0.008806
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.142197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497975 -> initscore=-0.008098
[LightGBM] [Info] Start training from score -0.008098
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499513 -> initscore=-0.001948
[LightGBM] [Info] Start training from score -0.001948
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494604 -> initscore=-0.021587
[LightGBM] [Info] Start training from score -0.021587
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497169 -> initscore=-0.011323
[LightGBM] [Info] Start training from score -0.011323
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494125 -> initscore=-0.023502
[LightGBM] [Info] Start training from score -0.023502
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501711 -> initscore=0.006843
[LightGBM] [Info] Start training from score 0.006843
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.153181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500488 -> initscore=0.001951
[LightGBM] [Info] Start training from score 0.001951
Classification Report (Without Hyperparameter Tuning):
              precision    recall  f1-score   support

           0       0.43      0.74      0.55       845
           1       0.97      0.89      0.93      7703

    accuracy                           0.88      8548
   macro avg       0.70      0.82      0.74      8548
weighted avg       0.92      0.88      0.89      8548
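
As an aside, a bagged ensemble can also report an out-of-bag (OOB) estimate, where each sample is scored only by the estimators that did not see it during bootstrapping. A minimal sketch, assuming the same X_train_combined and y_train used above (not executed here):

In [ ]:
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier

oob_bagging = BaggingClassifier(
    estimator=LGBMClassifier(class_weight='balanced'),
    n_estimators=10,
    oob_score=True,   # score each sample with the estimators that did not train on it
    random_state=42
)
oob_bagging.fit(X_train_combined, y_train)
print("OOB accuracy:", oob_bagging.oob_score_)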

In [118]:
# tune hyperparameters of a bagged LightGBM model with grid search
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

# base learner: LightGBM with class weighting to offset the imbalance
base_estimator = LGBMClassifier(class_weight='balanced')

# Define the BaggingClassifier around the base learner
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    random_state=42
)

param_grid = {
    'n_estimators': [5, 10],          
    'max_samples': [0.6, 0.8],       
    'max_features': [0.6, 0.8]
}

grid_search = GridSearchCV(
    estimator=bagging_model,
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=3,                  
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_combined, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63766
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1815
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497755 -> initscore=-0.008981
[LightGBM] [Info] Start training from score -0.008981
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62858
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1807
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501270 -> initscore=0.005080
[LightGBM] [Info] Start training from score 0.005080
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65062
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1790
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501270 -> initscore=0.005080
[LightGBM] [Info] Start training from score 0.005080
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62425
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1832
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.512621 -> initscore=0.050495
[LightGBM] [Info] Start training from score 0.050495
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64527
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1825
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504825 -> initscore=0.019301
[LightGBM] [Info] Start training from score 0.019301
Best Parameters: {'max_features': 0.6, 'max_samples': 0.6, 'n_estimators': 5}
Best F1-Score: 0.8968175296168622
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.45      0.69      0.54       845
           1       0.96      0.91      0.93      7703

    accuracy                           0.89      8548
   macro avg       0.71      0.80      0.74      8548
weighted avg       0.91      0.89      0.90      8548

In [119]:
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred_adjusted)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")

# Evaluate with the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.90      0.10      0.18       845
           1       0.91      1.00      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.91      0.55      0.56      8548
weighted avg       0.91      0.91      0.88      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.79      0.28      0.41       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.86      0.64      0.69      8548
weighted avg       0.91      0.92      0.90      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.68      0.46      0.55       845
           1       0.94      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.72      0.75      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.55      0.58      0.57       845
           1       0.95      0.95      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.75      0.77      0.76      8548
weighted avg       0.91      0.91      0.91      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.45      0.69      0.54       845
           1       0.96      0.91      0.93      7703

    accuracy                           0.89      8548
   macro avg       0.71      0.80      0.74      8548
weighted avg       0.91      0.89      0.90      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.34      0.81      0.48       845
           1       0.98      0.83      0.90      7703

    accuracy                           0.83      8548
   macro avg       0.66      0.82      0.69      8548
weighted avg       0.91      0.83      0.85      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.24      0.91      0.38       845
           1       0.99      0.68      0.80      7703

    accuracy                           0.70      8548
   macro avg       0.61      0.79      0.59      8548
weighted avg       0.91      0.70      0.76      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.19      0.97      0.31       845
           1       0.99      0.54      0.70      7703

    accuracy                           0.58      8548
   macro avg       0.59      0.75      0.50      8548
weighted avg       0.91      0.58      0.66      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.13      0.99      0.23       845
           1       1.00      0.27      0.42      7703

    accuracy                           0.34      8548
   macro avg       0.56      0.63      0.33      8548
weighted avg       0.91      0.34      0.40      8548

Best Threshold: 0.30000000000000004, Best F1-Score: 0.9593827711534783
Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.68      0.46      0.55       845
           1       0.94      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.72      0.75      8548
weighted avg       0.92      0.93      0.92      8548
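
The 0.1-step grid above is fairly coarse; precision_recall_curve can scan every candidate threshold instead. A small sketch, assuming y_test and y_pred_proba from the cell above (not executed here):

In [ ]:
from sklearn.metrics import precision_recall_curve

precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)  # guard against 0/0
best_idx = f1_scores[:-1].argmax()  # pr_thresholds has one entry fewer than precision/recall
print("Best threshold:", pr_thresholds[best_idx], "F1:", f1_scores[best_idx])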

In [ ]:
 

Next, we will try to predict whether a review mentions difficulty from review metadata. Specifically, we model mentions_difficulty as a function of review_length and sentiment_score.

In [122]:
df.columns
Out[122]:
Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate', 'sentiment_score'],
      dtype='object')
In [124]:
# we take review_length and sentiment_score as our inputs and mentions_difficulty as our output
features = [
    'review_length', 
    'sentiment_score',   
]
In [126]:
X = df[features]
y = df['mentions_difficulty']
In [128]:
X.head()
Out[128]:
   review_length  sentiment_score
0            100           0.6369
1              4           0.0000
2             55           0.4215
3             15           0.0000
4             43          -0.6597
In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
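
With such an imbalanced target it is usually safer to stratify the split (and fix the random seed) so both sets keep the same class ratio; a minimal variant of the cell above, shown for reference only:

In [ ]:
# stratify=y keeps the 0/1 ratio identical in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)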
In [132]:
# both features are numerical, so we scale them to the [0, 1] range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [134]:
# repeat the scaling, but keep the results as DataFrames so the column names are preserved
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
In [136]:
X_train_scaled
Out[136]:
       review_length  sentiment_score
0           0.007698         0.746975
1           0.000929         0.228823
2           0.001460         0.474197
3           0.006902         0.500000
4           0.001195         0.500000
...              ...              ...
29910       0.001460         0.792979
29911       0.014335         0.903740
29912       0.005840         0.625013
29913       0.001062         0.901040
29914       0.013008         0.062456

29915 rows × 2 columns

In [138]:
y.value_counts()
Out[138]:
mentions_difficulty
0    36788
1     5949
Name: count, dtype: int64

To combat the class imbalance, we resample the training data: we first undersample the majority class with RandomUnderSampler, then oversample the minority class with SMOTE.

In [140]:
undersampler = RandomUnderSampler(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)
y_train_resampled.value_counts()
Out[140]:
mentions_difficulty
0    20880
1     4176
Name: count, dtype: int64
In [142]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_resampled, y_train_resampled)
y_train_resampled.value_counts()
Out[142]:
mentions_difficulty
0    20880
1    20880
Name: count, dtype: int64
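
Resampling before cross-validation lets synthetic or duplicated samples leak into validation folds. As an alternative, a sketch (assuming the variables defined above) that wraps the samplers and the classifier in an imblearn Pipeline, so resampling is applied only to the training part of each fold:

In [ ]:
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

resample_pipeline = Pipeline([
    ('under', RandomUnderSampler(sampling_strategy=0.2)),
    ('over', SMOTE()),
    ('clf', LGBMClassifier(class_weight='balanced'))
])

scores = cross_val_score(resample_pipeline, X_train_scaled, y_train, cv=3, scoring='f1')
print("Cross-validated F1 (positive class):", scores.mean())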
In [146]:
# train a LightGBM model 
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Out[146]:
LGBMClassifier(class_weight='balanced')
In [148]:
y_pred = model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.80     12822
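
Since the model only sees two features, it is worth a quick check of how much each one contributes. A small sketch using LightGBM's built-in importances, assuming model and features from the cells above:

In [ ]:
importances = pd.Series(model.feature_importances_, index=features)
print(importances.sort_values(ascending=False))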

In [150]:
# run the same model but with some hyperparameter tuning
model = LGBMClassifier(class_weight='balanced', random_state=42)

param_grid = {
    'n_estimators': [50, 100],         
    'max_depth': [3, 5],             
    'learning_rate': [0.05, 0.1],     
    'colsample_bytree': [0.8, 1.0],   
}


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best F1-Score: 0.7800546322105028
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.79     12822

In [151]:
# Predict probabilities for the positive class
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    # Adjust predictions based on the current threshold
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    
    # Evaluate using F1-score
    f1 = f1_score(y_test, y_pred_adjusted)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
    
    # Track the best threshold
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# Display the best threshold and corresponding F1-score
print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")

# Evaluate the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.24      0.39     11049
           1       0.17      0.99      0.30      1773

    accuracy                           0.35     12822
   macro avg       0.58      0.62      0.34     12822
weighted avg       0.88      0.35      0.38     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.99      0.46      0.62     11049
           1       0.22      0.96      0.36      1773

    accuracy                           0.53     12822
   macro avg       0.60      0.71      0.49     12822
weighted avg       0.88      0.53      0.59     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.98      0.58      0.72     11049
           1       0.26      0.91      0.40      1773

    accuracy                           0.62     12822
   macro avg       0.62      0.74      0.56     12822
weighted avg       0.88      0.62      0.68     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.67      0.79     11049
           1       0.29      0.85      0.43      1773

    accuracy                           0.69     12822
   macro avg       0.63      0.76      0.61     12822
weighted avg       0.87      0.69      0.74     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.84      0.89     11049
           1       0.40      0.65      0.49      1773

    accuracy                           0.82     12822
   macro avg       0.67      0.75      0.69     12822
weighted avg       0.86      0.82      0.83     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     11049
           1       0.52      0.48      0.50      1773

    accuracy                           0.87     12822
   macro avg       0.72      0.71      0.71     12822
weighted avg       0.86      0.87      0.87     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     11049
           1       0.61      0.38      0.47      1773

    accuracy                           0.88     12822
   macro avg       0.76      0.67      0.70     12822
weighted avg       0.87      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.99      0.93     11049
           1       0.75      0.20      0.32      1773

    accuracy                           0.88     12822
   macro avg       0.82      0.60      0.63     12822
weighted avg       0.87      0.88      0.85     12822

Best Threshold: 0.7000000000000001, Best F1-Score: 0.5038123167155425
Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     11049
           1       0.52      0.48      0.50      1773

    accuracy                           0.87     12822
   macro avg       0.72      0.71      0.71     12822
weighted avg       0.86      0.87      0.87     12822
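
If the chosen cut-off is to be reused later, a small hypothetical helper (predict_with_threshold is not part of any library) keeps the logic in one place:

In [ ]:
def predict_with_threshold(fitted_model, X, threshold=0.5):
    """Return hard labels using a custom probability cut-off."""
    return (fitted_model.predict_proba(X)[:, 1] >= threshold).astype(int)

y_pred_custom = predict_with_threshold(best_model, X_test_scaled, threshold=best_threshold)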

In [ ]:
 

We will now use an XGBoost classifier to model the same relationship.
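
For reference, scale_pos_weight is conventionally set to the ratio of negative to positive training samples. A small check, assuming y_train_resampled from above (the ratio is roughly 1.0 here because the data were already rebalanced, so the value of 2 below simply gives the positive class extra weight):

In [ ]:
neg = (y_train_resampled == 0).sum()
pos = (y_train_resampled == 1).sum()
print("Suggested scale_pos_weight:", neg / pos)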

In [156]:
from xgboost import XGBClassifier

model = XGBClassifier(
    scale_pos_weight=2,        # extra weight on the positive (minority) class
    use_label_encoder=False,   # ignored by recent XGBoost versions (hence the warning below)
    eval_metric='logloss'
)

# Fit the model
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
Classification Report (Without Hyperparameter Tuning):
              precision    recall  f1-score   support

           0       0.96      0.64      0.77     11049
           1       0.27      0.84      0.41      1773

    accuracy                           0.67     12822
   macro avg       0.62      0.74      0.59     12822
weighted avg       0.87      0.67      0.72     12822

C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [21:20:42] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
In [158]:
param_grid = {
    'n_estimators': [100, 200],      
    'learning_rate': [0.05, 0.1],   
    'max_depth': [4, 6],            
    'subsample': [0.8, 1.0],        
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',    
    cv=3,                     
    verbose=2,               
    n_jobs=-1                 
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 32 candidates, totalling 96 fits
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [21:23:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.7642612384937363
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.97      0.62      0.76     11049
           1       0.27      0.87      0.41      1773

    accuracy                           0.66     12822
   macro avg       0.62      0.75      0.58     12822
weighted avg       0.87      0.66      0.71     12822

In [164]:
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# sweep decision thresholds from 0.1 to 0.9 and report metrics at each one
thresholds = np.arange(0.1, 1.0, 0.1)

for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.21      0.35     11049
           1       0.17      0.99      0.29      1773

    accuracy                           0.32     12822
   macro avg       0.58      0.60      0.32     12822
weighted avg       0.88      0.32      0.34     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.99      0.37      0.54     11049
           1       0.20      0.97      0.33      1773

    accuracy                           0.45     12822
   macro avg       0.59      0.67      0.43     12822
weighted avg       0.88      0.45      0.51     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.98      0.46      0.63     11049
           1       0.22      0.95      0.36      1773

    accuracy                           0.53     12822
   macro avg       0.60      0.71      0.49     12822
weighted avg       0.88      0.53      0.59     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.98      0.55      0.70     11049
           1       0.25      0.92      0.39      1773

    accuracy                           0.60     12822
   macro avg       0.61      0.73      0.55     12822
weighted avg       0.88      0.60      0.66     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.97      0.62      0.76     11049
           1       0.27      0.87      0.41      1773

    accuracy                           0.66     12822
   macro avg       0.62      0.75      0.58     12822
weighted avg       0.87      0.66      0.71     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.96      0.71      0.81     11049
           1       0.31      0.80      0.44      1773

    accuracy                           0.72     12822
   macro avg       0.63      0.75      0.63     12822
weighted avg       0.87      0.72      0.76     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.94      0.81      0.87     11049
           1       0.37      0.68      0.48      1773

    accuracy                           0.79     12822
   macro avg       0.65      0.75      0.67     12822
weighted avg       0.86      0.79      0.82     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     11049
           1       0.48      0.51      0.49      1773

    accuracy                           0.86     12822
   macro avg       0.70      0.71      0.70     12822
weighted avg       0.86      0.86      0.86     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     11049
           1       0.63      0.34      0.44      1773

    accuracy                           0.88     12822
   macro avg       0.76      0.66      0.69     12822
weighted avg       0.86      0.88      0.87     12822

In [ ]:
 

We will now use a stacking ensemble of XGBoost, Logistic Regression, and Gaussian Naïve Bayes to model the same relationship.

In [166]:
from sklearn.ensemble import StackingClassifier

xgb = XGBClassifier(
    scale_pos_weight=2,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,   # ignored by recent XGBoost versions
    eval_metric='logloss'
)
logreg = LogisticRegression(max_iter=1000)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('xgb', xgb), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:15:58] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.75      0.46      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822
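
Because the final estimator is a logistic regression, its coefficients give a rough sense of how much weight each base model's predicted probability receives. A small sketch, assuming the fitted stacking_model above (with the default predict_proba stacking, each base learner contributes one probability column in the binary case):

In [ ]:
for name, coef in zip(['xgb', 'logreg', 'nb'], stacking_model.final_estimator_.coef_[0]):
    print(f"{name}: {coef:.3f}")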

In [168]:
param_grid = {
    'xgb__n_estimators': [100, 150],     
    'xgb__max_depth': [4, 6],            
    'xgb__learning_rate': [0.05, 0.1],   
    'final_estimator__C': [0.1, 1, 10]
}
In [170]:
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1',  
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:22:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best Parameters: {'final_estimator__C': 1, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 150}
Best Score: 0.7598426725216721
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11049
           1       0.32      0.74      0.45      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822

We again sweep decision thresholds between 0.1 and 0.9 to trade precision against recall for class 1.

In [176]:
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Adjust thresholds and evaluate
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.21      0.35     11049
           1       0.17      0.99      0.29      1773

    accuracy                           0.32     12822
   macro avg       0.58      0.60      0.32     12822
weighted avg       0.88      0.32      0.34     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.98      0.45      0.61     11049
           1       0.21      0.94      0.35      1773

    accuracy                           0.52     12822
   macro avg       0.60      0.69      0.48     12822
weighted avg       0.87      0.52      0.58     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.97      0.57      0.72     11049
           1       0.25      0.88      0.39      1773

    accuracy                           0.61     12822
   macro avg       0.61      0.73      0.55     12822
weighted avg       0.87      0.61      0.67     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.68      0.80     11049
           1       0.29      0.80      0.43      1773

    accuracy                           0.70     12822
   macro avg       0.62      0.74      0.61     12822
weighted avg       0.86      0.70      0.75     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11049
           1       0.32      0.74      0.45      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.83      0.88     11049
           1       0.38      0.65      0.48      1773

    accuracy                           0.80     12822
   macro avg       0.66      0.74      0.68     12822
weighted avg       0.86      0.80      0.82     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     11049
           1       0.46      0.53      0.49      1773

    accuracy                           0.85     12822
   macro avg       0.69      0.72      0.70     12822
weighted avg       0.86      0.85      0.85     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     11049
           1       0.58      0.39      0.47      1773

    accuracy                           0.88     12822
   macro avg       0.74      0.67      0.70     12822
weighted avg       0.86      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     11049
           1       0.71      0.24      0.36      1773

    accuracy                           0.88     12822
   macro avg       0.80      0.61      0.65     12822
weighted avg       0.87      0.88      0.86     12822

In [ ]:
 

Finally, we use a stacking ensemble of LightGBM, Logistic Regression, and Gaussian Naïve Bayes to model the same relationship.

In [179]:
lgbm = LGBMClassifier(
    class_weight='balanced',  # Automatically handles class imbalance
    n_estimators=100,         
    max_depth=6,              
    learning_rate=0.1,        
    subsample=0.8,            
    colsample_bytree=0.8,     
    random_state=42
)

logreg = LogisticRegression(max_iter=1000, random_state=42)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('lgbm', lgbm), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

In [181]:
param_grid = {
    'lgbm__n_estimators': [100, 150],       
    'lgbm__max_depth': [4, 6],             
    'lgbm__learning_rate': [0.05, 0.1],    
    'final_estimator__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1', 
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'final_estimator__C': 10, 'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 6, 'lgbm__n_estimators': 150}
Best Score: 0.7967459488410441
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

In [182]:
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.39      0.56     11049
           1       0.20      0.97      0.34      1773

    accuracy                           0.47     12822
   macro avg       0.60      0.68      0.45     12822
weighted avg       0.88      0.47      0.53     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.98      0.58      0.73     11049
           1       0.26      0.91      0.40      1773

    accuracy                           0.63     12822
   macro avg       0.62      0.75      0.56     12822
weighted avg       0.88      0.63      0.68     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.97      0.65      0.78     11049
           1       0.28      0.86      0.42      1773

    accuracy                           0.68     12822
   macro avg       0.62      0.76      0.60     12822
weighted avg       0.87      0.68      0.73     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     11049
           1       0.31      0.81      0.44      1773

    accuracy                           0.72     12822
   macro avg       0.63      0.76      0.63     12822
weighted avg       0.87      0.72      0.76     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.83      0.88     11049
           1       0.39      0.66      0.49      1773

    accuracy                           0.81     12822
   macro avg       0.66      0.75      0.69     12822
weighted avg       0.86      0.81      0.83     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     11049
           1       0.45      0.56      0.50      1773

    accuracy                           0.84     12822
   macro avg       0.69      0.72      0.70     12822
weighted avg       0.86      0.84      0.85     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     11049
           1       0.58      0.43      0.49      1773

    accuracy                           0.88     12822
   macro avg       0.74      0.69      0.71     12822
weighted avg       0.87      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     11049
           1       0.70      0.25      0.37      1773

    accuracy                           0.88     12822
   macro avg       0.79      0.62      0.65     12822
weighted avg       0.86      0.88      0.86     12822
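
Rather than reading the best operating point off the sweep above, the threshold that maximises F1 for the minority class can also be picked programmatically. A minimal sketch (assumed, not part of the original run) using precision_recall_curve on the probabilities already computed:

In [ ]:
from sklearn.metrics import precision_recall_curve

# sketch: choose the probability threshold that maximises F1 for class 1
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)  # guard against 0/0
best_idx = f1_scores[:-1].argmax()  # the last precision/recall pair has no threshold
print(f"Best threshold: {thresholds[best_idx]:.2f}, F1: {f1_scores[best_idx]:.3f}")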

In [66]:
# from scipy.stats import pointbiserialr

# corr_length, _ = pointbiserialr(df['review_length'], df['mentions_difficulty'])
# corr_sentiment, _ = pointbiserialr(df['sentiment_score'], df['mentions_difficulty'])

# print(f"Correlation (review_length, mentions_difficulty): {corr_length:.4f}")
# print(f"Correlation (sentiment_score, mentions_difficulty): {corr_sentiment:.4f}")

Next, we try to model the relationship between the genres of the reviewed game and whether the review mentions difficulty.

In [185]:
df.columns
Out[185]:
Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate', 'sentiment_score'],
      dtype='object')
In [68]:
# we define the list of genre columns
genre_columns=['roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d']
In [72]:
X=df[genre_columns]
X
Out[72]:
roguelike co_op base_building soulslike deckbuilding puzzle metroidvania rpg competitive first_person ... platformer shooter open_world strategy survival adventure crafting third_person turn_based 2d
0 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
1 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
2 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
3 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
4 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42732 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42733 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42734 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42735 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42736 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0

42737 rows × 26 columns

In [201]:
# take all the genre columns as input and mentions_difficulty as output
X=df[genre_columns]
y=df["mentions_difficulty"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
In [203]:
y_train.value_counts()
Out[203]:
mentions_difficulty
0    25715
1     4200
Name: count, dtype: int64
In [205]:
# balance the classes by oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
In [207]:
y_train_resampled.value_counts()
Out[207]:
mentions_difficulty
0    25715
1    25715
Name: count, dtype: int64
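
One caveat: SMOTE creates synthetic samples by interpolating between neighbours, so on purely binary genre flags it can generate fractional values between 0 and 1. A hedged alternative sketch that keeps the features strictly binary (plain random oversampling; imblearn's SMOTEN is another option for all-categorical data):

In [ ]:
from imblearn.over_sampling import RandomOverSampler

# alternative sketch only -- duplicates minority rows instead of interpolating,
# so the 0/1 genre columns stay binary
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print(y_train_ros.value_counts())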
In [209]:
# build a decision tree classifier model
# (the SMOTE step above already balanced the training classes, so
#  class_weight='balanced' adds little on top of that here)
model = DecisionTreeClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)
Out[209]:
DecisionTreeClassifier(class_weight='balanced')
In [90]:
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822
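
Before tuning, it is worth checking which genre flags the fitted tree actually relies on. A small sketch (assumed, not part of the original run) using the tree's feature importances:

In [ ]:
# sketch: rank the genre columns by how much the fitted tree uses them
importances = pd.Series(model.feature_importances_, index=genre_columns)
print(importances.sort_values(ascending=False).head(10))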

In [211]:
# tune the decision tree hyperparameters with a 3-fold grid search
param_grid = {
    'criterion': ['gini', 'entropy'],    
    'max_depth': [5, 10, 15, None],         
    'min_samples_split': [2, 5, 10],        
    'min_samples_leaf': [1, 5]        
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best F1-Score: 0.6037269912871214
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

In [217]:
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.94      0.14      0.25     11073
           1       0.15      0.94      0.25      1749

    accuracy                           0.25     12822
   macro avg       0.54      0.54      0.25     12822
weighted avg       0.83      0.25      0.25     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.84      0.28      1749

    accuracy                           0.42     12822
   macro avg       0.55      0.60      0.40     12822
weighted avg       0.83      0.42      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
C:\Users\aniru\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

In [219]:
# next, we try a random forest classifier to see whether it improves on the decision tree
model = RandomForestClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66     11073
           1       0.19      0.69      0.29      1749

    accuracy                           0.54     12822
   macro avg       0.55      0.61      0.48     12822
weighted avg       0.82      0.54      0.61     12822

In [221]:
# we try to tune the hyperparameters again
param_grid = {
    'n_estimators': [100, 200],          
    'max_depth': [5, 10, None],         
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 5],         
    'max_features': ['sqrt', 'log2']     
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best F1-Score: 0.6052300517078143
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

In [222]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.94      0.14      0.25     11073
           1       0.15      0.94      0.25      1749

    accuracy                           0.25     12822
   macro avg       0.54      0.54      0.25     12822
weighted avg       0.83      0.25      0.25     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.84      0.28      1749

    accuracy                           0.42     12822
   macro avg       0.55      0.60      0.40     12822
weighted avg       0.83      0.42      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

In [223]:
# we now use a logistic regression model for the same relationship
model = LogisticRegression(
    max_iter=1000
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66     11073
           1       0.19      0.69      0.29      1749

    accuracy                           0.54     12822
   macro avg       0.55      0.61      0.48     12822
weighted avg       0.82      0.54      0.61     12822

In [224]:
# try to tune this model
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  
    'penalty': ['l1', 'l2'],       
    'solver': ['liblinear', 'saga'],  
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1-Score: 0.6054754679057912
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822
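
The tuned logistic regression also lets us read off which genres push a review towards or away from mentioning difficulty. A small sketch (assumed, not part of the original run) inspecting the fitted coefficients:

In [ ]:
# sketch: signed coefficients per genre flag (positive -> more likely to mention difficulty)
coefs = pd.Series(best_model.coef_[0], index=genre_columns)
print(coefs.sort_values(ascending=False))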

In [225]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.93      0.10      0.18     11073
           1       0.14      0.95      0.25      1749

    accuracy                           0.22     12822
   macro avg       0.54      0.53      0.22     12822
weighted avg       0.82      0.22      0.19     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.83      0.28      1749

    accuracy                           0.41     12822
   macro avg       0.55      0.59      0.39     12822
weighted avg       0.82      0.41      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822


Since all three models produce nearly identical results, even after hyperparameter tuning and threshold adjustment, we conclude that the genre indicators alone do not carry enough signal to predict whether a review mentions difficulty.
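
To make that comparison explicit, a minimal sketch (assumed, not part of the original run) re-fits the three tuned configurations on the same resampled training data and reports one cross-validated score each; the hyperparameters below are the best parameters found by the grid searches above:

In [ ]:
from sklearn.model_selection import cross_val_score

# sketch: side-by-side cross-validated comparison of the three tuned models
candidates = {
    "decision_tree": DecisionTreeClassifier(criterion='gini', max_depth=10),
    "random_forest": RandomForestClassifier(n_estimators=100, max_depth=10, max_features='log2'),
    "logistic_regression": LogisticRegression(C=10, penalty='l1', solver='liblinear'),
}
for name, clf in candidates.items():
    scores = cross_val_score(clf, X_train_resampled, y_train_resampled,
                             scoring='f1_weighted', cv=3)
    print(f"{name}: mean weighted F1 = {scores.mean():.3f}")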

In [ ]: