import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
# we read in our CSV with the newly extracted features
df = pd.read_csv("featured_reviews.csv", parse_dates=["timestamp_created"])
df.head()
 | game_name | review | voted_up | timestamp_created | author_num_games_owned | author_num_reviews | author_playtime_at_review | author_playtime_last_two_weeks | author_playtime_forever | review_length | ... | 2d | crpg | sandbox | deckbuilding | survival | strategy | shooter | experience_level_experienced | experience_level_intermediate | sentiment_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | elden_ring | love game much someone wants collect everythin... | 1 | 2024-10-18 18:12:33 | 326 | 16 | 8977 | 3957 | 8988 | 100 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0.6369 |
1 | elden_ring | roll | 1 | 2024-10-18 18:10:09 | 11 | 2 | 2422 | 1085 | 2513 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0000 |
2 | elden_ring | laterally perfect every way bad thing optimiza... | 1 | 2024-10-18 16:53:57 | 0 | 1 | 350 | 492 | 492 | 55 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.4215 |
3 | elden_ring | try finger hole | 1 | 2024-10-18 16:44:56 | 0 | 1 | 6456 | 80 | 6536 | 15 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.0000 |
4 | elden_ring | damn damn explain piece art pure phenomenal | 1 | 2024-10-18 16:29:53 | 51 | 3 | 5563 | 1424 | 5563 | 43 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | -0.6597 |
5 rows × 41 columns
We will now model whether a review is positive or negative (voted_up) based on the content of the review.
# we define our features and target
y = df['voted_up']
X = df['review']
# split the dataset (note: no random_state is set, so the split differs between runs)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)
The BERT model (Bidirectional Encoder Representations from Transformers) is a pre-trained transformer-based language model designed to understand text context bidirectionally. We will leverage it for text classification by fine-tuning it on our dataset to predict whether a review was positive or negative (voted_up).
Each review was:
- Split into tokens
- Converted into unique token IDs (input_ids)
- Padded or truncated to a maximum length of 128 tokens
- Paired with an attention_mask indicating which tokens should be attended to
We created a ReviewDataset class to manage the tokenized inputs and corresponding labels.
The dataset was then split into training and testing sets (X_train, y_train, X_test, y_test).
from transformers import BertTokenizer
# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
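To make the tokenization steps above concrete, here is a quick sketch (on a hypothetical short review, with a tiny max_length chosen only for display; the models below use 128) of what encode_plus returns:

# Hypothetical example: inspect the encoding of one short review
demo = tokenizer.encode_plus(
    "love game much",
    add_special_tokens=True,  # adds [CLS] and [SEP]
    max_length=8,             # tiny max_length just for display
    padding='max_length',
    truncation=True,
)
print(demo['input_ids'])       # token IDs, zero-padded to length 8
print(demo['attention_mask'])  # 1 for real tokens, 0 for padding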
class ReviewDataset(Dataset):
def __init__(self, texts, labels):
self.texts = texts
self.labels = labels
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts.iloc[idx]
label = self.labels.iloc[idx]
encoding = tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=128,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
'label': torch.tensor(label, dtype=torch.long)
}
# Create datasets
train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)
# raw data before transformation ->
print("Original Train Data (First 5 Rows):")
raw_sample = pd.DataFrame({
'review': X_train.iloc[:5],
'voted_up': y_train.iloc[:5]
})
print(raw_sample)
Original Train Data (First 5 Rows):
                                                  review  voted_up
1288                                    yeah pretty good         1
31280  well thought made game complexity slowly build...         1
39225                                          hyvä peli         1
17598  god love game played bg3 need good play game l...         1
31369                          1010 best game ive played         1
# Display transformed data (tokenized)
def display_transformed_data(dataset, n=5):
"""
Function to display the transform review data (tokenized)
"""
print("\nTransformed Train Data (Tokenized):")
transformed_samples = []
for i in range(n):
sample = dataset[i]
transformed_samples.append({
'input_ids': sample['input_ids'].tolist(),
'attention_mask': sample['attention_mask'].tolist(),
'label': sample['label'].item()
})
transformed_df = pd.DataFrame(transformed_samples)
print(transformed_df)
# first 5 samples from the transformed dataset ->
display_transformed_data(train_dataset)
Transformed Train Data (Tokenized):
                                           input_ids  \
0  [101, 3398, 3492, 2204, 102, 0, 0, 0, 0, 0, 0,...
1  [101, 2092, 2245, 2081, 2208, 11619, 3254, 164...
2  [101, 1044, 2100, 3567, 21877, 3669, 102, 0, 0...
3  [101, 2643, 2293, 2208, 2209, 1038, 2290, 2509...
4  [101, 7886, 2692, 2190, 2208, 4921, 2063, 2209...

                                      attention_mask  label
0  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1
2  [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...      1
from torch.utils.data import DataLoader
# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
# Load pre-trained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
optimizer = AdamW(model.parameters(), lr=2e-5)
# Training loop
epochs = 3
model.train()
for epoch in range(epochs):
total_loss = 0
for batch in train_loader:
optimizer.zero_grad()
# Move data to GPU
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
# Forward pass
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
total_loss += loss.item()
# Backward pass and optimization
loss.backward()
optimizer.step()
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
Epoch 1, Loss: 0.2058819322121209
Epoch 2, Loss: 0.13247972990714788
Epoch 3, Loss: 0.07910977544507167
# Evaluation loop
model.eval()
y_preds = []
y_true = []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
preds = torch.argmax(outputs.logits, axis=1)
y_preds.extend(preds.cpu().numpy())
y_true.extend(labels.cpu().numpy())
# Calculate metrics
print("Accuracy:", accuracy_score(y_true, y_preds))
print("F1-Score:", f1_score(y_true, y_preds))
print("Classification Report:")
print(classification_report(y_true, y_preds))
Accuracy: 0.934370613008891
F1-Score: 0.9635785236642213
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       826
           1       0.97      0.96      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.81      0.82      0.82      8548
weighted avg       0.94      0.93      0.94      8548

Note that the headline F1-score is computed for the dominant positive class; the much smaller negative class is considerably weaker (F1 of about 0.67).
We will now bring in additional features about each review, namely review_length and sentiment_score.
X_train, X_test, y_train, y_test = train_test_split(
df[['review', 'review_length', 'sentiment_score']], # Include all features
df['voted_up'],
test_size=0.2,
)
# Display raw data before transformation
X_test.head()
 | review | review_length | sentiment_score
---|---|---|---
23404 | h1this game devolved rubbish playedh1 fan talo... | 1620 | 0.7303 |
14647 | immersive story line excellent fight mechinics... | 62 | 0.7351 |
24892 | greatest 2d platformer ever made | 32 | 0.6369 |
32911 | dont enough one thing lose simulator game garb... | 57 | -0.3182 |
27897 | play game interstellar music background 55 aud... | 60 | 0.5994 |
# Columns to normalize
numerical_cols = ['review_length', 'sentiment_score']
scaler = MinMaxScaler()
# Fit on training data and transform both train and test sets
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
X_train.head()
 | review | review_length | sentiment_score
---|---|---|---
26411 | objectively beautiful tremendously fun 1010 | 0.005251 | 0.908691 |
12658 | really love game demon souls bloodborne next | 0.005376 | 0.834883 |
35651 | really fun game great story line goals players... | 0.011503 | 0.943744 |
12607 | lots talk people needing starship enterprise r... | 0.088272 | 0.962346 |
24936 | great story doesnt intrude gameplay amazing ga... | 0.031133 | 0.967047 |
Similar to the last model, we created a custom Dataset class (ReviewDataset), this time including:
- Text Features: Tokenized input IDs and attention masks
- Numerical Features: Normalized review_length and sentiment_score
- Labels: voted_up
class ReviewDataset(Dataset):
def __init__(self, texts, numerical_features, labels):
self.texts = texts
self.numerical_features = numerical_features
self.labels = labels
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts.iloc[idx]
        num_features = torch.tensor(self.numerical_features.iloc[idx].to_numpy(), dtype=torch.float)  # .to_numpy() avoids a pandas FutureWarning when converting the row
label = self.labels.iloc[idx]
# Tokenize the text
encoding = tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=128,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
'numerical_features': num_features,
'label': torch.tensor(label, dtype=torch.long)
}
# Create datasets
train_dataset = ReviewDataset(
texts=X_train['review'],
numerical_features=X_train[numerical_cols],
labels=y_train
)
test_dataset = ReviewDataset(
texts=X_test['review'],
numerical_features=X_test[numerical_cols],
labels=y_test
)
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
class CustomBERTModel(nn.Module):
def __init__(self, bert_model):
super(CustomBERTModel, self).__init__()
self.bert = bert_model
        self.fc = nn.Linear(768 + len(numerical_cols), 2)  # 768 = BERT-base hidden size, widened by the number of numerical features
self.dropout = nn.Dropout(0.3)
def forward(self, input_ids, attention_mask, numerical_features):
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
cls_output = outputs.last_hidden_state[:, 0, :]
combined = torch.cat((cls_output, numerical_features), dim=1) # Concatenated with numerical features
logits = self.fc(self.dropout(combined))
return logits
from transformers import BertModel
bert_model = BertModel.from_pretrained('bert-base-uncased')
model = CustomBERTModel(bert_model).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
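As a quick sanity check (a sketch, not part of the original run), dummy tensors can be passed through the model to confirm that the fused text-plus-numeric head returns one logit pair per example:

# Hypothetical sanity check: confirm the model returns (batch, 2) logits
dummy_ids = torch.zeros((2, 128), dtype=torch.long, device=device)   # two all-[PAD] sequences
dummy_mask = torch.ones((2, 128), dtype=torch.long, device=device)
dummy_num = torch.zeros((2, len(numerical_cols)), device=device)
with torch.no_grad():
    print(model(dummy_ids, dummy_mask, dummy_num).shape)  # torch.Size([2, 2])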
# Training loop
epochs = 3
model.train()
for epoch in range(epochs):
total_loss = 0
for batch in train_loader:
optimizer.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
numerical_features = batch['numerical_features'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask, numerical_features)
loss = nn.CrossEntropyLoss()(logits, labels)
total_loss += loss.item()
loss.backward()
optimizer.step()
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
Epoch 1, Loss: 0.20206833520540718
Epoch 2, Loss: 0.1320957051658507
Epoch 3, Loss: 0.0781316688128422
model.eval()
y_preds = []
y_true = []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
numerical_features = batch['numerical_features'].to(device)
labels = batch['label'].to(device)
logits = model(input_ids, attention_mask, numerical_features)
preds = torch.argmax(logits, axis=1)
y_preds.extend(preds.cpu().numpy())
y_true.extend(labels.cpu().numpy())
# Evaluate
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_true, y_preds))
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       866
           1       0.96      0.97      0.96      7682

    accuracy                           0.93      8548
   macro avg       0.82      0.80      0.81      8548
weighted avg       0.93      0.93      0.93      8548
We will now use a LightGBM classifier to model the same relationship.
The review text must first be transformed so that it can be used by this classifier: the raw text is converted into numerical features with TF-IDF (Term Frequency-Inverse Document Frequency), which weights each term by how often it appears in a review relative to how common it is across the corpus. These TF-IDF features are then combined with the scaled numerical columns.
numerical_cols = ['review_length', 'sentiment_score']
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train, X_test, y_train, y_test = train_test_split(
df[['review', 'review_length', 'sentiment_score']],
df['voted_up'],
test_size=0.2
)
scaler = MinMaxScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])
# Fit TF-IDF on training data and transform both train and test reviews
X_train_tfidf = vectorizer.fit_transform(X_train['review'])
X_test_tfidf = vectorizer.transform(X_test['review'])
# Combine TF-IDF and scaled numerical features
import scipy.sparse  # note: `import scipy` alone does not reliably expose scipy.sparse
X_train_combined = scipy.sparse.hstack((X_train_tfidf, X_train_numerical))
X_test_combined = scipy.sparse.hstack((X_test_tfidf, X_test_numerical))
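A quick check (a sketch) that the horizontal stack has the expected width of 5000 TF-IDF columns plus the two scaled numerical columns:

# Hypothetical check: the combined matrices should have 5002 columns
print(X_train_combined.shape)
print(X_test_combined.shape)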
# we display the transformed data ->
tfidf_dense = pd.DataFrame(
X_train_tfidf[:5].toarray(),
columns=vectorizer.get_feature_names_out()
)
print("TF-IDF Transformed Data (First 5 Rows):")
print(tfidf_dense.head())
TF-IDF Transformed Data (First 5 Rows):
   010   10  100  1000  10010  100ing  1010   11  110  1110  ...   на   не  \
0  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0
1  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0
2  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0
3  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0
4  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0

    по  послушай   то   ты  что  ミxノ  ヽ_ヽ___  二つ
0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0
1  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0
2  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0
3  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0
4  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0

[5 rows x 5000 columns]
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_combined, y_train)
y_pred = model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141531 seconds. You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.74      0.52       855
           1       0.97      0.88      0.92      7693

    accuracy                           0.86      8548
   macro avg       0.68      0.81      0.72      8548
weighted avg       0.91      0.86      0.88      8548
To improve the model’s performance, we conducted a Grid Search over a small parameter space to find the best combination of hyperparameters. After choosing the best parameters for the model, we adjusted the decision threshold (default is 0.5) to optimize performance for different class priorities. We tested thresholds ranging from 0.1 to 0.9.
param_grid = {
'learning_rate': [0.01, 0.1],
'n_estimators': [100, 200],
'max_depth': [3, 5],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
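With two candidate values for each of the five hyperparameters, the grid contains 2^5 = 32 combinations, and 3-fold cross-validation brings the total to 96 fits, matching the log output below.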
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
model = LGBMClassifier(class_weight='balanced')
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1', # Optimize for F1-score
cv=3, # 3-fold cross-validation
verbose=2,
n_jobs=-1 # Use all available cores
)
# Fit grid search
grid_search.fit(X_train_combined, y_train)
# Best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140871 seconds. You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(... the warning above repeats many times; trimmed for readability ...)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.9126180657708303
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
best_model = LGBMClassifier(class_weight='balanced', **best_params)
best_model.fit(X_train_combined, y_train)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141309 seconds. You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(... the warning above repeats many times; trimmed for readability ...)
LGBMClassifier(class_weight='balanced', max_depth=5, n_estimators=200, subsample=0.8)
# then we do threshold adjustment on the tuned model
y_val_proba = best_model.predict_proba(X_test_combined)[:, 1]
# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
for threshold in thresholds:
    y_val_pred = (y_val_proba >= threshold).astype(int)
    print(f"Threshold: {threshold:.1f}")
    print(classification_report(y_test, y_val_pred))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.82      0.20      0.32       855
           1       0.92      1.00      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.87      0.60      0.64      8548
weighted avg       0.91      0.92      0.89      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.66      0.39      0.49       855
           1       0.94      0.98      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.80      0.69      0.72      8548
weighted avg       0.91      0.92      0.91      8548

Threshold: 0.3
              precision    recall  f1-score   support

           0       0.53      0.54      0.54       855
           1       0.95      0.95      0.95      7693

    accuracy                           0.91      8548
   macro avg       0.74      0.74      0.74      8548
weighted avg       0.91      0.91      0.91      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.43      0.66      0.52       855
           1       0.96      0.90      0.93      7693

    accuracy                           0.88      8548
   macro avg       0.69      0.78      0.72      8548
weighted avg       0.91      0.88      0.89      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.37      0.72      0.49       855
           1       0.97      0.86      0.91      7693

    accuracy                           0.85      8548
   macro avg       0.67      0.79      0.70      8548
weighted avg       0.91      0.85      0.87      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.27      0.84      0.41       855
           1       0.98      0.75      0.85      7693

    accuracy                           0.76      8548
   macro avg       0.62      0.80      0.63      8548
weighted avg       0.91      0.76      0.80      8548

Threshold: 0.7
              precision    recall  f1-score   support

           0       0.21      0.92      0.34       855
           1       0.99      0.61      0.76      7693

    accuracy                           0.64      8548
   macro avg       0.60      0.77      0.55      8548
weighted avg       0.91      0.64      0.71      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.17      0.98      0.30       855
           1       0.99      0.48      0.65      7693

    accuracy                           0.53      8548
   macro avg       0.58      0.73      0.47      8548
weighted avg       0.91      0.53      0.62      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.13      0.99      0.23       855
           1       1.00      0.28      0.44      7693

    accuracy                           0.35      8548
   macro avg       0.56      0.64      0.34      8548
weighted avg       0.91      0.35      0.42      8548
# select the threshold that gave the best balance for the minority class above
best_threshold = 0.3
final_test_pred = (y_val_proba >= best_threshold).astype(int)
# Evaluate using classification metrics
print("Classification Report with Optimal Threshold:")
print(classification_report(y_test, final_test_pred))
Classification Report with Optimal Threshold:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       845
           1       0.95      0.95      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.76      0.76      0.76      8548
weighted avg       0.91      0.91      0.91      8548
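If we wanted to automate this choice rather than eyeball the reports, a short sketch (assuming macro-F1 as the selection objective; a proper version would use a held-out validation set rather than the test set to avoid leakage):

# Hypothetical helper: pick the threshold that maximizes macro-F1
scores = {t: f1_score(y_test, (y_val_proba >= t).astype(int), average='macro')
          for t in np.arange(0.1, 1.0, 0.1)}
best_threshold = max(scores, key=scores.get)
print(best_threshold, scores[best_threshold])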
We now try a stacking ensemble that combines LightGBM, Logistic Regression and Naive Bayes to model the same relationship.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
# we define the base models: LightGBM, logistic regression and naive bayes
base_models = [
    ('lightgbm', LGBMClassifier(class_weight='balanced', random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000)),
    ('nb', MultinomialNB())  # multinomial NB suits the non-negative, sparse TF-IDF features
]
# Logistic Regression serves as the meta-model, trained on the base models' out-of-fold predictions
meta_model = LogisticRegression()
stacked_model = StackingClassifier(
estimators=base_models,
final_estimator=meta_model,
cv=3
)
# fit the ensemble model
stacked_model.fit(X_train_combined, y_train)
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141258 seconds. You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
(... similar log lines for the three cross-validation fits, each on roughly 22,790 samples, trimmed for readability ...)
StackingClassifier(cv=3, estimators=[('lightgbm', LGBMClassifier(class_weight='balanced', random_state=42)), ('logreg', LogisticRegression(max_iter=1000)), ('nb', MultinomialNB())], final_estimator=LogisticRegression())
y_pred = stacked_model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.51      0.61       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.85      0.75      0.79      8548
weighted avg       0.93      0.93      0.93      8548
param_grid = {
'lightgbm__n_estimators': [100, 150], # Number of boosting iterations
'lightgbm__max_depth': [4, 6], # Maximum depth of trees
'lightgbm__learning_rate': [0.05, 0.1], # Learning rate
'final_estimator__C': [0.1, 1, 10] # Regularization strength for Logistic Regression (meta-model)
}
grid_search = GridSearchCV(
estimator=stacked_model,
param_grid=param_grid,
scoring='f1', # Optimize for F1-score
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_combined, y_train)
# Best parameters and performance
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)
# Use the best model
best_model = grid_search.best_estimator_
# Predict on test set
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM per-fold training logs omitted]
Best Parameters: {'final_estimator__C': 10, 'lightgbm__learning_rate': 0.1, 'lightgbm__max_depth': 6, 'lightgbm__n_estimators': 150}
Best F1-Score: 0.9613205813271328
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.74      0.49      0.59       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548
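The double-underscore keys in param_grid follow scikit-learn's nested-parameter convention: the prefix is the name given to the estimator inside the ensemble. If in doubt, the valid names can be listed from the model itself (a small usage sketch):
# Print every tunable parameter name of the stacked ensemble;
# 'lightgbm__*' routes to the named base estimator and
# 'final_estimator__*' to the meta-model.
for name in sorted(stacked_model.get_params().keys()):
    print(name)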
from sklearn.metrics import classification_report
# Predict probabilities for threshold adjustment
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]
# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
for threshold in thresholds:
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
Threshold | class 0 (precision / recall / f1) | class 1 (precision / recall / f1) | accuracy
---|---|---|---
0.1 | 0.85 / 0.21 / 0.34 | 0.92 / 1.00 / 0.96 | 0.92
0.2 | 0.80 / 0.29 / 0.43 | 0.93 / 0.99 / 0.96 | 0.92
0.3 | 0.79 / 0.37 / 0.50 | 0.93 / 0.99 / 0.96 | 0.93
0.4 | 0.77 / 0.44 / 0.56 | 0.94 / 0.99 / 0.96 | 0.93
0.5 | 0.74 / 0.49 / 0.59 | 0.95 / 0.98 / 0.96 | 0.93
0.6 | 0.71 / 0.54 / 0.62 | 0.95 / 0.98 / 0.96 | 0.93
0.7 | 0.65 / 0.62 / 0.63 | 0.96 / 0.96 / 0.96 | 0.93
0.8 | 0.57 / 0.70 / 0.63 | 0.97 / 0.94 / 0.95 | 0.92
0.9 | 0.41 / 0.79 / 0.54 | 0.97 / 0.88 / 0.92 | 0.87

(support at every threshold: 845 for class 0, 7703 for class 1, 8548 total)
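Rather than scanning a fixed grid, scikit-learn's precision_recall_curve enumerates every threshold at which precision or recall changes; a minimal sketch reusing the probabilities computed above:
from sklearn.metrics import precision_recall_curve
# precision[i] and recall[i] correspond to pr_thresholds[i]; the final
# precision/recall pair has no associated threshold, hence the [:-1].
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
print("Best threshold by F1:", pr_thresholds[np.argmax(f1_scores)])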
Next, we build another ensemble by bagging the LightGBM classifier and try to model the same relationship.
from sklearn.ensemble import BaggingClassifier
base_estimator = LGBMClassifier(class_weight='balanced')
# Define the BaggingClassifier
bagging_model = BaggingClassifier(
estimator=base_estimator,
n_estimators=10, # Default number of estimators
max_samples=1.0, # Use all samples
max_features=1.0, # Use all features
random_state=42
)
# Fit the model
bagging_model.fit(X_train_combined, y_train)
# Make predictions
y_pred = bagging_model.predict(X_test_combined)
# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM training logs for the 10 bagged estimators omitted]
Classification Report (Without Hyperparameter Tuning):
              precision    recall  f1-score   support

           0       0.43      0.74      0.55       845
           1       0.97      0.89      0.93      7703

    accuracy                           0.88      8548
   macro avg       0.70      0.82      0.74      8548
weighted avg       0.92      0.88      0.89      8548
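Because bagging bootstraps the training rows, every base estimator leaves some rows unseen, and BaggingClassifier can score on those out-of-bag rows for a free validation estimate. A minimal sketch, not part of the original run:
# oob_score=True evaluates each training sample using only the
# estimators whose bootstrap sample did not contain it.
oob_bagging = BaggingClassifier(
    estimator=LGBMClassifier(class_weight='balanced'),
    n_estimators=10,
    oob_score=True,
    random_state=42
)
oob_bagging.fit(X_train_combined, y_train)
print("Out-of-bag accuracy:", oob_bagging.oob_score_)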
# tune parameters
base_estimator = LGBMClassifier(class_weight='balanced')
# Define the BaggingClassifier
bagging_model = BaggingClassifier(
estimator=base_estimator,
random_state=42
)
param_grid = {
'n_estimators': [5, 10],
'max_samples': [0.6, 0.8],
'max_features': [0.6, 0.8]
}
grid_search = GridSearchCV(
estimator=bagging_model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_combined, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM per-fold training logs omitted]
Best Parameters: {'max_features': 0.6, 'max_samples': 0.6, 'n_estimators': 5}
Best F1-Score: 0.8968175296168622
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.45      0.69      0.54       845
           1       0.96      0.91      0.93      7703

    accuracy                           0.89      8548
   macro avg       0.71      0.80      0.74      8548
weighted avg       0.91      0.89      0.90      8548
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]
# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5 # Default threshold
best_f1 = 0
for threshold in thresholds:
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
f1 = f1_score(y_test, y_pred_adjusted)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
if f1 > best_f1:
best_f1 = f1
best_threshold = threshold
print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")
# Evaluate with the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold | class 0 (precision / recall / f1) | class 1 (precision / recall / f1) | accuracy
---|---|---|---
0.1 | 0.90 / 0.10 / 0.18 | 0.91 / 1.00 / 0.95 | 0.91
0.2 | 0.79 / 0.28 / 0.41 | 0.93 / 0.99 / 0.96 | 0.92
0.3 | 0.68 / 0.46 / 0.55 | 0.94 / 0.98 / 0.96 | 0.93
0.4 | 0.55 / 0.58 / 0.57 | 0.95 / 0.95 / 0.95 | 0.91
0.5 | 0.45 / 0.69 / 0.54 | 0.96 / 0.91 / 0.93 | 0.89
0.6 | 0.34 / 0.81 / 0.48 | 0.98 / 0.83 / 0.90 | 0.83
0.7 | 0.24 / 0.91 / 0.38 | 0.99 / 0.68 / 0.80 | 0.70
0.8 | 0.19 / 0.97 / 0.31 | 0.99 / 0.54 / 0.70 | 0.58
0.9 | 0.13 / 0.99 / 0.23 | 1.00 / 0.27 / 0.42 | 0.34

(support at every threshold: 845 for class 0, 7703 for class 1, 8548 total)

Best Threshold: 0.30000000000000004, Best F1-Score: 0.9593827711534783
Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.68      0.46      0.55       845
           1       0.94      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.72      0.75      8548
weighted avg       0.92      0.93      0.92      8548
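The sweep above is easier to read as a picture; a minimal matplotlib sketch plotting F1 against the candidate thresholds (reusing thresholds and y_pred_proba from the cell above):
# Visualize how F1 for the positive class trades off as the threshold moves.
f1_by_threshold = [f1_score(y_test, (y_pred_proba >= t).astype(int))
                   for t in thresholds]
plt.plot(thresholds, f1_by_threshold, marker='o')
plt.xlabel('Decision threshold')
plt.ylabel('F1-score (positive class)')
plt.title('F1 vs. threshold, bagged LightGBM')
plt.show()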
Next, we try to predict whether a review mentions difficulty from review metadata: we model mentions_difficulty as a function of review_length and sentiment_score.
df.columns
Index(['game_name', 'review', 'voted_up', 'timestamp_created', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_at_review', 'author_playtime_last_two_weeks', 'author_playtime_forever', 'review_length', 'difficulty_word_count', 'mentions_difficulty', 'roguelike', 'co_op', 'base_building', 'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg', 'competitive', 'first_person', 'crpg', 'multiplayer', 'action', 'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter', 'open_world', 'strategy', 'survival', 'adventure', 'crafting', 'third_person', 'turn_based', '2d', 'experience_level_experienced', 'experience_level_intermediate', 'sentiment_score'], dtype='object')
# we take review_length and sentiment_score as our inputs and mentions_difficulty as our output
features = [
'review_length',
'sentiment_score',
]
X = df[features]
y = df['mentions_difficulty']
X.head()
review_length | sentiment_score | |
---|---|---|
0 | 100 | 0.6369 |
1 | 4 | 0.0000 |
2 | 55 | 0.4215 |
3 | 15 | 0.0000 |
4 | 43 | -0.6597 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# both are numerical columns so we scale them
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
X_train_scaled
review_length | sentiment_score | |
---|---|---|
0 | 0.007698 | 0.746975 |
1 | 0.000929 | 0.228823 |
2 | 0.001460 | 0.474197 |
3 | 0.006902 | 0.500000 |
4 | 0.001195 | 0.500000 |
... | ... | ... |
29910 | 0.001460 | 0.792979 |
29911 | 0.014335 | 0.903740 |
29912 | 0.005840 | 0.625013 |
29913 | 0.001062 | 0.901040 |
29914 | 0.013008 | 0.062456 |
29915 rows × 2 columns
y.value_counts()
mentions_difficulty
0    36788
1     5949
Name: count, dtype: int64
To combat the class imbalance, we resample the training set: we undersample the majority class, then oversample the minority class with SMOTE.
undersampler = RandomUnderSampler(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)
y_train_resampled.value_counts()
mentions_difficulty
0    20880
1     4176
Name: count, dtype: int64
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_resampled, y_train_resampled)
y_train_resampled.value_counts()
mentions_difficulty
0    20880
1    20880
Name: count, dtype: int64
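SMOTEENN (imported at the top of the notebook) combines SMOTE oversampling with Edited Nearest Neighbours cleaning in one step; a sketch of this alternative to the two-step pipeline above, reusing X_train_scaled and y_train (the *_se names are illustrative):

# SMOTEENN oversamples the minority class with SMOTE, then removes
# ambiguous samples near the class boundary with Edited Nearest Neighbours.
smote_enn = SMOTEENN(random_state=42)
X_train_se, y_train_se = smote_enn.fit_resample(X_train_scaled, y_train)
print(y_train_se.value_counts())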
# train a LightGBM model
from lightgbm import LGBMClassifier

model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880 [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 510 [LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LGBMClassifier(class_weight='balanced')
y_pred = model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.76 | 0.85 | 11049
1 | 0.34 | 0.76 | 0.47 | 1773
accuracy | | | 0.76 | 12822
macro avg | 0.65 | 0.76 | 0.66 | 12822
weighted avg | 0.87 | 0.76 | 0.80 | 12822
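With only two inputs, it is worth checking how the model actually uses them; a quick sketch reading LightGBM's split-based feature importances from the fitted model above:

# feature_importances_ counts how often each feature is used in a split
for feat, imp in zip(X_train_resampled.columns, model.feature_importances_):
    print(f"{feat}: {imp}")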
# run the same model but with some hyperparameter tuning
model = LGBMClassifier(class_weight='balanced', random_state=42)
param_grid = {
'n_estimators': [50, 100],
'max_depth': [3, 5],
'learning_rate': [0.05, 0.1],
'colsample_bytree': [0.8, 1.0],
}
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)
y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM per-fit logs elided: 2 used features per fit, with many repeated "No further splits with positive gain, best gain: -inf" warnings]
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best F1-Score: 0.7800546322105028
Classification Report (Best Model):

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.76 | 0.85 | 11049
1 | 0.34 | 0.76 | 0.47 | 1773
accuracy | | | 0.76 | 12822
macro avg | 0.65 | 0.76 | 0.66 | 12822
weighted avg | 0.87 | 0.76 | 0.79 | 12822
# Predict probabilities for the positive class
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5 # Default threshold
best_f1 = 0
for threshold in thresholds:
# Adjust predictions based on the current threshold
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
# Evaluate using F1-score
f1 = f1_score(y_test, y_pred_adjusted)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
# Track the best threshold
if f1 > best_f1:
best_f1 = f1
best_threshold = threshold
# Display the best threshold and corresponding F1-score
print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")
# Evaluate the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold sweep (per-class precision / recall / F1 and overall accuracy):

Threshold | P(0) | R(0) | F1(0) | P(1) | R(1) | F1(1) | Accuracy
---|---|---|---|---|---|---|---
0.1 | 0.99 | 0.24 | 0.39 | 0.17 | 0.99 | 0.30 | 0.35
0.2 | 0.99 | 0.46 | 0.62 | 0.22 | 0.96 | 0.36 | 0.53
0.3 | 0.98 | 0.58 | 0.72 | 0.26 | 0.91 | 0.40 | 0.62
0.4 | 0.96 | 0.67 | 0.79 | 0.29 | 0.85 | 0.43 | 0.69
0.5 | 0.95 | 0.76 | 0.85 | 0.34 | 0.76 | 0.47 | 0.76
0.6 | 0.94 | 0.84 | 0.89 | 0.40 | 0.65 | 0.49 | 0.82
0.7 | 0.92 | 0.93 | 0.92 | 0.52 | 0.48 | 0.50 | 0.87
0.8 | 0.91 | 0.96 | 0.93 | 0.61 | 0.38 | 0.47 | 0.88
0.9 | 0.89 | 0.99 | 0.93 | 0.75 | 0.20 | 0.32 | 0.88

Best Threshold: 0.7, Best F1-Score: 0.5038123167155425
Classification Report with Best Threshold:

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.92 | 0.93 | 0.92 | 11049
1 | 0.52 | 0.48 | 0.50 | 1773
accuracy | | | 0.87 | 12822
macro avg | 0.72 | 0.71 | 0.71 | 12822
weighted avg | 0.86 | 0.87 | 0.87 | 12822
We now fit an XGBoost classifier to the same relationship.
model = XGBClassifier(
    scale_pos_weight=2,  # extra weight on the positive (minority) class
    eval_metric='logloss'
)
# Fit the model
model.fit(X_train_resampled, y_train_resampled)
# Make predictions
y_pred = model.predict(X_test_scaled)
# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
Classification Report (Without Hyperparameter Tuning):

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.96 | 0.64 | 0.77 | 11049
1 | 0.27 | 0.84 | 0.41 | 1773
accuracy | | | 0.67 | 12822
macro avg | 0.62 | 0.74 | 0.59 | 12822
weighted avg | 0.87 | 0.67 | 0.72 | 12822
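The scale_pos_weight=2 above is hand-picked. A common heuristic, suggested in XGBoost's documentation, is the ratio of negative to positive training samples; a sketch computed from the un-resampled labels, as one candidate starting point:

# Heuristic: negative/positive ratio of the original training labels.
# After SMOTE the resampled set is balanced, so the ratio there would be ~1.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
print(f"suggested scale_pos_weight ~ {neg / pos:.2f}")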
param_grid = {
'n_estimators': [100, 200],
'learning_rate': [0.05, 0.1],
'max_depth': [4, 6],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.7642612384937363
Classification Report (Best Model):

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.97 | 0.62 | 0.76 | 11049
1 | 0.27 | 0.87 | 0.41 | 1773
accuracy | | | 0.66 | 12822
macro avg | 0.62 | 0.75 | 0.58 | 12822
weighted avg | 0.87 | 0.66 | 0.71 | 12822
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5 # Default threshold
best_f1 = 0
for threshold in thresholds:
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
f1 = f1_score(y_test, y_pred_adjusted)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
Threshold sweep (per-class precision / recall / F1 and overall accuracy):

Threshold | P(0) | R(0) | F1(0) | P(1) | R(1) | F1(1) | Accuracy
---|---|---|---|---|---|---|---
0.1 | 0.99 | 0.21 | 0.35 | 0.17 | 0.99 | 0.29 | 0.32
0.2 | 0.99 | 0.37 | 0.54 | 0.20 | 0.97 | 0.33 | 0.45
0.3 | 0.98 | 0.46 | 0.63 | 0.22 | 0.95 | 0.36 | 0.53
0.4 | 0.98 | 0.55 | 0.70 | 0.25 | 0.92 | 0.39 | 0.60
0.5 | 0.97 | 0.62 | 0.76 | 0.27 | 0.87 | 0.41 | 0.66
0.6 | 0.96 | 0.71 | 0.81 | 0.31 | 0.80 | 0.44 | 0.72
0.7 | 0.94 | 0.81 | 0.87 | 0.37 | 0.68 | 0.48 | 0.79
0.8 | 0.92 | 0.91 | 0.92 | 0.48 | 0.51 | 0.49 | 0.86
0.9 | 0.90 | 0.97 | 0.93 | 0.63 | 0.34 | 0.44 | 0.88
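Since every threshold trades class-1 recall against precision, a threshold-free metric is useful for comparing the LightGBM and XGBoost models directly; a sketch using ROC AUC on the same predicted probabilities:

from sklearn.metrics import roc_auc_score

# AUC scores the ranking of the predicted probabilities, so it is
# independent of any particular decision threshold.
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")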
We now combine XGBoost, Logistic Regression, and Naïve Bayes in a stacking ensemble to model the same relationship.
from sklearn.ensemble import StackingClassifier

xgb = XGBClassifier(
    scale_pos_weight=2,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)
logreg = LogisticRegression(max_iter=1000)
nb = GaussianNB()
stacking_model = StackingClassifier(
estimators=[('xgb', xgb), ('logreg', logreg), ('nb', nb)],
final_estimator=LogisticRegression(max_iter=1000),
cv=3
)
stacking_model.fit(X_train_resampled, y_train_resampled)
y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.76 | 0.84 | 11049
1 | 0.33 | 0.75 | 0.46 | 1773
accuracy | | | 0.75 | 12822
macro avg | 0.64 | 0.75 | 0.65 | 12822
weighted avg | 0.86 | 0.75 | 0.79 | 12822
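To see how much each base model contributes to the stack, we can inspect the logistic-regression meta-learner's coefficients; a sketch, assuming the fitted stacking_model above (in the binary case each base estimator contributes one positive-class probability column):

# Larger (absolute) coefficients mean the meta-learner leans more on
# that base model's predicted probability for the positive class.
for name, coef in zip(['xgb', 'logreg', 'nb'], stacking_model.final_estimator_.coef_[0]):
    print(f"{name}: {coef:.3f}")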
param_grid = {
'xgb__n_estimators': [100, 150],
'xgb__max_depth': [4, 6],
'xgb__learning_rate': [0.05, 0.1],
'final_estimator__C': [0.1, 1, 10]
}
grid_search = GridSearchCV(
estimator=stacking_model,
param_grid=param_grid,
scoring='f1',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'final_estimator__C': 1, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 150}
Best Score: 0.7598426725216721
Classification Report (Best Model):

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.75 | 0.84 | 11049
1 | 0.32 | 0.74 | 0.45 | 1773
accuracy | | | 0.75 | 12822
macro avg | 0.64 | 0.75 | 0.65 | 12822
weighted avg | 0.86 | 0.75 | 0.79 | 12822
We again test thresholds between 0.1 and 0.9 to improve the class-1 metrics.
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
# Adjust thresholds and evaluate
for threshold in np.arange(0.1, 1.0, 0.1):
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
Threshold sweep (per-class precision / recall / F1 and overall accuracy):

Threshold | P(0) | R(0) | F1(0) | P(1) | R(1) | F1(1) | Accuracy
---|---|---|---|---|---|---|---
0.1 | 0.99 | 0.21 | 0.35 | 0.17 | 0.99 | 0.29 | 0.32
0.2 | 0.98 | 0.45 | 0.61 | 0.21 | 0.94 | 0.35 | 0.52
0.3 | 0.97 | 0.57 | 0.72 | 0.25 | 0.88 | 0.39 | 0.61
0.4 | 0.96 | 0.68 | 0.80 | 0.29 | 0.80 | 0.43 | 0.70
0.5 | 0.95 | 0.75 | 0.84 | 0.32 | 0.74 | 0.45 | 0.75
0.6 | 0.94 | 0.83 | 0.88 | 0.38 | 0.65 | 0.48 | 0.80
0.7 | 0.92 | 0.90 | 0.91 | 0.46 | 0.53 | 0.49 | 0.85
0.8 | 0.91 | 0.95 | 0.93 | 0.58 | 0.39 | 0.47 | 0.88
0.9 | 0.89 | 0.98 | 0.93 | 0.71 | 0.24 | 0.36 | 0.88
We now swap XGBoost for LightGBM in the same stacking ensemble with Logistic Regression and Naïve Bayes.
lgbm = LGBMClassifier(
class_weight='balanced', # Automatically handles class imbalance
n_estimators=100,
max_depth=6,
learning_rate=0.1,
subsample=0.8,
colsample_bytree=0.8,
random_state=42
)
logreg = LogisticRegression(max_iter=1000, random_state=42)
nb = GaussianNB()
stacking_model = StackingClassifier(
estimators=[('lgbm', lgbm), ('logreg', logreg), ('nb', nb)],
final_estimator=LogisticRegression(max_iter=1000),
cv=3
)
stacking_model.fit(X_train_resampled, y_train_resampled)
y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM training logs for the stacking folds elided: 2 used features per fit, with many repeated "No further splits with positive gain, best gain: -inf" warnings]
Classification Report:

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.76 | 0.84 | 11049
1 | 0.33 | 0.76 | 0.46 | 1773
accuracy | | | 0.76 | 12822
macro avg | 0.64 | 0.76 | 0.65 | 12822
weighted avg | 0.87 | 0.76 | 0.79 | 12822
param_grid = {
'lgbm__n_estimators': [100, 150],
'lgbm__max_depth': [4, 6],
'lgbm__learning_rate': [0.05, 0.1],
'final_estimator__C': [0.1, 1, 10]
}
grid_search = GridSearchCV(
estimator=stacking_model,
param_grid=param_grid,
scoring='f1',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM per-fit logs elided, as above]
Best Parameters: {'final_estimator__C': 10, 'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 6, 'lgbm__n_estimators': 150}
Best Score: 0.7967459488410441
Classification Report (Best Model):

 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.95 | 0.76 | 0.84 | 11049
1 | 0.33 | 0.76 | 0.46 | 1773
accuracy | | | 0.76 | 12822
macro avg | 0.64 | 0.76 | 0.65 | 12822
weighted avg | 0.87 | 0.76 | 0.79 | 12822
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
for threshold in np.arange(0.1, 1.0, 0.1):
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
print(f"Threshold: {threshold}")
print(classification_report(y_test, y_pred_adjusted))
Threshold sweep (per-class precision / recall / F1 and overall accuracy):

Threshold | P(0) | R(0) | F1(0) | P(1) | R(1) | F1(1) | Accuracy
---|---|---|---|---|---|---|---
0.1 | 0.99 | 0.39 | 0.56 | 0.20 | 0.97 | 0.34 | 0.47
0.2 | 0.98 | 0.58 | 0.73 | 0.26 | 0.91 | 0.40 | 0.63
0.3 | 0.97 | 0.65 | 0.78 | 0.28 | 0.86 | 0.42 | 0.68
0.4 | 0.96 | 0.70 | 0.81 | 0.31 | 0.81 | 0.44 | 0.72
0.5 | 0.95 | 0.76 | 0.84 | 0.33 | 0.76 | 0.46 | 0.76
0.6 | 0.94 | 0.83 | 0.88 | 0.39 | 0.66 | 0.49 | 0.81
0.7 | 0.93 | 0.89 | 0.91 | 0.45 | 0.56 | 0.50 | 0.84
0.8 | 0.91 | 0.95 | 0.93 | 0.58 | 0.43 | 0.49 | 0.88
0.9 | 0.89 | 0.98 | 0.93 | 0.70 | 0.25 | 0.37 | 0.88
# Optional sanity check (left commented out, not run here): point-biserial
# correlation between each numeric feature and the binary target.
# from scipy.stats import pointbiserialr
# corr_length, _ = pointbiserialr(df['review_length'], df['mentions_difficulty'])
# corr_sentiment, _ = pointbiserialr(df['sentiment_score'], df['mentions_difficulty'])
# print(f"Correlation (review_length, mentions_difficulty): {corr_length:.4f}")
# print(f"Correlation (sentiment_score, mentions_difficulty): {corr_sentiment:.4f}")
Next, we model the relationship between the genres of the reviewed game and whether the review mentions difficulty.
df.columns
Index(['game_name', 'review', 'voted_up', 'timestamp_created', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_at_review', 'author_playtime_last_two_weeks', 'author_playtime_forever', 'review_length', 'difficulty_word_count', 'mentions_difficulty', 'roguelike', 'co_op', 'base_building', 'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg', 'competitive', 'first_person', 'crpg', 'multiplayer', 'action', 'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter', 'open_world', 'strategy', 'survival', 'adventure', 'crafting', 'third_person', 'turn_based', '2d', 'experience_level_experienced', 'experience_level_intermediate', 'sentiment_score'], dtype='object')
# we define the list of genre columns
genre_columns=['roguelike', 'co_op', 'base_building',
'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
'open_world', 'strategy', 'survival', 'adventure', 'crafting',
'third_person', 'turn_based', '2d']
X=df[genre_columns]
X
roguelike | co_op | base_building | soulslike | deckbuilding | puzzle | metroidvania | rpg | competitive | first_person | ... | platformer | shooter | open_world | strategy | survival | adventure | crafting | third_person | turn_based | 2d | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
42732 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
42733 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
42734 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
42735 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
42736 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
42737 rows × 26 columns
# take all the genre columns as input and mentions_difficulty as output
X=df[genre_columns]
y=df["mentions_difficulty"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
y_train.value_counts()
mentions_difficulty
0    25715
1     4200
Name: count, dtype: int64
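The training split is heavily imbalanced, roughly 6:1 against difficulty mentions. One optional refinement, not applied in the run above: stratify the split so the positive rate is identical in train and test, and fix the seed for reproducibility. A minimal sketch of that variant (not the split used for the results below):

# hypothetical variant of the split above: stratified and seeded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)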
# balance the classes by oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
y_train_resampled.value_counts()
mentions_difficulty
0    25715
1    25715
Name: count, dtype: int64
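A caveat worth flagging: plain SMOTE interpolates between a minority sample and its nearest neighbours, so on 0/1 genre flags it can emit fractional values that correspond to no real genre combination. A minimal sketch of an alternative, assuming an imbalanced-learn version recent enough to ship SMOTEN, the variant designed for all-categorical inputs:

from imblearn.over_sampling import SMOTEN

# SMOTEN picks category values by majority vote among neighbours,
# so the synthetic genre flags stay strictly 0/1
smoten = SMOTEN(random_state=42)
X_train_cat, y_train_cat = smoten.fit_resample(X_train, y_train)
print(y_train_cat.value_counts())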
# build a decision tree classifier model
# (class_weight='balanced' is largely redundant after SMOTE, since the
# resampled classes are already 50/50, but it is kept as a safeguard)
model = DecisionTreeClassifier(
    class_weight='balanced'
)
model.fit(X_train_resampled, y_train_resampled)
DecisionTreeClassifier(class_weight='balanced')
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.56 | 0.69 | 11030 |
1 | 0.19 | 0.65 | 0.30 | 1792 |
accuracy | | | 0.57 | 12822 |
macro avg | 0.55 | 0.60 | 0.49 | 12822 |
weighted avg | 0.81 | 0.57 | 0.64 | 12822 |
param_grid = {
'criterion': ['gini', 'entropy'],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 5]
}
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best F1-Score: 0.6037269912871214

Classification Report (Best Model):

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.55 | 0.69 | 11073 |
1 | 0.19 | 0.67 | 0.30 | 1749 |
accuracy | | | 0.57 | 12822 |
macro avg | 0.55 | 0.61 | 0.49 | 12822 |
weighted avg | 0.82 | 0.57 | 0.64 | 12822 |
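A methodological caveat on this grid search: it cross-validates on data that was already oversampled, so synthetic minority samples generated from one fold's originals can leak into another fold and flatter the CV scores. A sketch of the leak-free variant, assuming imblearn's Pipeline (which re-runs SMOTE inside each fold, on the training split only):

from imblearn.pipeline import Pipeline

# SMOTE is applied per fold, never to the validation portion
pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('tree', DecisionTreeClassifier()),
])
pipe_grid = GridSearchCV(
    pipe,
    {'tree__criterion': ['gini', 'entropy'], 'tree__max_depth': [5, 10, 15, None]},
    scoring='f1_weighted', cv=3, n_jobs=-1
)
pipe_grid.fit(X_train, y_train)  # note: the original, un-resampled training set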
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold:.1f}")
    # zero_division=0 silences the ill-defined-precision warnings that fire
    # when a class receives no predicted samples at extreme thresholds
    print(classification_report(y_test, y_pred_adjusted, zero_division=0))
Threshold sweep for the tuned decision tree (support: 11073 for class 0, 1749 for class 1):

Threshold | precision (0) | recall (0) | f1 (0) | precision (1) | recall (1) | f1 (1) | accuracy |
---|---|---|---|---|---|---|---|
0.1 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.2 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.3 | 0.94 | 0.14 | 0.25 | 0.15 | 0.94 | 0.25 | 0.25 |
0.4 | 0.93 | 0.35 | 0.51 | 0.17 | 0.84 | 0.28 | 0.42 |
0.5 | 0.91 | 0.55 | 0.69 | 0.19 | 0.67 | 0.30 | 0.57 |
0.6 | 0.88 | 0.88 | 0.88 | 0.24 | 0.24 | 0.24 | 0.79 |
0.7 | 0.87 | 0.97 | 0.92 | 0.27 | 0.07 | 0.11 | 0.85 |
0.8 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
0.9 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
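Instead of sweeping a fixed grid of thresholds, we could let sklearn enumerate every distinct candidate threshold and pick the F1-maximising one for the positive class. A minimal sketch using precision_recall_curve (ideally run on a validation split rather than the test set, to avoid tuning on test data):

from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
# the last precision/recall pair has no matching threshold, hence [:-1];
# the small epsilon guards against division by zero
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
best_idx = np.argmax(f1_scores)
print(f"Best threshold: {thresholds[best_idx]:.3f}, positive-class F1: {f1_scores[best_idx]:.3f}")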
# then we move to a random forest classifier to try to improve on this
model = RandomForestClassifier(
class_weight='balanced'
)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.52 | 0.66 | 11073 |
1 | 0.19 | 0.69 | 0.29 | 1749 |
accuracy | | | 0.54 | 12822 |
macro avg | 0.55 | 0.61 | 0.48 | 12822 |
weighted avg | 0.82 | 0.54 | 0.61 | 12822 |
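Before tuning the forest, it is worth checking which genre flags it actually relies on; near-uniform, low importances would already hint that the signal is weak. A quick sketch using the fitted model's impurity-based importances:

# impurity-based importance of each genre flag in the fitted forest
importances = pd.Series(model.feature_importances_, index=genre_columns)
print(importances.sort_values(ascending=False).head(10))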
# we try to tune the hyperparameters again
param_grid = {
'n_estimators': [100, 200],
'max_depth': [5, 10, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 5],
'max_features': ['sqrt', 'log2']
}
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best F1-Score: 0.6052300517078143

Classification Report (Best Model):

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.55 | 0.69 | 11073 |
1 | 0.19 | 0.67 | 0.30 | 1749 |
accuracy | | | 0.57 | 12822 |
macro avg | 0.55 | 0.61 | 0.49 | 12822 |
weighted avg | 0.82 | 0.57 | 0.64 | 12822 |
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold:.1f}")
    print(classification_report(y_test, y_pred_adjusted, zero_division=0))
Threshold sweep for the tuned random forest (support: 11073 for class 0, 1749 for class 1):

Threshold | precision (0) | recall (0) | f1 (0) | precision (1) | recall (1) | f1 (1) | accuracy |
---|---|---|---|---|---|---|---|
0.1 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.2 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.3 | 0.94 | 0.14 | 0.25 | 0.15 | 0.94 | 0.25 | 0.25 |
0.4 | 0.93 | 0.35 | 0.51 | 0.17 | 0.84 | 0.28 | 0.42 |
0.5 | 0.91 | 0.55 | 0.69 | 0.19 | 0.67 | 0.30 | 0.57 |
0.6 | 0.88 | 0.88 | 0.88 | 0.24 | 0.24 | 0.24 | 0.79 |
0.7 | 0.87 | 0.97 | 0.92 | 0.27 | 0.07 | 0.11 | 0.85 |
0.8 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
0.9 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
# we now use a logistic regression model for the same relationship
model = LogisticRegression(
max_iter=1000
)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.52 | 0.66 | 11073 |
1 | 0.19 | 0.69 | 0.29 | 1749 |
accuracy | | | 0.54 | 12822 |
macro avg | 0.55 | 0.61 | 0.48 | 12822 |
weighted avg | 0.82 | 0.54 | 0.61 | 12822 |
# try to tune this model
param_grid = {
'C': [0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga'],
}
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='f1_weighted',
cv=3,
verbose=2,
n_jobs=-1
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1-Score: 0.6054754679057912

Classification Report (Best Model):

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.55 | 0.69 | 11073 |
1 | 0.19 | 0.67 | 0.30 | 1749 |
accuracy | | | 0.57 | 12822 |
macro avg | 0.55 | 0.61 | 0.49 | 12822 |
weighted avg | 0.82 | 0.57 | 0.64 | 12822 |
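Since the best model landed on an l1 penalty, uninformative genre coefficients are driven exactly to zero; inspecting the survivors shows which genres (if any) shift the log-odds of a difficulty mention. A short sketch:

# log-odds contribution of each genre flag; l1 regularisation zeroes out
# the uninformative ones
coefs = pd.Series(best_model.coef_[0], index=genre_columns).sort_values()
print(coefs[coefs != 0])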
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold:.1f}")
    print(classification_report(y_test, y_pred_adjusted, zero_division=0))
Threshold sweep for the tuned logistic regression (support: 11073 for class 0, 1749 for class 1):

Threshold | precision (0) | recall (0) | f1 (0) | precision (1) | recall (1) | f1 (1) | accuracy |
---|---|---|---|---|---|---|---|
0.1 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.2 | 0.00 | 0.00 | 0.00 | 0.14 | 1.00 | 0.24 | 0.14 |
0.3 | 0.93 | 0.10 | 0.18 | 0.14 | 0.95 | 0.25 | 0.22 |
0.4 | 0.93 | 0.35 | 0.51 | 0.17 | 0.83 | 0.28 | 0.41 |
0.5 | 0.91 | 0.55 | 0.69 | 0.19 | 0.67 | 0.30 | 0.57 |
0.6 | 0.88 | 0.88 | 0.88 | 0.24 | 0.24 | 0.24 | 0.79 |
0.7 | 0.87 | 0.97 | 0.92 | 0.27 | 0.07 | 0.11 | 0.85 |
0.8 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
0.9 | 0.86 | 1.00 | 0.93 | 0.00 | 0.00 | 0.00 | 0.86 |
All three models produce nearly identical results even after tuning: roughly 0.57 accuracy at the default threshold and a positive-class F1 that never climbs much above 0.30. We conclude that the data does not contain strong enough evidence to model this relationship; genre flags alone do not predict whether a review mentions difficulty.
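As a sanity check on that conclusion, we could quantify how much information each genre flag carries about the target on its own. A minimal sketch using sklearn's mutual_info_classif; near-zero scores across the board would corroborate the weak-signal reading:

from sklearn.feature_selection import mutual_info_classif

# mutual information between each binary genre flag and mentions_difficulty;
# discrete_features=True because all inputs are 0/1 indicators
mi = mutual_info_classif(X, y, discrete_features=True, random_state=42)
mi_scores = pd.Series(mi, index=genre_columns).sort_values(ascending=False)
print(mi_scores.head(10))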