In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
In [60]:
# we read in the csv with the newly extracted features
df=pd.read_csv("featured_reviews.csv",parse_dates=["timestamp_created"])
In [64]:
df.head()
Out[64]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever review_length ... 2d crpg sandbox deckbuilding survival strategy shooter experience_level_experienced experience_level_intermediate sentiment_score
0 elden_ring love game much someone wants collect everythin... 1 2024-10-18 18:12:33 326 16 8977 3957 8988 100 ... 0 0 0 0 0 0 0 1 0 0.6369
1 elden_ring roll 1 2024-10-18 18:10:09 11 2 2422 1085 2513 4 ... 0 0 0 0 0 0 0 0 0 0.0000
2 elden_ring laterally perfect every way bad thing optimiza... 1 2024-10-18 16:53:57 0 1 350 492 492 55 ... 0 0 0 0 0 0 0 0 0 0.4215
3 elden_ring try finger hole 1 2024-10-18 16:44:56 0 1 6456 80 6536 15 ... 0 0 0 0 0 0 0 0 1 0.0000
4 elden_ring damn damn explain piece art pure phenomenal 1 2024-10-18 16:29:53 51 3 5563 1424 5563 43 ... 0 0 0 0 0 0 0 1 0 -0.6597

5 rows × 41 columns

We will try to model whether a review is positive or negative (voted_up) based on the content of the review.

In [8]:
# we define our features and target
y = df['voted_up']
X = df['review']
In [9]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['voted_up'], test_size=0.2
)

BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained transformer-based language model designed to understand text context bidirectionally. We will leverage it for text classification by fine-tuning it on our dataset to predict whether a review is positive or negative (voted_up).

Each review was:

  • Split into tokens
  • Converted into unique token IDs (input_ids)
  • Padded or truncated to a maximum length of 128 tokens
  • Paired with an attention_mask indicating which tokens should be attended to

We created a ReviewDataset class to manage the tokenized inputs and corresponding labels.
The dataset was then split into training and testing sets (X_train, y_train, X_test, y_test).
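
As a quick illustration of this preprocessing (a minimal sketch on a made-up review string, not a row from our dataset), the snippet below shows how the bert-base-uncased tokenizer turns raw text into input_ids and an attention_mask padded to 128 positions:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# hypothetical review text, used only to illustrate the transformation
sample = "love game much fun"

encoding = tokenizer.encode_plus(
    sample,
    add_special_tokens=True,   # adds [CLS] and [SEP]
    max_length=128,
    padding='max_length',      # pad with zeros up to 128 positions
    truncation=True,
    return_tensors='pt'
)

# first few tokens, the tensor shape, and the start of the attention mask
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0][:8].tolist()))
print(encoding['input_ids'].shape)              # torch.Size([1, 128])
print(encoding['attention_mask'][0][:8])        # 1s for real tokens, 0s for padding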

In [766]:
from transformers import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)
In [780]:
# raw data before transformation ->
print("Original Train Data (First 5 Rows):")
raw_sample = pd.DataFrame({
    'review': X_train.iloc[:5],
    'voted_up': y_train.iloc[:5]
})
print(raw_sample)
Original Train Data (First 5 Rows):
                                                  review  voted_up
1288                                    yeah pretty good         1
31280  well thought made game complexity slowly build...         1
39225                                          hyvä peli         1
17598  god love game played bg3 need good play game l...         1
31369                          1010 best game ive played         1
In [776]:
# Display transformed data (tokenized)
def display_transformed_data(dataset, n=5):
    """
    Display the transformed review data (tokenized).
    """
    print("\nTransformed Train Data (Tokenized):")
    transformed_samples = []
    for i in range(n):
        sample = dataset[i]
        transformed_samples.append({
            'input_ids': sample['input_ids'].tolist(),
            'attention_mask': sample['attention_mask'].tolist(),
            'label': sample['label'].item()
        })
    transformed_df = pd.DataFrame(transformed_samples)
    print(transformed_df)

# first 5 samples from the transformed dataset ->
display_transformed_data(train_dataset)
Transformed Train Data (Tokenized):
                                           input_ids  \
0  [101, 3398, 3492, 2204, 102, 0, 0, 0, 0, 0, 0,...   
1  [101, 2092, 2245, 2081, 2208, 11619, 3254, 164...   
2  [101, 1044, 2100, 3567, 21877, 3669, 102, 0, 0...   
3  [101, 2643, 2293, 2208, 2209, 1038, 2290, 2509...   
4  [101, 7886, 2692, 2190, 2208, 4921, 2063, 2209...   

                                      attention_mask  label  
0  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
2  [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...      1  
In [17]:
from torch.utils.data import DataLoader

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
In [19]:
# Load pre-trained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [21]:
optimizer = AdamW(model.parameters(), lr=2e-5)
In [23]:
# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Move data to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
Epoch 1, Loss: 0.2058819322121209
Epoch 2, Loss: 0.13247972990714788
Epoch 3, Loss: 0.07910977544507167
In [25]:
# Evaluation loop
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, axis=1)
        
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Calculate metrics
print("Accuracy:", accuracy_score(y_true, y_preds))
print("F1-Score:", f1_score(y_true, y_preds))
print("Classification Report:")
print(classification_report(y_true, y_preds))
Accuracy: 0.934370613008891
F1-Score: 0.9635785236642213
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       826
           1       0.97      0.96      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.81      0.82      0.82      8548
weighted avg       0.94      0.93      0.94      8548


We will now add more features about each review, namely review_length and sentiment_score.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],  # include the text plus the two numerical features
    df['voted_up'], 
    test_size=0.2, 
)
In [16]:
# Display raw data before transformation
X_test.head()
Out[16]:
review review_length sentiment_score
23404 h1this game devolved rubbish playedh1 fan talo... 1620 0.7303
14647 immersive story line excellent fight mechinics... 62 0.7351
24892 greatest 2d platformer ever made 32 0.6369
32911 dont enough one thing lose simulator game garb... 57 -0.3182
27897 play game interstellar music background 55 aud... 60 0.5994
In [18]:
# Columns to normalize
numerical_cols = ['review_length', 'sentiment_score']

scaler = MinMaxScaler()

# Fit on training data and transform both train and test sets
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
In [20]:
X_train.head()
Out[20]:
review review_length sentiment_score
26411 objectively beautiful tremendously fun 1010 0.005251 0.908691
12658 really love game demon souls bloodborne next 0.005376 0.834883
35651 really fun game great story line goals players... 0.011503 0.943744
12607 lots talk people needing starship enterprise r... 0.088272 0.962346
24936 great story doesnt intrude gameplay amazing ga... 0.031133 0.967047

Similar to the last model, we created a custom Dataset class (ReviewDataset) that includes:

  • Text Features: Tokenized input IDs and attention masks
  • Numerical Features: Normalized review_length and sentiment_score
  • Labels: voted_up

The key difference is that the __getitem__ method now returns both the tokenized text features and the numerical features, making the dataset compatible with the combined BERT model.
In [20]:
class ReviewDataset(Dataset):
    def __init__(self, texts, numerical_features, labels):
        self.texts = texts
        self.numerical_features = numerical_features
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
        label = self.labels.iloc[idx]
        
        # Tokenize the text
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'numerical_features': num_features,
            'label': torch.tensor(label, dtype=torch.long)
        }
In [22]:
# Create datasets
train_dataset = ReviewDataset(
    texts=X_train['review'],
    numerical_features=X_train[numerical_cols],
    labels=y_train
)

test_dataset = ReviewDataset(
    texts=X_test['review'],
    numerical_features=X_test[numerical_cols],
    labels=y_test
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
In [63]:
class CustomBERTModel(nn.Module):
    def __init__(self, bert_model):
        super(CustomBERTModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(768 + len(numerical_cols), 2)  # Adjusted for the number of numerical features
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, numerical_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  
        combined = torch.cat((cls_output, numerical_features), dim=1)  # Concatenated with numerical features
        logits = self.fc(self.dropout(combined))
        return logits
In [69]:
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

model = CustomBERTModel(bert_model).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        loss = nn.CrossEntropyLoss()(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
C:\Users\aniru\AppData\Local\Temp\ipykernel_6700\335967104.py:12: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
Epoch 1, Loss: 0.20206833520540718
Epoch 2, Loss: 0.1320957051658507
Epoch 3, Loss: 0.0781316688128422
In [71]:
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        preds = torch.argmax(logits, axis=1)
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Evaluate
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_true, y_preds))
C:\Users\aniru\AppData\Local\Temp\ipykernel_6700\335967104.py:12: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       866
           1       0.96      0.97      0.96      7682

    accuracy                           0.93      8548
   macro avg       0.82      0.80      0.81      8548
weighted avg       0.93      0.93      0.93      8548


We will now use a LightGBM classifier to model the same relationship.

The review data must first be transformed so it can be used by the classifier: the raw review text is converted into numerical features using TF-IDF (Term Frequency–Inverse Document Frequency), which lets the text be represented numerically for machine learning models.
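
As a toy illustration (a minimal sketch on two made-up sentences, not on our reviews), TF-IDF assigns each word a weight that is high when the word is frequent in a document but rare across the corpus, so shared words end up with lower scores than distinctive ones:

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# two hypothetical documents, used only to illustrate TF-IDF weighting
docs = ["great game great story", "boring game bad story"]

toy_vectorizer = TfidfVectorizer()
tfidf_matrix = toy_vectorizer.fit_transform(docs)

# "game" and "story" appear in both documents, so they receive lower weights
# than "great", "boring" and "bad", which each occur in only one document
print(pd.DataFrame(tfidf_matrix.toarray(), columns=toy_vectorizer.get_feature_names_out()))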

In [22]:
numerical_cols = ['review_length', 'sentiment_score']
In [24]:
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
In [26]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],
    df['voted_up'],
    test_size=0.2
)
In [46]:
scaler = MinMaxScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])

# Fit TF-IDF on training data and transform both train and test reviews
X_train_tfidf = vectorizer.fit_transform(X_train['review'])
X_test_tfidf = vectorizer.transform(X_test['review'])

# Combine TF-IDF and scaled numerical features
import scipy
X_train_combined = scipy.sparse.hstack((X_train_tfidf, X_train_numerical))
X_test_combined = scipy.sparse.hstack((X_test_tfidf, X_test_numerical))
In [47]:
# we display the transformed data ->
tfidf_dense = pd.DataFrame(
    X_train_tfidf[:5].toarray(), 
    columns=vectorizer.get_feature_names_out() 
)
print("TF-IDF Transformed Data (First 5 Rows):")
print(tfidf_dense.head())
TF-IDF Transformed Data (First 5 Rows):
   010   10  100  1000  10010  100ing  1010   11  110  1110  ...   на   не  \
0  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
1  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
2  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
3  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   
4  0.0  0.0  0.0   0.0    0.0     0.0   0.0  0.0  0.0   0.0  ...  0.0  0.0   

    по  послушай   то   ты  что  ミxノ  ヽ_ヽ___   二つ  
0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
1  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
2  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
3  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
4  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  

[5 rows x 5000 columns]
In [50]:
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)

print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.74      0.52       855
           1       0.97      0.88      0.92      7693

    accuracy                           0.86      8548
   macro avg       0.68      0.81      0.72      8548
weighted avg       0.91      0.86      0.88      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [81]:
# import numpy as np

# y_pred_proba = model.predict_proba(X_test_combined)[:, 1]

# for threshold in np.arange(0.1, 0.6, 0.1):
#     y_pred = (y_pred_proba >= threshold).astype(int)
#     print(f"Threshold: {threshold}")
#     print(classification_report(y_test, y_pred))

To improve the model’s performance, we conducted a Grid Search over a small parameter space to find the best combination of hyperparameters. After choosing the best parameters for the model, we adjusted the decision threshold (default is 0.5) to optimize performance for different class priorities. We tested thresholds ranging from 0.1 to 0.9.

In [52]:
param_grid = {
    'learning_rate': [0.01, 0.1],         
    'n_estimators': [100, 200],           
    'max_depth': [3, 5],                  
    'subsample': [0.8, 1.0],              
    'colsample_bytree': [0.8, 1.0]        
}
In [54]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

model = LGBMClassifier(class_weight='balanced')

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=3,          # 3-fold cross-validation
    verbose=2,
    n_jobs=-1      # Use all available cores
)

# Fit grid search
grid_search.fit(X_train_combined, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.9126180657708303
In [55]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = LGBMClassifier(class_weight='balanced', **best_params)
best_model.fit(X_train_combined, y_train)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
[LightGBM] [Info] Number of positive: 30780, number of negative: 3409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106992
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3057
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Out[55]:
LGBMClassifier(class_weight='balanced', max_depth=5, n_estimators=200,
               subsample=0.8)
In [56]:
#then we do threshold adjustment on the tuned model
y_val_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    y_val_pred = (y_val_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_val_pred)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_val_pred))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.82      0.20      0.32       855
           1       0.92      1.00      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.87      0.60      0.64      8548
weighted avg       0.91      0.92      0.89      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.66      0.39      0.49       855
           1       0.94      0.98      0.96      7693

    accuracy                           0.92      8548
   macro avg       0.80      0.69      0.72      8548
weighted avg       0.91      0.92      0.91      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.53      0.54      0.54       855
           1       0.95      0.95      0.95      7693

    accuracy                           0.91      8548
   macro avg       0.74      0.74      0.74      8548
weighted avg       0.91      0.91      0.91      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.43      0.66      0.52       855
           1       0.96      0.90      0.93      7693

    accuracy                           0.88      8548
   macro avg       0.69      0.78      0.72      8548
weighted avg       0.91      0.88      0.89      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.37      0.72      0.49       855
           1       0.97      0.86      0.91      7693

    accuracy                           0.85      8548
   macro avg       0.67      0.79      0.70      8548
weighted avg       0.91      0.85      0.87      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.27      0.84      0.41       855
           1       0.98      0.75      0.85      7693

    accuracy                           0.76      8548
   macro avg       0.62      0.80      0.63      8548
weighted avg       0.91      0.76      0.80      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.21      0.92      0.34       855
           1       0.99      0.61      0.76      7693

    accuracy                           0.64      8548
   macro avg       0.60      0.77      0.55      8548
weighted avg       0.91      0.64      0.71      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.17      0.98      0.30       855
           1       0.99      0.48      0.65      7693

    accuracy                           0.53      8548
   macro avg       0.58      0.73      0.47      8548
weighted avg       0.91      0.53      0.62      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.13      0.99      0.23       855
           1       1.00      0.28      0.44      7693

    accuracy                           0.35      8548
   macro avg       0.56      0.64      0.34      8548
weighted avg       0.91      0.35      0.42      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [90]:
# select the best threshold (0.3 gives the best balance between the two classes in the reports above)
best_threshold=0.3
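
The choice above was made by eye from the reports; the selection could also be automated. A minimal sketch (assuming y_val_proba and y_test from the cells above, and using macro-averaged F1 as the selection metric) might look like:

import numpy as np
from sklearn.metrics import f1_score

# pick the threshold that maximizes macro-averaged F1 on the held-out data
candidate_thresholds = np.arange(0.1, 1.0, 0.1)
scores = {
    t: f1_score(y_test, (y_val_proba >= t).astype(int), average='macro')
    for t in candidate_thresholds
}
best_threshold = max(scores, key=scores.get)
print(f"Best threshold by macro F1: {best_threshold:.1f} "
      f"(macro F1 = {scores[best_threshold]:.3f})")
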
In [92]:
final_test_pred = (y_val_proba >= best_threshold).astype(int)

# Evaluate using classification metrics
print("Classification Report with Optimal Threshold:")
print(classification_report(y_test, final_test_pred))
Classification Report with Optimal Threshold:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       845
           1       0.95      0.95      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.76      0.76      0.76      8548
weighted avg       0.91      0.91      0.91      8548

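A confusion matrix makes the trade-off at this threshold explicit; a minimal sketch, reusing y_test and final_test_pred from the cell above:

from sklearn.metrics import confusion_matrix

# rows are the true classes (0 = not voted up, 1 = voted up), columns are the predictions
print(confusion_matrix(y_test, final_test_pred))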
Next, we model the same relationship with a stacking ensemble that uses LightGBM, Logistic Regression and Multinomial Naive Bayes as base learners, with Logistic Regression as the meta-model.

In [94]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
In [96]:
# we define the base models: LightGBM, Logistic Regression and Multinomial Naive Bayes
base_models = [
    ('lightgbm', LGBMClassifier(class_weight='balanced', random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000)),
    ('nb', MultinomialNB())
]
In [98]:
# Logistic Regression serves as the meta-model that combines the base models' predictions
meta_model = LogisticRegression()
In [100]:
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3 
)
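With cv=3, the StackingClassifier trains each base model on cross-validation folds and feeds their out-of-fold predictions to the meta-model, so the meta-model is not fitted on predictions the base models have already memorised. Two optional settings worth knowing about, shown here as a sketch rather than as part of the model fitted below, are stack_method and passthrough:

# variant of the stacked model: stack class probabilities explicitly and also
# pass the original features through to the meta-model
stacked_model_passthrough = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
    stack_method='predict_proba',  # stack each base model's predicted probabilities
    passthrough=True               # append the raw features to the stacked inputs
)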
In [102]:
# fit the ensemble model
stacked_model.fit(X_train_combined, y_train)
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20513, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71475
[LightGBM] [Info] Number of data points in the train set: 22792, number of used features: 2318
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20514, number of negative: 2279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71744
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20513, number of negative: 2280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064723 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70027
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Out[102]:
StackingClassifier(cv=3,
                   estimators=[('lightgbm',
                                LGBMClassifier(class_weight='balanced',
                                               random_state=42)),
                               ('logreg', LogisticRegression(max_iter=1000)),
                               ('nb', MultinomialNB())],
                   final_estimator=LogisticRegression())
In [104]:
y_pred = stacked_model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.51      0.61       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.85      0.75      0.79      8548
weighted avg       0.93      0.93      0.93      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
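Because the meta-model is a plain Logistic Regression, its coefficients give a rough picture of how strongly each base learner influences the final decision. A minimal sketch, assuming the fitted stacked_model above; with binary labels and probability stacking, each base model contributes one column, so there is one coefficient per base learner:

# inspect how the meta-model weights each base learner
meta = stacked_model.final_estimator_
for (name, _), coef in zip(base_models, meta.coef_[0]):
    print(f"{name}: {coef:.3f}")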
In [106]:
param_grid = {
    'lightgbm__n_estimators': [100, 150],       # Number of boosting iterations
    'lightgbm__max_depth': [4, 6],             # Maximum depth of trees
    'lightgbm__learning_rate': [0.05, 0.1],    # Learning rate
    'final_estimator__C': [0.1, 1, 10]         # Regularization strength for Logistic Regression (meta-model)
}
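The double-underscore prefixes route each setting to the corresponding component of the stack (a named base model or final_estimator). The full list of tunable names can be checked with get_params, for example:

# list the nested parameter names the stacked model accepts
print(sorted(stacked_model.get_params().keys()))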
In [108]:
grid_search = GridSearchCV(
    estimator=stacked_model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=3,
    verbose=2,
    n_jobs=-1
)
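Note that scoring='f1' optimises the F1-score of the positive class only. Since negative reviews are the rare class here, a variant worth trying (a sketch, not what was run below) is to optimise the macro-averaged F1 instead:

# same search, but scored on macro-averaged F1 so both classes count equally
grid_search_macro = GridSearchCV(
    estimator=stacked_model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1
)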
In [110]:
grid_search.fit(X_train_combined, y_train)

# Best parameters and performance
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

# Use the best model
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20513, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71475
[LightGBM] [Info] Number of data points in the train set: 22792, number of used features: 2318
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20514, number of negative: 2279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71744
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 20513, number of negative: 2280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70027
[LightGBM] [Info] Number of data points in the train set: 22793, number of used features: 2253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'final_estimator__C': 10, 'lightgbm__learning_rate': 0.1, 'lightgbm__max_depth': 6, 'lightgbm__n_estimators': 150}
Best F1-Score: 0.9613205813271328
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.74      0.49      0.59       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548

C:\Users\aniru\anaconda3\Lib\site-packages\lightgbm\basic.py:1218: UserWarning: Converting data to scipy sparse matrix.
  _log_warning("Converting data to scipy sparse matrix.")
In [111]:
from sklearn.metrics import classification_report

# Predict probabilities for threshold adjustment
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.85      0.21      0.34       845
           1       0.92      1.00      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.89      0.60      0.65      8548
weighted avg       0.91      0.92      0.90      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.80      0.29      0.43       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.87      0.64      0.69      8548
weighted avg       0.92      0.92      0.91      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.79      0.37      0.50       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.86      0.68      0.73      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.77      0.44      0.56       845
           1       0.94      0.99      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.86      0.71      0.76      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.74      0.49      0.59       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.71      0.54      0.62       845
           1       0.95      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.83      0.76      0.79      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.65      0.62      0.63       845
           1       0.96      0.96      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.79      0.80      8548
weighted avg       0.93      0.93      0.93      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.57      0.70      0.63       845
           1       0.97      0.94      0.95      7703

    accuracy                           0.92      8548
   macro avg       0.77      0.82      0.79      8548
weighted avg       0.93      0.92      0.92      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.41      0.79      0.54       845
           1       0.97      0.88      0.92      7703

    accuracy                           0.87      8548
   macro avg       0.69      0.83      0.73      8548
weighted avg       0.92      0.87      0.89      8548

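The sweep above can also be summarised in a single plot; a minimal sketch that reuses thresholds and y_pred_proba from the previous cell and plots macro-averaged F1 against the decision threshold:

# plot macro-averaged F1 as a function of the decision threshold for the tuned model
macro_f1 = [
    f1_score(y_test, (y_pred_proba >= t).astype(int), average='macro')
    for t in thresholds
]
plt.plot(thresholds, macro_f1, marker='o')
plt.xlabel('Decision threshold')
plt.ylabel('Macro-averaged F1')
plt.title('Threshold sweep for the tuned stacked model')
plt.show()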
Next, we build an ensemble by bagging the LightGBM classifier and model the same relationship.

In [116]:
from sklearn.ensemble import BaggingClassifier

base_estimator = LGBMClassifier(class_weight='balanced')

# Define the BaggingClassifier
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=10,  # Default number of estimators
    max_samples=1.0,  # Use all samples
    max_features=1.0,  # Use all features
    random_state=42
)

# Fit the model
bagging_model.fit(X_train_combined, y_train)

# Make predictions
y_pred = bagging_model.predict(X_test_combined)

# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501547 -> initscore=0.006190
[LightGBM] [Info] Start training from score 0.006190
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502283 -> initscore=0.009133
[LightGBM] [Info] Start training from score 0.009133
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.148996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502201 -> initscore=0.008806
[LightGBM] [Info] Start training from score 0.008806
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.142197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497975 -> initscore=-0.008098
[LightGBM] [Info] Start training from score -0.008098
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499513 -> initscore=-0.001948
[LightGBM] [Info] Start training from score -0.001948
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494604 -> initscore=-0.021587
[LightGBM] [Info] Start training from score -0.021587
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497169 -> initscore=-0.011323
[LightGBM] [Info] Start training from score -0.011323
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494125 -> initscore=-0.023502
[LightGBM] [Info] Start training from score -0.023502
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501711 -> initscore=0.006843
[LightGBM] [Info] Start training from score 0.006843
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.153181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106736
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3025
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500488 -> initscore=0.001951
[LightGBM] [Info] Start training from score 0.001951
Classification Report (Without Hyperparameter Tuning):
              precision    recall  f1-score   support

           0       0.43      0.74      0.55       845
           1       0.97      0.89      0.93      7703

    accuracy                           0.88      8548
   macro avg       0.70      0.82      0.74      8548
weighted avg       0.92      0.88      0.89      8548
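
As an aside, a bagged ensemble can also report an out-of-bag (OOB) estimate, where each sample is scored only by the estimators that did not see it during bootstrapping. A minimal sketch, assuming the same X_train_combined and y_train used above (not executed here):

In [ ]:
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier

oob_bagging = BaggingClassifier(
    estimator=LGBMClassifier(class_weight='balanced'),
    n_estimators=10,
    oob_score=True,   # score each sample with the estimators that did not train on it
    random_state=42
)
oob_bagging.fit(X_train_combined, y_train)
print("OOB accuracy:", oob_bagging.oob_score_)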

In [118]:
# tune hyperparameters of a bagged LightGBM model with grid search
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

# base learner: LightGBM with class weighting to offset the imbalance
base_estimator = LGBMClassifier(class_weight='balanced')

# Define the BaggingClassifier around the base learner
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    random_state=42
)

param_grid = {
    'n_estimators': [5, 10],          
    'max_samples': [0.6, 0.8],       
    'max_features': [0.6, 0.8]
}

grid_search = GridSearchCV(
    estimator=bagging_model,
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=3,                  
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_combined, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63766
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1815
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497755 -> initscore=-0.008981
[LightGBM] [Info] Start training from score -0.008981
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62858
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1807
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501270 -> initscore=0.005080
[LightGBM] [Info] Start training from score 0.005080
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65062
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1790
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501270 -> initscore=0.005080
[LightGBM] [Info] Start training from score 0.005080
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62425
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1832
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.512621 -> initscore=0.050495
[LightGBM] [Info] Start training from score 0.050495
[LightGBM] [Info] Number of positive: 30770, number of negative: 3419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64527
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 1825
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504825 -> initscore=0.019301
[LightGBM] [Info] Start training from score 0.019301
Best Parameters: {'max_features': 0.6, 'max_samples': 0.6, 'n_estimators': 5}
Best F1-Score: 0.8968175296168622
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.45      0.69      0.54       845
           1       0.96      0.91      0.93      7703

    accuracy                           0.89      8548
   macro avg       0.71      0.80      0.74      8548
weighted avg       0.91      0.89      0.90      8548

In [119]:
y_pred_proba = best_model.predict_proba(X_test_combined)[:, 1]

# Adjust thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred_adjusted)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")

# Evaluate with the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.90      0.10      0.18       845
           1       0.91      1.00      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.91      0.55      0.56      8548
weighted avg       0.91      0.91      0.88      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.79      0.28      0.41       845
           1       0.93      0.99      0.96      7703

    accuracy                           0.92      8548
   macro avg       0.86      0.64      0.69      8548
weighted avg       0.91      0.92      0.90      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.68      0.46      0.55       845
           1       0.94      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.72      0.75      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.55      0.58      0.57       845
           1       0.95      0.95      0.95      7703

    accuracy                           0.91      8548
   macro avg       0.75      0.77      0.76      8548
weighted avg       0.91      0.91      0.91      8548

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.45      0.69      0.54       845
           1       0.96      0.91      0.93      7703

    accuracy                           0.89      8548
   macro avg       0.71      0.80      0.74      8548
weighted avg       0.91      0.89      0.90      8548

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.34      0.81      0.48       845
           1       0.98      0.83      0.90      7703

    accuracy                           0.83      8548
   macro avg       0.66      0.82      0.69      8548
weighted avg       0.91      0.83      0.85      8548

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.24      0.91      0.38       845
           1       0.99      0.68      0.80      7703

    accuracy                           0.70      8548
   macro avg       0.61      0.79      0.59      8548
weighted avg       0.91      0.70      0.76      8548

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.19      0.97      0.31       845
           1       0.99      0.54      0.70      7703

    accuracy                           0.58      8548
   macro avg       0.59      0.75      0.50      8548
weighted avg       0.91      0.58      0.66      8548

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.13      0.99      0.23       845
           1       1.00      0.27      0.42      7703

    accuracy                           0.34      8548
   macro avg       0.56      0.63      0.33      8548
weighted avg       0.91      0.34      0.40      8548

Best Threshold: 0.30000000000000004, Best F1-Score: 0.9593827711534783
Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.68      0.46      0.55       845
           1       0.94      0.98      0.96      7703

    accuracy                           0.93      8548
   macro avg       0.81      0.72      0.75      8548
weighted avg       0.92      0.93      0.92      8548
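
The 0.1-step grid above is fairly coarse; precision_recall_curve can scan every candidate threshold instead. A small sketch, assuming y_test and y_pred_proba from the cell above (not executed here):

In [ ]:
from sklearn.metrics import precision_recall_curve

precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)  # guard against 0/0
best_idx = f1_scores[:-1].argmax()  # pr_thresholds has one entry fewer than precision/recall
print("Best threshold:", pr_thresholds[best_idx], "F1:", f1_scores[best_idx])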

In [ ]:
 

Next, we will try to predict whether a review mentions difficulty from review metadata. Specifically, we model mentions_difficulty as a function of review_length and sentiment_score.

In [122]:
df.columns
Out[122]:
Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate', 'sentiment_score'],
      dtype='object')
In [124]:
# we take review_length and sentiment_score as our inputs and mentions_difficulty as our output
features = [
    'review_length', 
    'sentiment_score',   
]
In [126]:
X = df[features]
y = df['mentions_difficulty']
In [128]:
X.head()
Out[128]:
   review_length  sentiment_score
0            100           0.6369
1              4           0.0000
2             55           0.4215
3             15           0.0000
4             43          -0.6597
In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
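
With such an imbalanced target it is usually safer to stratify the split (and fix the random seed) so both sets keep the same class ratio; a minimal variant of the cell above, shown for reference only:

In [ ]:
# stratify=y keeps the 0/1 ratio identical in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)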
In [132]:
# both features are numerical, so we scale them to the [0, 1] range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [134]:
# repeat the scaling, but keep the results as DataFrames so the column names are preserved
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
In [136]:
X_train_scaled
Out[136]:
       review_length  sentiment_score
0           0.007698         0.746975
1           0.000929         0.228823
2           0.001460         0.474197
3           0.006902         0.500000
4           0.001195         0.500000
...              ...              ...
29910       0.001460         0.792979
29911       0.014335         0.903740
29912       0.005840         0.625013
29913       0.001062         0.901040
29914       0.013008         0.062456

29915 rows × 2 columns

In [138]:
y.value_counts()
Out[138]:
mentions_difficulty
0    36788
1     5949
Name: count, dtype: int64

To combat the class imbalance, we resample the training data: we first undersample the majority class with RandomUnderSampler, then oversample the minority class with SMOTE.

In [140]:
undersampler = RandomUnderSampler(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)
y_train_resampled.value_counts()
Out[140]:
mentions_difficulty
0    20880
1     4176
Name: count, dtype: int64
In [142]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_resampled, y_train_resampled)
y_train_resampled.value_counts()
Out[142]:
mentions_difficulty
0    20880
1    20880
Name: count, dtype: int64
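
Resampling before cross-validation lets synthetic or duplicated samples leak into validation folds. As an alternative, a sketch (assuming the variables defined above) that wraps the samplers and the classifier in an imblearn Pipeline, so resampling is applied only to the training part of each fold:

In [ ]:
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

resample_pipeline = Pipeline([
    ('under', RandomUnderSampler(sampling_strategy=0.2)),
    ('over', SMOTE()),
    ('clf', LGBMClassifier(class_weight='balanced'))
])

scores = cross_val_score(resample_pipeline, X_train_scaled, y_train, cv=3, scoring='f1')
print("Cross-validated F1 (positive class):", scores.mean())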
In [146]:
# train a LightGBM model 
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Out[146]:
LGBMClassifier(class_weight='balanced')
In [148]:
y_pred = model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.80     12822
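
Since the model only sees two features, it is worth a quick check of how much each one contributes. A small sketch using LightGBM's built-in importances, assuming model and features from the cells above:

In [ ]:
importances = pd.Series(model.feature_importances_, index=features)
print(importances.sort_values(ascending=False))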

In [150]:
# run the same model but with some hyperparameter tuning
model = LGBMClassifier(class_weight='balanced', random_state=42)

param_grid = {
    'n_estimators': [50, 100],         
    'max_depth': [3, 5],             
    'learning_rate': [0.05, 0.1],     
    'colsample_bytree': [0.8, 1.0],   
}


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best F1-Score: 0.7800546322105028
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.79     12822

In [151]:
# Predict probabilities for the positive class
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Test thresholds from 0.1 to 0.9
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold = 0.5  # Default threshold
best_f1 = 0

for threshold in thresholds:
    # Adjust predictions based on the current threshold
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    
    # Evaluate using F1-score
    f1 = f1_score(y_test, y_pred_adjusted)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
    
    # Track the best threshold
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# Display the best threshold and corresponding F1-score
print(f"Best Threshold: {best_threshold}, Best F1-Score: {best_f1}")

# Evaluate the best threshold
y_pred_best_threshold = (y_pred_proba >= best_threshold).astype(int)
print("Classification Report with Best Threshold:")
print(classification_report(y_test, y_pred_best_threshold))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.24      0.39     11049
           1       0.17      0.99      0.30      1773

    accuracy                           0.35     12822
   macro avg       0.58      0.62      0.34     12822
weighted avg       0.88      0.35      0.38     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.99      0.46      0.62     11049
           1       0.22      0.96      0.36      1773

    accuracy                           0.53     12822
   macro avg       0.60      0.71      0.49     12822
weighted avg       0.88      0.53      0.59     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.98      0.58      0.72     11049
           1       0.26      0.91      0.40      1773

    accuracy                           0.62     12822
   macro avg       0.62      0.74      0.56     12822
weighted avg       0.88      0.62      0.68     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.67      0.79     11049
           1       0.29      0.85      0.43      1773

    accuracy                           0.69     12822
   macro avg       0.63      0.76      0.61     12822
weighted avg       0.87      0.69      0.74     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11049
           1       0.34      0.76      0.47      1773

    accuracy                           0.76     12822
   macro avg       0.65      0.76      0.66     12822
weighted avg       0.87      0.76      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.84      0.89     11049
           1       0.40      0.65      0.49      1773

    accuracy                           0.82     12822
   macro avg       0.67      0.75      0.69     12822
weighted avg       0.86      0.82      0.83     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     11049
           1       0.52      0.48      0.50      1773

    accuracy                           0.87     12822
   macro avg       0.72      0.71      0.71     12822
weighted avg       0.86      0.87      0.87     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     11049
           1       0.61      0.38      0.47      1773

    accuracy                           0.88     12822
   macro avg       0.76      0.67      0.70     12822
weighted avg       0.87      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.99      0.93     11049
           1       0.75      0.20      0.32      1773

    accuracy                           0.88     12822
   macro avg       0.82      0.60      0.63     12822
weighted avg       0.87      0.88      0.85     12822

Best Threshold: 0.7000000000000001, Best F1-Score: 0.5038123167155425
Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     11049
           1       0.52      0.48      0.50      1773

    accuracy                           0.87     12822
   macro avg       0.72      0.71      0.71     12822
weighted avg       0.86      0.87      0.87     12822
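
If the chosen cut-off is to be reused later, a small hypothetical helper (predict_with_threshold is not part of any library) keeps the logic in one place:

In [ ]:
def predict_with_threshold(fitted_model, X, threshold=0.5):
    """Return hard labels using a custom probability cut-off."""
    return (fitted_model.predict_proba(X)[:, 1] >= threshold).astype(int)

y_pred_custom = predict_with_threshold(best_model, X_test_scaled, threshold=best_threshold)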

In [ ]:
 

We will now use an XGBoost classifier to model the same relationship.
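
For reference, scale_pos_weight is conventionally set to the ratio of negative to positive training samples. A small check, assuming y_train_resampled from above (the ratio is roughly 1.0 here because the data were already rebalanced, so the value of 2 below simply gives the positive class extra weight):

In [ ]:
neg = (y_train_resampled == 0).sum()
pos = (y_train_resampled == 1).sum()
print("Suggested scale_pos_weight:", neg / pos)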

In [156]:
from xgboost import XGBClassifier

model = XGBClassifier(
    scale_pos_weight=2,        # extra weight on the positive (minority) class
    use_label_encoder=False,   # ignored by recent XGBoost versions (hence the warning below)
    eval_metric='logloss'
)

# Fit the model
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Classification Report (Without Hyperparameter Tuning):")
print(classification_report(y_test, y_pred))
Classification Report (Without Hyperparameter Tuning):
              precision    recall  f1-score   support

           0       0.96      0.64      0.77     11049
           1       0.27      0.84      0.41      1773

    accuracy                           0.67     12822
   macro avg       0.62      0.74      0.59     12822
weighted avg       0.87      0.67      0.72     12822

C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [21:20:42] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
In [158]:
param_grid = {
    'n_estimators': [100, 200],      
    'learning_rate': [0.05, 0.1],   
    'max_depth': [4, 6],            
    'subsample': [0.8, 1.0],        
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',    
    cv=3,                     
    verbose=2,               
    n_jobs=-1                 
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 32 candidates, totalling 96 fits
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [21:23:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.7642612384937363
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.97      0.62      0.76     11049
           1       0.27      0.87      0.41      1773

    accuracy                           0.66     12822
   macro avg       0.62      0.75      0.58     12822
weighted avg       0.87      0.66      0.71     12822

In [164]:
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# sweep decision thresholds from 0.1 to 0.9 and report metrics at each one
thresholds = np.arange(0.1, 1.0, 0.1)

for threshold in thresholds:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.21      0.35     11049
           1       0.17      0.99      0.29      1773

    accuracy                           0.32     12822
   macro avg       0.58      0.60      0.32     12822
weighted avg       0.88      0.32      0.34     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.99      0.37      0.54     11049
           1       0.20      0.97      0.33      1773

    accuracy                           0.45     12822
   macro avg       0.59      0.67      0.43     12822
weighted avg       0.88      0.45      0.51     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.98      0.46      0.63     11049
           1       0.22      0.95      0.36      1773

    accuracy                           0.53     12822
   macro avg       0.60      0.71      0.49     12822
weighted avg       0.88      0.53      0.59     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.98      0.55      0.70     11049
           1       0.25      0.92      0.39      1773

    accuracy                           0.60     12822
   macro avg       0.61      0.73      0.55     12822
weighted avg       0.88      0.60      0.66     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.97      0.62      0.76     11049
           1       0.27      0.87      0.41      1773

    accuracy                           0.66     12822
   macro avg       0.62      0.75      0.58     12822
weighted avg       0.87      0.66      0.71     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.96      0.71      0.81     11049
           1       0.31      0.80      0.44      1773

    accuracy                           0.72     12822
   macro avg       0.63      0.75      0.63     12822
weighted avg       0.87      0.72      0.76     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.94      0.81      0.87     11049
           1       0.37      0.68      0.48      1773

    accuracy                           0.79     12822
   macro avg       0.65      0.75      0.67     12822
weighted avg       0.86      0.79      0.82     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     11049
           1       0.48      0.51      0.49      1773

    accuracy                           0.86     12822
   macro avg       0.70      0.71      0.70     12822
weighted avg       0.86      0.86      0.86     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     11049
           1       0.63      0.34      0.44      1773

    accuracy                           0.88     12822
   macro avg       0.76      0.66      0.69     12822
weighted avg       0.86      0.88      0.87     12822

In [ ]:
 

We will now use a stacking ensemble of XGBoost, Logistic Regression, and Gaussian Naïve Bayes to model the same relationship.

In [166]:
from sklearn.ensemble import StackingClassifier

xgb = XGBClassifier(
    scale_pos_weight=2,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,   # ignored by recent XGBoost versions
    eval_metric='logloss'
)
logreg = LogisticRegression(max_iter=1000)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('xgb', xgb), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:15:58] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.75      0.46      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822
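
Because the final estimator is a logistic regression, its coefficients give a rough sense of how much weight each base model's predicted probability receives. A small sketch, assuming the fitted stacking_model above (with the default predict_proba stacking, each base learner contributes one probability column in the binary case):

In [ ]:
for name, coef in zip(['xgb', 'logreg', 'nb'], stacking_model.final_estimator_.coef_[0]):
    print(f"{name}: {coef:.3f}")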

In [168]:
param_grid = {
    'xgb__n_estimators': [100, 150],     
    'xgb__max_depth': [4, 6],            
    'xgb__learning_rate': [0.05, 0.1],   
    'final_estimator__C': [0.1, 1, 10]
}
In [170]:
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1',  
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
C:\Users\aniru\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:22:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best Parameters: {'final_estimator__C': 1, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 150}
Best Score: 0.7598426725216721
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11049
           1       0.32      0.74      0.45      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822

We again sweep decision thresholds between 0.1 and 0.9 to trade precision against recall for class 1.

In [176]:
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Adjust thresholds and evaluate
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.21      0.35     11049
           1       0.17      0.99      0.29      1773

    accuracy                           0.32     12822
   macro avg       0.58      0.60      0.32     12822
weighted avg       0.88      0.32      0.34     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.98      0.45      0.61     11049
           1       0.21      0.94      0.35      1773

    accuracy                           0.52     12822
   macro avg       0.60      0.69      0.48     12822
weighted avg       0.87      0.52      0.58     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.97      0.57      0.72     11049
           1       0.25      0.88      0.39      1773

    accuracy                           0.61     12822
   macro avg       0.61      0.73      0.55     12822
weighted avg       0.87      0.61      0.67     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.68      0.80     11049
           1       0.29      0.80      0.43      1773

    accuracy                           0.70     12822
   macro avg       0.62      0.74      0.61     12822
weighted avg       0.86      0.70      0.75     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11049
           1       0.32      0.74      0.45      1773

    accuracy                           0.75     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.75      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.83      0.88     11049
           1       0.38      0.65      0.48      1773

    accuracy                           0.80     12822
   macro avg       0.66      0.74      0.68     12822
weighted avg       0.86      0.80      0.82     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     11049
           1       0.46      0.53      0.49      1773

    accuracy                           0.85     12822
   macro avg       0.69      0.72      0.70     12822
weighted avg       0.86      0.85      0.85     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     11049
           1       0.58      0.39      0.47      1773

    accuracy                           0.88     12822
   macro avg       0.74      0.67      0.70     12822
weighted avg       0.86      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     11049
           1       0.71      0.24      0.36      1773

    accuracy                           0.88     12822
   macro avg       0.80      0.61      0.65     12822
weighted avg       0.87      0.88      0.86     12822

In [ ]:
 

Finally, we use a stacking ensemble of LightGBM, Logistic Regression, and Gaussian Naïve Bayes to model the same relationship.

In [179]:
lgbm = LGBMClassifier(
    class_weight='balanced',  # Automatically handles class imbalance
    n_estimators=100,         
    max_depth=6,              
    learning_rate=0.1,        
    subsample=0.8,            
    colsample_bytree=0.8,     
    random_state=42
)

logreg = LogisticRegression(max_iter=1000, random_state=42)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('lgbm', lgbm), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

In [181]:
param_grid = {
    'lgbm__n_estimators': [100, 150],       
    'lgbm__max_depth': [4, 6],             
    'lgbm__learning_rate': [0.05, 0.1],    
    'final_estimator__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1', 
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM] [Info] Number of positive: 20880, number of negative: 20880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41760, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 13920, number of negative: 13920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best Parameters: {'final_estimator__C': 10, 'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 6, 'lgbm__n_estimators': 150}
Best Score: 0.7967459488410441
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

In [182]:
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.99      0.39      0.56     11049
           1       0.20      0.97      0.34      1773

    accuracy                           0.47     12822
   macro avg       0.60      0.68      0.45     12822
weighted avg       0.88      0.47      0.53     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.98      0.58      0.73     11049
           1       0.26      0.91      0.40      1773

    accuracy                           0.63     12822
   macro avg       0.62      0.75      0.56     12822
weighted avg       0.88      0.63      0.68     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.97      0.65      0.78     11049
           1       0.28      0.86      0.42      1773

    accuracy                           0.68     12822
   macro avg       0.62      0.76      0.60     12822
weighted avg       0.87      0.68      0.73     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     11049
           1       0.31      0.81      0.44      1773

    accuracy                           0.72     12822
   macro avg       0.63      0.76      0.63     12822
weighted avg       0.87      0.72      0.76     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11049
           1       0.33      0.76      0.46      1773

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.94      0.83      0.88     11049
           1       0.39      0.66      0.49      1773

    accuracy                           0.81     12822
   macro avg       0.66      0.75      0.69     12822
weighted avg       0.86      0.81      0.83     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     11049
           1       0.45      0.56      0.50      1773

    accuracy                           0.84     12822
   macro avg       0.69      0.72      0.70     12822
weighted avg       0.86      0.84      0.85     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     11049
           1       0.58      0.43      0.49      1773

    accuracy                           0.88     12822
   macro avg       0.74      0.69      0.71     12822
weighted avg       0.87      0.88      0.87     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     11049
           1       0.70      0.25      0.37      1773

    accuracy                           0.88     12822
   macro avg       0.79      0.62      0.65     12822
weighted avg       0.86      0.88      0.86     12822
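
Rather than reading the best operating point off the sweep above, the threshold that maximises F1 for the minority class can also be picked programmatically. A minimal sketch (assumed, not part of the original run) using precision_recall_curve on the probabilities already computed:

In [ ]:
from sklearn.metrics import precision_recall_curve

# sketch: choose the probability threshold that maximises F1 for class 1
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)  # guard against 0/0
best_idx = f1_scores[:-1].argmax()  # the last precision/recall pair has no threshold
print(f"Best threshold: {thresholds[best_idx]:.2f}, F1: {f1_scores[best_idx]:.3f}")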

In [66]:
# from scipy.stats import pointbiserialr

# corr_length, _ = pointbiserialr(df['review_length'], df['mentions_difficulty'])
# corr_sentiment, _ = pointbiserialr(df['sentiment_score'], df['mentions_difficulty'])

# print(f"Correlation (review_length, mentions_difficulty): {corr_length:.4f}")
# print(f"Correlation (sentiment_score, mentions_difficulty): {corr_sentiment:.4f}")

Next, we try to model the relationship between the genres of the reviewed game and whether the review mentions difficulty.

In [185]:
df.columns
Out[185]:
Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate', 'sentiment_score'],
      dtype='object')
In [68]:
# we define the list of genre columns
genre_columns=['roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d']
In [72]:
X=df[genre_columns]
X
Out[72]:
roguelike co_op base_building soulslike deckbuilding puzzle metroidvania rpg competitive first_person ... platformer shooter open_world strategy survival adventure crafting third_person turn_based 2d
0 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
1 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
2 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
3 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
4 0 0 0 1 0 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42732 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42733 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42734 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42735 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0
42736 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 0

42737 rows × 26 columns

In [201]:
# take all the genre columns as input and mentions_difficulty as output
X=df[genre_columns]
y=df["mentions_difficulty"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
In [203]:
y_train.value_counts()
Out[203]:
mentions_difficulty
0    25715
1     4200
Name: count, dtype: int64
In [205]:
# balance the classes by oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
In [207]:
y_train_resampled.value_counts()
Out[207]:
mentions_difficulty
0    25715
1    25715
Name: count, dtype: int64
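
One caveat: SMOTE creates synthetic samples by interpolating between neighbours, so on purely binary genre flags it can generate fractional values between 0 and 1. A hedged alternative sketch that keeps the features strictly binary (plain random oversampling; imblearn's SMOTEN is another option for all-categorical data):

In [ ]:
from imblearn.over_sampling import RandomOverSampler

# alternative sketch only -- duplicates minority rows instead of interpolating,
# so the 0/1 genre columns stay binary
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print(y_train_ros.value_counts())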
In [209]:
# build a decision tree classifier model
# (the SMOTE step above already balanced the training classes, so
#  class_weight='balanced' adds little on top of that here)
model = DecisionTreeClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)
Out[209]:
DecisionTreeClassifier(class_weight='balanced')
In [90]:
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822
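
Before tuning, it is worth checking which genre flags the fitted tree actually relies on. A small sketch (assumed, not part of the original run) using the tree's feature importances:

In [ ]:
# sketch: rank the genre columns by how much the fitted tree uses them
importances = pd.Series(model.feature_importances_, index=genre_columns)
print(importances.sort_values(ascending=False).head(10))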

In [211]:
# tune the decision tree hyperparameters with a 3-fold grid search
param_grid = {
    'criterion': ['gini', 'entropy'],    
    'max_depth': [5, 10, 15, None],         
    'min_samples_split': [2, 5, 10],        
    'min_samples_leaf': [1, 5]        
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best F1-Score: 0.6037269912871214
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

In [217]:
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.94      0.14      0.25     11073
           1       0.15      0.94      0.25      1749

    accuracy                           0.25     12822
   macro avg       0.54      0.54      0.25     12822
weighted avg       0.83      0.25      0.25     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.84      0.28      1749

    accuracy                           0.42     12822
   macro avg       0.55      0.60      0.40     12822
weighted avg       0.83      0.42      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
C:\Users\aniru\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

In [219]:
# next, we try a random forest classifier to see whether it improves on the decision tree
model = RandomForestClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66     11073
           1       0.19      0.69      0.29      1749

    accuracy                           0.54     12822
   macro avg       0.55      0.61      0.48     12822
weighted avg       0.82      0.54      0.61     12822

In [221]:
# we try to tune the hyperparameters again
param_grid = {
    'n_estimators': [100, 200],          
    'max_depth': [5, 10, None],         
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 5],         
    'max_features': ['sqrt', 'log2']     
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best F1-Score: 0.6052300517078143
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

In [222]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.94      0.14      0.25     11073
           1       0.15      0.94      0.25      1749

    accuracy                           0.25     12822
   macro avg       0.54      0.54      0.25     12822
weighted avg       0.83      0.25      0.25     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.84      0.28      1749

    accuracy                           0.42     12822
   macro avg       0.55      0.60      0.40     12822
weighted avg       0.83      0.42      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

In [223]:
# we now use a logistic regression model for the same relationship
model = LogisticRegression(
    max_iter=1000
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66     11073
           1       0.19      0.69      0.29      1749

    accuracy                           0.54     12822
   macro avg       0.55      0.61      0.48     12822
weighted avg       0.82      0.54      0.61     12822

In [224]:
# try to tune this model
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  
    'penalty': ['l1', 'l2'],       
    'solver': ['liblinear', 'saga'],  
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1-Score: 0.6054754679057912
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822
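
The tuned logistic regression also lets us read off which genres push a review towards or away from mentioning difficulty. A small sketch (assumed, not part of the original run) inspecting the fitted coefficients:

In [ ]:
# sketch: signed coefficients per genre flag (positive -> more likely to mention difficulty)
coefs = pd.Series(best_model.coef_[0], index=genre_columns)
print(coefs.sort_values(ascending=False))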

In [225]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))
Threshold: 0.1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     11073
           1       0.14      1.00      0.24      1749

    accuracy                           0.14     12822
   macro avg       0.07      0.50      0.12     12822
weighted avg       0.02      0.14      0.03     12822

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.93      0.10      0.18     11073
           1       0.14      0.95      0.25      1749

    accuracy                           0.22     12822
   macro avg       0.54      0.53      0.22     12822
weighted avg       0.82      0.22      0.19     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.93      0.35      0.51     11073
           1       0.17      0.83      0.28      1749

    accuracy                           0.41     12822
   macro avg       0.55      0.59      0.39     12822
weighted avg       0.82      0.41      0.48     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.91      0.55      0.69     11073
           1       0.19      0.67      0.30      1749

    accuracy                           0.57     12822
   macro avg       0.55      0.61      0.49     12822
weighted avg       0.82      0.57      0.64     12822

Threshold: 0.6
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     11073
           1       0.24      0.24      0.24      1749

    accuracy                           0.79     12822
   macro avg       0.56      0.56      0.56     12822
weighted avg       0.79      0.79      0.79     12822

Threshold: 0.7000000000000001
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     11073
           1       0.27      0.07      0.11      1749

    accuracy                           0.85     12822
   macro avg       0.57      0.52      0.52     12822
weighted avg       0.79      0.85      0.81     12822

Threshold: 0.8
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822

Threshold: 0.9
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     11073
           1       0.00      0.00      0.00      1749

    accuracy                           0.86     12822
   macro avg       0.43      0.50      0.46     12822
weighted avg       0.75      0.86      0.80     12822


Since all three models produce nearly identical results, even after hyperparameter tuning and threshold adjustment, we conclude that the genre indicators alone do not carry enough signal to predict whether a review mentions difficulty.
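
To make that comparison explicit, a minimal sketch (assumed, not part of the original run) re-fits the three tuned configurations on the same resampled training data and reports one cross-validated score each; the hyperparameters below are the best parameters found by the grid searches above:

In [ ]:
from sklearn.model_selection import cross_val_score

# sketch: side-by-side cross-validated comparison of the three tuned models
candidates = {
    "decision_tree": DecisionTreeClassifier(criterion='gini', max_depth=10),
    "random_forest": RandomForestClassifier(n_estimators=100, max_depth=10, max_features='log2'),
    "logistic_regression": LogisticRegression(C=10, penalty='l1', solver='liblinear'),
}
for name, clf in candidates.items():
    scores = cross_val_score(clf, X_train_resampled, y_train_resampled,
                             scoring='f1_weighted', cv=3)
    print(f"{name}: mean weighted F1 = {scores.mean():.3f}")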

In [ ]: