In [57]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from wordcloud import WordCloud
from collections import Counter
In [59]:
df=pd.read_csv("cleaned_reviews_dataset.csv")
In [61]:
for c in df.columns:
print(f"\"{c}\"",end=", ")
"game_name", "review", "voted_up", "timestamp_created", "author_num_games_owned", "author_num_reviews", "author_playtime_at_review", "author_playtime_last_two_weeks", "author_playtime_forever",
In [63]:
# we list the columns in our dataset and their data types
df.dtypes
Out[63]:
game_name object review object voted_up bool timestamp_created int64 author_num_games_owned int64 author_num_reviews int64 author_playtime_at_review int64 author_playtime_last_two_weeks int64 author_playtime_forever int64 dtype: object
Some summary statistics ->¶
In [66]:
# we can get summary statistics of all numerical columns
df.describe()
Out[66]:
timestamp_created | author_num_games_owned | author_num_reviews | author_playtime_at_review | author_playtime_last_two_weeks | author_playtime_forever | |
---|---|---|---|---|---|---|
count | 4.292400e+04 | 42924.000000 | 42924.000000 | 4.292400e+04 | 42924.000000 | 4.292400e+04 |
mean | 1.715186e+09 | 69.589460 | 15.970017 | 1.115328e+04 | 433.422724 | 1.540057e+04 |
std | 4.154933e+07 | 224.372803 | 46.055941 | 4.077951e+04 | 1084.890568 | 5.696278e+04 |
min | 1.487494e+09 | 0.000000 | 1.000000 | 4.000000e+00 | 0.000000 | 5.000000e+00 |
25% | 1.721491e+09 | 0.000000 | 2.000000 | 6.460000e+02 | 0.000000 | 1.031000e+03 |
50% | 1.726010e+09 | 0.000000 | 6.000000 | 2.148500e+03 | 0.000000 | 2.995000e+03 |
75% | 1.728177e+09 | 54.000000 | 15.000000 | 6.692500e+03 | 326.000000 | 8.014000e+03 |
max | 1.729276e+09 | 12668.000000 | 2364.000000 | 3.145129e+06 | 20054.000000 | 3.194464e+06 |
In [68]:
# we find the mean length of all reviews
(df["review"].apply(len)).mean()
Out[68]:
172.69744199049484
In [70]:
# then we can look at the average playtime in hours for all reviews
(df["author_playtime_at_review"]/60).mean()
Out[70]:
185.88803466592117
In [72]:
# we look at the median playtime in hours for all reviews
(df["author_playtime_at_review"]/60).median()
Out[72]:
35.80833333333334
In [74]:
# we can get the total number of positive and negative reviews in our entire dataset
df["voted_up"].value_counts()
Out[74]:
voted_up True 38625 False 4299 Name: count, dtype: int64
In [76]:
# we can try to analyse if there is any correlation between the playtime and the length of review
df["author_playtime_at_review"].corr(df["review"].apply(len))
Out[76]:
-0.011147521877619334
Data Visualization ->¶
In [79]:
# median playtime for each game
median_playtime = df.groupby("game_name")["author_playtime_at_review"].median() / 60
median_playtime = median_playtime.sort_values()
colors = sns.color_palette("viridis", len(median_playtime))
plt.figure(figsize=(12, 8))
bars = plt.barh(median_playtime.index, median_playtime.values, color=colors)
for bar in bars:
plt.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2, f'{bar.get_width():.2f}', va='center', fontsize=10)
plt.title("Median Playtime per Game (in Hours)", fontsize=16)
plt.xlabel("Median Playtime (Hours)")
plt.ylabel("Game Name")
plt.tight_layout()
plt.savefig('graphs/playtime_by_game.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [81]:
# review length for each game
avg_review_length = df.groupby("game_name")["review"].apply(lambda x: x.str.len().mean())
plt.figure(figsize=(12, 6))
plt.stem(avg_review_length.index, avg_review_length.values, linefmt="C8-", markerfmt="C2o", basefmt="black")
plt.xticks(rotation=90)
plt.title("Lollipop Chart of Average Review Length per Game")
plt.xlabel("Game Name")
plt.ylabel("Average Review Length (Characters)")
plt.savefig('graphs/review_length_by_game.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [91]:
# we look at distribution of positive and negative reviews for each game
sentiment_counts = df.groupby(['game_name', 'voted_up']).size().unstack()
fig, ax = plt.subplots(figsize=(9, 6))
sentiment_counts.plot(kind='bar', ax=ax, width=0.6, color=['salmon', 'teal'])
plt.title('Review Distribution (Positive vs Negative) by Game')
plt.xlabel('Game')
plt.ylabel('Count')
plt.xticks(rotation=90, ha='right', fontsize=8)
plt.legend(title='Sentiment', labels=['Negative', 'Positive'])
plt.tight_layout()
plt.savefig('graphs/review_sentiment_by_game.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [93]:
# We then look at percentage of positive reviews for each of our 30 games
sentiment_percent = df.groupby('game_name')['voted_up'].mean() * 100
sentiment_percent = sentiment_percent.sort_values(ascending=False) #we sort them in descending order
fig, ax = plt.subplots(figsize=(9, 7))
bars = sentiment_percent.plot(kind='bar', color='purple', ax=ax, alpha=0.5)
for bar in bars.patches: # we display percentages inside the bars
ax.annotate(f'{bar.get_height():.1f}',
(bar.get_x() + bar.get_width() / 2, bar.get_height()), # Position at the top center of each bar
ha='center', va='bottom', fontsize=6)
plt.title('Percentage of Positive Reviews per Game')
plt.xlabel('Game')
plt.ylabel('Percentage of Positive Reviews (%)')
plt.xticks(rotation=90, ha='right')
plt.tight_layout()
plt.savefig('graphs/percentage_positive_reviews.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [96]:
# scatterplot of review length against the playtime
df["review_length"] = df["review"].apply(len)
df["playtime_at_review_hours"] = df["author_playtime_at_review"] / 60 #we get playtime in terms of hours
plt.figure(figsize=(10, 6))
plt.scatter(df["review_length"], df["playtime_at_review_hours"], alpha=0.5)
plt.title("Scatter Plot of Review Length vs Playtime at Review")
plt.xlabel("Review Length (characters)")
plt.ylabel("Playtime at Review (hours)")
# plt.ylim(0,10000)
plt.grid(True)
plt.savefig('graphs/review_length_vs_playtime.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [99]:
# a sampled version of above plot for better readability
sample_df = df.sample(1000)
plt.figure(figsize=(10, 6))
plt.scatter(sample_df["review_length"], sample_df["playtime_at_review_hours"], alpha=0.5)
plt.title("Scatter Plot of Review Length vs Playtime at Review (Sampled Data)")
plt.xlabel("Review Length (characters)")
plt.ylabel("Playtime at Review (hours)")
plt.ylim(0,2000) #we limit x and y
plt.xlim(0,1000)
plt.grid(True)
plt.savefig('graphs/review_length_vs_playtime_sampled_data.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [103]:
(df["author_playtime_at_review"]/df["author_playtime_forever"])
Out[103]:
0 0.998776 1 0.963788 2 0.711382 3 0.987760 4 1.000000 ... 42919 0.015967 42920 0.494370 42921 0.153954 42922 0.584531 42923 0.507713 Length: 42924, dtype: float64
In [125]:
# distribution when people post reviews and how much playtime they actually have
plt.figure(figsize=(10, 6))
plt.hist(df["author_playtime_at_review"]/df["author_playtime_forever"], bins=50, alpha=0.7, edgecolor='black')
plt.title("Distribution of Playtime Ratio (Playtime at Review / Total Playtime)")
plt.xlabel("Playtime Ratio")
plt.ylabel("Frequency")
plt.grid(axis='y', alpha=0.75)
plt.savefig('graphs/distribution_of_playtime_ratio.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [83]:
#we generate a word cloud of most frequent words
text = " ".join(review for review in df["review"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off') # Hide the axes
plt.savefig('graphs/word_cloud.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [109]:
# word cloud for positive reviews
positive_reviews = " ".join(df[df["voted_up"] == True]["review"])
fig, axes = plt.subplots(1, 1, figsize=(15, 7))
wordcloud_pos = WordCloud(width=800, height=400, background_color='white').generate(positive_reviews)
axes.imshow(wordcloud_pos, interpolation='bilinear')
axes.axis('off')
axes.set_title('Word Cloud - Positive Reviews')
plt.savefig('graphs/wordcloud_positive.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [111]:
# word cloud for negative reviews
negative_reviews = " ".join(df[df["voted_up"] == False]["review"])
fig, axes = plt.subplots(1, 1, figsize=(15, 7))
wordcloud_neg = WordCloud(width=800, height=400, background_color='black', colormap='Reds').generate(negative_reviews)
axes.imshow(wordcloud_neg, interpolation='bilinear')
axes.axis('off')
axes.set_title('Word Cloud - Negative Reviews')
plt.savefig('graphs/wordcloud_negative.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [87]:
# boxplot of review length for each type of sentiment zoomed in
df["review_length"] = df["review"].apply(len)
plt.figure(figsize=(10, 6))
sns.boxplot(x=df["voted_up"].apply(lambda x: "Positive" if x==True else "Negative"), y="review_length", data=df, color="gold")
plt.title("Review Length Distribution by Sentiment", fontsize=16)
plt.xlabel("Review Sentiment (Positive/Negative)", fontsize=12)
plt.ylabel("Review Length (Characters)", fontsize=12)
plt.ylim(0,1000)
plt.savefig('graphs/review_length_by_sentiment.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [89]:
# playtime by sentiment zoomed in
plt.figure(figsize=(10, 6))
sns.boxplot(x=df["voted_up"].apply(lambda x: "Positive" if x==True else "Negative"), y=df["author_playtime_at_review"] / 60, data=df)
plt.title("Playtime at Review Distribution by Sentiment")
plt.xlabel("Review Sentiment (Positive/Negative)")
plt.ylabel("Playtime at Review (Hours)")
plt.ylim(0,1000)
plt.savefig('graphs/playtime_by_sentiment.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
In [91]:
# player sentiment by playtime
bins = [0, 500, 1000, 5000, 10000, float('inf')]
labels = ['0-500', '500-1000', '1000-5000', '5000-10000', '10000+']
df['playtime_range'] = pd.cut(df['author_playtime_forever'], bins=bins, labels=labels, right=False)
sentiment_counts = df.groupby(['playtime_range', 'voted_up']).size().unstack(fill_value=0)
plt.figure(figsize=(12, 6))
sentiment_counts.plot(kind='bar', stacked=True, color=['green', 'red'], alpha=0.5, ax=plt.gca())
plt.title("Frequency of Negative vs Positive Reviews by Playtime Range", fontsize=16)
plt.xlabel("Playtime Range (minutes)", fontsize=12)
plt.ylabel("Number of Reviews", fontsize=12)
plt.legend(title="Review Sentiment", labels=["Negative Review", "Positive Review"])
plt.tight_layout()
plt.savefig('graphs/review_sentiment_by_playtime.png', format='png', dpi=300, bbox_inches='tight')
plt.show()
C:\Users\aniru\AppData\Local\Temp\ipykernel_6788\2386246034.py:7: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. sentiment_counts = df.groupby(['playtime_range', 'voted_up']).size().unstack(fill_value=0)
In [ ]: