In [666]:
import requests
import json
import time
import pandas
import re
import nltk
from nltk.corpus import words
from collections import Counter

Getting our reviews ->¶

In [41]:
def get_reviews(app_id,app_name):
    """
    This function gets 1500 most recent steam reviews of the game with the specified steam app id
    """
    total_reviews = 1500 #we collect 1500 reviews
    url = f"https://store.steampowered.com/appreviews/{app_id}"
    cursor = '*' #each API call returns reviews in batches. To access the next batch we need this cursor parameter
    all_reviews = []
    
    while len(all_reviews) < total_reviews: #till review count becomes 1500
        params = {
            "json": 1,
            "filter": "recent", #we filter the reviews by recent        
            "num_per_page": 100,        
            "language": "english", #we only select reviews where detected language is english      
            "cursor": cursor            
        }
        
        try:
            response = requests.get(url, params=params) #make the API call with the specified parameters
            if response.status_code != 200: #if status code 200 then break
                print(f"Failed to fetch reviews. Status Code: {response.status_code}")
                break
        
            data = response.json() 
            reviews = data.get("reviews", []) #the reviews tag contains review data
            all_reviews.extend(reviews) #add these to our all_reviews list
        
            if not reviews: #reviews tag isn't present then that response doesn't have reviews. it means we have no more reviews to fetch
                print("No more reviews to fetch.")
                break
        
            cursor = data.get("cursor") #get the cursor and this will be passed for the next call
            
        except Exception as e: #if error then wait for 2 sec and make the call again
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(2)
            
        time.sleep(1) #we do 1 sec wait between calls to avoid hiting rate limits for the API call

    print(f"Fetched {len(all_reviews)} reviews for {app_name}")
    return all_reviews #return these reviews
In [29]:
def save_reviews_to_json(reviews, filename):
    """
    This function saves the reviews for the game into a json file
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, ensure_ascii=False, indent=4)
In [31]:
def get_and_save_reviews(app_id,app_name):
    """
    This function calls the get_reviews() function, gets the reviews, then passes these to the save_reviews_to_json() function and calls it
    """
    game_reviews=get_reviews(app_id,app_name)
    save_reviews_to_json(game_reviews,f"{app_name}_reviews.json")
In [33]:
# we store our list of games and their respective app ids
games={
    "elden_ring" : 1245620,
    "sekiro" : 814380,
    "dark_souls_remastered" : 570940,
    "armored_core_6" : 1888160,
    "hollow_knight" : 367520,
    "hades" : 1145360,
    "dead_cells" : 588650,
    "slay_the_spire" : 646570,
    "returnal" : 1649240,
    "risk_of_rain_2" : 632360,
    "witcher_3" : 292030,
    "mass_effect" : 1328670,
    "divinity_original_sin_2" : 435150,
    "baldurs_gate_3" : 1086940,
    "pillars_of_eternity": 291650,
    "portal_2" : 620,
    "the_witness" : 210970,
    "celeste" : 504230,
    "ori_and_the_blind_forest" : 387290,
    "inside" : 304430,
    "stardew_valley" : 413150,
    "factorio" : 427520,
    "frostpunk" : 323190,
    "the_forest" : 242760,
    "subnautica" : 264710,
    "cod_modern_warfare" : 2000950,
    "rocket_league" : 252950,
    "counter_strike_2" : 730,
    "team_fortress_2" : 440,
    "dota_2" : 570
}
In [35]:
for game in games: #for every game we call the get_and_save_reviews() function with their name and id
    get_and_save_reviews(games[game],game)
Fetched 1500 reviews for elden_ring
Fetched 1500 reviews for sekiro
Fetched 1500 reviews for dark_souls_remastered
Fetched 1500 reviews for armored_core_6
Fetched 1500 reviews for hollow_knight
Fetched 1500 reviews for hades
Fetched 1500 reviews for dead_cells
Fetched 1500 reviews for slay_the_spire
Fetched 1500 reviews for returnal
Fetched 1500 reviews for risk_of_rain_2
Fetched 1500 reviews for witcher_3
Fetched 1500 reviews for mass_effect
Fetched 1500 reviews for divinity_original_sin_2
Fetched 1500 reviews for baldurs_gate_3
Fetched 1500 reviews for pillars_of_eternity
Fetched 1500 reviews for portal_2
Fetched 1500 reviews for the_witness
Fetched 1500 reviews for celeste
Fetched 1500 reviews for ori_and_the_blind_forest
Fetched 1500 reviews for inside
Fetched 1500 reviews for stardew_valley
Fetched 1500 reviews for factorio
Fetched 1500 reviews for frostpunk
Fetched 1500 reviews for the_forest
Fetched 1500 reviews for subnautica
Fetched 1500 reviews for cod_modern_warfare
Fetched 1500 reviews for rocket_league
Fetched 1500 reviews for counter_strike_2
Fetched 1500 reviews for team_fortress_2
Fetched 1599 reviews for dota_2

Data preprocessing ->¶

In [65]:
def load_reviews(filename):
    """
    This function is used to load up JSON file with reviews of a game into a pandas dataframe
    """
    with open(filename, 'r', encoding="utf-8") as file:
        reviews=json.load(file)
    return pd.DataFrame(reviews)
In [77]:
all_reviews_list=[] #to store reviews of all games together
game_reviews_dict={} #to store reviews of each game separately
In [80]:
for game in games: #for every game
    df=load_reviews(f"{game}_reviews.json") #get the reviews into a dataframe
    df["game_name"]=game #add the column called game_name
    game_reviews_dict[game]=df #add this df to the game_reviews_dict in game : df pairs
    all_reviews_list.append(df) #append this df to the list of dataframes of all games
In [506]:
all_reviews_df=pd.concat(all_reviews_list,ignore_index=True) #then we make a df that is concatenation of all the dfs
In [508]:
all_reviews_df["game_name"].value_counts()
Out[508]:
game_name
dota_2                      1599
sekiro                      1500
team_fortress_2             1500
counter_strike_2            1500
rocket_league               1500
cod_modern_warfare          1500
subnautica                  1500
the_forest                  1500
frostpunk                   1500
factorio                    1500
stardew_valley              1500
inside                      1500
ori_and_the_blind_forest    1500
celeste                     1500
the_witness                 1500
elden_ring                  1500
pillars_of_eternity         1500
baldurs_gate_3              1500
divinity_original_sin_2     1500
mass_effect                 1500
witcher_3                   1500
risk_of_rain_2              1500
returnal                    1500
slay_the_spire              1500
dead_cells                  1500
hades                       1500
hollow_knight               1500
armored_core_6              1500
dark_souls_remastered       1500
portal_2                    1500
Name: count, dtype: int64
In [510]:
all_reviews_df.columns
Out[510]:
Index(['recommendationid', 'author', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access',
       'hidden_in_steam_china', 'steam_china_location', 'primarily_steam_deck',
       'game_name', 'timestamp_dev_responded', 'developer_response'],
      dtype='object')
In [512]:
all_reviews_df.drop_duplicates(subset="recommendationid",inplace=True) #if there are any duplicated reviews then we remove them
In [514]:
all_reviews_df["game_name"].value_counts()
Out[514]:
game_name
dota_2                      1599
sekiro                      1500
team_fortress_2             1500
counter_strike_2            1500
rocket_league               1500
cod_modern_warfare          1500
subnautica                  1500
the_forest                  1500
frostpunk                   1500
factorio                    1500
stardew_valley              1500
inside                      1500
ori_and_the_blind_forest    1500
celeste                     1500
the_witness                 1500
elden_ring                  1500
pillars_of_eternity         1500
baldurs_gate_3              1500
divinity_original_sin_2     1500
mass_effect                 1500
witcher_3                   1500
risk_of_rain_2              1500
returnal                    1500
slay_the_spire              1500
dead_cells                  1500
hades                       1500
hollow_knight               1500
armored_core_6              1500
dark_souls_remastered       1500
portal_2                    1500
Name: count, dtype: int64
In [516]:
print(all_reviews_df.isnull().sum()) #we check if there are any missing values
recommendationid                   0
author                             0
language                           0
review                             0
timestamp_created                  0
timestamp_updated                  0
voted_up                           0
votes_up                           0
votes_funny                        0
weighted_vote_score                0
comment_count                      0
steam_purchase                     0
received_for_free                  0
written_during_early_access        0
hidden_in_steam_china              0
steam_china_location               0
primarily_steam_deck               0
game_name                          0
timestamp_dev_responded        45098
developer_response             45098
dtype: int64
In [518]:
#we check if there are any non-english reviews
all_reviews_df["language"].value_counts()
Out[518]:
language
english    45099
Name: count, dtype: int64
In [520]:
#The author col is a dictionary. We will flatten it by adding a col for each key of the author column

all_reviews_df["author"].loc[72] #if we look at any random author record
Out[520]:
{'steamid': '76561199248999433',
 'num_games_owned': 0,
 'num_reviews': 4,
 'playtime_forever': 4056,
 'playtime_last_two_weeks': 2353,
 'playtime_at_review': 3887,
 'last_played': 1729271154}
In [522]:
#it has these fields
for i in all_reviews_df["author"].loc[72]:
    print(i,end=" ")
steamid num_games_owned num_reviews playtime_forever playtime_last_two_weeks playtime_at_review last_played 
In [524]:
cols="steamid num_games_owned num_reviews playtime_forever playtime_last_two_weeks playtime_at_review last_played".split()
cols
Out[524]:
['steamid',
 'num_games_owned',
 'num_reviews',
 'playtime_forever',
 'playtime_last_two_weeks',
 'playtime_at_review',
 'last_played']
In [526]:
# we will make new fields in our df based on these columns
for col in cols:
    all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[526], line 3
      1 # we will make new fields in our df based on these columns
      2 for col in cols:
----> 3     all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

File ~\anaconda3\Lib\site-packages\pandas\core\series.py:4924, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
   4789 def apply(
   4790     self,
   4791     func: AggFuncType,
   (...)
   4796     **kwargs,
   4797 ) -> DataFrame | Series:
   4798     """
   4799     Invoke function on values of Series.
   4800 
   (...)
   4915     dtype: float64
   4916     """
   4917     return SeriesApply(
   4918         self,
   4919         func,
   4920         convert_dtype=convert_dtype,
   4921         by_row=by_row,
   4922         args=args,
   4923         kwargs=kwargs,
-> 4924     ).apply()

File ~\anaconda3\Lib\site-packages\pandas\core\apply.py:1427, in SeriesApply.apply(self)
   1424     return self.apply_compat()
   1426 # self.func is Callable
-> 1427 return self.apply_standard()

File ~\anaconda3\Lib\site-packages\pandas\core\apply.py:1507, in SeriesApply.apply_standard(self)
   1501 # row-wise access
   1502 # apply doesn't have a `na_action` keyword and for backward compat reasons
   1503 # we need to give `na_action="ignore"` for categorical data.
   1504 # TODO: remove the `na_action="ignore"` when that default has been changed in
   1505 #  Categorical (GH51645).
   1506 action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None
-> 1507 mapped = obj._map_values(
   1508     mapper=curried, na_action=action, convert=self.convert_dtype
   1509 )
   1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
   1512     # GH#43986 Need to do list(mapped) in order to get treated as nested
   1513     #  See also GH#25959 regarding EA support
   1514     return obj._constructor_expanddim(list(mapped), index=obj.index)

File ~\anaconda3\Lib\site-packages\pandas\core\base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
    918 if isinstance(arr, ExtensionArray):
    919     return arr.map(mapper, na_action=na_action)
--> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)

File ~\anaconda3\Lib\site-packages\pandas\core\algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
   1741 values = arr.astype(object, copy=False)
   1742 if na_action is None:
-> 1743     return lib.map_infer(values, mapper, convert=convert)
   1744 else:
   1745     return lib.map_infer_mask(
   1746         values, mapper, mask=isna(values).view(np.uint8), convert=convert
   1747     )

File lib.pyx:2972, in pandas._libs.lib.map_infer()

Cell In[526], line 3, in <lambda>(x)
      1 # we will make new fields in our df based on these columns
      2 for col in cols:
----> 3     all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

KeyError: 'playtime_at_review'
In [528]:
# from the above error we can see that some records don't have the playtime_at_review field
# we wont keep these records in our df
for i in all_reviews_df["author"]: #we get these particular records
    if "playtime_at_review" not in i:
        print(i)
{'steamid': '76561198256506010', 'num_games_owned': 0, 'num_reviews': 3, 'playtime_forever': 37, 'playtime_last_two_weeks': 0, 'last_played': 1513029923}
{'steamid': '76561198053705825', 'num_games_owned': 105, 'num_reviews': 3, 'playtime_forever': 41, 'playtime_last_two_weeks': 0, 'last_played': 1487532238}
In [530]:
unwanted=[{'steamid': '76561198256506010', 'num_games_owned': 0, 'num_reviews': 3, 'playtime_forever': 37, 'playtime_last_two_weeks': 0, 'last_played': 1513029923}, {'steamid': '76561198053705825', 'num_games_owned': 105, 'num_reviews': 3, 'playtime_forever': 41, 'playtime_last_two_weeks': 0, 'last_played': 1487532238}]
In [532]:
all_reviews_df[(all_reviews_df["author"]==unwanted[0]) | (all_reviews_df["author"]==unwanted[1])] 
#we can see the records where playtime_at_review field is missing
Out[532]:
recommendationid author language review timestamp_created timestamp_updated voted_up votes_up votes_funny weighted_vote_score ... steam_china_location primarily_steam_deck game_name timestamp_dev_responded developer_response author_steamid author_num_games_owned author_num_reviews author_playtime_forever author_playtime_last_two_weeks
43690 30249428 {'steamid': '76561198256506010', 'num_games_ow... english really addictive and good 1488320291 1488320291 True 0 0 0 ... False dota_2 NaN NaN 76561198256506010 0 3 37 0
45002 30042721 {'steamid': '76561198053705825', 'num_games_ow... english I didn't find this game very fun. Not very bal... 1487532588 1487532588 False 0 0 0 ... False dota_2 NaN NaN 76561198053705825 105 3 41 0

2 rows × 25 columns

In [534]:
# we drop these two records from our df
all_reviews_df = all_reviews_df[~((all_reviews_df["author"] == unwanted[0]) | (all_reviews_df["author"] == unwanted[1]))]
In [536]:
# we check again if there are any records where playtime_at_review field is missing
for i in all_reviews_df["author"]:
    if "playtime_at_review" not in i:
        print(i)
In [538]:
# so now we add the fields of author column as other columns in our df again
for col in cols:
    all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])
In [540]:
all_reviews_df.head() #we can see that the new columns have been added to our df
Out[540]:
recommendationid author language review timestamp_created timestamp_updated voted_up votes_up votes_funny weighted_vote_score ... game_name timestamp_dev_responded developer_response author_steamid author_num_games_owned author_num_reviews author_playtime_forever author_playtime_last_two_weeks author_playtime_at_review author_last_played
0 177283496 {'steamid': '76561198266230202', 'num_games_ow... english I love the game, but… there is too much to do.... 1729275153 1729275153 True 0 0 0 ... elden_ring NaN NaN 76561198266230202 326 16 8988 3957 8977 1729275953
1 177283364 {'steamid': '76561198829689696', 'num_games_ow... english roll 1729275009 1729275009 True 0 0 0 ... elden_ring NaN NaN 76561198829689696 11 2 2513 1085 2422 1729280436
2 177279370 {'steamid': '76561199387385936', 'num_games_ow... english Its Laterally Perfect In Every Way, The Only B... 1729270437 1729270437 True 0 0 0 ... elden_ring NaN NaN 76561199387385936 0 1 492 492 350 1729278945
3 177278879 {'steamid': '76561199572934905', 'num_games_ow... english Try finger, but hole 1729269896 1729269896 True 0 0 0 ... elden_ring NaN NaN 76561199572934905 0 1 6536 80 6456 1729280044
4 177278154 {'steamid': '76561198436642208', 'num_games_ow... english Damn, just damn. how can i explain this piece ... 1729268993 1729268993 True 0 0 0 ... elden_ring NaN NaN 76561198436642208 51 3 5563 1424 5563 1729268292

5 rows × 27 columns

In [542]:
all_reviews_df.columns
Out[542]:
Index(['recommendationid', 'author', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access',
       'hidden_in_steam_china', 'steam_china_location', 'primarily_steam_deck',
       'game_name', 'timestamp_dev_responded', 'developer_response',
       'author_steamid', 'author_num_games_owned', 'author_num_reviews',
       'author_playtime_forever', 'author_playtime_last_two_weeks',
       'author_playtime_at_review', 'author_last_played'],
      dtype='object')
In [544]:
# we don't need all of these columns. We only keep our relevant columns
relevant_columns = ['game_name', 'review', 'voted_up', 'timestamp_created', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_at_review', 'author_playtime_last_two_weeks', 'author_playtime_forever']
In [546]:
all_reviews_df=all_reviews_df[relevant_columns]
all_reviews_df
Out[546]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever
0 elden_ring I love the game, but… there is too much to do.... True 1729275153 326 16 8977 3957 8988
1 elden_ring roll True 1729275009 11 2 2422 1085 2513
2 elden_ring Its Laterally Perfect In Every Way, The Only B... True 1729270437 0 1 350 492 492
3 elden_ring Try finger, but hole True 1729269896 0 1 6456 80 6536
4 elden_ring Damn, just damn. how can i explain this piece ... True 1729268993 51 3 5563 1424 5563
... ... ... ... ... ... ... ... ... ...
45094 dota_2 Good True 1487494231 0 1 31575 2916 205094
45095 dota_2 Great Game but some of sea players are toxics True 1487494164 5 2 99538 0 170287
45096 dota_2 Keep up the good work True 1487493619 1 1 56345 0 110978
45097 dota_2 halawoken True 1487493562 0 1 32205 0 48218
45098 dota_2 weowe True 1487493205 0 1 44382 0 124043

45097 rows × 9 columns

In [554]:
#now its time for us to clean the reviews a little
def clean_review(text):
    """
    This function will be used to clean the reviews. It converts all texts to lower case, removes characters which are not alpha numeric or white spaces, and finally removes all extra white space characters
    """
    text=text.lower()
    text = re.sub(r'[^\w\s]', '', text) #removes characters which aren't alpha numeric or white space (like punctuation, special symbols, etc)
    text = re.sub(r'\s+', ' ', text).strip() #remove extra white spaces
    return text
In [556]:
all_reviews_df["review"]=all_reviews_df["review"].apply(clean_review)
In [558]:
all_reviews_df["review"]
Out[558]:
0        i love the game but there is too much to do as...
1                                                     roll
2        its laterally perfect in every way the only ba...
3                                      try finger but hole
4        damn just damn how can i explain this piece of...
                               ...                        
45094                                                 good
45095        great game but some of sea players are toxics
45096                                keep up the good work
45097                                            halawoken
45098                                                weowe
Name: review, Length: 45097, dtype: object
In [560]:
all_reviews_df[all_reviews_df['review'].apply(lambda x: len(x.split())==0)]
Out[560]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever
89 elden_ring True 1729196068 0 14 192 358 358
127 elden_ring True 1729165883 0 7 8102 332 8123
165 elden_ring True 1729125671 0 10 9048 986 9098
396 elden_ring True 1728928295 0 2 5747 2049 6025
430 elden_ring True 1728886706 0 2 3343 3311 3729
... ... ... ... ... ... ... ... ... ...
44906 dota_2 True 1487597602 0 1 36328 0 40480
44907 dota_2 True 1487596377 1 1 102099 0 150906
44968 dota_2 True 1487562098 0 1 52456 0 329005
45052 dota_2 True 1487509195 0 1 47569 0 137826
45078 dota_2 True 1487499792 0 2 160 0 196

755 rows × 9 columns

In [562]:
# we can see that some reviews are empty strings. So we remove these
all_reviews_df=all_reviews_df[~all_reviews_df['review'].apply(lambda x: len(x.split())==0)]
In [564]:
all_reviews_df[all_reviews_df['review'].apply(lambda x: len(x.split())==0)]
Out[564]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever
In [620]:
# we don't want some random gibberish words in our reviews. we only want them to have valid english words.
nltk.download('words')
english_words = set(nltk.corpus.words.words()) #so we get a dictionary of english words
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
In [656]:
# but some common gaming terms like "goty", "rpg", "fps", etc are not found in this set
# so we extend our set to contain some frequently used terms like these
english_words.update("goty elden rpg fps soulslike roguelike roguelite arpg jrpg coop pvp mmo mmorpg moba respawn afk godlike dlc vr camping meta smurf lfg gank pve buff nerf sandbox crafting skilltree xp hitbox minmaxing despawn noob op lag gg crossplay endgame speedrun".split())
In [680]:
# But we surely could not have covered all commonly used words. So we implement a mechanism to check if the word that appears has been 
# used frequently enough across other reviews too or not
all_words = ' '.join(all_reviews_df['review']).split()
word_freq = Counter(all_words)
In [700]:
def validate_review(text):
    """
    This function will be used to check if the review is valid or not. It will be marked valid if atleast 50% of the words are found to be valid.
    """
    threshold=0.5
    freq_threshold=3 #if the word is a frequently used word across reviews then it is valid
    tokens = text.split()
    valid_words = [word for word in tokens if word in english_words or word_freq[word] > freq_threshold]
    return len(valid_words) / len(tokens) >= threshold
In [724]:
validated_df = all_reviews_df[all_reviews_df['review'].apply(validate_review)]
validated_df
Out[724]:
game_name review voted_up timestamp_created author_num_games_owned author_num_reviews author_playtime_at_review author_playtime_last_two_weeks author_playtime_forever
0 elden_ring i love the game but there is too much to do as... True 1729275153 326 16 8977 3957 8988
1 elden_ring roll True 1729275009 11 2 2422 1085 2513
2 elden_ring its laterally perfect in every way the only ba... True 1729270437 0 1 350 492 492
3 elden_ring try finger but hole True 1729269896 0 1 6456 80 6536
4 elden_ring damn just damn how can i explain this piece of... True 1729268993 51 3 5563 1424 5563
... ... ... ... ... ... ... ... ... ...
45092 dota_2 nice True 1487494906 0 2 1937 6997 121315
45093 dota_2 good game True 1487494747 0 1 124437 0 251708
45094 dota_2 good True 1487494231 0 1 31575 2916 205094
45095 dota_2 great game but some of sea players are toxics True 1487494164 5 2 99538 0 170287
45096 dota_2 keep up the good work True 1487493619 1 1 56345 0 110978

42924 rows × 9 columns

In [732]:
#we save this cleaned dataset
validated_df.to_csv("cleaned_reviews_dataset.csv",index=False)
In [ ]: