import requests
import json
import time
import pandas
import re
import nltk
from nltk.corpus import words
from collections import Counter

def get_reviews(app_id,app_name):
    """
    This function gets 1500 most recent steam reviews of the game with the specified steam app id
    """
    total_reviews = 1500 #we collect 1500 reviews
    url = f"https://store.steampowered.com/appreviews/{app_id}"
    cursor = '*' #each API call returns reviews in batches. To access the next batch we need this cursor parameter
    all_reviews = []
    
    while len(all_reviews) < total_reviews: #till review count becomes 1500
        params = {
            "json": 1,
            "filter": "recent", #we filter the reviews by recent        
            "num_per_page": 100,        
            "language": "english", #we only select reviews where detected language is english      
            "cursor": cursor            
        }
        
        try:
            response = requests.get(url, params=params) #make the API call with the specified parameters
            if response.status_code != 200: #if status code 200 then break
                print(f"Failed to fetch reviews. Status Code: {response.status_code}")
                break
        
            data = response.json() 
            reviews = data.get("reviews", []) #the reviews tag contains review data
            all_reviews.extend(reviews) #add these to our all_reviews list
        
            if not reviews: #reviews tag isn't present then that response doesn't have reviews. it means we have no more reviews to fetch
                print("No more reviews to fetch.")
                break
        
            cursor = data.get("cursor") #get the cursor and this will be passed for the next call
            
        except Exception as e: #if error then wait for 2 sec and make the call again
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(2)
            
        time.sleep(1) #we do 1 sec wait between calls to avoid hiting rate limits for the API call

    print(f"Fetched {len(all_reviews)} reviews for {app_name}")
    return all_reviews #return these reviews

def save_reviews_to_json(reviews, filename):
    """
    This function saves the reviews for the game into a json file
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, ensure_ascii=False, indent=4)

def get_and_save_reviews(app_id,app_name):
    """
    This function calls the get_reviews() function, gets the reviews, then passes these to the save_reviews_to_json() function and calls it
    """
    game_reviews=get_reviews(app_id,app_name)
    save_reviews_to_json(game_reviews,f"{app_name}_reviews.json")

# we store our list of games and their respective app ids
games={
    "elden_ring" : 1245620,
    "sekiro" : 814380,
    "dark_souls_remastered" : 570940,
    "armored_core_6" : 1888160,
    "hollow_knight" : 367520,
    "hades" : 1145360,
    "dead_cells" : 588650,
    "slay_the_spire" : 646570,
    "returnal" : 1649240,
    "risk_of_rain_2" : 632360,
    "witcher_3" : 292030,
    "mass_effect" : 1328670,
    "divinity_original_sin_2" : 435150,
    "baldurs_gate_3" : 1086940,
    "pillars_of_eternity": 291650,
    "portal_2" : 620,
    "the_witness" : 210970,
    "celeste" : 504230,
    "ori_and_the_blind_forest" : 387290,
    "inside" : 304430,
    "stardew_valley" : 413150,
    "factorio" : 427520,
    "frostpunk" : 323190,
    "the_forest" : 242760,
    "subnautica" : 264710,
    "cod_modern_warfare" : 2000950,
    "rocket_league" : 252950,
    "counter_strike_2" : 730,
    "team_fortress_2" : 440,
    "dota_2" : 570
}

for game in games: #for every game we call the get_and_save_reviews() function with their name and id
    get_and_save_reviews(games[game],game)

Fetched 1500 reviews for elden_ring
Fetched 1500 reviews for sekiro
Fetched 1500 reviews for dark_souls_remastered
Fetched 1500 reviews for armored_core_6
Fetched 1500 reviews for hollow_knight
Fetched 1500 reviews for hades
Fetched 1500 reviews for dead_cells
Fetched 1500 reviews for slay_the_spire
Fetched 1500 reviews for returnal
Fetched 1500 reviews for risk_of_rain_2
Fetched 1500 reviews for witcher_3
Fetched 1500 reviews for mass_effect
Fetched 1500 reviews for divinity_original_sin_2
Fetched 1500 reviews for baldurs_gate_3
Fetched 1500 reviews for pillars_of_eternity
Fetched 1500 reviews for portal_2
Fetched 1500 reviews for the_witness
Fetched 1500 reviews for celeste
Fetched 1500 reviews for ori_and_the_blind_forest
Fetched 1500 reviews for inside
Fetched 1500 reviews for stardew_valley
Fetched 1500 reviews for factorio
Fetched 1500 reviews for frostpunk
Fetched 1500 reviews for the_forest
Fetched 1500 reviews for subnautica
Fetched 1500 reviews for cod_modern_warfare
Fetched 1500 reviews for rocket_league
Fetched 1500 reviews for counter_strike_2
Fetched 1500 reviews for team_fortress_2
Fetched 1599 reviews for dota_2

def load_reviews(filename):
    """
    This function is used to load up JSON file with reviews of a game into a pandas dataframe
    """
    with open(filename, 'r', encoding="utf-8") as file:
        reviews=json.load(file)
    return pd.DataFrame(reviews)

all_reviews_list=[] #to store reviews of all games together
game_reviews_dict={} #to store reviews of each game separately

for game in games: #for every game
    df=load_reviews(f"{game}_reviews.json") #get the reviews into a dataframe
    df["game_name"]=game #add the column called game_name
    game_reviews_dict[game]=df #add this df to the game_reviews_dict in game : df pairs
    all_reviews_list.append(df) #append this df to the list of dataframes of all games

all_reviews_df=pd.concat(all_reviews_list,ignore_index=True) #then we make a df that is concatenation of all the dfs

all_reviews_df["game_name"].value_counts()

game_name
dota_2                      1599
sekiro                      1500
team_fortress_2             1500
counter_strike_2            1500
rocket_league               1500
cod_modern_warfare          1500
subnautica                  1500
the_forest                  1500
frostpunk                   1500
factorio                    1500
stardew_valley              1500
inside                      1500
ori_and_the_blind_forest    1500
celeste                     1500
the_witness                 1500
elden_ring                  1500
pillars_of_eternity         1500
baldurs_gate_3              1500
divinity_original_sin_2     1500
mass_effect                 1500
witcher_3                   1500
risk_of_rain_2              1500
returnal                    1500
slay_the_spire              1500
dead_cells                  1500
hades                       1500
hollow_knight               1500
armored_core_6              1500
dark_souls_remastered       1500
portal_2                    1500
Name: count, dtype: int64

all_reviews_df.columns

Index(['recommendationid', 'author', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access',
       'hidden_in_steam_china', 'steam_china_location', 'primarily_steam_deck',
       'game_name', 'timestamp_dev_responded', 'developer_response'],
      dtype='object')

all_reviews_df.drop_duplicates(subset="recommendationid",inplace=True) #if there are any duplicated reviews then we remove them

all_reviews_df["game_name"].value_counts()

game_name
dota_2                      1599
sekiro                      1500
team_fortress_2             1500
counter_strike_2            1500
rocket_league               1500
cod_modern_warfare          1500
subnautica                  1500
the_forest                  1500
frostpunk                   1500
factorio                    1500
stardew_valley              1500
inside                      1500
ori_and_the_blind_forest    1500
celeste                     1500
the_witness                 1500
elden_ring                  1500
pillars_of_eternity         1500
baldurs_gate_3              1500
divinity_original_sin_2     1500
mass_effect                 1500
witcher_3                   1500
risk_of_rain_2              1500
returnal                    1500
slay_the_spire              1500
dead_cells                  1500
hades                       1500
hollow_knight               1500
armored_core_6              1500
dark_souls_remastered       1500
portal_2                    1500
Name: count, dtype: int64

print(all_reviews_df.isnull().sum()) #we check if there are any missing values

recommendationid                   0
author                             0
language                           0
review                             0
timestamp_created                  0
timestamp_updated                  0
voted_up                           0
votes_up                           0
votes_funny                        0
weighted_vote_score                0
comment_count                      0
steam_purchase                     0
received_for_free                  0
written_during_early_access        0
hidden_in_steam_china              0
steam_china_location               0
primarily_steam_deck               0
game_name                          0
timestamp_dev_responded        45098
developer_response             45098
dtype: int64

#we check if there are any non-english reviews
all_reviews_df["language"].value_counts()

language
english    45099
Name: count, dtype: int64

#The author col is a dictionary. We will flatten it by adding a col for each key of the author column

all_reviews_df["author"].loc[72] #if we look at any random author record

{'steamid': '76561199248999433',
 'num_games_owned': 0,
 'num_reviews': 4,
 'playtime_forever': 4056,
 'playtime_last_two_weeks': 2353,
 'playtime_at_review': 3887,
 'last_played': 1729271154}

#it has these fields
for i in all_reviews_df["author"].loc[72]:
    print(i,end=" ")

steamid num_games_owned num_reviews playtime_forever playtime_last_two_weeks playtime_at_review last_played

cols="steamid num_games_owned num_reviews playtime_forever playtime_last_two_weeks playtime_at_review last_played".split()
cols

['steamid',
 'num_games_owned',
 'num_reviews',
 'playtime_forever',
 'playtime_last_two_weeks',
 'playtime_at_review',
 'last_played']

# we will make new fields in our df based on these columns
for col in cols:
    all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[526], line 3
      1 # we will make new fields in our df based on these columns
      2 for col in cols:
----> 3     all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

File ~\anaconda3\Lib\site-packages\pandas\core\series.py:4924, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
   4789 def apply(
   4790     self,
   4791     func: AggFuncType,
   (...)
   4796     **kwargs,
   4797 ) -> DataFrame | Series:
   4798     """
   4799     Invoke function on values of Series.
   4800 
   (...)
   4915     dtype: float64
   4916     """
   4917     return SeriesApply(
   4918         self,
   4919         func,
   4920         convert_dtype=convert_dtype,
   4921         by_row=by_row,
   4922         args=args,
   4923         kwargs=kwargs,
-> 4924     ).apply()

File ~\anaconda3\Lib\site-packages\pandas\core\apply.py:1427, in SeriesApply.apply(self)
   1424     return self.apply_compat()
   1426 # self.func is Callable
-> 1427 return self.apply_standard()

File ~\anaconda3\Lib\site-packages\pandas\core\apply.py:1507, in SeriesApply.apply_standard(self)
   1501 # row-wise access
   1502 # apply doesn't have a `na_action` keyword and for backward compat reasons
   1503 # we need to give `na_action="ignore"` for categorical data.
   1504 # TODO: remove the `na_action="ignore"` when that default has been changed in
   1505 #  Categorical (GH51645).
   1506 action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None
-> 1507 mapped = obj._map_values(
   1508     mapper=curried, na_action=action, convert=self.convert_dtype
   1509 )
   1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
   1512     # GH#43986 Need to do list(mapped) in order to get treated as nested
   1513     #  See also GH#25959 regarding EA support
   1514     return obj._constructor_expanddim(list(mapped), index=obj.index)

File ~\anaconda3\Lib\site-packages\pandas\core\base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
    918 if isinstance(arr, ExtensionArray):
    919     return arr.map(mapper, na_action=na_action)
--> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)

File ~\anaconda3\Lib\site-packages\pandas\core\algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
   1741 values = arr.astype(object, copy=False)
   1742 if na_action is None:
-> 1743     return lib.map_infer(values, mapper, convert=convert)
   1744 else:
   1745     return lib.map_infer_mask(
   1746         values, mapper, mask=isna(values).view(np.uint8), convert=convert
   1747     )

File lib.pyx:2972, in pandas._libs.lib.map_infer()

Cell In[526], line 3, in <lambda>(x)
      1 # we will make new fields in our df based on these columns
      2 for col in cols:
----> 3     all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

KeyError: 'playtime_at_review'

# from the above error we can see that some records don't have the playtime_at_review field
# we wont keep these records in our df
for i in all_reviews_df["author"]: #we get these particular records
    if "playtime_at_review" not in i:
        print(i)

{'steamid': '76561198256506010', 'num_games_owned': 0, 'num_reviews': 3, 'playtime_forever': 37, 'playtime_last_two_weeks': 0, 'last_played': 1513029923}
{'steamid': '76561198053705825', 'num_games_owned': 105, 'num_reviews': 3, 'playtime_forever': 41, 'playtime_last_two_weeks': 0, 'last_played': 1487532238}

unwanted=[{'steamid': '76561198256506010', 'num_games_owned': 0, 'num_reviews': 3, 'playtime_forever': 37, 'playtime_last_two_weeks': 0, 'last_played': 1513029923}, {'steamid': '76561198053705825', 'num_games_owned': 105, 'num_reviews': 3, 'playtime_forever': 41, 'playtime_last_two_weeks': 0, 'last_played': 1487532238}]

all_reviews_df[(all_reviews_df["author"]==unwanted[0]) | (all_reviews_df["author"]==unwanted[1])] 
#we can see the records where playtime_at_review field is missing

# we drop these two records from our df
all_reviews_df = all_reviews_df[~((all_reviews_df["author"] == unwanted[0]) | (all_reviews_df["author"] == unwanted[1]))]

# we check again if there are any records where playtime_at_review field is missing
for i in all_reviews_df["author"]:
    if "playtime_at_review" not in i:
        print(i)

# so now we add the fields of author column as other columns in our df again
for col in cols:
    all_reviews_df[f"author_{col}"]=all_reviews_df["author"].apply(lambda x: x[f"{col}"])

all_reviews_df.head() #we can see that the new columns have been added to our df

all_reviews_df.columns

Index(['recommendationid', 'author', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access',
       'hidden_in_steam_china', 'steam_china_location', 'primarily_steam_deck',
       'game_name', 'timestamp_dev_responded', 'developer_response',
       'author_steamid', 'author_num_games_owned', 'author_num_reviews',
       'author_playtime_forever', 'author_playtime_last_two_weeks',
       'author_playtime_at_review', 'author_last_played'],
      dtype='object')

# we don't need all of these columns. We only keep our relevant columns
relevant_columns = ['game_name', 'review', 'voted_up', 'timestamp_created', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_at_review', 'author_playtime_last_two_weeks', 'author_playtime_forever']

all_reviews_df=all_reviews_df[relevant_columns]
all_reviews_df

#now its time for us to clean the reviews a little
def clean_review(text):
    """
    This function will be used to clean the reviews. It converts all texts to lower case, removes characters which are not alpha numeric or white spaces, and finally removes all extra white space characters
    """
    text=text.lower()
    text = re.sub(r'[^\w\s]', '', text) #removes characters which aren't alpha numeric or white space (like punctuation, special symbols, etc)
    text = re.sub(r'\s+', ' ', text).strip() #remove extra white spaces
    return text

all_reviews_df["review"]=all_reviews_df["review"].apply(clean_review)

all_reviews_df["review"]

0        i love the game but there is too much to do as...
1                                                     roll
2        its laterally perfect in every way the only ba...
3                                      try finger but hole
4        damn just damn how can i explain this piece of...
                               ...                        
45094                                                 good
45095        great game but some of sea players are toxics
45096                                keep up the good work
45097                                            halawoken
45098                                                weowe
Name: review, Length: 45097, dtype: object

all_reviews_df[all_reviews_df['review'].apply(lambda x: len(x.split())==0)]

# we can see that some reviews are empty strings. So we remove these
all_reviews_df=all_reviews_df[~all_reviews_df['review'].apply(lambda x: len(x.split())==0)]

all_reviews_df[all_reviews_df['review'].apply(lambda x: len(x.split())==0)]

# we don't want some random gibberish words in our reviews. we only want them to have valid english words.
nltk.download('words')
english_words = set(nltk.corpus.words.words()) #so we get a dictionary of english words

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!

# but some common gaming terms like "goty", "rpg", "fps", etc are not found in this set
# so we extend our set to contain some frequently used terms like these
english_words.update("goty elden rpg fps soulslike roguelike roguelite arpg jrpg coop pvp mmo mmorpg moba respawn afk godlike dlc vr camping meta smurf lfg gank pve buff nerf sandbox crafting skilltree xp hitbox minmaxing despawn noob op lag gg crossplay endgame speedrun".split())

# But we surely could not have covered all commonly used words. So we implement a mechanism to check if the word that appears has been 
# used frequently enough across other reviews too or not
all_words = ' '.join(all_reviews_df['review']).split()
word_freq = Counter(all_words)

def validate_review(text):
    """
    This function will be used to check if the review is valid or not. It will be marked valid if atleast 50% of the words are found to be valid.
    """
    threshold=0.5
    freq_threshold=3 #if the word is a frequently used word across reviews then it is valid
    tokens = text.split()
    valid_words = [word for word in tokens if word in english_words or word_freq[word] > freq_threshold]
    return len(valid_words) / len(tokens) >= threshold

validated_df = all_reviews_df[all_reviews_df['review'].apply(validate_review)]
validated_df

#we save this cleaned dataset
validated_df.to_csv("cleaned_reviews_dataset.csv",index=False)

	recommendationid	author	language	review	timestamp_created	timestamp_updated	voted_up	...	game_name	timestamp_dev_responded	developer_response	author_steamid	author_num_games_owned	author_num_reviews	author_playtime_forever	author_playtime_last_two_weeks	author_playtime_at_review	author_last_played
0	177283496	{'steamid': '76561198266230202', 'num_games_ow...	english	I love the game, but… there is too much to do....	1729275153	1729275153	True	...	elden_ring	NaN	NaN	76561198266230202	326	16	8988	3957	8977	1729275953
1	177283364	{'steamid': '76561198829689696', 'num_games_ow...	english	roll	1729275009	1729275009	True	...	elden_ring	NaN	NaN	76561198829689696	11	2	2513	1085	2422	1729280436
2	177279370	{'steamid': '76561199387385936', 'num_games_ow...	english	Its Laterally Perfect In Every Way, The Only B...	1729270437	1729270437	True	...	elden_ring	NaN	NaN	76561199387385936	0	1	492	492	350	1729278945
3	177278879	{'steamid': '76561199572934905', 'num_games_ow...	english	Try finger, but hole	1729269896	1729269896	True	...	elden_ring	NaN	NaN	76561199572934905	0	1	6536	80	6456	1729280044
4	177278154	{'steamid': '76561198436642208', 'num_games_ow...	english	Damn, just damn. how can i explain this piece ...	1729268993	1729268993	True	...	elden_ring	NaN	NaN	76561198436642208	51	3	5563	1424	5563	1729268292

Getting our reviews ->¶

Data preprocessing ->¶

	recommendationid	author	language	review	timestamp_created	timestamp_updated	voted_up	votes_up	votes_funny	weighted_vote_score	...	steam_china_location	primarily_steam_deck	game_name	timestamp_dev_responded	developer_response	author_steamid	author_num_games_owned	author_num_reviews	author_playtime_forever	author_playtime_last_two_weeks
43690	30249428	{'steamid': '76561198256506010', 'num_games_ow...	english	really addictive and good	1488320291	1488320291	True	0	0	0	...		False	dota_2	NaN	NaN	76561198256506010	0	3	37	0
45002	30042721	{'steamid': '76561198053705825', 'num_games_ow...	english	I didn't find this game very fun. Not very bal...	1487532588	1487532588	False	0	0	0	...		False	dota_2	NaN	NaN	76561198053705825	105	3	41	0

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever
89	elden_ring		True	1729196068	0	14	192	358	358
127	elden_ring		True	1729165883	0	7	8102	332	8123
165	elden_ring		True	1729125671	0	10	9048	986	9098
396	elden_ring		True	1728928295	0	2	5747	2049	6025
430	elden_ring		True	1728886706	0	2	3343	3311	3729
...	...	...	...	...	...	...	...	...	...
44906	dota_2		True	1487597602	0	1	36328	0	40480
44907	dota_2		True	1487596377	1	1	102099	0	150906
44968	dota_2		True	1487562098	0	1	52456	0	329005
45052	dota_2		True	1487509195	0	1	47569	0	137826
45078	dota_2		True	1487499792	0	2	160	0	196

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever
0	elden_ring	i love the game but there is too much to do as...	True	1729275153	326	16	8977	3957	8988
1	elden_ring	roll	True	1729275009	11	2	2422	1085	2513
2	elden_ring	its laterally perfect in every way the only ba...	True	1729270437	0	1	350	492	492
3	elden_ring	try finger but hole	True	1729269896	0	1	6456	80	6536
4	elden_ring	damn just damn how can i explain this piece of...	True	1729268993	51	3	5563	1424	5563
...	...	...	...	...	...	...	...	...	...
45092	dota_2	nice	True	1487494906	0	2	1937	6997	121315
45093	dota_2	good game	True	1487494747	0	1	124437	0	251708
45094	dota_2	good	True	1487494231	0	1	31575	2916	205094
45095	dota_2	great game but some of sea players are toxics	True	1487494164	5	2	99538	0	170287
45096	dota_2	keep up the good work	True	1487493619	1	1	56345	0	110978